import time

import requests
from urllib.parse import quote

from bs4 import BeautifulSoup
import pymysql

DB_CONFIG = {
    'host': '8.149.233.36',
    'user': 'ai_article_read',
    'password': '7aK_H2yvokVumr84lLNDt8fDBp6P',
    'database': 'ai_article',
    'charset': 'utf8mb4',
}


def init_db():
    # Connection handling only; the CREATE TABLE statement has been removed.
    # The table is assumed to already exist -- see the schema sketch at the
    # end of the file.
    pass


def insert_keywords(words):
    """Insert keywords, skipping duplicates via INSERT IGNORE."""
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    for word in words:
        try:
            cursor.execute(
                "INSERT IGNORE INTO baidu_keyword (keyword) VALUES (%s)",
                (word,)
            )
        except Exception as e:
            print(f'Insert failed: {word}, reason: {e}')
    conn.commit()
    cursor.close()
    conn.close()


def get_random_keyword():
    """Pick one random keyword that has not been crawled yet."""
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    cursor.execute(
        "SELECT keyword FROM baidu_keyword WHERE crawled=0 ORDER BY RAND() LIMIT 1"
    )
    result = cursor.fetchone()
    cursor.close()
    conn.close()
    return result[0] if result else None


# mark_crawled is no longer needed; main() updates the flag inline.


def export_all_keywords(filename='seed_pool.txt'):
    """Dump all collected keywords to a sorted text file."""
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    cursor.execute("SELECT keyword FROM baidu_keyword")
    all_words = [row[0] for row in cursor.fetchall()]
    cursor.close()
    conn.close()
    with open(filename, 'w', encoding='utf-8') as f:
        for word in sorted(all_words):
            f.write(word + '\n')
    print(f'Exported {len(all_words)} keywords to {filename}')


def fetch_related_words(keyword):
    """Fetch Baidu's suggestion list and related-search terms for a keyword."""
    # Cookies copied from a real browser session; they are session-specific
    # and will need refreshing once they expire.
    cookies = {
        # ...existing cookies...
        'PSTM': '1764302604',
        'BAIDUID': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'BD_HOME': '1',
        'H_PS_PSSID': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'delPer': '0',
        'BD_CK_SAM': '1',
        'PSINO': '3',
        'BAIDUID_BFESS': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'PAD_BROWSER': '1',
        'BD_UPN': '12314753',
        'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
        'BA_HECTOR': 'ala10k2421a0ag25a5a40524a10l8n1kii7of24',
        'BIDUPSID': 'C047CB4D757AC8632D7B5792A4254C89',
        'ZFY': 'hLpeh2:BHPDeKfEN3yuM7C:A7dmFl03pP:AkeekLlPw5J4:C',
        'channel': 'baidusearch',
        'H_WISE_SIDS': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'baikeVisitId': '7f510782-16ce-4371-ad1d-cc8c0ba5ccc8',
        'COOKIE_SESSION': '0_0_1_0_0_0_1_0_1_1_7462_1_0_0_0_0_0_0_1764302605%7C1%230_0_1764302605%7C1',
        'H_PS_645EC': 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY',
        'BDSVRTM': '16',
        'WWW_ST': '1764310101036',
    }
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': f'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd={quote(keyword)}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'is_xhr': '1',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Query parameters captured from a real browser search request;
    # session-specific values (isid, rsv_pq, rsv_t, clist, ...) are kept as captured.
    params = [
        ('ie', 'utf-8'), ('mod', '1'), ('isbd', '1'),
        ('isid', 'b857f27e00205440'), ('ie', 'utf-8'), ('f', '8'),
        ('rsv_bp', '1'), ('tn', 'baidu'), ('wd', keyword), ('oq', keyword),
        ('rsv_pq', 'b857f27e00205440'),
        ('rsv_t', 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY'),
        ('rqlang', 'cn'), ('rsv_enter', '1'), ('rsv_dl', 'tb'),
        ('rsv_btype', 't'), ('inputT', '23852'), ('rsv_sug2', '0'),
        ('rsv_sug3', '15'), ('rsv_sug1', '23'), ('rsv_sug7', '100'),
        ('rsv_sug4', '23852'), ('bs', keyword), ('rsv_sid', 'undefined'),
        ('_ss', '1'), ('clist', 'ddad409c4a1855aa'), ('hsug', ''),
        ('f4s', '1'), ('csor', '3'), ('_cr1', '35542'),
    ]
    response = requests.get('https://www.baidu.com/s', params=params,
                            cookies=cookies, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # "Others also searched" suggestion panel
    result = []
    div = soup.find('div', class_='list_1V4Yg')
    if div:
        for a in div.find_all('a', class_='item_3WKCf'):
            spans = a.find_all('span')
            if len(spans) > 1:
                result.append(spans[1].get_text(strip=True))

    # Related-search table at the bottom of the results page
    related_search = []
    rs_label = soup.find('div', class_='c-color-t rs-label_ihUhK')
    if rs_label:
        rs_table = rs_label.find_next('table', class_='rs-table_3RiQc')
        if rs_table:
            for a in rs_table.find_all('a', class_='rs-link_2DE3Q'):
                span = a.find('span', class_='rs-text_3K5mR')
                if span:
                    related_search.append(span.get_text(strip=True))

    return result, related_search


def main():
    # Insert the seed word on first run (a no-op if it already exists).
    insert_keywords(['糖尿病'])  # "diabetes"
    while True:
        keyword = get_random_keyword()
        if not keyword:
            print('No uncrawled keywords in the database; waiting for new words...')
            time.sleep(10)
            continue
        print(f'Crawling: {keyword}')
        try:
            words, related_search = fetch_related_words(keyword)
        except Exception as e:
            print(f'Crawl failed: {keyword}, reason: {e}')
            continue
        new_words = set(words + related_search)
        insert_keywords(new_words)
        # Mark the keyword as crawled so it is not picked again.
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute(
            "UPDATE baidu_keyword SET crawled=1 WHERE keyword=%s",
            (keyword,)
        )
        conn.commit()
        cursor.close()
        conn.close()
        export_all_keywords()


if __name__ == '__main__':
    main()
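

# --- Assumed schema bootstrap (sketch) ---
# init_db() above is deliberately a no-op, so the baidu_keyword table must
# already exist before the crawler runs. The DDL below is inferred from the
# queries in this script, not taken from the original schema: INSERT IGNORE
# implies a unique key on `keyword`, and get_random_keyword() relies on a
# `crawled` flag defaulting to 0. The `id` column and the VARCHAR length
# are assumptions.
def create_table_if_missing():
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS baidu_keyword (
            id INT AUTO_INCREMENT PRIMARY KEY,
            keyword VARCHAR(255) NOT NULL UNIQUE,
            crawled TINYINT NOT NULL DEFAULT 0
        ) DEFAULT CHARSET=utf8mb4
    """)
    conn.commit()
    cursor.close()
    conn.close()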