2025-12-25 upload
This commit is contained in:
3
参考代码/ai_keyword_crawl/.idea/.gitignore
generated
vendored
Normal file
3
参考代码/ai_keyword_crawl/.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
1
参考代码/ai_keyword_crawl/.idea/.name
generated
Normal file
1
参考代码/ai_keyword_crawl/.idea/.name
generated
Normal file
@@ -0,0 +1 @@
|
||||
baidu_crawl.py
|
||||
14
参考代码/ai_keyword_crawl/.idea/ai_keyword_crawl.iml
generated
Normal file
14
参考代码/ai_keyword_crawl/.idea/ai_keyword_crawl.iml
generated
Normal file
@@ -0,0 +1,14 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.12 (ai_keyword_crawl)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="GOOGLE" />
|
||||
<option name="myDocStringFormat" value="Google" />
|
||||
</component>
|
||||
</module>
|
||||
12
参考代码/ai_keyword_crawl/.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
12
参考代码/ai_keyword_crawl/.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@@ -0,0 +1,12 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="deepseek.*" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
6
参考代码/ai_keyword_crawl/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
参考代码/ai_keyword_crawl/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
4
参考代码/ai_keyword_crawl/.idea/misc.xml
generated
Normal file
4
参考代码/ai_keyword_crawl/.idea/misc.xml
generated
Normal file
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (ai_keyword_crawl)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
参考代码/ai_keyword_crawl/.idea/modules.xml
generated
Normal file
8
参考代码/ai_keyword_crawl/.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/ai_keyword_crawl.iml" filepath="$PROJECT_DIR$/.idea/ai_keyword_crawl.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
554
参考代码/ai_keyword_crawl/baidu_crawl.py
Normal file
554
参考代码/ai_keyword_crawl/baidu_crawl.py
Normal file
@@ -0,0 +1,554 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from multiprocessing import Process
|
||||
import os
|
||||
from log_config import setup_baidu_crawl_logger
|
||||
from database_config import db_manager
|
||||
|
||||
|
||||
# Disable system-level proxy environment variables so that `requests`
# only ever uses the proxies explicitly passed via its `proxies=` argument.
os.environ['NO_PROXY'] = '*'
os.environ['no_proxy'] = '*'
if 'HTTP_PROXY' in os.environ:
    del os.environ['HTTP_PROXY']
if 'HTTPS_PROXY' in os.environ:
    del os.environ['HTTPS_PROXY']
if 'http_proxy' in os.environ:
    del os.environ['http_proxy']
if 'https_proxy' in os.environ:
    del os.environ['https_proxy']


# Shared module logger, configured by the project-wide logging setup.
logger = setup_baidu_crawl_logger()

# Proxy-provider API endpoint (Damai proxy service).
# NOTE(review): the URL embeds account tokens in source control — these
# should be moved to configuration or environment variables.
PROXY_API_URL = (
    'https://api2.damaiip.com/index.php?s=/front/user/getIPlist&xsn=e054861d08471263d970bde4f4905181&osn=TC_NO176655872088456223&tiqu=1'
)

# Username/password authentication for the Damai proxy.
# NOTE(review): hard-coded credentials in source — move to config/env.
PROXY_USERNAME = '694b8c3172af7'
PROXY_PASSWORD = 'q8yA8x1dwCpdyIK'

# Fallback pool of fixed proxies (each entry: 'IP:port', username, password),
# used when the proxy API cannot be reached.
BACKUP_PROXY_POOL = [
    {'ip': '61.171.69.167:50000', 'user': '6jinnh', 'password': 'fi9k7q5d'},
    {'ip': '36.213.32.122:50001', 'user': '9w6xpg', 'password': 'tqswr1ee'},
]
|
||||
|
||||
def init_db():
    """No-op placeholder.

    Table creation was removed; connection lifecycle is owned by
    ``db_manager``, so there is nothing left to initialise here.
    """
    return None
|
||||
|
||||
def insert_keywords(words, parent_id=0, seed_id=0, seed_name=''):
    """Bulk-insert keywords into the ``baidu_keyword`` table (autocommit).

    Duplicates are skipped server-side via ``INSERT IGNORE``.

    Args:
        words: iterable of keyword strings; falsy input is a no-op.
        parent_id: id of the keyword these words were expanded from.
        seed_id: id of the originating seed keyword.
        seed_name: name of the originating seed keyword.
    """
    if not words:
        return

    try:
        sql = """INSERT IGNORE INTO baidu_keyword
                 (keyword, parents_id, seed_id, seed_name)
                 VALUES (%s, %s, %s, %s)"""

        rows = [(word, parent_id, seed_id, seed_name) for word in words]
        inserted_count = db_manager.execute_many(sql, rows, autocommit=True)

        if inserted_count > 0:
            logger.info(f'成功插入 {inserted_count}/{len(words)} 个新关键词 (父ID:{parent_id}, 种子:{seed_name})')

    except Exception as e:
        logger.error(f'插入关键词异常:{e},父ID:{parent_id},种子:{seed_name}')
|
||||
|
||||
def get_seed_keywords_batch(batch_size=4):
    """Fetch a batch of not-yet-crawled seed keywords.

    Args:
        batch_size: maximum number of seed rows to fetch.

    Returns:
        list: rows of ``(id, keyword)`` tuples; empty list on failure
        or when nothing is pending.
    """
    try:
        # Bug fix: the original called `pymysql.connect(**DB_CONFIG)`, but
        # neither `pymysql` nor `DB_CONFIG` is imported/defined anywhere in
        # this module, so every call raised NameError. Use the shared
        # db_manager, consistent with every other query in this file.
        sql = "SELECT id, keyword FROM baidu_seed_keywords WHERE crawled=0 LIMIT %s"
        results = db_manager.execute_query(sql, (batch_size,))
        results = list(results) if results else []
        logger.info(f'从种子表获取到 {len(results)} 个未抓取的种子词')
        return results
    except Exception as e:
        logger.error(f'查询种子词失败:{e}')
        return []
|
||||
|
||||
|
||||
def get_keyword_info(keyword):
    """Look up the stored row for *keyword* in ``baidu_keyword``.

    Args:
        keyword: keyword string to look up.

    Returns:
        tuple: ``(id, keyword, parents_id, seed_id, seed_name)``, or
        ``None`` when absent or on query failure.
    """
    sql = "SELECT id, keyword, parents_id, seed_id, seed_name FROM baidu_keyword WHERE keyword=%s"
    try:
        return db_manager.execute_query(sql, (keyword,), fetch_one=True)
    except Exception as e:
        logger.error(f'查询关键词信息失败:{e}')
        return None
|
||||
|
||||
|
||||
def export_all_keywords(filename='seed_pool.txt'):
    """Dump every keyword from ``baidu_keyword`` to a text file, sorted.

    Args:
        filename: output path; one keyword per line, UTF-8 encoded.
    """
    try:
        sql = "SELECT keyword FROM baidu_keyword"
        results = db_manager.execute_query(sql)
        all_words = [row[0] for row in results] if results else []

        with open(filename, 'w', encoding='utf-8') as f:
            for word in sorted(all_words):
                f.write(word + '\n')
        # Bug fix: the success message previously logged a literal
        # "(unknown)" instead of the actual output path.
        logger.info(f'已导出 {len(all_words)} 个关键词到 {filename}')
    except Exception as e:
        logger.error(f'导出关键词失败:{e}')
|
||||
|
||||
def fetch_proxy(custom_logger=None):
    """Obtain one usable proxy mapping for `requests`.

    Tries the Damai proxy API first — a JSON payload
    ``{'code': 0, 'data': [{'ip': .., 'port': ..}]}``, falling back to a
    plain-text ``ip:port`` body — and on any failure picks a random
    entry from the fixed backup pool.

    Args:
        custom_logger: optional logger for multi-process workers;
            defaults to the module logger.

    Returns:
        dict: ``{'http': url, 'https': url}`` where the URL embeds the
        proxy credentials (``http://user:pass@host:port``).
    """
    log = custom_logger if custom_logger else logger
    try:
        # Ask the Damai API for a fresh IP.
        resp = requests.get(PROXY_API_URL, timeout=10)
        resp.raise_for_status()

        # Preferred format: JSON.
        try:
            result = resp.json()
            if result.get('code') == 0 and result.get('data'):
                ip_list = result['data']
                if ip_list and len(ip_list) > 0:
                    ip_info = ip_list[0]
                    ip_port = f"{ip_info['ip']}:{ip_info['port']}"
                    nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    log.info(f'提取大麦代理IP: {ip_port}_{nowtime}')

                    # Build the authenticated proxy URL.
                    proxy_url = f'http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{ip_info["ip"]}:{ip_info["port"]}'
                    return {
                        'http': proxy_url,
                        'https': proxy_url,
                    }
        except Exception:
            # Bug fix: this was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Not JSON — fall through to
            # the plain-text parser below.
            pass

        # Fallback format: plain text, first line is "ip:port".
        ip_port = resp.text.strip().split('\n', 1)[0]
        if ':' in ip_port:
            ip_message = resp.text.strip()
            nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            log.info(f'提取大麦代理IP: {ip_port}_{nowtime}')
            log.info(f'提取IPMessage: {ip_message}')

            host, port = ip_port.split(':', 1)
            proxy_url = f'http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{host}:{port}'
            return {
                'http': proxy_url,
                'https': proxy_url,
            }

        # Bug fix: previously an unparsable response fell off the end of
        # the function and implicitly returned None (i.e. no proxy at
        # all). Treat it as a failure so the backup pool is used, as the
        # docstring promises.
        raise ValueError(f'无法解析代理API响应: {ip_port!r}')
    except Exception as exc:  # noqa: BLE001
        log.warning(f'大麦代理API获取失败:{exc},使用备用固定代理池')
        # Pick a random backup proxy (with credential auth).
        import random
        backup_proxy = random.choice(BACKUP_PROXY_POOL)
        ip_port = backup_proxy['ip']
        username = backup_proxy['user']
        password = backup_proxy['password']

        # Authenticated proxy URL: http://username:password@host:port
        host, port = ip_port.split(':', 1)
        proxy_url = f'http://{username}:{password}@{host}:{port}'

        log.info(f'使用备用代理:{username}@{ip_port}')
        return {
            'http': proxy_url,
            'https': proxy_url,
        }
|
||||
|
||||
|
||||
def fetch_related_words(keyword, proxies=None):
    """Scrape Baidu search suggestions and related searches for *keyword*.

    Issues one GET to https://www.baidu.com/s with a captured browser
    cookie/header set, then parses two widgets out of the returned HTML:
    the suggestion list (div class ``list_1V4Yg``) and the
    related-search table (``rs-table_3RiQc``).

    Args:
        keyword: query term.
        proxies: optional `requests`-style proxies mapping.

    Returns:
        tuple: ``(result, related_search)`` — two lists of strings
        (either may be empty when the widget is missing).

    Raises:
        Network/timeout errors from `requests` propagate to the caller.
    """
    # Captured session cookies from a real browser visit.
    # NOTE(review): session tokens like H_PS_645EC/BA_HECTOR expire;
    # this will likely need periodic refreshing.
    cookies = {
        'PSTM': '1764302604',
        'BAIDUID': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'BD_HOME': '1',
        'H_PS_PSSID': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'delPer': '0',
        'BD_CK_SAM': '1',
        'PSINO': '3',
        'BAIDUID_BFESS': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'PAD_BROWSER': '1',
        'BD_UPN': '12314753',
        'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
        'BA_HECTOR': 'ala10k2421a0ag25a5a40524a10l8n1kii7of24',
        'BIDUPSID': 'C047CB4D757AC8632D7B5792A4254C89',
        'ZFY': 'hLpeh2:BHPDeKfEN3yuM7C:A7dmFl03pP:AkeekLlPw5J4:C',
        'channel': 'baidusearch',
        'H_WISE_SIDS': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'baikeVisitId': '7f510782-16ce-4371-ad1d-cc8c0ba5ccc8',
        'COOKIE_SESSION': '0_0_1_0_0_0_1_0_1_1_7462_1_0_0_0_0_0_0_1764302605%7C1%230_0_1764302605%7C1',
        'H_PS_645EC': 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY',
        'BDSVRTM': '16',
        'WWW_ST': '1764310101036',
    }

    from urllib.parse import quote
    # Browser-like headers; Referer mirrors a prior search for the same keyword.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': f'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd={quote(keyword)}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'is_xhr': '1',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Query string captured from a live request; `wd`/`oq`/`bs` carry the keyword.
    params = [
        ('ie', 'utf-8'),
        ('mod', '1'),
        ('isbd', '1'),
        ('isid', 'b857f27e00205440'),
        ('ie', 'utf-8'),
        ('f', '8'),
        ('rsv_bp', '1'),
        ('tn', 'baidu'),
        ('wd', keyword),
        ('oq', keyword),
        ('rsv_pq', 'b857f27e00205440'),
        ('rsv_t', 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY'),
        ('rqlang', 'cn'),
        ('rsv_enter', '1'),
        ('rsv_dl', 'tb'),
        ('rsv_btype', 't'),
        ('inputT', '23852'),
        ('rsv_sug2', '0'),
        ('rsv_sug3', '15'),
        ('rsv_sug1', '23'),
        ('rsv_sug7', '100'),
        ('rsv_sug4', '23852'),
        ('bs', keyword),
        ('rsv_sid', 'undefined'),
        ('_ss', '1'),
        ('clist', 'ddad409c4a1855aa'),
        ('hsug', ''),
        ('f4s', '1'),
        ('csor', '3'),
        ('_cr1', '35542'),
    ]

    response = requests.get(
        'https://www.baidu.com/s',
        params=params,
        cookies=cookies,
        headers=headers,
        proxies=proxies,
        timeout=10,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # Widget 1: suggestion list — second <span> of each item holds the text.
    result = []
    div = soup.find('div', class_='list_1V4Yg')
    if div:
        for a in div.find_all('a', class_='item_3WKCf'):
            spans = a.find_all('span')
            if len(spans) > 1:
                result.append(spans[1].get_text(strip=True))

    # Widget 2: "related searches" table that follows the rs-label div.
    related_search = []
    rs_label = soup.find('div', class_='c-color-t rs-label_ihUhK')
    if rs_label:
        rs_table = rs_label.find_next('table', class_='rs-table_3RiQc')
        if rs_table:
            for a in rs_table.find_all('a', class_='rs-link_2DE3Q'):
                span = a.find('span', class_='rs-text_3K5mR')
                if span:
                    related_search.append(span.get_text(strip=True))
    return result, related_search
|
||||
|
||||
|
||||
def crawl_keyword_worker(keyword, parent_id, seed_id, seed_name, is_seed=False):
    """Crawl one keyword and store its related words.

    Args:
        keyword: keyword to crawl.
        parent_id: id recorded as the parent of the newly crawled words
            (i.e. the current keyword's own row id).
        seed_id: originating seed id, propagated to children.
        seed_name: originating seed name, propagated to children.
        is_seed: True when *keyword* comes from the seed table — decides
            which table receives the crawled=1 mark afterwards.
    """
    # Validate lineage data up front so orphan rows are never written.
    if not seed_id or seed_id == 0:
        logger.error(f'[数据错误] {keyword} 的 seed_id 为空或0,跳过抓取')
        return

    if not seed_name or seed_name.strip() == '':
        logger.error(f'[数据错误] {keyword} 的 seed_name 为空,跳过抓取')
        return

    logger.info(f'[进程启动] 正在抓取:{keyword} (ID作为新词父ID:{parent_id}, 种子ID:{seed_id}, 种子名:{seed_name})')

    try:
        proxies = fetch_proxy()
        words, related_search = fetch_related_words(keyword, proxies=proxies)
        # De-duplicate suggestions + related-search terms.
        new_words = set(words + related_search)

        logger.info(f'[抓取结果] {keyword} -> 推荐词:{len(words)}个, 相关搜索:{len(related_search)}个')

        # Insert children: their parent is the current keyword; the seed
        # lineage (seed_id/seed_name) carries over unchanged.
        if new_words:
            insert_keywords(new_words, parent_id, seed_id, seed_name)
            logger.info(f'[成功] {keyword} -> 抓取到 {len(new_words)} 个相关词并入库 (子词父ID:{parent_id}, 种子:{seed_name})')
        else:
            logger.warning(f'[空结果] {keyword} -> 未抓取到任何相关词')

        # Mark the source row as crawled in whichever table it came from.
        if is_seed:
            # Seed table row.
            sql = "UPDATE baidu_seed_keywords SET crawled=1 WHERE keyword=%s"
            db_manager.execute_update(sql, (keyword,), autocommit=True)
            logger.info(f'[标记完成] 种子词 {keyword} 已标记为已抓取')
        else:
            # Regular keyword-table row.
            sql = "UPDATE baidu_keyword SET crawled=1 WHERE keyword=%s"
            db_manager.execute_update(sql, (keyword,), autocommit=True)
            logger.info(f'[标记完成] 关键词 {keyword} 已标记为已抓取')

    except Exception as e:
        logger.error(f'[失败] 抓取失败:{keyword},种子:{seed_name},错误:{e}', exc_info=True)
|
||||
|
||||
|
||||
def process_seed_keywords():
    """Promote up to 2 pending seed words into ``baidu_keyword``.

    Each promoted seed is inserted with ``crawled=0`` and ``parents_id=0``
    (so process_layer_keywords() will pick it up), and the corresponding
    row in ``baidu_seed_keywords`` is marked ``crawled=1``.

    Returns:
        bool: True when a batch was promoted, False when the seed table
        has no pending rows.
    """
    logger.info('\n========== 处理2个种子关键词 ==========')

    # Fetch the next batch (2 seeds).
    seeds = get_seed_keywords_batch(batch_size=2)

    if not seeds:
        logger.info('当前无未处理的种子词')
        return False

    logger.info(f'\n[批次开始] 获取到 {len(seeds)} 个种子词:{[s[1] for s in seeds]}')

    for seed_id, seed_keyword in seeds:
        # Insert the seed itself into baidu_keyword (crawled=0, parents_id=0).
        insert_keywords(
            [seed_keyword],
            parent_id=0,
            seed_id=seed_id,
            seed_name=seed_keyword
        )

        # Mark the seed-table row as handled.
        try:
            sql = "UPDATE baidu_seed_keywords SET crawled=1 WHERE id=%s"
            db_manager.execute_update(sql, (seed_id,), autocommit=True)
            logger.info(f' ✓ 种子词已插入: {seed_keyword} (种子ID:{seed_id}, crawled=0, parents_id=0)')
        except Exception as e:
            logger.error(f'标记种子表失败:{e}')

    logger.info(f'[批次完成] 本批 {len(seeds)} 个种子词已插入baidu_keyword表,等待process_layer_keywords()处理')
    return True
|
||||
|
||||
|
||||
def process_seed_worker(seed_id, seed_name):
    """Sequentially crawl every pending keyword belonging to one seed.

    Runs as a child process: loops until this seed has no row with
    ``crawled=0``, fetching one keyword at a time (id ascending),
    crawling it with per-keyword proxy retries, inserting the children,
    and marking the row done (``crawled=2`` for seed-level rows with
    ``parents_id=0``, ``crawled=1`` otherwise). Failed rows are marked
    too, so the loop can never get stuck on one keyword.

    Args:
        seed_id: seed whose keyword tree this process owns.
        seed_name: seed name, used for logging and child-row lineage.
    """
    # Re-initialise the logger inside the child process so records reach
    # the file handler correctly under multiprocessing.
    from log_config import setup_baidu_crawl_logger
    process_logger = setup_baidu_crawl_logger(force_reinit=True)

    process_logger.info(f'[进程启动] 种子ID:{seed_id}, 种子名:{seed_name} - 开始处理')

    while True:
        try:
            # Oldest pending keyword of this seed, one at a time.
            sql = """SELECT id, keyword, parents_id
                     FROM baidu_keyword
                     WHERE crawled=0 AND seed_id=%s
                     ORDER BY id ASC
                     LIMIT 1"""

            record = db_manager.execute_query(sql, (seed_id,), fetch_one=True)

            if not record:
                # Nothing left with crawled=0 — this worker is done.
                process_logger.info(f'[种子{seed_id}] 没有crawled=0的记录,进程结束')
                break

            keyword_id, keyword, parent_id = record
            process_logger.info(f'[种子{seed_id}] 处理关键词: {keyword} (ID:{keyword_id}, 父ID:{parent_id})')

            try:
                # Crawl with up to 3 proxy attempts per keyword.
                max_retries = 3
                retry_count = 0
                words, related_search = [], []

                while retry_count < max_retries:
                    try:
                        proxies = fetch_proxy(custom_logger=process_logger)
                        words, related_search = fetch_related_words(keyword, proxies=proxies)
                        break  # success — leave the retry loop
                    except Exception as retry_error:
                        retry_count += 1
                        if retry_count < max_retries:
                            process_logger.warning(f'[种子{seed_id}] {keyword} 第{retry_count}次尝试失败,重试中... 错误:{retry_error}')
                            time.sleep(2)  # brief back-off before retrying
                        else:
                            process_logger.error(f'[种子{seed_id}] {keyword} 经过{max_retries}次重试仍然失败,放弃该关键词')
                            raise  # exhausted retries — surface to outer handler

                new_words = set(words + related_search)

                if new_words:
                    insert_keywords(new_words, keyword_id, seed_id, seed_name)
                    process_logger.info(f'[种子{seed_id}] {keyword} -> 抓取到 {len(new_words)} 个相关词')
                else:
                    process_logger.info(f'[种子{seed_id}] {keyword} -> 未抓取到相关词')

                # Status update: seed-level rows (parents_id=0) get the
                # transient state crawled=2; all others crawled=1.
                if parent_id == 0:
                    sql = "UPDATE baidu_keyword SET crawled=2 WHERE id=%s"
                    db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    process_logger.info(f'[种子{seed_id}] {keyword} parents_id=0,标记为crawled=2')
                else:
                    sql = "UPDATE baidu_keyword SET crawled=1 WHERE id=%s"
                    db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    process_logger.info(f'[种子{seed_id}] {keyword} 已标记为crawled=1')

            except Exception as e:
                process_logger.error(f'[种子{seed_id}] 抓取失败:{keyword},错误:{e}', exc_info=True)
                # Mark the row anyway so the loop cannot spin on it forever.
                try:
                    if parent_id == 0:
                        sql = "UPDATE baidu_keyword SET crawled=2 WHERE id=%s"
                        db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    else:
                        sql = "UPDATE baidu_keyword SET crawled=1 WHERE id=%s"
                        db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                except Exception:
                    # Bug fix: this was a bare `except:` that also swallowed
                    # SystemExit/KeyboardInterrupt, making the worker hard
                    # to terminate; best-effort marking is preserved.
                    pass

            # Pace requests: wait before the next keyword.
            time.sleep(3)

        except Exception as e:
            process_logger.error(f'[种子{seed_id}] 进程异常:{e}', exc_info=True)
            time.sleep(3)

    process_logger.info(f'[进程结束] 种子ID:{seed_id}, 种子名:{seed_name} - 处理完成')
|
||||
|
||||
|
||||
def process_layer_keywords():
    """Poll loop that dispatches one worker process per pending seed.

    Forever:
      1. find seeds that still have pending keywords
         (``crawled=0 AND seed_id>0``), oldest first, at most 10 per batch;
      2. spawn one ``Process`` running process_seed_worker() per seed;
      3. block until the whole batch finishes, then poll again
         (3s sleep when idle or on error).
    """
    logger.info('\n========== 开始循环监听处理关键词 ==========')

    while True:
        try:
            # Seeds with pending keywords, ordered by their oldest row.
            sql = """SELECT seed_id, seed_name
                     FROM baidu_keyword
                     WHERE crawled=0 AND seed_id>0
                     GROUP BY seed_id, seed_name
                     ORDER BY MIN(created_at) ASC
                     LIMIT 10"""

            seed_groups = db_manager.execute_query(sql)

            if not seed_groups:
                logger.info('\n[监听中] 当前没有待处理的关键词,3秒后重试...')
                time.sleep(3)
                continue

            logger.info(f'\n[批次开始] 获取到 {len(seed_groups)} 个种子需要处理')

            # One worker process per seed.
            processes = []
            for seed_id, seed_name in seed_groups:
                logger.info(f' - 启动进程: 种子ID:{seed_id}, 种子名:{seed_name}')
                p = Process(target=process_seed_worker, args=(seed_id, seed_name))
                p.start()
                processes.append(p)

            # Wait for the entire batch before polling for the next one.
            for p in processes:
                p.join()

            logger.info(f'[批次完成] {len(seed_groups)} 个种子处理完成,继续监听...')

        except Exception as e:
            logger.error(f'循环监听异常:{e}', exc_info=True)
            time.sleep(3)
|
||||
|
||||
|
||||
def main():
    """Program entry point.

    Intended flow as originally described:
      1. take 2 seed words at a time from ``baidu_seed_keywords``;
      2. wait until every keyword derived from that batch is crawled
         (``baidu_keyword.crawled`` all set);
      3. move on to the next 2 seed words;
      4. repeat until the seed table is exhausted;
      5. then poll periodically for newly added seeds.

    NOTE(review): the body only starts the keyword poll loop
    (process_layer_keywords); seed promotion is not invoked here —
    confirm whether process_seed_keywords() runs elsewhere.
    """

    try:
        logger.info('='*70)
        logger.info('百度关键词抓取系统启动 - ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        logger.info('='*70)

        process_layer_keywords()

        logger.info(f'\n{"-"*70}')
        logger.info(f'{"-"*70}\n')
    except Exception as e:
        logger.error(f'程序异常退出:{e}', exc_info=True)


if __name__ == '__main__':
    main()
|
||||
564
参考代码/ai_keyword_crawl/baidu_crawl_again.py
Normal file
564
参考代码/ai_keyword_crawl/baidu_crawl_again.py
Normal file
@@ -0,0 +1,564 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from multiprocessing import Process
|
||||
import os
|
||||
from log_config import setup_baidu_crawl_again_logger
|
||||
from database_config import db_manager
|
||||
|
||||
# Shared module logger, configured by the project-wide logging setup.
logger = setup_baidu_crawl_again_logger()

# Disable system-level proxy environment variables so that `requests`
# only ever uses the proxies explicitly passed via its `proxies=` argument.
os.environ['NO_PROXY'] = '*'
os.environ['no_proxy'] = '*'
if 'HTTP_PROXY' in os.environ:
    del os.environ['HTTP_PROXY']
if 'HTTPS_PROXY' in os.environ:
    del os.environ['HTTPS_PROXY']
if 'http_proxy' in os.environ:
    del os.environ['http_proxy']
if 'https_proxy' in os.environ:
    del os.environ['https_proxy']


# Proxy-provider API endpoint (Damai proxy service).
# NOTE(review): the URL embeds account tokens in source control — these
# should be moved to configuration or environment variables.
PROXY_API_URL = (
    'https://api2.damaiip.com/index.php?s=/front/user/getIPlist&xsn=e054861d08471263d970bde4f4905181&osn=TC_NO176655872088456223&tiqu=1'
)

# Username/password authentication for the Damai proxy.
# NOTE(review): hard-coded credentials in source — move to config/env.
PROXY_USERNAME = '694b8c3172af7'
PROXY_PASSWORD = 'q8yA8x1dwCpdyIK'

# Fallback pool of fixed proxies (each entry: 'IP:port', username, password),
# used when the proxy API cannot be reached.
BACKUP_PROXY_POOL = [
    {'ip': '61.171.69.167:50000', 'user': '6jinnh', 'password': 'fi9k7q5d'},
    {'ip': '36.213.32.122:50001', 'user': '9w6xpg', 'password': 'tqswr1ee'},
]
|
||||
|
||||
def init_db():
    """No-op placeholder.

    Table creation was removed; connection lifecycle is owned by
    ``db_manager``, so there is nothing left to initialise here.
    """
    return None
|
||||
|
||||
def insert_keywords(words, parent_id=0, seed_id=0, seed_name=''):
    """Bulk-insert keywords into the ``baidu_keyword`` table (autocommit).

    Duplicates are skipped server-side via ``INSERT IGNORE``.

    Args:
        words: iterable of keyword strings; falsy input is a no-op.
        parent_id: id of the keyword these words were expanded from.
        seed_id: id of the originating seed keyword.
        seed_name: name of the originating seed keyword.
    """
    if not words:
        return

    try:
        sql = """INSERT IGNORE INTO baidu_keyword
                 (keyword, parents_id, seed_id, seed_name)
                 VALUES (%s, %s, %s, %s)"""

        rows = [(word, parent_id, seed_id, seed_name) for word in words]
        inserted_count = db_manager.execute_many(sql, rows, autocommit=True)

        if inserted_count > 0:
            logger.info(f'成功插入 {inserted_count}/{len(words)} 个新关键词 (父ID:{parent_id}, 种子:{seed_name})')

    except Exception as e:
        logger.error(f'插入关键词异常:{e},父ID:{parent_id},种子:{seed_name}')
|
||||
|
||||
def get_seed_keywords_batch(batch_size=4):
    """Fetch a batch of not-yet-crawled seed keywords.

    Args:
        batch_size: maximum number of seed rows to fetch.

    Returns:
        list: rows of ``(id, keyword)`` tuples; empty list on failure
        or when nothing is pending.
    """
    try:
        # Bug fix: the original called `pymysql.connect(**DB_CONFIG)`, but
        # neither `pymysql` nor `DB_CONFIG` is imported/defined anywhere in
        # this module, so every call raised NameError. Use the shared
        # db_manager, consistent with every other query in this file.
        sql = "SELECT id, keyword FROM baidu_seed_keywords WHERE crawled=0 LIMIT %s"
        results = db_manager.execute_query(sql, (batch_size,))
        results = list(results) if results else []
        logger.info(f'从种子表获取到 {len(results)} 个未抓取的种子词')
        return results
    except Exception as e:
        logger.error(f'查询种子词失败:{e}')
        return []
|
||||
|
||||
|
||||
def get_keyword_info(keyword):
    """Look up the stored row for *keyword* in ``baidu_keyword``.

    Args:
        keyword: keyword string to look up.

    Returns:
        tuple: ``(id, keyword, parents_id, seed_id, seed_name)``, or
        ``None`` when absent or on query failure.
    """
    sql = "SELECT id, keyword, parents_id, seed_id, seed_name FROM baidu_keyword WHERE keyword=%s"
    try:
        return db_manager.execute_query(sql, (keyword,), fetch_one=True)
    except Exception as e:
        logger.error(f'查询关键词信息失败:{e}')
        return None
|
||||
|
||||
|
||||
def export_all_keywords(filename='seed_pool.txt'):
    """Dump every keyword from ``baidu_keyword`` to a text file, sorted.

    Args:
        filename: output path; one keyword per line, UTF-8 encoded.
    """
    try:
        sql = "SELECT keyword FROM baidu_keyword"
        results = db_manager.execute_query(sql)
        all_words = [row[0] for row in results] if results else []

        with open(filename, 'w', encoding='utf-8') as f:
            for word in sorted(all_words):
                f.write(word + '\n')
        # Bug fix: the success message previously logged a literal
        # "(unknown)" instead of the actual output path.
        logger.info(f'已导出 {len(all_words)} 个关键词到 {filename}')
    except Exception as e:
        logger.error(f'导出关键词失败:{e}')
|
||||
|
||||
def fetch_proxy(custom_logger=None):
    """Obtain one usable proxy mapping for `requests`.

    Tries the Damai proxy API first — a JSON payload
    ``{'code': 0, 'data': [{'ip': .., 'port': ..}]}``, falling back to a
    plain-text ``ip:port`` body — and on any failure picks a random
    entry from the fixed backup pool.

    Args:
        custom_logger: optional logger for multi-process workers;
            defaults to the module logger.

    Returns:
        dict: ``{'http': url, 'https': url}`` where the URL embeds the
        proxy credentials (``http://user:pass@host:port``).
    """
    log = custom_logger if custom_logger else logger
    try:
        # Ask the Damai API for a fresh IP.
        resp = requests.get(PROXY_API_URL, timeout=10)
        resp.raise_for_status()

        # Preferred format: JSON.
        try:
            result = resp.json()
            if result.get('code') == 0 and result.get('data'):
                ip_list = result['data']
                if ip_list and len(ip_list) > 0:
                    ip_info = ip_list[0]
                    ip_port = f"{ip_info['ip']}:{ip_info['port']}"
                    nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    log.info(f'提取大麦代理IP: {ip_port}_{nowtime}')

                    # Build the authenticated proxy URL.
                    proxy_url = f'http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{ip_info["ip"]}:{ip_info["port"]}'
                    return {
                        'http': proxy_url,
                        'https': proxy_url,
                    }
        except Exception:
            # Bug fix: this was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Not JSON — fall through to
            # the plain-text parser below.
            pass

        # Fallback format: plain text, first line is "ip:port".
        ip_port = resp.text.strip().split('\n', 1)[0]
        if ':' in ip_port:
            ip_message = resp.text.strip()
            nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            log.info(f'提取大麦代理IP: {ip_port}_{nowtime}')
            log.info(f'提取IPMessage: {ip_message}')

            host, port = ip_port.split(':', 1)
            proxy_url = f'http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{host}:{port}'
            return {
                'http': proxy_url,
                'https': proxy_url,
            }

        # Bug fix: previously an unparsable response fell off the end of
        # the function and implicitly returned None (i.e. no proxy at
        # all). Treat it as a failure so the backup pool is used, as the
        # docstring promises.
        raise ValueError(f'无法解析代理API响应: {ip_port!r}')
    except Exception as exc:  # noqa: BLE001
        log.warning(f'大麦代理API获取失败:{exc},使用备用固定代理池')
        # Pick a random backup proxy (with credential auth).
        import random
        backup_proxy = random.choice(BACKUP_PROXY_POOL)
        ip_port = backup_proxy['ip']
        username = backup_proxy['user']
        password = backup_proxy['password']

        # Authenticated proxy URL: http://username:password@host:port
        host, port = ip_port.split(':', 1)
        proxy_url = f'http://{username}:{password}@{host}:{port}'

        log.info(f'使用备用代理:{username}@{ip_port}')
        return {
            'http': proxy_url,
            'https': proxy_url,
        }
|
||||
|
||||
|
||||
def fetch_related_words(keyword, proxies=None):
    """Scrape Baidu search suggestions and related searches for *keyword*.

    Issues one GET to https://www.baidu.com/s with a captured browser
    cookie/header set, then parses two widgets out of the returned HTML:
    the suggestion list (div class ``list_1V4Yg``) and the
    related-search table (``rs-table_3RiQc``).

    Args:
        keyword: query term.
        proxies: optional `requests`-style proxies mapping.

    Returns:
        tuple: ``(result, related_search)`` — two lists of strings
        (either may be empty when the widget is missing).

    Raises:
        Network/timeout errors from `requests` propagate to the caller.
    """
    # Captured session cookies from a real browser visit.
    # NOTE(review): session tokens like H_PS_645EC/BA_HECTOR expire;
    # this will likely need periodic refreshing.
    cookies = {
        'PSTM': '1764302604',
        'BAIDUID': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'BD_HOME': '1',
        'H_PS_PSSID': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'delPer': '0',
        'BD_CK_SAM': '1',
        'PSINO': '3',
        'BAIDUID_BFESS': '17E56B6A4915D5B98222C8D7A7CFF059:FG=1',
        'PAD_BROWSER': '1',
        'BD_UPN': '12314753',
        'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
        'BA_HECTOR': 'ala10k2421a0ag25a5a40524a10l8n1kii7of24',
        'BIDUPSID': 'C047CB4D757AC8632D7B5792A4254C89',
        'ZFY': 'hLpeh2:BHPDeKfEN3yuM7C:A7dmFl03pP:AkeekLlPw5J4:C',
        'channel': 'baidusearch',
        'H_WISE_SIDS': '63140_64007_65866_66117_66218_66194_66236_66243_66168_66362_66281_66264_66393_66395_66479_66510_66529_66553_66589_66590_66602_66614_66647_66679_66692_66695_66687',
        'baikeVisitId': '7f510782-16ce-4371-ad1d-cc8c0ba5ccc8',
        'COOKIE_SESSION': '0_0_1_0_0_0_1_0_1_1_7462_1_0_0_0_0_0_0_1764302605%7C1%230_0_1764302605%7C1',
        'H_PS_645EC': 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY',
        'BDSVRTM': '16',
        'WWW_ST': '1764310101036',
    }

    from urllib.parse import quote
    # Browser-like headers; Referer mirrors a prior search for the same keyword.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': f'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd={quote(keyword)}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'is_xhr': '1',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Query string captured from a live request; `wd`/`oq`/`bs` carry the keyword.
    params = [
        ('ie', 'utf-8'),
        ('mod', '1'),
        ('isbd', '1'),
        ('isid', 'b857f27e00205440'),
        ('ie', 'utf-8'),
        ('f', '8'),
        ('rsv_bp', '1'),
        ('tn', 'baidu'),
        ('wd', keyword),
        ('oq', keyword),
        ('rsv_pq', 'b857f27e00205440'),
        ('rsv_t', 'c7554ktAJah5Z6fmLi0RDEpB3a2TvS0rgHEQ7JP12K2UeBuFhGHrlxODIbY'),
        ('rqlang', 'cn'),
        ('rsv_enter', '1'),
        ('rsv_dl', 'tb'),
        ('rsv_btype', 't'),
        ('inputT', '23852'),
        ('rsv_sug2', '0'),
        ('rsv_sug3', '15'),
        ('rsv_sug1', '23'),
        ('rsv_sug7', '100'),
        ('rsv_sug4', '23852'),
        ('bs', keyword),
        ('rsv_sid', 'undefined'),
        ('_ss', '1'),
        ('clist', 'ddad409c4a1855aa'),
        ('hsug', ''),
        ('f4s', '1'),
        ('csor', '3'),
        ('_cr1', '35542'),
    ]

    response = requests.get(
        'https://www.baidu.com/s',
        params=params,
        cookies=cookies,
        headers=headers,
        proxies=proxies,
        timeout=10,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # Widget 1: suggestion list — second <span> of each item holds the text.
    result = []
    div = soup.find('div', class_='list_1V4Yg')
    if div:
        for a in div.find_all('a', class_='item_3WKCf'):
            spans = a.find_all('span')
            if len(spans) > 1:
                result.append(spans[1].get_text(strip=True))

    # Widget 2: "related searches" table that follows the rs-label div.
    related_search = []
    rs_label = soup.find('div', class_='c-color-t rs-label_ihUhK')
    if rs_label:
        rs_table = rs_label.find_next('table', class_='rs-table_3RiQc')
        if rs_table:
            for a in rs_table.find_all('a', class_='rs-link_2DE3Q'):
                span = a.find('span', class_='rs-text_3K5mR')
                if span:
                    related_search.append(span.get_text(strip=True))
    return result, related_search
|
||||
|
||||
|
||||
def crawl_keyword_worker(keyword, parent_id, seed_id, seed_name, is_seed=False):
    """Worker that crawls one keyword and persists its related words.

    Fetches Baidu suggestions + related searches for *keyword*, inserts the
    deduplicated union into ``baidu_keyword`` with ``parents_id=parent_id``,
    then marks the source row as crawled.

    Args:
        keyword: the keyword to crawl
        parent_id: id of the current keyword row; becomes the parent id of
            every newly inserted child word
        seed_id: seed id propagated to all inserted rows
        seed_name: seed name propagated to all inserted rows
        is_seed: True when *keyword* lives in ``baidu_seed_keywords``
            (controls which table gets the crawled=1 mark)
    """
    # Validation: without seed bookkeeping the inserted rows could never be
    # traced back to a seed, so refuse to crawl.
    if not seed_id or seed_id == 0:
        logger.error(f'[数据错误] {keyword} 的 seed_id 为空或0,跳过抓取')
        return

    if not seed_name or seed_name.strip() == '':
        logger.error(f'[数据错误] {keyword} 的 seed_name 为空,跳过抓取')
        return

    logger.info(f'[进程启动] 正在抓取:{keyword} (ID作为新词父ID:{parent_id}, 种子ID:{seed_id}, 种子名:{seed_name})')

    try:
        proxies = fetch_proxy()
        words, related_search = fetch_related_words(keyword, proxies=proxies)
        # Deduplicate suggestions and related searches in one set.
        new_words = set(words + related_search)

        logger.info(f'[抓取结果] {keyword} -> 推荐词:{len(words)}个, 相关搜索:{len(related_search)}个')

        # Insert children: parents_id = current keyword's id; seed lineage is
        # carried through unchanged.
        if new_words:
            insert_keywords(new_words, parent_id, seed_id, seed_name)
            logger.info(f'[成功] {keyword} -> 抓取到 {len(new_words)} 个相关词并入库 (子词父ID:{parent_id}, 种子:{seed_name})')
        else:
            logger.warning(f'[空结果] {keyword} -> 未抓取到任何相关词')

        # Mark the source row as crawled. NOTE(review): matching by keyword
        # text (not id) updates every row with the same keyword — presumably
        # intended; confirm against the schema's uniqueness guarantees.
        if is_seed:
            # Seed table.
            sql = "UPDATE baidu_seed_keywords SET crawled=1 WHERE keyword=%s"
            db_manager.execute_update(sql, (keyword,), autocommit=True)
            logger.info(f'[标记完成] 种子词 {keyword} 已标记为已抓取')
        else:
            # Keyword table.
            sql = "UPDATE baidu_keyword SET crawled=1 WHERE keyword=%s"
            db_manager.execute_update(sql, (keyword,), autocommit=True)
            logger.info(f'[标记完成] 关键词 {keyword} 已标记为已抓取')

    except Exception as e:
        # On failure the row is NOT marked crawled here, so it may be retried
        # by a later pass.
        logger.error(f'[失败] 抓取失败:{keyword},种子:{seed_name},错误:{e}', exc_info=True)
|
||||
|
||||
|
||||
def process_seed_keywords():
    """Consume one batch of (at most two) seed keywords.

    Each seed is mirrored into ``baidu_keyword`` (crawled=0, parents_id=0)
    and then flagged as consumed in ``baidu_seed_keywords``.

    Returns:
        bool: True when a batch was processed, False when no seeds remained.
    """
    logger.info('\n========== 处理2个种子关键词 ==========')

    seeds = get_seed_keywords_batch(batch_size=2)
    if not seeds:
        logger.info('当前无未处理的种子词')
        return False

    logger.info(f'\n[批次开始] 获取到 {len(seeds)} 个种子词:{[s[1] for s in seeds]}')

    def _sync_one(seed_id, seed_keyword):
        # Mirror the seed into baidu_keyword, then mark the seed row done.
        insert_keywords(
            [seed_keyword],
            parent_id=0,
            seed_id=seed_id,
            seed_name=seed_keyword,
        )
        try:
            db_manager.execute_update(
                "UPDATE baidu_seed_keywords SET crawled=1 WHERE id=%s",
                (seed_id,),
                autocommit=True,
            )
            logger.info(f' ✓ 种子词已插入: {seed_keyword} (种子ID:{seed_id}, crawled=0, parents_id=0)')
        except Exception as e:
            logger.error(f'标记种子表失败:{e}')

    for seed_id, seed_keyword in seeds:
        _sync_one(seed_id, seed_keyword)

    logger.info(f'[批次完成] 本批 {len(seeds)} 个种子词已插入baidu_keyword表,等待process_layer_keywords()处理')
    return True
|
||||
|
||||
|
||||
def process_seed_worker(seed_id, seed_name):
    """Worker process for a single seed: sequentially crawl its keywords.

    Runs until no matching pending row remains for this seed. Each iteration
    picks one keyword, crawls it (with proxy retries), inserts its children,
    and updates its crawled state.

    Args:
        seed_id: seed id this worker is responsible for
        seed_name: seed display name (propagated to inserted rows and logs)
    """
    # Re-initialise the logger inside the child process so error logs land in
    # the correct files (handlers are not reliably inherited across fork/spawn).
    from log_config import setup_baidu_crawl_again_logger
    process_logger = setup_baidu_crawl_again_logger(force_reinit=True)

    process_logger.info(f'[进程启动] 种子ID:{seed_id}, 种子名:{seed_name} - 开始处理')

    while True:
        try:
            # Pick the next pending keyword for this seed (id ASC):
            # special case — crawled=2, parents_id=0 where the seed has
            # exactly ONE such row (it needs to be re-crawled).
            sql = """SELECT id, keyword, parents_id
                     FROM baidu_keyword
                     WHERE seed_id=%s
                       AND crawled=2
                       AND parents_id=0
                       AND (SELECT COUNT(*) FROM baidu_keyword WHERE seed_id=%s AND crawled=2 AND parents_id=0) = 1
                     ORDER BY id ASC
                     LIMIT 1"""

            record = db_manager.execute_query(sql, (seed_id, seed_id), fetch_one=True)

            if not record:
                # Nothing left to process: terminate the worker.
                process_logger.info(f'[种子{seed_id}] 没有待处理的记录,进程结束')
                break

            keyword_id, keyword, parent_id = record
            process_logger.info(f'[种子{seed_id}] 处理关键词: {keyword} (ID:{keyword_id}, 父ID:{parent_id})')

            try:
                # Crawl related keywords, retrying with a fresh proxy on failure.
                max_retries = 3
                retry_count = 0
                words, related_search = [], []

                while retry_count < max_retries:
                    try:
                        proxies = fetch_proxy(custom_logger=process_logger)
                        words, related_search = fetch_related_words(keyword, proxies=proxies)
                        break  # success: leave the retry loop
                    except Exception as retry_error:
                        retry_count += 1
                        if retry_count < max_retries:
                            process_logger.warning(f'[种子{seed_id}] {keyword} 第{retry_count}次尝试失败,重试中... 错误:{retry_error}')
                            time.sleep(2)  # brief back-off before retrying
                        else:
                            process_logger.error(f'[种子{seed_id}] {keyword} 经过{max_retries}次重试仍然失败,放弃该关键词')
                            raise  # out of retries: propagate to outer handler

                new_words = set(words + related_search)

                if new_words:
                    insert_keywords(new_words, keyword_id, seed_id, seed_name)
                    process_logger.info(f'[种子{seed_id}] {keyword} -> 抓取到 {len(new_words)} 个相关词')
                else:
                    process_logger.info(f'[种子{seed_id}] {keyword} -> 未抓取到相关词')

                # State transition: parents_id=0 rows get the temporary state
                # crawled=2; everything else is marked done (crawled=1).
                if parent_id == 0:
                    sql = "UPDATE baidu_keyword SET crawled=2 WHERE id=%s"
                    db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    process_logger.info(f'[种子{seed_id}] {keyword} parents_id=0,标记为crawled=2')
                else:
                    sql = "UPDATE baidu_keyword SET crawled=1 WHERE id=%s"
                    db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    process_logger.info(f'[种子{seed_id}] {keyword} 已标记为crawled=1')

            except Exception as e:
                process_logger.error(f'[种子{seed_id}] 抓取失败:{keyword},错误:{e}', exc_info=True)
                # Mark the row even on failure so the loop cannot get stuck
                # re-selecting the same keyword forever.
                try:
                    if parent_id == 0:
                        sql = "UPDATE baidu_keyword SET crawled=2 WHERE id=%s"
                        db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                    else:
                        sql = "UPDATE baidu_keyword SET crawled=1 WHERE id=%s"
                        db_manager.execute_update(sql, (keyword_id,), autocommit=True)
                except Exception as mark_error:
                    # Fix: was a bare `except:` that also swallowed
                    # SystemExit/KeyboardInterrupt; narrow it and log the
                    # failure instead of discarding it silently.
                    process_logger.warning(f'[种子{seed_id}] 标记失败状态时出错:{mark_error}')

            # Throttle before picking the next keyword.
            time.sleep(3)

        except Exception as e:
            process_logger.error(f'[种子{seed_id}] 进程异常:{e}', exc_info=True)
            time.sleep(3)

    process_logger.info(f'[进程结束] 种子ID:{seed_id}, 种子名:{seed_name} - 处理完成')
|
||||
|
||||
|
||||
def process_layer_keywords():
    """Monitor loop that dispatches per-seed worker processes.

    Repeatedly:
      1. queries for seeds whose single crawled=2, parents_id=0, seed_id>0
         row marks them as needing a re-crawl (at most 2 per batch),
      2. starts one ``process_seed_worker`` Process per seed,
      3. joins all workers before polling again.
    Sleeps 3s between empty polls or after an error; never returns.
    """
    logger.info('\n========== 开始循环监听处理关键词 ==========')

    while True:
        try:
            # Select up to 2 seeds having exactly one crawled=2/parents_id=0
            # row (the re-crawl special case), oldest seed first.
            sql = """SELECT t.seed_id, t.seed_name
                     FROM (
                         SELECT seed_id, seed_name
                         FROM baidu_keyword
                         WHERE crawled=2 AND parents_id=0 AND seed_id>0
                         GROUP BY seed_id, seed_name
                         HAVING COUNT(*) = 1
                     ) t
                     GROUP BY t.seed_id, t.seed_name
                     ORDER BY (SELECT MIN(created_at) FROM baidu_keyword WHERE seed_id=t.seed_id) ASC
                     LIMIT 2"""

            seed_groups = db_manager.execute_query(sql)

            if not seed_groups:
                logger.info('\n[监听中] 当前没有待处理的关键词,3秒后重试...')
                time.sleep(3)
                continue

            logger.info(f'\n[批次开始] 获取到 {len(seed_groups)} 个种子需要处理')

            # One worker process per seed; each worker drains its own seed.
            processes = []
            for seed_id, seed_name in seed_groups:
                logger.info(f' - 启动进程: 种子ID:{seed_id}, 种子名:{seed_name}')
                p = Process(target=process_seed_worker, args=(seed_id, seed_name))
                p.start()
                processes.append(p)

            # Block until the whole batch has finished before polling again.
            for p in processes:
                p.join()

            logger.info(f'[批次完成] {len(seed_groups)} 个种子处理完成,继续监听...')

        except Exception as e:
            logger.error(f'循环监听异常:{e}', exc_info=True)
            time.sleep(3)
|
||||
|
||||
|
||||
def main():
    """Entry point: log a startup banner and run the keyword monitor loop.

    The real work happens in ``process_layer_keywords()``, which loops
    forever dispatching per-seed worker processes. (An earlier design
    batched seed words here directly — see ``process_seed_keywords`` —
    but this entry point no longer drives that flow itself; seed intake
    is presumably handled by the companion baidu_seed script. TODO confirm.)
    """

    try:
        logger.info('='*70)
        logger.info('百度关键词抓取系统启动 - ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        logger.info('='*70)

        # Never returns under normal operation.
        process_layer_keywords()

        logger.info(f'\n{"-"*70}')
        logger.info(f'{"-"*70}\n')
    except Exception as e:
        logger.error(f'程序异常退出:{e}', exc_info=True)


if __name__ == '__main__':
    main()
|
||||
246
参考代码/ai_keyword_crawl/baidu_seed.py
Normal file
246
参考代码/ai_keyword_crawl/baidu_seed.py
Normal file
@@ -0,0 +1,246 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from multiprocessing import Process
|
||||
from log_config import setup_baidu_seed_logger
|
||||
from database_config import db_manager
|
||||
|
||||
# 使用统一的日志配置
|
||||
logger = setup_baidu_seed_logger()
|
||||
|
||||
|
||||
def init_db():
    """No-op kept for interface compatibility.

    Table creation was removed; connections are opened per-operation by
    ``db_manager``, so there is nothing to initialise here.
    """
    return None
|
||||
|
||||
def insert_keywords(words, parent_id=0, seed_id=0, seed_name=''):
    """Insert keywords into ``baidu_keyword`` (autocommit, row by row).

    Before inserting, at most ONE existing row with parents_id=0 for this
    seed_id is deleted (note the ``LIMIT 1``) — presumably the seed's own
    placeholder row; TODO confirm that intent against the schema.
    Inserts use plain ``INSERT`` (no IGNORE): duplicate keys raise and are
    logged as warnings, so partial batches are expected.

    Args:
        words: iterable of keyword strings
        parent_id: value stored in ``parents_id``
        seed_id: seed id stored on every row
        seed_name: seed name stored on every row
    """
    if not words:
        return

    try:
        # (1) Force-delete the parents_id=0 row for the current seed first.
        delete_sql = "DELETE FROM baidu_keyword WHERE parents_id=0 AND seed_id=%s LIMIT 1"
        deleted = db_manager.execute_update(delete_sql, (seed_id,), autocommit=True)
        if deleted > 0:
            logger.info(f'[删除旧记录] 种子ID:{seed_id}, parents_id=0, 删除{deleted}条')

        inserted_count = 0
        for word in words:
            # (2) Plain INSERT (no IGNORE): a duplicate raises and is logged.
            insert_sql = """INSERT INTO baidu_keyword
                            (keyword, parents_id, seed_id, seed_name)
                            VALUES (%s, %s, %s, %s)"""
            try:
                db_manager.execute_update(insert_sql, (word, parent_id, seed_id, seed_name), autocommit=True)
                inserted_count += 1
                logger.info(f'[插入成功] {word} (父ID:{parent_id}, 种子ID:{seed_id})')
            except Exception as insert_err:
                logger.warning(f'[插入失败] {word} - {insert_err}')

        if inserted_count > 0:
            logger.info(f'成功插入 {inserted_count}/{len(words)} 个新关键词 (父ID:{parent_id}, 种子:{seed_name})')

    except Exception as e:
        logger.error(f'插入关键词异常:{e},父ID:{parent_id},种子:{seed_name}')
|
||||
|
||||
def get_seed_keywords_batch(batch_size=4):
    """Fetch a batch of not-yet-crawled seed keywords.

    Only rows with ``crawled=0`` and ``status='ready'`` qualify.

    Args:
        batch_size: maximum number of rows to fetch.

    Returns:
        list: ``(id, keyword)`` tuples; empty list on error or no data.
    """
    query = "SELECT id, keyword FROM baidu_seed_keywords WHERE crawled=0 AND status = 'ready' LIMIT %s"
    try:
        rows = db_manager.execute_query(query, (batch_size,))
    except Exception as e:
        logger.error(f'查询种子词失败:{e}')
        return []
    return rows or []
|
||||
|
||||
|
||||
def get_keyword_info(keyword):
    """Look up one keyword row in ``baidu_keyword``.

    Args:
        keyword: keyword text to search for.

    Returns:
        tuple | None: ``(id, keyword, parents_id, seed_id, seed_name)``,
        or None when the row is missing or the query fails.
    """
    query = (
        "SELECT id, keyword, parents_id, seed_id, seed_name "
        "FROM baidu_keyword WHERE keyword=%s"
    )
    try:
        return db_manager.execute_query(query, (keyword,), fetch_one=True)
    except Exception as e:
        logger.error(f'查询关键词信息失败:{e}')
        return None
|
||||
|
||||
|
||||
def process_seed_keywords():
    """Consume one batch of (at most two) seed keywords.

    Each seed is mirrored into ``baidu_keyword`` (crawled=0, parents_id=0)
    and the seed row's status is set to 'doing' so it is not picked again
    by ``get_seed_keywords_batch`` (which filters on status='ready').

    Returns:
        bool: True when a batch was processed, False when no seeds remained.
    """
    logger.info('\n========== 处理2个种子关键词 ==========')

    # Fetch one batch (2 seeds).
    seeds = get_seed_keywords_batch(batch_size=2)

    if not seeds:
        logger.info('当前无未处理的种子词')
        return False

    logger.info(f'\n[批次开始] 获取到 {len(seeds)} 个种子词:{[s[1] for s in seeds]}')

    for seed_id, seed_keyword in seeds:
        # Mirror the seed into baidu_keyword with crawled=0, parents_id=0;
        # the seed acts as its own seed_name.
        insert_keywords(
            [seed_keyword],
            parent_id=0,
            seed_id=seed_id,
            seed_name=seed_keyword
        )

        # Flip the seed row to 'doing' so it is skipped on the next poll.
        try:
            sql = "UPDATE baidu_seed_keywords SET status='doing' WHERE id=%s"
            db_manager.execute_update(sql, (seed_id,), autocommit=True)

            logger.info(f' ✓ 种子词已插入: {seed_keyword} (种子ID:{seed_id}, crawled=0, parents_id=0, status=doing)')
        except Exception as e:
            logger.error(f'更新种子表状态失败:{e}')

    logger.info(f'[批次完成] 本批 {len(seeds)} 个种子词已插入baidu_keyword表,等待baidu_crawl.py处理')
    return True
|
||||
|
||||
|
||||
def check_and_mark_finished_seeds():
    """Detect finished seeds and finalise their state.

    A candidate seed (one having a parents_id=0, crawled=2 row) is finished
    when BOTH hold:
      1. it has no crawled=0 rows left, and
      2. its most recent crawled=1 row is older than 30 minutes.
    For each finished seed, the parents_id=0 crawled=2 rows become crawled=1
    and the ``baidu_seed_keywords`` row is set crawled=1 / status='finished'.

    Returns:
        int: number of seeds marked finished (0 on error or none).
    """
    try:
        # Candidate seeds: any seed with a parents_id=0, crawled=2 row.
        sql = """SELECT DISTINCT seed_id, seed_name
                 FROM baidu_keyword
                 WHERE parents_id=0 AND crawled=2 AND seed_id>0"""
        candidate_seeds = db_manager.execute_query(sql)

        if not candidate_seeds:
            return 0

        logger.info(f'\n[检查候选种子] 找到 {len(candidate_seeds)} 个候选种子,开始检查完成条件...')

        marked_count = 0
        for seed_id, seed_name in candidate_seeds:
            try:
                # Condition 1: no crawled=0 rows may remain for this seed.
                sql = "SELECT COUNT(*) FROM baidu_keyword WHERE seed_id=%s AND crawled=0"
                result = db_manager.execute_query(sql, (seed_id,), fetch_one=True)
                count_crawled_0 = result[0] if result else 0

                if count_crawled_0 > 0:
                    logger.info(f' - 种子{seed_id}({seed_name}) 还有 {count_crawled_0} 条crawled=0的记录,跳过')
                    continue

                # Condition 2: newest crawled=1 row must be >30 min old.
                sql = """SELECT created_at
                         FROM baidu_keyword
                         WHERE seed_id=%s AND crawled=1
                         ORDER BY created_at DESC
                         LIMIT 1"""
                last_record = db_manager.execute_query(sql, (seed_id,), fetch_one=True)

                if not last_record:
                    logger.warning(f' - 种子{seed_id}({seed_name}) 没有crawled=1的记录,跳过')
                    continue

                last_created_at = last_record[0]
                # NOTE(review): assumes created_at is a naive datetime in the
                # same local timezone as this host — confirm DB settings.
                time_diff = (datetime.now() - last_created_at).total_seconds()

                if time_diff <= 1800:  # 30 minutes = 1800 seconds
                    logger.info(f' - 种子{seed_id}({seed_name}) 最后一条记录时间差:{time_diff/60:.1f}分钟,未超过30分钟,跳过')
                    continue

                # Both conditions met: finalise this seed.
                logger.info(f' ✓ 种子{seed_id}({seed_name}) 满足完成条件:')
                logger.info(f'    - crawled=0记录数: 0')
                logger.info(f'    - 最后一条记录时间差: {time_diff/60:.1f}分钟')

                # Promote the parents_id=0 placeholder rows from 2 to 1.
                sql1 = "UPDATE baidu_keyword SET crawled=1 WHERE parents_id=0 AND seed_id=%s AND crawled=2"
                result_count = db_manager.execute_update(sql1, (seed_id,), autocommit=True)
                logger.info(f'    - 已将 {result_count} 条parents_id=0的记录从crawled=2更新为1')

                # Mirror the finished state into the seed table.
                sql2 = "UPDATE baidu_seed_keywords SET crawled=1, status='finished' WHERE id=%s"
                db_manager.execute_update(sql2, (seed_id,), autocommit=True)

                logger.info(f' ✓ 种子{seed_id}({seed_name}) 已标记为完成,status=finished')
                marked_count += 1

            except Exception as e:
                # Per-seed failure is isolated; remaining seeds still checked.
                logger.error(f' ✗ 种子{seed_id} 检查失败:{e}')

        return marked_count
    except Exception as e:
        logger.error(f'检查已完成种子失败:{e}')
        return 0
|
||||
|
||||
|
||||
def main():
    """Seed-keyword monitor loop.

    Each cycle:
      1. checks ``baidu_keyword`` and marks seeds that finished crawling
         (``check_and_mark_finished_seeds``),
      2. pulls up to 2 ready seeds from ``baidu_seed_keywords`` into
         ``baidu_keyword`` (``process_seed_keywords``),
      3. sleeps 10 seconds and repeats.

    Fix: the old log message/comments claimed a 3-second interval while the
    code slept 10 seconds; the messages now match the actual interval.
    Runs until KeyboardInterrupt or an unexpected exception.
    """
    logger.info('='*70)
    logger.info('百度种子词监听系统启动 - ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    logger.info('='*70)

    try:
        cycle_count = 0

        while True:
            cycle_count += 1
            logger.info(f'\n\n{"="*70}')
            logger.info(f'【第{cycle_count}轮循环】开始监听')
            logger.info(f'{"="*70}')

            # (2) Watch baidu_keyword: mark finished seeds first.
            marked_count = check_and_mark_finished_seeds()
            if marked_count > 0:
                logger.info(f'\n✅ 本轮共标记 {marked_count} 个种子为完成')

            # (1) Watch baidu_seed_keywords: ingest up to 2 seeds.
            # (Return value intentionally ignored; the loop runs regardless.)
            process_seed_keywords()

            # Wait 10 seconds before the next cycle.
            logger.info(f'\n{"="*70}')
            logger.info(f'【第{cycle_count}轮循环】完成,10秒后继续...')
            logger.info(f'{"="*70}\n')
            time.sleep(10)

    except KeyboardInterrupt:
        logger.info('\n接收到停止信号,程序退出')
    except Exception as e:
        logger.error(f'程序异常退出:{e}', exc_info=True)


if __name__ == '__main__':
    main()
|
||||
160
参考代码/ai_keyword_crawl/database_config.py
Normal file
160
参考代码/ai_keyword_crawl/database_config.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
数据库配置管理模块
|
||||
统一管理数据库连接和SQL操作
|
||||
"""
|
||||
import pymysql
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 数据库配置
|
||||
DB_CONFIG = {
|
||||
'host': '127.0.0.1',
|
||||
'user': 'root',
|
||||
'password': '123456',
|
||||
'database': 'ai_article',
|
||||
'charset': 'utf8mb4'
|
||||
}
|
||||
|
||||
|
||||
class DatabaseManager:
    """Thin pymysql wrapper: one short-lived connection per operation."""

    def __init__(self, config=None):
        """Initialise the manager.

        Args:
            config: connection-parameter dict; defaults to ``DB_CONFIG``.
        """
        self.config = config or DB_CONFIG

    def get_connection(self, autocommit=False):
        """Open a new database connection.

        Args:
            autocommit: enable MySQL autocommit mode on the connection.

        Returns:
            A new pymysql connection object (caller closes it).
        """
        return pymysql.connect(**self.config, autocommit=autocommit)

    def execute_query(self, sql, params=None, fetch_one=False):
        """Run a SELECT statement.

        Args:
            sql: SQL text with %s placeholders
            params: parameter tuple/list (optional)
            fetch_one: True -> single row, False -> all rows

        Returns:
            The fetched row(s); connection is always closed afterwards.

        Raises:
            Exception: re-raised after logging on any failure.
        """
        conn = None
        cursor = None
        try:
            conn = self.get_connection()
            cursor = conn.cursor()

            logger.info(f'[SQL] {sql.strip()} | params: {params}')
            cursor.execute(sql, params or ())

            if fetch_one:
                result = cursor.fetchone()
            else:
                result = cursor.fetchall()

            logger.debug(f'[SQL结果] 返回 {len(result) if not fetch_one and result else (1 if result else 0)} 条记录')
            return result
        except Exception as e:
            logger.error(f'执行查询失败:{e}', exc_info=True)
            raise
        finally:
            # Always release resources, even when the query raised.
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    def execute_update(self, sql, params=None, autocommit=True):
        """Run an INSERT/UPDATE/DELETE statement.

        Args:
            sql: SQL text with %s placeholders
            params: parameter tuple/list (optional)
            autocommit: when False, commit explicitly (rollback on error)

        Returns:
            int: number of affected rows.

        Raises:
            Exception: re-raised after rollback (if applicable) and logging.
        """
        conn = None
        cursor = None
        try:
            conn = self.get_connection(autocommit=autocommit)
            cursor = conn.cursor()

            logger.info(f'[SQL] {sql.strip()} | params: {params}')
            result = cursor.execute(sql, params or ())

            if not autocommit:
                conn.commit()

            logger.info(f'[SQL执行] 影响 {result} 行')
            return result
        except Exception as e:
            if not autocommit and conn:
                conn.rollback()
            logger.error(f'执行更新失败:{e}', exc_info=True)
            raise
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    def execute_many(self, sql, params_list, autocommit=True):
        """Run the same statement once per parameter set.

        Per-row failures are swallowed (logged at debug) so one bad row does
        not abort the batch; only setup/commit errors propagate.

        Args:
            sql: SQL text with %s placeholders
            params_list: list of parameter tuples, one per execution
            autocommit: when False, commit once at the end

        Returns:
            int: number of executions that reported affected rows (> 0).
        """
        conn = None
        cursor = None
        try:
            conn = self.get_connection(autocommit=autocommit)
            cursor = conn.cursor()

            logger.info(f'[SQL批量] {sql.strip()} | 批次数: {len(params_list)}')

            success_count = 0
            for params in params_list:
                try:
                    result = cursor.execute(sql, params)
                    if result > 0:
                        success_count += 1
                except Exception as e:
                    # Deliberate best-effort: skip the failing row and go on.
                    logger.debug(f'批量执行跳过:params={params},错误:{e}')

            if not autocommit:
                conn.commit()

            logger.info(f'[SQL批量执行] 成功 {success_count}/{len(params_list)} 条')
            return success_count
        except Exception as e:
            if not autocommit and conn:
                conn.rollback()
            logger.error(f'批量执行失败:{e}', exc_info=True)
            raise
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
|
||||
|
||||
|
||||
# 创建全局数据库管理器实例
|
||||
db_manager = DatabaseManager()
|
||||
85
参考代码/ai_keyword_crawl/debug_page.html
Normal file
85
参考代码/ai_keyword_crawl/debug_page.html
Normal file
File diff suppressed because one or more lines are too long
62
参考代码/ai_keyword_crawl/demo_python.py
Normal file
62
参考代码/ai_keyword_crawl/demo_python.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import requests
|
||||
import time
|
||||
import os
|
||||
|
||||
# 禁用系统代理环境变量
|
||||
# Disable any system-level proxy environment variables so the extraction
# request and the proxied request below are not routed through a local proxy.
os.environ['NO_PROXY'] = '*'
os.environ['no_proxy'] = '*'
if 'HTTP_PROXY' in os.environ:
    del os.environ['HTTP_PROXY']
if 'HTTPS_PROXY' in os.environ:
    del os.environ['HTTPS_PROXY']
if 'http_proxy' in os.environ:
    del os.environ['http_proxy']
if 'https_proxy' in os.environ:
    del os.environ['https_proxy']
if __name__ == '__main__':
    # Proxy-IP extraction endpoint (tianqiip): returns num=1 address as text.
    # Replace with your own purchased extraction link if needed.
    #url = 'http://api.tianqiip.com/getip?secret=ew9mj7j3yplbk3xb&num=1&type=txt&port=1&time=3&mr=1&sign=5451e454a54b9f1f06222606c418e12f'
    url = 'http://api.tianqiip.com/getip?secret=6o09i8io&num=1&type=txt&port=1&mr=1&sign=5451e454a54b9f1f06222606c418e12f'
    # Target URL used to test the proxy.
    targeturl = 'https://yunqueai.net/?rwl'
    # Fetch the proxy IP WITHOUT any local proxy (empty proxies mapping).
    response = requests.get(url, proxies={})
    content = response.content.decode("utf-8").strip()
    # Keep only the first line in case the API returns several addresses.
    content = content.split('\n')[0].strip()
    print('提取IP:' + content)
    nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('提取IP时间:' + nowtime)
    # Split "host:port" once; assumes the API returns that format — TODO
    # confirm, a malformed response would raise IndexError below.
    sj = content.strip().split(":", 1)
    sj1 = sj[0]
    print("IP:", sj1)
    sj2 = sj[1]
    print("端口:", sj2)
    try:
        # Credential-authenticated variant (requires a plan that supports
        # user/password auth; no IP allow-listing needed):
        #proxyMeta = "http://nfd0p2:bHQAp5iW@%(host)s:%(port)s" % {
        proxyMeta = "http://%(host)s:%(port)s" % {  # allow-list (whitelist) auth
            "host": sj1,
            "port": sj2,
        }
        print("代理1:", proxyMeta)
        proxysdata = {
            'http': proxyMeta,
            'https': proxyMeta
        }
        print("代理2:", proxysdata)
        headers = {
            "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.'
        }
        # Time the proxied request in milliseconds.
        start = int(round(time.time() * 1000))
        resp = requests.get(targeturl, proxies=proxysdata, headers=headers, timeout=20)
        costTime = int(round(time.time() * 1000)) - start
        print("耗时:" + str(costTime) + "ms")
        print("返回:",resp.text)
        s = requests.session()
        s.keep_alive = False
    except Exception as e:
        print(e)
|
||||
|
||||
|
||||
|
||||
|
||||
337
参考代码/ai_keyword_crawl/log_config.py
Normal file
337
参考代码/ai_keyword_crawl/log_config.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
统一日志配置模块
|
||||
提供按日期自动切割日志文件的功能
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import sys
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from datetime import datetime
|
||||
|
||||
def setup_logger(name, log_file, error_log_file=None, level=logging.INFO,
                 backup_count=30, error_backup_count=90, console_output=True, force_reinit=False):
    """
    Create/configure a logger with daily (midnight) file rotation.

    Args:
        name: logger name
        log_file: main log file path
        error_log_file: optional separate ERROR-level log file path
        level: logging level for the main handlers
        backup_count: days of rotated main-log files to keep
        error_backup_count: days of rotated error-log files to keep
        console_output: also attach a stdout handler
        force_reinit: drop any existing handlers and rebuild them (needed
            in child processes)

    Returns:
        logging.Logger: the configured logger (idempotent unless force_reinit)
    """
    # Ensure the logs directory exists.
    log_dir = os.path.dirname(log_file)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Get or create the named logger.
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Re-initialise when forced or when the logger has no handlers yet.
    need_reinit = force_reinit or not logger.handlers

    # On forced reinit, detach every existing handler first.
    if force_reinit and logger.handlers:
        print(f"强制重新初始化日志记录器: {name}")
        for handler in logger.handlers[:]:  # iterate over a copy
            logger.removeHandler(handler)
        need_reinit = True

    # Attach fresh handlers only when needed (keeps setup idempotent).
    if need_reinit:
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # 1. Main file handler — rotates at midnight, keeps backup_count days.
        file_handler = TimedRotatingFileHandler(
            filename=log_file,
            when='midnight',          # rotate at midnight
            interval=1,               # once per day
            backupCount=backup_count, # retention in days
            encoding='utf-8'
        )
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)

        # Rotated files are named filename.log.YYYY-MM-DD.
        file_handler.suffix = "%Y-%m-%d"

        # Identity namer kept as an explicit extension point for renaming
        # rotated files.
        def namer(default_name):
            return default_name
        file_handler.namer = namer

        logger.addHandler(file_handler)

        # 2. Optional ERROR-only file handler with longer retention.
        if error_log_file:
            error_file_handler = TimedRotatingFileHandler(
                filename=error_log_file,
                when='midnight',
                interval=1,
                backupCount=error_backup_count,
                encoding='utf-8'
            )
            error_file_handler.setLevel(logging.ERROR)
            error_file_handler.setFormatter(formatter)
            error_file_handler.suffix = "%Y-%m-%d"
            error_file_handler.namer = namer
            logger.addHandler(error_file_handler)

        # 3. Optional console handler (shorter format, stdout).
        if console_output:
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setLevel(level)
            console_formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                datefmt='%H:%M:%S'
            )
            console_handler.setFormatter(console_formatter)
            logger.addHandler(console_handler)

        # Quiet noisy third-party loggers.
        logging.getLogger('requests').setLevel(logging.WARNING)
        logging.getLogger('urllib3').setLevel(logging.WARNING)
        logging.getLogger('whoosh').setLevel(logging.WARNING)

        # Announce the effective configuration.
        logger.info(f"日志系统已启动 - 记录器: {name}")
        logger.info(f"主日志文件: {log_file}")
        if error_log_file:
            logger.info(f"错误日志文件: {error_log_file}")
        logger.info(f"日志保留策略: 每天午夜分割,主日志保留{backup_count}天")
        if error_log_file:
            logger.info(f"错误日志保留策略: 每天午夜分割,保留{error_backup_count}天")

    return logger
|
||||
|
||||
def setup_curl_convert_logger(force_reinit=False):
    """Logger for curl_convert.py (30/90-day retention)."""
    return setup_logger(
        name='curl_convert',
        log_file='logs/curl_convert.log',
        error_log_file='logs/curl_convert_error.log',
        level=logging.INFO,
        backup_count=30,
        error_backup_count=90,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_article_server_logger(force_reinit=False):
    """Logger for flask_article_server.py (3/9-day retention)."""
    return setup_logger(
        name='article_server',
        log_file='logs/article_server.log',
        error_log_file='logs/article_error.log',
        level=logging.INFO,
        backup_count=3,
        error_backup_count=9,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_article_server_search_logger(force_reinit=False):
    """Logger for flask_article_server_search.py (3/9-day retention)."""
    return setup_logger(
        name='article_server_search',
        log_file='logs/article_server_search.log',
        error_log_file='logs/article_server_search_error.log',
        level=logging.INFO,
        backup_count=3,
        error_backup_count=9,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_aiarticle_server_logger(force_reinit=False):
    """Logger for flask_aiarticle_server.py (30/90-day retention)."""
    return setup_logger(
        name='aiarticle_server',
        log_file='logs/aiarticle_server.log',
        error_log_file='logs/aiarticle_server_error.log',
        level=logging.INFO,
        backup_count=30,
        error_backup_count=90,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_whoosh_search_tags_logger(force_reinit=False):
    """Logger for whoosh_search_tags.py (30/90-day retention)."""
    return setup_logger(
        name='whoosh_search_tags',
        log_file='logs/whoosh_search_tags.log',
        error_log_file='logs/whoosh_search_tags_error.log',
        level=logging.INFO,
        backup_count=30,
        error_backup_count=90,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_baidu_crawl_logger(force_reinit=False):
    """Logger for baidu_crawl.py (3/3-day retention)."""
    return setup_logger(
        name='baidu_crawl',
        log_file='logs/baidu_crawl.log',
        error_log_file='logs/baidu_crawl_error.log',
        level=logging.INFO,
        backup_count=3,
        error_backup_count=3,
        console_output=True,
        force_reinit=force_reinit
    )

def setup_baidu_seed_logger(force_reinit=False):
    """Logger for baidu_seed.py (3/3-day retention)."""
    return setup_logger(
        name='baidu_seed',
        log_file='logs/baidu_seed.log',
        error_log_file='logs/baidu_seed_error.log',
        level=logging.INFO,
        backup_count=3,
        error_backup_count=3,
        console_output=True,
        force_reinit=force_reinit
    )
|
||||
|
||||
def setup_baidu_crawl_again_logger(force_reinit=False):
    """Logger for the 'baidu_crawl_again' log files.

    Used by the per-seed worker processes (e.g. ``process_seed_worker`` in
    baidu_crawl.py) which re-initialise logging inside the child process.
    (The previous docstring said "baidu_seed.py" — a copy-paste error.)
    """
    return setup_logger(
        name='baidu_crawl_again',
        log_file='logs/baidu_crawl_again.log',
        error_log_file='logs/baidu_crawl_again_error.log',
        level=logging.INFO,
        backup_count=3,
        error_backup_count=3,
        console_output=True,
        force_reinit=force_reinit
    )
|
||||
|
||||
def reinitialize_all_loggers():
    """Force-reinitialise every logger defined in this module.

    Fix: ``setup_baidu_crawl_again_logger`` was missing from the list, so
    "all loggers" was inaccurate; it is now included.
    """
    print("重新初始化所有日志记录器...")

    setup_curl_convert_logger(force_reinit=True)
    setup_article_server_logger(force_reinit=True)
    setup_article_server_search_logger(force_reinit=True)
    setup_aiarticle_server_logger(force_reinit=True)
    setup_whoosh_search_tags_logger(force_reinit=True)
    setup_baidu_crawl_logger(force_reinit=True)
    setup_baidu_seed_logger(force_reinit=True)
    setup_baidu_crawl_again_logger(force_reinit=True)

    print("所有日志记录器重新初始化完成")
|
||||
|
||||
def cleanup_old_logs(log_dir='logs', days_to_keep=30):
    """
    Delete log files older than *days_to_keep* days.

    Age is judged by the file's modification time, so freshly-touched files
    survive regardless of the date suffix in their name.

    Args:
        log_dir: directory holding the log files
        days_to_keep: retention window in days
    """
    import glob
    from datetime import datetime, timedelta

    if not os.path.exists(log_dir):
        return

    cutoff_date = datetime.now() - timedelta(days=days_to_keep)

    # Match both rotated ('*.log.YYYY-MM-DD') and current ('*.log') files.
    log_patterns = [
        os.path.join(log_dir, '*.log.*'),  # rotated log files
        os.path.join(log_dir, '*.log')     # current log files
    ]

    for pattern in log_patterns:
        for log_file in glob.glob(pattern):
            try:
                # Compare mtime against the retention cutoff.
                file_mtime = datetime.fromtimestamp(os.path.getmtime(log_file))
                if file_mtime < cutoff_date:
                    os.remove(log_file)
                    print(f"已删除旧日志文件: {log_file}")
            except Exception as e:
                # Best effort: report and continue with the next file.
                print(f"删除日志文件失败 {log_file}: {e}")
|
||||
|
||||
def get_log_file_info(log_dir='logs'):
    """
    Collect size and modification-time metadata for *.log files.

    Args:
        log_dir: Directory to scan (non-recursive; only files ending in
            '.log' are considered, not rotated '*.log.N' files).

    Returns:
        dict: filename -> {'size', 'size_mb', 'modified', 'path'}, or
        {'error': str} for files that could not be stat'ed. Empty dict when
        the directory does not exist.
    """
    # BUG FIX: import datetime locally. The sibling cleanup_old_logs imports
    # it inside the function body, which suggests it is not guaranteed to be
    # available at module level — without this, the call below could raise
    # NameError.
    from datetime import datetime

    if not os.path.exists(log_dir):
        return {}

    log_info = {}

    for filename in os.listdir(log_dir):
        if not filename.endswith('.log'):
            continue
        file_path = os.path.join(log_dir, filename)
        try:
            size = os.path.getsize(file_path)
            mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
            log_info[filename] = {
                'size': size,
                'size_mb': round(size / (1024 * 1024), 2),
                'modified': mtime.strftime('%Y-%m-%d %H:%M:%S'),
                'path': file_path
            }
        except Exception as e:
            # Best-effort: record the failure instead of aborting the scan.
            log_info[filename] = {'error': str(e)}

    return log_info
|
||||
|
||||
if __name__ == "__main__":
    # Smoke-test the logging configuration: initialize each logger and emit
    # one INFO record, then dump the resulting log-file metadata.
    print("测试日志配置...")

    logger1 = setup_curl_convert_logger()
    logger1.info("curl_convert 日志测试")

    logger2 = setup_article_server_logger()
    logger2.info("article_server 日志测试")

    logger3 = setup_article_server_search_logger()
    logger3.info("article_server_search 日志测试")

    logger4 = setup_aiarticle_server_logger()
    logger4.info("aiarticle_server 日志测试")

    logger5 = setup_whoosh_search_tags_logger()
    logger5.info("whoosh_search_tags 日志测试")

    # Show the current log-file inventory.
    print("\n当前日志文件信息:")
    log_info = get_log_file_info()
    for filename, info in log_info.items():
        if 'error' not in info:
            # BUG FIX: the loop variable `filename` was never interpolated —
            # the f-strings printed a literal placeholder for every file.
            print(f"{filename}: {info['size_mb']}MB, 修改时间: {info['modified']}")
        else:
            print(f"{filename}: 错误 - {info['error']}")

    print("\n日志配置测试完成!")
|
||||
32
参考代码/ai_keyword_crawl/scrapy_proxy.py
Normal file
32
参考代码/ai_keyword_crawl/scrapy_proxy.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import scrapy
|
||||
|
||||
class MimvpSpider(scrapy.spiders.Spider):
    """Spider that hits mimvp.com's proxy-check endpoint through fixed proxies.

    The original comments were mojibake (GBK read as another encoding); they
    appear to label the two addresses below as the HTTP and HTTPS proxies —
    treat the specific IPs as sample values.
    """

    name = "mimvp"
    allowed_domains = ["mimvp.com"]
    start_urls = [
        "http://proxy.mimvp.com/exist.php",
        "https://proxy.mimvp.com/exist.php",
    ]

    # Usage style 1: set the proxy directly on each request's meta.
    def start_requests(self):
        # Map URL scheme prefix -> proxy to route that request through.
        scheme_proxies = {
            "http://": "http://180.96.27.12:88",        # proxy for the plain-HTTP target
            "https://": "http://109.108.87.136:53281",  # proxy for the HTTPS target
        }
        targets = (
            "http://proxy.mimvp.com/exist.php",
            "https://proxy.mimvp.com/exist.php",
        )
        for url in targets:
            proxy = next(
                (p for prefix, p in scheme_proxies.items() if url.startswith(prefix)),
                "",
            )
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy})

    def parse(self, response):
        """Print the fetched URL and raw body so the proxy check result is visible."""
        fetched_url = response.url   # URL of the response just fetched
        raw_body = response.body     # raw page bytes

        print("mimvp_url : " + str(fetched_url))
        print("body : " + str(raw_body))
|
||||
Reference in New Issue
Block a user