Files
ai_mip/test_baidu_crawler.py
2026-01-21 14:33:10 +08:00

99 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
测试百度搜索爬虫
"""
from loguru import logger
from baidu_crawler import BaiduSearchCrawler
from db_manager import QueryTaskManager
from datetime import datetime
import sys
logger.remove()
logger.add(sys.stdout, format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>")
def test_single_query():
    """Crawl a single query word and print a summary of the result dict.

    Uses a visible (non-headless) browser so the run can be watched; the
    crawler scrolls automatically until no new content appears or the
    threshold is reached.
    """
    print("="*70)
    print(" 测试爬取单个查询词")
    print("="*70)
    # headless=False opens a visible browser window for manual inspection.
    crawler = BaiduSearchCrawler(headless=False)
    # Single test query with a result threshold of 50.
    query_word = "糖尿病治疗"
    result = crawler.crawl_query(query_word, category="医疗", threshold_max=50)
    print("\n爬取结果:")
    print(f" 查询词: {result['query_word']}")
    print(f" 是否成功: {result['success']}")
    print(f" 爬取数量: {result['crawled_count']}")
    print(f" 有效数量: {result['valid_count']}")
    print(f" 新增数量: {result['new_count']}")
    # .get() keeps the summary printable even if 'error' is absent from the dict.
    if result.get('error'):
        print(f" 错误信息: {result['error']}")
def test_batch_crawl():
    """Seed a few test tasks, run a batch crawl over them, and print stats.

    Each seeded task carries its own result threshold; the crawler scrolls
    each query until that threshold is met.
    """
    print("="*70)
    print(" 测试批量爬取任务")
    print("="*70)
    # Seed the task queue with sample queries for today's date.
    task_mgr = QueryTaskManager()
    task_date = datetime.now().strftime('%Y%m%d')
    # (query_word, query_type, category, priority, threshold_max)
    test_queries = [
        ("高血压怎么治疗", "keyword", "医疗", 3, 30),   # threshold 30
        ("在线教育平台哪个好", "phrase", "教育", 5, 20),  # threshold 20
        ("免费法律咨询", "keyword", "法律", 4, 25),      # threshold 25
    ]
    logger.info("创建测试任务...")
    for query, qtype, category, priority, threshold in test_queries:
        task_mgr.create_task(
            query_word=query,
            query_type=qtype,
            task_date=task_date,
            threshold_max=threshold,  # per-task threshold
            priority=priority,
            category=category,
            remark="测试任务"
        )
    print()
    # Run the batch crawl over (at most) the 3 tasks just created.
    crawler = BaiduSearchCrawler(headless=False)
    stats = crawler.crawl_tasks(limit=3)
    print("\n批量爬取统计:")
    print(f" 总任务数: {stats['total_tasks']}")
    print(f" 成功: {stats['success_count']}")
    print(f" 失败: {stats['failed_count']}")
    print(f" 总爬取: {stats['total_crawled']}")
    print(f" 新增保存: {stats['total_saved']}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='测试百度搜索爬虫')
parser.add_argument('--mode', choices=['single', 'batch'], default='single',
help='测试模式single=单个查询, batch=批量任务')
args = parser.parse_args()
try:
if args.mode == 'single':
test_single_query()
else:
test_batch_crawl()
except Exception as e:
logger.error(f"测试失败: {str(e)}")
import traceback
traceback.print_exc()