"""
Tests for the Baidu search crawler (百度搜索爬虫).
"""
|
||
|
||
from loguru import logger
from baidu_crawler import BaiduSearchCrawler
from db_manager import QueryTaskManager
from datetime import datetime
import sys
||
logger.remove()
|
||
logger.add(sys.stdout, format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>")
|
||
|
||
|
||
def test_single_query():
    """Crawl a single query word and print a summary of the result.

    Drives ``BaiduSearchCrawler.crawl_query`` with one hard-coded medical
    query and prints the counts reported back. Purely interactive — there
    is no return value and no assertion.
    """
    print("="*70)
    print(" 测试爬取单个查询词")
    print("="*70)

    # headless=False lets you watch the browser; the crawler scrolls
    # automatically until no new content appears (per the crawler's docs).
    crawler = BaiduSearchCrawler(headless=False)

    # Single test query with a result threshold of 50.
    query_word = "糖尿病治疗"
    result = crawler.crawl_query(query_word, category="医疗", threshold_max=50)

    print("\n爬取结果:")
    print(f" 查询词: {result['query_word']}")
    print(f" 是否成功: {result['success']}")
    print(f" 爬取数量: {result['crawled_count']}")
    print(f" 有效数量: {result['valid_count']}")
    print(f" 新增数量: {result['new_count']}")
    # Use .get() so a success result that omits the 'error' key does not
    # raise KeyError (the original indexed result['error'] directly).
    if result.get('error'):
        print(f" 错误信息: {result['error']}")
|
||
|
||
|
||
def test_batch_crawl():
    """Seed a few query tasks for today, then run the batch crawler over them.

    Creates three test tasks (each with its own threshold) via
    ``QueryTaskManager`` and hands them to ``BaiduSearchCrawler.crawl_tasks``,
    printing the aggregate statistics. Interactive only — no return value.
    """
    banner = "=" * 70
    print(banner)
    print(" 测试批量爬取任务")
    print(banner)

    # Seed the task table with a handful of test queries dated today.
    manager = QueryTaskManager()
    today = datetime.now().strftime('%Y%m%d')

    # (query word, query type, category, priority, threshold_max)
    seed_tasks = [
        ("高血压怎么治疗", "keyword", "医疗", 3, 30),
        ("在线教育平台哪个好", "phrase", "教育", 5, 20),
        ("免费法律咨询", "keyword", "法律", 4, 25),
    ]

    logger.info("创建测试任务...")
    for word, word_type, cat, prio, cap in seed_tasks:
        manager.create_task(
            query_word=word,
            query_type=word_type,
            task_date=today,
            threshold_max=cap,  # each task carries its own threshold
            priority=prio,
            category=cat,
            remark="测试任务",
        )

    print()

    # Run the batch crawl; each page is scrolled until its threshold is hit.
    spider = BaiduSearchCrawler(headless=False)
    stats = spider.crawl_tasks(limit=3)

    print("\n批量爬取统计:")
    print(f" 总任务数: {stats['total_tasks']}")
    print(f" 成功: {stats['success_count']}")
    print(f" 失败: {stats['failed_count']}")
    print(f" 总爬取: {stats['total_crawled']}")
    print(f" 新增保存: {stats['total_saved']}")
|
||
|
||
|
||
if __name__ == "__main__":
    import argparse

    # --mode selects which scenario to run; defaults to the single-query test.
    parser = argparse.ArgumentParser(description='测试百度搜索爬虫')
    parser.add_argument('--mode', choices=['single', 'batch'], default='single',
                        help='测试模式:single=单个查询, batch=批量任务')

    args = parser.parse_args()

    try:
        if args.mode == 'single':
            test_single_query()
        else:
            test_batch_crawl()
    except Exception as e:
        # loguru's exception() logs the message AND the full traceback in one
        # call, replacing the original logger.error + traceback.print_exc pair.
        logger.exception(f"测试失败: {str(e)}")
|