99 lines
3.1 KiB
Python
99 lines
3.1 KiB
Python
|
|
"""
|
|||
|
|
测试百度搜索爬虫
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from loguru import logger
|
|||
|
|
from baidu_crawler import BaiduSearchCrawler
|
|||
|
|
from db_manager import QueryTaskManager
|
|||
|
|
from datetime import datetime
|
|||
|
|
import sys
|
|||
|
|
|
|||
|
|
# Replace loguru's default sink with a compact, colourised stdout sink.
logger.remove()
LOG_FORMAT = (
    "<green>{time:HH:mm:ss}</green> | "
    "<level>{level: <8}</level> | "
    "<level>{message}</level>"
)
logger.add(sys.stdout, format=LOG_FORMAT)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_single_query():
    """Crawl one query word with the Baidu crawler and print a result summary.

    Runs with a visible browser window (headless=False) so the scrolling
    behaviour can be observed; the crawler scrolls until no new content
    appears or the result threshold is reached.
    """
    banner = "=" * 70
    print(banner)
    print(" 测试爬取单个查询词")
    print(banner)

    # headless=False keeps the browser visible for manual inspection.
    crawler = BaiduSearchCrawler(headless=False)

    # One sample query, capped at 50 results via threshold_max.
    result = crawler.crawl_query("糖尿病治疗", category="医疗", threshold_max=50)

    # Print each field of the crawl result on its own line.
    print("\n爬取结果:")
    summary_fields = (
        ("查询词", "query_word"),
        ("是否成功", "success"),
        ("爬取数量", "crawled_count"),
        ("有效数量", "valid_count"),
        ("新增数量", "new_count"),
    )
    for label, key in summary_fields:
        print(f" {label}: {result[key]}")
    if result['error']:
        print(f" 错误信息: {result['error']}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_batch_crawl():
    """Seed a few sample tasks for today, then run the batch crawler on them.

    Creates three tasks with individual result thresholds, crawls up to
    three pending tasks, and prints the aggregate statistics.
    """
    banner = "=" * 70
    print(banner)
    print(" 测试批量爬取任务")
    print(banner)

    # Seed the task queue, dated today (YYYYMMDD).
    task_mgr = QueryTaskManager()
    today = datetime.now().strftime('%Y%m%d')

    # (query, query_type, category, priority, per-task result threshold)
    sample_tasks = (
        ("高血压怎么治疗", "keyword", "医疗", 3, 30),
        ("在线教育平台哪个好", "phrase", "教育", 5, 20),
        ("免费法律咨询", "keyword", "法律", 4, 25),
    )

    logger.info("创建测试任务...")
    for query, qtype, category, priority, threshold in sample_tasks:
        task_mgr.create_task(
            query_word=query,
            query_type=qtype,
            task_date=today,
            threshold_max=threshold,  # each task carries its own cap
            priority=priority,
            category=category,
            remark="测试任务",
        )

    print()

    # Crawl up to 3 pending tasks; scrolling stops once a task's
    # threshold is reached.
    crawler = BaiduSearchCrawler(headless=False)
    stats = crawler.crawl_tasks(limit=3)

    # Print aggregate statistics, one field per line.
    print("\n批量爬取统计:")
    stat_fields = (
        ("总任务数", "total_tasks"),
        ("成功", "success_count"),
        ("失败", "failed_count"),
        ("总爬取", "total_crawled"),
        ("新增保存", "total_saved"),
    )
    for label, key in stat_fields:
        print(f" {label}: {stats[key]}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    import argparse

    # CLI: --mode selects which test scenario to run.
    parser = argparse.ArgumentParser(description='测试百度搜索爬虫')
    parser.add_argument('--mode', choices=['single', 'batch'], default='single',
                        help='测试模式:single=单个查询, batch=批量任务')
    args = parser.parse_args()

    # Pick the test function for the requested mode, then run it,
    # logging and dumping the traceback on any failure.
    runner = test_single_query if args.mode == 'single' else test_batch_crawl
    try:
        runner()
    except Exception as e:
        logger.error(f"测试失败: {str(e)}")
        import traceback
        traceback.print_exc()
|