This commit is contained in:
sjk
2026-02-24 12:46:35 +08:00
parent be0c13e1a6
commit 85224e01e6
116 changed files with 8380 additions and 9642 deletions

277
main.py
View File

@@ -28,6 +28,7 @@ import signal
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Dict, List
@@ -108,6 +109,10 @@ MIP广告点击服务
self.total_clicks_today = 0
self.error_count = 0
# 线程安全锁(用于并发执行)
self._click_records_lock = threading.Lock()
self._stats_lock = threading.Lock()
# 健康检查API
self.health_app = Flask(__name__)
self.health_app.logger.disabled = True # 禁用Flask日志
@@ -116,10 +121,10 @@ MIP广告点击服务
logger.info(f"调度器初始化完成")
logger.info(f"工作时间: {self.work_start_hour:02d}:00 - {self.work_end_hour:02d}:00")
logger.info(f"点击间隔: {self.click_interval_minutes} 分钟")
logger.info(f"并发数: {max_workers}")
logger.info(f"并发模式: {'启用' if max_workers > 1 else '禁用'} (最大并发数: {max_workers})")
def _setup_health_api(self):
"""配置健康检查API"""
"""配置健康检查API和调度器控制API"""
@self.health_app.route('/health', methods=['GET'])
def health_check():
"""健康检查端点"""
@@ -140,6 +145,44 @@ MIP广告点击服务
'work_hours': f"{self.work_start_hour:02d}:00-{self.work_end_hour:02d}:00",
'is_working_time': self.is_working_time()
})
@self.health_app.route('/scheduler/status', methods=['GET'])
def scheduler_status():
    """Return the scheduler's current status for the remote web service.

    Returns:
        JSON with ``success`` and a ``data`` payload: run state, whether
        we are inside working hours, and aggregate click statistics.
    """
    # The Flask health API serves from its own thread while click workers
    # mutate self.click_records / self.error_count concurrently, so take a
    # consistent snapshot under the locks declared in __init__.
    with self._click_records_lock:
        total_sites = len(self.click_records)
        completed_sites = sum(
            1 for r in self.click_records.values()
            if r['today_count'] >= r['target_count']
        )
        total_clicks_today = sum(
            r['today_count'] for r in self.click_records.values()
        )
    with self._stats_lock:
        error_count = self.error_count
    return jsonify({
        'success': True,
        'data': {
            'status': 'running' if self.running else 'stopped',
            'is_working_time': self.is_working_time(),
            'total_sites': total_sites,
            'completed_sites': completed_sites,
            'total_clicks_today': total_clicks_today,
            'error_count': error_count
        }
    })
@self.health_app.route('/scheduler/start', methods=['POST'])
def scheduler_start():
    """Start the scheduler on behalf of the remote web service."""
    if not self.running:
        # Re-initialize daily state, then flip the run flag.
        self.running = True
        self.start_time = datetime.now()
        self.reset_daily_records()
        logger.info("调度器已通过远程API启动")
        return jsonify({'success': True, 'message': '调度器已启动'})
    # Already active: report success without touching any state.
    return jsonify({'success': True, 'message': '调度器已在运行中'})
@self.health_app.route('/scheduler/stop', methods=['POST'])
def scheduler_stop():
    """Stop the scheduler on behalf of the remote web service."""
    if self.running:
        self.running = False
        logger.info("调度器已通过远程API停止")
        return jsonify({'success': True, 'message': '调度器已停止'})
    # Nothing to do: the scheduler was not running.
    return jsonify({'success': True, 'message': '调度器未运行'})
def _acquire_lock(self) -> bool:
"""
@@ -222,7 +265,8 @@ MIP广告点击服务
'last_click': None,
'today_count': 0,
'target_count': target_count,
'site_url': site.get('site_url')
'site_url': site.get('site_url'),
'click_count': site.get('click_count', 0) # 历史总点击次数
}
logger.info(f"站点 {site_id}: {site.get('site_url')} - 今日目标 {target_count}")
@@ -233,12 +277,40 @@ MIP广告点击服务
获取待点击的站点列表
Returns:
待点击的站点列表
待点击的站点列表(优先返回今日未点击的站点)
"""
if not self.click_records:
logger.warning("点击记录为空,执行重置")
self.reset_daily_records()
# 动态检测新导入的站点
current_sites = self.dm.get_active_urls()
current_site_ids = {site.get('id') for site in current_sites}
existing_site_ids = set(self.click_records.keys())
# 发现新站点,自动添加到点击记录
new_site_ids = current_site_ids - existing_site_ids
if new_site_ids:
logger.info(f"发现 {len(new_site_ids)} 个新导入的站点,加入点击队列")
for site in current_sites:
site_id = site.get('id')
if site_id in new_site_ids:
target_count = random.randint(Config.MIN_CLICK_COUNT, Config.MAX_CLICK_COUNT)
self.click_records[site_id] = {
'last_click': None,
'today_count': 0,
'target_count': target_count,
'site_url': site.get('site_url'),
'click_count': site.get('click_count', 0) # 历史总点击次数
}
logger.info(f"新站点 {site_id}: {site.get('site_url')} - 今日目标 {target_count}")
# 移除已删除的站点
removed_site_ids = existing_site_ids - current_site_ids
for site_id in removed_site_ids:
del self.click_records[site_id]
logger.info(f"站点 {site_id} 已从数据库删除,移除出点击队列")
now = datetime.now()
pending_sites = []
@@ -257,9 +329,21 @@ MIP广告点击服务
'id': site_id,
'site_url': record['site_url'],
'today_count': record['today_count'],
'target_count': record['target_count']
'target_count': record['target_count'],
'click_count': record.get('click_count', 0), # 历史总点击次数
'last_click': record['last_click']
})
# 排序优先级:
# 1. 历史从未点击的(click_count=0)最优先
# 2. 今日未点击的(today_count=0)次优先
# 3. 按上次点击时间升序(最久未点击的优先)
pending_sites.sort(key=lambda x: (
x['click_count'] > 0, # False(历史0次) 排在最前面
x['today_count'] > 0, # False(今日0次) 排在前面
x['last_click'] or datetime.min # 从未点击的排在前面
))
return pending_sites
def execute_click_task(self, site: Dict):
@@ -341,13 +425,28 @@ MIP广告点击服务
logger.info(f"找到 {len(pending_sites)} 个待点击站点")
# 随机打乱顺序(模拟真实行为)
random.shuffle(pending_sites)
# 分组:历史从未点击的 vs 历史点击过的
never_clicked = [s for s in pending_sites if s.get('click_count', 0) == 0]
has_clicked = [s for s in pending_sites if s.get('click_count', 0) > 0]
# 各组内随机打乱,但保持"历史从未点击优先"的整体顺序
random.shuffle(never_clicked)
random.shuffle(has_clicked)
pending_sites = never_clicked + has_clicked
if never_clicked:
logger.info(f"其中 {len(never_clicked)} 个站点历史从未点击(最优先处理)")
# 根据并发数执行
if self.max_workers == 1:
# 串行执行
for site in pending_sites:
# 每次执行前检查工作时间
if not self.is_working_time():
current_time = datetime.now().strftime('%H:%M')
logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行剩余任务")
break
self.execute_click_task(site)
# 任务间随机间隔(使用配置文件中的范围)
@@ -356,11 +455,8 @@ MIP广告点击服务
logger.info(f"等待 {wait_minutes} 分钟后执行下一个任务...")
time.sleep(wait_minutes * 60)
else:
# 并发执行(暂不支持,避免资源冲突)
logger.warning("当前版本仅支持串行执行")
for site in pending_sites:
self.execute_click_task(site)
time.sleep(random.randint(Config.MIN_TASK_INTERVAL_MINUTES, Config.MAX_TASK_INTERVAL_MINUTES) * 60)
# 并发执行
self._run_concurrent_cycle(pending_sites)
# 显示今日进度
completed = sum(1 for r in self.click_records.values() if r['today_count'] >= r['target_count'])
@@ -373,6 +469,150 @@ MIP广告点击服务
logger.info(f"点击次数: {total_clicks}/{target_clicks}")
logger.info("-" * 60)
def _run_concurrent_cycle(self, pending_sites: List[Dict]):
    """Execute click tasks concurrently, in batches of at most max_workers.

    Args:
        pending_sites: sites still needing clicks today, in priority order.
    """
    # Slice the queue into fixed-size batches; each batch fans out to one
    # thread per site.
    batch_size = self.max_workers
    batches = [pending_sites[start:start + batch_size]
               for start in range(0, len(pending_sites), batch_size)]
    logger.info(f"并发模式: 共 {len(pending_sites)} 个任务,分 {len(batches)} 批执行(每批最多 {batch_size} 个)")

    last_index = len(batches) - 1
    for batch_idx, batch in enumerate(batches):
        # Bail out as soon as we leave the configured working hours.
        if not self.is_working_time():
            current_time = datetime.now().strftime('%H:%M')
            logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行剩余批次")
            break

        logger.info("=" * 40)
        logger.info(f"开始批次 {batch_idx+1}/{len(batches)}: {len(batch)} 个任务并发执行")
        for i, site in enumerate(batch, 1):
            logger.info(f" - [Worker {i}] Site {site['id']}: {site['site_url'][:50]}...")

        started_at = time.time()
        success_count = 0
        fail_count = 0

        # One thread per site in this batch; collect results as they finish.
        with ThreadPoolExecutor(max_workers=len(batch)) as pool:
            future_to_site = {
                pool.submit(self._execute_click_task_wrapper, site, worker_id): site
                for worker_id, site in enumerate(batch, 1)
            }
            for done in as_completed(future_to_site):
                site = future_to_site[done]
                try:
                    outcome = done.result()
                except Exception as e:
                    fail_count += 1
                    logger.error(f"任务异常: Site {site['id']} - {e}")
                else:
                    if outcome.get('success'):
                        success_count += 1
                    else:
                        fail_count += 1

        batch_duration = (time.time() - started_at) / 60
        logger.info(f"批次 {batch_idx+1} 完成: 成功 {success_count}, 失败 {fail_count}, 耗时 {batch_duration:.1f} 分钟")

        # Random pause between batches (skipped after the final one).
        if batch_idx < last_index:
            # Working hours may have ended while the batch ran.
            if not self.is_working_time():
                current_time = datetime.now().strftime('%H:%M')
                logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行")
                break
            wait_minutes = random.randint(Config.MIN_TASK_INTERVAL_MINUTES, Config.MAX_TASK_INTERVAL_MINUTES)
            logger.info(f"等待 {wait_minutes} 分钟后执行下一批次...")
            time.sleep(wait_minutes * 60)
def _execute_click_task_wrapper(self, site: Dict, worker_id: int) -> Dict:
    """Thread-safe wrapper that runs one click task in its own browser.

    Args:
        site: site info dict; must contain 'id' and 'site_url'.
        worker_id: 1-based worker number, used for log tagging and for
            staggering start-up.

    Returns:
        Result dict with at least a 'success' key; failures also carry
        an 'error' message.
    """
    site_id = site['id']
    site_url = site['site_url']

    # Staggered start: each worker waits 5-10s per preceding worker so
    # simultaneous AdsPower API calls do not trip its rate limit.
    if worker_id > 1:
        stagger_delay = (worker_id - 1) * random.randint(5, 10)
        logger.info(f"[Worker {worker_id}] [Site {site_id}] 错峰等待 {stagger_delay} 秒后启动...")
        time.sleep(stagger_delay)

    logger.info(f"[Worker {worker_id}] [Site {site_id}] 开始点击: {site_url[:50]}...")

    executor = None
    try:
        # Each worker gets an independent TaskExecutor (own profile/proxy).
        executor = TaskExecutor(max_workers=1, use_proxy=self.use_proxy)

        profile_info = executor.create_browser_profile(worker_id)
        if not profile_info:
            logger.error(f"[Worker {worker_id}] [Site {site_id}] 创建浏览器环境失败")
            with self._stats_lock:
                self.error_count += 1
            return {'success': False, 'error': '创建浏览器环境失败'}

        time.sleep(2)

        # Re-fetch the full site record from the data manager.
        all_sites = self.dm.get_active_urls()
        target_site = next((s for s in all_sites if s.get('id') == site_id), None)
        if not target_site:
            logger.error(f"[Worker {worker_id}] [Site {site_id}] 未找到站点信息")
            # Fix: count this failure too — every other failure path in
            # this method increments error_count; this branch used to skip it.
            with self._stats_lock:
                self.error_count += 1
            return {'success': False, 'error': '未找到站点信息'}

        result = executor.execute_single_task(target_site, worker_id, profile_info['profile_id'])

        if result['success']:
            # Update the shared per-site record under its dedicated lock.
            with self._click_records_lock:
                if site_id in self.click_records:
                    self.click_records[site_id]['last_click'] = datetime.now()
                    self.click_records[site_id]['today_count'] += 1
            with self._stats_lock:
                self.total_clicks_today += 1
            logger.info(f"[Worker {worker_id}] [Site {site_id}] ✅ 点击完成")
        else:
            with self._stats_lock:
                self.error_count += 1
            logger.warning(f"[Worker {worker_id}] [Site {site_id}] ⚠️ 点击失败: {result.get('error', '未知错误')}")

        return result

    except Exception as e:
        with self._stats_lock:
            self.error_count += 1
        logger.error(f"[Worker {worker_id}] [Site {site_id}] ❌ 异常: {str(e)}")
        import traceback
        traceback.print_exc()
        return {'success': False, 'error': str(e)}

    finally:
        # Always release the browser, even when the task failed.
        if executor:
            try:
                executor.close_browser()
            except Exception as e:
                logger.warning(f"[Worker {worker_id}] 清理资源失败: {e}")
def run_crawler_cycle(self):
"""执行一次爬虫循环"""
if not self.crawler:
@@ -397,6 +637,15 @@ MIP广告点击服务
import traceback
traceback.print_exc()
def run_query_import_scan(self):
    """Scan the Query upload directory and import any pending files."""
    try:
        # Imported lazily so a missing importer module only affects this task.
        from query_keyword_importer import QueryKeywordImporter
        QueryKeywordImporter().scan_and_import()
    except Exception as e:
        logger.error(f"Query导入扫描异常: {e}")
def start(self):
"""启动调度器"""
# 获取进程锁
@@ -457,6 +706,10 @@ MIP广告点击服务
else:
logger.info(" - 网址爬取未启用")
# 4. Query挖掘目录扫描每15分钟
schedule.every(15).minutes.do(self.run_query_import_scan)
logger.info(" - 每 15 分钟扫描Query上传目录")
logger.info("")
# 立即执行一次(如果在工作时间内)