commit
This commit is contained in:
277
main.py
277
main.py
@@ -28,6 +28,7 @@ import signal
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
@@ -108,6 +109,10 @@ MIP广告点击服务
|
||||
self.total_clicks_today = 0
|
||||
self.error_count = 0
|
||||
|
||||
# 线程安全锁(用于并发执行)
|
||||
self._click_records_lock = threading.Lock()
|
||||
self._stats_lock = threading.Lock()
|
||||
|
||||
# 健康检查API
|
||||
self.health_app = Flask(__name__)
|
||||
self.health_app.logger.disabled = True # 禁用Flask日志
|
||||
@@ -116,10 +121,10 @@ MIP广告点击服务
|
||||
logger.info(f"调度器初始化完成")
|
||||
logger.info(f"工作时间: {self.work_start_hour:02d}:00 - {self.work_end_hour:02d}:00")
|
||||
logger.info(f"点击间隔: {self.click_interval_minutes} 分钟")
|
||||
logger.info(f"并发数: {max_workers}")
|
||||
logger.info(f"并发模式: {'启用' if max_workers > 1 else '禁用'} (最大并发数: {max_workers})")
|
||||
|
||||
def _setup_health_api(self):
|
||||
"""配置健康检查API"""
|
||||
"""配置健康检查API和调度器控制API"""
|
||||
@self.health_app.route('/health', methods=['GET'])
|
||||
def health_check():
|
||||
"""健康检查端点"""
|
||||
@@ -140,6 +145,44 @@ MIP广告点击服务
|
||||
'work_hours': f"{self.work_start_hour:02d}:00-{self.work_end_hour:02d}:00",
|
||||
'is_working_time': self.is_working_time()
|
||||
})
|
||||
|
||||
@self.health_app.route('/scheduler/status', methods=['GET'])
|
||||
def scheduler_status():
|
||||
"""获取调度器状态(供远程Web服务调用)"""
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'data': {
|
||||
'status': 'running' if self.running else 'stopped',
|
||||
'is_working_time': self.is_working_time(),
|
||||
'total_sites': len(self.click_records),
|
||||
'completed_sites': sum(1 for r in self.click_records.values() if r['today_count'] >= r['target_count']),
|
||||
'total_clicks_today': sum(r['today_count'] for r in self.click_records.values()),
|
||||
'error_count': self.error_count
|
||||
}
|
||||
})
|
||||
|
||||
@self.health_app.route('/scheduler/start', methods=['POST'])
|
||||
def scheduler_start():
|
||||
"""启动调度器(供远程Web服务调用)"""
|
||||
if self.running:
|
||||
return jsonify({'success': True, 'message': '调度器已在运行中'})
|
||||
|
||||
# 重新初始化并启动
|
||||
self.running = True
|
||||
self.start_time = datetime.now()
|
||||
self.reset_daily_records()
|
||||
logger.info("调度器已通过远程API启动")
|
||||
return jsonify({'success': True, 'message': '调度器已启动'})
|
||||
|
||||
@self.health_app.route('/scheduler/stop', methods=['POST'])
|
||||
def scheduler_stop():
|
||||
"""停止调度器(供远程Web服务调用)"""
|
||||
if not self.running:
|
||||
return jsonify({'success': True, 'message': '调度器未运行'})
|
||||
|
||||
self.running = False
|
||||
logger.info("调度器已通过远程API停止")
|
||||
return jsonify({'success': True, 'message': '调度器已停止'})
|
||||
|
||||
def _acquire_lock(self) -> bool:
|
||||
"""
|
||||
@@ -222,7 +265,8 @@ MIP广告点击服务
|
||||
'last_click': None,
|
||||
'today_count': 0,
|
||||
'target_count': target_count,
|
||||
'site_url': site.get('site_url')
|
||||
'site_url': site.get('site_url'),
|
||||
'click_count': site.get('click_count', 0) # 历史总点击次数
|
||||
}
|
||||
logger.info(f"站点 {site_id}: {site.get('site_url')} - 今日目标 {target_count} 次")
|
||||
|
||||
@@ -233,12 +277,40 @@ MIP广告点击服务
|
||||
获取待点击的站点列表
|
||||
|
||||
Returns:
|
||||
待点击的站点列表
|
||||
待点击的站点列表(优先返回今日未点击的站点)
|
||||
"""
|
||||
if not self.click_records:
|
||||
logger.warning("点击记录为空,执行重置")
|
||||
self.reset_daily_records()
|
||||
|
||||
# 动态检测新导入的站点
|
||||
current_sites = self.dm.get_active_urls()
|
||||
current_site_ids = {site.get('id') for site in current_sites}
|
||||
existing_site_ids = set(self.click_records.keys())
|
||||
|
||||
# 发现新站点,自动添加到点击记录
|
||||
new_site_ids = current_site_ids - existing_site_ids
|
||||
if new_site_ids:
|
||||
logger.info(f"发现 {len(new_site_ids)} 个新导入的站点,加入点击队列")
|
||||
for site in current_sites:
|
||||
site_id = site.get('id')
|
||||
if site_id in new_site_ids:
|
||||
target_count = random.randint(Config.MIN_CLICK_COUNT, Config.MAX_CLICK_COUNT)
|
||||
self.click_records[site_id] = {
|
||||
'last_click': None,
|
||||
'today_count': 0,
|
||||
'target_count': target_count,
|
||||
'site_url': site.get('site_url'),
|
||||
'click_count': site.get('click_count', 0) # 历史总点击次数
|
||||
}
|
||||
logger.info(f"新站点 {site_id}: {site.get('site_url')} - 今日目标 {target_count} 次")
|
||||
|
||||
# 移除已删除的站点
|
||||
removed_site_ids = existing_site_ids - current_site_ids
|
||||
for site_id in removed_site_ids:
|
||||
del self.click_records[site_id]
|
||||
logger.info(f"站点 {site_id} 已从数据库删除,移除出点击队列")
|
||||
|
||||
now = datetime.now()
|
||||
pending_sites = []
|
||||
|
||||
@@ -257,9 +329,21 @@ MIP广告点击服务
|
||||
'id': site_id,
|
||||
'site_url': record['site_url'],
|
||||
'today_count': record['today_count'],
|
||||
'target_count': record['target_count']
|
||||
'target_count': record['target_count'],
|
||||
'click_count': record.get('click_count', 0), # 历史总点击次数
|
||||
'last_click': record['last_click']
|
||||
})
|
||||
|
||||
# 排序优先级:
|
||||
# 1. 历史从未点击的(click_count=0)最优先
|
||||
# 2. 今日未点击的(today_count=0)次优先
|
||||
# 3. 按上次点击时间升序(最久未点击的优先)
|
||||
pending_sites.sort(key=lambda x: (
|
||||
x['click_count'] > 0, # False(历史0次) 排在最前面
|
||||
x['today_count'] > 0, # False(今日0次) 排在前面
|
||||
x['last_click'] or datetime.min # 从未点击的排在前面
|
||||
))
|
||||
|
||||
return pending_sites
|
||||
|
||||
def execute_click_task(self, site: Dict):
|
||||
@@ -341,13 +425,28 @@ MIP广告点击服务
|
||||
|
||||
logger.info(f"找到 {len(pending_sites)} 个待点击站点")
|
||||
|
||||
# 随机打乱顺序(模拟真实行为)
|
||||
random.shuffle(pending_sites)
|
||||
# 分组:历史从未点击的 vs 历史点击过的
|
||||
never_clicked = [s for s in pending_sites if s.get('click_count', 0) == 0]
|
||||
has_clicked = [s for s in pending_sites if s.get('click_count', 0) > 0]
|
||||
|
||||
# 各组内随机打乱,但保持"历史从未点击优先"的整体顺序
|
||||
random.shuffle(never_clicked)
|
||||
random.shuffle(has_clicked)
|
||||
pending_sites = never_clicked + has_clicked
|
||||
|
||||
if never_clicked:
|
||||
logger.info(f"其中 {len(never_clicked)} 个站点历史从未点击(最优先处理)")
|
||||
|
||||
# 根据并发数执行
|
||||
if self.max_workers == 1:
|
||||
# 串行执行
|
||||
for site in pending_sites:
|
||||
# 每次执行前检查工作时间
|
||||
if not self.is_working_time():
|
||||
current_time = datetime.now().strftime('%H:%M')
|
||||
logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行剩余任务")
|
||||
break
|
||||
|
||||
self.execute_click_task(site)
|
||||
|
||||
# 任务间随机间隔(使用配置文件中的范围)
|
||||
@@ -356,11 +455,8 @@ MIP广告点击服务
|
||||
logger.info(f"等待 {wait_minutes} 分钟后执行下一个任务...")
|
||||
time.sleep(wait_minutes * 60)
|
||||
else:
|
||||
# 并发执行(暂不支持,避免资源冲突)
|
||||
logger.warning("当前版本仅支持串行执行")
|
||||
for site in pending_sites:
|
||||
self.execute_click_task(site)
|
||||
time.sleep(random.randint(Config.MIN_TASK_INTERVAL_MINUTES, Config.MAX_TASK_INTERVAL_MINUTES) * 60)
|
||||
# 并发执行
|
||||
self._run_concurrent_cycle(pending_sites)
|
||||
|
||||
# 显示今日进度
|
||||
completed = sum(1 for r in self.click_records.values() if r['today_count'] >= r['target_count'])
|
||||
@@ -373,6 +469,150 @@ MIP广告点击服务
|
||||
logger.info(f"点击次数: {total_clicks}/{target_clicks} 次")
|
||||
logger.info("-" * 60)
|
||||
|
||||
def _run_concurrent_cycle(self, pending_sites: List[Dict]):
|
||||
"""
|
||||
并发执行点击任务
|
||||
|
||||
Args:
|
||||
pending_sites: 待点击的站点列表
|
||||
"""
|
||||
# 按 max_workers 分批
|
||||
batch_size = self.max_workers
|
||||
batches = [pending_sites[i:i+batch_size] for i in range(0, len(pending_sites), batch_size)]
|
||||
|
||||
logger.info(f"并发模式: 共 {len(pending_sites)} 个任务,分 {len(batches)} 批执行(每批最多 {batch_size} 个)")
|
||||
|
||||
for batch_idx, batch in enumerate(batches):
|
||||
# 检查工作时间
|
||||
if not self.is_working_time():
|
||||
current_time = datetime.now().strftime('%H:%M')
|
||||
logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行剩余批次")
|
||||
break
|
||||
|
||||
logger.info(f"=" * 40)
|
||||
logger.info(f"开始批次 {batch_idx+1}/{len(batches)}: {len(batch)} 个任务并发执行")
|
||||
for i, site in enumerate(batch, 1):
|
||||
logger.info(f" - [Worker {i}] Site {site['id']}: {site['site_url'][:50]}...")
|
||||
|
||||
batch_start_time = time.time()
|
||||
success_count = 0
|
||||
fail_count = 0
|
||||
|
||||
# 使用 ThreadPoolExecutor 并发执行
|
||||
with ThreadPoolExecutor(max_workers=len(batch)) as executor:
|
||||
futures = {
|
||||
executor.submit(self._execute_click_task_wrapper, site, worker_id): site
|
||||
for worker_id, site in enumerate(batch, 1)
|
||||
}
|
||||
|
||||
# 等待所有任务完成
|
||||
for future in as_completed(futures):
|
||||
site = futures[future]
|
||||
try:
|
||||
result = future.result()
|
||||
if result.get('success'):
|
||||
success_count += 1
|
||||
else:
|
||||
fail_count += 1
|
||||
except Exception as e:
|
||||
fail_count += 1
|
||||
logger.error(f"任务异常: Site {site['id']} - {e}")
|
||||
|
||||
batch_duration = (time.time() - batch_start_time) / 60
|
||||
logger.info(f"批次 {batch_idx+1} 完成: 成功 {success_count}, 失败 {fail_count}, 耗时 {batch_duration:.1f} 分钟")
|
||||
|
||||
# 批次间随机间隔(不是最后一批)
|
||||
if batch_idx < len(batches) - 1:
|
||||
# 再次检查工作时间
|
||||
if not self.is_working_time():
|
||||
current_time = datetime.now().strftime('%H:%M')
|
||||
logger.info(f"当前时间 {current_time} 已超出工作时间,停止执行")
|
||||
break
|
||||
|
||||
wait_minutes = random.randint(Config.MIN_TASK_INTERVAL_MINUTES, Config.MAX_TASK_INTERVAL_MINUTES)
|
||||
logger.info(f"等待 {wait_minutes} 分钟后执行下一批次...")
|
||||
time.sleep(wait_minutes * 60)
|
||||
|
||||
def _execute_click_task_wrapper(self, site: Dict, worker_id: int) -> Dict:
|
||||
"""
|
||||
线程安全的任务执行包装器
|
||||
|
||||
Args:
|
||||
site: 站点信息
|
||||
worker_id: Worker编号(用于日志标识)
|
||||
|
||||
Returns:
|
||||
执行结果字典
|
||||
"""
|
||||
site_id = site['id']
|
||||
site_url = site['site_url']
|
||||
|
||||
# 错峰启动:每个 worker 间隔 5-10 秒,避免同时调用 AdsPower API 触发限频
|
||||
if worker_id > 1:
|
||||
stagger_delay = (worker_id - 1) * random.randint(5, 10)
|
||||
logger.info(f"[Worker {worker_id}] [Site {site_id}] 错峰等待 {stagger_delay} 秒后启动...")
|
||||
time.sleep(stagger_delay)
|
||||
|
||||
logger.info(f"[Worker {worker_id}] [Site {site_id}] 开始点击: {site_url[:50]}...")
|
||||
|
||||
executor = None
|
||||
try:
|
||||
# 创建独立的 TaskExecutor 实例
|
||||
executor = TaskExecutor(max_workers=1, use_proxy=self.use_proxy)
|
||||
|
||||
# 创建浏览器环境(每个 worker 独立的 Profile 和代理)
|
||||
profile_info = executor.create_browser_profile(worker_id)
|
||||
if not profile_info:
|
||||
logger.error(f"[Worker {worker_id}] [Site {site_id}] 创建浏览器环境失败")
|
||||
with self._stats_lock:
|
||||
self.error_count += 1
|
||||
return {'success': False, 'error': '创建浏览器环境失败'}
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
# 获取完整站点信息
|
||||
all_sites = self.dm.get_active_urls()
|
||||
target_site = next((s for s in all_sites if s.get('id') == site_id), None)
|
||||
|
||||
if not target_site:
|
||||
logger.error(f"[Worker {worker_id}] [Site {site_id}] 未找到站点信息")
|
||||
return {'success': False, 'error': '未找到站点信息'}
|
||||
|
||||
# 执行任务
|
||||
result = executor.execute_single_task(target_site, worker_id, profile_info['profile_id'])
|
||||
|
||||
if result['success']:
|
||||
# 线程安全更新点击记录
|
||||
with self._click_records_lock:
|
||||
if site_id in self.click_records:
|
||||
self.click_records[site_id]['last_click'] = datetime.now()
|
||||
self.click_records[site_id]['today_count'] += 1
|
||||
with self._stats_lock:
|
||||
self.total_clicks_today += 1
|
||||
|
||||
logger.info(f"[Worker {worker_id}] [Site {site_id}] ✅ 点击完成")
|
||||
else:
|
||||
with self._stats_lock:
|
||||
self.error_count += 1
|
||||
logger.warning(f"[Worker {worker_id}] [Site {site_id}] ⚠️ 点击失败: {result.get('error', '未知错误')}")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
with self._stats_lock:
|
||||
self.error_count += 1
|
||||
logger.error(f"[Worker {worker_id}] [Site {site_id}] ❌ 异常: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return {'success': False, 'error': str(e)}
|
||||
finally:
|
||||
# 确保资源清理
|
||||
if executor:
|
||||
try:
|
||||
executor.close_browser()
|
||||
except Exception as e:
|
||||
logger.warning(f"[Worker {worker_id}] 清理资源失败: {e}")
|
||||
|
||||
def run_crawler_cycle(self):
|
||||
"""执行一次爬虫循环"""
|
||||
if not self.crawler:
|
||||
@@ -397,6 +637,15 @@ MIP广告点击服务
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def run_query_import_scan(self):
|
||||
"""扫描Query上传目录,导入待处理文件"""
|
||||
try:
|
||||
from query_keyword_importer import QueryKeywordImporter
|
||||
importer = QueryKeywordImporter()
|
||||
importer.scan_and_import()
|
||||
except Exception as e:
|
||||
logger.error(f"Query导入扫描异常: {e}")
|
||||
|
||||
def start(self):
|
||||
"""启动调度器"""
|
||||
# 获取进程锁
|
||||
@@ -457,6 +706,10 @@ MIP广告点击服务
|
||||
else:
|
||||
logger.info(" - 网址爬取未启用")
|
||||
|
||||
# 4. Query挖掘目录扫描(每15分钟)
|
||||
schedule.every(15).minutes.do(self.run_query_import_scan)
|
||||
logger.info(" - 每 15 分钟扫描Query上传目录")
|
||||
|
||||
logger.info("")
|
||||
|
||||
# 立即执行一次(如果在工作时间内)
|
||||
|
||||
Reference in New Issue
Block a user