223 lines
6.6 KiB
Python
223 lines
6.6 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
TaskWorker 状态检查和修复工具
|
|||
|
|
用于诊断和解决任务卡在等待中的问题
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import logging
|
|||
|
|
import psutil
|
|||
|
|
import time
|
|||
|
|
|
|||
|
|
# Root logging setup: INFO and above, formatted as "<time> [<LEVEL>] <message>".
_LOG_FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger shared by the functions in this script.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def check_taskworker_lock():
    """Inspect the TaskWorker lock file and check whether its owner is alive.

    Reads ``data/taskworker.lock``, interprets its content as a PID and asks
    psutil whether a process with that PID exists.

    Returns:
        tuple: ``(True, pid)`` when the lock file names an existing process;
        ``(False, None)`` in every other case (no lock file, unreadable
        file, non-integer content, or the recorded process is gone).
    """
    lock_file = 'data/taskworker.lock'

    if not os.path.exists(lock_file):
        logger.info("未发现锁文件")
        return False, None

    # Reading is the only step that touches the file; keep its error
    # message separate from process-inspection failures.
    try:
        with open(lock_file, 'r') as f:
            pid = f.read().strip()
    except Exception as e:
        logger.error(f"读取锁文件失败: {e}")
        return False, None

    logger.info(f"发现锁文件,记录的PID: {pid}")

    try:
        pid_int = int(pid)
    except ValueError:
        logger.error(f"锁文件内容无效: {pid}")
        return False, None

    try:
        if not psutil.pid_exists(pid_int):
            logger.warning(f"进程 {pid} 不存在,锁文件已失效")
            return False, None

        proc = psutil.Process(pid_int)
        logger.info(f"进程 {pid} 存在: {proc.name()} - {proc.status()}")
    except psutil.NoSuchProcess:
        # The process exited between pid_exists() and the inspection.
        logger.warning(f"进程 {pid} 不存在,锁文件已失效")
        return False, None
    except psutil.AccessDenied:
        # The process exists; we simply may not read its details. It must
        # still be reported as alive so a valid lock is not seen as stale.
        logger.info(f"进程 {pid} 存在 (无权限读取进程详情)")
    except Exception as e:
        logger.error(f"检查进程失败: {e}")
        return False, None

    return True, pid_int
|
|||
|
|
|
|||
|
|
|
|||
|
|
def check_pending_tasks():
    """Count tasks whose status is 'pending' or 'processing' and log a summary.

    Tasks are obtained from ``task_queue.get_task_queue().get_all_tasks()``
    and are accessed with ``.get(...)``, i.e. treated as dict-like records.
    Up to five pending tasks are listed in the log.

    Returns:
        tuple: ``(pending_count, processing_count)``; ``(0, 0)`` if the
        queue cannot be loaded or queried (the error is logged).
    """
    try:
        from task_queue import get_task_queue

        tasks = get_task_queue().get_all_tasks()

        # Single pass so the result is correct even if ``tasks`` is a
        # one-shot iterator.
        pending_tasks = []
        processing_tasks = []
        for t in tasks:
            status = t.get('status')
            if status == 'pending':
                pending_tasks.append(t)
            elif status == 'processing':
                processing_tasks.append(t)

        logger.info(f"待处理任务: {len(pending_tasks)} 个")
        logger.info(f"处理中任务: {len(processing_tasks)} 个")

        if pending_tasks:
            logger.info("待处理任务列表:")
            for task in pending_tasks[:5]:  # show only the first five
                # Tolerate a missing id or a missing/None url: a display
                # problem must not discard the counts computed above.
                task_id = task.get('task_id', 'N/A')
                url = str(task.get('url') or 'N/A')[:50]
                logger.info(f" - {task_id}: {url}")

        return len(pending_tasks), len(processing_tasks)
    except Exception as e:
        logger.error(f"检查任务失败: {e}")
        return 0, 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
def check_worker_threads():
    """Log the TaskWorker's state and count its live worker threads.

    Obtains the worker from ``task_worker.get_task_worker()``, logs its
    ``running`` flag, ``current_workers``/``max_workers``, the size of
    ``worker_threads`` and of ``processing_tasks``, then counts the entries
    of ``worker_threads`` that are truthy and report ``is_alive()``.

    Returns:
        tuple: ``(worker.running, alive_thread_count)``; ``(False, 0)`` if
        anything raises (the error and its traceback are logged).
    """
    try:
        from task_worker import get_task_worker

        tw = get_task_worker()

        logger.info(f"TaskWorker 运行状态: {tw.running}")
        logger.info(f"当前并发数: {tw.current_workers}/{tw.max_workers}")
        logger.info(f"工作线程数: {len(tw.worker_threads)}")
        logger.info(f"正在处理的任务: {len(tw.processing_tasks)}")

        # Count thread slots that are populated and still alive.
        live_count = 0
        for thread in tw.worker_threads:
            if thread and thread.is_alive():
                live_count += 1
        logger.info(f"活跃线程数: {live_count}")

        return tw.running, live_count
    except Exception as exc:
        import traceback

        logger.error(f"检查 TaskWorker 失败: {exc}")
        logger.error(traceback.format_exc())
        return False, 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
def restart_taskworker():
    """Stop the TaskWorker if it is running, start it, and verify the result.

    The worker comes from ``task_worker.get_task_worker()``. After
    ``start()`` the state is re-read through ``check_worker_threads()``.

    Returns:
        bool: ``True`` when the worker reports running with at least one
        live thread after the restart; ``False`` otherwise, including when
        any step raises (the error and its traceback are logged).
    """
    logger.info("正在重启 TaskWorker...")

    try:
        from task_worker import get_task_worker

        tw = get_task_worker()

        # Stop the current worker first, then pause for a fixed 2 seconds.
        if tw.running:
            logger.info("停止现有 TaskWorker...")
            tw.stop()
            time.sleep(2)

        # Start again and pause 1 second before checking the outcome.
        logger.info("启动新的 TaskWorker...")
        tw.start()
        time.sleep(1)

        running, alive_threads = check_worker_threads()
        if not (running and alive_threads > 0):
            logger.error("❌ TaskWorker 重启失败")
            return False

        logger.info("✅ TaskWorker 重启成功")
        return True
    except Exception as exc:
        import traceback

        logger.error(f"重启 TaskWorker 失败: {exc}")
        logger.error(traceback.format_exc())
        return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_stale_lock():
    """Remove the TaskWorker lock file ``data/taskworker.lock``.

    This function does not itself verify that the lock is stale; it deletes
    the file whenever it is present.

    Returns:
        bool: ``True`` when the lock file is gone afterwards (it was removed
        or did not exist); ``False`` when removal failed (the error is
        logged).
    """
    lock_file = 'data/taskworker.lock'

    # Attempt the removal directly instead of testing os.path.exists()
    # first: if the file vanishes in between, the goal is still reached and
    # must not be reported as a failure.
    try:
        os.remove(lock_file)
    except FileNotFoundError:
        return True
    except Exception as e:
        logger.error(f"清理锁文件失败: {e}")
        return False

    logger.info("✅ 已清理失效的锁文件")
    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run the TaskWorker diagnosis and, when requested, the repair.

    Steps: (1) inspect the lock file, (2) count pending/processing tasks,
    (3) inspect the worker threads, (4) print the diagnosis, and (5) if a
    problem was found and ``--fix`` or ``--auto-fix`` is present in
    ``sys.argv``, remove a stale lock and restart the worker. Without one
    of those flags only a usage hint is printed.
    """
    # Same path that check_taskworker_lock() and clean_stale_lock() use.
    lock_file = 'data/taskworker.lock'

    print("=" * 60)
    print("TaskWorker 状态检查工具")
    print("=" * 60)

    # 1. Lock file
    print("\n[1] 检查锁文件...")
    lock_exists, _lock_pid = check_taskworker_lock()
    # check_taskworker_lock() returns (False, None) both when there is no
    # lock file and when the recorded process is gone, so look at the file
    # itself to tell a stale lock apart from "no lock".
    stale_lock = os.path.exists(lock_file) and not lock_exists

    # 2. Task queue
    print("\n[2] 检查任务队列...")
    pending_count, _processing_count = check_pending_tasks()

    # 3. Worker threads
    print("\n[3] 检查 TaskWorker 状态...")
    try:
        is_running, alive_threads = check_worker_threads()
    except Exception:
        # Not a bare ``except``: Ctrl-C / SystemExit must still propagate.
        is_running, alive_threads = False, 0

    # 4. Diagnosis
    print("\n[4] 诊断结果:")
    print("-" * 60)

    need_fix = False

    if pending_count > 0 and alive_threads == 0:
        print("❌ 问题: 有待处理任务,但没有活跃的工作线程")
        need_fix = True

    if stale_lock:
        print("⚠️ 警告: 锁文件存在但进程不存在(僵尸锁)")
        need_fix = True

    if not is_running:
        print("❌ 问题: TaskWorker 未运行")
        need_fix = True

    if not need_fix:
        print("✅ TaskWorker 运行正常")
        return

    # 5. Repair
    print("\n[5] 开始修复...")
    print("-" * 60)

    if '--fix' not in sys.argv and '--auto-fix' not in sys.argv:
        print("\n提示: 使用 --fix 参数自动修复问题")
        print("示例: python check_taskworker.py --fix")
        return

    # Only delete the lock when it is actually stale; a lock whose owner is
    # still alive is left in place.
    if stale_lock:
        clean_stale_lock()

    if restart_taskworker():
        print("\n✅ 修复完成!")
        print("\n重新检查状态...")
        time.sleep(2)
        check_worker_threads()
        check_pending_tasks()
    else:
        print("\n❌ 修复失败,请手动重启服务")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: run main(), report Ctrl-C quietly, and log any other
# failure together with its traceback.
if __name__ == '__main__':
    import traceback

    try:
        main()
    except KeyboardInterrupt:
        print("\n\n用户中断")
    except Exception as exc:
        logger.error(f"执行失败: {exc}")
        traceback.print_exc()
|