#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
TaskWorker automatic monitoring and recovery daemon.

Periodically inspects the TaskWorker and the task queue and restarts the
worker when it is found stopped or when pending tasks exist but no worker
thread is alive.
"""

import os
import sys
import time
import logging
import signal
import threading
from datetime import datetime

# The file handler below opens 'logs/taskworker_monitor.log' as soon as this
# module is loaded, so the directory has to exist *before* logging is
# configured (not only when the script reaches its entry point).
os.makedirs('logs', exist_ok=True)

# Logging configuration: same records go to the log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('logs/taskworker_monitor.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Monitor instance used by signal_handler. It is defined up front so the
# handler never hits a NameError when a signal arrives before main() has
# created the monitor (or when this module is imported instead of executed).
monitor = None


class TaskWorkerMonitor:
    """Background monitor that checks the TaskWorker and restarts it if needed."""

    def __init__(self, check_interval=60):
        """
        Args:
            check_interval: Seconds to wait between two checks (default 60).
        """
        self.check_interval = check_interval
        self.running = False
        self.monitor_thread = None
        # Set by stop() so the loop's inter-check wait is interrupted at once
        # instead of sleeping out the remaining interval.
        self._stop_event = threading.Event()

    def check_worker_status(self):
        """Inspect the TaskWorker and the task queue.

        Returns:
            A tuple ``(need_fix, reason, status)``:
            ``need_fix`` is True when a restart should be attempted,
            ``reason`` is a human-readable explanation (empty when healthy),
            ``status`` is a dict with the keys ``running``, ``alive_threads``,
            ``pending_count`` and ``processing_count``. When the check itself
            raises, the result is ``(True, <error text>, {})``.
        """
        try:
            # Imported lazily so an import failure is reported as a failed
            # check instead of preventing this module from loading.
            from task_worker import get_task_worker
            from task_queue import get_task_queue

            worker = get_task_worker()
            queue = get_task_queue()

            # Task statistics by status.
            tasks = queue.get_all_tasks()
            pending_count = sum(1 for t in tasks if t.get('status') == 'pending')
            processing_count = sum(1 for t in tasks if t.get('status') == 'processing')

            # Worker state.
            is_running = worker.running
            alive_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())

            logger.info(f"状态检查 - 运行:{is_running} 活跃线程:{alive_threads} "
                        f"待处理:{pending_count} 处理中:{processing_count}")

            # Decide whether a repair is required.
            need_fix = False
            reason = ""

            if not is_running:
                need_fix = True
                reason = "TaskWorker 未运行"
            elif alive_threads == 0 and pending_count > 0:
                need_fix = True
                reason = f"有 {pending_count} 个待处理任务,但没有活跃线程"
            # NOTE: tasks in 'processing' state are only counted; detecting
            # tasks that have not been updated for a long time is not
            # implemented here.

            return need_fix, reason, {
                'running': is_running,
                'alive_threads': alive_threads,
                'pending_count': pending_count,
                'processing_count': processing_count
            }

        except Exception as e:
            # logger.exception records the message together with the traceback.
            logger.exception(f"检查状态失败: {e}")
            return True, f"检查失败: {e}", {}

    def restart_worker(self):
        """Stop the TaskWorker if it is running, then start it again.

        Returns:
            True when ``worker.running`` is truthy after the restart,
            False otherwise or when any step raises.
        """
        logger.warning("正在重启 TaskWorker...")

        try:
            from task_worker import get_task_worker

            worker = get_task_worker()

            # Stop the existing worker and give it a moment to wind down.
            if worker.running:
                logger.info("停止现有 TaskWorker...")
                worker.stop()
                time.sleep(2)

            # Start the worker and give it a moment to spin up its threads.
            logger.info("启动新的 TaskWorker...")
            worker.start()
            time.sleep(2)

            # Verify the outcome.
            if worker.running:
                alive_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())
                logger.info(f"✅ TaskWorker 重启成功,活跃线程: {alive_threads}")
                return True

            logger.error("❌ TaskWorker 重启后未运行")
            return False

        except Exception as e:
            logger.exception(f"重启 TaskWorker 失败: {e}")
            return False

    def monitor_loop(self):
        """Run checks every ``check_interval`` seconds until ``running`` is False.

        A failed repair increments a consecutive-failure counter; once it
        reaches three, a CRITICAL record asking for manual intervention is
        logged on every further failure. A healthy check or a successful
        repair resets the counter.
        """
        logger.info(f"监控循环启动,检查间隔: {self.check_interval}秒")

        consecutive_failures = 0
        max_consecutive_failures = 3

        while self.running:
            try:
                need_fix, reason, status = self.check_worker_status()

                if need_fix:
                    logger.warning(f"⚠️ 检测到问题: {reason}")
                    logger.info(f"状态详情: {status}")

                    # Attempt the repair.
                    if self.restart_worker():
                        logger.info("✅ 自动修复成功")
                        consecutive_failures = 0
                    else:
                        consecutive_failures += 1
                        logger.error(f"❌ 自动修复失败 (连续失败 {consecutive_failures} 次)")

                        if consecutive_failures >= max_consecutive_failures:
                            # An alert notification could be sent from here.
                            logger.critical(f"连续修复失败 {consecutive_failures} 次,请人工介入!")
                else:
                    consecutive_failures = 0

            except Exception as e:
                # Keep the loop alive no matter what a single iteration raises.
                logger.exception(f"监控循环错误: {e}")

            # Wait for the next check. Unlike time.sleep(), this returns
            # immediately once stop() sets the event.
            self._stop_event.wait(self.check_interval)

        logger.info("监控循环已停止")

    def start(self):
        """Start the monitoring loop in a daemon thread (no-op if already running)."""
        if self.running:
            logger.warning("监控已在运行")
            return

        self._stop_event.clear()
        self.running = True
        self.monitor_thread = threading.Thread(target=self.monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info("TaskWorker 监控器已启动")

    def stop(self):
        """Signal the monitoring loop to exit and wait up to 5 seconds for it."""
        self.running = False
        # Wake the loop out of its inter-check wait so the join below does
        # not time out while the thread is still waiting.
        self._stop_event.set()
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5)
        logger.info("TaskWorker 监控器已停止")


def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM: stop the monitor (if one exists) and exit with 0."""
    logger.info(f"收到信号 {signum},正在停止...")
    if monitor is not None:
        monitor.stop()
    sys.exit(0)


def main():
    """Entry point: create the monitor, install signal handlers, block forever."""
    global monitor

    # Create the monitor before the handlers are installed so that
    # signal_handler always has something to stop. Checks run every 60 seconds.
    monitor = TaskWorkerMonitor(check_interval=60)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("=" * 60)
    print("TaskWorker 自动监控守护进程")
    print("=" * 60)
    print(f"检查间隔: {monitor.check_interval} 秒")
    print("按 Ctrl+C 停止")
    print("=" * 60)

    monitor.start()

    # Keep the main thread alive; the monitor itself runs in a daemon thread.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        logger.info("用户中断")
        monitor.stop()


if __name__ == '__main__':
    main()