# -*- coding: utf-8 -*-
"""Baijiahao article-export web service.

Flask app providing login, article scraping (direct or via a rotating proxy
pool), Excel export, a background task queue, and SocketIO progress streaming.
"""
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import wraps

import pandas as pd
import requests
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for
from flask_cors import CORS
from flask_socketio import SocketIO, emit

from task_queue import get_task_queue
from task_worker import start_task_worker

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__, static_folder='static', template_folder='templates')
app.secret_key = 'your-secret-key-change-this-in-production'  # NOTE(review): replace with a random key in production
CORS(app)

# Initialize SocketIO (threading mode so it works alongside the worker threads)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='threading')

# Minimal in-memory user store (username -> plaintext password).
# NOTE(review): use a real database with hashed passwords in production.
USERS = {
    'admin': 'admin123',
}


def login_required(f):
    """Decorator: reject the request with 401 JSON unless a session is logged in."""
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if 'username' not in session:
            return jsonify({'success': False, 'message': '请先登录', 'need_login': True}), 401
        return f(*args, **kwargs)
    return decorated_function


class BaijiahaoScraper:
    """Scrapes a Baijiahao author's article list, optionally through a rotating proxy pool."""

    def __init__(self, uk, cookies=None, use_proxy=False, proxy_api_url=None):
        """
        Args:
            uk: the author's `uk` identifier (see get_uk_from_app_id).
            cookies: optional cookie jar/dict merged into the requests session.
            use_proxy: when True, every request must go through a pool proxy
                (the scraper refuses to fall back to the local IP).
            proxy_api_url: proxy-pool API endpoint; a built-in default is used
                when not provided.
        """
        self.uk = uk
        self.api_url = 'https://mbd.baidu.com/webpage'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Referer': 'https://baijiahao.baidu.com/',
            'Accept': '*/*',
        }
        self.session = requests.Session()
        self.session.keep_alive = False  # disable persistent connections
        if cookies:
            self.session.cookies.update(cookies)
        # Proxy configuration
        self.use_proxy = use_proxy
        self.proxy_api_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
        self.current_proxy = None

    def get_proxy(self):
        """Fetch one proxy IP from the pool and store it as the current proxy.

        Returns a requests-style proxies dict; raises on pool errors or a
        malformed (non ``ip:port``) response.
        """
        try:
            print(f"正在从代理池获取IP: {self.proxy_api_url}")
            response = requests.get(self.proxy_api_url, timeout=5)  # short timeout: pool should answer fast
            content = response.content.decode("utf-8").strip()
            print(f"代理API响应: {content}")
            # A JSON body means the pool returned an error instead of an IP
            if content.startswith('{'):
                try:
                    import json
                    error_data = json.loads(content)
                    error_msg = error_data.get('msg', '未知错误')
                    print(f"❌ 代理IP池错误: {error_msg}")
                    raise Exception(f"代理IP池错误: {error_msg}")
                except json.JSONDecodeError:
                    pass  # not JSON after all — treat as a plain IP
            # Parse "ip:port"
            if ':' in content:
                sj = content.strip().split(":", 1)
                sj1 = sj[0]  # IP
                sj2 = sj[1]  # port
                print(f"IP: {sj1}, 端口: {sj2}")
                # Build the proxies mapping used by requests
                proxy_meta = f"http://{sj1}:{sj2}"
                proxies = {
                    'http': proxy_meta,
                    'https': proxy_meta
                }
                self.current_proxy = proxies
                print(f"代理配置成功: {proxies}")
                return proxies
            else:
                print("代理IP格式错误")
                raise Exception(f"代理IP格式错误: {content}")
        except Exception as e:
            print(f"获取代理IP失败: {e}")
            raise

    def make_request(self, url, **kwargs):
        """GET *url* through the session, rotating the proxy on failure.

        Retries up to 3 times, swapping the proxy immediately when an
        anti-crawl response, proxy error, or 407 (pool rate limit) is seen.
        Raises if all retries fail; never falls back to the local IP when
        ``use_proxy`` is on.
        """
        max_retries = 3
        retry_count = 0
        last_was_anti_crawl = False  # whether the previous retry was triggered by anti-crawl
        while retry_count < max_retries:
            if self.use_proxy:
                # Acquire a fresh proxy when none is set or we are retrying
                if not self.current_proxy or retry_count > 0:
                    print(f"{'立即切换' if retry_count > 0 else ''}获取代理IP(第{retry_count + 1}次)")
                    proxy = self.get_proxy()
                    if not proxy:
                        raise Exception("启用了代理但无法获取代理IP,拒绝使用本机IP")
                # A proxy is mandatory in proxy mode
                if not self.current_proxy:
                    raise Exception("启用了代理但当前无代理IP,拒绝使用本机IP")
                kwargs['proxies'] = self.current_proxy
            # Default timeout
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 20
            try:
                import time
                start = int(round(time.time() * 1000))
                response = self.session.get(url, **kwargs)
                cost_time = int(round(time.time() * 1000)) - start
                print(f"请求耗时: {cost_time}ms")
                # Anti-crawl detection: drop the proxy and retry immediately
                if self._check_anti_spider(response):
                    print("⚠️ 检测到反爬,立即切换IP(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    last_was_anti_crawl = True
                    if retry_count < max_retries:
                        continue  # retry right away, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截,请稍后再试")
                return response
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换IP(无需等待)")
                self.current_proxy = None
                retry_count += 1
                last_was_anti_crawl = True
                if retry_count < max_retries:
                    continue  # retry right away, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is rate-limiting us
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # give the pool time to recover
                    if self.use_proxy:
                        print("重新获取代理IP...")
                        self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                print(f"请求失败: {e}")
                # Any other failure in proxy mode: rotate and retry
                if self.use_proxy:
                    print("立即切换代理(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue  # retry right away, no backoff
                raise
        raise Exception(f"请求失败,已重试{max_retries}次")

    def _check_anti_spider(self, response):
        """Return True when *response* looks like an anti-crawl interception."""
        # Status-code heuristics
        if response.status_code in [403, 429, 503]:
            return True
        # JSON flag: is_need_foe signals that the IP must be rotated
        try:
            data = response.json()
            if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
                print("检测到is_need_foe=True,需要切换IP")
                return True
        except Exception:
            pass  # not a JSON response — fall through to text checks
        # Keyword heuristics on the body text
        content = response.text.lower()
        anti_spider_keywords = [
            '验证码', 'captcha', '请输入验证码', '访问频繁',
            '异常访问', '请稍后再试', 'access denied', 'forbidden',
            '安全验证', '人机验证'
        ]
        for keyword in anti_spider_keywords:
            if keyword in content:
                print(f"检测到反爬关键词: {keyword}")
                return True
        return False

    @staticmethod
    def get_uk_from_app_id(app_id, use_proxy=False, proxy_api_url=None):
        """Extract the `uk` parameter (and cookies) from the author's homepage.

        Returns:
            (uk, cookies) tuple on success; raises after exhausting retries.
        """
        url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        print(f"正在从主页获取uk: {url}")
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            # Prepare request kwargs (plus proxy configuration when enabled)
            kwargs = {'headers': headers, 'timeout': 30}
            if use_proxy:
                proxy_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
                try:
                    print(f"{'重新' if retry_count > 0 else ''}使用代理获取uk(第{retry_count + 1}次)")
                    proxy_response = requests.get(proxy_url, timeout=5)  # short timeout for the pool API
                    content = proxy_response.content.decode("utf-8").strip()
                    print(f"代理API响应: {content}")
                    # A JSON body means the pool returned an error
                    if content.startswith('{'):
                        try:
                            import json
                            error_data = json.loads(content)
                            error_msg = error_data.get('msg', '未知错误')
                            raise Exception(f"代理IP池错误: {error_msg}")
                        except json.JSONDecodeError:
                            pass  # not JSON — treat as a plain IP
                    if ':' in content:
                        sj = content.strip().split(":", 1)
                        proxy_meta = f"http://{sj[0]}:{sj[1]}"
                        kwargs['proxies'] = {
                            'http': proxy_meta,
                            'https': proxy_meta
                        }
                        print(f"使用代理: {proxy_meta}")
                    else:
                        raise Exception(f"代理IP格式错误: {content}")
                except Exception as e:
                    # Proxy mode but no proxy available: refuse to use the local IP
                    raise Exception(f"启用了代理但获取代理IP失败,拒绝使用本机IP: {e}")
            try:
                response = requests.get(url, **kwargs)
                if response.status_code != 200:
                    print(f"访问主页失败,状态码: {response.status_code}")
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"立即重试(无需等待)")
                        continue  # retry right away
                    else:
                        raise Exception(f"访问主页失败,状态码: {response.status_code}")
                # Anti-crawl detection
                content_lower = response.text.lower()
                anti_spider_detected = False
                # JSON flag: is_need_foe
                try:
                    data = response.json()
                    if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
                        print("⚠️ 检测到is_need_foe=True,需要切换IP")
                        anti_spider_detected = True
                except Exception:
                    pass  # not JSON — check keyword heuristics instead
                # Keyword heuristics
                if not anti_spider_detected:
                    anti_spider_keywords = ['验证码', 'captcha', '访问频繁', '异常访问', 'access denied']
                    for keyword in anti_spider_keywords:
                        if keyword in content_lower:
                            print(f"⚠️ 检测到反爬关键词: {keyword}")
                            anti_spider_detected = True
                            break
                if anti_spider_detected and use_proxy:
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"检测到反爬,立即切换IP重试(无需等待)")
                        continue  # retry right away, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截")
                # Extract uk with a regex
                uk_match = re.search(r'"uk"\s*:\s*"([^"]+)"', response.text)
                if not uk_match:
                    # Fallback pattern
                    uk_match = re.search(r'uk=([^&\s"]+)', response.text)
                if uk_match:
                    uk = uk_match.group(1)
                    print(f">> 成功获取UK: {uk}")
                    # Capture the cookies handed out with the page
                    cookies = response.cookies
                    print(f">> 成功获取Cookie")
                    return uk, cookies
                else:
                    raise Exception("无法从页面中提取UK参数")
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换代理重试(无需等待)")
                retry_count += 1
                if retry_count < max_retries and use_proxy:
                    continue  # retry right away, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool is rate-limiting us
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # give the pool time to recover
                    retry_count += 1
                    if retry_count < max_retries and use_proxy:
                        print("重新获取代理IP,继续重试...")
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                if retry_count < max_retries - 1:
                    retry_count += 1
                    print(f"错误: {e},立即重试(无需等待)")
                    continue  # retry right away, no backoff
                else:
                    print(f"获取UK失败: {e}")
                    raise

    def get_articles(self, months=6, app_id=None, articles_only=True, task_id=None, on_page_fetched=None, start_page=1, start_ctime=None):
        """Fetch the article list via the API (delegates to baidu_api.get_baidu_data_sync).

        Args:
            months: how many months back to fetch (fractions allowed, e.g. 0.33 ≈ 10 days).
            app_id: Baijiahao app_id (unused here; kept for interface compatibility).
            articles_only: skip video items when True.
            task_id: task ID (for database caching by the caller).
            on_page_fetched: callback invoked after each page of data.
            start_page: starting page number (resume support).
            start_ctime: starting pagination token (resume support).

        Returns:
            dict: {'last_page': last page, 'last_ctime': pagination token, 'completed': whether finished}
        """
        import sys
        import os
        # Import the synchronous fetcher defined in baidu_api.py
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from baidu_api import get_baidu_data_sync
        from datetime import datetime, timedelta
        # Fractional months supported (e.g. 0.33 months ≈ 10 days)
        days = int(months * 30)
        target_date = datetime.now() - timedelta(days=days)
        print(f"\n{'=' * 50}")
        print(f"开始获取百家号作者(uk={self.uk})的文章...")
        if start_page > 1:
            print(f"🔄 断点续传:从第{start_page}页开始")
        print(f"{'=' * 50}\n")

        # Per-page callback: extract articles and forward them upstream
        def process_page(page, items, ctime):
            """Process a raw page of items, extract article rows, invoke the upstream callback."""
            processed_articles = []
            for item in items:
                item_data = item.get('itemData', {})
                # When articles_only is set, drop video entries
                if articles_only:
                    meta_type = item_data.get('meta_type', '')
                    if meta_type == 'video':
                        print(f" ✖ 跳过视频: {item_data.get('title', '[无标题]')[:50]}...")
                        continue
                title = item_data.get('title', '')
                article_url = item_data.get('url', '')
                # Prefer ctime (unix timestamp) — more accurate than the display string
                publish_time = '未知时间'
                if 'ctime' in item_data and item_data['ctime']:
                    try:
                        timestamp = int(item_data['ctime'])
                        article_date = datetime.fromtimestamp(timestamp)
                        publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                    except Exception as e:
                        print(f" ✗ 解析ctime失败: {e}")
                        # Fall back to parsing the human-readable time string
                        time_str = item_data.get('time', '未知时间')
                        if time_str != '未知时间':
                            article_date = self._parse_article_date(time_str)
                            if article_date:
                                publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    time_str = item_data.get('time', '未知时间')
                    if time_str != '未知时间':
                        article_date = self._parse_article_date(time_str)
                        if article_date:
                            publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                processed_articles.append({
                    '标题': title.strip().split('\n')[0][:500] if title else '无标题',
                    '链接': article_url,
                    '发布时间': publish_time
                })
            print(f"第{page}页处理完成:{len(processed_articles)}篇文章")
            # Forward to the caller's callback when provided
            if on_page_fetched:
                on_page_fetched(page, processed_articles, ctime)

        try:
            if months < 1:
                print(f"调用get_baidu_data_sync获取数据(近{days}天)...\n")
            else:
                print(f"调用get_baidu_data_sync获取数据(近{int(months)}个月)...\n")
            # Pass proxy settings plus our page callback through to the fetcher
            result = get_baidu_data_sync(
                uk=self.uk,
                months=months,
                use_proxy=self.use_proxy,
                proxy_api_url=self.proxy_api_url if self.proxy_api_url else None,
                on_page_fetched=process_page,
                start_page=start_page,
                start_ctime=start_ctime
            )
            if not result:
                print("\n✗ 未获取到数据")
                return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
            print(f"\n✓ 爬取完成!")
            return result
        except Exception as e:
            print(f"\n✗ 调用get_baidu_data_sync失败: {e}")
            import traceback
            traceback.print_exc()
            return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}

    def _parse_article_date(self, time_str):
        """Parse an article time string into a datetime, or None when unparseable."""
        from datetime import datetime, timedelta
        import re
        if not time_str or time_str == '未知':
            return None
        current_year = datetime.now().year
        now = datetime.now()
        # Format 1: relative times — "1天前", "2小时前", "30分钟前", "1个月前"
        if '前' in time_str:
            # days ago
            match = re.search(r'(\d+)\s*天', time_str)
            if match:
                days = int(match.group(1))
                return now - timedelta(days=days)
            # hours ago
            match = re.search(r'(\d+)\s*小时', time_str)
            if match:
                hours = int(match.group(1))
                return now - timedelta(hours=hours)
            # minutes ago
            match = re.search(r'(\d+)\s*分钟', time_str)
            if match:
                minutes = int(match.group(1))
                return now - timedelta(minutes=minutes)
            # months ago
            match = re.search(r'(\d+)\s*个?月', time_str)
            if match:
                months = int(match.group(1))
                return now - timedelta(days=months * 30)  # approximation: month == 30 days
        # Format 2: "11-29 07:23" or "11-29" (current year implied)
        match = re.match(r'(\d{1,2})-(\d{1,2})', time_str)
        if match:
            month = int(match.group(1))
            day = int(match.group(2))
            return datetime(current_year, month, day)
        # Format 3: "2024-11-29"
        match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', time_str)
        if match:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            return datetime(year, month, day)
        return None

    def get_articles_from_html(self, months=6, app_id=None):
        """Scrape articles by parsing the homepage HTML directly (no Selenium)."""
        if not app_id:
            raise Exception("需要提供app_id")
        articles = []
        try:
            url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
            print(f"访问页面: {url}")
            headers = {
                'User-Agent': self.headers['User-Agent'],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
            }
            response = self.session.get(url, headers=headers, timeout=30)
            if response.status_code != 200:
                raise Exception(f"访问失败,状态码: {response.status_code}")
            html = response.text
            # Simple string-level extraction: collect all /s?id= links
            import re
            links = re.findall(r'href="(/s\?id=[^"]+)"', html)
            links += re.findall(r"href='(/s\?id=[^']+)'", html)
            # Try to find a title for each (deduplicated) link
            for link in set(links):
                title_pattern = f'href="{link}"[^>]*title="([^"]+)"'
                title_match = re.search(title_pattern, html)
                if not title_match:
                    title_pattern = f'title="([^"]+)"[^>]*href="{link}"'
                    title_match = re.search(title_pattern, html)
                title = title_match.group(1) if title_match else "未知标题"
                full_link = 'https://baijiahao.baidu.com' + link
                articles.append({
                    '标题': title.strip(),
                    '链接': full_link,
                    '发布时间': '未知'
                })
            print(f"成功提取 {len(articles)} 篇文章")
        except Exception as e:
            print(f"HTML解析失败: {e}")
            raise
        return articles


# ========== SocketIO event handlers ==========

@socketio.on('connect')
def handle_connect():
    """Fired when a client connects."""
    logger.info(f"客户端已连接: {request.sid}")
    emit('connected', {'message': '已连接到服务器'})


@socketio.on('disconnect')
def handle_disconnect():
    """Fired when a client disconnects."""
    logger.info(f"客户端已断开: {request.sid}")


@socketio.on('subscribe_task')
def handle_subscribe_task(data):
    """Client subscribes to progress updates for a task."""
    task_id = data.get('task_id')
    logger.info(f"客户端 {request.sid} 订阅任务: {task_id}")
    emit('subscribed', {'task_id': task_id})


def emit_task_log(task_id, message, level='info'):
    """Push a task log line to the frontend."""
    socketio.emit('task_log', {
        'task_id': task_id,
        'message': message,
        'level': level,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    })


def emit_task_progress(task_id, progress, current_step='', **kwargs):
    """Push a task progress update to the frontend."""
    data = {
        'task_id': task_id,
        'progress': progress,
        'current_step': current_step,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    data.update(kwargs)
    socketio.emit('task_progress', data)


# ========== Flask routes ==========

@app.route('/')
def index():
    """Home page (redirects to login when not authenticated)."""
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('index.html', username=session.get('username'))


@app.route('/queue')
def queue_page():
    """Task queue page (redirects to login when not authenticated)."""
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('queue.html', username=session.get('username'))


@app.route('/login')
def login_page():
    """Login page (redirects home when already authenticated)."""
    if 'username' in session:
        return redirect(url_for('index'))
    return render_template('login.html')


@app.route('/api/login', methods=['POST'])
def login():
    """Login endpoint: validates credentials against USERS."""
    try:
        data = request.get_json()
        username = data.get('username', '').strip()
        password = data.get('password', '').strip()
        if not username or not password:
            return jsonify({'success': False, 'message': '请输入用户名和密码'})
        # Check username and password
        if username in USERS and USERS[username] == password:
            session['username'] = username
            return jsonify({'success': True, 'message': '登录成功'})
        else:
            return jsonify({'success': False, 'message': '用户名或密码错误'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'登录失败: {str(e)}'})


@app.route('/api/logout', methods=['POST'])
def logout():
    """Logout endpoint: clears the session."""
    session.pop('username', None)
    return jsonify({'success': True, 'message': '已登出'})


@app.route('/api/export', methods=['POST'])
@login_required
def export_articles():
    """Scrape articles for the posted URL and export them to an Excel file."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        cookies_str = data.get('cookies', '')  # cookie string supplied by the client
        months = data.get('months', 6)  # time range, default 6 months
        # Extract app_id from the URL (later converted to uk)
        app_id_match = re.search(r'app_id=(\d+)', url)
        if not app_id_match:
            return jsonify({'success': False, 'message': 'URL格式不正确,无法提取app_id'})
        app_id = app_id_match.group(1)
        print(f"开始导出,app_id={app_id}")
        print(f"Cookie长度: {len(cookies_str) if cookies_str else 0}")
        # Proxy options
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # articles only (skip videos)?
        # Resolve uk and cookies from the author homepage
        uk, auto_cookies = BaijiahaoScraper.get_uk_from_app_id(app_id, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        if not uk:
            return jsonify({'success': False, 'message': '无法获取用户UK,请检查URL是否正确'})
        print(f"成功获取uk={uk}")
        if cookies_str:
            print("使用用户提供的Cookie")
            # Parse the raw cookie string into a dict
            cookies_dict = {}
            for item in cookies_str.split(';'):
                item = item.strip()
                if '=' in item:
                    key, value = item.split('=', 1)
                    cookies_dict[key.strip()] = value.strip()
            # Convert to a requests cookie jar
            from requests.cookies import cookiejar_from_dict
            user_cookies = cookiejar_from_dict(cookies_dict)
            scraper = BaijiahaoScraper(uk, user_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        else:
            print("使用自动获取的Cookie")
            scraper = BaijiahaoScraper(uk, auto_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        # Fetch articles via the API (proxy pool + API only)
        print(f"使用 API 方式获取文章(近{months}个月)...")
        try:
            articles = scraper.get_articles(months=months, app_id=app_id, articles_only=articles_only)
        except Exception as e:
            print(f"API 方式失败: {e}")
            articles = []
        if not articles:
            return jsonify({
                'success': False,
                'message': (
                    '未能获取到文章数据。\n\n'
                    '请确保:\n'
                    '1. URL正确且该作者有发布过文章\n'
                    '2. 网络连接正常\n'
                    '3. 如需使用代理,请配置代理IP池'
                )
            })
        # Build the Excel workbook
        df = pd.DataFrame(articles)
        # Ensure the output directory exists
        output_dir = os.path.join(os.path.dirname(__file__), 'exports')
        os.makedirs(output_dir, exist_ok=True)
        # Generate a timestamped filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'baijiahao_articles_{app_id}_{timestamp}.xlsx'
        filepath = os.path.join(output_dir, filename)
        # Save the Excel file
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='文章列表')
            # Adjust column widths
            worksheet = writer.sheets['文章列表']
            worksheet.column_dimensions['A'].width = 80  # title column
            worksheet.column_dimensions['B'].width = 20  # time column
        return jsonify({
            'success': True,
            'message': f'成功导出{len(articles)}篇文章',
            'filename': filename,
            'count': len(articles),
            'articles': articles[:100]  # first 100 articles for preview
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'导出失败: {str(e)}'})


@app.route('/api/download/<filename>')
@login_required
def download_file(filename):
    """Download an exported Excel file by name."""
    try:
        filepath = os.path.join(os.path.dirname(__file__), 'exports', filename)
        if os.path.exists(filepath):
            return send_file(filepath, as_attachment=True, download_name=filename)
        else:
            return jsonify({'success': False, 'message': '文件不存在'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})


# ==================== Task-queue API ====================

@app.route('/api/queue/add', methods=['POST'])
@login_required
def add_task_to_queue():
    """Add a scraping task to the queue."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        months = data.get('months', 6)
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # articles only (skip videos)?
        if not url:
            return jsonify({'success': False, 'message': 'URL不能为空'})
        # Enqueue the task
        queue = get_task_queue()
        task_id = queue.add_task(
            url=url,
            months=months,
            use_proxy=use_proxy,
            proxy_api_url=proxy_api_url if proxy_api_url else None,
            username=session.get('username'),
            articles_only=articles_only
        )
        return jsonify({
            'success': True,
            'message': '任务已添加到队列',
            'task_id': task_id
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'添加任务失败: {str(e)}'})


@app.route('/api/queue/tasks', methods=['GET'])
@login_required
def get_tasks():
    """List the current user's tasks, newest first."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        # Fetch all of this user's tasks
        tasks = queue.get_all_tasks(username=username)
        # Sort by creation time, newest first
        tasks.sort(key=lambda x: x.get('created_at', ''), reverse=True)
        return jsonify({
            'success': True,
            'tasks': tasks
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务列表失败: {str(e)}'})


@app.route('/api/queue/task/<task_id>', methods=['GET'])
@login_required
def get_task_detail(task_id):
    """Fetch details for one task (owner only)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务'})
        return jsonify({
            'success': True,
            'task': task
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务详情失败: {str(e)}'})


@app.route('/api/queue/stats', methods=['GET'])
@login_required
def get_queue_stats():
    """Queue statistics for the current user."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        stats = queue.get_queue_stats(username=username)
        return jsonify({
            'success': True,
            'stats': stats
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取统计信息失败: {str(e)}'})


@app.route('/api/queue/download/<task_id>', methods=['GET'])
@login_required
def download_task_result(task_id):
    """Download a completed task's result file (owner only)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权下载此文件'})
        if task.get('status') != 'completed':
            return jsonify({'success': False, 'message': '任务未完成'})
        result_file = task.get('result_file')
        if not result_file:
            return jsonify({'success': False, 'message': '结果文件不存在'})
        filepath = os.path.join(queue.results_dir, result_file)
        if not os.path.exists(filepath):
            return jsonify({'success': False, 'message': '文件不存在'})
        return send_file(filepath, as_attachment=True, download_name=result_file)
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})


@app.route('/api/queue/task/<task_id>/delete', methods=['POST'])
@login_required
def delete_task(task_id):
    """Delete a task (cancelled automatically before deletion; owner only)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权删除此任务'})
    ok = queue.delete_task(task_id)
    return jsonify({'success': ok})


@app.route('/api/queue/task/<task_id>/cancel', methods=['POST'])
@login_required
def cancel_task(task_id):
    """Cancel a pending or processing task (owner only)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权终止此任务'})
    if task.get('status') not in ['pending', 'processing']:
        return jsonify({'success': False, 'message': '仅可终止等待中或处理中任务'})
    ok = queue.cancel_task(task_id)
    return jsonify({'success': ok})


@app.route('/api/queue/task/<task_id>/logs', methods=['GET'])
@login_required
def get_task_logs(task_id):
    """Fetch a task's historical logs from the database (owner only)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务日志'})
        # Pull logs from the database
        from database import get_database
        db = get_database()
        logs = db.get_task_logs(task_id)
        return jsonify({
            'success': True,
            'logs': logs
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取日志失败: {str(e)}'})


@app.route('/health/taskworker')
def health_taskworker():
    """TaskWorker health-check endpoint."""
    try:
        from task_worker import get_task_worker
        from task_queue import get_task_queue
        worker = get_task_worker()
        queue = get_task_queue()
        # Queue statistics
        tasks = queue.get_all_tasks()
        pending_count = len([t for t in tasks if t.get('status') == 'pending'])
        processing_count = len([t for t in tasks if t.get('status') == 'processing'])
        # Worker status
        alive_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())
        status = 'healthy' if worker.running and alive_threads > 0 else 'unhealthy'
        return jsonify({
            'status': status,
            'worker': {
                'running': worker.running,
                'alive_threads': alive_threads,
                'current_workers': worker.current_workers,
                'max_workers': worker.max_workers,
                'processing_tasks': len(worker.processing_tasks)
            },
            'queue': {
                'pending': pending_count,
                'processing': processing_count,
                'total': len(tasks)
            },
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500


if __name__ == '__main__':
    import sys

    # Verify that the critical dependencies are installed
    def check_dependencies():
        """Check that the key third-party packages are importable."""
        missing = []
        try:
            import flask
        except ImportError:
            missing.append('flask')
        try:
            import pandas
        except ImportError:
            missing.append('pandas')
        try:
            import openpyxl
        except ImportError:
            missing.append('openpyxl')
        if missing:
            print(f"\n⚠️ 缺少依赖: {', '.join(missing)}")
            print("请运行: pip install -r requirements.txt\n")
            return False
        return True

    if not check_dependencies():
        sys.exit(1)

    # Create the required directories
    os.makedirs('exports', exist_ok=True)
    os.makedirs('templates', exist_ok=True)
    os.makedirs('static/css', exist_ok=True)
    os.makedirs('static/js', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/results', exist_ok=True)

    # Start the background task processor
    print('🔧 启动任务处理器...')
    # Hand the SocketIO instance to the task worker
    from task_worker import set_socketio
    set_socketio(socketio)
    start_task_worker()
    print('✅ 任务处理器已启动')

    # Detect production mode
    is_production = os.environ.get('FLASK_ENV') == 'production'
    if is_production:
        print('✅ 生产环境启动')
        print('请使用 gunicorn 或 uwsgi 运行:')
        print(' gunicorn -w 4 -b 0.0.0.0:5001 app:app')
        print('\n如果要直接运行,请使用: python app.py --dev')
        if '--dev' not in sys.argv:
            sys.exit(1)

    print('🚀 服务器启动成功!')
    print('请访问: http://127.0.0.1:8030')
    # Development server via SocketIO
    socketio.run(
        app,
        debug=not is_production,
        host='0.0.0.0',
        port=8030,
        allow_unsafe_werkzeug=True
    )