# ai_baijiahao/app.py
# -*- coding: utf-8 -*-
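"""Flask web app for exporting Baijiahao (百家号) author articles to Excel.

Combines a login-gated UI, a SocketIO progress channel, a background task
queue (task_queue / task_worker), and a scraper that talks to Baidu's mbd
webpage API through a rotating proxy pool.
"""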
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import wraps
import pandas as pd
import requests
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from task_queue import get_task_queue
from task_worker import start_task_worker
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__, static_folder='static', template_folder='templates')
app.secret_key = 'your-secret-key-change-this-in-production'  # change to a random key in production
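# A minimal sketch of loading a real key instead (assumes a SECRET_KEY env var is set):
#   import secrets
#   app.secret_key = os.environ.get('SECRET_KEY') or secrets.token_hex(32)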
CORS(app)
# Initialise SocketIO
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='threading')

# Simple in-memory user store (use a real database in production)
USERS = {
    'admin': 'admin123',  # username: password
}
# Login-required decorator
def login_required(f):
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if 'username' not in session:
            return jsonify({'success': False, 'message': '请先登录', 'need_login': True}), 401
        return f(*args, **kwargs)
    return decorated_function

class BaijiahaoScraper:
    def __init__(self, uk, cookies=None, use_proxy=False, proxy_api_url=None):
        self.uk = uk
        self.api_url = 'https://mbd.baidu.com/webpage'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Referer': 'https://baijiahao.baidu.com/',
            'Accept': '*/*',
        }
        self.session = requests.Session()
        self.session.keep_alive = False  # disable keep-alive connections
        if cookies:
            self.session.cookies.update(cookies)
        # Proxy configuration
        self.use_proxy = use_proxy
        self.proxy_api_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
        self.current_proxy = None
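
    # The proxy pool endpoint is expected to return one "ip:port" pair as plain
    # text, e.g. "112.85.10.1:4215" (illustrative value); a JSON body such as
    # {"msg": "..."} is treated as a pool-side error, per the parsing below.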
    def get_proxy(self):
        """Fetch one proxy IP from the pool."""
        try:
            print(f"正在从代理池获取IP: {self.proxy_api_url}")
            response = requests.get(self.proxy_api_url, timeout=5)  # tightened timeout: 5s
            content = response.content.decode("utf-8").strip()
            print(f"代理API响应: {content}")
            # A JSON body means the pool returned an error
            if content.startswith('{'):
                try:
                    error_data = json.loads(content)
                    error_msg = error_data.get('msg', '未知错误')
                    print(f"❌ 代理IP池错误: {error_msg}")
                    raise Exception(f"代理IP池错误: {error_msg}")
                except json.JSONDecodeError:
                    pass  # not JSON after all; fall through and treat it as an IP
            # Parse "ip:port"
            if ':' in content:
                ip, port = content.strip().split(":", 1)
                print(f"IP: {ip}, 端口: {port}")
                # Build the proxy configuration
                proxy_meta = f"http://{ip}:{port}"
                proxies = {
                    'http': proxy_meta,
                    'https': proxy_meta
                }
                self.current_proxy = proxies
                print(f"代理配置成功: {proxies}")
                return proxies
            else:
                print("代理IP格式错误")
                raise Exception(f"代理IP格式错误: {content}")
        except Exception as e:
            print(f"获取代理IP失败: {e}")
            raise
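
    # Request policy: rotate to a fresh proxy immediately on anti-spider hits and
    # proxy errors; a 407 from the pool (rate limiting) backs off 3 seconds first.
    # The local machine's IP is never used as a fallback when use_proxy is set.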
    def make_request(self, url, **kwargs):
        """Issue a request through the proxy, rotating IPs automatically on failure."""
        max_retries = 3
        retry_count = 0
        last_was_anti_crawl = False  # whether the last retry was triggered by anti-spider blocking
        while retry_count < max_retries:
            if self.use_proxy:
                # Fetch a new proxy if we have none or the previous attempt failed
                if not self.current_proxy or retry_count > 0:
                    print(f"{'立即切换' if retry_count > 0 else ''}获取代理IP(第{retry_count + 1}次)")
                    proxy = self.get_proxy()
                    if not proxy:
                        raise Exception("启用了代理但无法获取代理IP,拒绝使用本机IP")
                # A proxy is mandatory when use_proxy is set
                if not self.current_proxy:
                    raise Exception("启用了代理但当前无代理IP,拒绝使用本机IP")
                kwargs['proxies'] = self.current_proxy
            # Default timeout
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 20
            try:
                import time
                start = int(round(time.time() * 1000))
                response = self.session.get(url, **kwargs)
                cost_time = int(round(time.time() * 1000)) - start
                print(f"请求耗时: {cost_time}ms")
                # Check for anti-spider blocking
                if self._check_anti_spider(response):
                    print("⚠️ 检测到反爬,立即切换IP(无需等待)")
                    self.current_proxy = None  # drop the current proxy
                    retry_count += 1
                    last_was_anti_crawl = True
                    if retry_count < max_retries:
                        continue  # retry immediately, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截,请稍后再试")
                return response
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换IP(无需等待)")
                self.current_proxy = None
                retry_count += 1
                last_was_anti_crawl = True
                if retry_count < max_retries:
                    continue  # retry immediately, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is rate-limiting
                if e.response and e.response.status_code == 407:
                    print("⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # back off 3 seconds
                    if self.use_proxy:
                        print("重新获取代理IP...")
                        self.current_proxy = None
                        retry_count += 1
                        if retry_count < max_retries:
                            continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                print(f"请求失败: {e}")
                # On any proxied failure, try again with a fresh proxy
                if self.use_proxy:
                    print("立即切换代理(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue  # retry immediately, no backoff
                raise
        raise Exception(f"请求失败,已重试{max_retries}次")
    def _check_anti_spider(self, response):
        """Return True if the response looks like an anti-spider block."""
        # Status codes commonly used for blocking
        if response.status_code in [403, 429, 503]:
            return True
        # Try to parse JSON and inspect the is_need_foe flag
        try:
            data = response.json()
            if data.get('data', {}).get('foe', {}).get('is_need_foe') is True:
                print("检测到is_need_foe=True,需要切换IP")
                return True
        except Exception:
            pass  # not a JSON response; fall back to inspecting the body text
        # Look for anti-spider markers in the body
        content = response.text.lower()
        anti_spider_keywords = [
            '验证码',
            'captcha',
            '请输入验证码',
            '访问频繁',
            '异常访问',
            '请稍后再试',
            'access denied',
            'forbidden',
            '安全验证',
            '人机验证'
        ]
        for keyword in anti_spider_keywords:
            if keyword in content:
                print(f"检测到反爬关键词: {keyword}")
                return True
        return False
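
    # The author's public home page embeds the uk in inline JSON, matched below as
    # "uk":"<value>", with a uk=<value> query-string fallback. Both patterns are
    # heuristic and may break if Baidu changes the page markup.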
    @staticmethod
    def get_uk_from_app_id(app_id, use_proxy=False, proxy_api_url=None):
        """Extract the uk parameter from the author's baijiahao home page."""
        url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        print(f"正在从主页获取uk: {url}")
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            # Prepare the proxy configuration
            kwargs = {'headers': headers, 'timeout': 30}
            if use_proxy:
                proxy_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
                try:
                    print(f"{'重新' if retry_count > 0 else ''}使用代理获取uk(第{retry_count + 1}次)")
                    proxy_response = requests.get(proxy_url, timeout=5)  # tightened timeout: 5s
                    content = proxy_response.content.decode("utf-8").strip()
                    print(f"代理API响应: {content}")
                    # A JSON body means the pool returned an error
                    if content.startswith('{'):
                        try:
                            error_data = json.loads(content)
                            error_msg = error_data.get('msg', '未知错误')
                            raise Exception(f"代理IP池错误: {error_msg}")
                        except json.JSONDecodeError:
                            pass  # not JSON after all; treat it as an IP
                    if ':' in content:
                        ip, port = content.strip().split(":", 1)
                        proxy_meta = f"http://{ip}:{port}"
                        kwargs['proxies'] = {
                            'http': proxy_meta,
                            'https': proxy_meta
                        }
                        print(f"使用代理: {proxy_meta}")
                    else:
                        raise Exception(f"代理IP格式错误: {content}")
                except Exception as e:
                    # Proxying was requested but unavailable: never fall back to the local IP
                    raise Exception(f"启用了代理但获取代理IP失败,拒绝使用本机IP: {e}")
            try:
                response = requests.get(url, **kwargs)
                if response.status_code != 200:
                    print(f"访问主页失败,状态码: {response.status_code}")
                    retry_count += 1
                    if retry_count < max_retries:
                        print("立即重试(无需等待)")
                        continue  # retry immediately
                    else:
                        raise Exception(f"访问主页失败,状态码: {response.status_code}")
                # Check for anti-spider blocking
                content_lower = response.text.lower()
                anti_spider_detected = False
                # Inspect the is_need_foe flag
                try:
                    data = response.json()
                    if data.get('data', {}).get('foe', {}).get('is_need_foe') is True:
                        print("⚠️ 检测到is_need_foe=True,需要切换IP")
                        anti_spider_detected = True
                except Exception:
                    pass  # not a JSON response; fall back to keyword checks
                # Keyword checks
                if not anti_spider_detected:
                    anti_spider_keywords = ['验证码', 'captcha', '访问频繁', '异常访问', 'access denied']
                    for keyword in anti_spider_keywords:
                        if keyword in content_lower:
                            print(f"⚠️ 检测到反爬关键词: {keyword}")
                            anti_spider_detected = True
                            break
                if anti_spider_detected and use_proxy:
                    retry_count += 1
                    if retry_count < max_retries:
                        print("检测到反爬,立即切换IP重试(无需等待)")
                        continue  # retry immediately, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截")
                # Extract uk with a regex
                uk_match = re.search(r'"uk"\s*:\s*"([^"]+)"', response.text)
                if not uk_match:
                    # Fall back to a query-string pattern
                    uk_match = re.search(r'uk=([^&\s"]+)', response.text)
                if uk_match:
                    uk = uk_match.group(1)
                    print(f">> 成功获取UK: {uk}")
                    # Capture the cookies handed out with the page
                    cookies = response.cookies
                    print(">> 成功获取Cookie")
                    return uk, cookies
                else:
                    raise Exception("无法从页面中提取UK参数")
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换代理重试(无需等待)")
                retry_count += 1
                if retry_count < max_retries and use_proxy:
                    continue  # retry immediately, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is rate-limiting
                if e.response and e.response.status_code == 407:
                    print("⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # back off 3 seconds
                    retry_count += 1
                    if retry_count < max_retries and use_proxy:
                        print("重新获取代理IP,继续重试...")
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                if retry_count < max_retries - 1:
                    retry_count += 1
                    print(f"错误: {e},立即重试(无需等待)")
                    continue  # retry immediately, no backoff
                else:
                    print(f"获取UK失败: {e}")
                    raise
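
    # Per-page callback contract used by get_articles (names from this file):
    #   on_page_fetched(page, processed_articles, ctime)
    # where processed_articles is a list of {'标题', '链接', '发布时间'} dicts and
    # ctime is the opaque pagination cursor used for checkpoint/resume.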
    def get_articles(self, months=6, app_id=None, articles_only=True, task_id=None, on_page_fetched=None,
                     start_page=1, start_ctime=None):
        """Fetch the article list via the API (delegates to get_baidu_data_sync in baidu_api.py).

        Args:
            months: how many recent months of data to fetch
            app_id: Baijiahao app_id
            articles_only: skip videos and keep only articles
            task_id: task ID, used for database caching
            on_page_fetched: callback invoked after each page is fetched
            start_page: starting page number (checkpoint resume)
            start_ctime: starting pagination cursor (checkpoint resume)

        Returns:
            dict: {'last_page': last page number, 'last_ctime': pagination cursor, 'completed': whether finished}
        """
        import sys
        import os
        # Import the synchronous fetcher from baidu_api.py
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from baidu_api import get_baidu_data_sync
        from datetime import datetime, timedelta
        # Fractional months are supported (e.g. 0.33 months ≈ 10 days)
        days = int(months * 30)
        target_date = datetime.now() - timedelta(days=days)
        print(f"\n{'=' * 50}")
        print(f"开始获取百家号作者(uk={self.uk})的文章...")
        if start_page > 1:
            print(f"🔄 断点续传:从第{start_page}页开始")
        print(f"{'=' * 50}\n")

        def process_page(page, items, ctime):
            """Process one page of raw items: extract articles and forward them upstream."""
            processed_articles = []
            for item in items:
                item_data = item.get('itemData', {})
                # With articles_only enabled, filter out video content
                if articles_only:
                    meta_type = item_data.get('meta_type', '')
                    if meta_type == 'video':
                        print(f" ✖ 跳过视频: {item_data.get('title', '[无标题]')[:50]}...")
                        continue
                title = item_data.get('title', '')
                article_url = item_data.get('url', '')
                # Prefer ctime (a Unix timestamp); it is more precise
                publish_time = '未知时间'
                if 'ctime' in item_data and item_data['ctime']:
                    try:
                        timestamp = int(item_data['ctime'])
                        article_date = datetime.fromtimestamp(timestamp)
                        publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                    except Exception as e:
                        print(f" ✗ 解析ctime失败: {e}")
                        # Fall back to the human-readable time string
                        time_str = item_data.get('time', '未知时间')
                        if time_str != '未知时间':
                            article_date = self._parse_article_date(time_str)
                            if article_date:
                                publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    time_str = item_data.get('time', '未知时间')
                    if time_str != '未知时间':
                        article_date = self._parse_article_date(time_str)
                        if article_date:
                            publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                processed_articles.append({
                    '标题': title.strip().split('\n')[0][:500] if title else '无标题',
                    '链接': article_url,
                    '发布时间': publish_time
                })
            print(f"第{page}页处理完成:{len(processed_articles)}篇文章")
            # Forward to the upstream callback, if any
            if on_page_fetched:
                on_page_fetched(page, processed_articles, ctime)

        try:
            # Run the fetcher with the per-page callback
            if months < 1:
                print(f"调用get_baidu_data_sync获取数据(近{days}天)...\n")
            else:
                print(f"调用get_baidu_data_sync获取数据(近{int(months)}个月)...\n")
            # Pass through the proxy settings and the callback
            result = get_baidu_data_sync(
                uk=self.uk,
                months=months,
                use_proxy=self.use_proxy,
                proxy_api_url=self.proxy_api_url if self.proxy_api_url else None,
                on_page_fetched=process_page,
                start_page=start_page,
                start_ctime=start_ctime
            )
            if not result:
                print("\n✗ 未获取到数据")
                return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
            print("\n✓ 爬取完成!")
            return result
        except Exception as e:
            print(f"\n✗ 调用get_baidu_data_sync失败: {e}")
            import traceback
            traceback.print_exc()
            return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
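
    # _parse_article_date accepts, per the branches below:
    #   relative times  "1天前" / "2小时前" / "30分钟前" / "1个月前"
    #   short dates     "11-29" or "11-29 07:23" (assumed to be the current year)
    #   full dates      "2024-11-29"
    # Anything else returns None.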
    def _parse_article_date(self, time_str):
        """Parse an article time string into a datetime, or None if unrecognised."""
        from datetime import datetime, timedelta
        import re
        if not time_str or time_str == '未知':
            return None
        current_year = datetime.now().year
        now = datetime.now()
        # Format 1: relative times such as "1天前", "2小时前", "30分钟前", "1个月前"
        if '前' in time_str:
            # days ago
            match = re.search(r'(\d+)\s*天', time_str)
            if match:
                days = int(match.group(1))
                return now - timedelta(days=days)
            # hours ago
            match = re.search(r'(\d+)\s*小时', time_str)
            if match:
                hours = int(match.group(1))
                return now - timedelta(hours=hours)
            # minutes ago
            match = re.search(r'(\d+)\s*分钟', time_str)
            if match:
                minutes = int(match.group(1))
                return now - timedelta(minutes=minutes)
            # months ago
            match = re.search(r'(\d+)\s*个?月', time_str)
            if match:
                months = int(match.group(1))
                return now - timedelta(days=months * 30)  # approximation
        # Format 2: "11-29 07:23" or "11-29"
        match = re.match(r'(\d{1,2})-(\d{1,2})', time_str)
        if match:
            month = int(match.group(1))
            day = int(match.group(2))
            return datetime(current_year, month, day)
        # Format 3: "2024-11-29"
        match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', time_str)
        if match:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            return datetime(year, month, day)
        return None
    def get_articles_from_html(self, months=6, app_id=None):
        """Fetch articles by parsing the HTML directly (no Selenium dependency)."""
        if not app_id:
            raise Exception("需要提供app_id")
        articles = []
        try:
            url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
            print(f"访问页面: {url}")
            headers = {
                'User-Agent': self.headers['User-Agent'],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
            }
            response = self.session.get(url, headers=headers, timeout=30)
            if response.status_code != 200:
                raise Exception(f"访问失败,状态码: {response.status_code}")
            html = response.text
            # Extract articles with plain regex matching:
            # find every link of the form /s?id=...
            links = re.findall(r'href="(/s\?id=[^"]+)"', html)
            links += re.findall(r"href='(/s\?id=[^']+)'", html)
            # Try to recover a title for each link; escape the link, since it
            # contains regex metacharacters such as '?' and '&'
            for link in set(links):  # de-duplicate
                escaped = re.escape(link)
                title_pattern = f'href="{escaped}"[^>]*title="([^"]+)"'
                title_match = re.search(title_pattern, html)
                if not title_match:
                    title_pattern = f'title="([^"]+)"[^>]*href="{escaped}"'
                    title_match = re.search(title_pattern, html)
                title = title_match.group(1) if title_match else "未知标题"
                full_link = 'https://baijiahao.baidu.com' + link
                articles.append({
                    '标题': title.strip(),
                    '链接': full_link,
                    '发布时间': '未知'
                })
            print(f"成功提取 {len(articles)} 篇文章")
        except Exception as e:
            print(f"HTML解析失败: {e}")
            raise
        return articles
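
# Typical scraper usage (a sketch; the app_id value is illustrative):
#   uk, cookies = BaijiahaoScraper.get_uk_from_app_id('1234567890')
#   scraper = BaijiahaoScraper(uk, cookies)
#   result = scraper.get_articles(months=6, app_id='1234567890')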
# ========== SocketIO event handlers ==========
@socketio.on('connect')
def handle_connect():
    """Fired when a client connects."""
    logger.info(f"客户端已连接: {request.sid}")
    emit('connected', {'message': '已连接到服务器'})

@socketio.on('disconnect')
def handle_disconnect():
    """Fired when a client disconnects."""
    logger.info(f"客户端已断开: {request.sid}")

@socketio.on('subscribe_task')
def handle_subscribe_task(data):
    """A client subscribes to progress updates for a task."""
    task_id = data.get('task_id')
    logger.info(f"客户端 {request.sid} 订阅任务: {task_id}")
    emit('subscribed', {'task_id': task_id})

def emit_task_log(task_id, message, level='info'):
    """Push a task log line to the frontend."""
    socketio.emit('task_log', {
        'task_id': task_id,
        'message': message,
        'level': level,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    })

def emit_task_progress(task_id, progress, current_step='', **kwargs):
    """Push task progress to the frontend."""
    data = {
        'task_id': task_id,
        'progress': progress,
        'current_step': current_step,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    data.update(kwargs)
    socketio.emit('task_progress', data)
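
# Example 'task_log' event as received by a subscribed client (values illustrative):
#   {'task_id': 'abc123', 'message': '第1页处理完成', 'level': 'info',
#    'timestamp': '2024-01-01 12:00:00'}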
# ========== Flask routes ==========
@app.route('/')
def index():
    """Home page."""
    # Redirect to the login page if not authenticated
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('index.html', username=session.get('username'))

@app.route('/queue')
def queue_page():
    """Task queue page."""
    # Redirect to the login page if not authenticated
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('queue.html', username=session.get('username'))

@app.route('/login')
def login_page():
    """Login page."""
    # Redirect to the home page if already logged in
    if 'username' in session:
        return redirect(url_for('index'))
    return render_template('login.html')

@app.route('/api/login', methods=['POST'])
def login():
    """Login endpoint."""
    try:
        data = request.get_json()
        username = data.get('username', '').strip()
        password = data.get('password', '').strip()
        if not username or not password:
            return jsonify({'success': False, 'message': '请输入用户名和密码'})
        # Validate the credentials
        if username in USERS and USERS[username] == password:
            session['username'] = username
            return jsonify({'success': True, 'message': '登录成功'})
        else:
            return jsonify({'success': False, 'message': '用户名或密码错误'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'登录失败: {str(e)}'})

@app.route('/api/logout', methods=['POST'])
def logout():
    """Logout endpoint."""
    session.pop('username', None)
    return jsonify({'success': True, 'message': '已登出'})
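
# Example POST body for /api/export (field names mirror the reads below; values
# illustrative):
#   {"url": "https://baijiahao.baidu.com/u?app_id=1234567890", "months": 6,
#    "cookies": "", "use_proxy": false, "proxy_api_url": "", "articles_only": true}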
@app.route('/api/export', methods=['POST'])
@login_required
def export_articles():
    """Export articles to an Excel file."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        cookies_str = data.get('cookies', '')  # cookies supplied by the frontend
        months = data.get('months', 6)  # time range, default 6 months
        # Extract app_id from the URL; it is then resolved to a uk parameter
        app_id_match = re.search(r'app_id=(\d+)', url)
        if not app_id_match:
            return jsonify({'success': False, 'message': 'URL格式不正确,无法提取app_id'})
        app_id = app_id_match.group(1)
        print(f"开始导出:app_id={app_id}")
        print(f"Cookie长度: {len(cookies_str) if cookies_str else 0}")
        # Proxy settings
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # whether to skip videos
        # Resolve uk and cookies from the author's home page
        uk, auto_cookies = BaijiahaoScraper.get_uk_from_app_id(app_id, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        if not uk:
            return jsonify({'success': False, 'message': '无法获取用户UK,请检查URL是否正确'})
        print(f"成功获取uk={uk}")
        # Prefer user-provided cookies when present
        if cookies_str:
            print("使用用户提供的Cookie")
            # Parse the cookie string
            cookies_dict = {}
            for item in cookies_str.split(';'):
                item = item.strip()
                if '=' in item:
                    key, value = item.split('=', 1)
                    cookies_dict[key.strip()] = value.strip()
            # Convert into a requests cookie jar
            from requests.cookies import cookiejar_from_dict
            user_cookies = cookiejar_from_dict(cookies_dict)
            scraper = BaijiahaoScraper(uk, user_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        else:
            print("使用自动获取的Cookie")
            scraper = BaijiahaoScraper(uk, auto_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        # Fetch articles via the API (proxy pool + API is the only supported path)
        print(f"使用 API 方式获取文章(近{months}个月)...")
        try:
            articles = scraper.get_articles(months=months, app_id=app_id, articles_only=articles_only)
        except Exception as e:
            print(f"API 方式失败: {e}")
            articles = []
        if not articles:
            return jsonify({
                'success': False,
                'message': (
                    '未能获取到文章数据。\n\n'
                    '请确保:\n'
                    '1. URL正确且该作者有发布过文章\n'
                    '2. 网络连接正常\n'
                    '3. 如需使用代理请配置代理IP池'
                )
            })
        # Build the Excel file
        df = pd.DataFrame(articles)
        # Ensure the output directory exists
        output_dir = os.path.join(os.path.dirname(__file__), 'exports')
        os.makedirs(output_dir, exist_ok=True)
        # Build the file name
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'baijiahao_articles_{app_id}_{timestamp}.xlsx'
        filepath = os.path.join(output_dir, filename)
        # Write the Excel file
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='文章列表')
            # Adjust column widths
            worksheet = writer.sheets['文章列表']
            worksheet.column_dimensions['A'].width = 80  # title column
            worksheet.column_dimensions['B'].width = 20  # link column
        return jsonify({
            'success': True,
            'message': f'成功导出{len(articles)}篇文章',
            'filename': filename,
            'count': len(articles),
            'articles': articles[:100]  # first 100 articles, for preview
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'导出失败: {str(e)}'})
@app.route('/api/download/<filename>')
@login_required
def download_file(filename):
    """Download an exported Excel file."""
    try:
        filename = os.path.basename(filename)  # guard against path traversal
        filepath = os.path.join(os.path.dirname(__file__), 'exports', filename)
        if os.path.exists(filepath):
            return send_file(filepath, as_attachment=True, download_name=filename)
        else:
            return jsonify({'success': False, 'message': '文件不存在'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
# ==================== Task queue API ====================
@app.route('/api/queue/add', methods=['POST'])
@login_required
def add_task_to_queue():
    """Add a task to the queue."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        months = data.get('months', 6)
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # whether to skip videos
        if not url:
            return jsonify({'success': False, 'message': 'URL不能为空'})
        # Enqueue the task
        queue = get_task_queue()
        task_id = queue.add_task(
            url=url,
            months=months,
            use_proxy=use_proxy,
            proxy_api_url=proxy_api_url if proxy_api_url else None,
            username=session.get('username'),
            articles_only=articles_only
        )
        return jsonify({
            'success': True,
            'message': '任务已添加到队列',
            'task_id': task_id
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'添加任务失败: {str(e)}'})
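
# Task statuses referenced in this file: 'pending' -> 'processing' -> 'completed';
# cancellation is allowed while pending or processing (task_queue may define
# further states such as failed/cancelled - not visible from this file).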
@app.route('/api/queue/tasks', methods=['GET'])
@login_required
def get_tasks():
    """List the current user's tasks."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        # All tasks belonging to this user
        tasks = queue.get_all_tasks(username=username)
        # Newest first
        tasks.sort(key=lambda x: x.get('created_at', ''), reverse=True)
        return jsonify({
            'success': True,
            'tasks': tasks
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务列表失败: {str(e)}'})

@app.route('/api/queue/task/<task_id>', methods=['GET'])
@login_required
def get_task_detail(task_id):
    """Fetch one task's details."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务'})
        return jsonify({
            'success': True,
            'task': task
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务详情失败: {str(e)}'})

@app.route('/api/queue/stats', methods=['GET'])
@login_required
def get_queue_stats():
    """Queue statistics."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        stats = queue.get_queue_stats(username=username)
        return jsonify({
            'success': True,
            'stats': stats
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取统计信息失败: {str(e)}'})

@app.route('/api/queue/download/<task_id>', methods=['GET'])
@login_required
def download_task_result(task_id):
    """Download a task's result file (looked up by task ID)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权下载此文件'})
        if task.get('status') != 'completed':
            return jsonify({'success': False, 'message': '任务未完成'})
        result_file = task.get('result_file')
        if not result_file:
            return jsonify({'success': False, 'message': '结果文件不存在'})
        filepath = os.path.join(queue.results_dir, result_file)
        if not os.path.exists(filepath):
            return jsonify({'success': False, 'message': '文件不存在'})
        return send_file(filepath, as_attachment=True, download_name=result_file)
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})

@app.route('/api/queue/task/<task_id>/delete', methods=['POST'])
@login_required
def delete_task(task_id):
    """Delete a task (it is terminated automatically before deletion)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权删除此任务'})
    ok = queue.delete_task(task_id)
    return jsonify({'success': ok})

@app.route('/api/queue/task/<task_id>/cancel', methods=['POST'])
@login_required
def cancel_task(task_id):
    """Cancel a task (pending or processing only)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权终止此任务'})
    if task.get('status') not in ['pending', 'processing']:
        return jsonify({'success': False, 'message': '仅可终止等待中或处理中任务'})
    ok = queue.cancel_task(task_id)
    return jsonify({'success': ok})

@app.route('/api/queue/task/<task_id>/logs', methods=['GET'])
@login_required
def get_task_logs(task_id):
    """Fetch a task's historical logs."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务日志'})
        # Load the logs from the database
        from database import get_database
        db = get_database()
        logs = db.get_task_logs(task_id)
        return jsonify({
            'success': True,
            'logs': logs
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取日志失败: {str(e)}'})
@app.route('/health/taskworker')
def health_taskworker():
    """TaskWorker health check endpoint."""
    try:
        from task_worker import get_task_worker
        from task_queue import get_task_queue
        worker = get_task_worker()
        queue = get_task_queue()
        # Queue statistics
        tasks = queue.get_all_tasks()
        pending_count = len([t for t in tasks if t.get('status') == 'pending'])
        processing_count = len([t for t in tasks if t.get('status') == 'processing'])
        # Worker status
        alive_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())
        status = 'healthy' if worker.running and alive_threads > 0 else 'unhealthy'
        return jsonify({
            'status': status,
            'worker': {
                'running': worker.running,
                'alive_threads': alive_threads,
                'current_workers': worker.current_workers,
                'max_workers': worker.max_workers,
                'processing_tasks': len(worker.processing_tasks)
            },
            'queue': {
                'pending': pending_count,
                'processing': processing_count,
                'total': len(tasks)
            },
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
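
# Example healthy response (values illustrative):
#   {"status": "healthy",
#    "worker": {"running": true, "alive_threads": 2, "current_workers": 2,
#               "max_workers": 4, "processing_tasks": 1},
#    "queue": {"pending": 3, "processing": 1, "total": 20},
#    "timestamp": "2024-01-01 12:00:00"}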
if __name__ == '__main__':
    import sys

    def check_dependencies():
        """Verify that the key dependencies are installed."""
        missing = []
        try:
            import flask
        except ImportError:
            missing.append('flask')
        try:
            import pandas
        except ImportError:
            missing.append('pandas')
        try:
            import openpyxl
        except ImportError:
            missing.append('openpyxl')
        if missing:
            print(f"\n⚠️ 缺少依赖: {', '.join(missing)}")
            print("请运行: pip install -r requirements.txt\n")
            return False
        return True

    if not check_dependencies():
        sys.exit(1)
    # Create the required directories
    os.makedirs('exports', exist_ok=True)
    os.makedirs('templates', exist_ok=True)
    os.makedirs('static/css', exist_ok=True)
    os.makedirs('static/js', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/results', exist_ok=True)
    # Start the task worker
    print('🔧 启动任务处理器...')
    # Hand the SocketIO instance to task_worker
    from task_worker import set_socketio
    set_socketio(socketio)
    start_task_worker()
    print('✅ 任务处理器已启动')
    # Detect production mode
    is_production = os.environ.get('FLASK_ENV') == 'production'
    if is_production:
        print('✅ 生产环境启动')
        print('请使用 gunicorn 或 uwsgi 运行:')
        print('  gunicorn -w 4 -b 0.0.0.0:5001 app:app')
        print('\n如果要直接运行,请使用: python app.py --dev')
        if '--dev' not in sys.argv:
            sys.exit(1)
    print('🚀 服务器启动成功!')
    print('请访问: http://127.0.0.1:8030')
    # Development: run on the SocketIO server
    socketio.run(
        app,
        debug=not is_production,
        host='0.0.0.0',
        port=8030,
        allow_unsafe_werkzeug=True
    )