Files
ai_baijiahao/app.py

1124 lines
41 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import wraps
import pandas as pd
import requests
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from task_queue import get_task_queue
from task_worker import start_task_worker
# Application-wide logging setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__, static_folder='static', template_folder='templates')
app.secret_key = 'your-secret-key-change-this-in-production'  # NOTE: replace with a random secret key in production
CORS(app)
# Initialize SocketIO (threading mode; open CORS for the web client).
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='threading')
# Simple in-memory user "database" (a real database should be used in production).
USERS = {
    'admin': 'admin123',  # username: password (plaintext — dev only)
}
def login_required(f):
    """Decorator that rejects requests lacking an authenticated session.

    Returns a 401 JSON envelope (with ``need_login``) when no username is
    stored in the session; otherwise forwards the call unchanged.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        if 'username' in session:
            return f(*args, **kwargs)
        return jsonify({'success': False, 'message': '请先登录', 'need_login': True}), 401
    return wrapper
class BaijiahaoScraper:
    """Scraper for a Baijiahao (百家号) author's article list.

    Talks to the mbd.baidu.com web API, optionally routing every request
    through a rotating proxy pool, and retries with a fresh proxy IP whenever
    anti-crawler countermeasures are detected.
    """

    def __init__(self, uk, cookies=None, use_proxy=False, proxy_api_url=None):
        # uk: author identifier extracted from the Baijiahao home page.
        self.uk = uk
        self.api_url = 'https://mbd.baidu.com/webpage'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Referer': 'https://baijiahao.baidu.com/',
            'Accept': '*/*',
        }
        self.session = requests.Session()
        self.session.keep_alive = False  # disable persistent connections
        if cookies:
            self.session.cookies.update(cookies)
        # Proxy-pool configuration; a default pool endpoint is used when none is given.
        self.use_proxy = use_proxy
        self.proxy_api_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
        self.current_proxy = None

    def get_proxy(self):
        """Fetch one proxy IP from the pool and store it as the current proxy.

        Returns a requests-style ``proxies`` dict; raises on pool errors or a
        malformed (non ``ip:port``) response.
        """
        try:
            print(f"正在从代理池获取IP: {self.proxy_api_url}")
            response = requests.get(self.proxy_api_url, timeout=5)  # short 5s timeout on the pool API
            content = response.content.decode("utf-8").strip()
            print(f"代理API响应: {content}")
            # A JSON body means the pool returned an error message, not an IP.
            if content.startswith('{'):
                try:
                    import json
                    error_data = json.loads(content)
                    error_msg = error_data.get('msg', '未知错误')
                    print(f"❌ 代理IP池错误: {error_msg}")
                    raise Exception(f"代理IP池错误: {error_msg}")
                except json.JSONDecodeError:
                    pass  # not actually JSON — fall through and treat it as an IP
            # Parse the "ip:port" response.
            if ':' in content:
                sj = content.strip().split(":", 1)
                sj1 = sj[0]  # IP
                sj2 = sj[1]  # port
                print(f"IP: {sj1}, 端口: {sj2}")
                # Build the requests proxies mapping (same proxy for http/https).
                proxy_meta = f"http://{sj1}:{sj2}"
                proxies = {
                    'http': proxy_meta,
                    'https': proxy_meta
                }
                self.current_proxy = proxies
                print(f"代理配置成功: {proxies}")
                return proxies
            else:
                print("代理IP格式错误")
                raise Exception(f"代理IP格式错误: {content}")
        except Exception as e:
            print(f"获取代理IP失败: {e}")
            raise

    def make_request(self, url, **kwargs):
        """GET ``url`` through the session, rotating proxy IPs on failure.

        Retries up to three times on proxy errors, HTTP 407 (pool throttling)
        or detected anti-crawler responses; raises once the retry budget is
        spent. Refuses to fall back to the local IP when proxying is enabled.
        """
        max_retries = 3  # maximum retry attempts
        retry_count = 0
        last_was_anti_crawl = False  # NOTE(review): assigned but never read — candidate for removal
        while retry_count < max_retries:
            if self.use_proxy:
                # Fetch a fresh proxy when none is held or this is a retry.
                if not self.current_proxy or retry_count > 0:
                    print(f"{'立即切换' if retry_count > 0 else ''}获取代理IP(第{retry_count + 1}次)")
                    proxy = self.get_proxy()
                    if not proxy:
                        raise Exception("启用了代理但无法获取代理IP拒绝使用本机IP")
                # A proxy is mandatory before issuing the request.
                if not self.current_proxy:
                    raise Exception("启用了代理但当前无代理IP拒绝使用本机IP")
                kwargs['proxies'] = self.current_proxy
            # Default request timeout.
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 20
            try:
                import time
                start = int(round(time.time() * 1000))
                response = self.session.get(url, **kwargs)
                cost_time = int(round(time.time() * 1000)) - start
                print(f"请求耗时: {cost_time}ms")
                # Rotate immediately if the response looks like an anti-crawler block.
                if self._check_anti_spider(response):
                    print("⚠️ 检测到反爬立即切换IP无需等待")
                    self.current_proxy = None  # drop the burned proxy
                    retry_count += 1
                    last_was_anti_crawl = True
                    if retry_count < max_retries:
                        continue  # retry right away, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截,请稍后再试")
                return response
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e}立即切换IP无需等待")
                self.current_proxy = None
                retry_count += 1
                last_was_anti_crawl = True
                if retry_count < max_retries:
                    continue  # retry right away, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is throttling us.
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误代理IP池限流等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # brief cool-down before asking for a new IP
                    if self.use_proxy:
                        print("重新获取代理IP...")
                        self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                print(f"请求失败: {e}")
                # Any other failure while proxying: rotate and retry.
                if self.use_proxy:
                    print("立即切换代理(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue  # retry right away, no backoff
                raise
        raise Exception(f"请求失败,已重试{max_retries}次")

    def _check_anti_spider(self, response):
        """Return True when the response looks like an anti-crawler block."""
        # Blocking status codes.
        if response.status_code in [403, 429, 503]:
            return True
        # JSON responses may carry an explicit is_need_foe flag.
        try:
            data = response.json()
            if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
                print("检测到is_need_foe=True需要切换IP")
                return True
        except:
            pass  # not a JSON response — fall through to the text heuristics
        # Keyword heuristics over the lower-cased body text.
        content = response.text.lower()
        anti_spider_keywords = [
            '验证码',
            'captcha',
            '请输入验证码',
            '访问频繁',
            '异常访问',
            '请稍后再试',
            'access denied',
            'forbidden',
            '安全验证',
            '人机验证'
        ]
        for keyword in anti_spider_keywords:
            if keyword in content:
                print(f"检测到反爬关键词: {keyword}")
                return True
        return False

    @staticmethod
    def get_uk_from_app_id(app_id, use_proxy=False, proxy_api_url=None):
        """Extract the ``uk`` parameter (and cookies) from the author's home page.

        Returns ``(uk, cookies)``; raises when the page cannot be fetched or
        the uk cannot be found within the retry budget.
        """
        url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        print(f"正在从主页获取uk: {url}")
        max_retries = 3  # maximum retry attempts
        retry_count = 0
        while retry_count < max_retries:
            # Per-attempt request options (a fresh proxy is fetched each attempt).
            kwargs = {'headers': headers, 'timeout': 30}
            if use_proxy:
                proxy_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
                try:
                    print(f"{'重新' if retry_count > 0 else ''}使用代理获取uk(第{retry_count + 1}次)")
                    proxy_response = requests.get(proxy_url, timeout=5)  # short 5s timeout on the pool API
                    content = proxy_response.content.decode("utf-8").strip()
                    print(f"代理API响应: {content}")
                    # A JSON body means the pool returned an error message, not an IP.
                    if content.startswith('{'):
                        try:
                            import json
                            error_data = json.loads(content)
                            error_msg = error_data.get('msg', '未知错误')
                            raise Exception(f"代理IP池错误: {error_msg}")
                        except json.JSONDecodeError:
                            pass  # not actually JSON — fall through and treat it as an IP
                    if ':' in content:
                        sj = content.strip().split(":", 1)
                        proxy_meta = f"http://{sj[0]}:{sj[1]}"
                        kwargs['proxies'] = {
                            'http': proxy_meta,
                            'https': proxy_meta
                        }
                        print(f"使用代理: {proxy_meta}")
                    else:
                        raise Exception(f"代理IP格式错误: {content}")
                except Exception as e:
                    # Proxying is mandatory when enabled — never fall back to the local IP.
                    raise Exception(f"启用了代理但获取代理IP失败拒绝使用本机IP: {e}")
            try:
                response = requests.get(url, **kwargs)
                if response.status_code != 200:
                    print(f"访问主页失败,状态码: {response.status_code}")
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"立即重试(无需等待)")
                        continue  # retry right away
                    else:
                        raise Exception(f"访问主页失败,状态码: {response.status_code}")
                # Anti-crawler detection.
                content_lower = response.text.lower()
                anti_spider_detected = False
                # Explicit is_need_foe flag inside a JSON body.
                try:
                    data = response.json()
                    if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
                        print("⚠️ 检测到is_need_foe=True需要切换IP")
                        anti_spider_detected = True
                except:
                    pass  # not JSON — fall through to the keyword heuristics
                # Keyword heuristics.
                if not anti_spider_detected:
                    anti_spider_keywords = ['验证码', 'captcha', '访问频繁', '异常访问', 'access denied']
                    for keyword in anti_spider_keywords:
                        if keyword in content_lower:
                            print(f"⚠️ 检测到反爬关键词: {keyword}")
                            anti_spider_detected = True
                            break
                if anti_spider_detected and use_proxy:
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"检测到反爬立即切换IP重试无需等待")
                        continue  # retry right away, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截")
                # Pull uk out of the page source.
                uk_match = re.search(r'"uk"\s*:\s*"([^"]+)"', response.text)
                if not uk_match:
                    # Fallback pattern.
                    uk_match = re.search(r'uk=([^&\s"]+)', response.text)
                if uk_match:
                    uk = uk_match.group(1)
                    print(f">> 成功获取UK: {uk}")
                    # Cookies set by the home page are reused for later API calls.
                    cookies = response.cookies
                    print(f">> 成功获取Cookie")
                    return uk, cookies
                else:
                    raise Exception("无法从页面中提取UK参数")
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换代理重试(无需等待)")
                retry_count += 1
                if retry_count < max_retries and use_proxy:
                    continue  # retry right away, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is throttling us.
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误代理IP池限流等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # brief cool-down before the next attempt
                    retry_count += 1
                    if retry_count < max_retries and use_proxy:
                        print("重新获取代理IP继续重试...")
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                # Generic failure: consume one retry, re-raise on the last one.
                if retry_count < max_retries - 1:
                    retry_count += 1
                    print(f"错误: {e},立即重试(无需等待)")
                    continue  # retry right away, no backoff
                else:
                    print(f"获取UK失败: {e}")
                    raise

    def get_articles(self, months=6, app_id=None, articles_only=True, task_id=None, on_page_fetched=None,
                     start_page=1, start_ctime=None):
        """Fetch the article list via the API (delegates to baidu_api.get_baidu_data_sync).

        Args:
            months: how many months back to fetch (fractions allowed, e.g. 0.33 ≈ 10 days).
            app_id: Baijiahao app_id.
            articles_only: skip video posts when True.
            task_id: task id used for database caching.
            on_page_fetched: callback invoked after each page is processed.
            start_page: first page to fetch (resume support).
            start_ctime: pagination cursor to resume from.
        Returns:
            dict: {'last_page': last page number, 'last_ctime': pagination cursor,
                   'completed': whether the run finished}
        """
        import sys
        import os
        # Make sure baidu_api.py (next to this file) is importable.
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from baidu_api import get_baidu_data_sync
        from datetime import datetime, timedelta
        # Fractional months are supported (e.g. 0.33 months ≈ 10 days).
        days = int(months * 30)
        target_date = datetime.now() - timedelta(days=days)
        print(f"\n{'=' * 50}")
        print(f"开始获取百家号作者(uk={self.uk})的文章...")
        if start_page > 1:
            print(f"🔄 断点续传:从第{start_page}页开始")
        print(f"{'=' * 50}\n")

        # Per-page handler: filters/normalizes raw items, then forwards upstream.
        def process_page(page, items, ctime):
            """Normalize one page of raw items and invoke the upstream callback."""
            processed_articles = []
            for item in items:
                item_data = item.get('itemData', {})
                # Optionally skip video posts.
                if articles_only:
                    meta_type = item_data.get('meta_type', '')
                    if meta_type == 'video':
                        print(f" ✖ 跳过视频: {item_data.get('title', '[无标题]')[:50]}...")
                        continue
                title = item_data.get('title', '')
                article_url = item_data.get('url', '')
                # Prefer the Unix timestamp in ctime — it is the most precise source.
                publish_time = '未知时间'
                if 'ctime' in item_data and item_data['ctime']:
                    try:
                        timestamp = int(item_data['ctime'])
                        article_date = datetime.fromtimestamp(timestamp)
                        publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                    except Exception as e:
                        print(f" ✗ 解析ctime失败: {e}")
                        # ctime unusable: fall back to the human-readable string.
                        time_str = item_data.get('time', '未知时间')
                        if time_str != '未知时间':
                            article_date = self._parse_article_date(time_str)
                            if article_date:
                                publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    # No ctime at all: parse the human-readable time string.
                    time_str = item_data.get('time', '未知时间')
                    if time_str != '未知时间':
                        article_date = self._parse_article_date(time_str)
                        if article_date:
                            publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                processed_articles.append({
                    '标题': title.strip().split('\n')[0][:500] if title else '无标题',
                    '链接': article_url,
                    '发布时间': publish_time
                })
            print(f"第{page}页处理完成:{len(processed_articles)}篇文章")
            # Forward the processed page to the caller-supplied callback, if any.
            if on_page_fetched:
                on_page_fetched(page, processed_articles, ctime)

        try:
            # Run the scrape, streaming pages through process_page.
            if months < 1:
                print(f"调用get_baidu_data_sync获取数据(近{days}天)...\n")
            else:
                print(f"调用get_baidu_data_sync获取数据(近{int(months)}个月)...\n")
            # Pass through proxy settings, resume cursor and the per-page callback.
            result = get_baidu_data_sync(
                uk=self.uk,
                months=months,
                use_proxy=self.use_proxy,
                proxy_api_url=self.proxy_api_url if self.proxy_api_url else None,
                on_page_fetched=process_page,
                start_page=start_page,
                start_ctime=start_ctime
            )
            if not result:
                print("\n✗ 未获取到数据")
                return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
            print(f"\n✓ 爬取完成!")
            return result
        except Exception as e:
            print(f"\n✗ 调用get_baidu_data_sync失败: {e}")
            import traceback
            traceback.print_exc()
            return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}

    def _parse_article_date(self, time_str):
        """Parse an article time string into a datetime, or None if unparseable."""
        from datetime import datetime, timedelta
        import re
        if not time_str or time_str == '未知':
            return None
        current_year = datetime.now().year
        now = datetime.now()
        # Format 1: relative times — "1天前", "2小时前", "30分钟前", "1个月前".
        if '前' in time_str:
            # days ago
            match = re.search(r'(\d+)\s*天', time_str)
            if match:
                days = int(match.group(1))
                return now - timedelta(days=days)
            # hours ago
            match = re.search(r'(\d+)\s*小时', time_str)
            if match:
                hours = int(match.group(1))
                return now - timedelta(hours=hours)
            # minutes ago
            match = re.search(r'(\d+)\s*分钟', time_str)
            if match:
                minutes = int(match.group(1))
                return now - timedelta(minutes=minutes)
            # months ago
            match = re.search(r'(\d+)\s*个?月', time_str)
            if match:
                months = int(match.group(1))
                return now - timedelta(days=months * 30)  # approximation: 30 days/month
        # Format 2: "11-29 07:23" or "11-29" (assumed to be the current year).
        match = re.match(r'(\d{1,2})-(\d{1,2})', time_str)
        if match:
            month = int(match.group(1))
            day = int(match.group(2))
            return datetime(current_year, month, day)
        # Format 3: "2024-11-29"
        match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', time_str)
        if match:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            return datetime(year, month, day)
        return None

    def get_articles_from_html(self, months=6, app_id=None):
        """Scrape articles straight from the home-page HTML (no Selenium)."""
        if not app_id:
            raise Exception("需要提供app_id")
        articles = []
        try:
            url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
            print(f"访问页面: {url}")
            headers = {
                'User-Agent': self.headers['User-Agent'],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
            }
            response = self.session.get(url, headers=headers, timeout=30)
            if response.status_code != 200:
                raise Exception(f"访问失败,状态码: {response.status_code}")
            html = response.text
            # Plain regex extraction: collect every /s?id= article link,
            # quoted with either double or single quotes.
            import re
            links = re.findall(r'href="(/s\?id=[^"]+)"', html)
            links += re.findall(r"href='(/s\?id=[^']+)'", html)
            # Try to recover a title for each (de-duplicated) link.
            for link in set(links):  # dedupe
                # Look for a title attribute adjacent to this href, in either order.
                title_pattern = f'href="{link}"[^>]*title="([^"]+)"'
                title_match = re.search(title_pattern, html)
                if not title_match:
                    title_pattern = f'title="([^"]+)"[^>]*href="{link}"'
                    title_match = re.search(title_pattern, html)
                title = title_match.group(1) if title_match else "未知标题"
                full_link = 'https://baijiahao.baidu.com' + link
                articles.append({
                    '标题': title.strip(),
                    '链接': full_link,
                    '发布时间': '未知'
                })
            print(f"成功提取 {len(articles)} 篇文章")
        except Exception as e:
            print(f"HTML解析失败: {e}")
            raise
        return articles
# ========== SocketIO event handlers ==========
@socketio.on('connect')
def handle_connect():
    """Greet a newly connected SocketIO client and log its session id."""
    sid = request.sid
    logger.info(f"客户端已连接: {sid}")
    emit('connected', {'message': '已连接到服务器'})
@socketio.on('disconnect')
def handle_disconnect():
    """Log when a SocketIO client drops the connection."""
    logger.info(f"客户端已断开: {request.sid}")
@socketio.on('subscribe_task')
def handle_subscribe_task(data):
    """Acknowledge a client's subscription to a task's progress feed."""
    tid = data.get('task_id')
    logger.info(f"客户端 {request.sid} 订阅任务: {tid}")
    emit('subscribed', {'task_id': tid})
def emit_task_log(task_id, message, level='info'):
    """Broadcast one task log line to all connected SocketIO clients."""
    payload = {
        'task_id': task_id,
        'message': message,
        'level': level,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    socketio.emit('task_log', payload)
def emit_task_progress(task_id, progress, current_step='', **kwargs):
    """Broadcast a task progress update; extra keyword args merge into the payload."""
    payload = dict(
        task_id=task_id,
        progress=progress,
        current_step=current_step,
        timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    )
    payload.update(kwargs)
    socketio.emit('task_progress', payload)
# ========== Flask routes ==========
@app.route('/')
def index():
    """Home page; unauthenticated visitors are sent to the login page."""
    if 'username' in session:
        return render_template('index.html', username=session.get('username'))
    return redirect(url_for('login_page'))
@app.route('/queue')
def queue_page():
    """Task-queue page; unauthenticated visitors are sent to the login page."""
    if 'username' in session:
        return render_template('queue.html', username=session.get('username'))
    return redirect(url_for('login_page'))
@app.route('/login')
def login_page():
    """Login page; already-authenticated users are bounced to the home page."""
    if 'username' not in session:
        return render_template('login.html')
    return redirect(url_for('index'))
@app.route('/api/login', methods=['POST'])
def login():
    """Authenticate a user and store the username in the session.

    Expects a JSON body with ``username`` and ``password``; returns a JSON
    envelope with ``success`` and a human-readable ``message``.
    """
    try:
        data = request.get_json()
        username = data.get('username', '').strip()
        password = data.get('password', '').strip()
        if not username or not password:
            return jsonify({'success': False, 'message': '请输入用户名和密码'})
        # Constant-time comparison so the check does not leak password
        # length/prefix information through response timing.
        import hmac
        expected = USERS.get(username)
        if expected is not None and hmac.compare_digest(expected, password):
            session['username'] = username
            return jsonify({'success': True, 'message': '登录成功'})
        return jsonify({'success': False, 'message': '用户名或密码错误'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'登录失败: {str(e)}'})
@app.route('/api/logout', methods=['POST'])
def logout():
    """Log the current user out by removing the username from the session."""
    session.pop('username', None)
    return jsonify({'success': True, 'message': '已登出'})
@app.route('/api/export', methods=['POST'])
@login_required
def export_articles():
    """Scrape a Baijiahao author's articles and export them to an Excel file.

    JSON body: ``url`` (author home page containing app_id), ``cookies``
    (optional raw Cookie header string), ``months`` (lookback window,
    default 6), ``use_proxy``/``proxy_api_url`` (proxy-pool settings),
    ``articles_only`` (skip videos when True).
    """
    try:
        data = request.get_json()
        url = data.get('url', '')
        cookies_str = data.get('cookies', '')  # raw Cookie string supplied by the front end
        months = data.get('months', 6)  # lookback window, defaults to 6 months
        # Extract app_id from the URL; it is later resolved into the uk parameter.
        app_id_match = re.search(r'app_id=(\d+)', url)
        if not app_id_match:
            return jsonify({'success': False, 'message': 'URL格式不正确无法提取app_id'})
        app_id = app_id_match.group(1)
        print(f"开始导出app_id={app_id}")
        print(f"Cookie长度: {len(cookies_str) if cookies_str else 0}")
        # Proxy settings.
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # whether to skip video posts
        # Resolve uk and cookies from the author's home page.
        uk, auto_cookies = BaijiahaoScraper.get_uk_from_app_id(app_id, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        if not uk:
            return jsonify({'success': False, 'message': '无法获取用户UK请检查URL是否正确'})
        print(f"成功获取uk={uk}")
        # Prefer caller-supplied cookies when present.
        if cookies_str:
            print("使用用户提供的Cookie")
            # Parse the raw "k=v; k2=v2" cookie string.
            cookies_dict = {}
            for item in cookies_str.split(';'):
                item = item.strip()
                if '=' in item:
                    key, value = item.split('=', 1)
                    cookies_dict[key.strip()] = value.strip()
            # Convert to a requests cookie jar.
            from requests.cookies import cookiejar_from_dict
            user_cookies = cookiejar_from_dict(cookies_dict)
            scraper = BaijiahaoScraper(uk, user_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        else:
            print("使用自动获取的Cookie")
            scraper = BaijiahaoScraper(uk, auto_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        # Fetch articles via the API path (proxy pool + API only).
        print(f"使用 API 方式获取文章(近{months}个月)...")
        try:
            # NOTE(review): get_articles' docstring says it returns a status dict
            # ({'last_page', 'last_ctime', 'completed'}), yet this code treats the
            # result as a list of article dicts — confirm which contract is current.
            articles = scraper.get_articles(months=months, app_id=app_id, articles_only=articles_only)
        except Exception as e:
            print(f"API 方式失败: {e}")
            articles = []
        if not articles:
            return jsonify({
                'success': False,
                'message': (
                    '未能获取到文章数据。\n\n'
                    '请确保:\n'
                    '1. URL正确且该作者有发布过文章\n'
                    '2. 网络连接正常\n'
                    '3. 如需使用代理请配置代理IP池'
                )
            })
        # Build the Excel workbook.
        df = pd.DataFrame(articles)
        # Ensure the output directory exists.
        output_dir = os.path.join(os.path.dirname(__file__), 'exports')
        os.makedirs(output_dir, exist_ok=True)
        # Timestamped file name.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'baijiahao_articles_{app_id}_{timestamp}.xlsx'
        filepath = os.path.join(output_dir, filename)
        # Write the sheet and widen the title/time columns.
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='文章列表')
            worksheet = writer.sheets['文章列表']
            worksheet.column_dimensions['A'].width = 80  # title column
            worksheet.column_dimensions['B'].width = 20  # time column
        return jsonify({
            'success': True,
            'message': f'成功导出{len(articles)}篇文章',
            'filename': filename,
            'count': len(articles),
            'articles': articles[:100]  # first 100 articles for preview
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'导出失败: {str(e)}'})
@app.route('/api/download/<filename>')
@login_required
def download_file(filename):
    """Download a previously exported Excel file from the exports directory.

    The filename comes from the URL, so any directory components are stripped
    to keep a crafted value from escaping the exports directory.
    """
    try:
        # Path-traversal hardening: keep only the final path component.
        safe_name = os.path.basename(filename)
        filepath = os.path.join(os.path.dirname(__file__), 'exports', safe_name)
        if os.path.exists(filepath):
            return send_file(filepath, as_attachment=True, download_name=safe_name)
        else:
            return jsonify({'success': False, 'message': '文件不存在'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
# ==================== Task-queue API ====================
@app.route('/api/queue/add', methods=['POST'])
@login_required
def add_task_to_queue():
    """Validate the request payload and enqueue a new scrape task."""
    try:
        payload = request.get_json()
        url = payload.get('url', '')
        if not url:
            return jsonify({'success': False, 'message': 'URL不能为空'})
        months = payload.get('months', 6)
        use_proxy = payload.get('use_proxy', False)
        proxy_api_url = payload.get('proxy_api_url', '')
        articles_only = payload.get('articles_only', True)  # whether to skip video posts
        # Hand the task to the shared queue under the current user's name.
        task_id = get_task_queue().add_task(
            url=url,
            months=months,
            use_proxy=use_proxy,
            proxy_api_url=proxy_api_url if proxy_api_url else None,
            username=session.get('username'),
            articles_only=articles_only
        )
        return jsonify({
            'success': True,
            'message': '任务已添加到队列',
            'task_id': task_id
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'添加任务失败: {str(e)}'})
@app.route('/api/queue/tasks', methods=['GET'])
@login_required
def get_tasks():
    """List the current user's tasks, newest first."""
    try:
        current_user = session.get('username')
        user_tasks = get_task_queue().get_all_tasks(username=current_user)
        # Newest first by creation timestamp.
        user_tasks.sort(key=lambda t: t.get('created_at', ''), reverse=True)
        return jsonify({
            'success': True,
            'tasks': user_tasks
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务列表失败: {str(e)}'})
@app.route('/api/queue/task/<task_id>', methods=['GET'])
@login_required
def get_task_detail(task_id):
    """Return full details of one task owned by the current user."""
    try:
        task = get_task_queue().get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check: users may only inspect their own tasks.
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务'})
        return jsonify({
            'success': True,
            'task': task
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务详情失败: {str(e)}'})
@app.route('/api/queue/stats', methods=['GET'])
@login_required
def get_queue_stats():
    """Return queue statistics scoped to the current user."""
    try:
        current_user = session.get('username')
        stats = get_task_queue().get_queue_stats(username=current_user)
        return jsonify({
            'success': True,
            'stats': stats
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取统计信息失败: {str(e)}'})
@app.route('/api/queue/download/<task_id>', methods=['GET'])
@login_required
def download_task_result(task_id):
    """Serve the Excel result of a completed task owned by the current user."""
    try:
        task_queue = get_task_queue()
        task = task_queue.get_task(task_id)
        # Guard clauses: existence, ownership, completion, file presence.
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权下载此文件'})
        if task.get('status') != 'completed':
            return jsonify({'success': False, 'message': '任务未完成'})
        result_file = task.get('result_file')
        if not result_file:
            return jsonify({'success': False, 'message': '结果文件不存在'})
        result_path = os.path.join(task_queue.results_dir, result_file)
        if not os.path.exists(result_path):
            return jsonify({'success': False, 'message': '文件不存在'})
        return send_file(result_path, as_attachment=True, download_name=result_file)
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
@app.route('/api/queue/task/<task_id>/delete', methods=['POST'])
@login_required
def delete_task(task_id):
    """Delete a task owned by the current user (terminated first if running)."""
    task_queue = get_task_queue()
    task = task_queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    # Ownership check before any destructive action.
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权删除此任务'})
    return jsonify({'success': task_queue.delete_task(task_id)})
@app.route('/api/queue/task/<task_id>/cancel', methods=['POST'])
@login_required
def cancel_task(task_id):
    """Cancel one of the current user's tasks while it is pending or running."""
    task_queue = get_task_queue()
    task = task_queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    # Ownership check before cancelling.
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权终止此任务'})
    # Only queued or in-flight tasks can be cancelled.
    if task.get('status') not in ('pending', 'processing'):
        return jsonify({'success': False, 'message': '仅可终止等待中或处理中任务'})
    return jsonify({'success': task_queue.cancel_task(task_id)})
@app.route('/api/queue/task/<task_id>/logs', methods=['GET'])
@login_required
def get_task_logs(task_id):
    """Return the persisted log lines for one of the current user's tasks."""
    try:
        task = get_task_queue().get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check: logs are private to the task's owner.
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务日志'})
        # Logs live in the database, not the in-memory queue.
        from database import get_database
        history = get_database().get_task_logs(task_id)
        return jsonify({
            'success': True,
            'logs': history
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取日志失败: {str(e)}'})
@app.route('/health/taskworker')
def health_taskworker():
    """Health probe reporting worker-thread liveness and queue depth."""
    try:
        from task_worker import get_task_worker
        from task_queue import get_task_queue
        worker = get_task_worker()
        queue = get_task_queue()
        # Queue depth by status.
        all_tasks = queue.get_all_tasks()
        counts = {'pending': 0, 'processing': 0}
        for t in all_tasks:
            status = t.get('status')
            if status in counts:
                counts[status] += 1
        # Worker liveness: healthy iff running with at least one live thread.
        live_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())
        healthy = worker.running and live_threads > 0
        return jsonify({
            'status': 'healthy' if healthy else 'unhealthy',
            'worker': {
                'running': worker.running,
                'alive_threads': live_threads,
                'current_workers': worker.current_workers,
                'max_workers': worker.max_workers,
                'processing_tasks': len(worker.processing_tasks)
            },
            'queue': {
                'pending': counts['pending'],
                'processing': counts['processing'],
                'total': len(all_tasks)
            },
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
if __name__ == '__main__':
    import sys

    # Pre-flight dependency check.
    def check_dependencies():
        """Return True when the key third-party packages are importable."""
        missing = []
        try:
            import flask
        except ImportError:
            missing.append('flask')
        try:
            import pandas
        except ImportError:
            missing.append('pandas')
        try:
            import openpyxl
        except ImportError:
            missing.append('openpyxl')
        if missing:
            print(f"\n⚠️ 缺少依赖: {', '.join(missing)}")
            print("请运行: pip install -r requirements.txt\n")
            return False
        return True

    if not check_dependencies():
        sys.exit(1)
    # Create the directories the app writes to.
    os.makedirs('exports', exist_ok=True)
    os.makedirs('templates', exist_ok=True)
    os.makedirs('static/css', exist_ok=True)
    os.makedirs('static/js', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/results', exist_ok=True)
    # Start the background task worker.
    print('🔧 启动任务处理器...')
    # Hand the SocketIO instance to task_worker so it can push progress events.
    from task_worker import set_socketio
    set_socketio(socketio)
    start_task_worker()
    print('✅ 任务处理器已启动')
    # Production detection: refuse the dev server unless --dev is passed.
    is_production = os.environ.get('FLASK_ENV') == 'production'
    if is_production:
        print('✅ 生产环境启动')
        print('请使用 gunicorn 或 uwsgi 运行:')
        print(' gunicorn -w 4 -b 0.0.0.0:5001 app:app')
        print('\n如果要直接运行,请使用: python app.py --dev')
        if '--dev' not in sys.argv:
            sys.exit(1)
    print('🚀 服务器启动成功!')
    print('请访问: http://127.0.0.1:8030')
    # Development: run under the SocketIO-aware server.
    socketio.run(
        app,
        debug=not is_production,
        host='0.0.0.0',
        port=8030,
        allow_unsafe_werkzeug=True
    )