# -*- coding: utf-8 -*-
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import wraps

import pandas as pd
import requests
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for
from flask_cors import CORS
from flask_socketio import SocketIO, emit

from task_queue import get_task_queue
from task_worker import start_task_worker
# Module-level application setup: logging, Flask app, CORS, SocketIO, users.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__, static_folder='static', template_folder='templates')
# Read the session secret from the environment when available; fall back to the
# historical hard-coded value so existing deployments keep working.
# NOTE(review): set SECRET_KEY in production — the fallback value is public.
app.secret_key = os.environ.get('SECRET_KEY', 'your-secret-key-change-this-in-production')
CORS(app)

# Initialize SocketIO (threading async mode, matching the thread-based worker).
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='threading')

# Simple in-memory user database (a real database should be used in production).
USERS = {
    'admin': 'admin123',  # username: password
}
# Login-required decorator for the JSON API endpoints.
def login_required(f):
    """Decorator that rejects unauthenticated API calls with a 401 JSON body."""
    @wraps(f)
    def decorated_function(*args, **kwargs):
        # A logged-in session carries a 'username' key; anything else is rejected.
        if 'username' in session:
            return f(*args, **kwargs)
        return jsonify({'success': False, 'message': '请先登录', 'need_login': True}), 401
    return decorated_function
||
class BaijiahaoScraper:
    """Scraper for a Baijiahao (百家号) author's article list.

    Wraps a ``requests.Session`` with the headers Baidu expects, optional
    caller-supplied cookies, and an optional rotating proxy pool used to
    work around anti-crawl blocks.
    """

    def __init__(self, uk, cookies=None, use_proxy=False, proxy_api_url=None):
        """Create a scraper for one author.

        Args:
            uk: author identifier used by Baidu's mbd webpage API.
            cookies: optional cookie jar/dict merged into the session.
            use_proxy: when True, all requests must go through a pool proxy.
            proxy_api_url: proxy-pool API endpoint; a built-in default is used
                when omitted.
        """
        self.uk = uk
        self.api_url = 'https://mbd.baidu.com/webpage'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Referer': 'https://baijiahao.baidu.com/',
            'Accept': '*/*',
        }
        self.session = requests.Session()
        self.session.keep_alive = False  # disable connection keep-alive
        if cookies:
            self.session.cookies.update(cookies)

        # Proxy-pool configuration
        self.use_proxy = use_proxy
        self.proxy_api_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
        # Most recently fetched proxies dict (set by get_proxy); None until then.
        self.current_proxy = None
||
def get_proxy(self):
|
||
"""从代理池获取一个代理IP"""
|
||
try:
|
||
print(f"正在从代理池获取IP: {self.proxy_api_url}")
|
||
response = requests.get(self.proxy_api_url, timeout=5) # 优化超时为5秒
|
||
content = response.content.decode("utf-8").strip()
|
||
print(f"代理API响应: {content}")
|
||
|
||
# 检查是否是JSON错误响应
|
||
if content.startswith('{'):
|
||
try:
|
||
import json
|
||
error_data = json.loads(content)
|
||
error_msg = error_data.get('msg', '未知错误')
|
||
print(f"❌ 代理IP池错误: {error_msg}")
|
||
raise Exception(f"代理IP池错误: {error_msg}")
|
||
except json.JSONDecodeError:
|
||
pass # 不是JSON,继续按IP处理
|
||
|
||
# 解析IP和端口
|
||
if ':' in content:
|
||
sj = content.strip().split(":", 1)
|
||
sj1 = sj[0] # IP
|
||
sj2 = sj[1] # 端口
|
||
print(f"IP: {sj1}, 端口: {sj2}")
|
||
|
||
# 构建代理配置
|
||
proxy_meta = f"http://{sj1}:{sj2}"
|
||
proxies = {
|
||
'http': proxy_meta,
|
||
'https': proxy_meta
|
||
}
|
||
self.current_proxy = proxies
|
||
print(f"代理配置成功: {proxies}")
|
||
return proxies
|
||
else:
|
||
print("代理IP格式错误")
|
||
raise Exception(f"代理IP格式错误: {content}")
|
||
except Exception as e:
|
||
print(f"获取代理IP失败: {e}")
|
||
raise
|
||
|
||
    def make_request(self, url, **kwargs):
        """GET *url* through the session, rotating proxy IPs on failure.

        Retries up to 3 times. On anti-crawl detection or proxy errors the
        cached proxy is discarded so the next attempt fetches a fresh IP.
        When ``use_proxy`` is set, the request is never sent from the local
        IP — a missing proxy raises instead.

        Returns:
            requests.Response on success.

        Raises:
            Exception / requests exceptions when all retries are exhausted.
        """
        max_retries = 3  # maximum number of attempts
        retry_count = 0
        last_was_anti_crawl = False  # whether the last retry was anti-crawl driven (informational; not read afterwards)

        while retry_count < max_retries:
            if self.use_proxy:
                # Fetch a new proxy when none is cached or a retry invalidated it
                if not self.current_proxy or retry_count > 0:
                    print(f"{'立即切换' if retry_count > 0 else ''}获取代理IP(第{retry_count + 1}次)")
                    proxy = self.get_proxy()
                    if not proxy:
                        raise Exception("启用了代理但无法获取代理IP,拒绝使用本机IP")

                # A proxy is mandatory — never fall back to the local IP
                if not self.current_proxy:
                    raise Exception("启用了代理但当前无代理IP,拒绝使用本机IP")
                kwargs['proxies'] = self.current_proxy

            # Default timeout
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 20

            try:
                import time
                start = int(round(time.time() * 1000))
                response = self.session.get(url, **kwargs)
                cost_time = int(round(time.time() * 1000)) - start
                print(f"请求耗时: {cost_time}ms")

                # Anti-crawl check (status code / is_need_foe flag / keyword scan)
                if self._check_anti_spider(response):
                    print("⚠️ 检测到反爬,立即切换IP(无需等待)")
                    self.current_proxy = None  # drop the current proxy
                    retry_count += 1
                    last_was_anti_crawl = True
                    if retry_count < max_retries:
                        continue  # retry immediately, no back-off
                    else:
                        raise Exception("多次重试后仍被反爬拦截,请稍后再试")

                return response
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换IP(无需等待)")
                self.current_proxy = None
                retry_count += 1
                last_was_anti_crawl = True
                if retry_count < max_retries:
                    continue  # retry immediately, no back-off
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # A 407 means the proxy pool is rate-limiting us
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # wait 3 seconds before hitting the pool again
                    if self.use_proxy:
                        print("重新获取代理IP...")
                        self.current_proxy = None
                        retry_count += 1
                        if retry_count < max_retries:
                            continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                print(f"请求失败: {e}")
                # Any other failure while proxying: rotate and retry
                if self.use_proxy:
                    print("立即切换代理(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue  # retry immediately, no back-off
                raise

        raise Exception(f"请求失败,已重试{max_retries}次")
||
def _check_anti_spider(self, response):
|
||
"""检查响应是否被反爬拦截"""
|
||
# 检查状态码
|
||
if response.status_code in [403, 429, 503]:
|
||
return True
|
||
|
||
# 尝试解析JSON响应,检查is_need_foe字段
|
||
try:
|
||
data = response.json()
|
||
if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
|
||
print("检测到is_need_foe=True,需要切换IP")
|
||
return True
|
||
except:
|
||
pass # 如果不是JSON响应,继续检查文本内容
|
||
|
||
# 检查响应内容中的反爬特征
|
||
content = response.text.lower()
|
||
anti_spider_keywords = [
|
||
'验证码',
|
||
'captcha',
|
||
'请输入验证码',
|
||
'访问频繁',
|
||
'异常访问',
|
||
'请稍后再试',
|
||
'access denied',
|
||
'forbidden',
|
||
'安全验证',
|
||
'人机验证'
|
||
]
|
||
|
||
for keyword in anti_spider_keywords:
|
||
if keyword in content:
|
||
print(f"检测到反爬关键词: {keyword}")
|
||
return True
|
||
|
||
return False
|
||
|
||
    @staticmethod
    def get_uk_from_app_id(app_id, use_proxy=False, proxy_api_url=None):
        """Extract the ``uk`` parameter (and cookies) from a Baijiahao homepage.

        Fetches ``https://baijiahao.baidu.com/u?app_id=<app_id>`` — optionally
        through a freshly fetched pool proxy — detects anti-crawl responses,
        and pulls ``uk`` out of the page with a regex.

        Returns:
            tuple: (uk string, ``requests`` cookie jar from the response).

        Raises:
            Exception: on proxy-pool failures, persistent anti-crawl blocks,
                non-200 responses, or when no uk can be found within 3 tries.
        """
        url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }

        print(f"正在从主页获取uk: {url}")

        max_retries = 3  # maximum number of attempts
        retry_count = 0

        while retry_count < max_retries:
            # Per-attempt request settings
            kwargs = {'headers': headers, 'timeout': 30}

            if use_proxy:
                proxy_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
                try:
                    print(f"{'重新' if retry_count > 0 else ''}使用代理获取uk(第{retry_count + 1}次)")
                    proxy_response = requests.get(proxy_url, timeout=5)  # short timeout for the pool API
                    content = proxy_response.content.decode("utf-8").strip()
                    print(f"代理API响应: {content}")

                    # A JSON body means the pool returned an error, not an IP
                    if content.startswith('{'):
                        try:
                            import json
                            error_data = json.loads(content)
                            error_msg = error_data.get('msg', '未知错误')
                            raise Exception(f"代理IP池错误: {error_msg}")
                        except json.JSONDecodeError:
                            pass  # not JSON; treat as a plain ip:port line

                    if ':' in content:
                        sj = content.strip().split(":", 1)
                        proxy_meta = f"http://{sj[0]}:{sj[1]}"
                        kwargs['proxies'] = {
                            'http': proxy_meta,
                            'https': proxy_meta
                        }
                        print(f"使用代理: {proxy_meta}")
                    else:
                        raise Exception(f"代理IP格式错误: {content}")
                except Exception as e:
                    # Proxy requested but unavailable — never fall back to the local IP
                    raise Exception(f"启用了代理但获取代理IP失败,拒绝使用本机IP: {e}")

            try:
                response = requests.get(url, **kwargs)

                if response.status_code != 200:
                    print(f"访问主页失败,状态码: {response.status_code}")
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"立即重试(无需等待)")
                        continue  # retry immediately
                    else:
                        raise Exception(f"访问主页失败,状态码: {response.status_code}")

                # Anti-crawl detection
                content_lower = response.text.lower()
                anti_spider_detected = False

                # Check the is_need_foe flag in a JSON body
                try:
                    data = response.json()
                    if data.get('data', {}).get('foe', {}).get('is_need_foe') == True:
                        print("⚠️ 检测到is_need_foe=True,需要切换IP")
                        anti_spider_detected = True
                except:
                    pass  # not a JSON response; fall through to keyword checks

                # Keyword scan for block pages
                if not anti_spider_detected:
                    anti_spider_keywords = ['验证码', 'captcha', '访问频繁', '异常访问', 'access denied']
                    for keyword in anti_spider_keywords:
                        if keyword in content_lower:
                            print(f"⚠️ 检测到反爬关键词: {keyword}")
                            anti_spider_detected = True
                            break

                if anti_spider_detected and use_proxy:
                    retry_count += 1
                    if retry_count < max_retries:
                        print(f"检测到反爬,立即切换IP重试(无需等待)")
                        continue  # retry immediately with a fresh proxy
                    else:
                        raise Exception("多次重试后仍被反爬拦截")

                # Extract uk with a regex
                uk_match = re.search(r'"uk"\s*:\s*"([^"]+)"', response.text)

                if not uk_match:
                    # Fallback pattern
                    uk_match = re.search(r'uk=([^&\s"]+)', response.text)

                if uk_match:
                    uk = uk_match.group(1)
                    print(f">> 成功获取UK: {uk}")

                    # Grab the cookies that came with the page
                    cookies = response.cookies
                    print(f">> 成功获取Cookie")

                    return uk, cookies
                else:
                    raise Exception("无法从页面中提取UK参数")

            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换代理重试(无需等待)")
                retry_count += 1
                if retry_count < max_retries and use_proxy:
                    continue  # retry immediately
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # A 407 means the proxy pool is rate-limiting us
                if e.response and e.response.status_code == 407:
                    print(f"⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # wait 3 seconds before hitting the pool again
                    retry_count += 1
                    if retry_count < max_retries and use_proxy:
                        print("重新获取代理IP,继续重试...")
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                if retry_count < max_retries - 1:
                    retry_count += 1
                    print(f"错误: {e},立即重试(无需等待)")
                    continue  # retry immediately
                else:
                    print(f"获取UK失败: {e}")
                    raise
||
    def get_articles(self, months=6, app_id=None, articles_only=True, task_id=None, on_page_fetched=None,
                     start_page=1, start_ctime=None):
        """Fetch the author's article list via the API (delegates to baidu_api.get_baidu_data_sync).

        Args:
            months: how many months back to fetch (fractions allowed, e.g. 0.33 ≈ 10 days)
            app_id: Baijiahao app_id
            articles_only: skip video items when True
            task_id: task ID (used for database caching)
            on_page_fetched: callback invoked after each page of processed articles
            start_page: page to resume from (checkpoint restart)
            start_ctime: pagination cursor to resume from (checkpoint restart)

        Returns:
            dict: {'last_page': last page number, 'last_ctime': pagination cursor, 'completed': whether finished}
        """
        import sys
        import os

        # Make baidu_api importable from this file's directory
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from baidu_api import get_baidu_data_sync
        from datetime import datetime, timedelta

        # Fractional months supported (e.g. 0.33 months ≈ 10 days)
        days = int(months * 30)
        # NOTE(review): target_date is computed but never read here — the date
        # cutoff presumably happens inside get_baidu_data_sync; confirm.
        target_date = datetime.now() - timedelta(days=days)

        print(f"\n{'=' * 50}")
        print(f"开始获取百家号作者(uk={self.uk})的文章...")
        if start_page > 1:
            print(f"🔄 断点续传:从第{start_page}页开始")
        print(f"{'=' * 50}\n")

        # Per-page callback: extract article rows and forward them upstream
        def process_page(page, items, ctime):
            """Turn one page of raw API items into article dicts and invoke the upstream callback."""
            processed_articles = []

            for item in items:
                item_data = item.get('itemData', {})

                # Skip video content when articles_only is enabled
                if articles_only:
                    meta_type = item_data.get('meta_type', '')
                    if meta_type == 'video':
                        print(f" ✖ 跳过视频: {item_data.get('title', '[无标题]')[:50]}...")
                        continue

                title = item_data.get('title', '')
                article_url = item_data.get('url', '')

                # Prefer the ctime Unix timestamp — more precise than the display string
                publish_time = '未知时间'
                if 'ctime' in item_data and item_data['ctime']:
                    try:
                        timestamp = int(item_data['ctime'])
                        article_date = datetime.fromtimestamp(timestamp)
                        publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                    except Exception as e:
                        print(f" ✗ 解析ctime失败: {e}")
                        # Fall back to parsing the human-readable time string
                        time_str = item_data.get('time', '未知时间')
                        if time_str != '未知时间':
                            article_date = self._parse_article_date(time_str)
                            if article_date:
                                publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    # No ctime — parse the human-readable time string instead
                    time_str = item_data.get('time', '未知时间')
                    if time_str != '未知时间':
                        article_date = self._parse_article_date(time_str)
                        if article_date:
                            publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')

                processed_articles.append({
                    '标题': title.strip().split('\n')[0][:500] if title else '无标题',
                    '链接': article_url,
                    '发布时间': publish_time
                })

            print(f"第{page}页处理完成:{len(processed_articles)}篇文章")

            # Forward to the upstream callback when one was supplied
            if on_page_fetched:
                on_page_fetched(page, processed_articles, ctime)

        try:
            # Run the scraper, streaming pages through process_page
            if months < 1:
                print(f"调用get_baidu_data_sync获取数据(近{days}天)...\n")
            else:
                print(f"调用get_baidu_data_sync获取数据(近{int(months)}个月)...\n")

            # Pass proxy settings and the page callback through
            result = get_baidu_data_sync(
                uk=self.uk,
                months=months,
                use_proxy=self.use_proxy,
                proxy_api_url=self.proxy_api_url if self.proxy_api_url else None,
                on_page_fetched=process_page,
                start_page=start_page,
                start_ctime=start_ctime
            )

            if not result:
                print("\n✗ 未获取到数据")
                return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}

            print(f"\n✓ 爬取完成!")
            return result

        except Exception as e:
            print(f"\n✗ 调用get_baidu_data_sync失败: {e}")
            import traceback
            traceback.print_exc()
            return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
||
def _parse_article_date(self, time_str):
|
||
"""解析文章时间字符串,返回datetime对象"""
|
||
from datetime import datetime, timedelta
|
||
import re
|
||
|
||
if not time_str or time_str == '未知':
|
||
return None
|
||
|
||
current_year = datetime.now().year
|
||
now = datetime.now()
|
||
|
||
# 格式1: 相对时间 "1天前", "2小时前", "30分钟前", "1个月前"
|
||
if '前' in time_str:
|
||
# 天前
|
||
match = re.search(r'(\d+)\s*天', time_str)
|
||
if match:
|
||
days = int(match.group(1))
|
||
return now - timedelta(days=days)
|
||
|
||
# 小时前
|
||
match = re.search(r'(\d+)\s*小时', time_str)
|
||
if match:
|
||
hours = int(match.group(1))
|
||
return now - timedelta(hours=hours)
|
||
|
||
# 分钟前
|
||
match = re.search(r'(\d+)\s*分钟', time_str)
|
||
if match:
|
||
minutes = int(match.group(1))
|
||
return now - timedelta(minutes=minutes)
|
||
|
||
# 月前
|
||
match = re.search(r'(\d+)\s*个?月', time_str)
|
||
if match:
|
||
months = int(match.group(1))
|
||
return now - timedelta(days=months * 30) # 近似计算
|
||
|
||
# 格式2: "11-29 07:23" 或 "11-29"
|
||
match = re.match(r'(\d{1,2})-(\d{1,2})', time_str)
|
||
if match:
|
||
month = int(match.group(1))
|
||
day = int(match.group(2))
|
||
return datetime(current_year, month, day)
|
||
|
||
# 格式3: "2024-11-29"
|
||
match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', time_str)
|
||
if match:
|
||
year = int(match.group(1))
|
||
month = int(match.group(2))
|
||
day = int(match.group(3))
|
||
return datetime(year, month, day)
|
||
|
||
return None
|
||
|
||
def get_articles_from_html(self, months=6, app_id=None):
|
||
"""直接解析HTML获取文章(不依赖Selenium)"""
|
||
if not app_id:
|
||
raise Exception("需要提供app_id")
|
||
|
||
articles = []
|
||
|
||
try:
|
||
url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
|
||
print(f"访问页面: {url}")
|
||
|
||
headers = {
|
||
'User-Agent': self.headers['User-Agent'],
|
||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||
'Accept-Language': 'zh-CN,zh;q=0.9',
|
||
'Connection': 'keep-alive',
|
||
}
|
||
|
||
response = self.session.get(url, headers=headers, timeout=30)
|
||
|
||
if response.status_code != 200:
|
||
raise Exception(f"访问失败,状态码: {response.status_code}")
|
||
|
||
html = response.text
|
||
|
||
# 使用简单的字符串查找方式提取文章
|
||
# 查找所有包含 /s?id= 的链接
|
||
import re
|
||
links = re.findall(r'href="(/s\?id=[^"]+)"', html)
|
||
links += re.findall(r"href='(/s\?id=[^']+)'", html)
|
||
|
||
# 尝试提取标题
|
||
for link in set(links): # 去重
|
||
# 尝试找到这个链接对应的title
|
||
title_pattern = f'href="{link}"[^>]*title="([^"]+)"'
|
||
title_match = re.search(title_pattern, html)
|
||
if not title_match:
|
||
title_pattern = f'title="([^"]+)"[^>]*href="{link}"'
|
||
title_match = re.search(title_pattern, html)
|
||
|
||
title = title_match.group(1) if title_match else "未知标题"
|
||
|
||
full_link = 'https://baijiahao.baidu.com' + link
|
||
articles.append({
|
||
'标题': title.strip(),
|
||
'链接': full_link,
|
||
'发布时间': '未知'
|
||
})
|
||
|
||
print(f"成功提取 {len(articles)} 篇文章")
|
||
|
||
except Exception as e:
|
||
print(f"HTML解析失败: {e}")
|
||
raise
|
||
|
||
return articles
|
||
|
||
|
||
# ========== SocketIO 事件处理 ==========
|
||
|
||
@socketio.on('connect')
def handle_connect():
    """Fired when a SocketIO client establishes a connection."""
    logger.info(f"客户端已连接: {request.sid}")
    payload = {'message': '已连接到服务器'}
    emit('connected', payload)
|
||
|
||
@socketio.on('disconnect')
def handle_disconnect():
    """Fired when a SocketIO client disconnects; logs the session id."""
    logger.info(f"客户端已断开: {request.sid}")
|
||
|
||
@socketio.on('subscribe_task')
def handle_subscribe_task(data):
    """A client asks to receive progress events for one task; acknowledge it."""
    task_id = data.get('task_id')
    logger.info(f"客户端 {request.sid} 订阅任务: {task_id}")
    ack = {'task_id': task_id}
    emit('subscribed', ack)
|
||
|
||
def emit_task_log(task_id, message, level='info'):
    """Broadcast one task log line to all connected SocketIO clients."""
    payload = {
        'task_id': task_id,
        'message': message,
        'level': level,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    socketio.emit('task_log', payload)
|
||
|
||
def emit_task_progress(task_id, progress, current_step='', **kwargs):
    """Broadcast task progress to the frontend.

    Extra keyword fields are merged last, so they may override the defaults —
    same precedence as the original dict.update call.
    """
    payload = {
        'task_id': task_id,
        'progress': progress,
        'current_step': current_step,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **kwargs,
    }
    socketio.emit('task_progress', payload)
|
||
|
||
# ========== Flask 路由 ==========
|
||
|
||
@app.route('/')
def index():
    """Home page; anonymous visitors are redirected to the login page."""
    if 'username' in session:
        return render_template('index.html', username=session.get('username'))
    # Not logged in — send to the login page.
    return redirect(url_for('login_page'))
|
||
|
||
@app.route('/queue')
def queue_page():
    """Task queue page; anonymous visitors are redirected to the login page."""
    if 'username' in session:
        return render_template('queue.html', username=session.get('username'))
    # Not logged in — send to the login page.
    return redirect(url_for('login_page'))
|
||
|
||
@app.route('/login')
def login_page():
    """Login page; already-authenticated users go straight to the home page."""
    already_logged_in = 'username' in session
    return redirect(url_for('index')) if already_logged_in else render_template('login.html')
|
||
|
||
@app.route('/api/login', methods=['POST'])
def login():
    """Login endpoint: validate username/password and store the session.

    Returns JSON {'success': bool, 'message': str}.
    """
    try:
        # silent=True plus the {} fallback guards against a missing or
        # malformed JSON body, which previously raised AttributeError
        # on None.get(...) and surfaced as a confusing error message.
        data = request.get_json(silent=True) or {}
        username = data.get('username', '').strip()
        password = data.get('password', '').strip()

        if not username or not password:
            return jsonify({'success': False, 'message': '请输入用户名和密码'})

        # Validate credentials. NOTE(review): passwords are stored and
        # compared in plain text — acceptable for the demo USERS dict only.
        if USERS.get(username) == password:
            session['username'] = username
            return jsonify({'success': True, 'message': '登录成功'})
        else:
            return jsonify({'success': False, 'message': '用户名或密码错误'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'登录失败: {str(e)}'})
|
||
|
||
@app.route('/api/logout', methods=['POST'])
def logout():
    """Logout endpoint: drop the session's username (no-op when absent)."""
    if 'username' in session:
        session.pop('username')
    return jsonify({'success': True, 'message': '已登出'})
|
||
|
||
@app.route('/api/export', methods=['POST'])
@login_required
def export_articles():
    """Scrape an author's articles and export them to an Excel file.

    Request JSON: url (author homepage containing app_id), cookies (optional
    raw Cookie header string), months (lookback window, default 6),
    use_proxy, proxy_api_url, articles_only.

    Returns JSON with the export filename, article count, and a preview of
    up to 100 articles; or {'success': False, 'message': ...} on failure.
    """
    try:
        data = request.get_json()
        url = data.get('url', '')
        cookies_str = data.get('cookies', '')  # raw Cookie header from the client
        months = data.get('months', 6)  # lookback window, default 6 months

        # Pull app_id out of the URL; uk is derived from it below
        app_id_match = re.search(r'app_id=(\d+)', url)
        if not app_id_match:
            return jsonify({'success': False, 'message': 'URL格式不正确,无法提取app_id'})

        app_id = app_id_match.group(1)
        print(f"开始导出,app_id={app_id}")
        print(f"Cookie长度: {len(cookies_str) if cookies_str else 0}")

        # Proxy settings
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # skip videos when True

        # Resolve uk (and a cookie jar) from the author's homepage
        uk, auto_cookies = BaijiahaoScraper.get_uk_from_app_id(app_id, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)

        if not uk:
            return jsonify({'success': False, 'message': '无法获取用户UK,请检查URL是否正确'})

        print(f"成功获取uk={uk}")

        # Prefer the caller-supplied Cookie over the auto-fetched one
        if cookies_str:
            print("使用用户提供的Cookie")
            # Parse "k=v; k2=v2" into a dict
            cookies_dict = {}
            for item in cookies_str.split(';'):
                item = item.strip()
                if '=' in item:
                    key, value = item.split('=', 1)
                    cookies_dict[key.strip()] = value.strip()

            # Convert to a requests cookie jar
            from requests.cookies import cookiejar_from_dict
            user_cookies = cookiejar_from_dict(cookies_dict)
            scraper = BaijiahaoScraper(uk, user_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        else:
            print("使用自动获取的Cookie")
            scraper = BaijiahaoScraper(uk, auto_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)

        # Fetch articles via the API path (proxy pool + API only)
        print(f"使用 API 方式获取文章(近{months}个月)...")
        try:
            # NOTE(review): get_articles' docstring documents a progress dict
            # return ({'last_page', 'last_ctime', 'completed'}), yet below the
            # value is consumed as a list of article dicts — confirm which
            # contract baidu_api.get_baidu_data_sync actually honours.
            articles = scraper.get_articles(months=months, app_id=app_id, articles_only=articles_only)
        except Exception as e:
            print(f"API 方式失败: {e}")
            articles = []

        if not articles:
            return jsonify({
                'success': False,
                'message': (
                    '未能获取到文章数据。\n\n'
                    '请确保:\n'
                    '1. URL正确且该作者有发布过文章\n'
                    '2. 网络连接正常\n'
                    '3. 如需使用代理,请配置代理IP池'
                )
            })

        # Build the Excel workbook
        df = pd.DataFrame(articles)

        # Make sure the output directory exists
        output_dir = os.path.join(os.path.dirname(__file__), 'exports')
        os.makedirs(output_dir, exist_ok=True)

        # Timestamped filename so repeated exports never collide
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'baijiahao_articles_{app_id}_{timestamp}.xlsx'
        filepath = os.path.join(output_dir, filename)

        # Write the Excel file
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='文章列表')

            # Column widths for readability
            worksheet = writer.sheets['文章列表']
            worksheet.column_dimensions['A'].width = 80  # title column
            worksheet.column_dimensions['B'].width = 20  # time column

        return jsonify({
            'success': True,
            'message': f'成功导出{len(articles)}篇文章',
            'filename': filename,
            'count': len(articles),
            'articles': articles[:100]  # preview of the first 100 articles
        })

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'导出失败: {str(e)}'})
|
||
|
||
@app.route('/api/download/<filename>')
@login_required
def download_file(filename):
    """Download a previously exported Excel file from the exports directory.

    Security fix: the filename comes straight from the URL, so it is reduced
    to its base name before joining — blocking path traversal attempts such
    as encoded "../" sequences escaping the exports directory.
    """
    try:
        safe_name = os.path.basename(filename)
        filepath = os.path.join(os.path.dirname(__file__), 'exports', safe_name)
        if os.path.exists(filepath):
            return send_file(filepath, as_attachment=True, download_name=safe_name)
        else:
            return jsonify({'success': False, 'message': '文件不存在'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
|
||
|
||
# ==================== 任务队列相关API ====================
|
||
|
||
@app.route('/api/queue/add', methods=['POST'])
@login_required
def add_task_to_queue():
    """Create a scraping task from the request body and enqueue it."""
    try:
        payload = request.get_json()
        url = payload.get('url', '')
        if not url:
            return jsonify({'success': False, 'message': 'URL不能为空'})

        proxy_api_url = payload.get('proxy_api_url', '')

        # Hand the task to the shared queue; the worker picks it up later.
        task_id = get_task_queue().add_task(
            url=url,
            months=payload.get('months', 6),
            use_proxy=payload.get('use_proxy', False),
            proxy_api_url=proxy_api_url if proxy_api_url else None,
            username=session.get('username'),
            articles_only=payload.get('articles_only', True),
        )

        return jsonify({
            'success': True,
            'message': '任务已添加到队列',
            'task_id': task_id,
        })

    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'添加任务失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/tasks', methods=['GET'])
@login_required
def get_tasks():
    """List the current user's tasks, newest first."""
    try:
        username = session.get('username')
        tasks = get_task_queue().get_all_tasks(username=username)

        # Newest first by creation time.
        ordered = sorted(tasks, key=lambda t: t.get('created_at', ''), reverse=True)

        return jsonify({'success': True, 'tasks': ordered})

    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务列表失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/task/<task_id>', methods=['GET'])
@login_required
def get_task_detail(task_id):
    """Return one task's full record; only its owner may view it."""
    try:
        task = get_task_queue().get_task(task_id)

        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})

        # Ownership check.
        if session.get('username') != task.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务'})

        return jsonify({'success': True, 'task': task})

    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务详情失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/stats', methods=['GET'])
@login_required
def get_queue_stats():
    """Return per-user queue statistics."""
    try:
        current_user = session.get('username')
        stats = get_task_queue().get_queue_stats(username=current_user)
        return jsonify({'success': True, 'stats': stats})
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取统计信息失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/download/<task_id>', methods=['GET'])
@login_required
def download_task_result(task_id):
    """Download the result file of a completed task (looked up by task ID)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)

        # Sequential guards: existence, ownership, completion, file presence.
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权下载此文件'})
        if task.get('status') != 'completed':
            return jsonify({'success': False, 'message': '任务未完成'})

        result_file = task.get('result_file')
        if not result_file:
            return jsonify({'success': False, 'message': '结果文件不存在'})

        filepath = os.path.join(queue.results_dir, result_file)
        if not os.path.exists(filepath):
            return jsonify({'success': False, 'message': '文件不存在'})

        return send_file(filepath, as_attachment=True, download_name=result_file)

    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/task/<task_id>/delete', methods=['POST'])
@login_required
def delete_task(task_id):
    """Delete a task (the queue cancels it first if needed; owner only).

    Wrapped in try/except for consistency with the other queue endpoints,
    so an internal error returns a JSON message instead of a 500 page.
    """
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权删除此任务'})
        ok = queue.delete_task(task_id)
        return jsonify({'success': ok})
    except Exception as e:
        return jsonify({'success': False, 'message': f'删除任务失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/task/<task_id>/cancel', methods=['POST'])
@login_required
def cancel_task(task_id):
    """Cancel a pending or processing task (owner only).

    Wrapped in try/except for consistency with the other queue endpoints,
    so an internal error returns a JSON message instead of a 500 page.
    """
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权终止此任务'})
        if task.get('status') not in ['pending', 'processing']:
            return jsonify({'success': False, 'message': '仅可终止等待中或处理中任务'})
        ok = queue.cancel_task(task_id)
        return jsonify({'success': ok})
    except Exception as e:
        return jsonify({'success': False, 'message': f'终止任务失败: {str(e)}'})
|
||
|
||
@app.route('/api/queue/task/<task_id>/logs', methods=['GET'])
@login_required
def get_task_logs(task_id):
    """Return the persisted log lines for one task (owner only)."""
    try:
        task = get_task_queue().get_task(task_id)

        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})

        # Ownership check.
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务日志'})

        # Logs live in the database, not in the queue's task record.
        from database import get_database
        logs = get_database().get_task_logs(task_id)

        return jsonify({'success': True, 'logs': logs})

    except Exception as e:
        return jsonify({'success': False, 'message': f'获取日志失败: {str(e)}'})
|
||
|
||
@app.route('/health/taskworker')
def health_taskworker():
    """Health check for the background task worker and its queue."""
    try:
        from task_worker import get_task_worker
        from task_queue import get_task_queue

        worker = get_task_worker()
        queue = get_task_queue()

        # Queue statistics: count pending/processing tasks in one pass.
        tasks = queue.get_all_tasks()
        counts = {'pending': 0, 'processing': 0}
        for task in tasks:
            status = task.get('status')
            if status in counts:
                counts[status] += 1

        # Worker status: healthy means running with at least one live thread.
        alive_threads = len([t for t in worker.worker_threads if t and t.is_alive()])
        healthy = worker.running and alive_threads > 0

        return jsonify({
            'status': 'healthy' if healthy else 'unhealthy',
            'worker': {
                'running': worker.running,
                'alive_threads': alive_threads,
                'current_workers': worker.current_workers,
                'max_workers': worker.max_workers,
                'processing_tasks': len(worker.processing_tasks),
            },
            'queue': {
                'pending': counts['pending'],
                'processing': counts['processing'],
                'total': len(tasks),
            },
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        })

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
|
||
|
||
if __name__ == '__main__':
    import sys

    def check_dependencies():
        """Return True when the key third-party dependencies are importable."""
        missing = []
        try:
            import flask
        except ImportError:
            missing.append('flask')
        try:
            import pandas
        except ImportError:
            missing.append('pandas')
        try:
            import openpyxl
        except ImportError:
            missing.append('openpyxl')

        if missing:
            print(f"\n⚠️ 缺少依赖: {', '.join(missing)}")
            print("请运行: pip install -r requirements.txt\n")
            return False
        return True

    if not check_dependencies():
        sys.exit(1)

    # Create the directories the app reads/writes at runtime
    os.makedirs('exports', exist_ok=True)
    os.makedirs('templates', exist_ok=True)
    os.makedirs('static/css', exist_ok=True)
    os.makedirs('static/js', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/results', exist_ok=True)

    # Start the background task worker
    print('🔧 启动任务处理器...')

    # Hand the SocketIO instance to task_worker so it can emit progress events
    from task_worker import set_socketio
    set_socketio(socketio)

    start_task_worker()
    print('✅ 任务处理器已启动')

    # Production detection via FLASK_ENV
    is_production = os.environ.get('FLASK_ENV') == 'production'

    if is_production:
        print('✅ 生产环境启动')
        print('请使用 gunicorn 或 uwsgi 运行:')
        # Bug fix: the suggested command previously said port 5001, but this
        # app listens on 8030 (see socketio.run below and the URL printed).
        print(' gunicorn -w 4 -b 0.0.0.0:8030 app:app')
        print('\n如果要直接运行,请使用: python app.py --dev')

        if '--dev' not in sys.argv:
            sys.exit(1)

    print('🚀 服务器启动成功!')
    print('请访问: http://127.0.0.1:8030')

    # Development server via SocketIO
    socketio.run(
        app,
        debug=not is_production,
        host='0.0.0.0',
        port=8030,
        allow_unsafe_werkzeug=True
    )