# ai_baijiahao/app.py
# -*- coding: utf-8 -*-
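"""Flask web app for exporting Baijiahao (百家号) author articles to Excel.

Combines a login-gated UI, a SocketIO progress channel, a background task
queue (task_queue / task_worker), and a scraper that talks to Baidu's mbd
webpage API through a rotating proxy pool.
"""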
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import wraps
import pandas as pd
import requests
from flask import Flask, render_template, request, jsonify, send_file, session, redirect, url_for
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from task_queue import get_task_queue
from task_worker import start_task_worker
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__, static_folder='static', template_folder='templates')
app.secret_key = 'your-secret-key-change-this-in-production'  # change to a random key in production
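# A minimal sketch of loading a real key instead (assumes a SECRET_KEY env var is set):
#   import secrets
#   app.secret_key = os.environ.get('SECRET_KEY') or secrets.token_hex(32)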
CORS(app)
# Initialise SocketIO
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='threading')

# Simple in-memory user store (use a real database in production)
USERS = {
    'admin': 'admin123',  # username: password
}
# Login-required decorator
def login_required(f):
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if 'username' not in session:
            return jsonify({'success': False, 'message': '请先登录', 'need_login': True}), 401
        return f(*args, **kwargs)
    return decorated_function

class BaijiahaoScraper:
    def __init__(self, uk, cookies=None, use_proxy=False, proxy_api_url=None):
        self.uk = uk
        self.api_url = 'https://mbd.baidu.com/webpage'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Referer': 'https://baijiahao.baidu.com/',
            'Accept': '*/*',
        }
        self.session = requests.Session()
        self.session.keep_alive = False  # disable keep-alive connections
        if cookies:
            self.session.cookies.update(cookies)
        # Proxy configuration
        self.use_proxy = use_proxy
        self.proxy_api_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
        self.current_proxy = None
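
    # The proxy pool endpoint is expected to return one "ip:port" pair as plain
    # text, e.g. "112.85.10.1:4215" (illustrative value); a JSON body such as
    # {"msg": "..."} is treated as a pool-side error, per the parsing below.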
    def get_proxy(self):
        """Fetch one proxy IP from the pool."""
        try:
            print(f"正在从代理池获取IP: {self.proxy_api_url}")
            response = requests.get(self.proxy_api_url, timeout=5)  # tightened timeout: 5s
            content = response.content.decode("utf-8").strip()
            print(f"代理API响应: {content}")
            # A JSON body means the pool returned an error
            if content.startswith('{'):
                try:
                    error_data = json.loads(content)
                    error_msg = error_data.get('msg', '未知错误')
                    print(f"❌ 代理IP池错误: {error_msg}")
                    raise Exception(f"代理IP池错误: {error_msg}")
                except json.JSONDecodeError:
                    pass  # not JSON after all; fall through and treat it as an IP
            # Parse "ip:port"
            if ':' in content:
                ip, port = content.strip().split(":", 1)
                print(f"IP: {ip}, 端口: {port}")
                # Build the proxy configuration
                proxy_meta = f"http://{ip}:{port}"
                proxies = {
                    'http': proxy_meta,
                    'https': proxy_meta
                }
                self.current_proxy = proxies
                print(f"代理配置成功: {proxies}")
                return proxies
            else:
                print("代理IP格式错误")
                raise Exception(f"代理IP格式错误: {content}")
        except Exception as e:
            print(f"获取代理IP失败: {e}")
            raise
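
    # Request policy: rotate to a fresh proxy immediately on anti-spider hits and
    # proxy errors; a 407 from the pool (rate limiting) backs off 3 seconds first.
    # The local machine's IP is never used as a fallback when use_proxy is set.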
    def make_request(self, url, **kwargs):
        """Issue a request through the proxy, rotating IPs automatically on failure."""
        max_retries = 3
        retry_count = 0
        last_was_anti_crawl = False  # whether the last retry was triggered by anti-spider blocking
        while retry_count < max_retries:
            if self.use_proxy:
                # Fetch a new proxy if we have none or the previous attempt failed
                if not self.current_proxy or retry_count > 0:
                    print(f"{'立即切换' if retry_count > 0 else ''}获取代理IP(第{retry_count + 1}次)")
                    proxy = self.get_proxy()
                    if not proxy:
                        raise Exception("启用了代理但无法获取代理IP,拒绝使用本机IP")
                # A proxy is mandatory when use_proxy is set
                if not self.current_proxy:
                    raise Exception("启用了代理但当前无代理IP,拒绝使用本机IP")
                kwargs['proxies'] = self.current_proxy
            # Default timeout
            if 'timeout' not in kwargs:
                kwargs['timeout'] = 20
            try:
                import time
                start = int(round(time.time() * 1000))
                response = self.session.get(url, **kwargs)
                cost_time = int(round(time.time() * 1000)) - start
                print(f"请求耗时: {cost_time}ms")
                # Check for anti-spider blocking
                if self._check_anti_spider(response):
                    print("⚠️ 检测到反爬,立即切换IP(无需等待)")
                    self.current_proxy = None  # drop the current proxy
                    retry_count += 1
                    last_was_anti_crawl = True
                    if retry_count < max_retries:
                        continue  # retry immediately, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截,请稍后再试")
                return response
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换IP(无需等待)")
                self.current_proxy = None
                retry_count += 1
                last_was_anti_crawl = True
                if retry_count < max_retries:
                    continue  # retry immediately, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is rate-limiting
                if e.response and e.response.status_code == 407:
                    print("⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # back off 3 seconds
                    if self.use_proxy:
                        print("重新获取代理IP...")
                        self.current_proxy = None
                        retry_count += 1
                        if retry_count < max_retries:
                            continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                print(f"请求失败: {e}")
                # On any proxied failure, try again with a fresh proxy
                if self.use_proxy:
                    print("立即切换代理(无需等待)")
                    self.current_proxy = None
                    retry_count += 1
                    if retry_count < max_retries:
                        continue  # retry immediately, no backoff
                raise
        raise Exception(f"请求失败,已重试{max_retries}次")
    def _check_anti_spider(self, response):
        """Return True if the response looks like an anti-spider block."""
        # Status codes commonly used for blocking
        if response.status_code in [403, 429, 503]:
            return True
        # Try to parse JSON and inspect the is_need_foe flag
        try:
            data = response.json()
            if data.get('data', {}).get('foe', {}).get('is_need_foe') is True:
                print("检测到is_need_foe=True,需要切换IP")
                return True
        except Exception:
            pass  # not a JSON response; fall back to inspecting the body text
        # Look for anti-spider markers in the body
        content = response.text.lower()
        anti_spider_keywords = [
            '验证码',
            'captcha',
            '请输入验证码',
            '访问频繁',
            '异常访问',
            '请稍后再试',
            'access denied',
            'forbidden',
            '安全验证',
            '人机验证'
        ]
        for keyword in anti_spider_keywords:
            if keyword in content:
                print(f"检测到反爬关键词: {keyword}")
                return True
        return False
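
    # The author's public home page embeds the uk in inline JSON, matched below as
    # "uk":"<value>", with a uk=<value> query-string fallback. Both patterns are
    # heuristic and may break if Baidu changes the page markup.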
    @staticmethod
    def get_uk_from_app_id(app_id, use_proxy=False, proxy_api_url=None):
        """Extract the uk parameter from the author's baijiahao home page."""
        url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        print(f"正在从主页获取uk: {url}")
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            # Prepare the proxy configuration
            kwargs = {'headers': headers, 'timeout': 30}
            if use_proxy:
                proxy_url = proxy_api_url or 'http://api.tianqiip.com/getip?secret=lu29e593&num=1&type=txt&port=1&mr=1&sign=4b81a62eaed89ba802a8f34053e2c964'
                try:
                    print(f"{'重新' if retry_count > 0 else ''}使用代理获取uk(第{retry_count + 1}次)")
                    proxy_response = requests.get(proxy_url, timeout=5)  # tightened timeout: 5s
                    content = proxy_response.content.decode("utf-8").strip()
                    print(f"代理API响应: {content}")
                    # A JSON body means the pool returned an error
                    if content.startswith('{'):
                        try:
                            error_data = json.loads(content)
                            error_msg = error_data.get('msg', '未知错误')
                            raise Exception(f"代理IP池错误: {error_msg}")
                        except json.JSONDecodeError:
                            pass  # not JSON after all; treat it as an IP
                    if ':' in content:
                        ip, port = content.strip().split(":", 1)
                        proxy_meta = f"http://{ip}:{port}"
                        kwargs['proxies'] = {
                            'http': proxy_meta,
                            'https': proxy_meta
                        }
                        print(f"使用代理: {proxy_meta}")
                    else:
                        raise Exception(f"代理IP格式错误: {content}")
                except Exception as e:
                    # Proxying was requested but unavailable: never fall back to the local IP
                    raise Exception(f"启用了代理但获取代理IP失败,拒绝使用本机IP: {e}")
            try:
                response = requests.get(url, **kwargs)
                if response.status_code != 200:
                    print(f"访问主页失败,状态码: {response.status_code}")
                    retry_count += 1
                    if retry_count < max_retries:
                        print("立即重试(无需等待)")
                        continue  # retry immediately
                    else:
                        raise Exception(f"访问主页失败,状态码: {response.status_code}")
                # Check for anti-spider blocking
                content_lower = response.text.lower()
                anti_spider_detected = False
                # Inspect the is_need_foe flag
                try:
                    data = response.json()
                    if data.get('data', {}).get('foe', {}).get('is_need_foe') is True:
                        print("⚠️ 检测到is_need_foe=True,需要切换IP")
                        anti_spider_detected = True
                except Exception:
                    pass  # not a JSON response; fall back to keyword checks
                # Keyword checks
                if not anti_spider_detected:
                    anti_spider_keywords = ['验证码', 'captcha', '访问频繁', '异常访问', 'access denied']
                    for keyword in anti_spider_keywords:
                        if keyword in content_lower:
                            print(f"⚠️ 检测到反爬关键词: {keyword}")
                            anti_spider_detected = True
                            break
                if anti_spider_detected and use_proxy:
                    retry_count += 1
                    if retry_count < max_retries:
                        print("检测到反爬,立即切换IP重试(无需等待)")
                        continue  # retry immediately, no backoff
                    else:
                        raise Exception("多次重试后仍被反爬拦截")
                # Extract uk with a regex
                uk_match = re.search(r'"uk"\s*:\s*"([^"]+)"', response.text)
                if not uk_match:
                    # Fall back to a query-string pattern
                    uk_match = re.search(r'uk=([^&\s"]+)', response.text)
                if uk_match:
                    uk = uk_match.group(1)
                    print(f">> 成功获取UK: {uk}")
                    # Capture the cookies handed out with the page
                    cookies = response.cookies
                    print(">> 成功获取Cookie")
                    return uk, cookies
                else:
                    raise Exception("无法从页面中提取UK参数")
            except requests.exceptions.ProxyError as e:
                print(f"代理错误: {e},立即切换代理重试(无需等待)")
                retry_count += 1
                if retry_count < max_retries and use_proxy:
                    continue  # retry immediately, no backoff
                else:
                    raise
            except requests.exceptions.HTTPError as e:
                # 407 means the proxy pool itself is rate-limiting
                if e.response and e.response.status_code == 407:
                    print("⚠️ 检测到407错误(代理IP池限流),等待3秒后重新获取IP...")
                    import time
                    time.sleep(3)  # back off 3 seconds
                    retry_count += 1
                    if retry_count < max_retries and use_proxy:
                        print("重新获取代理IP,继续重试...")
                        continue
                    raise
                else:
                    print(f"请求失败: {e}")
                    raise
            except Exception as e:
                if retry_count < max_retries - 1:
                    retry_count += 1
                    print(f"错误: {e},立即重试(无需等待)")
                    continue  # retry immediately, no backoff
                else:
                    print(f"获取UK失败: {e}")
                    raise
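
    # Per-page callback contract used by get_articles (names from this file):
    #   on_page_fetched(page, processed_articles, ctime)
    # where processed_articles is a list of {'标题', '链接', '发布时间'} dicts and
    # ctime is the opaque pagination cursor used for checkpoint/resume.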
    def get_articles(self, months=6, app_id=None, articles_only=True, task_id=None, on_page_fetched=None,
                     start_page=1, start_ctime=None):
        """Fetch the article list via the API (delegates to get_baidu_data_sync in baidu_api.py).

        Args:
            months: how many recent months of data to fetch
            app_id: Baijiahao app_id
            articles_only: skip videos and keep only articles
            task_id: task ID, used for database caching
            on_page_fetched: callback invoked after each page is fetched
            start_page: starting page number (checkpoint resume)
            start_ctime: starting pagination cursor (checkpoint resume)

        Returns:
            dict: {'last_page': last page number, 'last_ctime': pagination cursor, 'completed': whether finished}
        """
        import sys
        import os
        # Import the synchronous fetcher from baidu_api.py
        sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
        from baidu_api import get_baidu_data_sync
        from datetime import datetime, timedelta
        # Fractional months are supported (e.g. 0.33 months ≈ 10 days)
        days = int(months * 30)
        target_date = datetime.now() - timedelta(days=days)
        print(f"\n{'=' * 50}")
        print(f"开始获取百家号作者(uk={self.uk})的文章...")
        if start_page > 1:
            print(f"🔄 断点续传:从第{start_page}页开始")
        print(f"{'=' * 50}\n")

        def process_page(page, items, ctime):
            """Process one page of raw items: extract articles and forward them upstream."""
            processed_articles = []
            for item in items:
                item_data = item.get('itemData', {})
                # With articles_only enabled, filter out video content
                if articles_only:
                    meta_type = item_data.get('meta_type', '')
                    if meta_type == 'video':
                        print(f" ✖ 跳过视频: {item_data.get('title', '[无标题]')[:50]}...")
                        continue
                title = item_data.get('title', '')
                article_url = item_data.get('url', '')
                # Prefer ctime (a Unix timestamp); it is more precise
                publish_time = '未知时间'
                if 'ctime' in item_data and item_data['ctime']:
                    try:
                        timestamp = int(item_data['ctime'])
                        article_date = datetime.fromtimestamp(timestamp)
                        publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                    except Exception as e:
                        print(f" ✗ 解析ctime失败: {e}")
                        # Fall back to the human-readable time string
                        time_str = item_data.get('time', '未知时间')
                        if time_str != '未知时间':
                            article_date = self._parse_article_date(time_str)
                            if article_date:
                                publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    time_str = item_data.get('time', '未知时间')
                    if time_str != '未知时间':
                        article_date = self._parse_article_date(time_str)
                        if article_date:
                            publish_time = article_date.strftime('%Y-%m-%d %H:%M:%S')
                processed_articles.append({
                    '标题': title.strip().split('\n')[0][:500] if title else '无标题',
                    '链接': article_url,
                    '发布时间': publish_time
                })
            print(f"第{page}页处理完成:{len(processed_articles)}篇文章")
            # Forward to the upstream callback, if any
            if on_page_fetched:
                on_page_fetched(page, processed_articles, ctime)

        try:
            # Run the fetcher with the per-page callback
            if months < 1:
                print(f"调用get_baidu_data_sync获取数据(近{days}天)...\n")
            else:
                print(f"调用get_baidu_data_sync获取数据(近{int(months)}个月)...\n")
            # Pass through the proxy settings and the callback
            result = get_baidu_data_sync(
                uk=self.uk,
                months=months,
                use_proxy=self.use_proxy,
                proxy_api_url=self.proxy_api_url if self.proxy_api_url else None,
                on_page_fetched=process_page,
                start_page=start_page,
                start_ctime=start_ctime
            )
            if not result:
                print("\n✗ 未获取到数据")
                return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
            print("\n✓ 爬取完成!")
            return result
        except Exception as e:
            print(f"\n✗ 调用get_baidu_data_sync失败: {e}")
            import traceback
            traceback.print_exc()
            return {'last_page': start_page, 'last_ctime': start_ctime, 'completed': False}
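
    # _parse_article_date accepts, per the branches below:
    #   relative times  "1天前" / "2小时前" / "30分钟前" / "1个月前"
    #   short dates     "11-29" or "11-29 07:23" (assumed to be the current year)
    #   full dates      "2024-11-29"
    # Anything else returns None.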
    def _parse_article_date(self, time_str):
        """Parse an article time string into a datetime, or None if unrecognised."""
        from datetime import datetime, timedelta
        import re
        if not time_str or time_str == '未知':
            return None
        current_year = datetime.now().year
        now = datetime.now()
        # Format 1: relative times such as "1天前", "2小时前", "30分钟前", "1个月前"
        if '前' in time_str:
            # days ago
            match = re.search(r'(\d+)\s*天', time_str)
            if match:
                days = int(match.group(1))
                return now - timedelta(days=days)
            # hours ago
            match = re.search(r'(\d+)\s*小时', time_str)
            if match:
                hours = int(match.group(1))
                return now - timedelta(hours=hours)
            # minutes ago
            match = re.search(r'(\d+)\s*分钟', time_str)
            if match:
                minutes = int(match.group(1))
                return now - timedelta(minutes=minutes)
            # months ago
            match = re.search(r'(\d+)\s*个?月', time_str)
            if match:
                months = int(match.group(1))
                return now - timedelta(days=months * 30)  # approximation
        # Format 2: "11-29 07:23" or "11-29"
        match = re.match(r'(\d{1,2})-(\d{1,2})', time_str)
        if match:
            month = int(match.group(1))
            day = int(match.group(2))
            return datetime(current_year, month, day)
        # Format 3: "2024-11-29"
        match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', time_str)
        if match:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            return datetime(year, month, day)
        return None
    def get_articles_from_html(self, months=6, app_id=None):
        """Fetch articles by parsing the HTML directly (no Selenium dependency)."""
        if not app_id:
            raise Exception("需要提供app_id")
        articles = []
        try:
            url = f"https://baijiahao.baidu.com/u?app_id={app_id}"
            print(f"访问页面: {url}")
            headers = {
                'User-Agent': self.headers['User-Agent'],
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
            }
            response = self.session.get(url, headers=headers, timeout=30)
            if response.status_code != 200:
                raise Exception(f"访问失败,状态码: {response.status_code}")
            html = response.text
            # Extract articles with plain regex matching:
            # find every link of the form /s?id=...
            links = re.findall(r'href="(/s\?id=[^"]+)"', html)
            links += re.findall(r"href='(/s\?id=[^']+)'", html)
            # Try to recover a title for each link; escape the link, since it
            # contains regex metacharacters such as '?' and '&'
            for link in set(links):  # de-duplicate
                escaped = re.escape(link)
                title_pattern = f'href="{escaped}"[^>]*title="([^"]+)"'
                title_match = re.search(title_pattern, html)
                if not title_match:
                    title_pattern = f'title="([^"]+)"[^>]*href="{escaped}"'
                    title_match = re.search(title_pattern, html)
                title = title_match.group(1) if title_match else "未知标题"
                full_link = 'https://baijiahao.baidu.com' + link
                articles.append({
                    '标题': title.strip(),
                    '链接': full_link,
                    '发布时间': '未知'
                })
            print(f"成功提取 {len(articles)} 篇文章")
        except Exception as e:
            print(f"HTML解析失败: {e}")
            raise
        return articles
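
# Typical scraper usage (a sketch; the app_id value is illustrative):
#   uk, cookies = BaijiahaoScraper.get_uk_from_app_id('1234567890')
#   scraper = BaijiahaoScraper(uk, cookies)
#   result = scraper.get_articles(months=6, app_id='1234567890')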
# ========== SocketIO event handlers ==========
@socketio.on('connect')
def handle_connect():
    """Fired when a client connects."""
    logger.info(f"客户端已连接: {request.sid}")
    emit('connected', {'message': '已连接到服务器'})

@socketio.on('disconnect')
def handle_disconnect():
    """Fired when a client disconnects."""
    logger.info(f"客户端已断开: {request.sid}")

@socketio.on('subscribe_task')
def handle_subscribe_task(data):
    """A client subscribes to progress updates for a task."""
    task_id = data.get('task_id')
    logger.info(f"客户端 {request.sid} 订阅任务: {task_id}")
    emit('subscribed', {'task_id': task_id})

def emit_task_log(task_id, message, level='info'):
    """Push a task log line to the frontend."""
    socketio.emit('task_log', {
        'task_id': task_id,
        'message': message,
        'level': level,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    })

def emit_task_progress(task_id, progress, current_step='', **kwargs):
    """Push task progress to the frontend."""
    data = {
        'task_id': task_id,
        'progress': progress,
        'current_step': current_step,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    data.update(kwargs)
    socketio.emit('task_progress', data)
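
# Example 'task_log' event as received by a subscribed client (values illustrative):
#   {'task_id': 'abc123', 'message': '第1页处理完成', 'level': 'info',
#    'timestamp': '2024-01-01 12:00:00'}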
# ========== Flask routes ==========
@app.route('/')
def index():
    """Home page."""
    # Redirect to the login page if not authenticated
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('index.html', username=session.get('username'))

@app.route('/queue')
def queue_page():
    """Task queue page."""
    # Redirect to the login page if not authenticated
    if 'username' not in session:
        return redirect(url_for('login_page'))
    return render_template('queue.html', username=session.get('username'))

@app.route('/login')
def login_page():
    """Login page."""
    # Redirect to the home page if already logged in
    if 'username' in session:
        return redirect(url_for('index'))
    return render_template('login.html')

@app.route('/api/login', methods=['POST'])
def login():
    """Login endpoint."""
    try:
        data = request.get_json()
        username = data.get('username', '').strip()
        password = data.get('password', '').strip()
        if not username or not password:
            return jsonify({'success': False, 'message': '请输入用户名和密码'})
        # Validate the credentials
        if username in USERS and USERS[username] == password:
            session['username'] = username
            return jsonify({'success': True, 'message': '登录成功'})
        else:
            return jsonify({'success': False, 'message': '用户名或密码错误'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'登录失败: {str(e)}'})

@app.route('/api/logout', methods=['POST'])
def logout():
    """Logout endpoint."""
    session.pop('username', None)
    return jsonify({'success': True, 'message': '已登出'})
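
# Example POST body for /api/export (field names mirror the reads below; values
# illustrative):
#   {"url": "https://baijiahao.baidu.com/u?app_id=1234567890", "months": 6,
#    "cookies": "", "use_proxy": false, "proxy_api_url": "", "articles_only": true}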
@app.route('/api/export', methods=['POST'])
@login_required
def export_articles():
    """Export articles to an Excel file."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        cookies_str = data.get('cookies', '')  # cookies supplied by the frontend
        months = data.get('months', 6)  # time range, default 6 months
        # Extract app_id from the URL; it is then resolved to a uk parameter
        app_id_match = re.search(r'app_id=(\d+)', url)
        if not app_id_match:
            return jsonify({'success': False, 'message': 'URL格式不正确,无法提取app_id'})
        app_id = app_id_match.group(1)
        print(f"开始导出:app_id={app_id}")
        print(f"Cookie长度: {len(cookies_str) if cookies_str else 0}")
        # Proxy settings
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # whether to skip videos
        # Resolve uk and cookies from the author's home page
        uk, auto_cookies = BaijiahaoScraper.get_uk_from_app_id(app_id, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        if not uk:
            return jsonify({'success': False, 'message': '无法获取用户UK,请检查URL是否正确'})
        print(f"成功获取uk={uk}")
        # Prefer user-provided cookies when present
        if cookies_str:
            print("使用用户提供的Cookie")
            # Parse the cookie string
            cookies_dict = {}
            for item in cookies_str.split(';'):
                item = item.strip()
                if '=' in item:
                    key, value = item.split('=', 1)
                    cookies_dict[key.strip()] = value.strip()
            # Convert into a requests cookie jar
            from requests.cookies import cookiejar_from_dict
            user_cookies = cookiejar_from_dict(cookies_dict)
            scraper = BaijiahaoScraper(uk, user_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        else:
            print("使用自动获取的Cookie")
            scraper = BaijiahaoScraper(uk, auto_cookies, use_proxy=use_proxy, proxy_api_url=proxy_api_url if proxy_api_url else None)
        # Fetch articles via the API (proxy pool + API is the only supported path)
        print(f"使用 API 方式获取文章(近{months}个月)...")
        try:
            articles = scraper.get_articles(months=months, app_id=app_id, articles_only=articles_only)
        except Exception as e:
            print(f"API 方式失败: {e}")
            articles = []
        if not articles:
            return jsonify({
                'success': False,
                'message': (
                    '未能获取到文章数据。\n\n'
                    '请确保:\n'
                    '1. URL正确且该作者有发布过文章\n'
                    '2. 网络连接正常\n'
                    '3. 如需使用代理请配置代理IP池'
                )
            })
        # Build the Excel file
        df = pd.DataFrame(articles)
        # Ensure the output directory exists
        output_dir = os.path.join(os.path.dirname(__file__), 'exports')
        os.makedirs(output_dir, exist_ok=True)
        # Build the file name
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'baijiahao_articles_{app_id}_{timestamp}.xlsx'
        filepath = os.path.join(output_dir, filename)
        # Write the Excel file
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='文章列表')
            # Adjust column widths
            worksheet = writer.sheets['文章列表']
            worksheet.column_dimensions['A'].width = 80  # title column
            worksheet.column_dimensions['B'].width = 20  # link column
        return jsonify({
            'success': True,
            'message': f'成功导出{len(articles)}篇文章',
            'filename': filename,
            'count': len(articles),
            'articles': articles[:100]  # first 100 articles, for preview
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'导出失败: {str(e)}'})
@app.route('/api/download/<filename>')
@login_required
def download_file(filename):
    """Download an exported Excel file."""
    try:
        filename = os.path.basename(filename)  # guard against path traversal
        filepath = os.path.join(os.path.dirname(__file__), 'exports', filename)
        if os.path.exists(filepath):
            return send_file(filepath, as_attachment=True, download_name=filename)
        else:
            return jsonify({'success': False, 'message': '文件不存在'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})
# ==================== Task queue API ====================
@app.route('/api/queue/add', methods=['POST'])
@login_required
def add_task_to_queue():
    """Add a task to the queue."""
    try:
        data = request.get_json()
        url = data.get('url', '')
        months = data.get('months', 6)
        use_proxy = data.get('use_proxy', False)
        proxy_api_url = data.get('proxy_api_url', '')
        articles_only = data.get('articles_only', True)  # whether to skip videos
        if not url:
            return jsonify({'success': False, 'message': 'URL不能为空'})
        # Enqueue the task
        queue = get_task_queue()
        task_id = queue.add_task(
            url=url,
            months=months,
            use_proxy=use_proxy,
            proxy_api_url=proxy_api_url if proxy_api_url else None,
            username=session.get('username'),
            articles_only=articles_only
        )
        return jsonify({
            'success': True,
            'message': '任务已添加到队列',
            'task_id': task_id
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'success': False, 'message': f'添加任务失败: {str(e)}'})
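
# Task statuses referenced in this file: 'pending' -> 'processing' -> 'completed';
# cancellation is allowed while pending or processing (task_queue may define
# further states such as failed/cancelled - not visible from this file).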
@app.route('/api/queue/tasks', methods=['GET'])
@login_required
def get_tasks():
    """List the current user's tasks."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        # All tasks belonging to this user
        tasks = queue.get_all_tasks(username=username)
        # Newest first
        tasks.sort(key=lambda x: x.get('created_at', ''), reverse=True)
        return jsonify({
            'success': True,
            'tasks': tasks
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务列表失败: {str(e)}'})

@app.route('/api/queue/task/<task_id>', methods=['GET'])
@login_required
def get_task_detail(task_id):
    """Fetch one task's details."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务'})
        return jsonify({
            'success': True,
            'task': task
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取任务详情失败: {str(e)}'})

@app.route('/api/queue/stats', methods=['GET'])
@login_required
def get_queue_stats():
    """Queue statistics."""
    try:
        queue = get_task_queue()
        username = session.get('username')
        stats = queue.get_queue_stats(username=username)
        return jsonify({
            'success': True,
            'stats': stats
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取统计信息失败: {str(e)}'})

@app.route('/api/queue/download/<task_id>', methods=['GET'])
@login_required
def download_task_result(task_id):
    """Download a task's result file (looked up by task ID)."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权下载此文件'})
        if task.get('status') != 'completed':
            return jsonify({'success': False, 'message': '任务未完成'})
        result_file = task.get('result_file')
        if not result_file:
            return jsonify({'success': False, 'message': '结果文件不存在'})
        filepath = os.path.join(queue.results_dir, result_file)
        if not os.path.exists(filepath):
            return jsonify({'success': False, 'message': '文件不存在'})
        return send_file(filepath, as_attachment=True, download_name=result_file)
    except Exception as e:
        return jsonify({'success': False, 'message': f'下载失败: {str(e)}'})

@app.route('/api/queue/task/<task_id>/delete', methods=['POST'])
@login_required
def delete_task(task_id):
    """Delete a task (it is terminated automatically before deletion)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权删除此任务'})
    ok = queue.delete_task(task_id)
    return jsonify({'success': ok})

@app.route('/api/queue/task/<task_id>/cancel', methods=['POST'])
@login_required
def cancel_task(task_id):
    """Cancel a task (pending or processing only)."""
    queue = get_task_queue()
    task = queue.get_task(task_id)
    if not task:
        return jsonify({'success': False, 'message': '任务不存在'})
    if task.get('username') != session.get('username'):
        return jsonify({'success': False, 'message': '无权终止此任务'})
    if task.get('status') not in ['pending', 'processing']:
        return jsonify({'success': False, 'message': '仅可终止等待中或处理中任务'})
    ok = queue.cancel_task(task_id)
    return jsonify({'success': ok})

@app.route('/api/queue/task/<task_id>/logs', methods=['GET'])
@login_required
def get_task_logs(task_id):
    """Fetch a task's historical logs."""
    try:
        queue = get_task_queue()
        task = queue.get_task(task_id)
        if not task:
            return jsonify({'success': False, 'message': '任务不存在'})
        # Ownership check
        if task.get('username') != session.get('username'):
            return jsonify({'success': False, 'message': '无权查看此任务日志'})
        # Load the logs from the database
        from database import get_database
        db = get_database()
        logs = db.get_task_logs(task_id)
        return jsonify({
            'success': True,
            'logs': logs
        })
    except Exception as e:
        return jsonify({'success': False, 'message': f'获取日志失败: {str(e)}'})
@app.route('/health/taskworker')
def health_taskworker():
    """TaskWorker health check endpoint."""
    try:
        from task_worker import get_task_worker
        from task_queue import get_task_queue
        worker = get_task_worker()
        queue = get_task_queue()
        # Queue statistics
        tasks = queue.get_all_tasks()
        pending_count = len([t for t in tasks if t.get('status') == 'pending'])
        processing_count = len([t for t in tasks if t.get('status') == 'processing'])
        # Worker status
        alive_threads = sum(1 for t in worker.worker_threads if t and t.is_alive())
        status = 'healthy' if worker.running and alive_threads > 0 else 'unhealthy'
        return jsonify({
            'status': status,
            'worker': {
                'running': worker.running,
                'alive_threads': alive_threads,
                'current_workers': worker.current_workers,
                'max_workers': worker.max_workers,
                'processing_tasks': len(worker.processing_tasks)
            },
            'queue': {
                'pending': pending_count,
                'processing': processing_count,
                'total': len(tasks)
            },
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
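
# Example healthy response (values illustrative):
#   {"status": "healthy",
#    "worker": {"running": true, "alive_threads": 2, "current_workers": 2,
#               "max_workers": 4, "processing_tasks": 1},
#    "queue": {"pending": 3, "processing": 1, "total": 20},
#    "timestamp": "2024-01-01 12:00:00"}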
if __name__ == '__main__':
    import sys

    def check_dependencies():
        """Verify that the key dependencies are installed."""
        missing = []
        try:
            import flask
        except ImportError:
            missing.append('flask')
        try:
            import pandas
        except ImportError:
            missing.append('pandas')
        try:
            import openpyxl
        except ImportError:
            missing.append('openpyxl')
        if missing:
            print(f"\n⚠️ 缺少依赖: {', '.join(missing)}")
            print("请运行: pip install -r requirements.txt\n")
            return False
        return True

    if not check_dependencies():
        sys.exit(1)
    # Create the required directories
    os.makedirs('exports', exist_ok=True)
    os.makedirs('templates', exist_ok=True)
    os.makedirs('static/css', exist_ok=True)
    os.makedirs('static/js', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('data/results', exist_ok=True)
    # Start the task worker
    print('🔧 启动任务处理器...')
    # Hand the SocketIO instance to task_worker
    from task_worker import set_socketio
    set_socketio(socketio)
    start_task_worker()
    print('✅ 任务处理器已启动')
    # Detect production mode
    is_production = os.environ.get('FLASK_ENV') == 'production'
    if is_production:
        print('✅ 生产环境启动')
        print('请使用 gunicorn 或 uwsgi 运行:')
        print('  gunicorn -w 4 -b 0.0.0.0:5001 app:app')
        print('\n如果要直接运行,请使用: python app.py --dev')
        if '--dev' not in sys.argv:
            sys.exit(1)
    print('🚀 服务器启动成功!')
    print('请访问: http://127.0.0.1:8030')
    # Development: run on the SocketIO server
    socketio.run(
        app,
        debug=not is_production,
        host='0.0.0.0',
        port=8030,
        allow_unsafe_werkzeug=True
    )