ai_wht_B/ver_25121821/generate_Atlas_Qianwen_article.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
AI文章自动生成监控脚本
监控数据库中status为topic的记录，自动调用Coze API生成文章并提交
"""

import os
import sys
import time
import json
import logging
import requests
import pymysql
import random
from datetime import datetime
from typing import Dict, List, Optional, Any
import traceback
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue, Empty
import emoji

# 添加项目根目录到Python路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from database_config import get_db_manager
from log_config import setup_logger
from dashvector_get_similar_topic import search_chinese, init_dashvector_client

# 配置日志记录器，支持按日期切割和控制台输出
logger = setup_logger(
    name='generate_Atlas_Qianwen',
    log_file='logs/generate_Atlas_Qianwen_article.log',
    error_log_file='logs/generate_Atlas_Qianwen_error.log',
    level=logging.INFO,
    console_output=True
)

# 配置常量
#BASE_URL = "http://47.99.184.230:8321"
BASE_URL = "http://127.0.0.1:8215"
COZE_API_URL = "https://api.coze.cn/v1/workflow/stream_run"
SLEEP_INTERVAL = 5  # 监控间隔（秒）
WORKER_COUNT = 6 # 并行处理worker数量，可配置

# 全局变量
AUTH_TOKEN = None
WORKFLOW_ID = None
JWT_TOKEN = None

class CozeArticleGenerator:
    def __init__(self):
        # API配置
        self.base_url = BASE_URL
        self.coze_api_url = COZE_API_URL

        # 认证信息
        self.jwt_token = None

        # 使用统一的数据库管理器
        self.db_manager = get_db_manager()

        # 登录配置
        self.login_credentials = {
            'username': 'user010',
            'password': '@5^2W6R7'
        }

        # 禁用代理
        self.proxies = {
            'http': None,
            'https': None
        }

        # 并行处理相关
        self.processing_lock = threading.Lock()  # 用于线程安全的记录分配
        self.processed_ids = set()  # 已处理的记录ID集合

        # 初始化DashVector客户端（向量检索）
        logger.info("开始初始化DashVector客户端")
        if init_dashvector_client():
            logger.info("DashVector客户端初始化成功")
        else:
            logger.warning("DashVector客户端初始化失败，相似topic检索功能将不可用")

        logger.info("CozeArticleGenerator 初始化完成")

    def log_to_database(self, level: str, message: str, details: str = None):
        """获取数据库连接"""
        try:
            return self.db_manager.get_connection()
        except Exception as e:
            logger.error(f"数据库连接失败: {e}")
            return None

    def get_article_contents_by_ids(self, article_ids: List[int]) -> str:
        """
        根据article_id列表从数据库获取content内容

        Args:
            article_ids: 文章ID列表

        Returns:
            str: 合并后的content内容，多条用\n\n分隔
        """
        if not article_ids:
            logger.info("没有文章ID，返回空字符串")
            return ""

        try:
            # 去重并限制最多2条
            article_ids = list(set(article_ids))[:2]

            logger.info(f"开始查询文章content，article_ids: {article_ids}")

            with self.db_manager.get_cursor() as cursor:
                # 构建IN查询
                placeholders = ','.join(['%s'] * len(article_ids))
                sql = f"""
                SELECT id, content, created_at
                FROM ai_articles
                WHERE id IN ({placeholders})
                AND content IS NOT NULL AND content != ''
                ORDER BY created_at DESC
                LIMIT 2
                """

                cursor.execute(sql, article_ids)
                results = cursor.fetchall()

                if not results:
                    logger.warning(f"未查询到文章content，article_ids: {article_ids}")
                    return ""

                logger.info(f"查询到 {len(results)} 条文章content")

                # 合并content
                contents = []
                for row in results:
                    content = row.get('content', '').strip()
                    if content:
                        contents.append(content)
                        logger.info(f"添加文章content，ID: {row.get('id')}, 长度: {len(content)} 字符")

                # 用两个换行符分隔
                merged_content = "\n\n".join(contents)
                logger.info(f"合并后的content总长度: {len(merged_content)} 字符")

                return merged_content

        except Exception as e:
            error_msg = f"查询文章content异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return ""

    def log_to_database(self, level: str, message: str, details: str = None):
        """记录日志到数据库ai_logs表"""
        try:
            with self.db_manager.get_cursor() as cursor:
                # 映射日志级别到数据库状态
                status_map = {
                    'INFO': 'success',
                    'WARNING': 'warning',
                    'ERROR': 'error'
                }
                status = status_map.get(level, 'success')

                sql = """
                INSERT INTO ai_logs (user_id, action, description, status, error_message, created_at)
                VALUES (%s, %s, %s, %s, %s, NOW())
                """
                cursor.execute(sql, (None, 'coze_generator', message, status, details))
                logger.info(f"日志已记录到数据库: {level} - {message}")
        except Exception as e:
            logger.error(f"记录日志到数据库失败: {e}")

    def login_and_get_jwt_token(self) -> bool:
        """登录获取JWT token，参考JavaScript逻辑"""
        try:
            login_url = f"{self.base_url}/api/auth/login"
            login_data = {
                "username": "13621242430",  # 使用用户指定的账号
                "password": "admin123"
            }

            logger.info(f"尝试登录: {login_data['username']}")
            self.log_to_database('INFO', f"尝试登录用户: {login_data['username']}")

            response = requests.post(
                login_url,
                json=login_data,
                headers={'Content-Type': 'application/json'},
                proxies=self.proxies  # 禁用代理
            )

            if response.status_code == 200:
                result = response.json()
                if result.get('code') == 200:
                    self.jwt_token = result['data']['token']
                    logger.info("JWT token获取成功")
                    self.log_to_database('INFO', "JWT token获取成功", json.dumps(result['data']))
                    return True
                else:
                    error_msg = f"登录失败: {result.get('message', '未知错误')}"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg, json.dumps(result))
                    return False
            else:
                error_msg = f"登录请求失败: {response.status_code}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, response.text)
                return False

        except Exception as e:
            error_msg = f"登录异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return False

    def generate_article_from_coze(self, title: str, context_injection: str, workflow_id: str, auth_token: str) -> Optional[Dict]:
        """调用Coze API生成文章，100%参考JavaScript流式处理逻辑"""
        try:
            logger.info(f"开始为主题'{title}'生成文章...")
            logger.info(f"上下文注入内容长度: {len(context_injection)} 字符")
            self.log_to_database('INFO', f"开始为主题生成文章: {title}", f"context_injection长度: {len(context_injection)}")

            # 验证传入的认证信息
            if not auth_token or not workflow_id:
                error_msg = f"'{title}' - workflow_id 或 auth_token 参数缺失"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg)
                return None

            # 构建请求数据，增加context_injection参数
            request_data = {
                'workflow_id': workflow_id,
                'parameters': {
                    'title': title,
                    'context_injection': context_injection  # 新增上下文注入参数
                }
            }
            logger.info(f"提交coze工作流数据详情: {json.dumps(request_data['parameters'], ensure_ascii=False)[:200]}...")

            # 发送流式请求
            headers = {
                'Authorization': f'Bearer {auth_token}',
                'Content-Type': 'application/json'
            }

            logger.info(f"'{title}' - 发送Coze API请求...")

            response = requests.post(
                COZE_API_URL,
                json=request_data,
                headers=headers,
                stream=True,
                timeout=300  # 5分钟超时
            )

            logger.info(f"'{title}' - Coze API响应状态码: {response.status_code}")

            if not response.ok:
                error_msg = f"'{title}' - Coze API请求失败，状态码: {response.status_code}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, response.text)
                return None

            # 调用流式响应解析方法
            return self.parse_stream_response(response, title)

        except Exception as e:
            error_msg = f"生成文章异常: {e}, 主题: {title}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return None

    def parse_stream_response(self, response, title: str) -> Optional[Dict[str, Any]]:
        """解析流式响应，100%参考JavaScript事件处理逻辑"""
        try:
            buffer = ''
            last_structured = None
            all_img = []

            logger.info(f"'{title}' - 开始接收流式数据...")

            # 设置响应编码为UTF-8
            response.encoding = 'utf-8'

            for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
                if chunk:
                    buffer += chunk
                    events = buffer.split('\n\n')
                    buffer = events.pop() or ''

                    for event_str in events:
                        if not event_str.strip():
                            continue

                        lines = event_str.split('\n')
                        event_type = ''
                        data_str = ''

                        # 解析事件类型和数据，完全按照JavaScript逻辑
                        for line in lines:
                            if line.startswith('event:'):
                                event_type = line[6:].strip()
                            elif line.startswith('data:'):
                                data_str = line[5:].strip()

                        logger.info(f"'{title}' - 收到事件: {event_type}")
                        self.log_to_database('INFO', f"收到Coze事件: {event_type}", f"主题: {title}")

                        # 处理错误事件
                        if event_type == 'Error':
                            logger.error(f"'{title}' - Coze API返回错误: {data_str}")
                            self.log_to_database('ERROR', f"Coze API返回错误: {title}", data_str)
                            try:
                                err_data = json.loads(data_str)
                                error_detail = f"错误代码: {err_data.get('error_code', '未知错误')}, 错误信息: {err_data.get('error_message', '无详细信息')}"
                                logger.error(f"'{title}' - {error_detail}")
                                self.log_to_database('ERROR', f"Coze API错误详情: {title}", error_detail)
                            except json.JSONDecodeError:
                                logger.error(f"'{title}' - 无法解析错误数据")
                                self.log_to_database('ERROR', f"无法解析Coze错误数据: {title}", data_str)
                            return None

                        # 跳过PING和End事件
                        if event_type in ['PING', 'End']:
                            continue

                        # 处理Message事件
                        if event_type == 'Message':
                            try:
                                logger.info(f"'{title}' - 收到Message事件，数据: {data_str[:200]}...")
                                data = json.loads(data_str)

                                # 解析content字段为JSON对象
                                content_obj = {}
                                if data.get('content') and isinstance(data['content'], str):
                                    try:
                                        content_obj = json.loads(data['content'])
                                        logger.info(f"'{title}' - 解析后的content: {list(content_obj.keys())}")
                                    except json.JSONDecodeError as e:
                                        logger.error(f"'{title}' - 解析content字段失败: {e}")
                                        continue

                                # 保存结构化数据 - 修改逻辑：即使API返回的title为空也保存数据
                                if content_obj.get('title') or content_obj.get('contents') or content_obj.get('introduction'):
                                    # 使用API返回的title，如果为空则使用原始输入的title
                                    final_title = content_obj.get('title') or title
                                    last_structured = {
                                        'title': final_title,
                                        'tags': content_obj.get('tags', ''),
                                        'introduction': content_obj.get('introduction', ''),
                                        'conclusion': content_obj.get('conclusion', ''),
                                        'contents': content_obj.get('contents', []) if isinstance(content_obj.get('contents'), list) else []
                                    }
                                    logger.info(f"'{title}' - 保存结构化数据，最终标题: {final_title}")
                                    logger.info(f"'{title}' - 内容项数量: {len(last_structured['contents'])}")

                            except json.JSONDecodeError as e:
                                logger.error(f"'{title}' - 解析消息错误: {e}")
                                continue

            if last_structured:
                success_msg = f"'{title}' - 文章生成成功，包含{len(all_img)}张图片"
                logger.info(success_msg)
                self.log_to_database('INFO', success_msg, json.dumps(last_structured, ensure_ascii=False))
                return last_structured
            else:
                warning_msg = f"'{title}' - 未获取到有效的文章内容"
                logger.warning(warning_msg)
                self.log_to_database('WARNING', warning_msg)
                return None

        except Exception as e:
            error_msg = f"'{title}' - 解析流式响应异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return None

    def convert_structured_to_dynamic(self, structured_data: Dict) -> str:
        """将结构化数据转换为Dynamic格式，参考JavaScript的convertStructuredToDynamic函数"""
        try:
            title = structured_data.get('title', '')
            introduction = structured_data.get('introduction', '')
            contents = structured_data.get('contents', [])
            conclusion = structured_data.get('conclusion', '')
            tags = structured_data.get('tags', '')

            logger.info(f"'{title}' - 开始转换Dynamic格式")

            html_content = ''

            # 添加title
            if title:
                html_content += f"{title}\n\n"

            # 添加引子部分
            if introduction and introduction.strip():
                html_content += f"{introduction.strip()}\n\n"
                logger.info(f"'{title}' - 添加引言段落")

            # 添加内容项
            if contents and isinstance(contents, list):
                for i, content in enumerate(contents):
                    if isinstance(content, dict):
                        # 修复bug：使用content_item字段而不是content字段，与JavaScript保持一致
                        content_text = content.get('content_item') or content.get('content', '')
                        # Emoji前缀逻辑处理
                        if content_text and content_text.strip():
                            # 检查content_text最前面是否有emoji
                            emojis_text = emoji.emoji_list(content_text)
                            has_emoji_text = emojis_text and emojis_text[0]['match_start'] == 0

                            if not has_emoji_text:
                                # content_text没有emoji，检查content_title
                                emojis_title = emoji.emoji_list(content_title) if content_title else []
                                has_emoji_title = emojis_title and emojis_title[0]['match_start'] == 0

                                if has_emoji_title:
                                    # content_title有emoji，补充到content_text前缀
                                    first_emoji = emojis_title[0]['emoji']
                                    content_text = first_emoji + content_text
                                    logger.info(f"'{title}' - contents[{i}] 从content_title补充emoji: {first_emoji}")
                                else:
                                    # content_title也没有emoji，按索引循环选择
                                    emoji_list = ['🔥', '💡', '⭕', '🌟']
                                    rand_emoji = emoji_list[i % len(emoji_list)]
                                    content_text = rand_emoji + content_text
                                    logger.info(f"'{title}' - contents[{i}] content_text: none, content_title: none, rand_emoji: {rand_emoji}")
                        if content_text and content_text.strip():
                            # 将换行符转换为段落标签
                            paragraphs = content_text.split('\n')
                            filtered_paragraphs = [p.strip() for p in paragraphs if p.strip()]
                            for paragraph in filtered_paragraphs:
                                html_content += f"{paragraph}\n\n"
                            logger.info(f"'{title}' - 添加内容段落 {i+1}，字段: {'content_item' if content.get('content_item') else 'content'}")
                    elif isinstance(content, str) and content.strip():
                        # 将换行符转换为段落标签
                        paragraphs = content.split('\n')
                        filtered_paragraphs = [p.strip() for p in paragraphs if p.strip()]
                        for paragraph in filtered_paragraphs:
                            html_content += f"{paragraph}\n\n"
                        logger.info(f"'{title}' - 添加内容段落 {i+1}")

            # 添加结论部分
            if conclusion and conclusion.strip():
                html_content += f"{conclusion.strip()}\n\n"
                logger.info(f"'{title}' - 添加结论段落")

            # 添加tags
            if tags:
                #html_content += f"{tags}\n\n"
                logger.info(f"'{title}' - 添加标签")

            logger.info(f"'{title}' - Dynamic格式转换完成")

            return html_content

        except Exception as e:
            error_msg = f"转换HTML格式异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return ""

    def generate_article(self, structured_data: Dict, article_id: int, existing_batch_id: int) -> bool:
        """提交文章到generate_article接口，100%参考JavaScript的sendInfoToBaijiahao函数"""
        try:
            # 增加判断：structured_data['contents']为空，报错
            if not structured_data or not structured_data.get('contents'):
                logger.error(f"[Worker] 生成文章失败: structured_data['contents']为空")
                # 移除直接数据库操作：不再直接更新状态为generate_failed
                # 状态管理交给接口处理
                return False

            title = structured_data.get('title', 'Unknown')
            logger.info(f"'{title}' - 开始提交文章到generate_article接口")
            self.log_to_database('INFO', f"开始提交文章: {title}", f"article_id: {article_id}")

            # 确保有JWT token
            if not self.jwt_token:
                logger.warning(f"'{title}' - JWT token缺失，尝试重新登录")
                self.log_to_database('WARNING', f"JWT token缺失，重新登录: {title}")
                if not self.login_and_get_jwt_token():
                    error_msg = f"'{title}' - 重新登录失败"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg)
                    return False

            # 如果没有找到现有batch_id，生成新的unique_id
            if not existing_batch_id:
                timestamp = int(time.time())
                random_num = str(int(time.time() * 1000) % 10000).zfill(4)
                existing_batch_id = f"{timestamp}{random_num}"
                logger.warning(f"'{title}' - 生成新的batch_id: {existing_batch_id}")
                logger.error(f"'{title}' - 查询batch_id失败: {e}")

            # 转换内容为HTML格式
            html_content = self.convert_structured_to_dynamic(structured_data)

            # 构建发文数据，使用现有的batch_id以触发更新模式
            publish_data = {
                "title": structured_data['title'],
                "content": html_content,
                "tags": structured_data.get('tags', ''),
                "cover_image": structured_data.get('home_img', ''),
                "article_id": article_id,
                "batch_id": existing_batch_id,  # 使用现有的batch_id
                "uniq_id": existing_batch_id,
                "source": "coze_auto_generator",  # 标识来源
                "username": self.login_credentials['username']
            }

            logger.info(f"'{title}' - 准备提交的数据: article_id={article_id}, batch_id={existing_batch_id}")
            logger.info(f"'{title}' - 提交数据详情: {json.dumps(publish_data, ensure_ascii=False)[:200]}...")

            # 发送请求
            upload_url = f"{self.base_url}/api/generate_article"
            headers = {
                'Authorization': f'Bearer {self.jwt_token}',
                'Content-Type': 'application/json',
                'Accept': 'application/json'
            }

            response = requests.post(
                upload_url,
                json=publish_data,
                headers=headers,
                timeout=60,
                proxies=self.proxies
            )

            logger.info(f"'{title}' - 提交响应状态码: {response.status_code}")

            if response.status_code == 200:
                try:
                    result = response.json()
                    logger.info(f"'{title}' - 提交响应内容: {result}")

                    if result.get('success') or result.get('errno') == 0:
                        success_msg = f"'{title}' - 文章提交成功, ID: {existing_batch_id}"
                        logger.info(success_msg)
                        self.log_to_database('INFO', success_msg, f"article_id: {article_id}, batch_id: {existing_batch_id}")
                        return True
                    else:
                        error_msg = f"'{title}' - 文章提交失败: {result.get('message', result.get('errmsg', '未知错误'))}"
                        logger.error(error_msg)
                        self.log_to_database('ERROR', error_msg, f"article_id: {article_id}, response: {result}")
                        return False
                except json.JSONDecodeError as e:
                    error_msg = f"'{title}' - 解析提交响应失败: {e}"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
                    return False
            elif response.status_code == 401:
                # 处理401错误：JWT token过期，重新登录后重试
                logger.warning(f"'{title}' - JWT token过期(401)，尝试重新登录")
                self.log_to_database('WARNING', f"JWT token过期，重新登录: {title}", f"article_id: {article_id}")

                if self.login_and_get_jwt_token():
                    logger.info(f"'{title}' - 重新登录成功，重试提交文章")
                    # 更新headers中的token
                    headers['Authorization'] = f'Bearer {self.jwt_token}'

                    # 重试请求
                    retry_response = requests.post(
                        upload_url,
                        json=publish_data,
                        headers=headers,
                        timeout=60,
                        proxies=self.proxies
                    )

                    logger.info(f"'{title}' - 重试响应状态码: {retry_response.status_code}")

                    if retry_response.status_code == 200:
                        try:
                            retry_result = retry_response.json()
                            logger.info(f"'{title}' - 重试响应内容: {retry_result}")

                            if retry_result.get('success') or retry_result.get('errno') == 0:
                                success_msg = f"'{title}' - 重试提交成功, ID: {existing_batch_id}"
                                logger.info(success_msg)
                                self.log_to_database('INFO', success_msg, f"article_id: {article_id}, batch_id: {existing_batch_id}")
                                return True
                            else:
                                error_msg = f"'{title}' - 重试提交失败: {retry_result.get('message', retry_result.get('errmsg', '未知错误'))}"
                                logger.error(error_msg)
                                self.log_to_database('ERROR', error_msg, f"article_id: {article_id}, retry_response: {retry_result}")
                                return False
                        except json.JSONDecodeError as e:
                            error_msg = f"'{title}' - 解析重试响应失败: {e}"
                            logger.error(error_msg)
                            self.log_to_database('ERROR', error_msg, f"retry_response_text: {retry_response.text}")
                            return False
                    else:
                        error_msg = f"'{title}' - 重试请求仍然失败，状态码: {retry_response.status_code}"
                        logger.error(error_msg)
                        self.log_to_database('ERROR', error_msg, f"retry_response_text: {retry_response.text}")
                        return False
                else:
                    error_msg = f"'{title}' - 重新登录失败，无法重试"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg, f"article_id: {article_id}")
                    return False
            else:
                error_msg = f"'{title}' - 文章提交请求失败，状态码: {response.status_code}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
                return False

        except requests.exceptions.Timeout:
            error_msg = f"'{title}' - 提交文章请求超时"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, f"article_id: {article_id}")
            return False
        except requests.exceptions.RequestException as e:
            error_msg = f"'{title}' - 提交文章网络异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return False
        except Exception as e:
            error_msg = f"'{title}' - 提交文章异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return False

    def get_generate_topics(self) -> List[Dict]:
        """获取状态为topic或failed的待处理数据，支持失败重试"""
        try:
            with self.db_manager.get_cursor() as cursor:
                # 查询topic状态和failed状态的文章（支持失败重试）
                # LEFT JOIN ai_prompt_workflow 表获取 auth_token 和 workflow_id
                sql = """
                SELECT a.id, a.topic, a.batch_id, a.status, a.created_at, a.updated_at,
                       p.auth_token, p.workflow_id, p.prompt_workflow_name
                FROM ai_articles a
                LEFT JOIN ai_prompt_workflow p ON a.prompt_workflow_id = p.id
                WHERE a.status IN ('generate', 'generate_failed')
                AND a.topic > '' AND a.prompt_workflow_id > 0
                ORDER BY
                    CASE WHEN a.status = 'generate' THEN 1 ELSE 2 END,
                    a.id ASC
                LIMIT 1000
                """
                cursor.execute(sql)
                results = cursor.fetchall()

                if results:
                    logger.info(f"查询到 {len(results)} 个待处理主题")
                    for result in results:
                        logger.info(f"待处理文章 - ID: {result['id']}, 主题: {result['topic']}, 状态: {result['status']}, auth_token: {result.get('auth_token', 'N/A')}, workflow_id: {result.get('workflow_id', 'N/A')}")
                        self.log_to_database('INFO', f"发现待处理文章: {result['topic']}",
                                           f"ID: {result['id']}, 状态: {result['status']}, auth_token: {result.get('auth_token', 'N/A')}, workflow_id: {result.get('workflow_id', 'N/A')}")
                else:
                    logger.info("未查询到待处理主题")

                return results
        except Exception as e:
            error_msg = f"查询待处理主题异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return []

    def get_next_available_topic(self, pending_topics: List[Dict]) -> Optional[Dict]:
        """线程安全地获取下一个可处理的主题"""
        with self.processing_lock:
            for topic_data in pending_topics:
                article_id = topic_data['id']
                if article_id not in self.processed_ids:
                    self.processed_ids.add(article_id)
                    return topic_data
            return None

    def process_single_topic(self, topic_data: Dict) -> bool:
        """处理单个主题"""
        article_id = topic_data['id']
        topic = topic_data['topic']
        workflow_id = topic_data.get('workflow_id')
        auth_token = topic_data.get('auth_token')
        prompt_workflow_name = topic_data.get('prompt_workflow_name')
        worker_id = threading.current_thread().name
        batch_id = topic_data.get('batch_id')

        # ====== 新增：查找相似topic ======
        context_injection = ""  # 初始化上下文注入内容

        try:
            logger.info(f"[Worker-{worker_id}] 开始查找相似topic，当前topic: '{topic}'")
            self.log_to_database('INFO', f"开始查找相似topic: {topic}", f"article_id: {article_id}")

            # 调用向量检索接口
            similar_topics = search_chinese(
                query_text=topic,
                topk=3,  # 查询top3，后面会过滤到最多2条
                similarity_threshold=0.5  # 相似度阈值0.5
            )

            if similar_topics:
                logger.info(f"[Worker-{worker_id}] 找到 {len(similar_topics)} 个相似topic")
                logger.info(f"[Worker-{worker_id}] similar_topics完整返回值: {json.dumps(similar_topics, ensure_ascii=False)}")

                # 提取article_id列表（注意：返回的字段是id，不是article_id）
                article_ids = []
                for item in similar_topics:
                    aid = item.get('id', '')
                    if aid:
                        try:
                            # 将字符串id转换为int
                            article_ids.append(int(aid))
                        except (ValueError, TypeError):
                            logger.warning(f"[Worker-{worker_id}] 无法转换id为整数: {aid}")
                article_ids = [702, 699] #test测试rwl
                if article_ids:
                    logger.info(f"[Worker-{worker_id}] 提取到文章ID列表: {article_ids}")

                    # 从数据库查询content
                    context_injection = self.get_article_contents_by_ids(article_ids)

                    if context_injection:
                        logger.info(f"[Worker-{worker_id}] 获取到上下文注入内容，长度: {len(context_injection)} 字符")
                        self.log_to_database(
                            'INFO',
                            f"获取上下文注入内容: {topic}",
                            f"article_id: {article_id}, 相似文章IDs: {article_ids}, 内容长度: {len(context_injection)}"
                        )
                    else:
                        logger.warning(f"[Worker-{worker_id}] 未从article_ids {article_ids} 查询到content")
                        self.log_to_database('WARNING', f"未查询到content: {topic}", f"article_ids: {article_ids}")
                else:
                    logger.warning(f"[Worker-{worker_id}] 相似topic中没有有效的id")

                # 打印相似topic详情
                for i, similar in enumerate(similar_topics, 1):
                    logger.info(f"[Worker-{worker_id}]   相似topic[{i}]: {similar.get('title', 'N/A')}, 相似度: {similar.get('similar', 0):.4f}, 文章ID: {similar.get('id', 'N/A')}")
                self.log_to_database(
                    'INFO',
                    f"找到相似topic: {topic}",
                    f"article_id: {article_id}, 相似topic数量: {len(similar_topics)}, 详情: {json.dumps(similar_topics, ensure_ascii=False)}"
                )
            else:
                logger.info(f"[Worker-{worker_id}] 未找到相似topic（相似度>0.5）")
                self.log_to_database('INFO', f"未找到相似topic: {topic}", f"article_id: {article_id}")
        except Exception as e:
            error_msg = f"[Worker-{worker_id}] 查找相似topic异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            # 即使查找相似topic失败，也继续处理文章生成
        # ====== 相似topic查找结束 ======

        try:
            logger.info(f"[Worker-{worker_id}] 开始处理主题 ID:{article_id}, Topic:'{topic}', Prompt={prompt_workflow_name}")

            # 验证必要的参数
            if not workflow_id or not auth_token:
                error_msg = f"[Worker-{worker_id}] workflow_id 或 auth_token 缺失，Topic:'{topic}'"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, f"article_id: {article_id}")
                return False

            # 生成文章 - 开始计时
            start_time = time.time()
            structured_data = self.generate_article_from_coze(topic, context_injection, workflow_id, auth_token)
            end_time = time.time()
            elapsed_time = end_time - start_time

            logger.info(f"[Worker-{worker_id}] Coze文章生成耗时: {elapsed_time:.2f}秒, Topic:'{topic}'")

            if not structured_data:
                logger.error(f"[Worker-{worker_id}] 生成文章失败: {topic}")
                # 移除直接数据库操作：不再直接更新状态为generate_failed
                # 状态管理交给接口处理
                return False

            # 增加判断：structured_data['contents']为空，报错
            if not structured_data.get('contents'):
                logger.error(f"[Worker-{worker_id}] 生成文章失败: {topic} - structured_data['contents']为空")
                # 移除直接数据库操作：不再直接更新状态为generate_failed
                # 状态管理交给接口处理
                return False

            # 提交文章
            if self.generate_article(structured_data, article_id, batch_id):
                logger.info(f"[Worker-{worker_id}] 文章处理完成: {topic}")
                # Bug修复：正确发文状态应该是pending_review，不是draft
                # 注意：调用接口后不应再直接操作数据库，接口内部会处理状态
                return True
            else:
                logger.error(f"[Worker-{worker_id}] 文章提交失败: {topic}")
                # 移除直接数据库操作：不再直接更新状态为generate_failed
                # 状态管理交给接口处理
                return False

        except Exception as e:
            logger.error(f"[Worker-{worker_id}] 处理主题异常: {e}")
            # 移除直接数据库操作：不再直接更新状态为generate_failed
            # 状态管理交给接口处理
            return False

    def worker_process_topics(self, pending_topics: List[Dict], worker_id: int) -> int:
        """Worker线程处理主题的方法"""
        processed_count = 0
        thread_name = f"Worker-{worker_id}"
        threading.current_thread().name = thread_name

        logger.info(f"[{thread_name}] 启动，准备处理主题")

        while True:
            # 线程安全地获取下一个待处理主题
            topic_data = self.get_next_available_topic(pending_topics)
            if not topic_data:
                logger.info(f"[{thread_name}] 没有更多待处理主题，退出")
                break

            # 处理主题
            if self.process_single_topic(topic_data):
                processed_count += 1
                logger.info(f"[{thread_name}] 成功处理主题: {topic_data['topic']}")
            else:
                logger.error(f"[{thread_name}] 处理主题失败: {topic_data['topic']}")

        logger.info(f"[{thread_name}] 完成，共处理 {processed_count} 个主题")
        return processed_count

    def run_monitor(self):
        """运行监控循环，支持多worker并行处理"""
        logger.info(f"开始监控ai_articles表，使用 {WORKER_COUNT} 个worker并行处理...")
        self.log_to_database('INFO', f'启动文章自动生成监控服务，worker数量: {WORKER_COUNT}', 'run_monitor')

        while True:
            try:
                # 获取待处理的主题
                pending_topics = self.get_generate_topics()

                if pending_topics:
                    logger.info(f"发现 {len(pending_topics)} 个待处理主题，启动 {WORKER_COUNT} 个worker并行处理")
                    self.log_to_database('INFO', f'发现待处理主题，启动并行处理', f'主题数量: {len(pending_topics)}, worker数量: {WORKER_COUNT}')

                    # 清空已处理记录集合
                    with self.processing_lock:
                        self.processed_ids.clear()

                    # 使用线程池并行处理
                    with ThreadPoolExecutor(max_workers=WORKER_COUNT, thread_name_prefix="CozeWorker") as executor:
                        # 提交worker任务
                        future_to_worker = {}
                        for worker_id in range(1, WORKER_COUNT + 1):
                            future = executor.submit(self.worker_process_topics, pending_topics, worker_id)
                            future_to_worker[future] = worker_id

                        # 等待所有worker完成
                        total_processed = 0
                        for future in as_completed(future_to_worker):
                            worker_id = future_to_worker[future]
                            try:
                                processed_count = future.result()
                                total_processed += processed_count
                                logger.info(f"Worker-{worker_id} 完成，处理了 {processed_count} 个主题")
                            except Exception as e:
                                logger.error(f"Worker-{worker_id} 执行异常: {e}")
                                self.log_to_database('ERROR', f'Worker-{worker_id} 执行异常', str(e))

                    logger.info(f"本轮并行处理完成，共处理 {total_processed} 个主题")
                    self.log_to_database('INFO', f'本轮并行处理完成', f'共处理 {total_processed} 个主题')

                    # 处理完一轮后稍作休息
                    time.sleep(5)
                else:
                    logger.info("暂无待处理主题，继续监控...")

                # 每秒检查一次
                time.sleep(SLEEP_INTERVAL)

            except KeyboardInterrupt:
                logger.info("收到中断信号，停止监控")
                self.log_to_database('INFO', '监控服务手动停止', 'KeyboardInterrupt')
                break
            except Exception as e:
                error_msg = f"监控循环异常: {e}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, traceback.format_exc())
                time.sleep(5)  # 异常时等待5秒再继续

def main():
    """主函数"""
    generator = CozeArticleGenerator()

    try:
        # 先登录获取JWT token
        logger.info("开始登录获取JWT token")
        if not generator.login_and_get_jwt_token():
            logger.error("登录失败，程序退出")
            return

        # 开始监控
        generator.run_monitor()

    except Exception as e:
        logger.error(f"程序运行异常: {e}")
        generator.log_to_database('ERROR', f'程序运行异常: {e}', traceback.format_exc())

if __name__ == "__main__":
    main()