From 97dcff8c8b6ad252860f363e3528f848d33ef527 Mon Sep 17 00:00:00 2001 From: shengyudong Date: Thu, 5 Feb 2026 20:25:23 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=81=E9=9D=A2?= =?UTF-8?q?=E5=9B=BE=E5=8E=8B=E5=AD=97=E8=8A=B1=E5=8A=9F=E8=83=BD=E5=92=8C?= =?UTF-8?q?=E5=90=AF=E5=8A=A8=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增封面图本地化压字花处理(深褐色文字+白色描边,居中显示) - 支持Linux/Windows跨平台字体加载 - 新增启动脚本 start_article_auto_image_matching.sh - 优化图片生成策略(0张图/1张图/多张图不同处理) - 绕过网络接口IncompleteRead问题,本地化处理更稳定 - 更新README文档,完善使用说明 --- README.md | 283 +++++-- abc.py | 436 ++++++++++ activate_keyword_crawl_env.sh | 41 + article_auto_image_matching.py | 754 ++++++++++++++--- image_matching.py | 910 +++++++++++++++++++++ match_article_images.py | 910 --------------------- setup_env.sh | 29 +- start_article_auto_image_matching.sh | 77 ++ 参考脚本/generate_injection_article(1).py | 945 ++++++++++++++++++++++ 9 files changed, 3292 insertions(+), 1093 deletions(-) create mode 100644 abc.py create mode 100644 activate_keyword_crawl_env.sh create mode 100644 image_matching.py create mode 100644 start_article_auto_image_matching.sh create mode 100644 参考脚本/generate_injection_article(1).py diff --git a/README.md b/README.md index 324d096..48b6784 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,87 @@ -# 文字匹配图片项目 +# 文章图片智能匹配系统 -AI驱动的文章与图片智能匹配系统,支持从数据库读取文章数据,自动匹配图片或使用Gemini生成图片,并批量发布文章。 +AI驱动的文章与图片智能匹配系统,支持自动匹配现有图片、Gemini生成新图片、封面图压字花处理,并批量发布文章。 ## 项目概述 本项目实现了以下核心功能: -- 从数据库读取待处理的文章数据 -- 基于文章标签智能匹配现有图片库 -- 使用通义千问大模型评估文章与图片的匹配度 -- 匹配失败时自动调用Gemini API生成相关图片 -- 将生成的图片信息插入数据库(ai_images、ai_image_tags、ai_article_images) -- 上传图片到服务器 -- 批量发布处理完成的文章 +- 从数据库读取待处理的文章数据(status='pending_review' 且 review_user_id=152) +- 基于文章标签和科室智能匹配现有图片库(优先实拍图,后模板图) +- 使用通义千问大模型评估文章与图片的匹配度(阈值0.6) +- 匹配失败时自动调用Gemini API生成图片(0张图:1封面+2详情;1张图:补充缺失类型) +- **封面图压字花处理**:本地化图片文字融合(深褐色文字+白色描边,居中显示) +- 将图片信息插入数据库并上传到服务器 +- 所有图片生成完成后统一调用RPA审核接口 ## 技术栈 - 
**Python 3.12** +- **运行平台**: Linux 服务器(不支持Windows) - **数据库**: MySQL (PyMySQL) - **AI服务**: - - Google Gemini API (图片生成) - - 通义千问 API (文章图片匹配评估) -- **依赖库**: + - Google Gemini API (图片生成,模型:gemini-3-pro-image-preview) + - 通义千问 API (文章图片匹配评估,模型:qwen-max) +- **核心依赖库**: - `requests==2.31.0` - HTTP请求 - `google-genai==0.1.0` - Gemini API调用 - `pymysql` - MySQL数据库连接 - - `Pillow==10.0.0` - 图片处理 + - `Pillow==10.0.0` - 图片处理和文字融合 ## 项目结构 ``` 文字匹配图片/ -├── match_article_images.py # 主程序:文章图片匹配 -├── database_config.py # 数据库配置管理 -├── log_config.py # 日志配置 -├── export_approved_articles.py # 导出审核通过的文章 -├── export_image_tags.py # 导出图片标签数据 -├── push_article_published.py # 文章发布监控脚本 -├── requirements.txt # 项目依赖 -├── setup_env.bat # Windows环境初始化脚本 -├── setup_env.sh # Linux/macOS环境初始化脚本 -├── db/ # 数据库表结构 -│ ├── split_tables/ # 按表拆分的SQL文件 -│ └── ai_articles.sql # 完整数据库结构 -└── logs/ # 日志目录 +├── article_auto_image_matching.py # 主程序:文章图片智能匹配 +├── start_article_auto_image_matching.sh # 启动脚本 +├── database_config.py # 数据库配置管理 +├── log_config.py # 日志配置 +├── export_approved_articles.py # 导出审核通过的文章 +├── export_image_tags.py # 导出图片标签数据 +├── push_article_published.py # 文章发布监控脚本 +├── requirements.txt # 项目依赖 +├── setup_env.bat # Windows环境初始化脚本 +├── setup_env.sh # Linux/macOS环境初始化脚本 +├── db/ # 数据库表结构 +│ ├── split_tables/ # 按表拆分的SQL文件(24个表) +│ └── ai_articles.sql # 完整数据库结构 +└── logs/ # 日志目录 + ├── article_image_matching.log # 匹配日志 + ├── article_image_matching_error.log # 错误日志 + └── start_*.log # 启动日志 ``` ## 环境配置 -### 1. 创建虚拟环境 +### 1. Linux 中文字体安装(必需) -**Windows**: +封面图压字花功能需要中文字体支持,请先安装: + +**Ubuntu/Debian**: ```bash -python -m venv venv -venv\Scripts\activate +sudo apt-get update +sudo apt-get install fonts-wqy-zenhei fonts-wqy-microhei ``` -**Linux/macOS**: +**CentOS/RHEL**: ```bash -python3 -m venv venv -source venv/bin/activate +sudo yum install wqy-zenhei-fonts wqy-microhei-fonts +# 或安装 Google Noto 字体 +sudo yum install google-noto-sans-cjk-fonts ``` -### 2. 
安装依赖 - +**验证字体安装**: ```bash +fc-list :lang=zh +``` + +### 2. Python 虚拟环境 + +本项目使用共享虚拟环境:`/home/work/keyword_crawl/venv` + +如需创建新环境: +```bash +python3 -m venv /home/work/keyword_crawl/venv +source /home/work/keyword_crawl/venv/bin/activate pip install -r requirements.txt ``` @@ -75,23 +93,45 @@ DB_CONFIG = { 'host': 'your_host', 'user': 'your_user', 'password': 'your_password', - 'database': 'ai_article', + 'database': 'ai_articles', 'charset': 'utf8mb4' } ``` ## 使用方法 -### 文章图片匹配 +### 快速启动(推荐) -**测试模式**(处理前N篇文章): +**1. 赋予执行权限**: ```bash -python match_article_images.py --test 3 +chmod +x start_article_auto_image_matching.sh ``` -**正常模式**(处理所有文章): +**2. 前台运行**(查看实时输出): ```bash -python match_article_images.py +./start_article_auto_image_matching.sh +``` + +**3. 后台运行**: +```bash +nohup ./start_article_auto_image_matching.sh > /dev/null 2>&1 & +``` + +**4. 查看运行状态**: +```bash +# 查看进程 +ps aux | grep article_auto_image_matching + +# 查看最新日志 +ls -lt logs/start_*.log | head -1 +tail -f logs/start_*.log +``` + +### 手动运行 + +```bash +source /home/work/keyword_crawl/venv/bin/activate +python article_auto_image_matching.py ``` ### 导出数据 @@ -110,25 +150,57 @@ python export_image_tags.py ### 1. 文章图片匹配流程 -1. 从数据库读取状态为 `approved` 的文章 -2. 根据文章标签匹配图片库中的图片(image_attached_article_count < 5) -3. 使用通义千问API评估匹配质量 -4. 匹配成功:更新图片使用计数 -5. 匹配失败:调用Gemini生成新图片 +1. **查询待匹配文章**:status='pending_review' 且 review_user_id=152,无图片关联 +2. **获取可用图片**:根据文章科室ID查询可用图片(image_attached_article_count < 5 且 status='generate') +3. **图片优先级排序**:实拍图(image_source=2)> 模板图(image_source=1),按挂载次数升序 +4. **通义千问评估**:调用API评估匹配度,阈值0.6 +5. **匹配成功**:插入关联记录,更新图片状态为published +6. **匹配失败**:根据当前图片数量采用不同策略生成图片 -### 2. Gemini图片生成流程 +### 2. 图片生成策略 + +- **0张图**:生成1张封面图(image_source=12)+ 2张详情图(image_source=13) +- **1张图**: + - 缺少实拍图:生成1张封面图(image_source=12) + - 缺少AI生成图:补充详情图至2张(image_source=13) +- **≥2张图**:检查并补充缺失类型图片 + +### 3. 
封面图压字花处理(本地化) + +**核心特性**: +- ✅ 文字居中显示:自动计算居中坐标 +- ✅ 深褐色文字:RGB(180, 60, 50) +- ✅ 白色描边效果:3像素宽度 +- ✅ 自适应字体大小:基础120px,根据图片尺寸调整(40-150px) +- ✅ 自动换行:每行最多12个字符 +- ✅ 跨平台支持:自动检测操作系统并加载对应字体 + +**处理流程**: +1. Gemini生成封面图片 +2. 本地压字花处理(添加文章标题) +3. 上传到通用图片接口 +4. 插入数据库关联记录(image_source=12) + +**技术优势**: +- 绕过网络接口的 IncompleteRead 问题 +- 本地处理更快更稳定 +- 完全符合视觉规范 + +### 4. Gemini图片生成流程 1. 根据文章标题和标签生成提示词 -2. 调用Gemini API生成图片 -3. 将图片信息插入 `ai_images` 表 -4. 将图片标签信息插入 `ai_image_tags` 表(image_source=3表示AI生成) -5. 上传图片到服务器 -6. 将文章与图片关联信息插入 `ai_article_images` 表(image_source=0) -7. sort_order自动设置为当前文章下最大值+1 +2. 调用Gemini API生成图片(模型:gemini-3-pro-image-preview) +3. 插入 `ai_images` 表 +4. 插入 `ai_image_tags` 表(image_source=3表示AI生成) +5. 上传图片到服务器(获取真实URL) +6. 更新数据库中的图片URL +7. 插入 `ai_article_images` 表(sort_order自动递增) -### 3. 批量发布 +### 5. RPA审核接口 -处理完成后自动调用 `/api/articles/batch-publish-auto` 接口批量发布文章。 +所有图片(1封面+2详情)生成完成后,统一调用RPA审核接口: +- 端点:`POST /api/articles/rpa/review` +- 参数:`article_ids`(文章ID列表)、`image_source`(图片来源类型) ## 数据库表结构 @@ -142,29 +214,100 @@ python export_image_tags.py - **ai_keywords**: 关键词表 - **ai_departments**: 部门表 -## API配置 +## 配置参数 -### Gemini API -- 端点: `https://work.poloapi.com` -- 模型: `gemini-3-pro-image-preview` +### 核心常量 -### 文章发布API -- 登录: `http://47.99.184.230:8324/api/auth/login` -- 图片上传: `http://47.99.184.230:8324/api/images/upload` -- 批量发布: `http://47.99.184.230:8324/api/articles/batch-publish-auto` +```python +WORKER_COUNT = 4 # 并行处理worker数量 +BATCH_SIZE = 50 # 每批处理的文章数量 +MATCH_THRESHOLD = 0.6 # 匹配分数阈值(0-1) +``` -## 日志 +### API配置 + +**Gemini API**: +- 端点:`https://work.poloapi.com` +- 模型:`gemini-3-pro-image-preview` +- API Key:配置在代码中 + +**通义千问 API**: +- 端点:`https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation` +- 模型:`qwen-max` +- API Key:配置在代码中 + +**后端服务 API**: +- 登录:`http://47.99.184.230:8324/api/auth/login` +- 图片上传:`http://47.99.184.230:8324/api/images/upload` +- RPA审核:`http://47.99.184.230:8324/api/articles/rpa/review` + +## 日志说明 + +### 日志文件位置 日志文件存储在 
`logs/` 目录下: -- `article_matching.log` - 文章匹配日志 -- `article_matching_error.log` - 错误日志 +- `article_image_matching.log` - 文章匹配主日志 +- `article_image_matching_error.log` - 错误日志 +- `start_YYYYMMDD_HHMMSS.log` - 启动脚本日志 + +### 日志级别 + +- **INFO**:正常流程信息 +- **WARNING**:警告信息(如字体加载失败) +- **ERROR**:错误信息(如图片生成失败) + +### 数据库日志 + +系统会将关键操作记录到 `ai_logs` 表: +- 启动/停止服务 +- 文章匹配成功/失败 +- 图片生成成功/失败 +- API调用结果 ## 注意事项 -1. 虚拟环境 (`venv/`) 已在 `.gitignore` 中排除 -2. API密钥和数据库密码请妥善保管 -3. 生产环境建议使用环境变量管理敏感信息 -4. 测试模式不会触发批量发布操作 +1. **运行平台**:仅支持Linux服务器,不支持Windows +2. **中文字体**:封面图压字花功能必须安装中文字体(文泉驿正黑/微米黑) +3. **虚拟环境**:使用共享虚拟环境 `/home/work/keyword_crawl/venv` +4. **API密钥**:妥善保管Gemini和通义千问的API密钥 +5. **数据库密码**:生产环境建议使用环境变量管理敏感信息 +6. **并发控制**:默认4个worker并行处理,可根据服务器性能调整 +7. **图片生成策略**:所有图片生成完成后才调用RPA审核接口 +8. **网络稳定性**:封面图上传已本地化处理,避免 IncompleteRead 错误 + +## 常见问题 + +### 1. 字体加载失败 + +**错误**:`无法加载任何中文字体` + +**解决**: +```bash +# Ubuntu/Debian +sudo apt-get install fonts-wqy-zenhei fonts-wqy-microhei + +# CentOS/RHEL +sudo yum install wqy-zenhei-fonts + +# 验证安装 +fc-list :lang=zh +``` + +### 2. 虚拟环境未找到 + +**错误**:`[警告] 未找到虚拟环境` + +**解决**:检查虚拟环境路径是否正确,或使用系统Python + +### 3. 图片上传失败 + +**原因**:网络不稳定导致的 IncompleteRead 错误 + +**解决**:已通过本地化处理解决,封面图使用本地压字花+通用上传接口 + +### 4. Gemini生成超时 + +**解决**:检查网络连接和API密钥配置 ## 许可证 diff --git a/abc.py b/abc.py new file mode 100644 index 0000000..d7eb37a --- /dev/null +++ b/abc.py @@ -0,0 +1,436 @@ + + +@image_bp.route('/add-article-cover-images/', methods=['POST']) +@require_auth +def add_article_cover_images(article_id): + """添加文章封面图片 + + 支持两种上传方式: + 1. action=image_content (默认): 从request.files上传图片文件 + 2. 
action=image_url: 从image_url参数下载图片 + """ + try: + # 获取action参数,默认为image_content + action = request.form.get('action', 'image_content') + logger.info(f"[添加文章封面] 上传方式: {action}") + + # 根据action参数处理图片源 + if action == 'image_url': + # 方式1: 从URL下载图片 + image_url = request.form.get('image_url', '').strip() + if not image_url: + return jsonify({ + 'code': 400, + 'message': '缺少image_url参数', + 'data': None + }), 400 + + logger.info(f"[添加文章封面] 从URL下载图片: {image_url}") + + try: + import requests + from io import BytesIO + + # 下载图片(禁用代理) + session = requests.Session() + session.trust_env = False + response = session.get(image_url, timeout=30) + response.raise_for_status() + + # 检查Content-Type + content_type = response.headers.get('Content-Type', '') + if 'image' not in content_type: + return jsonify({ + 'code': 400, + 'message': f'URL不是有效的图片资源,Content-Type: {content_type}', + 'data': None + }), 400 + + # 获取文件扩展名 + file_ext = 'jpg' # 默认扩展名 + if 'png' in content_type: + file_ext = 'png' + elif 'jpeg' in content_type or 'jpg' in content_type: + file_ext = 'jpg' + elif 'gif' in content_type: + file_ext = 'gif' + elif 'webp' in content_type: + file_ext = 'webp' + + # 创建一个模拟的文件对象 + image_data = response.content + file = BytesIO(image_data) + file.filename = f"downloaded_image.{file_ext}" + file.seek(0) + + logger.info(f"[添加文章封面] 图片下载成功,大小: {len(image_data)} bytes, 类型: {file_ext}") + + except requests.RequestException as e: + logger.error(f"[添加文章封面] 下载图片失败: {str(e)}") + return jsonify({ + 'code': 400, + 'message': f'下载图片失败: {str(e)}', + 'data': None + }), 400 + except Exception as e: + logger.error(f"[添加文章封面] 处理URL图片失败: {str(e)}") + return jsonify({ + 'code': 400, + 'message': f'处理URL图片失败: {str(e)}', + 'data': None + }), 400 + else: + # 方式2: 从request.files上传图片(默认方式) + if 'image' not in request.files: + return jsonify({ + 'code': 400, + 'message': '没有上传图片文件', + 'data': None + }), 400 + + file = request.files['image'] + if file.filename == '': + return jsonify({ + 'code': 400, + 
'message': '没有选择文件', + 'data': None + }), 400 + + # 验证文件类型 + allowed_extensions = {'png', 'jpg', 'jpeg', 'gif', 'webp'} + file_ext = file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else '' + if file_ext not in allowed_extensions: + return jsonify({ + 'code': 400, + 'message': '不支持的文件格式,仅支持: png, jpg, jpeg, gif, webp', + 'data': None + }), 400 + + logger.info(f"[添加文章封面] 从文件上传,文件名: {file.filename}, 类型: {file_ext}") + + db_manager = get_db_manager() + current_user = AuthUtils.get_current_user() + + # 检查文章是否存在 + check_article_sql = "SELECT id, title, topic, created_user_id FROM ai_articles WHERE id = %s AND status = %s" + logger.info(f"[添加文章封面] 执行SQL查询文章: {check_article_sql} - 参数: {article_id}") + article_result = db_manager.execute_query(check_article_sql, (article_id, 'pending_review', )) + logger.info(f"[添加文章封面] 查询文章结果: {article_result}") + + if not article_result: + return jsonify({ + 'code': 404, + 'message': '文章不存在', + 'data': None + }), 404 + + article = article_result[0] + topic = article['topic'] + + # 检查权限(只有创建者或管理员可以编辑) + if article['created_user_id'] != current_user['user_id'] and current_user['role'] != 'admin': + return jsonify({ + 'code': 403, + 'message': '没有权限编辑此文章', + 'data': None + }), 403 + + # 生成新的文件名 + current_time = datetime.now() + date_str = current_time.strftime('%Y%m%d') + timestamp = int(time.time() * 1000) # 毫秒时间戳 + random_num = random.randint(100, 999) # 3位随机数 + base_filename = f"{timestamp}{random_num}" + new_filename = f"{base_filename}.png" + #original_filename = f"{base_filename}_original.png" + thumb_filename = f"{base_filename}_thumb.png" + + # 创建日期目录 + date_dir = os.path.join(IMAGE_UPLOAD_DIR, date_str) + os.makedirs(date_dir, exist_ok=True) + + # 处理图片:优化压缩和生成缩略图 + try: + # 读取上传的图片 + image_data = file.read() + original_image = Image.open(io.BytesIO(image_data)) + + # 转换为RGB模式(确保兼容性) + if original_image.mode in ('RGBA', 'LA', 'P'): + # 创建白色背景 + background = Image.new('RGB', original_image.size, (255, 255, 255)) + if 
original_image.mode == 'P': + original_image = original_image.convert('RGBA') + background.paste(original_image, mask=original_image.split()[-1] if original_image.mode == 'RGBA' else None) + original_image = background + elif original_image.mode != 'RGB': + original_image = original_image.convert('RGB') + + # 1. 保存优化后的原图 + file_path = os.path.join(date_dir, new_filename) + # 1. 保存优化后的原图 + #original_file_path = os.path.join(date_dir, original_filename) + + # 大图优化:在清晰条件下缩小质量 + # 如果图片过大,先进行适当缩放 + max_size = (1920, 1080) # 最大尺寸限制 + if original_image.size[0] > max_size[0] or original_image.size[1] > max_size[1]: + original_image.thumbnail(max_size, Image.Resampling.LANCZOS) + logger.info(f"[添加文章封面] 图片尺寸优化: 缩放到 {original_image.size}") + + # 保存优化后的原图(高质量压缩) + #original_image.save(original_file_path, 'PNG', optimize=True, compress_level=6) + #logger.info(f"[添加文章封面] 优化原图保存成功: {original_file_path}") + + # 保存优化后的原图(高质量压缩) + original_image.save(file_path, 'PNG', optimize=True, compress_level=6) + logger.info(f"[添加文章封面] 优化原图保存成功: {file_path}") + + # ⭐ 关键修改:图片和文字融合 + try: + logger.info(f"[添加文章封面] 开始图片文字融合,文章标题: {topic}") + + # 生成带文字的文件名 + text_filename = f"{base_filename}_text.png" + text_file_path = os.path.join(date_dir, text_filename) + + # 调用图片文字融合功能(文字居中显示,深褐色文字+白色描边) + fusion_success = add_text_to_image( + image_path=file_path, + text=topic, + output_path=text_file_path, + position='center', # ⭐ 文字居中 + font_size=120, # 字体大小(基础值,会自适应调整) + font_color=(180, 60, 50) # ⭐ 深褐色文字(与参考图保持一致) + ) + + if fusion_success: + logger.info(f"[添加文章封面] 图片文字融合成功: {text_file_path}") + # 用融合后的图片替换原图 + os.replace(text_file_path, file_path) + logger.info(f"[添加文章封面] 已用融合图片替换原图: {file_path}") + else: + logger.warning(f"[添加文章封面] 图片文字融合失败,继续使用原图") + + except Exception as fusion_error: + logger.error(f"[添加文章封面] 图片文字融合异常: {str(fusion_error)}", exc_info=True) + logger.info(f"[添加文章封面] 融合失败,继续使用原图") + + # 2. 
生成缩略图 (120x160) + thumb_path = os.path.join(date_dir, thumb_filename) + thumb_image = original_image.copy() + + # 使用高质量重采样算法生成缩略图 + thumb_size = (120, 160) + + # 计算缩放比例,保持宽高比 + img_ratio = thumb_image.size[0] / thumb_image.size[1] + thumb_ratio = thumb_size[0] / thumb_size[1] + + if img_ratio > thumb_ratio: + # 图片更宽,以高度为准 + new_height = thumb_size[1] + new_width = int(new_height * img_ratio) + thumb_image = thumb_image.resize((new_width, new_height), Image.Resampling.LANCZOS) + # 裁剪中心部分 + left = (new_width - thumb_size[0]) // 2 + thumb_image = thumb_image.crop((left, 0, left + thumb_size[0], thumb_size[1])) + else: + # 图片更高,以宽度为准 + new_width = thumb_size[0] + new_height = int(new_width / img_ratio) + thumb_image = thumb_image.resize((new_width, new_height), Image.Resampling.LANCZOS) + # 裁剪中心部分 + top = (new_height - thumb_size[1]) // 2 + thumb_image = thumb_image.crop((0, top, thumb_size[0], top + thumb_size[1])) + + # 保存缩略图 + thumb_image.save(thumb_path, 'PNG', optimize=True, compress_level=9) + logger.info(f"[添加文章封面] 缩略图生成成功: {thumb_path} (尺寸: {thumb_image.size})") + + except Exception as img_error: + logger.error(f"[添加文章封面] 图片处理失败: {str(img_error)}", exc_info=True) + # 如果图片处理失败,回退到原始保存方式 + file.seek(0) # 重置文件指针 + file_path = os.path.join(date_dir, new_filename) + file.save(file_path) + logger.info(f"[添加文章封面] 回退保存成功: {file_path}") + + # 生成相对路径用于数据库存储 + relative_path = f"{date_str}/{new_filename}" + thumb_relative_path = f"{date_str}/{thumb_filename}" + #fusion_before_relative_path = f"{date_str}/{original_filename}" + + # 图片上传成功后,调用 TransformerImage 方法处理原图、缩图 + try: + logger.info(f"[添加文章封面] 开始调用 TransformerImage 处理图片") + + # 创建 SyncImageToOSS 实例 + sync_oss = SyncImageToOSS() + + # 调用 TransformerImage 方法处理原图 + #local_result = sync_oss.TransformerImage(original_file_path) + #logger.info(f"[添加文章封面] 原图上传结果: {local_result}") + + # 调用 TransformerImage 方法处理原图 + original_result = sync_oss.TransformerImage(file_path) + logger.info(f"[添加文章封面] 原图上传结果: {original_result}") + + 
# 调用 TransformerImage 方法处理缩图 + thumb_result = sync_oss.TransformerImage(thumb_path) + logger.info(f"[添加文章封面] 缩图上传结果: {thumb_result}") + + # 检查所有上传是否成功 + if not (original_result['success'] and thumb_result['success']): + logger.warning(f"[添加文章封面] OSS上传部分失败 - 原图: {original_result['success']}, 缩图: {thumb_result['success']}") + + except Exception as e: + logger.error(f"[添加文章封面] TransformerImage 调用失败: {str(e)}") + + # 在 ai_images 表中创建新记录 + image_insert_sql = """INSERT INTO ai_images + (image_name, image_url, image_thumb_url, upload_user_id, status) + VALUES (%s, %s, %s, %s, %s)""" + logger.info(f"[添加文章封面] 执行SQL插入图片记录") + image_id = db_manager.execute_insert(image_insert_sql, ( + new_filename, relative_path, thumb_relative_path, current_user['user_id'], 'active' + )) + logger.info(f"[添加文章封面] 图片记录创建成功: 图片ID {image_id}") + + # ⭐ 关键修改:先判断是否存在 sort_order=1 的封面图 + check_cover_sql = """SELECT id, image_id FROM ai_article_images + WHERE article_id = %s AND sort_order = 1""" + logger.info(f"[添加文章封面] 检查是否已存在封面图: article_id={article_id}, sort_order=1") + existing_cover = db_manager.execute_query(check_cover_sql, (article_id,)) + logger.info(f"[添加文章封面] 查询结果: {existing_cover}") + + if existing_cover: + # 已存在封面图,走更新流程 + old_relation_id = existing_cover[0]['id'] + old_image_id = existing_cover[0]['image_id'] + + logger.info(f"[添加文章封面] 检测到已存在封面图,执行更新操作") + logger.info(f"[添加文章封面] 旧关联ID: {old_relation_id}, 旧图片ID: {old_image_id}") + + # 更新 ai_article_images 表中的关联记录 + relation_update_sql = """UPDATE ai_article_images + SET image_id = %s, image_url = %s, image_thumb_url = %s, updated_at = NOW() + WHERE id = %s""" + logger.info(f"[添加文章封面] 执行SQL更新文章图片关联") + db_manager.execute_update(relation_update_sql, ( + image_id, relative_path, thumb_relative_path, old_relation_id + )) + logger.info(f"[添加文章封面] 文章图片关联更新成功: 关联ID {old_relation_id}, 新图片ID {image_id}") + + # 记录操作日志 + log_sql = """ + INSERT INTO ai_logs (user_id, action, target_type, target_id, description, ip_address, user_agent, status) + 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + """ + client_ip = request.environ.get('HTTP_X_FORWARDED_FOR', request.environ.get('REMOTE_ADDR', '未知')) + user_agent = request.headers.get('User-Agent', '未知') + action_desc = f'更新文章封面: 文章ID {article_id}, 新图片ID {image_id}, 旧图片ID {old_image_id}, 路径 {relative_path}' + log_params = ( + current_user['user_id'], + 'update_article_cover_images', + 'article_cover', + article_id, + action_desc, + client_ip, + user_agent, + 'success' + ) + logger.info(f"[添加文章封面] 执行SQL插入日志: {log_sql} - 参数: {log_params}") + db_manager.execute_insert(log_sql, log_params) + logger.info(f"[添加文章封面] 操作日志记录成功") + + logger.info(f"更新文章封面成功: {action_desc}") + + return jsonify({ + 'code': 200, + 'message': '封面图片更新成功', + 'data': { + 'article_id': article_id, + 'image_id': image_id, + 'old_image_id': old_image_id, + 'image_url': IMAGE_BASE_URL + relative_path, + 'relative_path': relative_path, + 'image_thumb_url': IMAGE_BASE_URL + thumb_relative_path, + 'thumb_relative_path': thumb_relative_path, + 'operation': 'update', + 'optimization_info': { + 'original_size': f"{original_image.size[0]}x{original_image.size[1]}" if 'original_image' in locals() else 'unknown', + 'thumb_size': '120x160', + 'compression': 'PNG优化压缩', + 'features': ['大图优化', '缩略图生成', '智能裁剪', '高质量重采样', '文字融合'] + } + }, + 'timestamp': int(datetime.now().timestamp() * 1000) + }) + else: + # 不存在封面图,走新增流程 + logger.info(f"[添加文章封面] 未检测到封面图,执行新增操作") + + # 在 ai_article_images 表中创建关联记录 + relation_insert_sql = """INSERT INTO ai_article_images + (article_id, image_id, image_url, image_thumb_url, image_source, sort_order) + VALUES (%s, %s, %s, %s, %s, %s)""" + logger.info(f"[添加文章封面] 执行SQL创建文章图片关联") + db_manager.execute_insert(relation_insert_sql, ( + article_id, image_id, relative_path, thumb_relative_path, 4, 1 # image_source=4表示封面图片,sort_order=1表示第一张 + )) + logger.info(f"[添加文章封面] 文章图片关联创建成功: 文章ID {article_id}, 图片ID {image_id}") + + # 记录操作日志 + log_sql = """ + INSERT INTO ai_logs (user_id, action, target_type, 
target_id, description, ip_address, user_agent, status) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + """ + client_ip = request.environ.get('HTTP_X_FORWARDED_FOR', request.environ.get('REMOTE_ADDR', '未知')) + user_agent = request.headers.get('User-Agent', '未知') + action_desc = f'添加文章封面: 文章ID {article_id}, 图片ID {image_id}, 路径 {relative_path}' + log_params = ( + current_user['user_id'], + 'add_article_cover_images', + 'article_cover', + article_id, + action_desc, + client_ip, + user_agent, + 'success' + ) + logger.info(f"[添加文章封面] 执行SQL插入日志: {log_sql} - 参数: {log_params}") + db_manager.execute_insert(log_sql, log_params) + logger.info(f"[添加文章封面] 操作日志记录成功") + + logger.info(f"添加文章封面成功: {action_desc}") + + return jsonify({ + 'code': 200, + 'message': '封面图片添加成功', + 'data': { + 'article_id': article_id, + 'image_id': image_id, + 'image_url': IMAGE_BASE_URL + relative_path, + 'relative_path': relative_path, + 'image_thumb_url': IMAGE_BASE_URL + thumb_relative_path, + 'thumb_relative_path': thumb_relative_path, + 'operation': 'insert', + 'optimization_info': { + 'original_size': f"{original_image.size[0]}x{original_image.size[1]}" if 'original_image' in locals() else 'unknown', + 'thumb_size': '120x160', + 'compression': 'PNG优化压缩', + 'features': ['大图优化', '缩略图生成', '智能裁剪', '高质量重采样', '文字融合'] + } + }, + 'timestamp': int(datetime.now().timestamp() * 1000) + }) + + except Exception as e: + logger.error(f"[添加文章封面] 处理请求时发生错误: {str(e)}", exc_info=True) + return jsonify({ + 'code': 500, + 'message': '服务器内部错误', + 'data': None + }), 500 + diff --git a/activate_keyword_crawl_env.sh b/activate_keyword_crawl_env.sh new file mode 100644 index 0000000..62023c7 --- /dev/null +++ b/activate_keyword_crawl_env.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# 脚本:activate_keyword_crawl_env.sh +# 功能:自动激活 /home/work/keyword_crawl/venv 虚拟环境并执行命令 + +# 定义虚拟环境路径 +VENV_PATH="/home/work/keyword_crawl/venv" + +# 检查虚拟环境是否存在 +if [ ! 
-d "$VENV_PATH" ]; then + echo "错误: 虚拟环境不存在: $VENV_PATH" + echo "请确保虚拟环境已正确创建" + exit 1 +fi + +# 检查虚拟环境中的 Python 解释器 +PYTHON_PATH="$VENV_PATH/bin/python" +if [ ! -f "$PYTHON_PATH" ]; then + echo "错误: 在 $VENV_PATH 中找不到 Python 解释器" + exit 1 +fi + +echo "检测到虚拟环境: $VENV_PATH" +echo "Python 版本: $($PYTHON_PATH --version)" + +# 如果提供了命令行参数,则在虚拟环境中执行 +if [ $# -gt 0 ]; then + echo "在虚拟环境中执行命令: $*" + "$VENV_PATH/bin/python" "$@" +else + # 否则,仅显示如何手动激活虚拟环境 + echo "" + echo "虚拟环境已验证通过!" + echo "" + echo "要手动激活虚拟环境,请运行:" + echo "source $VENV_PATH/bin/activate" + echo "" + echo "或者,您可以直接运行此脚本并附带要执行的 Python 文件:" + echo "./$(basename "$0") script.py" + echo "" +fi \ No newline at end of file diff --git a/article_auto_image_matching.py b/article_auto_image_matching.py index df55408..23235d2 100644 --- a/article_auto_image_matching.py +++ b/article_auto_image_matching.py @@ -95,7 +95,8 @@ class ArticleImageMatcher: a.title, a.content, a.coze_tag, - a.department + a.department, + a.department_id FROM ai_articles a WHERE NOT EXISTS ( SELECT 1 FROM ai_article_images ai @@ -135,7 +136,7 @@ class ArticleImageMatcher: connection = self.db_manager.get_connection() try: with connection.cursor(pymysql.cursors.DictCursor) as cursor: - # 查询指定科室ID、状态为generate且附加文章数量小于5的图片(不使用JOIN) + # 查询指定科室ID且状态为generate且附加文章数量小于5的图片 # 包含image_source字段用于区分实拍图和模板图 if article_department_id > 0: sql = """ @@ -156,11 +157,7 @@ class ArticleImageMatcher: FROM ai_image_tags it WHERE it.image_attached_article_count < 5 AND it.department_id = %s - AND EXISTS ( - SELECT 1 FROM ai_images i - WHERE i.id = it.image_id - AND i.status = 'generate' - ) + AND it.status = 'generate' ORDER BY it.image_attached_article_count ASC, it.id DESC """ cursor.execute(sql, (article_department_id,)) @@ -183,11 +180,7 @@ class ArticleImageMatcher: it.image_source FROM ai_image_tags it WHERE it.image_attached_article_count < 5 - AND EXISTS ( - SELECT 1 FROM ai_images i - WHERE i.id = it.image_id - AND i.status = 'generate' - ) + AND 
it.status = 'generate' ORDER BY it.image_attached_article_count ASC, it.id DESC """ cursor.execute(sql) @@ -198,6 +191,9 @@ class ArticleImageMatcher: self.log_to_database('INFO', f"查询到可用图片", f"数量: {len(results)}") else: logger.info("未查询到可用图片") + # 如果相关科室下没有可使用的图片,记录日志 + if article_department_id > 0: + logger.info(f"科室ID {article_department_id} 下没有可使用的图片,将进行Gemini生图") return results finally: @@ -328,6 +324,36 @@ class ArticleImageMatcher: self.log_to_database('ERROR', error_msg, traceback.format_exc()) return False, 0.0 + def get_article_image_count(self, article_id: int) -> int: + """ + 获取文章当前已关联的图片数量 + + Args: + article_id: 文章ID + + Returns: + 图片数量 + """ + try: + connection = self.db_manager.get_connection() + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + sql = """ + SELECT COUNT(*) as image_count + FROM ai_article_images + WHERE article_id = %s + """ + cursor.execute(sql, (article_id,)) + result = cursor.fetchone() + return result['image_count'] if result else 0 + finally: + connection.close() + except Exception as e: + error_msg = f"查询文章图片数量异常: {e}" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, traceback.format_exc()) + return 0 + def update_article_status(self, article_id: int, new_status: str) -> bool: """ 更新文章状态 @@ -409,7 +435,7 @@ class ArticleImageMatcher: image_data['keywords_name'], image_data['department_id'], image_data['department_name'], - 1 # image_source: 1表示tag匹配 + image_data['image_source'] # 使用原始图片的image_source值 )) # 更新图片附加文章计数 @@ -428,8 +454,12 @@ class ArticleImageMatcher: """ cursor.execute(update_image_status_sql, (image_data['image_id'],)) - # 更新文章状态为published_review - self.update_article_status(article_id, 'published_review') + # 调用RPA审核接口更新文章状态 + if self.call_rpa_review_api([article_id], 13): + logger.info(f"已通过RPA接口更新文章 {article_id} 状态") + else: + logger.error(f"通过RPA接口更新文章 {article_id} 状态失败") + return False connection.commit() logger.info(f"成功插入文章图片关联 - 文章ID: {article_id}, 图片ID: 
{image_data['image_id']}, 分数: {match_score}") @@ -443,7 +473,295 @@ class ArticleImageMatcher: self.log_to_database('ERROR', error_msg, traceback.format_exc()) return False - def generate_image_with_gemini(self, prompt: str, article_tags: List[str], article_id: int) -> Optional[str]: + def get_article_info(self, article_id: int) -> Optional[Dict]: + """ + 获取文章信息,包括部门和关键词信息 + + Args: + article_id: 文章ID + + Returns: + 文章信息字典 + """ + try: + connection = self.db_manager.get_connection() + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + sql = """ + SELECT id, title, content, coze_tag, department, department_id + FROM ai_articles + WHERE id = %s + """ + cursor.execute(sql, (article_id,)) + result = cursor.fetchone() + return result + finally: + connection.close() + except Exception as e: + error_msg = f"查询文章信息异常: {e}" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, traceback.format_exc()) + return None + + def get_article_image_sources(self, article_id: int) -> List[int]: + """ + 获取文章现有图片的image_source值列表 + + Args: + article_id: 文章ID + + Returns: + image_source值列表 + """ + try: + connection = self.db_manager.get_connection() + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + sql = """ + SELECT image_source + FROM ai_article_images + WHERE article_id = %s + """ + cursor.execute(sql, (article_id,)) + results = cursor.fetchall() + return [row['image_source'] for row in results] if results else [] + finally: + connection.close() + except Exception as e: + error_msg = f"查询文章图片source异常: {e}" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, traceback.format_exc()) + return [] + + def add_text_to_image_local(self, image_path: str, text: str, output_path: str) -> bool: + """ + 本地图片文字融合处理(压字花) + + Args: + image_path: 原图路径 + text: 要添加的文字(文章标题) + output_path: 输出路径 + + Returns: + 是否处理成功 + """ + try: + from PIL import Image, ImageDraw, ImageFont + import textwrap + + # 打开图片 + image = 
Image.open(image_path) + draw = ImageDraw.Draw(image) + + # 获取图片尺寸 + img_width, img_height = image.size + + # 计算自适应字体大小(基础120px) + base_font_size = 120 + font_size = int(base_font_size * min(img_width / 1920, img_height / 1080)) + font_size = max(40, min(font_size, 150)) # 限制范围40-150px + + # 尝试加载字体(支持Windows和Linux) + font = None + font_loaded = False + + try: + # 根据操作系统选择字体路径 + import platform + system = platform.system() + + if system == 'Windows': + font_paths = [ + 'C:/Windows/Fonts/msyh.ttc', # 微软雅黑 + 'C:/Windows/Fonts/simhei.ttf', # 黑体 + 'C:/Windows/Fonts/simsun.ttc', # 宋体 + 'C:/Windows/Fonts/msyhbd.ttc', # 微软雅黑Bold + 'C:/Windows/Fonts/simkai.ttf', # 楷体 + ] + else: # Linux/Unix + font_paths = [ + '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc', # 文泉驿正黑 + '/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc', # 文泉驿正黑(旧路径) + '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', # 文泉驿微米黑 + '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf', # Droid Sans + '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc', # Noto Sans CJK + '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc', # Noto Sans CJK + '/usr/share/fonts/truetype/arphic/uming.ttc', # AR PL UMing + '/usr/share/fonts/truetype/arphic/ukai.ttc', # AR PL UKai + ] + + logger.info(f"[压字花] 检测到操作系统: {system},尝试加载中文字体...") + + for font_path in font_paths: + try: + if os.path.exists(font_path): + font = ImageFont.truetype(font_path, font_size) + font_loaded = True + logger.info(f"[压字花] 成功加载字体: {font_path}") + break + else: + logger.debug(f"[压字花] 字体文件不存在: {font_path}") + except Exception as font_err: + logger.debug(f"[压字花] 字体加载失败 {font_path}: {font_err}") + continue + except Exception as e: + logger.warning(f"[压字花] 字体加载异常: {e}") + + # 如果所有字体都加载失败,给出安装提示 + if not font_loaded or font is None: + if system == 'Linux': + error_msg = ( + "无法加载任何中文字体。请在Linux服务器上安装中文字体:\n" + "Ubuntu/Debian: sudo apt-get install fonts-wqy-zenhei fonts-wqy-microhei\n" + "CentOS/RHEL: sudo yum install wqy-zenhei-fonts wqy-microhei-fonts\n" 
+ "或: sudo yum install google-noto-sans-cjk-fonts" + ) + else: + error_msg = "无法加载任何中文字体,压字花功能需要中文字体支持" + logger.error(f"[压字花] {error_msg}") + raise Exception(error_msg) + + # 文字自动换行(每行最多12个字符,避免过长) + max_chars_per_line = 12 + lines = textwrap.wrap(text, width=max_chars_per_line) + + # 如果标题过长,手动分割 + if not lines: + lines = [text] + + # 计算文字总高度 + line_height = font_size + 20 # 行间距 + total_text_height = len(lines) * line_height + + # 计算文字起始Y坐标(居中) + start_y = (img_height - total_text_height) // 2 + + # 深褐色文字颜色 RGB(180, 60, 50) + text_color = (180, 60, 50) + # 白色描边颜色 + outline_color = (255, 255, 255) + outline_width = 3 # 描边宽度 + + # 绘制每一行文字 + for i, line in enumerate(lines): + # 计算文字宽度(居中) + try: + bbox = draw.textbbox((0, 0), line, font=font) + text_width = bbox[2] - bbox[0] + except: + # 兼容旧版本Pillow + text_width = len(line) * font_size * 0.6 + + x = (img_width - text_width) // 2 + y = start_y + i * line_height + + # 绘制白色描边(多次绘制形成描边效果) + for offset_x in range(-outline_width, outline_width + 1): + for offset_y in range(-outline_width, outline_width + 1): + if offset_x != 0 or offset_y != 0: + draw.text((x + offset_x, y + offset_y), line, font=font, fill=outline_color) + + # 绘制深褐色文字 + draw.text((x, y), line, font=font, fill=text_color) + + # 保存图片 + image.save(output_path, 'PNG', optimize=True, compress_level=6) + logger.info(f"[压字花] 文字融合成功: {output_path}") + return True + + except Exception as e: + logger.error(f"[压字花] 文字融合失败: {e}") + return False + + def upload_cover_image_with_text_fusion(self, image_path: str, article_id: int, article_title: str, + image_info: Dict) -> bool: + """ + 本地化处理封面图(压字花 + 上传 + 关联) + + Args: + image_path: 本地图片路径 + article_id: 文章ID + article_title: 文章标题(用于压字花) + image_info: 图片信息字典 + + Returns: + 是否上传成功 + """ + try: + # 1. 
本地压字花处理 + logger.info(f"[封面图本地化] 开始压字花处理,文章ID: {article_id}, 标题: {article_title}") + + # 生成带文字的临时文件 + import uuid + text_temp_path = f"temp_text_{uuid.uuid4().hex}.png" + + fusion_success = self.add_text_to_image_local( + image_path=image_path, + text=article_title, + output_path=text_temp_path + ) + + if not fusion_success: + logger.error(f"[封面图本地化] 压字花处理失败") + return False + + # 2. 使用通用图片上传接口 + logger.info(f"[封面图本地化] 开始上传压字花图片") + upload_result = self.upload_image_to_server(text_temp_path, image_info['tag_image_id']) + + if not upload_result: + # 删除临时文件 + if os.path.exists(text_temp_path): + os.remove(text_temp_path) + logger.error(f"[封面图本地化] 上传失败") + return False + + # 获取上传后的真实路径 + uploaded_relative_path = upload_result.get('relative_path') or upload_result.get('image_url') + uploaded_thumb_path = upload_result.get('thumb_relative_path') or upload_result.get('image_thumb_url', '') + + logger.info(f"[封面图本地化] 上传成功,相对路径: {uploaded_relative_path}") + + # 3. 更新数据库中的图片URL + self.update_image_urls_after_upload( + image_id=image_info['image_id'], + tag_image_id=image_info['tag_image_id'], + image_url=uploaded_relative_path, + image_thumb_url=uploaded_thumb_path + ) + + # 4. 
插入文章图片关联记录(image_source=12表示封面图) + article_image_id = self.insert_article_image_relation_for_generated( + article_id=article_id, + image_id=image_info['image_id'], + image_url=uploaded_relative_path, + image_thumb_url=uploaded_thumb_path, + tag_image_id=image_info['tag_image_id'], + keywords_id=image_info['keywords_id'], + keywords_name=image_info['keywords_name'], + department_id=image_info['department_id'], + department_name=image_info['department_name'], + image_source=12 # 封面图固定为12(实拍图) + ) + + if article_image_id: + logger.info(f"[封面图本地化] 文章图片关联信息已创建,ai_article_images.id: {article_image_id}") + else: + logger.error(f"[封面图本地化] 文章图片关联创建失败") + return False + + # 删除临时文件 + if os.path.exists(text_temp_path): + os.remove(text_temp_path) + + logger.info(f"[封面图本地化] 处理完成") + return True + + except Exception as e: + logger.error(f"[封面图本地化] 处理失败: {e}") + return False + + def generate_image_with_gemini(self, prompt: str, article_tags: List[str], article_id: int, image_type: str = "默认") -> Optional[str]: """ 使用Gemini生成图片并上传到服务器 @@ -451,11 +769,17 @@ class ArticleImageMatcher: prompt: 图片生成提示词 article_tags: 文章标签列表,用于查询department和keywords article_id: 文章ID,用于关联图片 + image_type: 图片类型(封面图/详情图/海报图) Returns: 上传后的图片URL,失败返回None """ try: + # 从文章表获取文章的部门信息 + article_info = self.get_article_info(article_id) + article_department = article_info.get('department', '') if article_info else '' + article_department_id = article_info.get('department_id', 0) if article_info else 0 + # 导入必要的库 from google import genai from google.genai.client import HttpOptions @@ -486,51 +810,109 @@ class ArticleImageMatcher: if hasattr(part, 'inline_data') and part.inline_data is not None: image_data = part.inline_data if image_data.data is not None: - # 生成唯一的文件名(基于时间戳) + # 生成临时文件名 timestamp_ms = int(time.time() * 1000) - image_filename = f"{timestamp_ms}.png" - today_date = datetime.now().strftime("%Y%m%d") - image_url_path = f"{today_date}/{image_filename}" - temp_filename = 
f"temp_generated_image_{timestamp_ms}.png" + # 保存图片数据到临时文件 with open(temp_filename, 'wb') as f: f.write(image_data.data) logger.info(f"Gemini生成图片成功: {temp_filename}") - # 先将图片信息插入数据库 - image_info = self.insert_generated_image_to_db(image_filename, image_url_path, article_tags) + # 先上传图片到服务器,获取真实的文件名和路径 + # 注意:需要先插入ai_image_tags表获取tag_image_id,才能上传 + # 所以这里先使用临时路径插入数据库 + today_date = datetime.now().strftime("%Y%m%d") + temp_image_path = f"{today_date}/{timestamp_ms}.png" + + # 先将图片信息插入数据库(使用临时路径) + image_info = self.insert_generated_image_to_db( + f"{timestamp_ms}.png", # 临时文件名 + temp_image_path, # 临时路径 + article_department=article_department, + article_department_id=article_department_id, + article_keywords='', + article_keywords_id=0, + article_tags=article_tags + ) if not image_info: + os.remove(temp_filename) raise Exception("插入图片信息到数据库失败") logger.info(f"图片信息已插入数据库,tag_image_id: {image_info['tag_image_id']}, image_id: {image_info['image_id']}") # 使用tag_image_id上传图片到服务器 - uploaded_url = self.upload_image_to_server(temp_filename, image_info['tag_image_id']) + upload_result = self.upload_image_to_server(temp_filename, image_info['tag_image_id']) - # 将文章与图片的关联信息插入ai_article_images表 - article_image_id = self.insert_article_image_relation_for_generated( - article_id=article_id, + if not upload_result: + os.remove(temp_filename) + raise Exception("图片上传失败") + + # 从上传响应中获取真实的文件名和路径 + uploaded_relative_path = upload_result.get('relative_path') or upload_result.get('image_url') + uploaded_thumb_path = upload_result.get('thumb_relative_path') or upload_result.get('image_thumb_url', '') + + logger.info(f"图片上传成功,真实相对路径: {uploaded_relative_path}") + + # 更新数据库中的图片URL为上传后的真实路径 + self.update_image_urls_after_upload( image_id=image_info['image_id'], - image_url=image_info['image_url'], - image_thumb_url=image_info['image_thumb_url'], tag_image_id=image_info['tag_image_id'], - keywords_id=image_info['keywords_id'], - keywords_name=image_info['keywords_name'], - 
department_id=image_info['department_id'], - department_name=image_info['department_name'], - image_source=0 # 默认值 + image_url=uploaded_relative_path, + image_thumb_url=uploaded_thumb_path ) - if article_image_id: - logger.info(f"文章图片关联信息已创建,ai_article_images.id: {article_image_id}") + # 更新image_info为上传后的真实路径 + image_info['image_url'] = uploaded_relative_path + image_info['image_thumb_url'] = uploaded_thumb_path + + # ⭐ 关键修改:封面图本地化处理(本地压字花 + 通用上传 + 关联) + if image_type == '封面图': + # 获取文章标题用于压字花 + article_info = self.get_article_info(article_id) + article_title = article_info.get('title', '') if article_info else '' + + # 本地化处理封面图(绕过网络接口问题) + upload_success = self.upload_cover_image_with_text_fusion( + image_path=temp_filename, + article_id=article_id, + article_title=article_title, + image_info=image_info + ) + + if upload_success: + logger.info(f"[封面图] 文章 {article_id} 封面图本地化处理成功(已完成压字花和数据库关联)") + else: + logger.error(f"[封面图] 文章 {article_id} 封面图本地化处理失败") + # 删除临时文件 + if os.path.exists(temp_filename): + os.remove(temp_filename) + return None + else: + # 详情图:使用原有逻辑插入关联(image_source=13) + article_image_id = self.insert_article_image_relation_for_generated( + article_id=article_id, + image_id=image_info['image_id'], + image_url=image_info['image_url'], + image_thumb_url=image_info['image_thumb_url'], + tag_image_id=image_info['tag_image_id'], + keywords_id=image_info['keywords_id'], + keywords_name=image_info['keywords_name'], + department_id=image_info['department_id'], + department_name=image_info['department_name'], + image_source=13 # 详情图固定为13(AI生成图) + ) + + if article_image_id: + logger.info(f"[详情图] 文章图片关联信息已创建,ai_article_images.id: {article_image_id}") + else: + logger.error(f"[详情图] 文章 {article_id} 图片关联创建失败") # 删除临时文件 os.remove(temp_filename) - logger.info(f"图片已上传到服务器: {uploaded_url}") - return uploaded_url + return uploaded_relative_path raise Exception("Gemini API未返回有效的图片数据") @@ -543,13 +925,17 @@ class ArticleImageMatcher: self.log_to_database('ERROR', 
error_msg, traceback.format_exc()) return None - def insert_generated_image_to_db(self, image_name: str, image_url: str, article_tags: List[str]) -> Optional[Dict]: + def insert_generated_image_to_db(self, image_name: str, image_url: str, article_department: str = "", article_department_id: int = 0, article_keywords: str = "", article_keywords_id: int = 0, article_tags: List[str] = []) -> Optional[Dict]: """ 将Gemini生成的图片信息插入数据库 Args: image_name: 图片文件名 image_url: 图片URL路径 + article_department: 文章部门名称 + article_department_id: 文章部门ID + article_keywords: 文章关键词名称 + article_keywords_id: 文章关键词ID article_tags: 文章标签列表 Returns: @@ -559,39 +945,43 @@ class ArticleImageMatcher: connection = self.db_manager.get_connection() try: with connection.cursor(pymysql.cursors.DictCursor) as cursor: - # 根据文章标签查询ai_image_tags表 + # 使用文章的部门和关键词信息,如果没有则使用默认值 + department_id = article_department_id if article_department_id > 0 else 1 + keywords_id = article_keywords_id if article_keywords_id > 0 else 1 + department = article_department if article_department else 'AI生成' + keywords = article_keywords if article_keywords else 'AI图片' + + # 先确保文章的标签存在于ai_tags表中 + tag_id = 1 # 默认tag_id + if article_tags: - query = """ - SELECT department_name, keywords_name, department_id, keywords_id, tag_id - FROM ai_image_tags + # 首先查询ai_tags表中是否已存在该标签 + query_tag = """ + SELECT id + FROM ai_tags WHERE tag_name = %s LIMIT 1 """ - cursor.execute(query, (article_tags[0],)) + cursor.execute(query_tag, (article_tags[0],)) tag_info = cursor.fetchone() if tag_info: - department = tag_info['department_name'] - keywords = tag_info['keywords_name'] - department_id = tag_info['department_id'] - keywords_id = tag_info['keywords_id'] - tag_id = tag_info['tag_id'] + # 如果标签已存在,使用现有的信息 + tag_id = tag_info['id'] tag_name = article_tags[0] else: - department = "AI生成" - keywords = "AI图片" - department_id = 1 - keywords_id = 1 - tag_id = 1 - tag_name = article_tags[0] if article_tags else "AI生成" + # 如果标签不存在,则插入新标签 + insert_tag_query 
= """ + INSERT INTO ai_tags (tag_name, created_at, updated_at) + VALUES (%s, NOW(), NOW()) + """ + cursor.execute(insert_tag_query, (article_tags[0],)) + tag_id = cursor.lastrowid + tag_name = article_tags[0] else: - department = "AI生成" - keywords = "AI图片" - department_id = 1 - keywords_id = 1 - tag_id = 1 + # 如果没有文章标签,使用默认值 tag_name = "AI生成" - + # 插入ai_images表 insert_image_query = """ INSERT INTO ai_images @@ -638,7 +1028,7 @@ class ArticleImageMatcher: logger.error(f"插入图片信息到数据库失败: {e}") return None - def upload_image_to_server(self, image_path: str, tag_image_id: int) -> str: + def upload_image_to_server(self, image_path: str, tag_image_id: int) -> Optional[Dict]: """ 上传图片到服务器 @@ -647,7 +1037,7 @@ class ArticleImageMatcher: tag_image_id: 图片标签ID Returns: - 服务器上的图片URL + 上传响应数据字典,包含relative_path等字段 """ base_url = "http://47.99.184.230:8324" jwt_token = self.login_and_get_jwt_token(base_url) @@ -669,12 +1059,57 @@ class ArticleImageMatcher: if response.status_code == 200: result = response.json() if result.get('code') == 200: - return result['data']['http_image_url'] + upload_data = result['data'] + logger.info(f"上传成功,相对路径: {upload_data.get('relative_path')}") + return upload_data else: raise Exception(f"图片上传失败: {result.get('message', '未知错误')}") else: raise Exception(f"图片上传请求失败,状态码: {response.status_code}") + def update_image_urls_after_upload(self, image_id: int, tag_image_id: int, image_url: str, image_thumb_url: str) -> bool: + """ + 上传成功后更新数据库中的图片URL + + Args: + image_id: ai_images表的图片ID + tag_image_id: ai_image_tags表的标签ID + image_url: 上传后的相对路径 + image_thumb_url: 缩略图相对路径 + + Returns: + 是否更新成功 + """ + try: + connection = self.db_manager.get_connection() + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + # 更新ai_images表 + update_images_sql = """ + UPDATE ai_images + SET image_url = %s, image_thumb_url = %s + WHERE id = %s + """ + cursor.execute(update_images_sql, (image_url, image_thumb_url, image_id)) + logger.info(f"已更新ai_images表,image_id: 
{image_id}, URL: {image_url}") + + # 更新ai_image_tags表 + update_tags_sql = """ + UPDATE ai_image_tags + SET image_url = %s, image_thumb_url = %s + WHERE id = %s + """ + cursor.execute(update_tags_sql, (image_url, image_thumb_url, tag_image_id)) + logger.info(f"已更新ai_image_tags表,tag_image_id: {tag_image_id}, URL: {image_url}") + + connection.commit() + return True + finally: + connection.close() + except Exception as e: + logger.error(f"更新图片URL失败: {e}") + return False + def login_and_get_jwt_token(self, base_url: str) -> Optional[str]: """登录获取JWT token""" login_url = f"{base_url}/api/auth/login" @@ -695,14 +1130,74 @@ class ArticleImageMatcher: logger.error(f"登录异常: {e}") return None + def call_rpa_review_api(self, article_ids: List[int], image_source: int = 0) -> bool: + """ + 调用RPA审核接口 + + Args: + article_ids: 文章ID列表 + image_source: 图片来源类型 (11=模板图, 12=实拍图, 13=AI生成图) + + Returns: + 是否调用成功 + """ + try: + base_url = "http://47.99.184.230:8324" # API基础URL + jwt_token = self.login_and_get_jwt_token(base_url) + + if not jwt_token: + logger.error("获取JWT token失败,无法调用RPA审核接口") + return False + + # 准备请求数据 + api_url = f"{base_url}/api/articles/rpa/review" + headers = { + 'Authorization': f'Bearer {jwt_token}', + 'Content-Type': 'application/json' + } + + payload = { + "article_ids": article_ids, + "image_source": image_source + } + + logger.info(f"调用RPA审核接口: {api_url}") + logger.info(f"请求参数: article_ids={article_ids}, image_source={image_source}") + + response = requests.post(api_url, json=payload, headers=headers, timeout=30) + + logger.info(f"RPA审核接口响应状态码: {response.status_code}") + logger.info(f"RPA审核接口响应内容: {response.text}") + + if response.status_code == 200: + result = response.json() + if result.get('code') == 200: + logger.info(f"RPA审核接口调用成功: {result.get('message', '操作完成')}") + return True + else: + logger.error(f"RPA审核接口返回错误: {result.get('message', '未知错误')}") + return False + else: + logger.error(f"RPA审核接口调用失败,状态码: {response.status_code}") + return False + + except 
requests.exceptions.Timeout: + logger.error("RPA审核接口调用超时") + return False + except Exception as e: + logger.error(f"调用RPA审核接口异常: {e}") + return False + def insert_article_image_relation_for_generated(self, article_id: int, image_id: int, image_url: str, image_thumb_url: str, tag_image_id: int, keywords_id: int, keywords_name: str, department_id: int, department_name: str, image_source: int = 0) -> Optional[int]: """ - 将文章与生成图片的关联信息插入ai_article_images表 + 将文章与生成图片的关联信息处理(不调用RPA接口) + 注意:根据新要求,只插入关联信息,等所有图片生成完成后统一调用RPA接口 """ try: + # 1. 首先插入ai_article_images表(保持原有逻辑) connection = self.db_manager.get_connection() try: with connection.cursor(pymysql.cursors.DictCursor) as cursor: @@ -726,36 +1221,25 @@ class ArticleImageMatcher: """ cursor.execute(insert_query, ( article_id, image_id, image_url, image_thumb_url, tag_image_id, new_sort_order, - keywords_id, keywords_name, department_id, department_name, image_source + keywords_id, keywords_name, department_id, department_name, image_source # 使用传入的image_source值 )) + article_image_id = cursor.lastrowid - logger.info(f"文章图片关联信息已插入ai_article_images表,id: {article_image_id}") - - # 更新图片附加文章计数 - update_count_sql = """ - UPDATE ai_image_tags - SET image_attached_article_count = image_attached_article_count + 1 - WHERE id = %s - """ - cursor.execute(update_count_sql, (tag_image_id,)) - - # 更新图片状态为published - update_image_status_sql = """ - UPDATE ai_images - SET status = 'published' - WHERE id = %s - """ - cursor.execute(update_image_status_sql, (image_id,)) - - # 更新文章状态为published_review - self.update_article_status(article_id, 'published_review') + logger.info(f"文章图片关联信息已插入ai_article_images表,id: {article_image_id}, sort_order: {new_sort_order}, image_source: {image_source}") + # 提交插入操作 connection.commit() - return article_image_id finally: connection.close() + + # 注意:不再在这里调用RPA接口,而是在所有图片都生成完成后统一调用 + + # 记录操作日志到ai_logs表 + self.log_to_database('INFO', f"AI生成图片关联完成", f"文章ID: {article_id}, 图片ID: {image_id}, 关联记录ID: 
{article_image_id}") + + return article_image_id except Exception as e: - logger.error(f"插入文章图片关联信息失败: {e}") + logger.error(f"处理文章图片关联信息失败: {e}") return None def match_article_with_images(self, article_data: Dict) -> bool: @@ -792,8 +1276,7 @@ class ArticleImageMatcher: available_images = self.get_available_images_with_tags(article_department_id) if not available_images: - logger.warning(f"文章 {article_id} 没有找到对应科室的可用图片,跳过") - return False + logger.info(f"文章 {article_id} 没有找到对应科室的可用图片,将进行Gemini生图") # 根据图片类型(实拍图/模板图)进行分类处理 # 根据image_source字段:1=clean_images(模板图), 2=Flower_character(实拍图) @@ -846,22 +1329,85 @@ class ArticleImageMatcher: else: return False else: - # 未找到合适的匹配图片,使用Gemini生成图片 - logger.info(f"文章 {article_id} 未找到合适的匹配图片,调用Gemini生成图片") - self.log_to_database('WARNING', f"文章未找到匹配图片,尝试生成图片", f"文章ID: {article_id}") + # 未找到合适的匹配图片,根据当前图片数量采用不同策略生成图片 + current_image_count = self.get_article_image_count(article_id) + logger.info(f"文章 {article_id} 当前已有 {current_image_count} 张图片,采用相应生成策略") - # 构建生成提示词 - prompt = f"与'{article_title}'相关的插图,标签: {', '.join(article_tags)}" - generated_image_url = self.generate_image_with_gemini(prompt, article_tags, article_id) - - if generated_image_url: - logger.info(f"文章 {article_id} 成功生成图片: {generated_image_url}") - self.log_to_database('INFO', f"文章生成图片成功", - f"文章ID: {article_id}, 图片URL: {generated_image_url}") - return True + images_to_generate = [] + if current_image_count == 0: + # 0张图:生成1张封面图(实拍图) + 2张详情图(AI生成图) + images_to_generate = ['封面图', '详情图', '详情图'] + logger.info(f"文章 {article_id} 无图片,将生成1张封面图(image_source=12)和2张详情图(image_source=13)") + elif current_image_count == 1: + # 1张图:根据现有图片类型决定生成策略 + # 查询现有图片的image_source + existing_image_sources = self.get_article_image_sources(article_id) + if 12 not in existing_image_sources: + # 缺少实拍图,生成1张封面图 + images_to_generate = ['封面图'] + logger.info(f"文章 {article_id} 缺少实拍图,将生成1张封面图(image_source=12)") + elif existing_image_sources.count(13) < 2: + # 缺少AI生成图,生成详情图补充到2张 + need_count = 2 - 
existing_image_sources.count(13) + images_to_generate = ['详情图'] * need_count + logger.info(f"文章 {article_id} 缺少AI生成图,将生成{need_count}张详情图(image_source=13)") + else: + logger.info(f"文章 {article_id} 已满足图片要求,无需生成更多图片") + return True else: - logger.error(f"文章 {article_id} 生成图片失败") - self.log_to_database('ERROR', f"文章生成图片失败", f"文章ID: {article_id}") + # 2张或以上:检查是否满足要求 + existing_image_sources = self.get_article_image_sources(article_id) + need_cover = 12 not in existing_image_sources + need_template = existing_image_sources.count(13) < 2 + + if need_cover or need_template: + if need_cover: + images_to_generate.append('封面图') + if need_template: + need_count = 2 - existing_image_sources.count(13) + images_to_generate.extend(['详情图'] * need_count) + logger.info(f"文章 {article_id} 需要补充图片: 实拍图={need_cover}, AI生成图={need_template}, 将生成{len(images_to_generate)}张图片") + else: + logger.info(f"文章 {article_id} 已满足图片要求,无需生成更多图片") + return True + + # 生成相应数量和类型的图片 + generated_count = 0 + for image_type in images_to_generate: + # 构建针对不同类型图片的生成提示词 + if image_type == '封面图': + prompt = f"为文章'{article_title}'生成封面图,要求:主题突出、视觉冲击力强、适合首页展示,标签: {', '.join(article_tags)}" + elif image_type == '详情图': + prompt = f"为文章'{article_title}'生成详情说明图,要求:内容相关、清晰易懂、辅助理解文章内容,标签: {', '.join(article_tags)}" + else: # 海报图 + prompt = f"为文章'{article_title}'生成宣传海报图,要求:吸引眼球、信息明确、适合推广传播,标签: {', '.join(article_tags)}" + + generated_image_url = self.generate_image_with_gemini(prompt, article_tags, article_id, image_type) + + if generated_image_url: + generated_count += 1 + logger.info(f"文章 {article_id} 成功生成{image_type}: {generated_image_url}") + self.log_to_database('INFO', f"文章生成{image_type}成功", + f"文章ID: {article_id}, 图片URL: {generated_image_url}, 类型: {image_type}") + else: + logger.error(f"文章 {article_id} 生成{image_type}失败") + self.log_to_database('ERROR', f"文章生成{image_type}失败", f"文章ID: {article_id}, 类型: {image_type}") + + # 检查是否所有图片都生成成功 + if generated_count == len(images_to_generate): + logger.info(f"文章 {article_id} 
共成功生成 {generated_count} 张图片,所有图片都已生成,现在调用RPA接口") + # 所有图片都生成成功后,才调用RPA接口 + if self.call_rpa_review_api([article_id]): + logger.info(f"文章 {article_id} RPA审核接口调用成功") + return True + else: + logger.error(f"文章 {article_id} RPA审核接口调用失败") + return False + elif generated_count > 0: + logger.warning(f"文章 {article_id} 只成功生成 {generated_count}/{len(images_to_generate)} 张图片,未达到要求,不调用RPA接口") + return False + else: + logger.error(f"文章 {article_id} 生成图片全部失败") return False except Exception as e: diff --git a/image_matching.py b/image_matching.py new file mode 100644 index 0000000..2eaf13a --- /dev/null +++ b/image_matching.py @@ -0,0 +1,910 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +文章与图片智能挂靠脚本 +根据文章标签匹配ai_image_tags表中的图片,使用大模型进行处理, +如果挂靠失败或没有相同标签的图片,则使用Gemini生成图片 +""" + +import json +import os +import re +import requests +import csv +import pymysql +from typing import List, Dict, Tuple, Optional +from collections import defaultdict +from database_config import db_manager +from log_config import setup_logger +import time +import random +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + + +def get_articles_with_tags_from_db() -> List[Dict]: + """ + 从数据库获取文章及其标签 + + Returns: + 包含文章信息的字典列表 + """ + # 设置日志记录器 + logger = setup_logger('article_matching', 'logs/article_matching.log', 'logs/article_matching_error.log') + + articles = [] + + try: + # 查询审核通过的文章,包含内容和标签 + sql = """ + SELECT id, title, content, coze_tag + FROM ai_articles + WHERE status = 'approved' + ORDER BY id + """ + + logger.info("开始查询审核通过的文章数据...") + results = db_manager.execute_query(sql) + + if not results: + logger.warning("没有找到状态为approved的文章") + print("没有找到状态为approved的文章") + return articles + + logger.info(f"查询到 {len(results)} 条审核通过的文章") + print(f"查询到 {len(results)} 条审核通过的文章") + + for row in results: + article_id, title, content, coze_tag = row + + # 解析标签 + tags = [] + if coze_tag: + try: + # 尝试解析JSON格式的标签 + tags_data = json.loads(coze_tag) + if isinstance(tags_data, 
list): + tags = tags_data + elif isinstance(tags_data, dict): + # 如果是字典格式,提取值 + tags = list(tags_data.values()) if isinstance(list(tags_data.values())[0], list) else list(tags_data.values()) + else: + # 如果是字符串,尝试按逗号分割 + tags = [tag.strip() for tag in str(tags_data).split(',') if tag.strip()] + except json.JSONDecodeError: + # 如果不是JSON格式,按逗号分割 + tags = [tag.strip() for tag in str(coze_tag).split(',') if tag.strip()] + + articles.append({ + 'id': article_id, + 'title': title, + 'content': content, + 'tags': tags + }) + except Exception as e: + logger.error(f"从数据库获取文章数据时发生错误: {e}", exc_info=True) + print(f"从数据库获取文章数据时发生错误: {e}") + raise + + return articles + + +def get_images_by_tags_from_db(tags: List[str] = [], used_counts: Dict[str, int] = {}) -> List[Dict]: + """ + 从数据库根据标签获取图片 + + Args: + tags: 标签列表 + used_counts: 已使用次数的字典,key为图片ID,value为使用次数 + + Returns: + 包含图片信息的字典列表 + """ + if not tags: + return [] + + # 设置日志记录器 + logger = setup_logger('article_matching', 'logs/article_matching.log', 'logs/article_matching_error.log') + + images = [] + + try: + # 查询符合条件的图像标签数据 + sql = """ + SELECT id, image_id, image_name, image_url, tag_name, keywords_name, department_name, image_attached_article_count + FROM ai_image_tags + WHERE image_attached_article_count < 5 + ORDER BY id + """ + + logger.info("开始查询符合条件的图像标签数据...") + results = db_manager.execute_query(sql) + + if not results: + logger.warning("没有找到符合条件的图像标签数据 (image_attached_article_count < 5)") + print("没有找到符合条件的图像标签数据 (image_attached_article_count < 5)") + return images + + logger.info(f"查询到 {len(results)} 条符合条件的图像标签数据") + print(f"查询到 {len(results)} 条符合条件的图像标签数据") + + for row in results: + ( + image_id, db_image_id, image_name, image_url, tag_name, + keywords_name, department_name, base_count + ) = row + + # 检查图片的附加文章数量是否小于5,考虑已使用次数 + used_count = used_counts.get(str(image_id), 0) + total_count = base_count + used_count + + if total_count >= 5: + continue + + # 检查标签是否匹配 + if any(tag.lower() in tag_name.lower() for tag 
in tags): + images.append({ + 'id': str(image_id), + 'image_id': db_image_id, + 'image_name': image_name, + 'image_url': image_url, + 'tag_name': tag_name, + 'keywords_name': keywords_name, + 'department_name': department_name, + 'base_count': base_count + }) + except Exception as e: + logger.error(f"从数据库获取图片数据时发生错误: {e}", exc_info=True) + print(f"从数据库获取图片数据时发生错误: {e}") + raise + + print(f"从数据库找到 {len(images)} 张符合条件的匹配图片") + return images + + +def call_qwen_model(article: Dict, image_urls: List[str]) -> bool: + """ + 调用通义千问大模型进行文章与图片挂靠评估 + + Args: + article: 文章信息 + image_urls: 图片URL列表 + + Returns: + 挂靠是否成功 + """ + # 通义千问API配置 + api_key = "sk-e6a38204022a4b538b8954f0584712af" + api_url = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation" + + # 构建请求内容 + content = f""" + 请评估以下文章与图片的匹配度: + + 文章标题: {article['title']} + 文章内容: {article['content'][:500]}... # 限制内容长度 + + 图片URLs: {', '.join(image_urls)} + + 请判断这些图片是否适合用于这篇文章。如果匹配度高,请回复"匹配成功";如果匹配度低,请回复"匹配失败"。 + """ + + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + payload = { + "model": "qwen-max", # 或其他合适的模型 + "input": { + "messages": [ + { + "role": "user", + "content": content + } + ] + }, + "parameters": { + "temperature": 0.7 + } + } + + try: + response = requests.post(api_url, headers=headers, json=payload) + + if response.status_code == 200: + result = response.json() + # 解析响应,判断匹配结果 + if 'output' in result and 'text' in result['output']: + response_text = result['output']['text'].lower() + # 根据响应内容判断是否匹配 + if '匹配成功' in response_text or '是的' in response_text or '合适' in response_text: + print(f"通义千问评估结果: 匹配成功 - 文章 '{article['title']}'") + return True + else: + print(f"通义千问评估结果: 匹配失败 - 文章 '{article['title']}'") + return False + else: + print(f"通义千问API响应格式异常: {result}") + return False + else: + print(f"通义千问API调用失败: {response.status_code} - {response.text}") + # API调用失败时,仍然尝试匹配,这里返回False触发图片生成 + return False + + except Exception as e: + 
print(f"调用通义千问API时发生错误: {e}") + # 发生错误时,返回False以触发图片生成 + return False + + +def insert_generated_image_to_db(image_name: str, image_url: str, article_tags: List[str]) -> Optional[Dict]: + """ + 将Gemini生成的图片信息插入数据库 + + Args: + image_name: 图片文件名,如 "1755310671174988.png" + image_url: 图片URL路径,如 "20250816/1755310671174988.png" + article_tags: 文章标签列表,用于查询department和keywords + + Returns: + 包含插入信息的字典:{ + 'tag_image_id': tag_image_id, + 'image_id': image_id, + 'image_url': image_url, + 'image_thumb_url': image_thumb_url, + 'keywords_id': keywords_id, + 'keywords_name': keywords_name, + 'department_id': department_id, + 'department_name': department_name + } + """ + connection = db_manager.get_connection() + if connection is None: + print("无法连接到数据库") + return None + + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + # 1. 根据文章标签查询ai_image_tags表,获取department和keywords信息 + if article_tags: + # 使用第一个标签查询 + query = """ + SELECT department_name, keywords_name, department_id, keywords_id, tag_id + FROM ai_image_tags + WHERE tag_name = %s + LIMIT 1 + """ + cursor.execute(query, (article_tags[0],)) + tag_info = cursor.fetchone() + + if tag_info: + department = tag_info['department_name'] + keywords = tag_info['keywords_name'] + department_id = tag_info['department_id'] + keywords_id = tag_info['keywords_id'] + tag_id = tag_info['tag_id'] + tag_name = article_tags[0] + else: + # 如果没有找到,使用默认值 + department = "AI生成" + keywords = "AI图片" + department_id = 1 + keywords_id = 1 + tag_id = 1 + tag_name = article_tags[0] if article_tags else "AI生成" + else: + # 没有标签,使用默认值 + department = "AI生成" + keywords = "AI图片" + department_id = 1 + keywords_id = 1 + tag_id = 1 + tag_name = "AI生成" + + # 2. 
插入ai_images表 + insert_image_query = """ + INSERT INTO ai_images + (image_name, image_url, image_thumb_url, department, keywords, image_type, upload_user_id, status) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + """ + cursor.execute(insert_image_query, ( + image_name, + image_url, + '', # image_thumb_url + department, + keywords, + 'medical', # image_type + 1, # upload_user_id(默认用户ID) + 'active' # status + )) + image_id = cursor.lastrowid + print(f"图片信息已插入ai_images表,image_id: {image_id}") + + # 3. 插入ai_image_tags表 + insert_tag_query = """ + INSERT INTO ai_image_tags + (image_id, image_name, image_url, image_thumb_url, tag_id, tag_name, + keywords_id, keywords_name, department_id, department_name, + image_source, created_user_id, image_attached_article_count) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + cursor.execute(insert_tag_query, ( + image_id, + image_name, + image_url, + '', # image_thumb_url + tag_id, + tag_name, + keywords_id, + keywords, + department_id, + department, + 3, # image_source: 3表示AI生成 + 1, # created_user_id + 0 # image_attached_article_count + )) + tag_image_id = cursor.lastrowid + print(f"图片标签信息已插入ai_image_tags表,tag_image_id: {tag_image_id}") + + # 提交事务 + connection.commit() + + # 返回包含所有需要信息的字典 + return { + 'tag_image_id': tag_image_id, + 'image_id': image_id, + 'image_url': image_url, + 'image_thumb_url': '', + 'keywords_id': keywords_id, + 'keywords_name': keywords, + 'department_id': department_id, + 'department_name': department + } + + except Exception as e: + print(f"插入图片信息到数据库失败: {e}") + connection.rollback() + return None + finally: + connection.close() + + +def insert_article_image_relation(article_id: int, image_id: int, image_url: str, image_thumb_url: str, + tag_image_id: int, keywords_id: int, keywords_name: str, + department_id: int, department_name: str, image_source: int = 0) -> Optional[int]: + """ + 将文章与图片的关联信息插入ai_article_images表 + + Args: + article_id: 文章ID + image_id: 图片ID(ai_images表的id) + image_url: 
图片URL + image_thumb_url: 缩略图URL + tag_image_id: 图片标签ID(ai_image_tags表的id) + keywords_id: 关键词ID + keywords_name: 关键词名称 + department_id: 部门ID + department_name: 部门名称 + image_source: 图片来源(0表示默认) + + Returns: + 插入的ai_article_images表的ID + """ + connection = db_manager.get_connection() + if connection is None: + print("无法连接到数据库") + return None + + try: + with connection.cursor(pymysql.cursors.DictCursor) as cursor: + # 1. 查询当前文章下已有图片的最大sort_order + query_max_sort = """ + SELECT COALESCE(MAX(sort_order), 0) as max_sort_order + FROM ai_article_images + WHERE article_id = %s + """ + cursor.execute(query_max_sort, (article_id,)) + result = cursor.fetchone() + max_sort_order = result['max_sort_order'] if result else 0 + new_sort_order = max_sort_order + 1 + + print(f"文章 {article_id} 当前最大sort_order: {max_sort_order}, 新图片sort_order: {new_sort_order}") + + # 2. 插入ai_article_images表 + insert_query = """ + INSERT INTO ai_article_images + (article_id, image_id, image_url, image_thumb_url, image_tag_id, sort_order, + keywords_id, keywords_name, department_id, department_name, image_source) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + cursor.execute(insert_query, ( + article_id, + image_id, + image_url, + image_thumb_url, + tag_image_id, + new_sort_order, + keywords_id, + keywords_name, + department_id, + department_name, + image_source + )) + article_image_id = cursor.lastrowid + print(f"文章图片关联信息已插入ai_article_images表,id: {article_image_id}") + + # 提交事务 + connection.commit() + + return article_image_id + + except Exception as e: + print(f"插入文章图片关联信息失败: {e}") + connection.rollback() + return None + finally: + connection.close() + + +def generate_image_with_gemini(prompt: str, article_tags: List[str], article_id: int) -> str: + """ + 使用Gemini生成图片并上传到服务器 + + Args: + prompt: 图片生成提示词 + article_tags: 文章标签列表,用于查询department和keywords + article_id: 文章ID,用于关联图片 + + Returns: + 上传后的图片URL + """ + # 导入必要的库 + try: + from google import genai + from google.genai import types + from 
google.genai.client import HttpOptions + + except ImportError: + print("错误:未安装google-genai库,请运行 'pip install google-genai' 进行安装") + raise + + client = genai.Client(http_options=HttpOptions(base_url="https://work.poloapi.com"), + api_key="sk-V4tPnDgzFPa7nxWrvKnNJsW8ZcBXXPuGmjfgvPVRnwpHoeob") + + print(f"正在调用Gemini API生成图片,提示词: {prompt[:50]}...") + + # 生成内容 + response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=[prompt], + ) + + # 检查是否有候选答案 + if not response.candidates: + raise Exception("Gemini API未返回任何候选答案") + + # 处理响应 - 遍历第一个候选答案的内容部分 + candidate = response.candidates[0] + if not candidate.content or not candidate.content.parts: + raise Exception("Gemini API返回的候选答案中没有内容部分") + + for part in candidate.content.parts: + if hasattr(part, 'text') and part.text is not None: + print(f"Gemini响应文本: {part.text}") + elif hasattr(part, 'inline_data') and part.inline_data is not None: + image_data = part.inline_data + if image_data.data is not None: + # 生成唯一的文件名(基于时间戳) + import time + import os + from datetime import datetime + + timestamp_ms = int(time.time() * 1000) # 毫秒级时间戳 + image_filename = f"{timestamp_ms}.png" + today_date = datetime.now().strftime("%Y%m%d") + image_url_path = f"{today_date}/{image_filename}" + + temp_filename = f"temp_generated_image_{timestamp_ms}.png" + # 保存图片数据到临时文件 + with open(temp_filename, 'wb') as f: + f.write(image_data.data) + print(f"Gemini生成图片成功: {temp_filename}") + + # 先将图片信息插入数据库,获取相关信息 + image_info = insert_generated_image_to_db(image_filename, image_url_path, article_tags) + + if not image_info: + raise Exception("插入图片信息到数据库失败") + + print(f"图片信息已插入数据库,tag_image_id: {image_info['tag_image_id']}, image_id: {image_info['image_id']}") + + # 使用tag_image_id上传图片到服务器 + uploaded_url = upload_image_to_server(temp_filename, image_info['tag_image_id']) + + # 将文章与图片的关联信息插入ai_article_images表 + article_image_id = insert_article_image_relation( + article_id=article_id, + image_id=image_info['image_id'], + 
image_url=image_info['image_url'], + image_thumb_url=image_info['image_thumb_url'], + tag_image_id=image_info['tag_image_id'], + keywords_id=image_info['keywords_id'], + keywords_name=image_info['keywords_name'], + department_id=image_info['department_id'], + department_name=image_info['department_name'], + image_source=0 # 默认值 + ) + + if article_image_id: + print(f"文章图片关联信息已创建,ai_article_images.id: {article_image_id}") + + # 删除临时文件 + os.remove(temp_filename) + + print(f"图片已上传到服务器: {uploaded_url}") + # 返回上传后的图片URL + return uploaded_url + + # 如果没有返回图片数据,抛出异常 + raise Exception("Gemini API未返回有效的图片数据") + + +def upload_image_to_server(image_path: str, tag_image_id: int) -> str: + """ + 上传图片到服务器 + + Args: + image_path: 本地图片路径 + tag_image_id: 图片标签ID + + Returns: + 服务器上的图片URL + """ + import requests + import json + + # 登录获取JWT token + base_url = "http://47.99.184.230:8324" # 使用外网API地址 + jwt_token = login_and_get_jwt_token(base_url) + + if not jwt_token: + raise Exception("获取JWT token失败,无法上传图片") + + # 准备上传请求 + upload_url = f"{base_url}/api/images/upload" + headers = { + 'Authorization': f'Bearer {jwt_token}', + } + + # 读取图片文件 + with open(image_path, 'rb') as image_file: + files = {'file': image_file} + data = {'tag_image_id': tag_image_id} # 添加必传参数 + + response = requests.post(upload_url, headers=headers, files=files, data=data) + + print(f"图片上传响应状态码: {response.status_code}") + print(f"图片上传响应内容: {response.text}") + + if response.status_code == 200: + result = response.json() + if result.get('code') == 200: + # 返回服务器上的图片URL + return result['data']['http_image_url'] + else: + raise Exception(f"图片上传失败: {result.get('message', '未知错误')}") + else: + raise Exception(f"图片上传请求失败,状态码: {response.status_code}, 响应: {response.text}") + + +def login_and_get_jwt_token(base_url: str) -> Optional[str]: + """ + 登录获取JWT token + """ + login_url = f"{base_url}/api/auth/login" + login_data = { + "username": "user010", # 使用固定的账号 + "password": "@5^2W6R7" + } + + print(f"尝试登录: 
{login_data['username']}") + print(f"登录URL: {login_url}") + + try: + response = requests.post(login_url, json=login_data, headers={'Content-Type': 'application/json'}) + print(f"响应状态码: {response.status_code}") + + if response.status_code == 200: + result = response.json() + if result.get('code') == 200: + jwt_token = result['data']['token'] + print("JWT token获取成功") + return jwt_token + else: + print(f"登录失败: {result.get('message', '未知错误')}") + return None + else: + print(f"登录请求失败: {response.status_code}") + return None + + except Exception as e: + print(f"登录异常: {e}") + return None + + +def batch_publish_articles(base_url: str, jwt_token: str, article_ids: List[int]) -> bool: + """ + 批量提交文章到/api/articles/batch-publish-auto接口 + """ + try: + print(f"开始批量提交 {len(article_ids)} 篇文章到batch-publish-auto接口") + + # 构建批量发布数据 + publish_data = { + "article_ids": article_ids + } + + print(f"准备批量提交的数据: {json.dumps(publish_data, ensure_ascii=False)}") + + # 发送请求 + upload_url = f"{base_url}/api/articles/batch-publish-auto" + headers = { + 'Authorization': f'Bearer {jwt_token}', + 'Content-Type': 'application/json', + 'Accept': 'application/json' + } + + response = requests.post(upload_url, json=publish_data, headers=headers) + + print(f"批量提交响应状态码: {response.status_code}") + + if response.status_code == 200: + try: + result = response.json() + print(f"批量提交响应内容: {result}") + + # 根据接口实际返回格式判断成功 + if result.get('code') == 200: + data = result.get('data', {}) + published_count = data.get('published_count', 0) + failed_count = data.get('failed_count', 0) + + success_msg = f"批量提交成功,发布: {published_count}篇,失败: {failed_count}篇" + print(success_msg) + return True + else: + print(f"批量提交失败: {result.get('message', '未知错误')}") + return False + except json.JSONDecodeError as e: + print(f"解析批量提交响应失败: {e}") + return False + elif response.status_code == 401: + # Token过期 + print("收到401错误,JWT token可能已过期") + return False + else: + print(f"批量提交请求失败,状态码: {response.status_code}") + return False + + except 
Exception as e: + print(f"批量提交异常: {e}") + return False + + +def process_single_article(article, used_image_counts, match_results): + """ + 处理单个文章与图片的匹配和挂靠 + + Args: + article: 单个文章数据 + used_image_counts: 图片使用计数 + match_results: 匹配结果列表 + + Returns: + 是否处理成功 + """ + print(f"\n处理文章: {article['title']} (ID: {article['id']})") + + # 根据文章标签获取匹配的图片(考虑已使用次数) + matched_images = get_images_by_tags_from_db(article['tags'], used_image_counts) + + if matched_images: + print(f"找到 {len(matched_images)} 张符合条件的匹配图片") + + # 按基础使用次数排序,优先使用基础计数较低的图片 + matched_images.sort(key=lambda x: x['base_count']) + + matched = False + for img in matched_images: + # 提取图片URL并添加前缀 + image_url = "http://images11.bxmkb.cn/Images/" + img['image_url'] + + if image_url: # 确保图片URL存在 + # 调用通义千问大模型进行挂靠评估 + match_success = call_qwen_model(article, [image_url]) + + if match_success: + print(f"文章与图片挂靠成功: {article['title']}") + + # 更新图片使用次数 + used_image_counts[img['id']] += 1 + + # 记录匹配结果 + match_results.append({ + '文章ID': article['id'], + '文章标题': article['title'], + '文章内容': article['content'][:100] + '...' if len(article['content']) > 100 else article['content'], # 限制内容长度 + '标签': ', '.join(article['tags']), + '匹配的图片URL': image_url, + '图片ID': img['id'], + '图片名称': img['image_name'], + '图片标签': img['tag_name'], + '图片关键词': img['keywords_name'], + '图片部门': img['department_name'], + '匹配状态': '成功' + }) + + return True + + if not matched: + print(f"文章未能与任何图片成功匹配,使用Gemini生成图片: {article['title']}") + + # 使用文章标题和标签生成提示词 + prompt = f"与'{article['title']}'相关的插图,标签: {', '.join(article['tags'])}" + generated_image_url = generate_image_with_gemini(prompt, article['tags'], article['id']) + print(f"生成的图片URL: {generated_image_url}") + + # 记录生成图片的结果 + match_results.append({ + '文章ID': article['id'], + '文章标题': article['title'], + '文章内容': article['content'][:100] + '...' 
if len(article['content']) > 100 else article['content'], + '标签': ', '.join(article['tags']), + '匹配的图片URL': generated_image_url, + '图片ID': 'N/A', + '图片名称': 'Generated', + '图片标签': 'N/A', + '图片关键词': 'N/A', + '图片部门': 'N/A', + '匹配状态': '生成图片' + }) + + return True + else: + print(f"没有找到符合条件的匹配图片,使用Gemini生成图片: {article['title']}") + + # 使用文章标题和标签生成提示词 + prompt = f"与'{article['title']}'相关的插图,标签: {', '.join(article['tags'])}" + generated_image_url = generate_image_with_gemini(prompt, article['tags'], article['id']) + print(f"生成的图片URL: {generated_image_url}") + + # 记录生成图片的结果 + match_results.append({ + '文章ID': article['id'], + '文章标题': article['title'], + '文章内容': article['content'][:100] + '...' if len(article['content']) > 100 else article['content'], + '标签': ', '.join(article['tags']), + '匹配的图片URL': generated_image_url, + '图片ID': 'N/A', + '图片名称': 'Generated', + '图片标签': 'N/A', + '图片关键词': 'N/A', + '图片部门': 'N/A', + '匹配状态': '生成图片' + }) + + return True + + +def process_article_image_matching(test_mode=False, test_count=None): + """ + 处理文章与图片的匹配和挂靠 + + Args: + test_mode: 是否为测试模式 + test_count: 测试文章数量(仅在测试模式下使用) + """ + # 用于跟踪每张图片的使用次数 + used_image_counts = defaultdict(int) + # 存储匹配结果 + match_results = [] + + try: + # 根据模式决定获取哪些文章 + articles = get_articles_with_tags_from_db() + + if not articles: + print("没有找到文章") + return + + # 如果是测试模式,只取前test_count条数据 + if test_mode: + if test_count is None: + test_count = 3 # 默认测试前3条 + articles = articles[:test_count] + print(f"测试模式:处理前 {len(articles)} 篇文章") + + success_count = 0 + generated_count = 0 + + # 收集所有处理后的文章ID用于发布 + processed_article_ids = [] + + for article in articles: + if process_single_article(article, used_image_counts, match_results): + success_count += 1 + processed_article_ids.append(article['id']) + else: + print(f"处理文章 {article['id']} 失败") + + # 将匹配结果写入CSV文件 + output_csv = 'article_image_match_results.csv' + with open(output_csv, 'w', newline='', encoding='utf-8-sig') as csvfile: + fieldnames = [ + '文章ID', '文章标题', '文章内容', 
'标签', + '匹配的图片URL', '图片ID', '图片名称', + '图片标签', '图片关键词', '图片部门', '匹配状态' + ] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + for result in match_results: + writer.writerow(result) + + if not test_mode: + print(f"\n处理完成! 成功挂靠: {success_count} 篇, 生成图片: {generated_count} 篇") + print(f"匹配结果已保存至: {output_csv}") + + # 如果有处理过的文章,将它们提交到发布接口 + if processed_article_ids: + print(f"\n开始发布处理过的 {len(processed_article_ids)} 篇文章...") + + # 登录获取JWT token + base_url = "http://47.99.184.230:8324" # 使用外网API地址 + jwt_token = login_and_get_jwt_token(base_url) + + if jwt_token: + # 批量发布文章 + if batch_publish_articles(base_url, jwt_token, processed_article_ids): + print(f"成功发布 {len(processed_article_ids)} 篇文章") + else: + print("批量发布失败") + else: + print("获取JWT token失败,无法发布文章") + else: + print("\n没有处理过的文章,跳过发布步骤") + else: + print(f"\n测试模式完成! 处理了 {len(articles)} 篇文章,成功挂靠: {success_count} 篇, 生成图片: {len([r for r in match_results if r['匹配状态'] == '生成图片'])} 篇") + print(f"处理结果已保存至: {output_csv}") + + except Exception as e: + print(f"处理文章图片匹配时发生错误: {e}") + raise + + +if __name__ == "__main__": + import sys + + print("开始处理文章与图片的智能挂靠...") + + # 检查命令行参数 + if len(sys.argv) > 1: + if sys.argv[1] == "--test" and len(sys.argv) > 2: + # 测试模式:处理前N篇文章 + test_count = int(sys.argv[2]) + print(f"启动测试模式,处理前 {test_count} 篇文章") + process_article_image_matching(test_mode=True, test_count=test_count) + elif sys.argv[1] == "--test" and len(sys.argv) == 2: + # 提示用户输入要测试的文章数量 + test_count_input = input("请输入要测试的文章数量 (默认3): ") + test_count = int(test_count_input) if test_count_input.strip().isdigit() else 3 + print(f"启动测试模式,处理前 {test_count} 篇文章") + process_article_image_matching(test_mode=True, test_count=test_count) + else: + print("使用方法:") + print(" 正常模式: python match_article_images.py") + print(" 测试模式: python match_article_images.py --test [文章ID]") + else: + # 正常模式:处理所有文章 + process_article_image_matching() \ No newline at end of file diff --git a/match_article_images.py 
b/match_article_images.py index 2eaf13a..e69de29 100644 --- a/match_article_images.py +++ b/match_article_images.py @@ -1,910 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -文章与图片智能挂靠脚本 -根据文章标签匹配ai_image_tags表中的图片,使用大模型进行处理, -如果挂靠失败或没有相同标签的图片,则使用Gemini生成图片 -""" - -import json -import os -import re -import requests -import csv -import pymysql -from typing import List, Dict, Tuple, Optional -from collections import defaultdict -from database_config import db_manager -from log_config import setup_logger -import time -import random -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - - -def get_articles_with_tags_from_db() -> List[Dict]: - """ - 从数据库获取文章及其标签 - - Returns: - 包含文章信息的字典列表 - """ - # 设置日志记录器 - logger = setup_logger('article_matching', 'logs/article_matching.log', 'logs/article_matching_error.log') - - articles = [] - - try: - # 查询审核通过的文章,包含内容和标签 - sql = """ - SELECT id, title, content, coze_tag - FROM ai_articles - WHERE status = 'approved' - ORDER BY id - """ - - logger.info("开始查询审核通过的文章数据...") - results = db_manager.execute_query(sql) - - if not results: - logger.warning("没有找到状态为approved的文章") - print("没有找到状态为approved的文章") - return articles - - logger.info(f"查询到 {len(results)} 条审核通过的文章") - print(f"查询到 {len(results)} 条审核通过的文章") - - for row in results: - article_id, title, content, coze_tag = row - - # 解析标签 - tags = [] - if coze_tag: - try: - # 尝试解析JSON格式的标签 - tags_data = json.loads(coze_tag) - if isinstance(tags_data, list): - tags = tags_data - elif isinstance(tags_data, dict): - # 如果是字典格式,提取值 - tags = list(tags_data.values()) if isinstance(list(tags_data.values())[0], list) else list(tags_data.values()) - else: - # 如果是字符串,尝试按逗号分割 - tags = [tag.strip() for tag in str(tags_data).split(',') if tag.strip()] - except json.JSONDecodeError: - # 如果不是JSON格式,按逗号分割 - tags = [tag.strip() for tag in str(coze_tag).split(',') if tag.strip()] - - articles.append({ - 'id': article_id, - 'title': title, - 'content': content, - 
'tags': tags - }) - except Exception as e: - logger.error(f"从数据库获取文章数据时发生错误: {e}", exc_info=True) - print(f"从数据库获取文章数据时发生错误: {e}") - raise - - return articles - - -def get_images_by_tags_from_db(tags: List[str] = [], used_counts: Dict[str, int] = {}) -> List[Dict]: - """ - 从数据库根据标签获取图片 - - Args: - tags: 标签列表 - used_counts: 已使用次数的字典,key为图片ID,value为使用次数 - - Returns: - 包含图片信息的字典列表 - """ - if not tags: - return [] - - # 设置日志记录器 - logger = setup_logger('article_matching', 'logs/article_matching.log', 'logs/article_matching_error.log') - - images = [] - - try: - # 查询符合条件的图像标签数据 - sql = """ - SELECT id, image_id, image_name, image_url, tag_name, keywords_name, department_name, image_attached_article_count - FROM ai_image_tags - WHERE image_attached_article_count < 5 - ORDER BY id - """ - - logger.info("开始查询符合条件的图像标签数据...") - results = db_manager.execute_query(sql) - - if not results: - logger.warning("没有找到符合条件的图像标签数据 (image_attached_article_count < 5)") - print("没有找到符合条件的图像标签数据 (image_attached_article_count < 5)") - return images - - logger.info(f"查询到 {len(results)} 条符合条件的图像标签数据") - print(f"查询到 {len(results)} 条符合条件的图像标签数据") - - for row in results: - ( - image_id, db_image_id, image_name, image_url, tag_name, - keywords_name, department_name, base_count - ) = row - - # 检查图片的附加文章数量是否小于5,考虑已使用次数 - used_count = used_counts.get(str(image_id), 0) - total_count = base_count + used_count - - if total_count >= 5: - continue - - # 检查标签是否匹配 - if any(tag.lower() in tag_name.lower() for tag in tags): - images.append({ - 'id': str(image_id), - 'image_id': db_image_id, - 'image_name': image_name, - 'image_url': image_url, - 'tag_name': tag_name, - 'keywords_name': keywords_name, - 'department_name': department_name, - 'base_count': base_count - }) - except Exception as e: - logger.error(f"从数据库获取图片数据时发生错误: {e}", exc_info=True) - print(f"从数据库获取图片数据时发生错误: {e}") - raise - - print(f"从数据库找到 {len(images)} 张符合条件的匹配图片") - return images - - -def call_qwen_model(article: Dict, image_urls: 
List[str]) -> bool: - """ - 调用通义千问大模型进行文章与图片挂靠评估 - - Args: - article: 文章信息 - image_urls: 图片URL列表 - - Returns: - 挂靠是否成功 - """ - # 通义千问API配置 - api_key = "sk-e6a38204022a4b538b8954f0584712af" - api_url = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation" - - # 构建请求内容 - content = f""" - 请评估以下文章与图片的匹配度: - - 文章标题: {article['title']} - 文章内容: {article['content'][:500]}... # 限制内容长度 - - 图片URLs: {', '.join(image_urls)} - - 请判断这些图片是否适合用于这篇文章。如果匹配度高,请回复"匹配成功";如果匹配度低,请回复"匹配失败"。 - """ - - headers = { - 'Authorization': f'Bearer {api_key}', - 'Content-Type': 'application/json' - } - - payload = { - "model": "qwen-max", # 或其他合适的模型 - "input": { - "messages": [ - { - "role": "user", - "content": content - } - ] - }, - "parameters": { - "temperature": 0.7 - } - } - - try: - response = requests.post(api_url, headers=headers, json=payload) - - if response.status_code == 200: - result = response.json() - # 解析响应,判断匹配结果 - if 'output' in result and 'text' in result['output']: - response_text = result['output']['text'].lower() - # 根据响应内容判断是否匹配 - if '匹配成功' in response_text or '是的' in response_text or '合适' in response_text: - print(f"通义千问评估结果: 匹配成功 - 文章 '{article['title']}'") - return True - else: - print(f"通义千问评估结果: 匹配失败 - 文章 '{article['title']}'") - return False - else: - print(f"通义千问API响应格式异常: {result}") - return False - else: - print(f"通义千问API调用失败: {response.status_code} - {response.text}") - # API调用失败时,仍然尝试匹配,这里返回False触发图片生成 - return False - - except Exception as e: - print(f"调用通义千问API时发生错误: {e}") - # 发生错误时,返回False以触发图片生成 - return False - - -def insert_generated_image_to_db(image_name: str, image_url: str, article_tags: List[str]) -> Optional[Dict]: - """ - 将Gemini生成的图片信息插入数据库 - - Args: - image_name: 图片文件名,如 "1755310671174988.png" - image_url: 图片URL路径,如 "20250816/1755310671174988.png" - article_tags: 文章标签列表,用于查询department和keywords - - Returns: - 包含插入信息的字典:{ - 'tag_image_id': tag_image_id, - 'image_id': image_id, - 'image_url': image_url, - 'image_thumb_url': 
image_thumb_url, - 'keywords_id': keywords_id, - 'keywords_name': keywords_name, - 'department_id': department_id, - 'department_name': department_name - } - """ - connection = db_manager.get_connection() - if connection is None: - print("无法连接到数据库") - return None - - try: - with connection.cursor(pymysql.cursors.DictCursor) as cursor: - # 1. 根据文章标签查询ai_image_tags表,获取department和keywords信息 - if article_tags: - # 使用第一个标签查询 - query = """ - SELECT department_name, keywords_name, department_id, keywords_id, tag_id - FROM ai_image_tags - WHERE tag_name = %s - LIMIT 1 - """ - cursor.execute(query, (article_tags[0],)) - tag_info = cursor.fetchone() - - if tag_info: - department = tag_info['department_name'] - keywords = tag_info['keywords_name'] - department_id = tag_info['department_id'] - keywords_id = tag_info['keywords_id'] - tag_id = tag_info['tag_id'] - tag_name = article_tags[0] - else: - # 如果没有找到,使用默认值 - department = "AI生成" - keywords = "AI图片" - department_id = 1 - keywords_id = 1 - tag_id = 1 - tag_name = article_tags[0] if article_tags else "AI生成" - else: - # 没有标签,使用默认值 - department = "AI生成" - keywords = "AI图片" - department_id = 1 - keywords_id = 1 - tag_id = 1 - tag_name = "AI生成" - - # 2. 插入ai_images表 - insert_image_query = """ - INSERT INTO ai_images - (image_name, image_url, image_thumb_url, department, keywords, image_type, upload_user_id, status) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s) - """ - cursor.execute(insert_image_query, ( - image_name, - image_url, - '', # image_thumb_url - department, - keywords, - 'medical', # image_type - 1, # upload_user_id(默认用户ID) - 'active' # status - )) - image_id = cursor.lastrowid - print(f"图片信息已插入ai_images表,image_id: {image_id}") - - # 3. 
插入ai_image_tags表 - insert_tag_query = """ - INSERT INTO ai_image_tags - (image_id, image_name, image_url, image_thumb_url, tag_id, tag_name, - keywords_id, keywords_name, department_id, department_name, - image_source, created_user_id, image_attached_article_count) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - """ - cursor.execute(insert_tag_query, ( - image_id, - image_name, - image_url, - '', # image_thumb_url - tag_id, - tag_name, - keywords_id, - keywords, - department_id, - department, - 3, # image_source: 3表示AI生成 - 1, # created_user_id - 0 # image_attached_article_count - )) - tag_image_id = cursor.lastrowid - print(f"图片标签信息已插入ai_image_tags表,tag_image_id: {tag_image_id}") - - # 提交事务 - connection.commit() - - # 返回包含所有需要信息的字典 - return { - 'tag_image_id': tag_image_id, - 'image_id': image_id, - 'image_url': image_url, - 'image_thumb_url': '', - 'keywords_id': keywords_id, - 'keywords_name': keywords, - 'department_id': department_id, - 'department_name': department - } - - except Exception as e: - print(f"插入图片信息到数据库失败: {e}") - connection.rollback() - return None - finally: - connection.close() - - -def insert_article_image_relation(article_id: int, image_id: int, image_url: str, image_thumb_url: str, - tag_image_id: int, keywords_id: int, keywords_name: str, - department_id: int, department_name: str, image_source: int = 0) -> Optional[int]: - """ - 将文章与图片的关联信息插入ai_article_images表 - - Args: - article_id: 文章ID - image_id: 图片ID(ai_images表的id) - image_url: 图片URL - image_thumb_url: 缩略图URL - tag_image_id: 图片标签ID(ai_image_tags表的id) - keywords_id: 关键词ID - keywords_name: 关键词名称 - department_id: 部门ID - department_name: 部门名称 - image_source: 图片来源(0表示默认) - - Returns: - 插入的ai_article_images表的ID - """ - connection = db_manager.get_connection() - if connection is None: - print("无法连接到数据库") - return None - - try: - with connection.cursor(pymysql.cursors.DictCursor) as cursor: - # 1. 
查询当前文章下已有图片的最大sort_order - query_max_sort = """ - SELECT COALESCE(MAX(sort_order), 0) as max_sort_order - FROM ai_article_images - WHERE article_id = %s - """ - cursor.execute(query_max_sort, (article_id,)) - result = cursor.fetchone() - max_sort_order = result['max_sort_order'] if result else 0 - new_sort_order = max_sort_order + 1 - - print(f"文章 {article_id} 当前最大sort_order: {max_sort_order}, 新图片sort_order: {new_sort_order}") - - # 2. 插入ai_article_images表 - insert_query = """ - INSERT INTO ai_article_images - (article_id, image_id, image_url, image_thumb_url, image_tag_id, sort_order, - keywords_id, keywords_name, department_id, department_name, image_source) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - """ - cursor.execute(insert_query, ( - article_id, - image_id, - image_url, - image_thumb_url, - tag_image_id, - new_sort_order, - keywords_id, - keywords_name, - department_id, - department_name, - image_source - )) - article_image_id = cursor.lastrowid - print(f"文章图片关联信息已插入ai_article_images表,id: {article_image_id}") - - # 提交事务 - connection.commit() - - return article_image_id - - except Exception as e: - print(f"插入文章图片关联信息失败: {e}") - connection.rollback() - return None - finally: - connection.close() - - -def generate_image_with_gemini(prompt: str, article_tags: List[str], article_id: int) -> str: - """ - 使用Gemini生成图片并上传到服务器 - - Args: - prompt: 图片生成提示词 - article_tags: 文章标签列表,用于查询department和keywords - article_id: 文章ID,用于关联图片 - - Returns: - 上传后的图片URL - """ - # 导入必要的库 - try: - from google import genai - from google.genai import types - from google.genai.client import HttpOptions - - except ImportError: - print("错误:未安装google-genai库,请运行 'pip install google-genai' 进行安装") - raise - - client = genai.Client(http_options=HttpOptions(base_url="https://work.poloapi.com"), - api_key="sk-V4tPnDgzFPa7nxWrvKnNJsW8ZcBXXPuGmjfgvPVRnwpHoeob") - - print(f"正在调用Gemini API生成图片,提示词: {prompt[:50]}...") - - # 生成内容 - response = client.models.generate_content( - 
model="gemini-3-pro-image-preview", - contents=[prompt], - ) - - # 检查是否有候选答案 - if not response.candidates: - raise Exception("Gemini API未返回任何候选答案") - - # 处理响应 - 遍历第一个候选答案的内容部分 - candidate = response.candidates[0] - if not candidate.content or not candidate.content.parts: - raise Exception("Gemini API返回的候选答案中没有内容部分") - - for part in candidate.content.parts: - if hasattr(part, 'text') and part.text is not None: - print(f"Gemini响应文本: {part.text}") - elif hasattr(part, 'inline_data') and part.inline_data is not None: - image_data = part.inline_data - if image_data.data is not None: - # 生成唯一的文件名(基于时间戳) - import time - import os - from datetime import datetime - - timestamp_ms = int(time.time() * 1000) # 毫秒级时间戳 - image_filename = f"{timestamp_ms}.png" - today_date = datetime.now().strftime("%Y%m%d") - image_url_path = f"{today_date}/{image_filename}" - - temp_filename = f"temp_generated_image_{timestamp_ms}.png" - # 保存图片数据到临时文件 - with open(temp_filename, 'wb') as f: - f.write(image_data.data) - print(f"Gemini生成图片成功: {temp_filename}") - - # 先将图片信息插入数据库,获取相关信息 - image_info = insert_generated_image_to_db(image_filename, image_url_path, article_tags) - - if not image_info: - raise Exception("插入图片信息到数据库失败") - - print(f"图片信息已插入数据库,tag_image_id: {image_info['tag_image_id']}, image_id: {image_info['image_id']}") - - # 使用tag_image_id上传图片到服务器 - uploaded_url = upload_image_to_server(temp_filename, image_info['tag_image_id']) - - # 将文章与图片的关联信息插入ai_article_images表 - article_image_id = insert_article_image_relation( - article_id=article_id, - image_id=image_info['image_id'], - image_url=image_info['image_url'], - image_thumb_url=image_info['image_thumb_url'], - tag_image_id=image_info['tag_image_id'], - keywords_id=image_info['keywords_id'], - keywords_name=image_info['keywords_name'], - department_id=image_info['department_id'], - department_name=image_info['department_name'], - image_source=0 # 默认值 - ) - - if article_image_id: - print(f"文章图片关联信息已创建,ai_article_images.id: 
{article_image_id}") - - # 删除临时文件 - os.remove(temp_filename) - - print(f"图片已上传到服务器: {uploaded_url}") - # 返回上传后的图片URL - return uploaded_url - - # 如果没有返回图片数据,抛出异常 - raise Exception("Gemini API未返回有效的图片数据") - - -def upload_image_to_server(image_path: str, tag_image_id: int) -> str: - """ - 上传图片到服务器 - - Args: - image_path: 本地图片路径 - tag_image_id: 图片标签ID - - Returns: - 服务器上的图片URL - """ - import requests - import json - - # 登录获取JWT token - base_url = "http://47.99.184.230:8324" # 使用外网API地址 - jwt_token = login_and_get_jwt_token(base_url) - - if not jwt_token: - raise Exception("获取JWT token失败,无法上传图片") - - # 准备上传请求 - upload_url = f"{base_url}/api/images/upload" - headers = { - 'Authorization': f'Bearer {jwt_token}', - } - - # 读取图片文件 - with open(image_path, 'rb') as image_file: - files = {'file': image_file} - data = {'tag_image_id': tag_image_id} # 添加必传参数 - - response = requests.post(upload_url, headers=headers, files=files, data=data) - - print(f"图片上传响应状态码: {response.status_code}") - print(f"图片上传响应内容: {response.text}") - - if response.status_code == 200: - result = response.json() - if result.get('code') == 200: - # 返回服务器上的图片URL - return result['data']['http_image_url'] - else: - raise Exception(f"图片上传失败: {result.get('message', '未知错误')}") - else: - raise Exception(f"图片上传请求失败,状态码: {response.status_code}, 响应: {response.text}") - - -def login_and_get_jwt_token(base_url: str) -> Optional[str]: - """ - 登录获取JWT token - """ - login_url = f"{base_url}/api/auth/login" - login_data = { - "username": "user010", # 使用固定的账号 - "password": "@5^2W6R7" - } - - print(f"尝试登录: {login_data['username']}") - print(f"登录URL: {login_url}") - - try: - response = requests.post(login_url, json=login_data, headers={'Content-Type': 'application/json'}) - print(f"响应状态码: {response.status_code}") - - if response.status_code == 200: - result = response.json() - if result.get('code') == 200: - jwt_token = result['data']['token'] - print("JWT token获取成功") - return jwt_token - else: - print(f"登录失败: 
{result.get('message', '未知错误')}") - return None - else: - print(f"登录请求失败: {response.status_code}") - return None - - except Exception as e: - print(f"登录异常: {e}") - return None - - -def batch_publish_articles(base_url: str, jwt_token: str, article_ids: List[int]) -> bool: - """ - 批量提交文章到/api/articles/batch-publish-auto接口 - """ - try: - print(f"开始批量提交 {len(article_ids)} 篇文章到batch-publish-auto接口") - - # 构建批量发布数据 - publish_data = { - "article_ids": article_ids - } - - print(f"准备批量提交的数据: {json.dumps(publish_data, ensure_ascii=False)}") - - # 发送请求 - upload_url = f"{base_url}/api/articles/batch-publish-auto" - headers = { - 'Authorization': f'Bearer {jwt_token}', - 'Content-Type': 'application/json', - 'Accept': 'application/json' - } - - response = requests.post(upload_url, json=publish_data, headers=headers) - - print(f"批量提交响应状态码: {response.status_code}") - - if response.status_code == 200: - try: - result = response.json() - print(f"批量提交响应内容: {result}") - - # 根据接口实际返回格式判断成功 - if result.get('code') == 200: - data = result.get('data', {}) - published_count = data.get('published_count', 0) - failed_count = data.get('failed_count', 0) - - success_msg = f"批量提交成功,发布: {published_count}篇,失败: {failed_count}篇" - print(success_msg) - return True - else: - print(f"批量提交失败: {result.get('message', '未知错误')}") - return False - except json.JSONDecodeError as e: - print(f"解析批量提交响应失败: {e}") - return False - elif response.status_code == 401: - # Token过期 - print("收到401错误,JWT token可能已过期") - return False - else: - print(f"批量提交请求失败,状态码: {response.status_code}") - return False - - except Exception as e: - print(f"批量提交异常: {e}") - return False - - -def process_single_article(article, used_image_counts, match_results): - """ - 处理单个文章与图片的匹配和挂靠 - - Args: - article: 单个文章数据 - used_image_counts: 图片使用计数 - match_results: 匹配结果列表 - - Returns: - 是否处理成功 - """ - print(f"\n处理文章: {article['title']} (ID: {article['id']})") - - # 根据文章标签获取匹配的图片(考虑已使用次数) - matched_images = 
get_images_by_tags_from_db(article['tags'], used_image_counts) - - if matched_images: - print(f"找到 {len(matched_images)} 张符合条件的匹配图片") - - # 按基础使用次数排序,优先使用基础计数较低的图片 - matched_images.sort(key=lambda x: x['base_count']) - - matched = False - for img in matched_images: - # 提取图片URL并添加前缀 - image_url = "http://images11.bxmkb.cn/Images/" + img['image_url'] - - if image_url: # 确保图片URL存在 - # 调用通义千问大模型进行挂靠评估 - match_success = call_qwen_model(article, [image_url]) - - if match_success: - print(f"文章与图片挂靠成功: {article['title']}") - - # 更新图片使用次数 - used_image_counts[img['id']] += 1 - - # 记录匹配结果 - match_results.append({ - '文章ID': article['id'], - '文章标题': article['title'], - '文章内容': article['content'][:100] + '...' if len(article['content']) > 100 else article['content'], # 限制内容长度 - '标签': ', '.join(article['tags']), - '匹配的图片URL': image_url, - '图片ID': img['id'], - '图片名称': img['image_name'], - '图片标签': img['tag_name'], - '图片关键词': img['keywords_name'], - '图片部门': img['department_name'], - '匹配状态': '成功' - }) - - return True - - if not matched: - print(f"文章未能与任何图片成功匹配,使用Gemini生成图片: {article['title']}") - - # 使用文章标题和标签生成提示词 - prompt = f"与'{article['title']}'相关的插图,标签: {', '.join(article['tags'])}" - generated_image_url = generate_image_with_gemini(prompt, article['tags'], article['id']) - print(f"生成的图片URL: {generated_image_url}") - - # 记录生成图片的结果 - match_results.append({ - '文章ID': article['id'], - '文章标题': article['title'], - '文章内容': article['content'][:100] + '...' 
if len(article['content']) > 100 else article['content'], - '标签': ', '.join(article['tags']), - '匹配的图片URL': generated_image_url, - '图片ID': 'N/A', - '图片名称': 'Generated', - '图片标签': 'N/A', - '图片关键词': 'N/A', - '图片部门': 'N/A', - '匹配状态': '生成图片' - }) - - return True - else: - print(f"没有找到符合条件的匹配图片,使用Gemini生成图片: {article['title']}") - - # 使用文章标题和标签生成提示词 - prompt = f"与'{article['title']}'相关的插图,标签: {', '.join(article['tags'])}" - generated_image_url = generate_image_with_gemini(prompt, article['tags'], article['id']) - print(f"生成的图片URL: {generated_image_url}") - - # 记录生成图片的结果 - match_results.append({ - '文章ID': article['id'], - '文章标题': article['title'], - '文章内容': article['content'][:100] + '...' if len(article['content']) > 100 else article['content'], - '标签': ', '.join(article['tags']), - '匹配的图片URL': generated_image_url, - '图片ID': 'N/A', - '图片名称': 'Generated', - '图片标签': 'N/A', - '图片关键词': 'N/A', - '图片部门': 'N/A', - '匹配状态': '生成图片' - }) - - return True - - -def process_article_image_matching(test_mode=False, test_count=None): - """ - 处理文章与图片的匹配和挂靠 - - Args: - test_mode: 是否为测试模式 - test_count: 测试文章数量(仅在测试模式下使用) - """ - # 用于跟踪每张图片的使用次数 - used_image_counts = defaultdict(int) - # 存储匹配结果 - match_results = [] - - try: - # 根据模式决定获取哪些文章 - articles = get_articles_with_tags_from_db() - - if not articles: - print("没有找到文章") - return - - # 如果是测试模式,只取前test_count条数据 - if test_mode: - if test_count is None: - test_count = 3 # 默认测试前3条 - articles = articles[:test_count] - print(f"测试模式:处理前 {len(articles)} 篇文章") - - success_count = 0 - generated_count = 0 - - # 收集所有处理后的文章ID用于发布 - processed_article_ids = [] - - for article in articles: - if process_single_article(article, used_image_counts, match_results): - success_count += 1 - processed_article_ids.append(article['id']) - else: - print(f"处理文章 {article['id']} 失败") - - # 将匹配结果写入CSV文件 - output_csv = 'article_image_match_results.csv' - with open(output_csv, 'w', newline='', encoding='utf-8-sig') as csvfile: - fieldnames = [ - '文章ID', '文章标题', '文章内容', 
'标签', - '匹配的图片URL', '图片ID', '图片名称', - '图片标签', '图片关键词', '图片部门', '匹配状态' - ] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - - writer.writeheader() - for result in match_results: - writer.writerow(result) - - if not test_mode: - print(f"\n处理完成! 成功挂靠: {success_count} 篇, 生成图片: {generated_count} 篇") - print(f"匹配结果已保存至: {output_csv}") - - # 如果有处理过的文章,将它们提交到发布接口 - if processed_article_ids: - print(f"\n开始发布处理过的 {len(processed_article_ids)} 篇文章...") - - # 登录获取JWT token - base_url = "http://47.99.184.230:8324" # 使用外网API地址 - jwt_token = login_and_get_jwt_token(base_url) - - if jwt_token: - # 批量发布文章 - if batch_publish_articles(base_url, jwt_token, processed_article_ids): - print(f"成功发布 {len(processed_article_ids)} 篇文章") - else: - print("批量发布失败") - else: - print("获取JWT token失败,无法发布文章") - else: - print("\n没有处理过的文章,跳过发布步骤") - else: - print(f"\n测试模式完成! 处理了 {len(articles)} 篇文章,成功挂靠: {success_count} 篇, 生成图片: {len([r for r in match_results if r['匹配状态'] == '生成图片'])} 篇") - print(f"处理结果已保存至: {output_csv}") - - except Exception as e: - print(f"处理文章图片匹配时发生错误: {e}") - raise - - -if __name__ == "__main__": - import sys - - print("开始处理文章与图片的智能挂靠...") - - # 检查命令行参数 - if len(sys.argv) > 1: - if sys.argv[1] == "--test" and len(sys.argv) > 2: - # 测试模式:处理前N篇文章 - test_count = int(sys.argv[2]) - print(f"启动测试模式,处理前 {test_count} 篇文章") - process_article_image_matching(test_mode=True, test_count=test_count) - elif sys.argv[1] == "--test" and len(sys.argv) == 2: - # 提示用户输入要测试的文章数量 - test_count_input = input("请输入要测试的文章数量 (默认3): ") - test_count = int(test_count_input) if test_count_input.strip().isdigit() else 3 - print(f"启动测试模式,处理前 {test_count} 篇文章") - process_article_image_matching(test_mode=True, test_count=test_count) - else: - print("使用方法:") - print(" 正常模式: python match_article_images.py") - print(" 测试模式: python match_article_images.py --test [文章ID]") - else: - # 正常模式:处理所有文章 - process_article_image_matching() \ No newline at end of file diff --git a/setup_env.sh b/setup_env.sh index 
#!/bin/bash

# Launcher for the article/image smart-matching job.
# Purpose: activate the Python environment, run article_auto_image_matching.py
# and mirror everything it prints into a timestamped log file.

# Resolve the directory this script lives in and work from there.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Abort instead of running from an unexpected directory if cd fails.
cd "$SCRIPT_DIR" || exit 1

# Log file location (one file per launch).
LOG_DIR="$SCRIPT_DIR/logs"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_FILE="$LOG_DIR/start_${TIMESTAMP}.log"

# Make sure the log directory exists before the first tee.
mkdir -p "$LOG_DIR"

echo "========================================" | tee -a "$LOG_FILE"
echo "启动时间: $(date '+%Y-%m-%d %H:%M:%S')" | tee -a "$LOG_FILE"
echo "工作目录: $SCRIPT_DIR" | tee -a "$LOG_FILE"
echo "========================================" | tee -a "$LOG_FILE"

# Activate the Python virtual environment when it is present;
# otherwise fall back to whatever `python` is on PATH.
VENV_PATH="/home/work/keyword_crawl/venv"
if [ -d "$VENV_PATH" ]; then
    echo "[信息] 检测到虚拟环境: $VENV_PATH" | tee -a "$LOG_FILE"
    source "$VENV_PATH/bin/activate"
    echo "[信息] 虚拟环境已激活" | tee -a "$LOG_FILE"
else
    echo "[警告] 未找到虚拟环境,使用系统Python" | tee -a "$LOG_FILE"
fi

# Record which interpreter is going to be used.
echo "[信息] Python版本: $(python --version 2>&1)" | tee -a "$LOG_FILE"
echo "[信息] Python路径: $(which python)" | tee -a "$LOG_FILE"

# The main script must exist next to this launcher.
MAIN_SCRIPT="$SCRIPT_DIR/article_auto_image_matching.py"
if [ ! -f "$MAIN_SCRIPT" ]; then
    echo "[错误] 未找到主脚本: $MAIN_SCRIPT" | tee -a "$LOG_FILE"
    exit 1
fi

echo "[信息] 主脚本: $MAIN_SCRIPT" | tee -a "$LOG_FILE"

# Warn (but do not fail) when no Chinese font is installed.
echo "[信息] 检查中文字体..." | tee -a "$LOG_FILE"
if command -v fc-list &> /dev/null; then
    FONT_COUNT=$(fc-list :lang=zh 2>/dev/null | wc -l)
    if [ "$FONT_COUNT" -gt 0 ]; then
        echo "[信息] 检测到 $FONT_COUNT 个中文字体" | tee -a "$LOG_FILE"
    else
        echo "[警告] 未检测到中文字体,压字花功能可能无法使用" | tee -a "$LOG_FILE"
        echo "[提示] Ubuntu/Debian: sudo apt-get install fonts-wqy-zenhei" | tee -a "$LOG_FILE"
        echo "[提示] CentOS/RHEL: sudo yum install wqy-zenhei-fonts" | tee -a "$LOG_FILE"
    fi
else
    echo "[警告] 无法检测字体(fc-list命令不存在)" | tee -a "$LOG_FILE"
fi

echo "========================================" | tee -a "$LOG_FILE"
echo "[启动] 正在启动文章图片智能匹配服务..." | tee -a "$LOG_FILE"
echo "========================================" | tee -a "$LOG_FILE"

# Run the main script, duplicating its output into the log file.
python "$MAIN_SCRIPT" 2>&1 | tee -a "$LOG_FILE"

# Capture the exit status of python itself. `$?` would report the status of
# the last command in the pipeline (tee), which is 0 even when python failed.
EXIT_CODE=${PIPESTATUS[0]}

echo "========================================" | tee -a "$LOG_FILE"
echo "结束时间: $(date '+%Y-%m-%d %H:%M:%S')" | tee -a "$LOG_FILE"
echo "退出状态码: $EXIT_CODE" | tee -a "$LOG_FILE"
echo "========================================" | tee -a "$LOG_FILE"

# Propagate the main script's status to the caller.
exit "$EXIT_CODE"
def get_article_contents_by_ids(self, article_ids: List[int]) -> str:
    """Fetch and merge the ``content`` of at most two articles.

    Args:
        article_ids: Candidate article IDs in the caller's order of
            preference. Duplicates are dropped and only the first two
            distinct IDs are queried.

    Returns:
        The non-empty contents joined by one blank line, or an empty string
        when no IDs were given, nothing matched, or the query failed.
    """
    if not article_ids:
        logger.info("没有文章ID,返回空字符串")
        return ""

    try:
        # De-duplicate while keeping the caller's order. set() has no defined
        # order, so truncating it to two could drop the best-ranked ID.
        article_ids = list(dict.fromkeys(article_ids))[:2]

        logger.info(f"开始查询文章content,article_ids: {article_ids}")

        with self.db_manager.get_cursor() as cursor:
            # One %s placeholder per ID; the values themselves are bound by
            # the driver, never interpolated into the SQL text.
            placeholders = ','.join(['%s'] * len(article_ids))
            sql = f"""
                SELECT id, content, created_at
                FROM ai_articles
                WHERE id IN ({placeholders})
                AND content IS NOT NULL AND content != ''
                ORDER BY created_at DESC
                LIMIT 2
            """

            cursor.execute(sql, article_ids)
            results = cursor.fetchall()

            if not results:
                logger.warning(f"未查询到文章content,article_ids: {article_ids}")
                return ""

            logger.info(f"查询到 {len(results)} 条文章content")

            contents = []
            for row in results:
                # `or ''` keeps a NULL/None content from crashing .strip().
                content = (row.get('content') or '').strip()
                if content:
                    contents.append(content)
                    logger.info(f"添加文章content,ID: {row.get('id')}, 长度: {len(content)} 字符")

            # Articles are separated by a blank line.
            merged_content = "\n\n".join(contents)
            logger.info(f"合并后的content总长度: {len(merged_content)} 字符")

            return merged_content

    except Exception as e:
        # Best effort: context injection is optional, so report and fall back
        # to an empty context rather than failing the whole article.
        error_msg = f"查询文章content异常: {e}"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, traceback.format_exc())
        return ""
def login_and_get_jwt_token(self) -> bool:
    """Log in to the backend and cache the JWT on ``self.jwt_token``.

    Returns:
        True when the backend answered HTTP 200 with ``code == 200`` and a
        token was stored; False on any rejection, HTTP error or exception.
    """
    try:
        login_url = f"{self.base_url}/api/auth/login"
        # Single source of truth: reuse the credentials configured in
        # __init__ instead of repeating the literals here.
        login_data = dict(self.login_credentials)

        logger.info(f"尝试登录: {login_data['username']}")
        self.log_to_database('INFO', f"尝试登录用户: {login_data['username']}")

        response = requests.post(
            login_url,
            json=login_data,
            headers={'Content-Type': 'application/json'},
            # requests waits forever without an explicit timeout, which
            # would hang startup and the 401 re-login path.
            timeout=30,
            proxies=self.proxies  # proxies disabled
        )

        if response.status_code != 200:
            error_msg = f"登录请求失败: {response.status_code}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, response.text)
            return False

        result = response.json()
        if result.get('code') != 200:
            error_msg = f"登录失败: {result.get('message', '未知错误')}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, json.dumps(result))
            return False

        self.jwt_token = result['data']['token']
        logger.info("JWT token获取成功")
        # The response payload contains the token itself, so it is
        # deliberately not forwarded to the log sink.
        self.log_to_database('INFO', "JWT token获取成功")
        return True

    except Exception as e:
        # Covers network failures, non-JSON bodies and a missing
        # data/token key alike; the caller only needs success/failure.
        error_msg = f"登录异常: {e}"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, traceback.format_exc())
        return False
def generate_article_from_coze(self, title: str, context_injection: str, workflow_id: str, auth_token: str) -> Optional[Dict]:
    """Run the Coze workflow for ``title`` and return the parsed article.

    Args:
        title: Topic the article is generated for.
        context_injection: Reference text sent to the workflow as the
            ``context_injection`` parameter; None is treated as "".
        workflow_id: Coze workflow to run.
        auth_token: Bearer token for the Coze API.

    Returns:
        Whatever ``parse_stream_response`` returns for the streamed reply,
        or None when a credential is missing, the API answers with a
        non-OK status, or any exception occurs.
    """
    try:
        # Treat a missing context as empty so len() below cannot raise.
        context_injection = context_injection or ""

        logger.info(f"开始为主题'{title}'生成文章...")
        logger.info(f"上下文注入内容长度: {len(context_injection)} 字符")
        self.log_to_database('INFO', f"开始为主题生成文章: {title}", f"context_injection长度: {len(context_injection)}")

        # Both credentials are required by the request below.
        if not auth_token or not workflow_id:
            error_msg = f"'{title}' - workflow_id 或 auth_token 参数缺失"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg)
            return None

        request_data = {
            'workflow_id': workflow_id,
            'parameters': {
                'title': title,
                'context_injection': context_injection
            }
        }
        logger.info(f"提交coze工作流数据详情: {json.dumps(request_data['parameters'], ensure_ascii=False)[:600]}...")

        headers = {
            'Authorization': f'Bearer {auth_token}',
            'Content-Type': 'application/json'
        }

        logger.info(f"'{title}' - 发送Coze API请求...")

        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed. The parser can return
        # before the stream ends, so close it deterministically here.
        with requests.post(
            COZE_API_URL,
            json=request_data,
            headers=headers,
            stream=True,
            timeout=300  # 5 minutes
        ) as response:
            logger.info(f"'{title}' - Coze API响应状态码: {response.status_code}")

            if not response.ok:
                error_msg = f"'{title}' - Coze API请求失败,状态码: {response.status_code}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, response.text)
                return None

            # Parsing must happen while the response is still open.
            return self.parse_stream_response(response, title)

    except Exception as e:
        error_msg = f"生成文章异常: {e}, 主题: {title}"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, traceback.format_exc())
        return None
line[6:].strip() + elif line.startswith('data:'): + data_str = line[5:].strip() + + logger.info(f"'{title}' - 收到事件: {event_type}") + self.log_to_database('INFO', f"收到Coze事件: {event_type}", f"主题: {title}") + + # 处理错误事件 + if event_type == 'Error': + logger.error(f"'{title}' - Coze API返回错误: {data_str}") + self.log_to_database('ERROR', f"Coze API返回错误: {title}", data_str) + try: + err_data = json.loads(data_str) + error_detail = f"错误代码: {err_data.get('error_code', '未知错误')}, 错误信息: {err_data.get('error_message', '无详细信息')}" + logger.error(f"'{title}' - {error_detail}") + self.log_to_database('ERROR', f"Coze API错误详情: {title}", error_detail) + except json.JSONDecodeError: + logger.error(f"'{title}' - 无法解析错误数据") + self.log_to_database('ERROR', f"无法解析Coze错误数据: {title}", data_str) + return None + + # 跳过PING和End事件 + if event_type in ['PING', 'End']: + continue + + # 处理Message事件 + if event_type == 'Message': + try: + logger.info(f"'{title}' - 收到Message事件,数据: {data_str[:600]}...") + data = json.loads(data_str) + + # 解析content字段为JSON对象 + content_obj = {} + if data.get('content') and isinstance(data['content'], str): + try: + content_obj = json.loads(data['content']) + logger.info(f"'{title}' - 解析后的content: {list(content_obj.keys())}") + except json.JSONDecodeError as e: + logger.error(f"'{title}' - 解析content字段失败: {e}") + continue + + # 保存结构化数据 - 修改逻辑:即使API返回的title为空也保存数据 + if content_obj.get('title') or content_obj.get('contents') or content_obj.get('introduction'): + # 使用API返回的title,如果为空则使用原始输入的title + final_title = content_obj.get('title') or title + last_structured = { + 'title': final_title, + 'tags': content_obj.get('tags', ''), + 'introduction': content_obj.get('introduction', ''), + 'conclusion': content_obj.get('conclusion', ''), + 'contents': content_obj.get('contents', []) if isinstance(content_obj.get('contents'), list) else [] + } + logger.info(f"'{title}' - 保存结构化数据,最终标题: {final_title}") + logger.info(f"'{title}' - 内容项数量: {len(last_structured['contents'])}") + + except 
json.JSONDecodeError as e: + logger.error(f"'{title}' - 解析消息错误: {e}") + continue + + if last_structured: + success_msg = f"'{title}' - 文章生成成功,包含{len(all_img)}张图片" + logger.info(success_msg) + self.log_to_database('INFO', success_msg, json.dumps(last_structured, ensure_ascii=False)) + return last_structured + else: + warning_msg = f"'{title}' - 未获取到有效的文章内容" + logger.warning(warning_msg) + self.log_to_database('WARNING', warning_msg) + return None + + except Exception as e: + error_msg = f"'{title}' - 解析流式响应异常: {e}" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, traceback.format_exc()) + return None + + def convert_structured_to_dynamic(self, structured_data: Dict) -> str: + """将结构化数据转换为Dynamic格式,参考JavaScript的convertStructuredToDynamic函数""" + try: + title = structured_data.get('title', '') + introduction = structured_data.get('introduction', '') + contents = structured_data.get('contents', []) + conclusion = structured_data.get('conclusion', '') + tags = structured_data.get('tags', '') + + logger.info(f"'{title}' - 开始转换Dynamic格式") + + html_content = '' + + # 添加title + if title: + html_content += f"{title}\n\n" + + # 添加引子部分 + if introduction and introduction.strip(): + html_content += f"{introduction.strip()}\n\n" + logger.info(f"'{title}' - 添加引言段落") + + # 添加内容项 + if contents and isinstance(contents, list): + for i, content in enumerate(contents): + if isinstance(content, dict): + # 修复bug:使用content_item字段而不是content字段,与JavaScript保持一致 + content_text = content.get('content_item') or content.get('content', '') + if content_text and content_text.strip(): + # 将换行符转换为段落标签 + paragraphs = content_text.split('\n') + filtered_paragraphs = [p.strip() for p in paragraphs if p.strip()] + for paragraph in filtered_paragraphs: + html_content += f"{paragraph}\n\n" + logger.info(f"'{title}' - 添加内容段落 {i+1},字段: {'content_item' if content.get('content_item') else 'content'}") + elif isinstance(content, str) and content.strip(): + # 将换行符转换为段落标签 + paragraphs = 
def generate_article(self, structured_data: Dict, article_id: int, existing_batch_id: int) -> bool:
    """Submit a generated article to the ``/api/generate_article`` endpoint.

    Args:
        structured_data: Parsed article; must contain a non-empty
            ``contents`` entry and a ``title``.
        article_id: ID of the article row being filled in.
        existing_batch_id: Batch ID already attached to the article. When
            falsy, a new one is derived from the current time.

    Returns:
        True when the endpoint answered HTTP 200 with ``success`` truthy or
        ``errno == 0`` (on the first attempt, or on the single retry made
        after a 401 and a successful re-login); False otherwise.
    """
    # Bound before the try so every except handler can format its message
    # even if the failure happens before the real title is read.
    title = 'Unknown'
    try:
        # Nothing to publish without content. Status bookkeeping is left to
        # the backend, so only report the failure here.
        if not structured_data or not structured_data.get('contents'):
            logger.error(f"[Worker] 生成文章失败: structured_data['contents']为空")
            return False

        title = structured_data.get('title', 'Unknown')
        logger.info(f"'{title}' - 开始提交文章到generate_article接口")
        self.log_to_database('INFO', f"开始提交文章: {title}", f"article_id: {article_id}")

        # Make sure a JWT is available before building the request.
        if not self.jwt_token:
            logger.warning(f"'{title}' - JWT token缺失,尝试重新登录")
            self.log_to_database('WARNING', f"JWT token缺失,重新登录: {title}")
            if not self.login_and_get_jwt_token():
                error_msg = f"'{title}' - 重新登录失败"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg)
                return False

        # No batch ID on the record: derive one from the clock. The previous
        # code also logged an undefined exception variable at this point,
        # which raised and made every such submission fail.
        if not existing_batch_id:
            timestamp = int(time.time())
            random_num = str(int(time.time() * 1000) % 10000).zfill(4)
            existing_batch_id = f"{timestamp}{random_num}"
            logger.warning(f"'{title}' - 生成新的batch_id: {existing_batch_id}")

        html_content = self.convert_structured_to_dynamic(structured_data)

        publish_data = {
            "title": structured_data['title'],
            "content": html_content,
            "tags": structured_data.get('tags', ''),
            "cover_image": structured_data.get('home_img', ''),
            "article_id": article_id,
            "batch_id": existing_batch_id,
            "uniq_id": existing_batch_id,
            "source": "coze_auto_generator",
            "username": self.login_credentials['username']
        }

        logger.info(f"'{title}' - 准备提交的数据: article_id={article_id}, batch_id={existing_batch_id}")
        logger.info(f"'{title}' - 提交数据详情: {json.dumps(publish_data, ensure_ascii=False)[:600]}...")

        upload_url = f"{self.base_url}/api/generate_article"
        headers = {
            'Authorization': f'Bearer {self.jwt_token}',
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }

        response = requests.post(
            upload_url,
            json=publish_data,
            headers=headers,
            timeout=60,
            proxies=self.proxies
        )

        logger.info(f"'{title}' - 提交响应状态码: {response.status_code}")

        if response.status_code == 200:
            try:
                result = response.json()
                logger.info(f"'{title}' - 提交响应内容: {result}")

                if result.get('success') or result.get('errno') == 0:
                    success_msg = f"'{title}' - 文章提交成功, ID: {existing_batch_id}"
                    logger.info(success_msg)
                    self.log_to_database('INFO', success_msg, f"article_id: {article_id}, batch_id: {existing_batch_id}")
                    return True
                else:
                    error_msg = f"'{title}' - 文章提交失败: {result.get('message', result.get('errmsg', '未知错误'))}"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg, f"article_id: {article_id}, response: {result}")
                    return False
            except json.JSONDecodeError as e:
                error_msg = f"'{title}' - 解析提交响应失败: {e}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
                return False
        elif response.status_code == 401:
            # The JWT expired: log in again and retry exactly once.
            logger.warning(f"'{title}' - JWT token过期(401),尝试重新登录")
            self.log_to_database('WARNING', f"JWT token过期,重新登录: {title}", f"article_id: {article_id}")

            if self.login_and_get_jwt_token():
                logger.info(f"'{title}' - 重新登录成功,重试提交文章")
                # The retry must carry the freshly issued token.
                headers['Authorization'] = f'Bearer {self.jwt_token}'

                retry_response = requests.post(
                    upload_url,
                    json=publish_data,
                    headers=headers,
                    timeout=60,
                    proxies=self.proxies
                )

                logger.info(f"'{title}' - 重试响应状态码: {retry_response.status_code}")

                if retry_response.status_code == 200:
                    try:
                        retry_result = retry_response.json()
                        logger.info(f"'{title}' - 重试响应内容: {retry_result}")

                        if retry_result.get('success') or retry_result.get('errno') == 0:
                            success_msg = f"'{title}' - 重试提交成功, ID: {existing_batch_id}"
                            logger.info(success_msg)
                            self.log_to_database('INFO', success_msg, f"article_id: {article_id}, batch_id: {existing_batch_id}")
                            return True
                        else:
                            error_msg = f"'{title}' - 重试提交失败: {retry_result.get('message', retry_result.get('errmsg', '未知错误'))}"
                            logger.error(error_msg)
                            self.log_to_database('ERROR', error_msg, f"article_id: {article_id}, retry_response: {retry_result}")
                            return False
                    except json.JSONDecodeError as e:
                        error_msg = f"'{title}' - 解析重试响应失败: {e}"
                        logger.error(error_msg)
                        self.log_to_database('ERROR', error_msg, f"retry_response_text: {retry_response.text}")
                        return False
                else:
                    error_msg = f"'{title}' - 重试请求仍然失败,状态码: {retry_response.status_code}"
                    logger.error(error_msg)
                    self.log_to_database('ERROR', error_msg, f"retry_response_text: {retry_response.text}")
                    return False
            else:
                error_msg = f"'{title}' - 重新登录失败,无法重试"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, f"article_id: {article_id}")
                return False
        else:
            error_msg = f"'{title}' - 文章提交请求失败,状态码: {response.status_code}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
            return False

    except requests.exceptions.Timeout:
        error_msg = f"'{title}' - 提交文章请求超时"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, f"article_id: {article_id}")
        return False
    except requests.exceptions.RequestException as e:
        error_msg = f"'{title}' - 提交文章网络异常: {e}"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, traceback.format_exc())
        return False
    except Exception as e:
        error_msg = f"'{title}' - 提交文章异常: {e}"
        logger.error(error_msg)
        self.log_to_database('ERROR', error_msg, traceback.format_exc())
        return False
"""线程安全地获取下一个可处理的主题""" + with self.processing_lock: + for topic_data in pending_topics: + article_id = topic_data['id'] + if article_id not in self.processed_ids: + self.processed_ids.add(article_id) + return topic_data + return None + + def process_single_topic(self, topic_data: Dict) -> bool: + """处理单个主题""" + article_id = topic_data['id'] + topic = topic_data['topic'] + workflow_id = topic_data.get('workflow_id') + auth_token = topic_data.get('auth_token') + prompt_workflow_name = topic_data.get('prompt_workflow_name') + worker_id = threading.current_thread().name + batch_id = topic_data.get('batch_id') + + # ====== 新增:查找相似topic ====== + context_injection = "" # 初始化上下文注入内容 + + try: + logger.info(f"[Worker-{worker_id}] 开始查找相似topic,当前topic: '{topic}'") + self.log_to_database('INFO', f"开始查找相似topic: {topic}", f"article_id: {article_id}") + + # 调用向量检索接口 + similar_topics = search_chinese( + query_text=topic, + topk=3, # 查询top3,后面会过滤到最多2条 + similarity_threshold=0.5 # 相似度阈值0.5 + ) + + if similar_topics: + logger.info(f"[Worker-{worker_id}] 找到 {len(similar_topics)} 个相似topic") + logger.info(f"[Worker-{worker_id}] similar_topics完整返回值: {json.dumps(similar_topics, ensure_ascii=False)}") + + # 提取article_id列表(注意:返回的字段是id,不是article_id) + article_ids = [] + for item in similar_topics: + aid = item.get('id', '') + if aid: + try: + # 将字符串id转换为int + article_ids.append(int(aid)) + except (ValueError, TypeError): + logger.warning(f"[Worker-{worker_id}] 无法转换id为整数: {aid}") + #article_ids = [702, 699] #test测试rwl + if article_ids: + logger.info(f"[Worker-{worker_id}] 提取到文章ID列表: {article_ids}") + + # 从数据库查询content + context_injection = self.get_article_contents_by_ids(article_ids) + + if context_injection: + logger.info(f"[Worker-{worker_id}] 获取到上下文注入内容,长度: {len(context_injection)} 字符") + self.log_to_database( + 'INFO', + f"获取上下文注入内容: {topic}", + f"article_id: {article_id}, 相似文章IDs: {article_ids}, 内容长度: {len(context_injection)}" + ) + else: + logger.warning(f"[Worker-{worker_id}] 
未从article_ids {article_ids} 查询到content") + self.log_to_database('WARNING', f"未查询到content: {topic}", f"article_ids: {article_ids}") + else: + logger.warning(f"[Worker-{worker_id}] 相似topic中没有有效的id") + + # 打印相似topic详情 + for i, similar in enumerate(similar_topics, 1): + logger.info(f"[Worker-{worker_id}] 相似topic[{i}]: {similar.get('title', 'N/A')}, 相似度: {similar.get('similar', 0):.4f}, 文章ID: {similar.get('id', 'N/A')}") + self.log_to_database( + 'INFO', + f"找到相似topic: {topic}", + f"article_id: {article_id}, 相似topic数量: {len(similar_topics)}, 详情: {json.dumps(similar_topics, ensure_ascii=False)}" + ) + else: + logger.info(f"[Worker-{worker_id}] 未找到相似topic(相似度>0.5)") + self.log_to_database('INFO', f"未找到相似topic: {topic}", f"article_id: {article_id}") + except Exception as e: + error_msg = f"[Worker-{worker_id}] 查找相似topic异常: {e}" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, traceback.format_exc()) + # 即使查找相似topic失败,也继续处理文章生成 + # ====== 相似topic查找结束 ====== + + try: + logger.info(f"[Worker-{worker_id}] 开始处理主题 ID:{article_id}, Topic:'{topic}', Prompt={prompt_workflow_name}") + + # 验证必要的参数 + if not workflow_id or not auth_token: + error_msg = f"[Worker-{worker_id}] workflow_id 或 auth_token 缺失,Topic:'{topic}'" + logger.error(error_msg) + self.log_to_database('ERROR', error_msg, f"article_id: {article_id}") + return False + + # 生成文章 - 开始计时 + start_time = time.time() + structured_data = self.generate_article_from_coze(topic, context_injection, workflow_id, auth_token) + end_time = time.time() + elapsed_time = end_time - start_time + + logger.info(f"[Worker-{worker_id}] Coze文章生成耗时: {elapsed_time:.2f}秒, Topic:'{topic}'") + + if not structured_data: + logger.error(f"[Worker-{worker_id}] 生成文章失败: {topic}") + # 移除直接数据库操作:不再直接更新状态为generate_failed + # 状态管理交给接口处理 + return False + + # 增加判断:structured_data['contents']为空,报错 + if not structured_data.get('contents'): + logger.error(f"[Worker-{worker_id}] 生成文章失败: {topic} - structured_data['contents']为空") + # 
def worker_process_topics(self, pending_topics: List[Dict], worker_id: int) -> int:
    """Drain the shared topic list from one worker thread.

    Args:
        pending_topics: Topics shared by all workers; each call to
            ``get_next_available_topic`` hands out one of them.
        worker_id: Number used to build this worker's ``Worker-N`` label.

    Returns:
        How many topics this worker processed successfully.
    """
    label = f"Worker-{worker_id}"
    # Rename the running thread: process_single_topic reads the current
    # thread's name for its own log prefix.
    threading.current_thread().name = label

    logger.info(f"[{label}] 启动,准备处理主题")

    done = 0
    # Keep claiming topics until the shared list has nothing left for us.
    while (item := self.get_next_available_topic(pending_topics)):
        if not self.process_single_topic(item):
            logger.error(f"[{label}] 处理主题失败: {item['topic']}")
            continue
        done += 1
        logger.info(f"[{label}] 成功处理主题: {item['topic']}")

    logger.info(f"[{label}] 没有更多待处理主题,退出")
    logger.info(f"[{label}] 完成,共处理 {done} 个主题")
    return done
def main():
    """Entry point: authenticate, fetch Coze credentials, then monitor.

    Each start-up step is announced before it runs; the first step that
    reports failure is logged and ends the program without starting the
    monitor. Any exception is logged together with its traceback.
    """
    generator = CozeArticleGenerator()

    try:
        # (announcement, step to run, message logged when the step fails)
        startup_steps = (
            ("开始登录获取JWT token", generator.login_and_get_jwt_token, "登录失败,程序退出"),
            ("开始获取Coze认证信息", generator.get_coze_token, "获取Coze认证信息失败,程序退出"),
        )
        for announcement, step, failure_message in startup_steps:
            logger.info(announcement)
            if not step():
                logger.error(failure_message)
                return

        # Both prerequisites succeeded: hand control to the monitor loop.
        generator.run_monitor()

    except Exception as e:
        logger.error(f"程序运行异常: {e}")
        generator.log_to_database('ERROR', f'程序运行异常: {e}', traceback.format_exc())


if __name__ == "__main__":
    main()