diff --git a/article_auto_image_matching.py b/article_auto_image_matching.py index 4c0b806..1ccd08c 100644 --- a/article_auto_image_matching.py +++ b/article_auto_image_matching.py @@ -79,7 +79,7 @@ class ArticleImageMatcher: def get_articles_with_tags(self, limit: int = BATCH_SIZE) -> List[Dict]: """ - 从ai_article_tags表获取需要匹配图片的文章 + 从ai_articles表获取需要匹配图片的文章 Returns: 包含文章ID、标签等信息的列表 @@ -91,55 +91,32 @@ class ArticleImageMatcher: # 查询有标签但未匹配图片的文章 sql = """ SELECT - at.id, - at.article_id, - at.coze_tag - FROM ai_article_tags at - WHERE at.coze_tag IS NOT NULL - AND at.coze_tag != '' + a.id as article_id, + a.title, + a.content, + a.coze_tag, + a.department + FROM ai_articles a + WHERE a.coze_tag IS NOT NULL + AND a.coze_tag != '' AND NOT EXISTS ( SELECT 1 FROM ai_article_images ai - WHERE ai.article_id = at.article_id + WHERE ai.article_id = a.id ) - AND at.article_id IN ( - SELECT id FROM ai_articles - WHERE status = 'pending_review' - ) - ORDER BY at.id DESC + AND a.status = 'pending_review' + ORDER BY a.id DESC LIMIT %s """ cursor.execute(sql, (limit,)) results = cursor.fetchall() - # 为每个结果添加文章标题和内容 - processed_results = [] - for row in results: - # 查询文章标题和内容 - article_sql = """ - SELECT title, content - FROM ai_articles - WHERE id = %s - """ - cursor.execute(article_sql, (row['article_id'],)) - article_data = cursor.fetchone() - - if article_data: - processed_row = { - 'id': row['id'], - 'article_id': row['article_id'], - 'coze_tag': row['coze_tag'], - 'title': article_data['title'], - 'content': article_data['content'] - } - processed_results.append(processed_row) - - if processed_results: - logger.info(f"查询到 {len(processed_results)} 篇需要匹配图片的文章") - self.log_to_database('INFO', f"查询到需要匹配图片的文章", f"数量: {len(processed_results)}") + if results: + logger.info(f"查询到 {len(results)} 篇需要匹配图片的文章") + self.log_to_database('INFO', f"查询到需要匹配图片的文章", f"数量: {len(results)}") else: logger.info("未查询到需要匹配图片的文章") - return processed_results + return results finally: connection.close() except Exception as e: @@ -159,7 +136,8 @@ class ArticleImageMatcher: connection = self.db_manager.get_connection() try: with connection.cursor(pymysql.cursors.DictCursor) as cursor: - # 查询状态为generate且附加文章数量小于5的图片 + # 查询状态为generate且附加文章数量小于5的图片(不使用JOIN) + # 包含image_source字段用于区分实拍图和模板图 sql = """ SELECT it.id, @@ -173,11 +151,14 @@ class ArticleImageMatcher: it.keywords_name, it.department_id, it.department_name, - it.image_attached_article_count + it.image_attached_article_count, + it.image_source FROM ai_image_tags it - INNER JOIN ai_images i ON it.image_id = i.id WHERE it.image_attached_article_count < 5 - AND i.status = 'generate' + AND it.image_id IN ( + SELECT id FROM ai_images + WHERE status = 'generate' + ) ORDER BY it.image_attached_article_count ASC, it.id DESC """ cursor.execute(sql) @@ -763,6 +744,7 @@ class ArticleImageMatcher: article_title = article_data.get('title', '') article_content = article_data.get('content', '') coze_tag = article_data.get('coze_tag', '') + article_department = article_data.get('department', '') try: # 解析文章标签 @@ -771,15 +753,46 @@ class ArticleImageMatcher: logger.warning(f"文章 {article_id} 没有有效标签,跳过") return False - logger.info(f"开始为文章 {article_id} 匹配图片 - 标题: {article_title}, 标签: {article_tags}") + logger.info(f"开始为文章 {article_id} 匹配图片 - 标题: {article_title}, 标签: {article_tags}, 科室: {article_department}") + + # 根据文章科室缩小图片范围 + department_filtered_images = [] + for img in available_images: + # 优先匹配科室相同的图片 + if article_department and img.get('department_name', '').lower() == article_department.lower(): + department_filtered_images.append(img) + + # 如果没有匹配科室的图片,则使用所有图片 + if not department_filtered_images: + department_filtered_images = available_images + + # 根据图片类型(实拍图/模板图)进行分类处理 + # 根据image_source字段:1=clean_images(模板图), 2=Flower_character(实拍图) + actual_photos = [] # 实拍图 + template_photos = [] # 模板图 + + for img in department_filtered_images: + image_source = img.get('image_source', 1) # 默认为模板图 + if image_source == 2: # 实拍图 + actual_photos.append(img) + else: # 模板图 + template_photos.append(img) + + # 按照挂载次数排序,优先选择挂载次数少的图片 + actual_photos.sort(key=lambda x: x['image_attached_article_count']) + template_photos.sort(key=lambda x: x['image_attached_article_count']) + + # 合并图片列表,优先使用实拍图,然后是模板图 + filtered_images = actual_photos + template_photos best_match = None best_score = 0.0 - # 遍历可用图片,找到最佳匹配 - for image_data in available_images: + # 遍历筛选后的可用图片,找到最佳匹配 + for image_data in filtered_images: image_tags = [image_data['tag_name']] image_keywords = image_data.get('keywords_name', '') + image_department = image_data.get('department_name', '') # 调用通义千问评估匹配度 is_match, score = self.call_qwen_for_matching( @@ -787,7 +800,7 @@ class ArticleImageMatcher: ) # 记录评估结果 - logger.info(f"文章 {article_id} vs 图片 {image_data['image_id']} - 匹配: {is_match}, 分数: {score}") + logger.info(f"文章 {article_id} vs 图片 {image_data['image_id']} - 匹配: {is_match}, 分数: {score}, 科室: {image_department}") # 更新最佳匹配 if is_match and score > best_score and score >= MATCH_THRESHOLD: