修改文章图片匹配逻辑：合并文章查询、按科室缩小图片范围、按挂载次数排序、按图片类型分类处理

2026-02-05 12:21:33 +08:00
parent d0a4583cb1
commit 4805321722
1 changed files with 60 additions and 47 deletions
--- a/article_auto_image_matching.py
+++ b/article_auto_image_matching.py
@@ -79,7 +79,7 @@ class ArticleImageMatcher:
    
    def get_articles_with_tags(self, limit: int = BATCH_SIZE) -> List[Dict]:
        """
-        从ai_article_tags表获取需要匹配图片的文章
+        从ai_articles表获取需要匹配图片的文章
        
        Returns:
            包含文章ID、标签等信息的列表
@@ -91,55 +91,32 @@ class ArticleImageMatcher:
                    # 查询有标签但未匹配图片的文章
                    sql = """
                    SELECT 
-                        at.id,
-                        at.article_id,
-                        at.coze_tag
-                    FROM ai_article_tags at
-                    WHERE at.coze_tag IS NOT NULL 
-                    AND at.coze_tag != ''
+                        a.id as article_id,
+                        a.title,
+                        a.content,
+                        a.coze_tag,
+                        a.department
+                    FROM ai_articles a
+                    WHERE a.coze_tag IS NOT NULL 
+                    AND a.coze_tag != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM ai_article_images ai 
-                        WHERE ai.article_id = at.article_id
+                        WHERE ai.article_id = a.id
                    )
-                    AND at.article_id IN (
-                        SELECT id FROM ai_articles 
-                        WHERE status = 'pending_review'
-                    )
-                    ORDER BY at.id DESC
+                    AND a.status = 'pending_review'
+                    ORDER BY a.id DESC
                    LIMIT %s
                    """
                    cursor.execute(sql, (limit,))
                    results = cursor.fetchall()
                    
-                    # 为每个结果添加文章标题和内容
-                    processed_results = []
-                    for row in results:
-                        # 查询文章标题和内容
-                        article_sql = """
-                        SELECT title, content
-                        FROM ai_articles
-                        WHERE id = %s
-                        """
-                        cursor.execute(article_sql, (row['article_id'],))
-                        article_data = cursor.fetchone()
-                        
-                        if article_data:
-                            processed_row = {
-                                'id': row['id'],
-                                'article_id': row['article_id'],
-                                'coze_tag': row['coze_tag'],
-                                'title': article_data['title'],
-                                'content': article_data['content']
-                            }
-                            processed_results.append(processed_row)
-                    
-                    if processed_results:
-                        logger.info(f"查询到 {len(processed_results)} 篇需要匹配图片的文章")
-                        self.log_to_database('INFO', f"查询到需要匹配图片的文章", f"数量: {len(processed_results)}")
+                    if results:
+                        logger.info(f"查询到 {len(results)} 篇需要匹配图片的文章")
+                        self.log_to_database('INFO', f"查询到需要匹配图片的文章", f"数量: {len(results)}")
                    else:
                        logger.info("未查询到需要匹配图片的文章")
                    
-                    return processed_results
+                    return results
            finally:
                connection.close()
        except Exception as e:
@@ -159,7 +136,8 @@ class ArticleImageMatcher:
            connection = self.db_manager.get_connection()
            try:
                with connection.cursor(pymysql.cursors.DictCursor) as cursor:
-                    # 查询状态为generate且附加文章数量小于5的图片
+                    # 查询状态为generate且附加文章数量小于5的图片（不使用JOIN）
+                    # 包含image_source字段用于区分实拍图和模板图
                    sql = """
                    SELECT 
                        it.id,
@@ -173,11 +151,14 @@ class ArticleImageMatcher:
                        it.keywords_name,
                        it.department_id,
                        it.department_name,
-                        it.image_attached_article_count
+                        it.image_attached_article_count,
+                        it.image_source
                    FROM ai_image_tags it
-                    INNER JOIN ai_images i ON it.image_id = i.id
                    WHERE it.image_attached_article_count < 5
-                    AND i.status = 'generate'
+                    AND it.image_id IN (
+                        SELECT id FROM ai_images 
+                        WHERE status = 'generate'
+                    )
                    ORDER BY it.image_attached_article_count ASC, it.id DESC
                    """
                    cursor.execute(sql)
@@ -763,6 +744,7 @@ class ArticleImageMatcher:
        article_title = article_data.get('title', '')
        article_content = article_data.get('content', '')
        coze_tag = article_data.get('coze_tag', '')
+        article_department = article_data.get('department', '')
        
        try:
            # 解析文章标签
@@ -771,15 +753,46 @@ class ArticleImageMatcher:
                logger.warning(f"文章 {article_id} 没有有效标签，跳过")
                return False
            
-            logger.info(f"开始为文章 {article_id} 匹配图片 - 标题: {article_title}, 标签: {article_tags}")
+            logger.info(f"开始为文章 {article_id} 匹配图片 - 标题: {article_title}, 标签: {article_tags}, 科室: {article_department}")
+            
+            # 根据文章科室缩小图片范围
+            department_filtered_images = []
+            for img in available_images:
+                # 优先匹配科室相同的图片
+                if article_department and img.get('department_name', '').lower() == article_department.lower():
+                    department_filtered_images.append(img)
+            
+            # 如果没有匹配科室的图片，则使用所有图片
+            if not department_filtered_images:
+                department_filtered_images = available_images
+            
+            # 根据图片类型（实拍图/模板图）进行分类处理
+            # 根据image_source字段：1=clean_images(模板图), 2=Flower_character(实拍图)
+            actual_photos = []  # 实拍图
+            template_photos = []  # 模板图
+            
+            for img in department_filtered_images:
+                image_source = img.get('image_source', 1)  # 默认为模板图
+                if image_source == 2:  # 实拍图
+                    actual_photos.append(img)
+                else:  # 模板图
+                    template_photos.append(img)
+            
+            # 按照挂载次数排序，优先选择挂载次数少的图片
+            actual_photos.sort(key=lambda x: x['image_attached_article_count'])
+            template_photos.sort(key=lambda x: x['image_attached_article_count'])
+            
+            # 合并图片列表，优先使用实拍图，然后是模板图
+            filtered_images = actual_photos + template_photos
            
            best_match = None
            best_score = 0.0
            
-            # 遍历可用图片，找到最佳匹配
-            for image_data in available_images:
+            # 遍历筛选后的可用图片，找到最佳匹配
+            for image_data in filtered_images:
                image_tags = [image_data['tag_name']]
                image_keywords = image_data.get('keywords_name', '')
+                image_department = image_data.get('department_name', '')
                
                # 调用通义千问评估匹配度
                is_match, score = self.call_qwen_for_matching(
@@ -787,7 +800,7 @@ class ArticleImageMatcher:
                )
                
                # 记录评估结果
-                logger.info(f"文章 {article_id} vs 图片 {image_data['image_id']} - 匹配: {is_match}, 分数: {score}")
+                logger.info(f"文章 {article_id} vs 图片 {image_data['image_id']} - 匹配: {is_match}, 分数: {score}, 科室: {image_department}")
                
                # 更新最佳匹配
                if is_match and score > best_score and score >= MATCH_THRESHOLD: