Initial commit with .gitignore

This commit is contained in:
2026-02-04 14:36:13 +08:00
commit 82b5fbf875
9 changed files with 1704 additions and 0 deletions

27
.gitignore vendored Normal file
View File

@@ -0,0 +1,27 @@
# Virtual environment
venv/
.venv/
env/
.env/
# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so
# IDE
.idea/
.vscode/
*.swp
*.swo
# Config with sensitive data
config.ini
# Logs
*.log
# OS files
.DS_Store
Thumbs.db

1015
ai_article.sql Normal file

File diff suppressed because it is too large Load Diff

34
ai_image_tags.txt Normal file
View File

@@ -0,0 +1,34 @@
8.149.233.36/ai_article/ai_image_tags/ http://47.99.184.230:8008/andes/index.php?route=/sql&pos=0&db=ai_article&table=ai_image_tags
正在显示第 25 - 49 行 (共 32937 行, 查询花费 0.0009 秒。)
SELECT * FROM `ai_image_tags`
id image_id image_name image_url image_thumb_url tag_id tag_name default_tag_id default_tag_name keywords_id keywords_name department_id department_name image_source created_user_id created_at updated_at image_attached_article_count status blocking_reason
16495 19346 1755312359566253.png 20250816/1755312359566253.png 20250816/1755312359566253_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
16496 19347 1755312362360723.png 20250816/1755312362360723.png 20250816/1755312362360723_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16497 19348 1755312364406476.png 20250816/1755312364406476.png 20250816/1755312364406476_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16498 19349 1755312367284353.png 20250816/1755312367284353.png 20250816/1755312367284353_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16499 19350 1755312370484005.png 20250816/1755312370484005.png 20250816/1755312370484005_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
16500 19351 1755312373245801.png 20250816/1755312373245801.png 20250816/1755312373245801_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
16501 19352 1755312378278262.png 20250816/1755312378278262.png 20250816/1755312378278262_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:55 35 draft
16502 19353 1755312380298110.png 20250816/1755312380298110.png 20250816/1755312380298110_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:51 37 draft
16503 19354 1755312382399131.png 20250816/1755312382399131.png 20250816/1755312382399131_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:30 93 draft
16504 19355 1755312386945978.png 20250816/1755312386945978.png 20250816/1755312386945978_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:33 20 draft
16505 19356 1755312388894962.png 20250816/1755312388894962.png 20250816/1755312388894962_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:06 30 draft
16506 19357 1755312391383717.png 20250816/1755312391383717.png 20250816/1755312391383717_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:30 49 draft
16507 19358 1755312393565035.png 20250816/1755312393565035.png 20250816/1755312393565035_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:57 135 draft
16508 19359 1755312396609453.png 20250816/1755312396609453.png 20250816/1755312396609453_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16509 19360 1755312401479871.png 20250816/1755312401479871.png 20250816/1755312401479871_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
16510 19361 1755312407229190.png 20250816/1755312407229190.png 20250816/1755312407229190_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:29 21 draft
16511 19362 1755312410797310.png 20250816/1755312410797310.png 20250816/1755312410797310_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:08 29 draft
16512 19363 1755312437724619.png 20250816/1755312437724619.png 20250816/1755312437724619_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:59 69 draft
16513 19364 1755312440270419.png 20250816/1755312440270419.png 20250816/1755312440270419_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
16514 19365 1755312442259884.png 20250816/1755312442259884.png 20250816/1755312442259884_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:18 107 draft
16515 19366 1755312445610363.png 20250816/1755312445610363.png 20250816/1755312445610363_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:36 173 draft
16516 19367 1755312448884355.png 20250816/1755312448884355.png 20250816/1755312448884355_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:14 111 draft
16517 19368 1755312451681906.png 20250816/1755312451681906.png 20250816/1755312451681906_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
16518 19369 1755312453351689.png 20250816/1755312453351689.png 20250816/1755312453351689_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:24 100 draft
16519 19370 1755312456284588.png 20250816/1755312456284588.png 20250816/1755312456284588_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:09 118 draft

72
basket.py Normal file
View File

@@ -0,0 +1,72 @@
"""Smoke test: embed a few Chinese medical topics with DashScope
text-embedding-v1, load them into a DashVector collection, and run a
sample similarity query."""
import requests
from dashvector import Client, Doc

# === Configuration ===
# SECURITY: credentials are hardcoded in source — move them to config.ini or
# environment variables and rotate these keys.
DASHVECTOR_API_KEY = 'sk-55x6oBXypSlPHQ8NvPHfyBABcMIMUE0407A0FCC2A11F0B9C802831A608ABB'
DASHVECTOR_ENDPOINT = 'vrs-cn-2ml4jm42o0001r.dashvector.cn-hangzhou.aliyuncs.com'
# Obtained from the DashScope console (not Bailian / Model Studio).
DASHSCOPE_API_KEY = 'sk-d3f235925afa4e4e83d707dde04b9e52'


def get_embedding(text):
    """Return the 1536-dim text-embedding-v1 vector for *text*.

    Raises:
        Exception: if the DashScope HTTP API returns a non-200 status.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding-v1"
    headers = {
        "Authorization": f"Bearer {DASHSCOPE_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "input": {"texts": [text]},
        "model": "text-embedding-v1"
    }
    resp = requests.post(url, headers=headers, json=data)
    if resp.status_code == 200:
        return resp.json()["output"]["embeddings"][0]["embedding"]
    raise Exception(f"❌ Embedding API 错误: {resp.status_code} - {resp.text}")


# === Initialise the DashVector client ===
client = Client(api_key=DASHVECTOR_API_KEY, endpoint=DASHVECTOR_ENDPOINT)

# === Recreate the collection (text-embedding-v1 outputs 1536 dims) ===
collection_name = "medical_topics"
try:
    client.delete(collection_name)
except Exception:
    # Fix: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    # The collection may simply not exist yet; that is fine.
    pass
client.create(name=collection_name, dimension=1536)
collection = client.get(collection_name)
print("✅ 集合已创建并获取")

# === Insert data ===
topics = [
    "如何治疗阳痿、早泄和肾虚?",
    "早泄可以吃哪些中药?",
    "该如何治疗早泄?",
    "前列腺肥大是什么原因引起的?"
]
docs = []
for i, text in enumerate(topics, 1):
    emb = get_embedding(text)
    docs.append(Doc(id=f"topic_{i}", vector=emb, fields={"content": text}))
resp = collection.insert(docs)
if resp.success:
    print("✅ 4 条中文话题已成功插入!")
else:
    print("❌ 插入失败:", resp)
    # Fix: `exit()` is injected by the `site` module and is not guaranteed;
    # SystemExit is the reliable way to terminate a script with a status code.
    raise SystemExit(1)

# === Query test ===
query_text = "早泄的治疗方法有哪些?"
query_vec = get_embedding(query_text)
rets = collection.query(vector=query_vec, topk=3, output_fields=["content"])
if rets.success:
    print(f"\n🔍 查询 '{query_text}' 的结果:")
    for doc in rets.documents:
        print(f"  ID: {doc.id} | 相似度: {doc.score:.4f} | 内容: {doc.fields['content']}")
else:
    print("❌ 查询失败:", rets)

386
image_similarity_check.py Normal file
View File

@@ -0,0 +1,386 @@
# -*- coding: utf-8 -*-
"""
图片去重审核脚本 - DashScope 多模态版
采用: pHash预筛 + DashScope多模态Embedding + 异步批量处理
"""
import configparser
import logging
import asyncio
import aiohttp
import imagehash
import base64
import time
import dashscope
from dashscope import MultiModalEmbedding
from io import BytesIO
from typing import Optional, Tuple, List, Dict
import pymysql
from dashvector import Client, Doc
from PIL import Image
class ImageSimilarityChecker:
    """Image de-duplication reviewer (DashScope multimodal edition).

    Pipeline: pHash pre-filter + DashScope multimodal embedding +
    DashVector nearest-neighbour search, with status bookkeeping in MySQL
    (table ``ai_image_tags``).
    """

    def __init__(self, config_path: str = 'config.ini'):
        # All runtime settings (DB, DashVector, thresholds) come from an INI file.
        self.config = configparser.ConfigParser()
        self.config.read(config_path, encoding='utf-8')
        self._setup_logging()
        # Connections are opened lazily in connect_db()/connect_dashvector().
        self.db_conn = None
        self.dashvector_client = None
        self.collection = None
        # DashScope API key is installed globally on the SDK module.
        self.dashscope_api_key = self.config.get('dashscope', 'api_key')
        dashscope.api_key = self.dashscope_api_key
        # pHash cache {phash_str: image_tag_id}
        self.phash_cache: Dict[str, int] = {}
        # Tunables read from config.
        self.image_cdn_base = self.config.get('image', 'cdn_base')
        self.phash_threshold = self.config.getint('similarity', 'phash_threshold')
        self.vector_threshold = self.config.getfloat('similarity', 'vector_threshold')
        self.batch_size = self.config.getint('process', 'batch_size')
        self.concurrent_downloads = self.config.getint('process', 'concurrent_downloads')

    def _setup_logging(self):
        """Configure a file + console logger from the [process] config section."""
        log_level = self.config.get('process', 'log_level', fallback='INFO')
        log_file = self.config.get('process', 'log_file', fallback='image_similarity.log')
        self.logger = logging.getLogger(__name__)
        # Avoid attaching duplicate handlers if instantiated more than once.
        if not self.logger.handlers:
            self.logger.setLevel(getattr(logging, log_level))
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            fh = logging.FileHandler(log_file, encoding='utf-8')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
            sh = logging.StreamHandler()
            sh.setFormatter(formatter)
            self.logger.addHandler(sh)

    def connect_db(self):
        """Open the MySQL connection described by the [database] config section."""
        self.db_conn = pymysql.connect(
            host=self.config.get('database', 'host'),
            port=self.config.getint('database', 'port'),
            user=self.config.get('database', 'user'),
            password=self.config.get('database', 'password'),
            database=self.config.get('database', 'database'),
            charset=self.config.get('database', 'charset'),
            cursorclass=pymysql.cursors.DictCursor
        )
        self.logger.info("数据库连接成功")

    def connect_dashvector(self):
        """Connect to DashVector and ensure the target collection exists."""
        api_key = self.config.get('dashvector', 'api_key')
        endpoint = self.config.get('dashvector', 'endpoint')
        collection_name = self.config.get('dashvector', 'collection_name')
        dimension = self.config.getint('dashvector', 'vector_dimension')
        self.dashvector_client = Client(api_key=api_key, endpoint=endpoint)
        # Create the collection only when it does not exist yet.
        # NOTE(review): assumes Client.get() returns None for a missing
        # collection — confirm against the installed DashVector SDK version.
        existing = self.dashvector_client.get(collection_name)
        if existing is None:
            self.logger.info(f"创建集合 {collection_name},维度 {dimension}")
            self.dashvector_client.create(collection_name, dimension=dimension)
        else:
            self.logger.info(f"集合 {collection_name} 已存在,直接复用")
        self.collection = self.dashvector_client.get(collection_name)
        self.logger.info("DashVector 连接成功")

    def get_image_embedding(self, image_url: str = None, image_base64: str = None, max_retries: int = 5) -> Optional[List[float]]:
        """Fetch an image embedding via the DashScope multimodal SDK.

        Accepts either a URL or a base64 payload; retries with linear
        backoff on rate-limit responses (429/403).

        Returns the embedding vector, or None on failure / missing input.
        """
        for attempt in range(max_retries):
            try:
                # Build the SDK input from whichever source was supplied.
                if image_url:
                    input_data = [{'image': image_url}]
                elif image_base64:
                    input_data = [{'image': f'data:image/jpeg;base64,{image_base64}'}]
                else:
                    return None
                resp = MultiModalEmbedding.call(
                    model='multimodal-embedding-v1',
                    input=input_data
                )
                if resp.status_code == 200:
                    return resp.output['embeddings'][0]['embedding']
                elif resp.status_code in (429, 403):
                    # Rate-limited: linear backoff (3s, 6s, 9s, ...).
                    wait_time = 3 + attempt * 3
                    self.logger.warning(f"API 限流,等待 {wait_time} 秒后重试 ({attempt + 1}/{max_retries})...")
                    time.sleep(wait_time)
                else:
                    # Non-retryable API error: give up immediately.
                    self.logger.warning(f"Embedding API 错误: {resp.status_code} - {resp.message}")
                    return None
            except Exception as e:
                self.logger.warning(f"Embedding API 异常: {e}")
                time.sleep(2)
        return None

    def load_phash_cache(self):
        """Initialise the pHash cache.

        NOTE(review): currently a placeholder — nothing is loaded from the
        database, so check_phash_duplicate() always starts from an empty cache.
        """
        self.logger.info("pHash 缓存初始化完成")

    def compute_phash(self, image: Image.Image) -> str:
        """Return the perceptual hash (pHash) of *image* as a hex string."""
        return str(imagehash.phash(image))

    def check_phash_duplicate(self, phash: str) -> Tuple[bool, Optional[int], Optional[int]]:
        """Check *phash* against the in-memory cache.

        Returns (is_duplicate, matching_image_tag_id, hamming_distance);
        the last two are None when no cached hash is within phash_threshold.
        """
        phash_obj = imagehash.hex_to_hash(phash)
        # Linear scan: Hamming distance to every cached hash.
        for cached_phash, image_id in self.phash_cache.items():
            cached_obj = imagehash.hex_to_hash(cached_phash)
            distance = phash_obj - cached_obj
            if distance <= self.phash_threshold:
                return True, image_id, distance
        return False, None, None

    async def download_image_async(self, session: aiohttp.ClientSession,
                                   image_id: int, url: str) -> Tuple[int, Optional[Image.Image], Optional[bytes]]:
        """Download one image; returns (image_id, PIL image or None, raw bytes or None)."""
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status == 200:
                    data = await response.read()
                    image = Image.open(BytesIO(data)).convert('RGB')
                    return image_id, image, data
        except Exception as e:
            self.logger.warning(f"下载失败 ID={image_id}: {e}")
        # Non-200 responses and exceptions both fall through to a None result.
        return image_id, None, None

    async def download_images_batch(self, image_records: List[dict]) -> Dict[int, Tuple[Image.Image, bytes, str]]:
        """Download a batch of images concurrently.

        Returns {image_tag_id: (PIL image, raw bytes, source url)} for the
        downloads that succeeded; failed downloads are omitted.
        """
        images = {}
        connector = aiohttp.TCPConnector(limit=self.concurrent_downloads)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [
                self.download_image_async(session, rec['id'], rec['image_url'])
                for rec in image_records
            ]
            results = await asyncio.gather(*tasks)
            # gather() preserves task order, so index i maps back to image_records[i].
            for i, (image_id, image, data) in enumerate(results):
                if image is not None:
                    url = image_records[i]['image_url']
                    images[image_id] = (image, data, url)
        return images

    def search_similar(self, features: List[float], exclude_id: int) -> Tuple[bool, Optional[int], Optional[float]]:
        """Search DashVector for an image similar to *features*.

        Skips *exclude_id* (the image itself). Returns
        (found, similar_image_tag_id, similarity); similarity is 1 - score,
        since lower DashVector scores mean closer matches.
        """
        try:
            results = self.collection.query(features, topk=3)
            if results and results.output:
                for doc in results.output:
                    similar_id = int(doc.id)
                    if similar_id == exclude_id:
                        continue
                    # score is a distance (smaller = more similar); convert to a
                    # similarity. NOTE(review): 1 - score is only meaningful for a
                    # cosine-style metric — confirm the collection's configuration.
                    similarity = 1.0 - doc.score
                    self.logger.info(f"搜索到: {similar_id}, 距离={doc.score:.4f}, 相似度={similarity:.4f}")
                    if similarity >= self.vector_threshold:
                        return True, similar_id, similarity
            return False, None, None
        except Exception as e:
            self.logger.warning(f"搜索失败: {e}")
            return False, None, None

    def upsert_to_dashvector(self, image_id: int, features: List[float]):
        """Store *features* in DashVector keyed by the image's id."""
        try:
            doc = Doc(id=str(image_id), vector=features)
            result = self.collection.upsert([doc])
            if result.code == 0:
                self.logger.info(f"向量入库成功: {image_id}")
            else:
                self.logger.warning(f"向量入库失败 ID={image_id}: code={result.code}, msg={result.message}")
        except Exception as e:
            self.logger.warning(f"存入 DashVector 异常 ID={image_id}: {e}")

    def get_draft_images(self) -> List[dict]:
        """Fetch the next batch of unprocessed images (status and similarity both 'draft')."""
        with self.db_conn.cursor() as cursor:
            sql = """
                SELECT id, image_id, image_url, image_thumb_url, image_name
                FROM ai_image_tags
                WHERE status = 'draft' AND similarity = 'draft'
                AND image_url != '' AND image_url IS NOT NULL
                ORDER BY id ASC
                LIMIT %s
            """
            cursor.execute(sql, (self.batch_size,))
            return cursor.fetchall()

    def update_as_duplicate(self, image_id: int, similar_id: int, score: float):
        """Mark *image_id* as a duplicate of *similar_id* with the given score."""
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'similarity',
                    similarity = 'yes',
                    similarity_image_tags_id = %s,
                    `similarity score` = %s,
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (similar_id, score, image_id))
            self.db_conn.commit()
            self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")

    def update_as_unique(self, image_id: int):
        """Mark *image_id* as unique and advance it to the tag_extension stage."""
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'tag_extension',
                    similarity = 'calc',
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (image_id,))
            self.db_conn.commit()
            self.logger.info(f"不重复: {image_id} -> tag_extension")

    def update_as_failed(self, image_id: int, reason: str):
        """Mark *image_id* as failed: back to 'draft' with similarity='recalc' for retry.

        NOTE(review): *reason* is accepted but never persisted by this SQL.
        """
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'draft',
                    similarity = 'recalc',
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (image_id,))
            self.db_conn.commit()

    def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
        """Process one batch of images.

        Returns (duplicate_count, unique_count, failed_count).
        """
        if not image_records:
            return 0, 0, 0
        duplicates = 0
        unique = 0
        failed = 0
        for rec in image_records:
            image_id = rec['id']
            # Skip rows without a usable image URL.
            if not rec['image_url'] or rec['image_url'].strip() == '':
                self.logger.warning(f"图像URL为空跳过处理: {image_id}")
                self.update_as_failed(image_id, "图像URL为空")
                failed += 1
                continue
            # Build the full CDN URL (original image, not the thumbnail).
            full_url = f"{self.image_cdn_base}{rec['image_url']}"
            try:
                # Throttle: the free tier allows 2 QPS.
                time.sleep(0.5)
                self.logger.info(f"获取 Embedding: {image_id} -> {full_url}")
                # Pass the URL straight to DashScope (no local download needed).
                features = self.get_image_embedding(image_url=full_url)
                if features is None:
                    self.logger.warning(f"Embedding 获取失败: {image_id}")
                    self.update_as_failed(image_id, "Embedding API 失败")
                    failed += 1
                    continue
                # Look for near-duplicates already stored in DashVector.
                is_dup, similar_id, score = self.search_similar(features, image_id)
                if is_dup:
                    self.update_as_duplicate(image_id, similar_id, score)
                    duplicates += 1
                else:
                    # New image: store its vector, then mark it unique.
                    self.upsert_to_dashvector(image_id, features)
                    self.update_as_unique(image_id)
                    unique += 1
            except Exception as e:
                self.logger.error(f"处理失败 {image_id}: {e}")
                self.update_as_failed(image_id, str(e)[:200])
                failed += 1
                continue
        return duplicates, unique, failed

    def run(self):
        """Main loop: keep pulling 'draft' batches until none remain."""
        self.logger.info("=" * 60)
        self.logger.info("图片去重审核 - DashScope 多模态版")
        self.logger.info("=" * 60)
        # Set up connections and caches before processing.
        self.connect_db()
        self.connect_dashvector()
        self.load_phash_cache()
        total_duplicates = 0
        total_unique = 0
        batch_num = 0
        try:
            while True:
                images = self.get_draft_images()
                if not images:
                    self.logger.info("没有待处理的图片")
                    break
                batch_num += 1
                self.logger.info(f"\n--- 批次 {batch_num}: {len(images)} 张 ---")
                dup, uniq, fail = self.process_batch(images)
                total_duplicates += dup
                total_unique += uniq
                # NOTE(review): per-batch failure counts are logged but not
                # accumulated into the final summary below.
                self.logger.info(f"批次结果: 重复={dup}, 不重复={uniq}, 失败={fail}")
        finally:
            # Always close the DB connection and emit the summary, even on error.
            if self.db_conn:
                self.db_conn.close()
            self.logger.info("=" * 60)
            self.logger.info(f"完成! 总重复: {total_duplicates}, 总不重复: {total_unique}")
            self.logger.info("=" * 60)
if __name__ == '__main__':
    # Entry point: config path is fixed to config.ini in the working directory.
    ImageSimilarityChecker('config.ini').run()

59
query_status.py Normal file
View File

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
查询图片相似度状态脚本
"""
import configparser
import pymysql
def main():
    """Print a per-row status report for ai_image_tags plus summary counts."""
    cfg = configparser.ConfigParser()
    cfg.read('config.ini', encoding='utf-8')
    conn = pymysql.connect(
        host=cfg.get('database', 'host'),
        port=cfg.getint('database', 'port'),
        user=cfg.get('database', 'user'),
        password=cfg.get('database', 'password'),
        database=cfg.get('database', 'database'),
        charset=cfg.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor
    )
    with conn.cursor() as cursor:
        sql = """
            SELECT id, image_name, status, similarity,
                similarity_image_tags_id, `similarity score`, blocking_reason
            FROM ai_image_tags
            ORDER BY id
        """
        cursor.execute(sql)
        rows = cursor.fetchall()
    conn.close()
    # Tally the pipeline states in a single pass. The status counters and the
    # 'recalc' failure counter are independent, matching the original report.
    total = len(rows)
    draft_count = unique_count = dup_count = failed_count = 0
    for r in rows:
        if r['status'] == 'draft':
            draft_count += 1
        elif r['status'] == 'tag_extension':
            unique_count += 1
        elif r['status'] == 'similarity':
            dup_count += 1
        if r['similarity'] == 'recalc':
            failed_count += 1
    divider = "=" * 100
    print(divider)
    print(f"{'ID':<8} {'图片名称':<30} {'状态':<15} {'相似性':<8} {'相似ID':<8} {'分数':<8} {'原因'}")
    print(divider)
    for r in rows:
        # NULL / 0 values render as "-" in the table.
        score = f"{r['similarity score']:.4f}" if r['similarity score'] else "-"
        similar_id = r['similarity_image_tags_id'] if r['similarity_image_tags_id'] else "-"
        reason = r['blocking_reason'][:20] if r['blocking_reason'] else "-"
        print(f"{r['id']:<8} {r['image_name'][:28]:<30} {r['status']:<15} {r['similarity']:<8} {similar_id:<8} {score:<8} {reason}")
    print(divider)
    print(f"总计: {total} | 待处理: {draft_count} | 不重复: {unique_count} | 重复: {dup_count} | 失败: {failed_count}")
    print(divider)


if __name__ == '__main__':
    main()

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
pymysql>=1.0.0
dashvector>=1.0.0
dashscope>=1.14.0
Pillow>=9.0.0
aiohttp>=3.8.0
imagehash>=4.3.0
requests>=2.28.0

51
reset_data.py Normal file
View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
重置关系数据库脚本
"""
import configparser
import pymysql
def main():
    """Reset every ai_image_tags row to the initial 'draft' state.

    Clears previous similarity results so image_similarity_check.py can
    reprocess all images, then prints how many rows were touched.
    """
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')
    print("=" * 50)
    print("重置 MySQL 数据")
    print("=" * 50)
    db_conn = pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor
    )
    try:
        with db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'draft',
                    similarity = 'draft',
                    similarity_image_tags_id = 0,
                    `similarity score` = 0
                WHERE status != 'draft' OR similarity != 'draft'
            """
            affected = cursor.execute(sql)
            db_conn.commit()
            cursor.execute("SELECT COUNT(*) as total FROM ai_image_tags WHERE status = 'draft'")
            result = cursor.fetchone()
    finally:
        # Bug fix: the connection previously leaked if either query raised.
        db_conn.close()
    print(f"更新记录数: {affected}")
    print(f"当前 draft 状态总数: {result['total']}")
    print("=" * 50)


if __name__ == '__main__':
    main()

54
reset_vector.py Normal file
View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
"""
重置向量数据库集合脚本
"""
import configparser
import time
from dashvector import Client
def main():
    """Drop and recreate the DashVector collection used for image vectors."""
    cfg = configparser.ConfigParser()
    cfg.read('config.ini', encoding='utf-8')
    banner = "=" * 50
    print(banner)
    print("重置 DashVector 集合")
    print(banner)
    collection_name = cfg.get('dashvector', 'collection_name')
    dimension = cfg.getint('dashvector', 'vector_dimension')
    client = Client(
        api_key=cfg.get('dashvector', 'api_key'),
        endpoint=cfg.get('dashvector', 'endpoint'),
    )
    # Drop the old collection first; code 0 means it existed and was removed.
    if client.delete(collection_name).code == 0:
        print(f"已删除集合: {collection_name}")
        print("等待删除完成...")
        time.sleep(3)
    else:
        print(f"集合不存在,跳过删除")
    # Recreate it with the configured dimension; bail out on failure.
    create_result = client.create(collection_name, dimension=dimension)
    if create_result.code != 0:
        print(f"创建失败: {create_result.message}")
        return
    print(f"已创建集合: {collection_name} (维度={dimension})")
    # Sanity-check that the new collection is visible in the listing.
    if collection_name in client.list().output:
        print(f"验证通过: 集合已在列表中")
    else:
        print(f"警告: 集合未出现在列表中,请稍后重试")
    print(banner)


if __name__ == '__main__':
    main()