Initial commit with .gitignore
This commit is contained in:
27
.gitignore
vendored
Normal file
27
.gitignore
vendored
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# Virtual environment
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
.env/
|
||||||
|
|
||||||
|
# Python cache
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Config with sensitive data
|
||||||
|
config.ini
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# OS files
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
1015
ai_article.sql
Normal file
1015
ai_article.sql
Normal file
File diff suppressed because it is too large
Load Diff
34
ai_image_tags.txt
Normal file
34
ai_image_tags.txt
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
8.149.233.36/ai_article/ai_image_tags/ http://47.99.184.230:8008/andes/index.php?route=/sql&pos=0&db=ai_article&table=ai_image_tags
|
||||||
|
|
||||||
|
正在显示第 25 - 49 行 (共 32937 行, 查询花费 0.0009 秒。)
|
||||||
|
|
||||||
|
|
||||||
|
SELECT * FROM `ai_image_tags`
|
||||||
|
|
||||||
|
|
||||||
|
id image_id image_name image_url image_thumb_url tag_id tag_name default_tag_id default_tag_name keywords_id keywords_name department_id department_name image_source created_user_id created_at updated_at image_attached_article_count status blocking_reason
|
||||||
|
16495 19346 1755312359566253.png 20250816/1755312359566253.png 20250816/1755312359566253_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
|
||||||
|
16496 19347 1755312362360723.png 20250816/1755312362360723.png 20250816/1755312362360723_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
|
||||||
|
16497 19348 1755312364406476.png 20250816/1755312364406476.png 20250816/1755312364406476_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
|
||||||
|
16498 19349 1755312367284353.png 20250816/1755312367284353.png 20250816/1755312367284353_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
|
||||||
|
16499 19350 1755312370484005.png 20250816/1755312370484005.png 20250816/1755312370484005_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
|
||||||
|
16500 19351 1755312373245801.png 20250816/1755312373245801.png 20250816/1755312373245801_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
|
||||||
|
16501 19352 1755312378278262.png 20250816/1755312378278262.png 20250816/1755312378278262_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:55 35 draft
|
||||||
|
16502 19353 1755312380298110.png 20250816/1755312380298110.png 20250816/1755312380298110_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:51 37 draft
|
||||||
|
16503 19354 1755312382399131.png 20250816/1755312382399131.png 20250816/1755312382399131_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:30 93 draft
|
||||||
|
16504 19355 1755312386945978.png 20250816/1755312386945978.png 20250816/1755312386945978_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:33 20 draft
|
||||||
|
16505 19356 1755312388894962.png 20250816/1755312388894962.png 20250816/1755312388894962_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:06 30 draft
|
||||||
|
16506 19357 1755312391383717.png 20250816/1755312391383717.png 20250816/1755312391383717_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:30 49 draft
|
||||||
|
16507 19358 1755312393565035.png 20250816/1755312393565035.png 20250816/1755312393565035_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:57 135 draft
|
||||||
|
16508 19359 1755312396609453.png 20250816/1755312396609453.png 20250816/1755312396609453_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
|
||||||
|
16509 19360 1755312401479871.png 20250816/1755312401479871.png 20250816/1755312401479871_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
|
||||||
|
16510 19361 1755312407229190.png 20250816/1755312407229190.png 20250816/1755312407229190_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:29 21 draft
|
||||||
|
16511 19362 1755312410797310.png 20250816/1755312410797310.png 20250816/1755312410797310_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:08 29 draft
|
||||||
|
16512 19363 1755312437724619.png 20250816/1755312437724619.png 20250816/1755312437724619_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:59 69 draft
|
||||||
|
16513 19364 1755312440270419.png 20250816/1755312440270419.png 20250816/1755312440270419_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
|
||||||
|
16514 19365 1755312442259884.png 20250816/1755312442259884.png 20250816/1755312442259884_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:18 107 draft
|
||||||
|
16515 19366 1755312445610363.png 20250816/1755312445610363.png 20250816/1755312445610363_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:36 173 draft
|
||||||
|
16516 19367 1755312448884355.png 20250816/1755312448884355.png 20250816/1755312448884355_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:14 111 draft
|
||||||
|
16517 19368 1755312451681906.png 20250816/1755312451681906.png 20250816/1755312451681906_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
|
||||||
|
16518 19369 1755312453351689.png 20250816/1755312453351689.png 20250816/1755312453351689_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:24 100 draft
|
||||||
|
16519 19370 1755312456284588.png 20250816/1755312456284588.png 20250816/1755312456284588_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:09 118 draft
|
||||||
72
basket.py
Normal file
72
basket.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import os

import requests
from dashvector import Client, Doc
|
||||||
|
|
||||||
|
# === Configuration ===
# Credentials are read from the environment so that no secret is committed to
# version control. Any key that was ever committed must be rotated.
DASHVECTOR_API_KEY = os.environ.get('DASHVECTOR_API_KEY', '')
DASHVECTOR_ENDPOINT = os.environ.get(
    'DASHVECTOR_ENDPOINT',
    'vrs-cn-2ml4jm42o0001r.dashvector.cn-hangzhou.aliyuncs.com',
)

# Obtain this key from the DashScope console (not from Bailian Model Studio!).
DASHSCOPE_API_KEY = os.environ.get('DASHSCOPE_API_KEY', '')
|
||||||
|
|
||||||
|
def get_embedding(text, timeout=30):
    """Return the text-embedding-v1 vector for *text* via the DashScope HTTP API.

    Args:
        text: The string to embed.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The embedding read from ``output.embeddings[0].embedding`` of the
        JSON response.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.RequestException: On network errors or timeout.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding-v1"
    headers = {
        "Authorization": f"Bearer {DASHSCOPE_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "input": {"texts": [text]},
        "model": "text-embedding-v1",
    }
    # Without a timeout a stalled connection would block the script forever.
    resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"❌ Embedding API 错误: {resp.status_code} - {resp.text}")
    return resp.json()["output"]["embeddings"][0]["embedding"]
|
||||||
|
|
||||||
|
# === Initialise the DashVector client ===
client = Client(api_key=DASHVECTOR_API_KEY, endpoint=DASHVECTOR_ENDPOINT)

# === (Re)create the collection (note: the dimension is 1536!) ===
collection_name = "medical_topics"
try:
    # Best effort: drop a collection left over from a previous run.
    client.delete(collection_name)
except Exception as exc:
    # Deleting is optional, so report the problem and carry on.
    print("⚠️ 删除旧集合时出错,继续创建:", exc)

create_ret = client.create(name=collection_name, dimension=1536)  # text-embedding-v1 outputs 1536 dimensions
if create_ret.code != 0:
    print("❌ 创建集合失败:", create_ret)
    raise SystemExit(1)

collection = client.get(collection_name)
print("✅ 集合已创建并获取")

# === Insert data ===
topics = [
    "如何治疗阳痿、早泄和肾虚?",
    "早泄可以吃哪些中药?",
    "该如何治疗早泄?",
    "前列腺肥大是什么原因引起的?",
]

docs = [
    Doc(id=f"topic_{i}", vector=get_embedding(text), fields={"content": text})
    for i, text in enumerate(topics, 1)
]

resp = collection.insert(docs)
# `.code == 0` is the success check used by the other scripts in this
# repository; NOTE(review): `.success` was not found in the SDK docs — confirm.
if resp.code == 0:
    print(f"✅ {len(docs)} 条中文话题已成功插入!")
else:
    print("❌ 插入失败:", resp)
    raise SystemExit(1)

# === Query test ===
query_text = "早泄的治疗方法有哪些?"
query_vec = get_embedding(query_text)

rets = collection.query(vector=query_vec, topk=3, output_fields=["content"])
if rets.code == 0:
    print(f"\n🔍 查询 '{query_text}' 的结果:")
    # Matched documents are exposed through `.output`, as used elsewhere in this repository.
    for doc in rets.output:
        print(f"  ID: {doc.id} | 相似度: {doc.score:.4f} | 内容: {doc.fields['content']}")
else:
    print("❌ 查询失败:", rets)
|
||||||
|
|
||||||
386
image_similarity_check.py
Normal file
386
image_similarity_check.py
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
图片去重审核脚本 - DashScope 多模态版
|
||||||
|
采用: pHash预筛 + DashScope多模态Embedding + 异步批量处理
|
||||||
|
"""
|
||||||
|
|
||||||
|
import configparser
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import imagehash
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
import dashscope
|
||||||
|
from dashscope import MultiModalEmbedding
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Optional, Tuple, List, Dict
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from dashvector import Client, Doc
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class ImageSimilarityChecker:
    """Image de-duplication checker backed by DashScope multimodal embeddings.

    :meth:`run` repeatedly fetches draft rows from ``ai_image_tags``, embeds
    each image through DashScope, looks for a near neighbour in DashVector and
    then marks the row as duplicate, unique or "needs recalculation".

    The pHash and async-download helpers are kept as part of the class API but
    are not called by :meth:`process_batch`.
    """

    def __init__(self, config_path: str = 'config.ini'):
        """Load the configuration; connections are opened later by :meth:`run`.

        Args:
            config_path: Path of the INI file providing the ``database``,
                ``dashvector``, ``dashscope``, ``image``, ``similarity`` and
                ``process`` sections.

        Raises:
            FileNotFoundError: If the configuration file cannot be read.
            configparser.Error: If a required section or option is missing.
        """
        self.config = configparser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files, which would
        # otherwise surface later as a confusing "no section" error.
        if not self.config.read(config_path, encoding='utf-8'):
            raise FileNotFoundError(f"配置文件无法读取: {config_path}")

        self._setup_logging()

        # Connections (opened by connect_db / connect_dashvector)
        self.db_conn = None
        self.dashvector_client = None
        self.collection = None

        # DashScope API key, also published on the SDK module attribute
        self.dashscope_api_key = self.config.get('dashscope', 'api_key')
        dashscope.api_key = self.dashscope_api_key

        # pHash cache {phash_str: image_tag_id}
        self.phash_cache: Dict[str, int] = {}

        # Tunables
        self.image_cdn_base = self.config.get('image', 'cdn_base')
        self.phash_threshold = self.config.getint('similarity', 'phash_threshold')
        self.vector_threshold = self.config.getfloat('similarity', 'vector_threshold')
        self.batch_size = self.config.getint('process', 'batch_size')
        self.concurrent_downloads = self.config.getint('process', 'concurrent_downloads')

    def _setup_logging(self):
        """Create ``self.logger`` with a file handler and a console handler.

        The level and file name come from the ``process`` section
        (``log_level`` default ``INFO``, ``log_file`` default
        ``image_similarity.log``). An unknown level name falls back to INFO
        instead of raising.
        """
        level_name = self.config.get('process', 'log_level', fallback='INFO')
        log_file = self.config.get('process', 'log_file', fallback='image_similarity.log')

        self.logger = logging.getLogger(__name__)

        # Avoid attaching duplicate handlers when several instances are created.
        if self.logger.handlers:
            return

        level = getattr(logging, level_name.strip().upper(), None)
        if not isinstance(level, int):
            level = logging.INFO
        self.logger.setLevel(level)

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        for handler in (logging.FileHandler(log_file, encoding='utf-8'),
                        logging.StreamHandler()):
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def connect_db(self):
        """Open the MySQL connection described by the ``database`` section."""
        cfg = self.config
        self.db_conn = pymysql.connect(
            host=cfg.get('database', 'host'),
            port=cfg.getint('database', 'port'),
            user=cfg.get('database', 'user'),
            password=cfg.get('database', 'password'),
            database=cfg.get('database', 'database'),
            charset=cfg.get('database', 'charset'),
            cursorclass=pymysql.cursors.DictCursor,
        )
        self.logger.info("数据库连接成功")

    def connect_dashvector(self):
        """Connect to DashVector and make sure the configured collection exists.

        Raises:
            RuntimeError: If the collection has to be created and creation fails.
        """
        api_key = self.config.get('dashvector', 'api_key')
        endpoint = self.config.get('dashvector', 'endpoint')
        collection_name = self.config.get('dashvector', 'collection_name')
        dimension = self.config.getint('dashvector', 'vector_dimension')

        self.dashvector_client = Client(api_key=api_key, endpoint=endpoint)

        # NOTE(review): per the SDK docs a missing collection is reported as a
        # falsy object rather than None, so test truthiness — confirm.
        existing = self.dashvector_client.get(collection_name)
        if existing is None or not existing:
            self.logger.info(f"创建集合 {collection_name},维度 {dimension}")
            created = self.dashvector_client.create(collection_name, dimension=dimension)
            if created.code != 0:
                raise RuntimeError(
                    f"创建集合失败: code={created.code}, msg={created.message}"
                )
            self.collection = self.dashvector_client.get(collection_name)
            if not self.collection:
                raise RuntimeError(f"集合 {collection_name} 创建后仍无法获取")
        else:
            self.logger.info(f"集合 {collection_name} 已存在,直接复用")
            self.collection = existing

        self.logger.info("DashVector 连接成功")

    def get_image_embedding(self, image_url: str = None, image_base64: str = None,
                            max_retries: int = 5) -> Optional[List[float]]:
        """Fetch an image embedding from the DashScope multimodal SDK.

        Args:
            image_url: URL of the image; takes precedence when given.
            image_base64: Base64 payload, sent as a ``data:image/jpeg`` URI.
            max_retries: Maximum number of SDK calls.

        Returns:
            The embedding vector, or ``None`` when no input was supplied, the
            API returned a non-retryable status, or all attempts failed.
        """
        # The request payload does not change between attempts: build it once.
        if image_url:
            input_data = [{'image': image_url}]
        elif image_base64:
            input_data = [{'image': f'data:image/jpeg;base64,{image_base64}'}]
        else:
            return None

        for attempt in range(max_retries):
            last_attempt = attempt == max_retries - 1
            try:
                resp = MultiModalEmbedding.call(
                    model='multimodal-embedding-v1',
                    input=input_data,
                )

                if resp.status_code == 200:
                    return resp.output['embeddings'][0]['embedding']

                if resp.status_code not in (429, 403):
                    self.logger.warning(f"Embedding API 错误: {resp.status_code} - {resp.message}")
                    return None

                # 429/403 are treated as throttling: back off linearly.
                if last_attempt:
                    self.logger.warning(f"API 限流,已达最大重试次数 ({attempt + 1}/{max_retries})")
                else:
                    wait_time = 3 + attempt * 3
                    self.logger.warning(f"API 限流,等待 {wait_time} 秒后重试 ({attempt + 1}/{max_retries})...")
                    time.sleep(wait_time)
            except Exception as e:
                self.logger.warning(f"Embedding API 异常: {e}")
                # Sleeping after the final attempt would only delay the caller.
                if not last_attempt:
                    time.sleep(2)

        return None

    def load_phash_cache(self):
        """Log that the pHash cache is ready (nothing is loaded here)."""
        self.logger.info("pHash 缓存初始化完成")

    def compute_phash(self, image: "Image.Image") -> str:
        """Return the perceptual hash of *image* as a string."""
        return str(imagehash.phash(image))

    def check_phash_duplicate(self, phash: str) -> Tuple[bool, Optional[int], Optional[int]]:
        """Compare *phash* against every cached hash.

        Returns:
            ``(True, cached_image_id, distance)`` for the first cached hash
            whose distance is within ``self.phash_threshold``; otherwise
            ``(False, None, None)``.
        """
        phash_obj = imagehash.hex_to_hash(phash)

        for cached_phash, image_id in self.phash_cache.items():
            distance = phash_obj - imagehash.hex_to_hash(cached_phash)
            if distance <= self.phash_threshold:
                return True, image_id, distance

        return False, None, None

    async def download_image_async(self, session: "aiohttp.ClientSession",
                                   image_id: int, url: str) -> "Tuple[int, Optional[Image.Image], Optional[bytes]]":
        """Download one image asynchronously.

        Returns:
            ``(image_id, RGB image, raw bytes)`` on success, or
            ``(image_id, None, None)`` on a non-200 response or any error.
        """
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status == 200:
                    data = await response.read()
                    image = Image.open(BytesIO(data)).convert('RGB')
                    return image_id, image, data
                # Make non-200 responses visible instead of failing silently.
                self.logger.warning(f"下载失败 ID={image_id}: HTTP {response.status}")
        except Exception as e:
            self.logger.warning(f"下载失败 ID={image_id}: {e}")

        return image_id, None, None

    async def download_images_batch(self, image_records: List[dict]) -> "Dict[int, Tuple[Image.Image, bytes, str]]":
        """Download all records concurrently.

        Args:
            image_records: Dicts with at least ``id`` and ``image_url`` keys.

        Returns:
            ``{id: (image, raw bytes, url)}`` for the downloads that succeeded.
        """
        images = {}

        connector = aiohttp.TCPConnector(limit=self.concurrent_downloads)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [
                self.download_image_async(session, rec['id'], rec['image_url'])
                for rec in image_records
            ]
            results = await asyncio.gather(*tasks)

        # gather() preserves task order, so results line up with the records.
        for rec, (image_id, image, data) in zip(image_records, results):
            if image is not None:
                images[image_id] = (image, data, rec['image_url'])

        return images

    def search_similar(self, features: List[float], exclude_id: int) -> Tuple[bool, Optional[int], Optional[float]]:
        """Search DashVector for an image similar to *features*.

        The score is treated as a distance (smaller is closer) and converted
        with ``1.0 - score``.
        NOTE(review): this assumes the collection's metric is a distance such
        as cosine — confirm against the collection settings.

        Args:
            features: Query vector.
            exclude_id: Id to skip, so an image never matches itself.

        Returns:
            ``(True, similar_id, similarity)`` for the first hit whose
            similarity reaches ``self.vector_threshold``; otherwise
            ``(False, None, None)``.

        Raises:
            RuntimeError: If the query fails. A failed lookup must not be
                reported as "no duplicate", or the image would be recorded as
                unique without ever having been compared.
        """
        try:
            results = self.collection.query(features, topk=3)
        except Exception as e:
            self.logger.warning(f"搜索失败: {e}")
            raise RuntimeError(f"搜索失败: {e}") from e

        if results is None or results.code != 0:
            code = getattr(results, 'code', None)
            message = getattr(results, 'message', None)
            self.logger.warning(f"搜索失败: code={code}, msg={message}")
            raise RuntimeError(f"搜索失败: code={code}, msg={message}")

        for doc in results.output or []:
            similar_id = int(doc.id)
            if similar_id == exclude_id:
                continue

            # score is a distance: smaller means closer, so invert it.
            similarity = 1.0 - doc.score
            self.logger.info(f"搜索到: {similar_id}, 距离={doc.score:.4f}, 相似度={similarity:.4f}")

            if similarity >= self.vector_threshold:
                return True, similar_id, similarity

        return False, None, None

    def upsert_to_dashvector(self, image_id: int, features: List[float]) -> bool:
        """Store the vector of *image_id* in DashVector.

        Returns:
            ``True`` when the upsert succeeded, ``False`` otherwise (failures
            are logged, never raised).
        """
        try:
            result = self.collection.upsert([Doc(id=str(image_id), vector=features)])
        except Exception as e:
            self.logger.warning(f"存入 DashVector 异常 ID={image_id}: {e}")
            return False

        if result.code == 0:
            self.logger.info(f"向量入库成功: {image_id}")
            return True

        self.logger.warning(f"向量入库失败 ID={image_id}: code={result.code}, msg={result.message}")
        return False

    def get_draft_images(self) -> List[dict]:
        """Return up to ``self.batch_size`` rows that still await processing."""
        sql = """
            SELECT id, image_id, image_url, image_thumb_url, image_name
            FROM ai_image_tags
            WHERE status = 'draft' AND similarity = 'draft'
            AND image_url != '' AND image_url IS NOT NULL
            ORDER BY id ASC
            LIMIT %s
        """
        with self.db_conn.cursor() as cursor:
            cursor.execute(sql, (self.batch_size,))
            return cursor.fetchall()

    def _execute_update(self, sql: str, params: tuple) -> None:
        """Run one write statement and commit; roll back and re-raise on error."""
        try:
            with self.db_conn.cursor() as cursor:
                cursor.execute(sql, params)
            self.db_conn.commit()
        except Exception:
            self.db_conn.rollback()
            raise

    def update_as_duplicate(self, image_id: int, similar_id: int, score: float):
        """Mark row *image_id* as a duplicate of *similar_id* with its score."""
        sql = """
            UPDATE ai_image_tags
            SET status = 'similarity',
            similarity = 'yes',
            similarity_image_tags_id = %s,
            `similarity score` = %s,
            updated_at = NOW()
            WHERE id = %s
        """
        self._execute_update(sql, (similar_id, score, image_id))
        self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")

    def update_as_unique(self, image_id: int):
        """Mark row *image_id* as unique and move it to ``tag_extension``."""
        sql = """
            UPDATE ai_image_tags
            SET status = 'tag_extension',
            similarity = 'calc',
            updated_at = NOW()
            WHERE id = %s
        """
        self._execute_update(sql, (image_id,))
        self.logger.info(f"不重复: {image_id} -> tag_extension")

    def update_as_failed(self, image_id: int, reason: str):
        """Flag row *image_id* for recalculation after a processing failure.

        Args:
            image_id: Primary key of the ``ai_image_tags`` row.
            reason: Human-readable cause. It is accepted for callers but not
                written to the database by this method.
        """
        sql = """
            UPDATE ai_image_tags
            SET status = 'draft',
            similarity = 'recalc',
            updated_at = NOW()
            WHERE id = %s
        """
        self._execute_update(sql, (image_id,))

    def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
        """Process one batch of rows.

        Args:
            image_records: Rows as returned by :meth:`get_draft_images`.

        Returns:
            ``(duplicates, unique, failed)`` counters for the batch.
        """
        if not image_records:
            return 0, 0, 0

        duplicates = 0
        unique = 0
        failed = 0

        for rec in image_records:
            image_id = rec['id']

            # Skip rows without a usable image URL.
            if not rec['image_url'] or rec['image_url'].strip() == '':
                self.logger.warning(f"图像URL为空,跳过处理: {image_id}")
                self.update_as_failed(image_id, "图像URL为空")
                failed += 1
                continue

            # Build the CDN URL of the original (full-size) image.
            full_url = f"{self.image_cdn_base}{rec['image_url']}"

            try:
                # Throttle: the free tier allows 2 QPS.
                time.sleep(0.5)
                self.logger.info(f"获取 Embedding: {image_id} -> {full_url}")

                # DashScope fetches the image itself from the URL.
                features = self.get_image_embedding(image_url=full_url)
                if features is None:
                    self.logger.warning(f"Embedding 获取失败: {image_id}")
                    self.update_as_failed(image_id, "Embedding API 失败")
                    failed += 1
                    continue

                is_dup, similar_id, score = self.search_similar(features, image_id)
                if is_dup:
                    self.update_as_duplicate(image_id, similar_id, score)
                    duplicates += 1
                    continue

                # A row may only be declared unique once its vector is
                # indexed; otherwise later duplicates could not be detected.
                if not self.upsert_to_dashvector(image_id, features):
                    raise RuntimeError("向量入库失败")
                self.update_as_unique(image_id)
                unique += 1

            except Exception as e:
                self.logger.error(f"处理失败 {image_id}: {e}")
                self.update_as_failed(image_id, str(e)[:200])
                failed += 1

        return duplicates, unique, failed

    def run(self):
        """Run the main loop until no draft rows remain."""
        self.logger.info("=" * 60)
        self.logger.info("图片去重审核 - DashScope 多模态版")
        self.logger.info("=" * 60)

        total_duplicates = 0
        total_unique = 0
        total_failed = 0
        batch_num = 0

        try:
            # Initialise inside the try block so the DB connection is closed
            # even when the DashVector setup fails.
            self.connect_db()
            self.connect_dashvector()
            self.load_phash_cache()

            while True:
                images = self.get_draft_images()
                if not images:
                    self.logger.info("没有待处理的图片")
                    break

                batch_num += 1
                self.logger.info(f"\n--- 批次 {batch_num}: {len(images)} 张 ---")

                dup, uniq, fail = self.process_batch(images)
                total_duplicates += dup
                total_unique += uniq
                total_failed += fail

                self.logger.info(f"批次结果: 重复={dup}, 不重复={uniq}, 失败={fail}")
        finally:
            if self.db_conn:
                self.db_conn.close()

        self.logger.info("=" * 60)
        self.logger.info(f"完成! 总重复: {total_duplicates}, 总不重复: {total_unique}")
        self.logger.info(f"总失败: {total_failed}")
        self.logger.info("=" * 60)


# Script entry point: run the checker with the default configuration file.
if __name__ == '__main__':
    checker = ImageSimilarityChecker('config.ini')
    checker.run()
|
||||||
59
query_status.py
Normal file
59
query_status.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
查询图片相似度状态脚本
|
||||||
|
"""
|
||||||
|
|
||||||
|
import configparser
|
||||||
|
import pymysql
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print every ``ai_image_tags`` row with its similarity state and a summary.

    Connection settings are read from the ``database`` section of
    ``config.ini`` in the current working directory.
    """
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')

    db_conn = pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor,
    )

    sql = """
        SELECT id, image_name, status, similarity,
        similarity_image_tags_id, `similarity score`, blocking_reason
        FROM ai_image_tags
        ORDER BY id
    """
    # Close the connection even when the query fails.
    try:
        with db_conn.cursor() as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
    finally:
        db_conn.close()

    # Summary counters
    total = len(rows)
    draft_count = sum(1 for r in rows if r['status'] == 'draft')
    unique_count = sum(1 for r in rows if r['status'] == 'tag_extension')
    dup_count = sum(1 for r in rows if r['status'] == 'similarity')
    failed_count = sum(1 for r in rows if r['similarity'] == 'recalc')

    print("=" * 100)
    print(f"{'ID':<8} {'图片名称':<30} {'状态':<15} {'相似性':<8} {'相似ID':<8} {'分数':<8} {'原因'}")
    print("=" * 100)

    for r in rows:
        score = f"{r['similarity score']:.4f}" if r['similarity score'] else "-"
        similar_id = r['similarity_image_tags_id'] if r['similarity_image_tags_id'] else "-"
        reason = r['blocking_reason'][:20] if r['blocking_reason'] else "-"
        # NULL columns would raise TypeError in the width format specs below,
        # so substitute a placeholder first.
        name = (r['image_name'] or "-")[:28]
        status = r['status'] or "-"
        similarity = r['similarity'] or "-"
        print(f"{r['id']:<8} {name:<30} {status:<15} {similarity:<8} {similar_id:<8} {score:<8} {reason}")

    print("=" * 100)
    print(f"总计: {total} | 待处理: {draft_count} | 不重复: {unique_count} | 重复: {dup_count} | 失败: {failed_count}")
    print("=" * 100)


if __name__ == '__main__':
    main()
|
||||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
pymysql>=1.0.0
|
||||||
|
dashvector>=1.0.0
|
||||||
|
Pillow>=9.0.0
|
||||||
|
aiohttp>=3.8.0
|
||||||
|
imagehash>=4.3.0
|
||||||
|
requests>=2.28.0
dashscope
|
||||||
51
reset_data.py
Normal file
51
reset_data.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
重置关系数据库脚本
|
||||||
|
"""
|
||||||
|
|
||||||
|
import configparser
|
||||||
|
import pymysql
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Reset every ``ai_image_tags`` row to the initial ``draft`` state.

    Rows whose ``status`` or ``similarity`` is not ``draft`` get both columns
    set back to ``draft`` and their similarity id and score zeroed. Connection
    settings are read from the ``database`` section of ``config.ini``.
    """
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')

    print("=" * 50)
    print("重置 MySQL 数据")
    print("=" * 50)

    db_conn = pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor,
    )

    reset_sql = """
        UPDATE ai_image_tags
        SET status = 'draft',
        similarity = 'draft',
        similarity_image_tags_id = 0,
        `similarity score` = 0
        WHERE status != 'draft' OR similarity != 'draft'
    """
    try:
        with db_conn.cursor() as cursor:
            try:
                # execute() returns the number of affected rows.
                affected = cursor.execute(reset_sql)
                db_conn.commit()
            except Exception:
                # Do not leave a half-applied reset pending on the connection.
                db_conn.rollback()
                raise

            cursor.execute("SELECT COUNT(*) as total FROM ai_image_tags WHERE status = 'draft'")
            result = cursor.fetchone()
    finally:
        # Close the connection even when a statement fails.
        db_conn.close()

    print(f"更新记录数: {affected}")
    print(f"当前 draft 状态总数: {result['total']}")
    print("=" * 50)


if __name__ == '__main__':
    main()
|
||||||
54
reset_vector.py
Normal file
54
reset_vector.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
重置向量数据库集合脚本
|
||||||
|
"""
|
||||||
|
|
||||||
|
import configparser
|
||||||
|
import time
|
||||||
|
from dashvector import Client
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Drop and recreate the DashVector collection named in ``config.ini``.

    Reads ``api_key``, ``endpoint``, ``collection_name`` and
    ``vector_dimension`` from the ``dashvector`` section, deletes the
    collection, creates it again and checks that it shows up in the list.
    """
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')

    print("=" * 50)
    print("重置 DashVector 集合")
    print("=" * 50)

    api_key = config.get('dashvector', 'api_key')
    endpoint = config.get('dashvector', 'endpoint')
    collection_name = config.get('dashvector', 'collection_name')
    dimension = config.getint('dashvector', 'vector_dimension')

    client = Client(api_key=api_key, endpoint=endpoint)

    # Delete
    del_result = client.delete(collection_name)
    if del_result.code == 0:
        print(f"已删除集合: {collection_name}")
        print("等待删除完成...")
        # Give the service a moment before the name is reused.
        time.sleep(3)
    else:
        # A failed delete is not necessarily "collection missing": print the
        # real cause so credential or network problems are not hidden.
        print(f"未删除集合 (code={del_result.code}, msg={del_result.message}),跳过删除")

    # Create
    create_result = client.create(collection_name, dimension=dimension)
    if create_result.code != 0:
        print(f"创建失败: {create_result.message}")
        return
    print(f"已创建集合: {collection_name} (维度={dimension})")

    # Verify
    collections = client.list()
    # Guard against a failed list() call, whose output cannot be searched.
    listed = collections.code == 0 and collections.output is not None
    if listed and collection_name in collections.output:
        print("验证通过: 集合已在列表中")
    else:
        print("警告: 集合未出现在列表中,请稍后重试")

    print("=" * 50)


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user