Initial commit with .gitignore

This commit is contained in:
2026-02-04 14:36:13 +08:00
commit 82b5fbf875
9 changed files with 1704 additions and 0 deletions

27
.gitignore vendored Normal file
View File

@@ -0,0 +1,27 @@
# Virtual environment
venv/
.venv/
env/
.env/
# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so
# IDE
.idea/
.vscode/
*.swp
*.swo
# Config with sensitive data
config.ini
# Logs
*.log
# OS files
.DS_Store
Thumbs.db

1015
ai_article.sql Normal file

File diff suppressed because it is too large Load Diff

34
ai_image_tags.txt Normal file
View File

@@ -0,0 +1,34 @@
8.149.233.36/ai_article/ai_image_tags/ http://47.99.184.230:8008/andes/index.php?route=/sql&pos=0&db=ai_article&table=ai_image_tags
正在显示第 25 - 49 行 (共 32937 行, 查询花费 0.0009 秒。)
SELECT * FROM `ai_image_tags`
id image_id image_name image_url image_thumb_url tag_id tag_name default_tag_id default_tag_name keywords_id keywords_name department_id department_name image_source created_user_id created_at updated_at image_attached_article_count status blocking_reason
16495 19346 1755312359566253.png 20250816/1755312359566253.png 20250816/1755312359566253_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
16496 19347 1755312362360723.png 20250816/1755312362360723.png 20250816/1755312362360723_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16497 19348 1755312364406476.png 20250816/1755312364406476.png 20250816/1755312364406476_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16498 19349 1755312367284353.png 20250816/1755312367284353.png 20250816/1755312367284353_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16499 19350 1755312370484005.png 20250816/1755312370484005.png 20250816/1755312370484005_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:19:11 7 draft
16500 19351 1755312373245801.png 20250816/1755312373245801.png 20250816/1755312373245801_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
16501 19352 1755312378278262.png 20250816/1755312378278262.png 20250816/1755312378278262_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:55 35 draft
16502 19353 1755312380298110.png 20250816/1755312380298110.png 20250816/1755312380298110_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:51 37 draft
16503 19354 1755312382399131.png 20250816/1755312382399131.png 20250816/1755312382399131_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:30 93 draft
16504 19355 1755312386945978.png 20250816/1755312386945978.png 20250816/1755312386945978_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:33 20 draft
16505 19356 1755312388894962.png 20250816/1755312388894962.png 20250816/1755312388894962_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:06 30 draft
16506 19357 1755312391383717.png 20250816/1755312391383717.png 20250816/1755312391383717_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:34:30 49 draft
16507 19358 1755312393565035.png 20250816/1755312393565035.png 20250816/1755312393565035_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:57 135 draft
16508 19359 1755312396609453.png 20250816/1755312396609453.png 20250816/1755312396609453_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:18:55 8 draft
16509 19360 1755312401479871.png 20250816/1755312401479871.png 20250816/1755312401479871_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:41 17 draft
16510 19361 1755312407229190.png 20250816/1755312407229190.png 20250816/1755312407229190_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:29 21 draft
16511 19362 1755312410797310.png 20250816/1755312410797310.png 20250816/1755312410797310_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 186 妇科炎症 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:35:08 29 draft
16512 19363 1755312437724619.png 20250816/1755312437724619.png 20250816/1755312437724619_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:59 69 draft
16513 19364 1755312440270419.png 20250816/1755312440270419.png 20250816/1755312440270419_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
16514 19365 1755312442259884.png 20250816/1755312442259884.png 20250816/1755312442259884_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:18 107 draft
16515 19366 1755312445610363.png 20250816/1755312445610363.png 20250816/1755312445610363_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:32:36 173 draft
16516 19367 1755312448884355.png 20250816/1755312448884355.png 20250816/1755312448884355_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:14 111 draft
16517 19368 1755312451681906.png 20250816/1755312451681906.png 20250816/1755312451681906_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:29 94 draft
16518 19369 1755312453351689.png 20250816/1755312453351689.png 20250816/1755312453351689_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:24 100 draft
16519 19370 1755312456284588.png 20250816/1755312456284588.png 20250816/1755312456284588_thumb.png 12679 #妇科炎症##妇科炎症原因##妇科炎症治疗##妇科炎症怎么办# 0 265 废止 11 妇科 1 0 2025-08-16 21:48:16 2026-01-30 14:33:09 118 draft

72
basket.py Normal file
View File

@@ -0,0 +1,72 @@
"""Smoke test: embed a few Chinese medical topics with DashScope
text-embedding-v1, load them into a DashVector collection, and run a
sample similarity query."""
import requests
from dashvector import Client, Doc

# === Configuration ===
# SECURITY: credentials are hardcoded in source — move them to config.ini or
# environment variables and rotate these keys.
DASHVECTOR_API_KEY = 'sk-55x6oBXypSlPHQ8NvPHfyBABcMIMUE0407A0FCC2A11F0B9C802831A608ABB'
DASHVECTOR_ENDPOINT = 'vrs-cn-2ml4jm42o0001r.dashvector.cn-hangzhou.aliyuncs.com'
# Obtained from the DashScope console (not Bailian / Model Studio).
DASHSCOPE_API_KEY = 'sk-d3f235925afa4e4e83d707dde04b9e52'


def get_embedding(text):
    """Return the 1536-dim text-embedding-v1 vector for *text*.

    Raises:
        Exception: if the DashScope HTTP API returns a non-200 status.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding-v1"
    headers = {
        "Authorization": f"Bearer {DASHSCOPE_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "input": {"texts": [text]},
        "model": "text-embedding-v1"
    }
    resp = requests.post(url, headers=headers, json=data)
    if resp.status_code == 200:
        return resp.json()["output"]["embeddings"][0]["embedding"]
    raise Exception(f"❌ Embedding API 错误: {resp.status_code} - {resp.text}")


# === Initialise the DashVector client ===
client = Client(api_key=DASHVECTOR_API_KEY, endpoint=DASHVECTOR_ENDPOINT)

# === Recreate the collection (text-embedding-v1 outputs 1536 dims) ===
collection_name = "medical_topics"
try:
    client.delete(collection_name)
except Exception:
    # Fix: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    # The collection may simply not exist yet; that is fine.
    pass
client.create(name=collection_name, dimension=1536)
collection = client.get(collection_name)
print("✅ 集合已创建并获取")

# === Insert data ===
topics = [
    "如何治疗阳痿、早泄和肾虚?",
    "早泄可以吃哪些中药?",
    "该如何治疗早泄?",
    "前列腺肥大是什么原因引起的?"
]
docs = []
for i, text in enumerate(topics, 1):
    emb = get_embedding(text)
    docs.append(Doc(id=f"topic_{i}", vector=emb, fields={"content": text}))
resp = collection.insert(docs)
if resp.success:
    print("✅ 4 条中文话题已成功插入!")
else:
    print("❌ 插入失败:", resp)
    # Fix: `exit()` is injected by the `site` module and is not guaranteed;
    # SystemExit is the reliable way to terminate a script with a status code.
    raise SystemExit(1)

# === Query test ===
query_text = "早泄的治疗方法有哪些?"
query_vec = get_embedding(query_text)
rets = collection.query(vector=query_vec, topk=3, output_fields=["content"])
if rets.success:
    print(f"\n🔍 查询 '{query_text}' 的结果:")
    for doc in rets.documents:
        print(f"  ID: {doc.id} | 相似度: {doc.score:.4f} | 内容: {doc.fields['content']}")
else:
    print("❌ 查询失败:", rets)

386
image_similarity_check.py Normal file
View File

@@ -0,0 +1,386 @@
# -*- coding: utf-8 -*-
"""
图片去重审核脚本 - DashScope 多模态版
采用: pHash预筛 + DashScope多模态Embedding + 异步批量处理
"""
import configparser
import logging
import asyncio
import aiohttp
import imagehash
import base64
import time
import dashscope
from dashscope import MultiModalEmbedding
from io import BytesIO
from typing import Optional, Tuple, List, Dict
import pymysql
from dashvector import Client, Doc
from PIL import Image
class ImageSimilarityChecker:
    """Image de-duplication reviewer (DashScope multimodal edition).

    Pipeline: pHash pre-filter + DashScope multimodal embedding +
    DashVector nearest-neighbour search, with status bookkeeping in MySQL
    (table ``ai_image_tags``).
    """

    def __init__(self, config_path: str = 'config.ini'):
        # All runtime settings (DB, DashVector, thresholds) come from an INI file.
        self.config = configparser.ConfigParser()
        self.config.read(config_path, encoding='utf-8')
        self._setup_logging()
        # Connections are opened lazily in connect_db()/connect_dashvector().
        self.db_conn = None
        self.dashvector_client = None
        self.collection = None
        # DashScope API key is installed globally on the SDK module.
        self.dashscope_api_key = self.config.get('dashscope', 'api_key')
        dashscope.api_key = self.dashscope_api_key
        # pHash cache {phash_str: image_tag_id}
        self.phash_cache: Dict[str, int] = {}
        # Tunables read from config.
        self.image_cdn_base = self.config.get('image', 'cdn_base')
        self.phash_threshold = self.config.getint('similarity', 'phash_threshold')
        self.vector_threshold = self.config.getfloat('similarity', 'vector_threshold')
        self.batch_size = self.config.getint('process', 'batch_size')
        self.concurrent_downloads = self.config.getint('process', 'concurrent_downloads')

    def _setup_logging(self):
        """Configure a file + console logger from the [process] config section."""
        log_level = self.config.get('process', 'log_level', fallback='INFO')
        log_file = self.config.get('process', 'log_file', fallback='image_similarity.log')
        self.logger = logging.getLogger(__name__)
        # Avoid attaching duplicate handlers if instantiated more than once.
        if not self.logger.handlers:
            self.logger.setLevel(getattr(logging, log_level))
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            fh = logging.FileHandler(log_file, encoding='utf-8')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
            sh = logging.StreamHandler()
            sh.setFormatter(formatter)
            self.logger.addHandler(sh)

    def connect_db(self):
        """Open the MySQL connection described by the [database] config section."""
        self.db_conn = pymysql.connect(
            host=self.config.get('database', 'host'),
            port=self.config.getint('database', 'port'),
            user=self.config.get('database', 'user'),
            password=self.config.get('database', 'password'),
            database=self.config.get('database', 'database'),
            charset=self.config.get('database', 'charset'),
            cursorclass=pymysql.cursors.DictCursor
        )
        self.logger.info("数据库连接成功")

    def connect_dashvector(self):
        """Connect to DashVector and ensure the target collection exists."""
        api_key = self.config.get('dashvector', 'api_key')
        endpoint = self.config.get('dashvector', 'endpoint')
        collection_name = self.config.get('dashvector', 'collection_name')
        dimension = self.config.getint('dashvector', 'vector_dimension')
        self.dashvector_client = Client(api_key=api_key, endpoint=endpoint)
        # Create the collection only when it does not exist yet.
        # NOTE(review): assumes Client.get() returns None for a missing
        # collection — confirm against the installed DashVector SDK version.
        existing = self.dashvector_client.get(collection_name)
        if existing is None:
            self.logger.info(f"创建集合 {collection_name},维度 {dimension}")
            self.dashvector_client.create(collection_name, dimension=dimension)
        else:
            self.logger.info(f"集合 {collection_name} 已存在,直接复用")
        self.collection = self.dashvector_client.get(collection_name)
        self.logger.info("DashVector 连接成功")

    def get_image_embedding(self, image_url: str = None, image_base64: str = None, max_retries: int = 5) -> Optional[List[float]]:
        """Fetch an image embedding via the DashScope multimodal SDK.

        Accepts either a URL or a base64 payload; retries with linear
        backoff on rate-limit responses (429/403).

        Returns the embedding vector, or None on failure / missing input.
        """
        for attempt in range(max_retries):
            try:
                # Build the SDK input from whichever source was supplied.
                if image_url:
                    input_data = [{'image': image_url}]
                elif image_base64:
                    input_data = [{'image': f'data:image/jpeg;base64,{image_base64}'}]
                else:
                    return None
                resp = MultiModalEmbedding.call(
                    model='multimodal-embedding-v1',
                    input=input_data
                )
                if resp.status_code == 200:
                    return resp.output['embeddings'][0]['embedding']
                elif resp.status_code in (429, 403):
                    # Rate-limited: linear backoff (3s, 6s, 9s, ...).
                    wait_time = 3 + attempt * 3
                    self.logger.warning(f"API 限流,等待 {wait_time} 秒后重试 ({attempt + 1}/{max_retries})...")
                    time.sleep(wait_time)
                else:
                    # Non-retryable API error: give up immediately.
                    self.logger.warning(f"Embedding API 错误: {resp.status_code} - {resp.message}")
                    return None
            except Exception as e:
                self.logger.warning(f"Embedding API 异常: {e}")
                time.sleep(2)
        return None

    def load_phash_cache(self):
        """Initialise the pHash cache.

        NOTE(review): currently a placeholder — nothing is loaded from the
        database, so check_phash_duplicate() always starts from an empty cache.
        """
        self.logger.info("pHash 缓存初始化完成")

    def compute_phash(self, image: Image.Image) -> str:
        """Return the perceptual hash (pHash) of *image* as a hex string."""
        return str(imagehash.phash(image))

    def check_phash_duplicate(self, phash: str) -> Tuple[bool, Optional[int], Optional[int]]:
        """Check *phash* against the in-memory cache.

        Returns (is_duplicate, matching_image_tag_id, hamming_distance);
        the last two are None when no cached hash is within phash_threshold.
        """
        phash_obj = imagehash.hex_to_hash(phash)
        # Linear scan: Hamming distance to every cached hash.
        for cached_phash, image_id in self.phash_cache.items():
            cached_obj = imagehash.hex_to_hash(cached_phash)
            distance = phash_obj - cached_obj
            if distance <= self.phash_threshold:
                return True, image_id, distance
        return False, None, None

    async def download_image_async(self, session: aiohttp.ClientSession,
                                   image_id: int, url: str) -> Tuple[int, Optional[Image.Image], Optional[bytes]]:
        """Download one image; returns (image_id, PIL image or None, raw bytes or None)."""
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status == 200:
                    data = await response.read()
                    image = Image.open(BytesIO(data)).convert('RGB')
                    return image_id, image, data
        except Exception as e:
            self.logger.warning(f"下载失败 ID={image_id}: {e}")
        # Non-200 responses and exceptions both fall through to a None result.
        return image_id, None, None

    async def download_images_batch(self, image_records: List[dict]) -> Dict[int, Tuple[Image.Image, bytes, str]]:
        """Download a batch of images concurrently.

        Returns {image_tag_id: (PIL image, raw bytes, source url)} for the
        downloads that succeeded; failed downloads are omitted.
        """
        images = {}
        connector = aiohttp.TCPConnector(limit=self.concurrent_downloads)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [
                self.download_image_async(session, rec['id'], rec['image_url'])
                for rec in image_records
            ]
            results = await asyncio.gather(*tasks)
            # gather() preserves task order, so index i maps back to image_records[i].
            for i, (image_id, image, data) in enumerate(results):
                if image is not None:
                    url = image_records[i]['image_url']
                    images[image_id] = (image, data, url)
        return images

    def search_similar(self, features: List[float], exclude_id: int) -> Tuple[bool, Optional[int], Optional[float]]:
        """Search DashVector for an image similar to *features*.

        Skips *exclude_id* (the image itself). Returns
        (found, similar_image_tag_id, similarity); similarity is 1 - score,
        since lower DashVector scores mean closer matches.
        """
        try:
            results = self.collection.query(features, topk=3)
            if results and results.output:
                for doc in results.output:
                    similar_id = int(doc.id)
                    if similar_id == exclude_id:
                        continue
                    # score is a distance (smaller = more similar); convert to a
                    # similarity. NOTE(review): 1 - score is only meaningful for a
                    # cosine-style metric — confirm the collection's configuration.
                    similarity = 1.0 - doc.score
                    self.logger.info(f"搜索到: {similar_id}, 距离={doc.score:.4f}, 相似度={similarity:.4f}")
                    if similarity >= self.vector_threshold:
                        return True, similar_id, similarity
            return False, None, None
        except Exception as e:
            self.logger.warning(f"搜索失败: {e}")
            return False, None, None

    def upsert_to_dashvector(self, image_id: int, features: List[float]):
        """Store *features* in DashVector keyed by the image's id."""
        try:
            doc = Doc(id=str(image_id), vector=features)
            result = self.collection.upsert([doc])
            if result.code == 0:
                self.logger.info(f"向量入库成功: {image_id}")
            else:
                self.logger.warning(f"向量入库失败 ID={image_id}: code={result.code}, msg={result.message}")
        except Exception as e:
            self.logger.warning(f"存入 DashVector 异常 ID={image_id}: {e}")

    def get_draft_images(self) -> List[dict]:
        """Fetch the next batch of unprocessed images (status and similarity both 'draft')."""
        with self.db_conn.cursor() as cursor:
            sql = """
                SELECT id, image_id, image_url, image_thumb_url, image_name
                FROM ai_image_tags
                WHERE status = 'draft' AND similarity = 'draft'
                AND image_url != '' AND image_url IS NOT NULL
                ORDER BY id ASC
                LIMIT %s
            """
            cursor.execute(sql, (self.batch_size,))
            return cursor.fetchall()

    def update_as_duplicate(self, image_id: int, similar_id: int, score: float):
        """Mark *image_id* as a duplicate of *similar_id* with the given score."""
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'similarity',
                    similarity = 'yes',
                    similarity_image_tags_id = %s,
                    `similarity score` = %s,
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (similar_id, score, image_id))
            self.db_conn.commit()
            self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")

    def update_as_unique(self, image_id: int):
        """Mark *image_id* as unique and advance it to the tag_extension stage."""
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'tag_extension',
                    similarity = 'calc',
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (image_id,))
            self.db_conn.commit()
            self.logger.info(f"不重复: {image_id} -> tag_extension")

    def update_as_failed(self, image_id: int, reason: str):
        """Mark *image_id* as failed: back to 'draft' with similarity='recalc' for retry.

        NOTE(review): *reason* is accepted but never persisted by this SQL.
        """
        with self.db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'draft',
                    similarity = 'recalc',
                    updated_at = NOW()
                WHERE id = %s
            """
            cursor.execute(sql, (image_id,))
            self.db_conn.commit()

    def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
        """Process one batch of images.

        Returns (duplicate_count, unique_count, failed_count).
        """
        if not image_records:
            return 0, 0, 0
        duplicates = 0
        unique = 0
        failed = 0
        for rec in image_records:
            image_id = rec['id']
            # Skip rows without a usable image URL.
            if not rec['image_url'] or rec['image_url'].strip() == '':
                self.logger.warning(f"图像URL为空跳过处理: {image_id}")
                self.update_as_failed(image_id, "图像URL为空")
                failed += 1
                continue
            # Build the full CDN URL (original image, not the thumbnail).
            full_url = f"{self.image_cdn_base}{rec['image_url']}"
            try:
                # Throttle: the free tier allows 2 QPS.
                time.sleep(0.5)
                self.logger.info(f"获取 Embedding: {image_id} -> {full_url}")
                # Pass the URL straight to DashScope (no local download needed).
                features = self.get_image_embedding(image_url=full_url)
                if features is None:
                    self.logger.warning(f"Embedding 获取失败: {image_id}")
                    self.update_as_failed(image_id, "Embedding API 失败")
                    failed += 1
                    continue
                # Look for near-duplicates already stored in DashVector.
                is_dup, similar_id, score = self.search_similar(features, image_id)
                if is_dup:
                    self.update_as_duplicate(image_id, similar_id, score)
                    duplicates += 1
                else:
                    # New image: store its vector, then mark it unique.
                    self.upsert_to_dashvector(image_id, features)
                    self.update_as_unique(image_id)
                    unique += 1
            except Exception as e:
                self.logger.error(f"处理失败 {image_id}: {e}")
                self.update_as_failed(image_id, str(e)[:200])
                failed += 1
                continue
        return duplicates, unique, failed

    def run(self):
        """Main loop: keep pulling 'draft' batches until none remain."""
        self.logger.info("=" * 60)
        self.logger.info("图片去重审核 - DashScope 多模态版")
        self.logger.info("=" * 60)
        # Set up connections and caches before processing.
        self.connect_db()
        self.connect_dashvector()
        self.load_phash_cache()
        total_duplicates = 0
        total_unique = 0
        batch_num = 0
        try:
            while True:
                images = self.get_draft_images()
                if not images:
                    self.logger.info("没有待处理的图片")
                    break
                batch_num += 1
                self.logger.info(f"\n--- 批次 {batch_num}: {len(images)} 张 ---")
                dup, uniq, fail = self.process_batch(images)
                total_duplicates += dup
                total_unique += uniq
                # NOTE(review): per-batch failure counts are logged but not
                # accumulated into the final summary below.
                self.logger.info(f"批次结果: 重复={dup}, 不重复={uniq}, 失败={fail}")
        finally:
            # Always close the DB connection and emit the summary, even on error.
            if self.db_conn:
                self.db_conn.close()
            self.logger.info("=" * 60)
            self.logger.info(f"完成! 总重复: {total_duplicates}, 总不重复: {total_unique}")
            self.logger.info("=" * 60)
if __name__ == '__main__':
    # Entry point: config path is fixed to config.ini in the working directory.
    ImageSimilarityChecker('config.ini').run()

59
query_status.py Normal file
View File

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
查询图片相似度状态脚本
"""
import configparser
import pymysql
def main():
    """Print a per-row status report for ai_image_tags plus summary counts."""
    cfg = configparser.ConfigParser()
    cfg.read('config.ini', encoding='utf-8')
    conn = pymysql.connect(
        host=cfg.get('database', 'host'),
        port=cfg.getint('database', 'port'),
        user=cfg.get('database', 'user'),
        password=cfg.get('database', 'password'),
        database=cfg.get('database', 'database'),
        charset=cfg.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor
    )
    with conn.cursor() as cursor:
        sql = """
            SELECT id, image_name, status, similarity,
                similarity_image_tags_id, `similarity score`, blocking_reason
            FROM ai_image_tags
            ORDER BY id
        """
        cursor.execute(sql)
        rows = cursor.fetchall()
    conn.close()
    # Tally the pipeline states in a single pass. The status counters and the
    # 'recalc' failure counter are independent, matching the original report.
    total = len(rows)
    draft_count = unique_count = dup_count = failed_count = 0
    for r in rows:
        if r['status'] == 'draft':
            draft_count += 1
        elif r['status'] == 'tag_extension':
            unique_count += 1
        elif r['status'] == 'similarity':
            dup_count += 1
        if r['similarity'] == 'recalc':
            failed_count += 1
    divider = "=" * 100
    print(divider)
    print(f"{'ID':<8} {'图片名称':<30} {'状态':<15} {'相似性':<8} {'相似ID':<8} {'分数':<8} {'原因'}")
    print(divider)
    for r in rows:
        # NULL / 0 values render as "-" in the table.
        score = f"{r['similarity score']:.4f}" if r['similarity score'] else "-"
        similar_id = r['similarity_image_tags_id'] if r['similarity_image_tags_id'] else "-"
        reason = r['blocking_reason'][:20] if r['blocking_reason'] else "-"
        print(f"{r['id']:<8} {r['image_name'][:28]:<30} {r['status']:<15} {r['similarity']:<8} {similar_id:<8} {score:<8} {reason}")
    print(divider)
    print(f"总计: {total} | 待处理: {draft_count} | 不重复: {unique_count} | 重复: {dup_count} | 失败: {failed_count}")
    print(divider)


if __name__ == '__main__':
    main()

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
pymysql>=1.0.0
dashvector>=1.0.0
dashscope>=1.14.0
Pillow>=9.0.0
aiohttp>=3.8.0
imagehash>=4.3.0
requests>=2.28.0

51
reset_data.py Normal file
View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
重置关系数据库脚本
"""
import configparser
import pymysql
def main():
    """Reset every ai_image_tags row to the initial 'draft' state.

    Clears previous similarity results so image_similarity_check.py can
    reprocess all images, then prints how many rows were touched.
    """
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')
    print("=" * 50)
    print("重置 MySQL 数据")
    print("=" * 50)
    db_conn = pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor
    )
    try:
        with db_conn.cursor() as cursor:
            sql = """
                UPDATE ai_image_tags
                SET status = 'draft',
                    similarity = 'draft',
                    similarity_image_tags_id = 0,
                    `similarity score` = 0
                WHERE status != 'draft' OR similarity != 'draft'
            """
            affected = cursor.execute(sql)
            db_conn.commit()
            cursor.execute("SELECT COUNT(*) as total FROM ai_image_tags WHERE status = 'draft'")
            result = cursor.fetchone()
    finally:
        # Bug fix: the connection previously leaked if either query raised.
        db_conn.close()
    print(f"更新记录数: {affected}")
    print(f"当前 draft 状态总数: {result['total']}")
    print("=" * 50)


if __name__ == '__main__':
    main()

54
reset_vector.py Normal file
View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
"""
重置向量数据库集合脚本
"""
import configparser
import time
from dashvector import Client
def main():
    """Drop and recreate the DashVector collection used for image vectors."""
    cfg = configparser.ConfigParser()
    cfg.read('config.ini', encoding='utf-8')
    banner = "=" * 50
    print(banner)
    print("重置 DashVector 集合")
    print(banner)
    collection_name = cfg.get('dashvector', 'collection_name')
    dimension = cfg.getint('dashvector', 'vector_dimension')
    client = Client(
        api_key=cfg.get('dashvector', 'api_key'),
        endpoint=cfg.get('dashvector', 'endpoint'),
    )
    # Drop the old collection first; code 0 means it existed and was removed.
    if client.delete(collection_name).code == 0:
        print(f"已删除集合: {collection_name}")
        print("等待删除完成...")
        time.sleep(3)
    else:
        print(f"集合不存在,跳过删除")
    # Recreate it with the configured dimension; bail out on failure.
    create_result = client.create(collection_name, dimension=dimension)
    if create_result.code != 0:
        print(f"创建失败: {create_result.message}")
        return
    print(f"已创建集合: {collection_name} (维度={dimension})")
    # Sanity-check that the new collection is visible in the listing.
    if collection_name in client.list().output:
        print(f"验证通过: 集合已在列表中")
    else:
        print(f"警告: 集合未出现在列表中,请稍后重试")
    print(banner)


if __name__ == '__main__':
    main()