142 lines
5.2 KiB
Python
142 lines
5.2 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
Image similarity statistics script.

Reports the number of images in each processing state and the duplicate rate.
"""
|
||
|
|
|
||
|
|
import configparser
|
||
|
|
import pymysql
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
|
||
|
|
# (key, report label, SQL WHERE clause) for each status bucket in section 1.
# The WHERE clauses are fixed literals defined here, never user input, so
# interpolating them into the COUNT query is safe.
_STATUS_BUCKETS = (
    ("pending", "待处理 (draft/draft)",
     "status = 'draft' AND similarity = 'draft'"),
    ("duplicate", "重复图片 (similarity/yes)",
     "status = 'similarity' AND similarity = 'yes'"),
    ("unique", "不重复图片 (tag_extension/calc)",
     "status = 'tag_extension' AND similarity = 'calc'"),
    ("recalc", "待重算 (draft/recalc)",
     "status = 'draft' AND similarity = 'recalc'"),
)


def _print_overview(cursor):
    """Print section 1 (row count per bucket plus the table total).

    Returns:
        tuple: ``(counts, total)`` where ``counts`` maps each bucket key of
        ``_STATUS_BUCKETS`` to its row count and ``total`` is the number of
        rows in ``ai_image_tags``.
    """
    print("\n【一、总体统计】\n")

    counts = {}
    for key, label, condition in _STATUS_BUCKETS:
        cursor.execute(f"SELECT COUNT(*) as cnt FROM ai_image_tags WHERE {condition}")
        counts[key] = cursor.fetchone()['cnt']
        print(f" {label:<35} : {counts[key]:>10,} 张")

    cursor.execute("SELECT COUNT(*) as cnt FROM ai_image_tags")
    total = cursor.fetchone()['cnt']
    print(f" {'─' * 50}")
    print(f" {'总计':<35} : {total:>10,} 张")
    return counts, total


def _print_duplicate_rate(duplicate_count, unique_count):
    """Print section 2: the share of duplicates among processed images."""
    print("\n【二、重复率分析】\n")

    processed = duplicate_count + unique_count
    if processed <= 0:
        print(" 暂无已处理的图片数据")
        return

    duplicate_rate = (duplicate_count / processed) * 100
    print(f" 已处理图片数 : {processed:>10,} 张")
    print(f" 重复图片数 : {duplicate_count:>10,} 张")
    print(f" 不重复图片数 : {unique_count:>10,} 张")
    print(f" 重复率 : {duplicate_rate:>10.2f} %")


def _print_progress(counts, processed, total):
    """Print section 3: processed rows as a percentage of all rows."""
    print("\n【三、处理进度】\n")

    if total <= 0:
        # Keep the section from being an empty heading on an empty table.
        print(" 暂无图片数据")
        return

    progress = (processed / total) * 100
    print(f" 总进度 : {progress:>10.2f} %")
    print(f" 待处理 : {counts['pending']:>10,} 张")
    print(f" 待重算 : {counts['recalc']:>10,} 张")
    print(f" 已完成 : {processed:>10,} 张")


def _print_score_distribution(cursor):
    """Print section 4: duplicate rows grouped into similarity-score ranges."""
    print("\n【四、相似度分数分布】\n")

    # Order by the scores themselves rather than by the label text: sorting
    # the labels makes the position of the '< 0.94' bucket depend on how the
    # connection collation ranks '<' against digits.
    cursor.execute("""
        SELECT
            CASE
                WHEN similarity_score >= 0.99 THEN '0.99-1.00 (几乎相同)'
                WHEN similarity_score >= 0.97 THEN '0.97-0.99 (非常相似)'
                WHEN similarity_score >= 0.95 THEN '0.95-0.97 (高度相似)'
                WHEN similarity_score >= 0.94 THEN '0.94-0.95 (相似)'
                ELSE '< 0.94 (其他)'
            END as score_range,
            COUNT(*) as cnt
        FROM ai_image_tags
        WHERE status = 'similarity' AND similarity = 'yes'
        GROUP BY score_range
        ORDER BY MIN(similarity_score) DESC
    """)

    score_stats = cursor.fetchall()
    if not score_stats:
        print(" 暂无重复图片数据")
        return

    for row in score_stats:
        print(f" {row['score_range']:<25} : {row['cnt']:>10,} 张")


def _print_recent_duplicates(cursor):
    """Print section 5: the ten most recently updated duplicate rows."""
    print("\n【五、最近 10 条重复记录】\n")

    cursor.execute("""
        SELECT id, image_name, similarity_image_tags_id, similarity_score, updated_at
        FROM ai_image_tags
        WHERE status = 'similarity' AND similarity = 'yes'
        ORDER BY updated_at DESC
        LIMIT 10
    """)

    recent = cursor.fetchall()
    if not recent:
        print(" 暂无重复记录")
        return

    print(f" {'ID':<10} {'相似ID':<10} {'分数':<10} {'更新时间':<20}")
    print(f" {'-' * 55}")
    for row in recent:
        # Test against None explicitly: a score of 0 is a real value, and a
        # NULL column cannot be formatted with an alignment spec.
        score = row['similarity_score']
        score_text = f"{score:.4f}" if score is not None else "-"

        similar_id = row['similarity_image_tags_id']
        similar_text = str(similar_id) if similar_id is not None else "-"

        updated_at = row['updated_at']
        updated = updated_at.strftime('%Y-%m-%d %H:%M') if updated_at is not None else "-"

        print(f" {row['id']:<10} {similar_text:<10} {score_text:<10} {updated:<20}")


def main():
    """Print an image-similarity statistics report to stdout.

    Reads the ``[database]`` section of ``config.ini`` (resolved against the
    current working directory), connects to MySQL through ``pymysql`` and
    prints five sections computed from the ``ai_image_tags`` table: overall
    counts, duplicate rate, processing progress, similarity-score
    distribution and the ten most recent duplicate rows.

    Raises:
        FileNotFoundError: if ``config.ini`` cannot be read.
        configparser.Error: if a required ``[database]`` option is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse, so an empty result means no config at all.
    if not config.read('config.ini', encoding='utf-8'):
        raise FileNotFoundError(
            "config.ini not found or not readable in the current working directory"
        )

    db_conn = pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor,
    )

    # try/finally so the connection is released even when a query or a
    # formatting step fails part-way through the report.
    try:
        print("=" * 70)
        print(f"图片相似度统计报告 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)

        with db_conn.cursor() as cursor:
            counts, total = _print_overview(cursor)
            _print_duplicate_rate(counts['duplicate'], counts['unique'])
            _print_progress(counts, counts['duplicate'] + counts['unique'], total)
            _print_score_distribution(cursor)
            _print_recent_duplicates(cursor)
    finally:
        db_conn.close()

    print("\n" + "=" * 70)
    print("统计完成")
    print("=" * 70)
|
||
|
|
|
||
|
|
|
||
|
|
# Generate the report only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
|