# -*- coding: utf-8 -*-
"""
Image similarity statistics script.

Connects to the database described in ``config.ini`` and prints a report
about the ``ai_image_tags`` table: image counts per processing state, the
duplicate rate, overall progress, the similarity-score distribution of
duplicates and the most recently updated duplicate records.
"""

import configparser
from datetime import datetime

import pymysql

# Labels double as display text and as keys of the per-state count mapping.
_LABEL_PENDING = "待处理 (draft/draft)"
_LABEL_DUPLICATE = "重复图片 (similarity/yes)"
_LABEL_UNIQUE = "不重复图片 (tag_extension/calc)"
_LABEL_RECALC = "待重算 (draft/recalc)"

# (label, WHERE condition) pairs. The conditions are hard-coded constants,
# never user input, which is why they are interpolated into the SQL text.
_STATE_FILTERS = (
    (_LABEL_PENDING, "status = 'draft' AND similarity = 'draft'"),
    (_LABEL_DUPLICATE, "status = 'similarity' AND similarity = 'yes'"),
    (_LABEL_UNIQUE, "status = 'tag_extension' AND similarity = 'calc'"),
    (_LABEL_RECALC, "status = 'draft' AND similarity = 'recalc'"),
)

# Display order of the score buckets, highest first. Sorting the label
# strings in SQL is collation-dependent (the '<' of the catch-all bucket may
# sort before or after the digits), so the order is fixed here instead.
_SCORE_BUCKET_ORDER = (
    '0.99-1.00 (几乎相同)',
    '0.97-0.99 (非常相似)',
    '0.95-0.97 (高度相似)',
    '0.94-0.95 (相似)',
    '< 0.94 (其他)',
)

_SCORE_DISTRIBUTION_SQL = """
    SELECT
        CASE
            WHEN similarity_score >= 0.99 THEN '0.99-1.00 (几乎相同)'
            WHEN similarity_score >= 0.97 THEN '0.97-0.99 (非常相似)'
            WHEN similarity_score >= 0.95 THEN '0.95-0.97 (高度相似)'
            WHEN similarity_score >= 0.94 THEN '0.94-0.95 (相似)'
            ELSE '< 0.94 (其他)'
        END as score_range,
        COUNT(*) as cnt
    FROM ai_image_tags
    WHERE status = 'similarity' AND similarity = 'yes'
    GROUP BY score_range
"""

_RECENT_DUPLICATES_SQL = """
    SELECT id, image_name, similarity_image_tags_id, similarity_score, updated_at
    FROM ai_image_tags
    WHERE status = 'similarity' AND similarity = 'yes'
    ORDER BY updated_at DESC
    LIMIT 10
"""


def _open_connection(config_path='config.ini'):
    """Open a PyMySQL connection using the ``[database]`` section of *config_path*.

    Raises:
        FileNotFoundError: if the configuration file could not be read.
            ``ConfigParser.read`` silently skips missing files, which would
            otherwise surface later as a confusing ``NoSectionError``.
    """
    config = configparser.ConfigParser()
    if not config.read(config_path, encoding='utf-8'):
        raise FileNotFoundError(
            f"Configuration file not found or unreadable: {config_path}"
        )

    return pymysql.connect(
        host=config.get('database', 'host'),
        port=config.getint('database', 'port'),
        user=config.get('database', 'user'),
        password=config.get('database', 'password'),
        database=config.get('database', 'database'),
        charset=config.get('database', 'charset'),
        cursorclass=pymysql.cursors.DictCursor,
    )


def _count_rows(cursor, condition=None) -> int:
    """Return the number of ``ai_image_tags`` rows, optionally filtered by *condition*."""
    sql = "SELECT COUNT(*) as cnt FROM ai_image_tags"
    if condition is not None:
        sql = f"{sql} WHERE {condition}"
    cursor.execute(sql)
    return cursor.fetchone()['cnt']


def _print_overview(cursor):
    """Print section 1 (counts per state) and return ``(results, total)``."""
    print("\n【一、总体统计】\n")

    results = {}
    for name, condition in _STATE_FILTERS:
        count = _count_rows(cursor, condition)
        results[name] = count
        print(f" {name:<35} : {count:>10,} 张")

    total = _count_rows(cursor)
    print(f" {'─' * 50}")
    print(f" {'总计':<35} : {total:>10,} 张")
    return results, total


def _print_duplicate_rate(results) -> int:
    """Print section 2 (duplicate rate) and return the processed image count."""
    print("\n【二、重复率分析】\n")

    duplicate_count = results[_LABEL_DUPLICATE]
    unique_count = results[_LABEL_UNIQUE]
    processed = duplicate_count + unique_count

    if processed > 0:
        duplicate_rate = (duplicate_count / processed) * 100
        print(f" 已处理图片数 : {processed:>10,} 张")
        print(f" 重复图片数 : {duplicate_count:>10,} 张")
        print(f" 不重复图片数 : {unique_count:>10,} 张")
        print(f" 重复率 : {duplicate_rate:>10.2f} %")
    else:
        print(" 暂无已处理的图片数据")
    return processed


def _print_progress(results, total, processed) -> None:
    """Print section 3 (processing progress); the body is skipped for an empty table."""
    print("\n【三、处理进度】\n")

    pending = results[_LABEL_PENDING]
    recalc = results[_LABEL_RECALC]

    if total > 0:
        progress = (processed / total) * 100
        print(f" 总进度 : {progress:>10.2f} %")
        print(f" 待处理 : {pending:>10,} 张")
        print(f" 待重算 : {recalc:>10,} 张")
        print(f" 已完成 : {processed:>10,} 张")


def _print_score_distribution(cursor) -> None:
    """Print section 4: similarity-score buckets of duplicate images, highest first."""
    print("\n【四、相似度分数分布】\n")

    cursor.execute(_SCORE_DISTRIBUTION_SQL)
    score_stats = cursor.fetchall()

    if score_stats:
        rank = {label: index for index, label in enumerate(_SCORE_BUCKET_ORDER)}
        # Unknown labels (should not occur) are pushed to the end.
        ordered = sorted(
            score_stats,
            key=lambda row: rank.get(row['score_range'], len(rank)),
        )
        for row in ordered:
            print(f" {row['score_range']:<25} : {row['cnt']:>10,} 张")
    else:
        print(" 暂无重复图片数据")


def _print_recent_duplicates(cursor) -> None:
    """Print section 5: the 10 most recently updated duplicate records."""
    print("\n【五、最近 10 条重复记录】\n")

    cursor.execute(_RECENT_DUPLICATES_SQL)
    recent = cursor.fetchall()

    if not recent:
        print(" 暂无重复记录")
        return

    print(f" {'ID':<10} {'相似ID':<10} {'分数':<10} {'更新时间':<20}")
    print(f" {'-' * 55}")
    for row in recent:
        raw_score = row['similarity_score']
        raw_updated = row['updated_at']
        raw_similar_id = row['similarity_image_tags_id']

        # Compare against None explicitly: a score of 0 is a real value, and
        # formatting None with an alignment spec would raise TypeError.
        score = f"{raw_score:.4f}" if raw_score is not None else "-"
        updated = (
            raw_updated.strftime('%Y-%m-%d %H:%M')
            if raw_updated is not None
            else "-"
        )
        similar_id = raw_similar_id if raw_similar_id is not None else "-"

        print(f" {row['id']:<10} {similar_id:<10} {score:<10} {updated:<20}")


def main():
    """Print the full image-similarity statistics report to stdout.

    Reads the database settings from ``config.ini`` in the current working
    directory, runs the report queries and always closes the connection,
    even when a query or a formatting step fails.
    """
    db_conn = _open_connection('config.ini')
    try:
        print("=" * 70)
        print(f"图片相似度统计报告 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)

        with db_conn.cursor() as cursor:
            # 1. Overall statistics
            results, total = _print_overview(cursor)
            # 2. Duplicate rate
            processed = _print_duplicate_rate(results)
            # 3. Processing progress
            _print_progress(results, total, processed)
            # 4. Similarity score distribution (duplicates only)
            _print_score_distribution(cursor)
            # 5. Most recent duplicate records
            _print_recent_duplicates(cursor)
    finally:
        db_conn.close()

    print("\n" + "=" * 70)
    print("统计完成")
    print("=" * 70)


if __name__ == '__main__':
    main()