refactor: 移除重算脚本部署,数据库连接启用 autocommit 模式,空闲轮询间隔由 10 秒缩短为 2 秒

This commit is contained in:
2026-02-05 23:53:05 +08:00
parent 40b861ae3d
commit 76fcb106f3
4 changed files with 33 additions and 82 deletions

View File

@@ -9,7 +9,7 @@
- 支持 pHash 感知哈希预筛选 - 支持 pHash 感知哈希预筛选
- 异步批量下载和处理图片 - 异步批量下载和处理图片
- 自动标记重复图片并记录相似度分数 - 自动标记重复图片并记录相似度分数
- 守护模式运行,无数据时等待 10 秒后继续检查 - 守护模式运行,无数据时等待 2 秒后继续检查
## 环境要求 ## 环境要求
@@ -76,16 +76,10 @@ python stats_similarity.py
## 服务器部署 ## 服务器部署
```bash ```bash
# 启动所有服务 # 启动服务
./start_similarity.sh start ./start_similarity.sh start
# 只启动主处理脚本 # 停止服务
./start_similarity.sh start-check
# 只启动重算脚本
./start_similarity.sh start-recalc
# 停止所有服务
./start_similarity.sh stop ./start_similarity.sh stop
# 强制停止 # 强制停止

View File

@@ -77,7 +77,8 @@ class ImageSimilarityChecker:
password=self.config.get('database', 'password'), password=self.config.get('database', 'password'),
database=self.config.get('database', 'database'), database=self.config.get('database', 'database'),
charset=self.config.get('database', 'charset'), charset=self.config.get('database', 'charset'),
cursorclass=pymysql.cursors.DictCursor cursorclass=pymysql.cursors.DictCursor,
autocommit=True
) )
self.logger.info("数据库连接成功") self.logger.info("数据库连接成功")
@@ -254,7 +255,6 @@ class ImageSimilarityChecker:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (similar_id, score, image_id)) cursor.execute(sql, (similar_id, score, image_id))
self.db_conn.commit()
self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})") self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")
def update_as_unique(self, image_id: int): def update_as_unique(self, image_id: int):
@@ -268,7 +268,6 @@ class ImageSimilarityChecker:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (image_id,)) cursor.execute(sql, (image_id,))
self.db_conn.commit()
self.logger.info(f"不重复: {image_id} -> tag_extension") self.logger.info(f"不重复: {image_id} -> tag_extension")
def update_as_failed(self, image_id: int, reason: str): def update_as_failed(self, image_id: int, reason: str):
@@ -282,7 +281,6 @@ class ImageSimilarityChecker:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (image_id,)) cursor.execute(sql, (image_id,))
self.db_conn.commit()
def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]: def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
"""处理一批图片,返回 (重复数, 不重复数, 失败数)""" """处理一批图片,返回 (重复数, 不重复数, 失败数)"""
@@ -360,8 +358,8 @@ class ImageSimilarityChecker:
images = self.get_draft_images() images = self.get_draft_images()
if not images: if not images:
self.logger.info("没有待处理的图片,等待 10 秒后继续检查...") self.logger.info("没有待处理的图片,等待 2 秒后继续检查...")
time.sleep(10) time.sleep(2)
continue continue
batch_num += 1 batch_num += 1

View File

@@ -65,7 +65,8 @@ class ImageSimilarityRecalc:
password=self.config.get('database', 'password'), password=self.config.get('database', 'password'),
database=self.config.get('database', 'database'), database=self.config.get('database', 'database'),
charset=self.config.get('database', 'charset'), charset=self.config.get('database', 'charset'),
cursorclass=pymysql.cursors.DictCursor cursorclass=pymysql.cursors.DictCursor,
autocommit=True
) )
self.logger.info("数据库连接成功") self.logger.info("数据库连接成功")
@@ -165,7 +166,6 @@ class ImageSimilarityRecalc:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (similar_id, score, image_id)) cursor.execute(sql, (similar_id, score, image_id))
self.db_conn.commit()
self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})") self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")
def update_as_unique(self, image_id: int): def update_as_unique(self, image_id: int):
@@ -179,7 +179,6 @@ class ImageSimilarityRecalc:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (image_id,)) cursor.execute(sql, (image_id,))
self.db_conn.commit()
self.logger.info(f"不重复: {image_id} -> tag_extension") self.logger.info(f"不重复: {image_id} -> tag_extension")
def update_as_failed(self, image_id: int, reason: str): def update_as_failed(self, image_id: int, reason: str):
@@ -191,7 +190,6 @@ class ImageSimilarityRecalc:
WHERE id = %s WHERE id = %s
""" """
cursor.execute(sql, (image_id,)) cursor.execute(sql, (image_id,))
self.db_conn.commit()
self.logger.warning(f"处理失败 {image_id}: {reason}") self.logger.warning(f"处理失败 {image_id}: {reason}")
def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]: def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
@@ -263,8 +261,8 @@ class ImageSimilarityRecalc:
images = self.get_recalc_images() images = self.get_recalc_images()
if not images: if not images:
self.logger.info("没有需要重新计算的图片,等待 10 秒后继续检查...") self.logger.info("没有需要重新计算的图片,等待 2 秒后继续检查...")
time.sleep(10) time.sleep(2)
continue continue
batch_num += 1 batch_num += 1

View File

@@ -15,12 +15,6 @@ CHECK_PID_FILE="${BASE_DIR}/image_similarity_check.pid"
CHECK_LOG_FILE="${BASE_DIR}/image_similarity.log" CHECK_LOG_FILE="${BASE_DIR}/image_similarity.log"
CHECK_MAX_PROCESSES=1 # 限制最多1个进程 CHECK_MAX_PROCESSES=1 # 限制最多1个进程
# image_similarity_recalc 配置
RECALC_SCRIPT="${BASE_DIR}/image_similarity_recalc.py"
RECALC_PID_FILE="${BASE_DIR}/image_similarity_recalc.pid"
RECALC_LOG_FILE="${BASE_DIR}/image_similarity_recalc.log"
RECALC_MAX_PROCESSES=1 # 限制最多1个进程
# stats_similarity 配置(仅用于查看状态,不需要常驻进程) # stats_similarity 配置(仅用于查看状态,不需要常驻进程)
STATS_SCRIPT="${BASE_DIR}/stats_similarity.py" STATS_SCRIPT="${BASE_DIR}/stats_similarity.py"
@@ -175,12 +169,12 @@ stop_single_all() {
# 启动所有服务 # 启动所有服务
start() { start() {
echo -e "${BLUE}========== 启动图片去重审核系统 ==========${NC}" echo -e "${BLUE}========== 启动图片去重审核系统 ===========${NC}"
echo -e "${YELLOW}进程限制每个服务最多启动1个实例${NC}" echo -e "${YELLOW}进程限制每个服务最多启动1个实例${NC}"
echo "" echo ""
# 检查总进程数 # 检查总进程数
local total_before=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local total_before=$(pgrep -f "image_similarity_check" | wc -l)
if [ $total_before -gt 0 ]; then if [ $total_before -gt 0 ]; then
echo -e "${YELLOW}当前已有 ${total_before} 个审核进程在运行${NC}" echo -e "${YELLOW}当前已有 ${total_before} 个审核进程在运行${NC}"
echo -e "${YELLOW}建议先停止现有进程: $0 stop${NC}" echo -e "${YELLOW}建议先停止现有进程: $0 stop${NC}"
@@ -194,17 +188,11 @@ start() {
if ! start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES"; then if ! start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES"; then
has_errors=1 has_errors=1
fi fi
sleep 3
# 启动重算脚本
if ! start_single "$RECALC_SCRIPT" "$RECALC_PID_FILE" "$RECALC_LOG_FILE" "image_similarity_recalc" "$RECALC_MAX_PROCESSES"; then
has_errors=1
fi
echo "" echo ""
# 检查启动结果 # 检查启动结果
local total_after=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local total_after=$(pgrep -f "image_similarity_check" | wc -l)
if [ $has_errors -eq 0 ]; then if [ $has_errors -eq 0 ]; then
echo -e "${GREEN}✅ 启动完成,当前运行 ${total_after} 个进程${NC}" echo -e "${GREEN}✅ 启动完成,当前运行 ${total_after} 个进程${NC}"
else else
@@ -216,40 +204,27 @@ start() {
# 只启动主脚本 # 只启动主脚本
start-check() { start-check() {
echo -e "${BLUE}========== 启动主处理脚本 ==========${NC}" echo -e "${BLUE}========== 启动主处理脚本 ===========${NC}"
start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES" start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES"
echo -e "${BLUE}====================================${NC}" echo -e "${BLUE}====================================${NC}"
} }
# 只启动重算脚本
start-recalc() {
echo -e "${BLUE}========== 启动重算脚本 ==========${NC}"
start_single "$RECALC_SCRIPT" "$RECALC_PID_FILE" "$RECALC_LOG_FILE" "image_similarity_recalc" "$RECALC_MAX_PROCESSES"
echo -e "${BLUE}==================================${NC}"
}
# 停止所有服务 # 停止所有服务
stop() { stop() {
echo -e "${BLUE}========== 停止图片去重审核系统 ==========${NC}" echo -e "${BLUE}========== 停止图片去重审核系统 ===========${NC}"
local has_errors=0 local has_errors=0
local total_before=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local total_before=$(pgrep -f "image_similarity_check" | wc -l)
echo -e "${YELLOW}当前共有 ${total_before} 个审核进程${NC}" echo -e "${YELLOW}当前共有 ${total_before} 个审核进程${NC}"
# 停止重算脚本
if ! stop_single_all "$RECALC_SCRIPT" "image_similarity_recalc" "$RECALC_PID_FILE"; then
has_errors=1
fi
sleep 2
# 停止主处理脚本 # 停止主处理脚本
if ! stop_single_all "$CHECK_SCRIPT" "image_similarity_check" "$CHECK_PID_FILE"; then if ! stop_single_all "$CHECK_SCRIPT" "image_similarity_check" "$CHECK_PID_FILE"; then
has_errors=1 has_errors=1
fi fi
# 最终检查 # 最终检查
local total_after=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local total_after=$(pgrep -f "image_similarity_check" | wc -l)
echo "" echo ""
if [ $total_after -eq 0 ]; then if [ $total_after -eq 0 ]; then
@@ -265,11 +240,10 @@ stop() {
# 强制停止所有服务 # 强制停止所有服务
force-stop() { force-stop() {
echo -e "${RED}========== 强制停止所有审核进程 ==========${NC}" echo -e "${RED}========== 强制停止所有审核进程 ===========${NC}"
# 停止所有相关进程 # 停止所有相关进程
pkill -9 -f "image_similarity_check.py" 2>/dev/null pkill -9 -f "image_similarity_check.py" 2>/dev/null
pkill -9 -f "image_similarity_recalc.py" 2>/dev/null
sleep 2 sleep 2
@@ -277,13 +251,13 @@ force-stop() {
rm -f "${BASE_DIR}"/*.pid rm -f "${BASE_DIR}"/*.pid
# 检查是否还有残留 # 检查是否还有残留
local remaining=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local remaining=$(pgrep -f "image_similarity_check" | wc -l)
if [ $remaining -eq 0 ]; then if [ $remaining -eq 0 ]; then
echo -e "${GREEN}✅ 所有进程已强制停止${NC}" echo -e "${GREEN}✅ 所有进程已强制停止${NC}"
else else
echo -e "${RED}❌ 仍有 ${remaining} 个进程存活${NC}" echo -e "${RED}❌ 仍有 ${remaining} 个进程存活${NC}"
echo -e "请手动检查以下进程:" echo -e "请手动检查以下进程:"
pgrep -f "image_similarity_check\|image_similarity_recalc" | xargs ps -fp 2>/dev/null pgrep -f "image_similarity_check" | xargs ps -fp 2>/dev/null
fi fi
echo -e "${RED}==========================================${NC}" echo -e "${RED}==========================================${NC}"
@@ -330,7 +304,7 @@ status() {
echo -e "${BLUE}工作目录: ${BASE_DIR}${NC}" echo -e "${BLUE}工作目录: ${BASE_DIR}${NC}"
echo "" echo ""
local total_procs=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l) local total_procs=$(pgrep -f "image_similarity_check" | wc -l)
echo -e "${YELLOW}📊 进程概览:${NC}" echo -e "${YELLOW}📊 进程概览:${NC}"
echo -e " 总进程数: ${total_procs}" echo -e " 总进程数: ${total_procs}"
@@ -366,7 +340,7 @@ status() {
mem_sum=$(echo "$mem_sum + $mem" | bc) mem_sum=$(echo "$mem_sum + $mem" | bc)
fi fi
fi fi
done < <(pgrep -f "image_similarity_check\|image_similarity_recalc" | xargs ps -o pid,cmd --no-headers 2>/dev/null) done < <(pgrep -f "image_similarity_check" | xargs ps -o pid,cmd --no-headers 2>/dev/null)
# 显示每个脚本的统计 # 显示每个脚本的统计
for script in "${!script_count[@]}"; do for script in "${!script_count[@]}"; do
@@ -393,7 +367,7 @@ status() {
echo "" echo ""
echo -e "${YELLOW}📁 PID文件状态${NC}" echo -e "${YELLOW}📁 PID文件状态${NC}"
for pid_file in "$CHECK_PID_FILE" "$RECALC_PID_FILE"; do for pid_file in "$CHECK_PID_FILE"; do
if [ -f "$pid_file" ]; then if [ -f "$pid_file" ]; then
script_name=$(basename "$pid_file" .pid) script_name=$(basename "$pid_file" .pid)
pid=$(cat "$pid_file" 2>/dev/null) pid=$(cat "$pid_file" 2>/dev/null)
@@ -421,25 +395,20 @@ status() {
# 查看日志 # 查看日志
logs() { logs() {
local lines=${1:-50} local lines=${1:-50}
echo -e "${BLUE}========== 查看日志 (最后 ${lines} 行) ==========${NC}" echo -e "${BLUE}========== 查看日志 (最后 ${lines} 行) ===========${NC}"
if [ -f "$CHECK_LOG_FILE" ]; then if [ -f "$CHECK_LOG_FILE" ]; then
echo -e "\n${YELLOW}--- image_similarity.log ---${NC}" echo -e "\n${YELLOW}--- image_similarity.log ---${NC}"
tail -$lines "$CHECK_LOG_FILE" tail -$lines "$CHECK_LOG_FILE"
fi fi
if [ -f "$RECALC_LOG_FILE" ]; then
echo -e "\n${YELLOW}--- image_similarity_recalc.log ---${NC}"
tail -$lines "$RECALC_LOG_FILE"
fi
echo -e "${BLUE}============================================${NC}" echo -e "${BLUE}============================================${NC}"
} }
# 实时查看日志 # 实时查看日志
logs-follow() { logs-follow() {
echo -e "${BLUE}========== 实时查看日志 (Ctrl+C 退出) ==========${NC}" echo -e "${BLUE}========== 实时查看日志 (Ctrl+C 退出) ===========${NC}"
tail -f "$CHECK_LOG_FILE" "$RECALC_LOG_FILE" tail -f "$CHECK_LOG_FILE"
} }
# 显示帮助 # 显示帮助
@@ -449,17 +418,14 @@ show_help() {
echo -e "${YELLOW}当前配置:${NC}" echo -e "${YELLOW}当前配置:${NC}"
echo -e " 工作目录: ${BASE_DIR}" echo -e " 工作目录: ${BASE_DIR}"
echo -e " image_similarity_check: 最多 ${CHECK_MAX_PROCESSES} 个进程" echo -e " image_similarity_check: 最多 ${CHECK_MAX_PROCESSES} 个进程"
echo -e " image_similarity_recalc: 最多 ${RECALC_MAX_PROCESSES} 个进程"
echo "" echo ""
echo -e "${BLUE}用法: $0 {命令}${NC}" echo -e "${BLUE}用法: $0 {命令}${NC}"
echo "" echo ""
echo -e "${GREEN}服务管理:${NC}" echo -e "${GREEN}服务管理:${NC}"
echo -e " ${YELLOW}start${NC} 启动所有服务" echo -e " ${YELLOW}start${NC} 启动服务"
echo -e " ${YELLOW}start-check${NC} 只启动主处理脚本" echo -e " ${YELLOW}stop${NC} 停止服务"
echo -e " ${YELLOW}start-recalc${NC} 只启动重算脚本" echo -e " ${YELLOW}force-stop${NC} 强制停止进程"
echo -e " ${YELLOW}stop${NC} 停止所有服务" echo -e " ${YELLOW}restart${NC} 重启服务"
echo -e " ${YELLOW}force-stop${NC} 强制停止所有进程"
echo -e " ${YELLOW}restart${NC} 重启所有服务"
echo -e " ${YELLOW}force-restart${NC} 强制重启" echo -e " ${YELLOW}force-restart${NC} 强制重启"
echo "" echo ""
echo -e "${GREEN}状态查看:${NC}" echo -e "${GREEN}状态查看:${NC}"
@@ -472,10 +438,8 @@ show_help() {
echo -e " ${YELLOW}help${NC} 显示帮助" echo -e " ${YELLOW}help${NC} 显示帮助"
echo "" echo ""
echo -e "${RED}注意:${NC}" echo -e "${RED}注意:${NC}"
echo -e " - 脚本会限制每个服务最多启动1个实例" echo -e " - 脚本会限制最多启动1个实例"
echo -e " - 两个脚本可以同时运行,处理不同状态的数据" echo -e " - 处理 status='draft', similarity='draft' 的数据"
echo -e " - image_similarity_check: 处理 status='draft', similarity='draft'"
echo -e " - image_similarity_recalc: 处理 status='draft', similarity='recalc'"
} }
# 主逻辑 # 主逻辑
@@ -486,9 +450,6 @@ case "$1" in
start-check) start-check)
start-check start-check
;; ;;
start-recalc)
start-recalc
;;
stop) stop)
stop stop
;; ;;