refactor: 移除重算脚本部署,autocommit模式,2秒轮询
This commit is contained in:
12
README.md
12
README.md
@@ -9,7 +9,7 @@
|
||||
- 支持 pHash 感知哈希预筛选
|
||||
- 异步批量下载和处理图片
|
||||
- 自动标记重复图片并记录相似度分数
|
||||
- 守护模式运行,无数据时等待 10 秒后继续检查
|
||||
- 守护模式运行,无数据时等待 2 秒后继续检查
|
||||
|
||||
## 环境要求
|
||||
|
||||
@@ -76,16 +76,10 @@ python stats_similarity.py
|
||||
## 服务器部署
|
||||
|
||||
```bash
|
||||
# 启动所有服务
|
||||
# 启动服务
|
||||
./start_similarity.sh start
|
||||
|
||||
# 只启动主处理脚本
|
||||
./start_similarity.sh start-check
|
||||
|
||||
# 只启动重算脚本
|
||||
./start_similarity.sh start-recalc
|
||||
|
||||
# 停止所有服务
|
||||
# 停止服务
|
||||
./start_similarity.sh stop
|
||||
|
||||
# 强制停止
|
||||
|
||||
@@ -77,7 +77,8 @@ class ImageSimilarityChecker:
|
||||
password=self.config.get('database', 'password'),
|
||||
database=self.config.get('database', 'database'),
|
||||
charset=self.config.get('database', 'charset'),
|
||||
cursorclass=pymysql.cursors.DictCursor
|
||||
cursorclass=pymysql.cursors.DictCursor,
|
||||
autocommit=True
|
||||
)
|
||||
self.logger.info("数据库连接成功")
|
||||
|
||||
@@ -254,7 +255,6 @@ class ImageSimilarityChecker:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (similar_id, score, image_id))
|
||||
self.db_conn.commit()
|
||||
self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")
|
||||
|
||||
def update_as_unique(self, image_id: int):
|
||||
@@ -268,7 +268,6 @@ class ImageSimilarityChecker:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (image_id,))
|
||||
self.db_conn.commit()
|
||||
self.logger.info(f"不重复: {image_id} -> tag_extension")
|
||||
|
||||
def update_as_failed(self, image_id: int, reason: str):
|
||||
@@ -282,7 +281,6 @@ class ImageSimilarityChecker:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (image_id,))
|
||||
self.db_conn.commit()
|
||||
|
||||
def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
|
||||
"""处理一批图片,返回 (重复数, 不重复数, 失败数)"""
|
||||
@@ -360,8 +358,8 @@ class ImageSimilarityChecker:
|
||||
images = self.get_draft_images()
|
||||
|
||||
if not images:
|
||||
self.logger.info("没有待处理的图片,等待 10 秒后继续检查...")
|
||||
time.sleep(10)
|
||||
self.logger.info("没有待处理的图片,等待 2 秒后继续检查...")
|
||||
time.sleep(2)
|
||||
continue
|
||||
|
||||
batch_num += 1
|
||||
|
||||
@@ -65,7 +65,8 @@ class ImageSimilarityRecalc:
|
||||
password=self.config.get('database', 'password'),
|
||||
database=self.config.get('database', 'database'),
|
||||
charset=self.config.get('database', 'charset'),
|
||||
cursorclass=pymysql.cursors.DictCursor
|
||||
cursorclass=pymysql.cursors.DictCursor,
|
||||
autocommit=True
|
||||
)
|
||||
self.logger.info("数据库连接成功")
|
||||
|
||||
@@ -165,7 +166,6 @@ class ImageSimilarityRecalc:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (similar_id, score, image_id))
|
||||
self.db_conn.commit()
|
||||
self.logger.info(f"重复: {image_id} -> {similar_id} (分数={score:.4f})")
|
||||
|
||||
def update_as_unique(self, image_id: int):
|
||||
@@ -179,7 +179,6 @@ class ImageSimilarityRecalc:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (image_id,))
|
||||
self.db_conn.commit()
|
||||
self.logger.info(f"不重复: {image_id} -> tag_extension")
|
||||
|
||||
def update_as_failed(self, image_id: int, reason: str):
|
||||
@@ -191,7 +190,6 @@ class ImageSimilarityRecalc:
|
||||
WHERE id = %s
|
||||
"""
|
||||
cursor.execute(sql, (image_id,))
|
||||
self.db_conn.commit()
|
||||
self.logger.warning(f"处理失败 {image_id}: {reason}")
|
||||
|
||||
def process_batch(self, image_records: List[dict]) -> Tuple[int, int, int]:
|
||||
@@ -263,8 +261,8 @@ class ImageSimilarityRecalc:
|
||||
images = self.get_recalc_images()
|
||||
|
||||
if not images:
|
||||
self.logger.info("没有需要重新计算的图片,等待 10 秒后继续检查...")
|
||||
time.sleep(10)
|
||||
self.logger.info("没有需要重新计算的图片,等待 2 秒后继续检查...")
|
||||
time.sleep(2)
|
||||
continue
|
||||
|
||||
batch_num += 1
|
||||
|
||||
@@ -15,12 +15,6 @@ CHECK_PID_FILE="${BASE_DIR}/image_similarity_check.pid"
|
||||
CHECK_LOG_FILE="${BASE_DIR}/image_similarity.log"
|
||||
CHECK_MAX_PROCESSES=1 # 限制最多1个进程
|
||||
|
||||
# image_similarity_recalc 配置
|
||||
RECALC_SCRIPT="${BASE_DIR}/image_similarity_recalc.py"
|
||||
RECALC_PID_FILE="${BASE_DIR}/image_similarity_recalc.pid"
|
||||
RECALC_LOG_FILE="${BASE_DIR}/image_similarity_recalc.log"
|
||||
RECALC_MAX_PROCESSES=1 # 限制最多1个进程
|
||||
|
||||
# stats_similarity 配置(仅用于查看状态,不需要常驻进程)
|
||||
STATS_SCRIPT="${BASE_DIR}/stats_similarity.py"
|
||||
|
||||
@@ -175,12 +169,12 @@ stop_single_all() {
|
||||
|
||||
# 启动所有服务
|
||||
start() {
|
||||
echo -e "${BLUE}========== 启动图片去重审核系统 ==========${NC}"
|
||||
echo -e "${BLUE}========== 启动图片去重审核系统 ===========${NC}"
|
||||
echo -e "${YELLOW}进程限制:每个服务最多启动1个实例${NC}"
|
||||
echo ""
|
||||
|
||||
# 检查总进程数
|
||||
local total_before=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local total_before=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
if [ $total_before -gt 0 ]; then
|
||||
echo -e "${YELLOW}当前已有 ${total_before} 个审核进程在运行${NC}"
|
||||
echo -e "${YELLOW}建议先停止现有进程: $0 stop${NC}"
|
||||
@@ -194,17 +188,11 @@ start() {
|
||||
if ! start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES"; then
|
||||
has_errors=1
|
||||
fi
|
||||
sleep 3
|
||||
|
||||
# 启动重算脚本
|
||||
if ! start_single "$RECALC_SCRIPT" "$RECALC_PID_FILE" "$RECALC_LOG_FILE" "image_similarity_recalc" "$RECALC_MAX_PROCESSES"; then
|
||||
has_errors=1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 检查启动结果
|
||||
local total_after=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local total_after=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
if [ $has_errors -eq 0 ]; then
|
||||
echo -e "${GREEN}✅ 启动完成,当前运行 ${total_after} 个进程${NC}"
|
||||
else
|
||||
@@ -216,40 +204,27 @@ start() {
|
||||
|
||||
# 只启动主脚本
|
||||
start-check() {
|
||||
echo -e "${BLUE}========== 启动主处理脚本 ==========${NC}"
|
||||
echo -e "${BLUE}========== 启动主处理脚本 ===========${NC}"
|
||||
start_single "$CHECK_SCRIPT" "$CHECK_PID_FILE" "$CHECK_LOG_FILE" "image_similarity_check" "$CHECK_MAX_PROCESSES"
|
||||
echo -e "${BLUE}====================================${NC}"
|
||||
}
|
||||
|
||||
# 只启动重算脚本
|
||||
start-recalc() {
|
||||
echo -e "${BLUE}========== 启动重算脚本 ==========${NC}"
|
||||
start_single "$RECALC_SCRIPT" "$RECALC_PID_FILE" "$RECALC_LOG_FILE" "image_similarity_recalc" "$RECALC_MAX_PROCESSES"
|
||||
echo -e "${BLUE}==================================${NC}"
|
||||
}
|
||||
|
||||
# 停止所有服务
|
||||
stop() {
|
||||
echo -e "${BLUE}========== 停止图片去重审核系统 ==========${NC}"
|
||||
echo -e "${BLUE}========== 停止图片去重审核系统 ===========${NC}"
|
||||
|
||||
local has_errors=0
|
||||
local total_before=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local total_before=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
|
||||
echo -e "${YELLOW}当前共有 ${total_before} 个审核进程${NC}"
|
||||
|
||||
# 停止重算脚本
|
||||
if ! stop_single_all "$RECALC_SCRIPT" "image_similarity_recalc" "$RECALC_PID_FILE"; then
|
||||
has_errors=1
|
||||
fi
|
||||
sleep 2
|
||||
|
||||
# 停止主处理脚本
|
||||
if ! stop_single_all "$CHECK_SCRIPT" "image_similarity_check" "$CHECK_PID_FILE"; then
|
||||
has_errors=1
|
||||
fi
|
||||
|
||||
# 最终检查
|
||||
local total_after=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local total_after=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
echo ""
|
||||
|
||||
if [ $total_after -eq 0 ]; then
|
||||
@@ -265,11 +240,10 @@ stop() {
|
||||
|
||||
# 强制停止所有服务
|
||||
force-stop() {
|
||||
echo -e "${RED}========== 强制停止所有审核进程 ==========${NC}"
|
||||
echo -e "${RED}========== 强制停止所有审核进程 ===========${NC}"
|
||||
|
||||
# 停止所有相关进程
|
||||
pkill -9 -f "image_similarity_check.py" 2>/dev/null
|
||||
pkill -9 -f "image_similarity_recalc.py" 2>/dev/null
|
||||
|
||||
sleep 2
|
||||
|
||||
@@ -277,13 +251,13 @@ force-stop() {
|
||||
rm -f "${BASE_DIR}"/*.pid
|
||||
|
||||
# 检查是否还有残留
|
||||
local remaining=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local remaining=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
if [ $remaining -eq 0 ]; then
|
||||
echo -e "${GREEN}✅ 所有进程已强制停止${NC}"
|
||||
else
|
||||
echo -e "${RED}❌ 仍有 ${remaining} 个进程存活${NC}"
|
||||
echo -e "请手动检查以下进程:"
|
||||
pgrep -f "image_similarity_check\|image_similarity_recalc" | xargs ps -fp 2>/dev/null
|
||||
pgrep -f "image_similarity_check" | xargs ps -fp 2>/dev/null
|
||||
fi
|
||||
|
||||
echo -e "${RED}==========================================${NC}"
|
||||
@@ -330,7 +304,7 @@ status() {
|
||||
echo -e "${BLUE}工作目录: ${BASE_DIR}${NC}"
|
||||
echo ""
|
||||
|
||||
local total_procs=$(pgrep -f "image_similarity_check\|image_similarity_recalc" | wc -l)
|
||||
local total_procs=$(pgrep -f "image_similarity_check" | wc -l)
|
||||
echo -e "${YELLOW}📊 进程概览:${NC}"
|
||||
echo -e " 总进程数: ${total_procs}"
|
||||
|
||||
@@ -366,7 +340,7 @@ status() {
|
||||
mem_sum=$(echo "$mem_sum + $mem" | bc)
|
||||
fi
|
||||
fi
|
||||
done < <(pgrep -f "image_similarity_check\|image_similarity_recalc" | xargs ps -o pid,cmd --no-headers 2>/dev/null)
|
||||
done < <(pgrep -f "image_similarity_check" | xargs ps -o pid,cmd --no-headers 2>/dev/null)
|
||||
|
||||
# 显示每个脚本的统计
|
||||
for script in "${!script_count[@]}"; do
|
||||
@@ -393,7 +367,7 @@ status() {
|
||||
|
||||
echo ""
|
||||
echo -e "${YELLOW}📁 PID文件状态:${NC}"
|
||||
for pid_file in "$CHECK_PID_FILE" "$RECALC_PID_FILE"; do
|
||||
for pid_file in "$CHECK_PID_FILE"; do
|
||||
if [ -f "$pid_file" ]; then
|
||||
script_name=$(basename "$pid_file" .pid)
|
||||
pid=$(cat "$pid_file" 2>/dev/null)
|
||||
@@ -421,25 +395,20 @@ status() {
|
||||
# 查看日志
|
||||
logs() {
|
||||
local lines=${1:-50}
|
||||
echo -e "${BLUE}========== 查看日志 (最后 ${lines} 行) ==========${NC}"
|
||||
echo -e "${BLUE}========== 查看日志 (最后 ${lines} 行) ===========${NC}"
|
||||
|
||||
if [ -f "$CHECK_LOG_FILE" ]; then
|
||||
echo -e "\n${YELLOW}--- image_similarity.log ---${NC}"
|
||||
tail -$lines "$CHECK_LOG_FILE"
|
||||
fi
|
||||
|
||||
if [ -f "$RECALC_LOG_FILE" ]; then
|
||||
echo -e "\n${YELLOW}--- image_similarity_recalc.log ---${NC}"
|
||||
tail -$lines "$RECALC_LOG_FILE"
|
||||
fi
|
||||
|
||||
echo -e "${BLUE}============================================${NC}"
|
||||
}
|
||||
|
||||
# 实时查看日志
|
||||
logs-follow() {
|
||||
echo -e "${BLUE}========== 实时查看日志 (Ctrl+C 退出) ==========${NC}"
|
||||
tail -f "$CHECK_LOG_FILE" "$RECALC_LOG_FILE"
|
||||
echo -e "${BLUE}========== 实时查看日志 (Ctrl+C 退出) ===========${NC}"
|
||||
tail -f "$CHECK_LOG_FILE"
|
||||
}
|
||||
|
||||
# 显示帮助
|
||||
@@ -449,17 +418,14 @@ show_help() {
|
||||
echo -e "${YELLOW}当前配置:${NC}"
|
||||
echo -e " 工作目录: ${BASE_DIR}"
|
||||
echo -e " image_similarity_check: 最多 ${CHECK_MAX_PROCESSES} 个进程"
|
||||
echo -e " image_similarity_recalc: 最多 ${RECALC_MAX_PROCESSES} 个进程"
|
||||
echo ""
|
||||
echo -e "${BLUE}用法: $0 {命令}${NC}"
|
||||
echo ""
|
||||
echo -e "${GREEN}服务管理:${NC}"
|
||||
echo -e " ${YELLOW}start${NC} 启动所有服务"
|
||||
echo -e " ${YELLOW}start-check${NC} 只启动主处理脚本"
|
||||
echo -e " ${YELLOW}start-recalc${NC} 只启动重算脚本"
|
||||
echo -e " ${YELLOW}stop${NC} 停止所有服务"
|
||||
echo -e " ${YELLOW}force-stop${NC} 强制停止所有进程"
|
||||
echo -e " ${YELLOW}restart${NC} 重启所有服务"
|
||||
echo -e " ${YELLOW}start${NC} 启动服务"
|
||||
echo -e " ${YELLOW}stop${NC} 停止服务"
|
||||
echo -e " ${YELLOW}force-stop${NC} 强制停止进程"
|
||||
echo -e " ${YELLOW}restart${NC} 重启服务"
|
||||
echo -e " ${YELLOW}force-restart${NC} 强制重启"
|
||||
echo ""
|
||||
echo -e "${GREEN}状态查看:${NC}"
|
||||
@@ -472,10 +438,8 @@ show_help() {
|
||||
echo -e " ${YELLOW}help${NC} 显示帮助"
|
||||
echo ""
|
||||
echo -e "${RED}注意:${NC}"
|
||||
echo -e " - 脚本会限制每个服务最多启动1个实例"
|
||||
echo -e " - 两个脚本可以同时运行,处理不同状态的数据"
|
||||
echo -e " - image_similarity_check: 处理 status='draft', similarity='draft'"
|
||||
echo -e " - image_similarity_recalc: 处理 status='draft', similarity='recalc'"
|
||||
echo -e " - 脚本会限制最多启动1个实例"
|
||||
echo -e " - 处理 status='draft', similarity='draft' 的数据"
|
||||
}
|
||||
|
||||
# 主逻辑
|
||||
@@ -486,9 +450,6 @@ case "$1" in
|
||||
start-check)
|
||||
start-check
|
||||
;;
|
||||
start-recalc)
|
||||
start-recalc
|
||||
;;
|
||||
stop)
|
||||
stop
|
||||
;;
|
||||
|
||||
Reference in New Issue
Block a user