初始提交:文字匹配图片项目

This commit is contained in:
2026-01-30 18:09:55 +08:00
commit fbf12f3fa3
57 changed files with 3552 additions and 0 deletions

680
push_article_published.py Normal file
View File

@@ -0,0 +1,680 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
AI article auto-generation monitoring script.

Monitors database records whose status is ``topic``, automatically calls the
Coze API to generate articles, and submits them.
"""
import os
import sys
import time
import json
import logging
import requests
import pymysql
from datetime import datetime
from typing import Dict, List, Optional, Any
import traceback
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue, Empty
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# 添加项目根目录到Python路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from database_config import get_db_manager
from log_config import setup_logger
# Logger with date-based rotation and console output (provided by log_config).
logger = setup_logger(
    name='push_article',
    log_file='logs/push_article_published.log',
    error_log_file='logs/push_article_published_error.log',
    level=logging.INFO,
    console_output=True
)
# --- Configuration constants ---
#BASE_URL = "http://47.99.184.230:8324"
BASE_URL = "http://127.0.0.1:8324"
SLEEP_INTERVAL = 5  # monitor polling interval (seconds)
WORKER_COUNT = 10  # number of parallel workers (configurable)
# Batch-publish configuration
BATCH_SIZE = 8  # articles submitted per batch (tunable)
BATCH_INTERVAL = 2  # pause between batches (seconds, tunable)
# Network retry configuration
MAX_RETRIES = 3  # maximum retry attempts
BACKOFF_FACTOR = 1  # exponential-backoff base factor
RETRY_STATUS_CODES = [500, 502, 503, 504, 429]  # HTTP status codes worth retrying
CONNECTION_TIMEOUT = 30  # connect timeout (seconds)
READ_TIMEOUT = 120  # read timeout (seconds)
# Global auth state — NOTE(review): never written by the code shown below
# (the class keeps its own copies on self); confirm these are still needed.
AUTH_TOKEN = None
WORKFLOW_ID = None
JWT_TOKEN = None
class PushArticlePublished:
    """Polls ai_articles for publishable records and batch-submits them to
    the backend's /api/articles/batch-publish-auto endpoint."""

    def __init__(self):
        # API base URL
        self.base_url = BASE_URL
        # Authentication state (jwt_token is populated by login_and_get_jwt_token)
        self.auth_token = None
        self.workflow_id = None
        self.jwt_token = None
        # Shared database manager (project-provided)
        self.db_manager = get_db_manager()
        # Login credentials for the backend API
        self.login_credentials = {
            'username': 'user010',
            'password': '@5^2W6R7'
        }
        # Explicitly disable HTTP(S) proxies for all outgoing requests
        self.proxies = {
            'http': None,
            'https': None
        }
        # Parallel-processing helpers
        self.processing_lock = threading.Lock()  # guards thread-safe record assignment
        self.processed_ids = set()  # IDs of records already handled
        # Session preconfigured with a urllib3 retry strategy
        self.session = self._create_session()
        # Network counters reported by log_network_stats()
        self.request_stats = {
            'total_requests': 0,
            'successful_requests': 0,
            'failed_requests': 0,
            'retry_attempts': 0,
            'connection_errors': 0,
            'timeout_errors': 0
        }
        logger.info("PushArticlePublished 初始化完成")
def _create_session(self):
"""创建配置了重试策略的requests会话"""
session = requests.Session()
# 配置重试策略
retry_strategy = Retry(
total=MAX_RETRIES,
status_forcelist=RETRY_STATUS_CODES,
backoff_factor=BACKOFF_FACTOR,
allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
)
# 配置HTTP适配器
adapter = HTTPAdapter(
max_retries=retry_strategy,
pool_connections=10,
pool_maxsize=20
)
session.mount("http://", adapter)
session.mount("https://", adapter)
# 设置默认超时
session.timeout = (CONNECTION_TIMEOUT, READ_TIMEOUT)
return session
def _make_request_with_retry(self, method, url, **kwargs):
"""带重试机制的网络请求方法"""
self.request_stats['total_requests'] += 1
for attempt in range(MAX_RETRIES + 1):
try:
# 使用会话发送请求
response = self.session.request(
method=method,
url=url,
timeout=(CONNECTION_TIMEOUT, READ_TIMEOUT),
proxies=self.proxies,
**kwargs
)
# 请求成功
self.request_stats['successful_requests'] += 1
if attempt > 0:
logger.info(f"网络请求在第 {attempt + 1} 次尝试后成功")
return response
except requests.exceptions.ConnectionError as e:
self.request_stats['connection_errors'] += 1
if attempt < MAX_RETRIES:
self.request_stats['retry_attempts'] += 1
backoff_time = (BACKOFF_FACTOR * (2 ** attempt)) + random.uniform(0, 1)
logger.warning(f"连接错误 (尝试 {attempt + 1}/{MAX_RETRIES + 1}): {e}")
logger.info(f"等待 {backoff_time:.2f} 秒后重试...")
time.sleep(backoff_time)
else:
self.request_stats['failed_requests'] += 1
logger.error(f"连接最终失败,已重试 {MAX_RETRIES} 次: {e}")
raise
except requests.exceptions.Timeout as e:
self.request_stats['timeout_errors'] += 1
if attempt < MAX_RETRIES:
self.request_stats['retry_attempts'] += 1
backoff_time = (BACKOFF_FACTOR * (2 ** attempt)) + random.uniform(0, 1)
logger.warning(f"请求超时 (尝试 {attempt + 1}/{MAX_RETRIES + 1}): {e}")
logger.info(f"等待 {backoff_time:.2f} 秒后重试...")
time.sleep(backoff_time)
else:
self.request_stats['failed_requests'] += 1
logger.error(f"请求超时最终失败,已重试 {MAX_RETRIES} 次: {e}")
raise
except requests.exceptions.ChunkedEncodingError as e:
if attempt < MAX_RETRIES:
self.request_stats['retry_attempts'] += 1
backoff_time = (BACKOFF_FACTOR * (2 ** attempt)) + random.uniform(0, 1)
logger.warning(f"数据传输错误 (尝试 {attempt + 1}/{MAX_RETRIES + 1}): {e}")
logger.info(f"等待 {backoff_time:.2f} 秒后重试...")
time.sleep(backoff_time)
else:
self.request_stats['failed_requests'] += 1
logger.error(f"数据传输最终失败,已重试 {MAX_RETRIES} 次: {e}")
raise
except Exception as e:
self.request_stats['failed_requests'] += 1
logger.error(f"网络请求发生未预期错误: {e}")
raise
def log_network_stats(self):
"""记录网络统计信息"""
stats = self.request_stats
success_rate = (stats['successful_requests'] / stats['total_requests'] * 100) if stats['total_requests'] > 0 else 0
stats_msg = (
f"网络统计 - 总请求: {stats['total_requests']}, "
f"成功: {stats['successful_requests']}, "
f"失败: {stats['failed_requests']}, "
f"重试: {stats['retry_attempts']}, "
f"连接错误: {stats['connection_errors']}, "
f"超时错误: {stats['timeout_errors']}, "
f"成功率: {success_rate:.1f}%"
)
logger.info(stats_msg)
self.log_to_database('INFO', '网络统计', stats_msg)
def get_db_connection(self):
"""获取数据库连接"""
try:
return self.db_manager.get_connection()
except Exception as e:
logger.error(f"数据库连接失败: {e}")
return None
def log_to_database(self, level: str, message: str, details: str = None):
"""记录日志到数据库ai_logs表"""
try:
with self.db_manager.get_cursor() as cursor:
# 映射日志级别到数据库状态
status_map = {
'INFO': 'success',
'WARNING': 'warning',
'ERROR': 'error'
}
status = status_map.get(level, 'success')
sql = """
INSERT INTO ai_logs (user_id, action, description, status, error_message, created_at)
VALUES (%s, %s, %s, %s, %s, NOW())
"""
cursor.execute(sql, (None, 'coze_generator', message, status, details))
logger.info(f"日志已记录到数据库: {level} - {message}")
except Exception as e:
logger.error(f"记录日志到数据库失败: {e}")
def login_and_get_jwt_token(self) -> bool:
"""登录获取JWT token参考JavaScript逻辑"""
try:
login_url = f"{self.base_url}/api/auth/login"
login_data = {
"username": "user010", # 使用用户指定的账号
"password": "@5^2W6R7"
}
logger.info(f"尝试登录: {login_data['username']}")
logger.info(f"登录URL: {login_url}")
self.log_to_database('INFO', f"尝试登录用户: {login_data['username']}")
response = self._make_request_with_retry(
'POST',
login_url,
json=login_data,
headers={'Content-Type': 'application/json'}
)
logger.info(f"响应状态码: {response.status_code}")
logger.info(f"响应内容: {response.text[:500]}...")
if response.status_code == 200:
result = response.json()
if result.get('code') == 200:
self.jwt_token = result['data']['token']
logger.info("JWT token获取成功")
self.log_to_database('INFO', "JWT token获取成功", json.dumps(result['data']))
return True
else:
error_msg = f"登录失败: {result.get('message', '未知错误')}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, json.dumps(result))
return False
else:
error_msg = f"登录请求失败: {response.status_code}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, response.text)
return False
except Exception as e:
error_msg = f"登录异常: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, traceback.format_exc())
return False
def batch_publish_auto(self, article_ids: List[int]) -> bool:
"""批量提交文章到/api/articles/batch-publish-auto接口"""
try:
logger.info(f"开始批量提交 {len(article_ids)} 篇文章到batch-publish-auto接口")
self.log_to_database('INFO', f"开始批量提交文章", f"article_ids: {article_ids}")
# 确保有JWT token
if not self.jwt_token:
logger.warning("JWT token缺失尝试重新登录")
self.log_to_database('WARNING', "JWT token缺失重新登录")
if not self.login_and_get_jwt_token():
error_msg = "重新登录失败"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg)
return False
# 构建批量发布数据 - 根据接口要求只需要article_ids
publish_data = {
"article_ids": article_ids
}
logger.info(f"准备批量提交的数据: {json.dumps(publish_data, ensure_ascii=False)}")
# 发送请求 - 修正接口路径
upload_url = f"{self.base_url}/api/articles/batch-publish-auto"
headers = {
'Authorization': f'Bearer {self.jwt_token}',
'Content-Type': 'application/json',
'Accept': 'application/json'
}
response = self._make_request_with_retry(
'POST',
upload_url,
json=publish_data,
headers=headers
)
logger.info(f"批量提交响应状态码: {response.status_code}")
if response.status_code == 200:
try:
result = response.json()
logger.info(f"批量提交响应内容: {result}")
# 根据接口实际返回格式判断成功
if result.get('code') == 200:
data = result.get('data', {})
published_count = data.get('published_count', 0)
failed_count = data.get('failed_count', 0)
success_msg = f"批量提交成功,发布: {published_count}篇,失败: {failed_count}"
logger.info(success_msg)
self.log_to_database('INFO', success_msg, f"article_ids: {article_ids}")
return True
else:
error_msg = f"批量提交失败: {result.get('message', '未知错误')}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, response: {result}")
return False
except json.JSONDecodeError as e:
error_msg = f"解析批量提交响应失败: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
return False
elif response.status_code == 401:
# Token过期尝试重新登录并重试一次
logger.warning("收到401错误JWT token可能已过期尝试重新登录")
self.log_to_database('WARNING', "JWT token过期重新登录", f"article_ids: {article_ids}")
if self.login_and_get_jwt_token():
logger.info("重新登录成功,重试批量提交请求")
# 更新headers中的token
headers['Authorization'] = f'Bearer {self.jwt_token}'
# 重试请求
retry_response = self._make_request_with_retry(
'POST',
upload_url,
json=publish_data,
headers=headers
)
if retry_response.status_code == 200:
try:
result = retry_response.json()
logger.info(f"重试批量提交响应内容: {result}")
if result.get('code') == 200:
data = result.get('data', {})
published_count = data.get('published_count', 0)
failed_count = data.get('failed_count', 0)
success_msg = f"重试批量提交成功,发布: {published_count}篇,失败: {failed_count}"
logger.info(success_msg)
self.log_to_database('INFO', success_msg, f"article_ids: {article_ids}")
return True
else:
error_msg = f"重试批量提交失败: {result.get('message', '未知错误')}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, response: {result}")
return False
except json.JSONDecodeError as e:
error_msg = f"解析重试批量提交响应失败: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"response_text: {retry_response.text}")
return False
else:
error_msg = f"重试批量提交请求失败,状态码: {retry_response.status_code}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"response_text: {retry_response.text}")
return False
else:
error_msg = "重新登录失败,无法重试批量提交"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}")
return False
else:
error_msg = f"批量提交请求失败,状态码: {response.status_code}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"response_text: {response.text}")
return False
except requests.exceptions.Timeout as e:
error_msg = f"批量提交请求超时: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, timeout: {CONNECTION_TIMEOUT}s/{READ_TIMEOUT}s")
return False
except requests.exceptions.ConnectionError as e:
error_msg = f"批量提交连接错误: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, base_url: {self.base_url}")
return False
except requests.exceptions.RequestException as e:
error_msg = f"批量提交网络异常: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, exception_type: {type(e).__name__}")
return False
except Exception as e:
error_msg = f"批量提交异常: {e}"
logger.error(error_msg)
self.log_to_database('ERROR', error_msg, f"article_ids: {article_ids}, traceback: {traceback.format_exc()}")
return False
def is_publish_time_allowed(self) -> bool:
"""检查当前时间是否在允许发布的时间窗口内北京时间6:00-23:59"""
current_hour = datetime.now().hour
# 凌晨00:00-05:59禁止发布6:00-23:59允许发布
if current_hour >= 6:
logger.info(f"当前时间 {datetime.now().strftime('%H:%M:%S')} 可以推送")
return True
else:
logger.info(f"当前时间 {datetime.now().strftime('%H:%M:%S')} 在禁止发布时段00:00-05:59跳过推送")
return False
    def filter_articles_by_daily_limit(self, articles: List[Dict]) -> List[Dict]:
        """Filter out articles whose authors have reached their daily quota.

        For each article's author:
          1. ai_authors — the author must have daily_post_max > 0,
             status = 'active' and channel = 1; otherwise the article is
             pushed back to status 'pending_review' for re-approval.
          2. ai_statistics_days — today's daily_published_count must be
             below daily_post_max; authors with no stats row for today are
             skipped (the row must be initialised elsewhere first).

        Args:
            articles: candidate rows (dicts with at least 'id' and 'author_id').

        Returns:
            The subset of ``articles`` allowed to publish today. On any
            exception the ORIGINAL list is returned so a DB error does not
            block publishing entirely.
        """
        if not articles:
            return []
        try:
            today_date = datetime.now().strftime('%Y-%m-%d')
            filtered_articles = []
            with self.db_manager.get_cursor() as cursor:
                for article in articles:
                    author_id = article.get('author_id')
                    if not author_id:
                        logger.warning(f"文章ID {article['id']} 缺少author_id跳过")
                        continue
                    # Author must satisfy: daily_post_max > 0, status = 'active', channel = 1.
                    author_check_sql = """
                    SELECT id, author_name, daily_post_max, status, channel
                    FROM ai_authors
                    WHERE id = %s AND daily_post_max > 0 AND status = 'active' AND channel = 1
                    """
                    cursor.execute(author_check_sql, (author_id,))
                    author_result = cursor.fetchone()
                    if not author_result:
                        logger.info(f"[业务日志] 作者ID {author_id} 不符合发文条件(daily_post_max>0 AND status=active AND channel=1)文章ID {article['id']} 过滤掉")
                        # Send the article back through the approval flow.
                        # NOTE(review): this relies on the cursor context manager
                        # committing the UPDATE — confirm in database_config.
                        update_sql = "UPDATE ai_articles SET status = 'pending_review', updated_at = NOW() WHERE id = %s"
                        cursor.execute(update_sql, (article['id'],))
                        logger.info(f"[业务日志] 文章ID {article['id']} 状态已更新为pending_review需重新审批")
                        continue
                    # Look up today's publication stats for this author.
                    sql = """
                    SELECT daily_published_count, daily_post_max
                    FROM ai_statistics_days
                    WHERE author_id = %s AND stat_date = %s
                    """
                    cursor.execute(sql, (author_id, today_date))
                    result = cursor.fetchone()
                    if result:
                        daily_published_count = result['daily_published_count'] or 0
                        daily_post_max = result['daily_post_max'] or 0
                        # daily_post_max below 1 means publishing is disabled for this author.
                        if daily_post_max < 1:
                            continue
                        if daily_published_count >= daily_post_max:
                            # Daily quota reached — skip this article today.
                            continue
                        else:
                            # Still under quota — keep the article.
                            filtered_articles.append(article)
                    else:
                        # No stats row for today: not allowed until the
                        # statistics record has been initialised.
                        logger.info(f"[业务日志] 作者ID {author_id} 无当日统计记录文章ID {article['id']} 过滤掉,需先初始化统计记录")
                        continue
            logger.info(f"每日限制过滤完成: 原始 {len(articles)} 篇 -> 允许发布 {len(filtered_articles)}篇")
            return filtered_articles
        except Exception as e:
            error_msg = f"检查每日发文限制异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            # Fail open: return the unfiltered list so one error does not block all publishing.
            return articles
    def get_published_review_articles(self) -> List[Dict]:
        """Fetch ai_articles rows whose status is 'published_review'.

        The query ranks each author's articles by updated_at/id using
        ROW_NUMBER(), but the outer query applies no filter on author_rank,
        so ALL matching rows (author_id > 0) are returned.
        NOTE(review): the ranking is currently unused — presumably a
        'WHERE author_rank <= N' per-author cap was intended; confirm.

        Returns:
            List of dict rows (id, title, status, created_at, updated_at,
            author_id); an empty list when nothing is pending or on error.
        """
        try:
            with self.db_manager.get_cursor() as cursor:
                # Select published_review articles, ranked per author.
                sql = """
                SELECT
                    id,
                    title,
                    status,
                    created_at,
                    updated_at,
                    author_id
                FROM (
                    SELECT
                        id,
                        title,
                        status,
                        created_at,
                        updated_at,
                        author_id,
                        ROW_NUMBER() OVER (
                            PARTITION BY author_id
                            ORDER BY updated_at ASC, id ASC
                        ) as author_rank
                    FROM ai_articles
                    WHERE status = 'published_review'
                    AND author_id > 0
                ) ranked_articles
                """
                cursor.execute(sql)
                results = cursor.fetchall()
                if results:
                    logger.info(f"查询到 {len(results)} 个待发布文章")
                    for result in results:
                        logger.info(f"待发布文章 - ID: {result['id']}, 标题: {result['title']}, 状态: {result['status']}")
                else:
                    logger.info("未查询到待发布文章")
                return results
        except Exception as e:
            error_msg = f"查询待发布文章异常: {e}"
            logger.error(error_msg)
            self.log_to_database('ERROR', error_msg, traceback.format_exc())
            return []
def process_published_review_articles(self, published_articles: List[Dict], worker_id: int) -> int:
"""Worker线程处理published_review状态文章的方法"""
processed_count = 0
thread_name = f"PublishWorker-{worker_id}"
threading.current_thread().name = thread_name
logger.info(f"[{thread_name}] 启动,准备处理待发布文章")
# 按批次处理文章
for i in range(0, len(published_articles), BATCH_SIZE):
batch = published_articles[i:i + BATCH_SIZE]
article_ids = [article['id'] for article in batch]
logger.info(f"[{thread_name}] 处理批次 {i//BATCH_SIZE + 1}文章ID: {article_ids}")
# 批量提交文章
if self.batch_publish_auto(article_ids):
processed_count += len(article_ids)
logger.info(f"[{thread_name}] 成功处理批次,文章数量: {len(article_ids)}")
else:
logger.error(f"[{thread_name}] 处理批次失败文章ID: {article_ids}")
# 批次间隔
if i + BATCH_SIZE < len(published_articles):
logger.info(f"[{thread_name}] 等待 {BATCH_INTERVAL} 秒后处理下一批次")
time.sleep(BATCH_INTERVAL)
logger.info(f"[{thread_name}] 完成,共处理 {processed_count} 篇文章")
return processed_count
    def run_monitor(self):
        """Main monitoring loop.

        Polls for publishable articles, applies the time-window and
        per-author daily-quota filters, then batch-publishes the survivors.
        Runs forever with SLEEP_INTERVAL seconds between polls until
        KeyboardInterrupt; any other exception is logged and the loop
        continues after a 5-second pause. Network statistics are emitted
        every ``stats_interval`` iterations.
        """
        logger.info(f"开始监控ai_articles表使用 {WORKER_COUNT} 个worker并行处理...")
        self.log_to_database('INFO', f'启动文章自动生成监控服务worker数量: {WORKER_COUNT}', 'run_monitor')
        # Counters for periodic statistics reporting.
        loop_count = 0
        stats_interval = 60  # report network stats every 60 loops (~5 minutes)
        while True:
            try:
                # Fetch candidate articles awaiting publication.
                published_articles = self.get_published_review_articles()
                # Gate 1: publishing window (06:00-23:59 allowed, 00:00-05:59 blocked).
                if not self.is_publish_time_allowed():
                    published_articles = []
                    logger.info("当前处于禁止发布时段,清空待发布列表")
                # Gate 2: per-author daily publication quota.
                if published_articles:
                    published_articles = self.filter_articles_by_daily_limit(published_articles)
                # Publish whatever survived both gates.
                if published_articles:
                    logger.info(f"发现 {len(published_articles)} 篇待发布文章,启动批量发布处理")
                    self.log_to_database('INFO', f'发现待发布文章,启动批量处理', f'文章数量: {len(published_articles)}')
                    # A single worker handles the batches to avoid concurrent conflicts.
                    try:
                        processed_count = self.process_published_review_articles(published_articles, 1)
                        logger.info(f"批量发布处理完成,共处理 {processed_count} 篇文章")
                        self.log_to_database('INFO', f'批量发布处理完成', f'共处理 {processed_count} 篇文章')
                    except Exception as e:
                        logger.error(f"批量发布处理异常: {e}")
                        self.log_to_database('ERROR', f'批量发布处理异常', str(e))
                # Nothing to do this round.
                if not published_articles:
                    logger.info("暂无待处理任务,继续监控...")
                # Sleep before the next poll.
                time.sleep(SLEEP_INTERVAL)
                # Periodically report the network counters.
                loop_count += 1
                if loop_count % stats_interval == 0:
                    self.log_network_stats()
            except KeyboardInterrupt:
                logger.info("收到中断信号,停止监控")
                self.log_to_database('INFO', '监控服务手动停止', 'KeyboardInterrupt')
                break
            except Exception as e:
                error_msg = f"监控循环异常: {e}"
                logger.error(error_msg)
                self.log_to_database('ERROR', error_msg, traceback.format_exc())
                time.sleep(5)  # wait 5 seconds before resuming after an error
def main():
    """Entry point: authenticate, then run the monitoring loop forever."""
    generator = PushArticlePublished()
    try:
        # Obtain a JWT token before starting the monitor.
        logger.info("开始登录获取JWT token")
        if not generator.login_and_get_jwt_token():
            logger.error("登录失败,程序退出")
            return
        generator.run_monitor()
    except Exception as exc:
        logger.error(f"程序运行异常: {exc}")
        generator.log_to_database('ERROR', f'程序运行异常: {exc}', traceback.format_exc())


if __name__ == "__main__":
    main()