This commit is contained in:
sjk
2026-01-21 14:33:10 +08:00
parent c4711fc84f
commit 8f2f58b51c
21 changed files with 2358 additions and 727 deletions

View File

@@ -41,8 +41,7 @@ class DatabaseManager:
'database': Config.MYSQL_DATABASE,
'charset': 'utf8mb4'
}
logger.info(f"MySQL数据库初始化: {Config.MYSQL_HOST}:{Config.MYSQL_PORT}/{Config.MYSQL_DATABASE}")
def get_connection(self) -> 'pymysql.Connection':
"""获取MySQL数据库连接"""
conn = pymysql.connect(**self.db_config)
@@ -74,7 +73,8 @@ class SiteManager(DatabaseManager):
"""站点管理"""
def add_site(self, site_url: str, site_name: str = None,
site_dimension: str = None, frequency: int = None,
site_dimension: str = None, query_word: str = None,
frequency: int = None,
time_start: str = None, time_end: str = None,
interval_minutes: int = None) -> Optional[int]:
"""
@@ -84,6 +84,7 @@ class SiteManager(DatabaseManager):
site_url: 网站URL
site_name: 网站名称
site_dimension: 网站维度标签
query_word: 来源查询词(从哪个关键词抓取)
frequency: 频次
time_start: 开始时间
time_end: 结束时间
@@ -108,8 +109,8 @@ class SiteManager(DatabaseManager):
INSERT INTO ai_mip_site (
site_url, site_name, status, frequency,
time_start, time_end, interval_minutes,
site_dimension, created_by
) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
site_dimension, query_word, created_by
) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
"""
cursor.execute(sql, (
@@ -119,8 +120,9 @@ class SiteManager(DatabaseManager):
frequency or 1,
time_start or '09:00:00',
time_end or '21:00:00',
interval_minutes or 60,
interval_minutes or 30,
site_dimension,
query_word, # 新增:来源查询词
'system'
))
@@ -128,7 +130,7 @@ class SiteManager(DatabaseManager):
conn.commit()
conn.close()
logger.info(f"成功添加站点: {site_url} (ID: {site_id})")
logger.info(f"成功添加站点: {site_url} (ID: {site_id}, 查询词: {query_word})")
return site_id
except pymysql.IntegrityError:
@@ -545,3 +547,272 @@ class StatisticsManager(DatabaseManager):
except Exception as e:
logger.error(f"获取站点统计失败: {str(e)}")
return {}
class QueryTaskManager(DatabaseManager):
"""查询任务管理器"""
def create_task(self, query_word: str, task_date: str = None,
                query_type: str = 'keyword', threshold_max: int = 100,
                priority: int = 5, category: str = None,
                source_platform: str = 'baidu',
                created_by: str = 'system',
                remark: str = None) -> Optional[int]:
    """
    Insert a new row into ``ai_mip_query_task``.

    Args:
        query_word: Query word.
        task_date: Task date formatted as YYYYMMDD; defaults to today
            (``datetime.now()``).
        query_type: Query type.
        threshold_max: Maximum number of items to crawl.
        priority: Priority, 1-10.
        category: Category label.
        source_platform: Source platform.
        created_by: Creator.
        remark: Free-form remark.

    Returns:
        The new task ID (``cursor.lastrowid``), or None when the insert
        raised an integrity error or any other exception.
    """
    try:
        if task_date is None:
            task_date = datetime.now().strftime('%Y%m%d')
        conn = self.get_connection()
        try:
            ph = self._get_placeholder()
            sql = f"""
                INSERT INTO ai_mip_query_task (
                    query_word, query_type, task_date, threshold_max,
                    priority, category, source_platform, created_by, remark
                ) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
            """
            cursor = conn.cursor()
            cursor.execute(sql, (
                query_word, query_type, task_date, threshold_max,
                priority, category, source_platform, created_by, remark
            ))
            task_id = cursor.lastrowid
            conn.commit()
        finally:
            # Release the connection on every path, including failed inserts.
            conn.close()
        logger.info(f"创建查询任务成功: {query_word} (ID: {task_id})")
        return task_id
    except pymysql.IntegrityError:
        # Logged as "task already exists" for this query word and date.
        logger.warning(f"查询任务已存在: {query_word} @ {task_date}")
        return None
    except Exception as e:
        logger.error(f"创建查询任务失败: {str(e)}")
        return None
def get_task_by_id(self, task_id: int) -> Optional[Dict]:
    """
    Fetch a single task row from ``ai_mip_query_task`` by its ID.

    Args:
        task_id: Task ID.

    Returns:
        The row converted by ``self._dict_from_row``, or None when no row
        matches or the lookup raised an exception.
    """
    try:
        conn = self.get_connection()
        try:
            ph = self._get_placeholder()
            cursor = self._execute_query(
                conn,
                f"SELECT * FROM ai_mip_query_task WHERE id = {ph}",
                (task_id,)
            )
            row = cursor.fetchone()
        finally:
            # Close even when the query or fetch fails.
            conn.close()
        return self._dict_from_row(row) if row else None
    except Exception as e:
        logger.error(f"查询任务失败: {str(e)}")
        return None
def get_ready_tasks(self, limit: int = None) -> List[Dict]:
    """
    Fetch tasks whose status is 'ready', ordered by ``priority ASC`` and
    then ``created_at ASC``.

    Args:
        limit: Maximum number of rows to return. A falsy value (None or 0)
            means no LIMIT clause is added.

    Returns:
        List of task rows converted by ``self._dict_from_row``; an empty
        list when the query raised an exception.
    """
    try:
        conn = self.get_connection()
        try:
            sql = "SELECT * FROM ai_mip_query_task WHERE status = 'ready' ORDER BY priority ASC, created_at ASC"
            if limit:
                # Coerce to int so only a number can ever reach the SQL text;
                # a non-numeric value raises and is reported below.
                sql += f" LIMIT {int(limit)}"
            cursor = self._execute_query(conn, sql)
            rows = cursor.fetchall()
        finally:
            # Close even when the query or fetch fails.
            conn.close()
        return [self._dict_from_row(row) for row in rows]
    except Exception as e:
        logger.error(f"查询ready任务失败: {str(e)}")
        return []
def get_tasks_by_date(self, task_date: str) -> List[Dict]:
    """
    Fetch all tasks for one task date, ordered by ``priority ASC``.

    Args:
        task_date: Task date value to match against the ``task_date`` column.

    Returns:
        List of task rows converted by ``self._dict_from_row``; an empty
        list when the query raised an exception.
    """
    try:
        conn = self.get_connection()
        try:
            ph = self._get_placeholder()
            cursor = self._execute_query(
                conn,
                f"SELECT * FROM ai_mip_query_task WHERE task_date = {ph} ORDER BY priority ASC",
                (task_date,)
            )
            rows = cursor.fetchall()
        finally:
            # Close even when the query or fetch fails.
            conn.close()
        return [self._dict_from_row(row) for row in rows]
    except Exception as e:
        logger.error(f"查询日期任务失败: {str(e)}")
        return []
def update_task_status(self, task_id: int, status: str,
                       error_message: str = None) -> bool:
    """
    Update a task's status and stamp the matching timestamp column.

    'doing' sets ``started_at``, 'finished' and 'failed' set
    ``finished_at``, and 'closed' sets ``closed_at`` (all to ``NOW()``).
    Any other status updates only the ``status`` column.

    Args:
        task_id: Task ID.
        status: Status: ready/doing/failed/finished/closed.
        error_message: Error text (on failure); stored in ``error_message``
            only when non-empty.

    Returns:
        True when the update was committed, False when an exception occurred.
    """
    # Status values that stamp a timestamp column when entered.
    timestamp_columns = {
        'doing': 'started_at',
        'finished': 'finished_at',
        'failed': 'finished_at',
        'closed': 'closed_at',
    }
    try:
        conn = self.get_connection()
        try:
            ph = self._get_placeholder()
            assignments = [f"status = {ph}"]
            params = [status]

            timestamp_field = timestamp_columns.get(status)
            if timestamp_field:
                assignments.append(f"{timestamp_field} = NOW()")

            if error_message:
                # Bind the message as a parameter: it is free text and may
                # contain quotes or '%' that would corrupt inline SQL.
                assignments.append(f"error_message = {ph}")
                params.append(error_message)

            params.append(task_id)
            sql = (
                f"UPDATE ai_mip_query_task SET {', '.join(assignments)} "
                f"WHERE id = {ph}"
            )
            cursor = conn.cursor()
            cursor.execute(sql, tuple(params))
            conn.commit()
        finally:
            # Close even when the update fails.
            conn.close()
        logger.info(f"更新任务状态: {task_id} -> {status}")
        return True
    except Exception as e:
        logger.error(f"更新任务状态失败: {str(e)}")
        return False
def increment_crawl_count(self, task_id: int,
                          crawl_count: int = 1,
                          valid_count: int = 0) -> bool:
    """
    Add to a task's crawl counters.

    ``crawl_url_count`` grows by ``crawl_count``; both ``valid_url_count``
    and ``current_count`` grow by ``valid_count``.

    Args:
        task_id: Task ID.
        crawl_count: Number of URLs crawled.
        valid_count: Number of valid URLs (with ads).

    Returns:
        True when the update was committed, False when an exception occurred.
    """
    try:
        conn = self.get_connection()
        try:
            ph = self._get_placeholder()
            sql = f"""
                UPDATE ai_mip_query_task
                SET crawl_url_count = crawl_url_count + {ph},
                    valid_url_count = valid_url_count + {ph},
                    current_count = current_count + {ph}
                WHERE id = {ph}
            """
            cursor = conn.cursor()
            # valid_count is bound twice: once per counter it feeds.
            cursor.execute(sql, (crawl_count, valid_count, valid_count, task_id))
            conn.commit()
        finally:
            # Close even when the update fails.
            conn.close()
        return True
    except Exception as e:
        logger.error(f"更新抓取计数失败: {str(e)}")
        return False
def check_threshold(self, task_id: int) -> bool:
    """
    Check whether a task has reached its threshold and close it if so.

    The task is loaded with ``get_task_by_id``; when its ``current_count``
    is at least ``threshold_max`` the status is set to 'closed' via
    ``update_task_status``.

    Args:
        task_id: Task ID.

    Returns:
        True if the threshold has been reached, False if it has not, the
        task was not found, or an exception occurred.
    """
    try:
        task = self.get_task_by_id(task_id)
        if task and task['current_count'] >= task['threshold_max']:
            self.update_task_status(task_id, 'closed')
            logger.info(f"任务达到阈值并关闭: {task['query_word']} ({task['current_count']}/{task['threshold_max']})")
            return True
    except Exception as e:
        logger.error(f"检查阈值失败: {str(e)}")
    # Reached for: task missing, below threshold, or an error logged above.
    return False
def get_task_statistics(self, task_date: str = None) -> Dict:
    """
    Aggregate task counts per status plus crawl totals.

    The result row carries ``total_tasks``, one ``*_count`` column for each
    of ready/doing/finished/failed/closed, and the summed
    ``crawl_url_count`` / ``valid_url_count`` as ``total_crawled`` /
    ``total_valid``.

    Args:
        task_date: Restrict the statistics to this task date; a falsy value
            (e.g. None) aggregates over all tasks.

    Returns:
        The aggregate row converted by ``self._dict_from_row``; an empty
        dict when no row came back or an exception occurred.
    """
    try:
        conn = self.get_connection()
        try:
            if task_date:
                ph = self._get_placeholder()
                where_clause = f"WHERE task_date = {ph}"
                params = (task_date,)
            else:
                where_clause = ""
                params = None
            sql = f"""
                SELECT
                    COUNT(*) as total_tasks,
                    SUM(CASE WHEN status = 'ready' THEN 1 ELSE 0 END) as ready_count,
                    SUM(CASE WHEN status = 'doing' THEN 1 ELSE 0 END) as doing_count,
                    SUM(CASE WHEN status = 'finished' THEN 1 ELSE 0 END) as finished_count,
                    SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed_count,
                    SUM(CASE WHEN status = 'closed' THEN 1 ELSE 0 END) as closed_count,
                    SUM(crawl_url_count) as total_crawled,
                    SUM(valid_url_count) as total_valid
                FROM ai_mip_query_task
                {where_clause}
            """
            cursor = self._execute_query(conn, sql, params)
            row = cursor.fetchone()
        finally:
            # Close even when the query or fetch fails.
            conn.close()
        return self._dict_from_row(row) if row else {}
    except Exception as e:
        logger.error(f"获取任务统计失败: {str(e)}")
        return {}