commit
This commit is contained in:
285
db_manager.py
285
db_manager.py
@@ -41,8 +41,7 @@ class DatabaseManager:
|
||||
'database': Config.MYSQL_DATABASE,
|
||||
'charset': 'utf8mb4'
|
||||
}
|
||||
logger.info(f"MySQL数据库初始化: {Config.MYSQL_HOST}:{Config.MYSQL_PORT}/{Config.MYSQL_DATABASE}")
|
||||
|
||||
|
||||
def get_connection(self) -> 'pymysql.Connection':
|
||||
"""获取MySQL数据库连接"""
|
||||
conn = pymysql.connect(**self.db_config)
|
||||
@@ -74,7 +73,8 @@ class SiteManager(DatabaseManager):
|
||||
"""站点管理"""
|
||||
|
||||
def add_site(self, site_url: str, site_name: str = None,
|
||||
site_dimension: str = None, frequency: int = None,
|
||||
site_dimension: str = None, query_word: str = None,
|
||||
frequency: int = None,
|
||||
time_start: str = None, time_end: str = None,
|
||||
interval_minutes: int = None) -> Optional[int]:
|
||||
"""
|
||||
@@ -84,6 +84,7 @@ class SiteManager(DatabaseManager):
|
||||
site_url: 网站URL
|
||||
site_name: 网站名称
|
||||
site_dimension: 网站维度标签
|
||||
query_word: 来源查询词(从哪个关键词抓取)
|
||||
frequency: 频次
|
||||
time_start: 开始时间
|
||||
time_end: 结束时间
|
||||
@@ -108,8 +109,8 @@ class SiteManager(DatabaseManager):
|
||||
INSERT INTO ai_mip_site (
|
||||
site_url, site_name, status, frequency,
|
||||
time_start, time_end, interval_minutes,
|
||||
site_dimension, created_by
|
||||
) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
|
||||
site_dimension, query_word, created_by
|
||||
) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
|
||||
"""
|
||||
|
||||
cursor.execute(sql, (
|
||||
@@ -119,8 +120,9 @@ class SiteManager(DatabaseManager):
|
||||
frequency or 1,
|
||||
time_start or '09:00:00',
|
||||
time_end or '21:00:00',
|
||||
interval_minutes or 60,
|
||||
interval_minutes or 30,
|
||||
site_dimension,
|
||||
query_word, # 新增:来源查询词
|
||||
'system'
|
||||
))
|
||||
|
||||
@@ -128,7 +130,7 @@ class SiteManager(DatabaseManager):
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
logger.info(f"成功添加站点: {site_url} (ID: {site_id})")
|
||||
logger.info(f"成功添加站点: {site_url} (ID: {site_id}, 查询词: {query_word})")
|
||||
return site_id
|
||||
|
||||
except pymysql.IntegrityError:
|
||||
@@ -545,3 +547,272 @@ class StatisticsManager(DatabaseManager):
|
||||
except Exception as e:
|
||||
logger.error(f"获取站点统计失败: {str(e)}")
|
||||
return {}
|
||||
|
||||
|
||||
class QueryTaskManager(DatabaseManager):
    """Manage rows of the ``ai_mip_query_task`` table.

    Every method opens its own connection through ``self.get_connection()``
    and always closes it, including when a statement fails.  Database errors
    are logged and reported through the return value (``None`` / ``False`` /
    empty container) rather than raised to the caller.
    """

    # Status -> timestamp column stamped with NOW() when a task enters that
    # status.  Statuses missing here (e.g. 'ready') only update ``status``.
    # The column names are fixed literals, so splicing them into SQL is safe.
    _STATUS_TIMESTAMP_FIELDS = {
        'doing': 'started_at',
        'finished': 'finished_at',
        'failed': 'finished_at',
        'closed': 'closed_at',
    }

    def create_task(self, query_word: str, task_date: Optional[str] = None,
                    query_type: str = 'keyword', threshold_max: int = 100,
                    priority: int = 5, category: Optional[str] = None,
                    source_platform: str = 'baidu',
                    created_by: str = 'system',
                    remark: Optional[str] = None) -> Optional[int]:
        """Insert a new query task.

        Args:
            query_word: The query word.
            task_date: Task date formatted as YYYYMMDD; defaults to today.
            query_type: Query type.
            threshold_max: Maximum number of items to crawl.
            priority: Priority, 1-10.
            category: Category label.
            source_platform: Source platform.
            created_by: Creator.
            remark: Free-form remark.

        Returns:
            The new task ID, or ``None`` if the insert violated an integrity
            constraint (logged as "task already exists") or failed for any
            other reason.
        """
        try:
            if task_date is None:
                task_date = datetime.now().strftime('%Y%m%d')

            ph = self._get_placeholder()
            sql = f"""
                INSERT INTO ai_mip_query_task (
                    query_word, query_type, task_date, threshold_max,
                    priority, category, source_platform, created_by, remark
                ) VALUES ({ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph}, {ph})
            """

            conn = self.get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute(sql, (
                    query_word, query_type, task_date, threshold_max,
                    priority, category, source_platform, created_by, remark
                ))
                task_id = cursor.lastrowid
                conn.commit()
            finally:
                # Close on failure too, so a failed insert does not leak
                # the connection.
                conn.close()

            logger.info(f"创建查询任务成功: {query_word} (ID: {task_id})")
            return task_id

        except pymysql.IntegrityError:
            logger.warning(f"查询任务已存在: {query_word} @ {task_date}")
            return None
        except Exception as e:
            logger.error(f"创建查询任务失败: {str(e)}")
            return None

    def get_task_by_id(self, task_id: int) -> Optional[Dict]:
        """Fetch a single task by its ID.

        Args:
            task_id: Task ID.

        Returns:
            The task row converted by ``self._dict_from_row``, or ``None``
            when no row matches or the query fails.
        """
        try:
            ph = self._get_placeholder()
            conn = self.get_connection()
            try:
                cursor = self._execute_query(
                    conn,
                    f"SELECT * FROM ai_mip_query_task WHERE id = {ph}",
                    (task_id,)
                )
                row = cursor.fetchone()
            finally:
                conn.close()
            return self._dict_from_row(row) if row else None
        except Exception as e:
            logger.error(f"查询任务失败: {str(e)}")
            return None

    def get_ready_tasks(self, limit: Optional[int] = None) -> List[Dict]:
        """Fetch tasks whose status is ``'ready'``.

        Rows are ordered by ``priority`` ascending, then ``created_at``
        ascending.

        Args:
            limit: Maximum number of rows to return; a falsy value
                (``None`` or ``0``) means no limit.

        Returns:
            A list of task dicts; empty when nothing matches or the query
            fails.
        """
        try:
            sql = ("SELECT * FROM ai_mip_query_task WHERE status = 'ready' "
                   "ORDER BY priority ASC, created_at ASC")
            params = None

            if limit:
                # Bind the limit instead of splicing it into the SQL text;
                # int() rejects anything that is not a plain number.
                sql += f" LIMIT {self._get_placeholder()}"
                params = (int(limit),)

            conn = self.get_connection()
            try:
                cursor = self._execute_query(conn, sql, params)
                rows = cursor.fetchall()
            finally:
                conn.close()
            return [self._dict_from_row(row) for row in rows]
        except Exception as e:
            logger.error(f"查询ready任务失败: {str(e)}")
            return []

    def get_tasks_by_date(self, task_date: str) -> List[Dict]:
        """Fetch all tasks for one task date, ordered by priority ascending.

        Args:
            task_date: Task date to match against the ``task_date`` column.

        Returns:
            A list of task dicts; empty when nothing matches or the query
            fails.
        """
        try:
            ph = self._get_placeholder()
            conn = self.get_connection()
            try:
                cursor = self._execute_query(
                    conn,
                    f"SELECT * FROM ai_mip_query_task WHERE task_date = {ph} ORDER BY priority ASC",
                    (task_date,)
                )
                rows = cursor.fetchall()
            finally:
                conn.close()
            return [self._dict_from_row(row) for row in rows]
        except Exception as e:
            logger.error(f"查询日期任务失败: {str(e)}")
            return []

    def update_task_status(self, task_id: int, status: str,
                           error_message: Optional[str] = None) -> bool:
        """Update a task's status and the matching timestamp column.

        ``started_at`` is stamped for ``'doing'``, ``finished_at`` for
        ``'finished'`` / ``'failed'`` and ``closed_at`` for ``'closed'``;
        any other status only changes the ``status`` column.

        Args:
            task_id: Task ID.
            status: New status: ready/doing/failed/finished/closed.
            error_message: Error text to store (used on failure); ignored
                when empty or ``None``.

        Returns:
            ``True`` if the statement executed and was committed, ``False``
            on any error.
        """
        try:
            ph = self._get_placeholder()

            # Build the SET clause and its parameters side by side so every
            # caller-supplied value is bound, never interpolated.
            assignments = [f"status = {ph}"]
            params = [status]

            timestamp_field = self._STATUS_TIMESTAMP_FIELDS.get(status)
            if timestamp_field:
                assignments.append(f"{timestamp_field} = NOW()")

            if error_message:
                # Bound as a parameter: error text routinely contains
                # quotes and '%' that would corrupt a spliced statement.
                assignments.append(f"error_message = {ph}")
                params.append(error_message)

            params.append(task_id)
            sql = (f"UPDATE ai_mip_query_task SET {', '.join(assignments)} "
                   f"WHERE id = {ph}")

            conn = self.get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute(sql, tuple(params))
                conn.commit()
            finally:
                conn.close()

            logger.info(f"更新任务状态: {task_id} -> {status}")
            return True

        except Exception as e:
            logger.error(f"更新任务状态失败: {str(e)}")
            return False

    def increment_crawl_count(self, task_id: int,
                              crawl_count: int = 1,
                              valid_count: int = 0) -> bool:
        """Add to a task's crawl counters.

        Args:
            task_id: Task ID.
            crawl_count: Number of crawled URLs to add to
                ``crawl_url_count``.
            valid_count: Number of valid URLs (with ads) to add to both
                ``valid_url_count`` and ``current_count``.

        Returns:
            ``True`` if the update was committed, ``False`` on any error.
        """
        try:
            ph = self._get_placeholder()
            sql = f"""
                UPDATE ai_mip_query_task
                SET crawl_url_count = crawl_url_count + {ph},
                    valid_url_count = valid_url_count + {ph},
                    current_count = current_count + {ph}
                WHERE id = {ph}
            """

            conn = self.get_connection()
            try:
                cursor = conn.cursor()
                # valid_count is passed twice on purpose: current_count
                # advances by the number of valid URLs.
                cursor.execute(sql, (crawl_count, valid_count, valid_count, task_id))
                conn.commit()
            finally:
                conn.close()

            return True

        except Exception as e:
            logger.error(f"更新抓取计数失败: {str(e)}")
            return False

    def check_threshold(self, task_id: int) -> bool:
        """Close the task if ``current_count`` has reached ``threshold_max``.

        Args:
            task_id: Task ID.

        Returns:
            ``True`` if the threshold was reached (the task is then set to
            ``'closed'``); ``False`` if it was not reached, the task does
            not exist, or an error occurred.
        """
        try:
            task = self.get_task_by_id(task_id)
            if not task:
                return False

            if task['current_count'] >= task['threshold_max']:
                self.update_task_status(task_id, 'closed')
                logger.info(f"任务达到阈值并关闭: {task['query_word']} ({task['current_count']}/{task['threshold_max']})")
                return True

            return False

        except Exception as e:
            logger.error(f"检查阈值失败: {str(e)}")
            return False

    def get_task_statistics(self, task_date: Optional[str] = None) -> Dict:
        """Aggregate task counts per status plus crawl totals.

        Args:
            task_date: Restrict the statistics to this task date; ``None``
                (or any falsy value) aggregates over all tasks.

        Returns:
            A dict with ``total_tasks``, ``ready_count``, ``doing_count``,
            ``finished_count``, ``failed_count``, ``closed_count``,
            ``total_crawled`` and ``total_valid``; an empty dict on error.
        """
        try:
            if task_date:
                ph = self._get_placeholder()
                where_clause = f"WHERE task_date = {ph}"
                params = (task_date,)
            else:
                where_clause = ""
                params = None

            sql = f"""
                SELECT
                    COUNT(*) as total_tasks,
                    SUM(CASE WHEN status = 'ready' THEN 1 ELSE 0 END) as ready_count,
                    SUM(CASE WHEN status = 'doing' THEN 1 ELSE 0 END) as doing_count,
                    SUM(CASE WHEN status = 'finished' THEN 1 ELSE 0 END) as finished_count,
                    SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed_count,
                    SUM(CASE WHEN status = 'closed' THEN 1 ELSE 0 END) as closed_count,
                    SUM(crawl_url_count) as total_crawled,
                    SUM(valid_url_count) as total_valid
                FROM ai_mip_query_task
                {where_clause}
            """

            conn = self.get_connection()
            try:
                cursor = self._execute_query(conn, sql, params)
                row = cursor.fetchone()
            finally:
                conn.close()

            return self._dict_from_row(row) if row else {}

        except Exception as e:
            logger.error(f"获取任务统计失败: {str(e)}")
            return {}
|
||||
|
||||
Reference in New Issue
Block a user