320 lines
13 KiB
Python
320 lines
13 KiB
Python
|
|
"""
|
|||
|
|
根据Excel文件更新baidu_keyword表中符合条件的记录
|
|||
|
|
"""
|
|||
|
|
import pandas as pd
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
import time
|
|||
|
|
from database_config import DatabaseManager
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
# Logging setup: DEBUG level, each line formatted as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_excel_keywords_with_department(excel_path, query_column='query', department_column='科室'):
    """
    Read keyword/department pairs from an Excel file.

    Each pair is built from the two cells of a single spreadsheet row. Rows
    where either cell is missing, or is empty after stripping whitespace, are
    dropped as a whole, so a blank cell in one column can never shift values
    of the other column onto a different row.

    Args:
        excel_path: Path of the Excel file to read.
        query_column: Name of the keyword column, defaults to 'query'.
        department_column: Name of the department column, defaults to '科室'.

    Returns:
        List of unique (keyword, department) string tuples in first-seen
        order. An empty list is returned when either column is missing.

    Raises:
        Exception: Any error raised while reading or processing the file is
            logged with its traceback and re-raised.
    """
    try:
        df = pd.read_excel(excel_path)
        logger.info(f"成功读取Excel文件: {excel_path}")
        logger.info(f"Excel文件包含 {len(df)} 行数据")
        logger.info(f"Excel列名: {df.columns.tolist()}")

        # Both columns must exist before anything can be paired.
        if query_column not in df.columns:
            logger.error(f"未找到query列: {query_column}")
            return []

        if department_column not in df.columns:
            logger.error(f"未找到department列: {department_column}")
            return []

        # Keep only rows where BOTH cells are present. Filtering each column
        # separately and re-pairing by position would misalign every row that
        # follows a blank cell.
        complete = df[query_column].notna() & df[department_column].notna()
        cleaned_pairs = (
            (str(raw_keyword).strip(), str(raw_department).strip())
            for raw_keyword, raw_department in zip(
                df.loc[complete, query_column],
                df.loc[complete, department_column],
            )
        )

        # dict.fromkeys de-duplicates while preserving first-seen order;
        # all(pair) rejects pairs with an empty keyword or department.
        unique_keyword_dept_pairs = list(
            dict.fromkeys(pair for pair in cleaned_pairs if all(pair))
        )

        logger.info(f"提取到 {len(unique_keyword_dept_pairs)} 个唯一的关键词-部门组合")

        return unique_keyword_dept_pairs

    except Exception as e:
        logger.error(f"读取Excel文件失败: {e}", exc_info=True)
        raise
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_department_id(db_manager, department_name):
    """
    Look up a department's ID in the ai_departments table by its name.

    Args:
        db_manager: Database manager instance; it is called as
            execute_query(sql, params, fetch_one=True).
        department_name: Department name matched against the
            department_name column.

    Returns:
        The first column (id) of the matching row.

    Raises:
        ValueError: No row matches department_name.
        Exception: Any error raised by the query itself is logged with its
            traceback and re-raised.
    """
    sql = "SELECT id FROM ai_departments WHERE department_name = %s"

    # Only the query is guarded: a database error is logged here once.
    try:
        result = db_manager.execute_query(sql, (department_name,), fetch_one=True)
    except Exception as e:
        logger.error(f"查询科室ID失败: {e}", exc_info=True)
        raise

    if result:
        return result[0]

    # Raised outside the try block so the "not found" case is logged exactly
    # once, instead of being caught above and re-logged as a query failure.
    error_msg = f"未找到科室 '{department_name}' 的ID,请先在ai_departments表中添加该科室"
    logger.error(error_msg)
    raise ValueError(error_msg)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_author_info_by_department(db_manager, department_id):
    """
    Fetch one eligible author of a department from the ai_authors table.

    The query selects a single row whose status is 'active' and whose
    daily_post_max is greater than 0.

    Args:
        db_manager: Database manager instance; it is called as
            execute_query(sql, params, fetch_one=True).
        department_id: Department ID matched against the department_id column.

    Returns:
        (author_id, author_name) tuple. (0, '') is returned when no row
        matches or when the lookup raises; the error is logged, not raised.
    """
    author_sql = "SELECT id, author_name FROM ai_authors WHERE department_id = %s AND status = 'active' AND daily_post_max > 0 LIMIT 1"
    no_author = (0, '')  # default returned instead of None

    try:
        row = db_manager.execute_query(author_sql, (department_id,), fetch_one=True)

        # Guard clause: nothing eligible for this department.
        if not row:
            logger.warning(f"未找到科室ID {department_id} 下符合条件的活跃作者")
            return no_author

        author_id, author_name = row[0], row[1]
        return author_id, author_name

    except Exception as exc:
        # Best-effort lookup: log the failure and fall back to the default.
        logger.error(f"查询作者信息失败: {exc}", exc_info=True)
        return no_author
|
|||
|
|
|
|||
|
|
|
|||
|
|
def update_keywords_from_excel(db_manager, keyword_dept_pairs, batch_size=100, sleep_seconds=0.1):
    """
    Update matching baidu_keyword rows from (keyword, department) pairs.

    For every pair the function first checks that a row for the keyword
    exists under the fixed filter (seed_id, created_at window, query_status),
    then resolves the department ID and an author, and finally updates the
    department/author columns of the matching rows.

    Args:
        db_manager: Database manager instance; it is called as
            execute_query(sql, params, fetch_one=True) and
            execute_update(sql, params, autocommit=True).
        keyword_dept_pairs: List of (keyword, department) tuples.
        batch_size: Emit a progress log line every this many records. A value
            of 0 means "report only after the last record".
        sleep_seconds: Pause in seconds after each completed update,
            defaults to 0.1. Negative values are treated as 0.

    Returns:
        Number of pairs whose UPDATE affected at least one row.

    Raises:
        Exception: Errors outside the per-record handling are logged with
            their traceback and re-raised. Per-record errors are logged,
            counted as failures and do not stop the run.
    """
    if not keyword_dept_pairs:
        logger.warning("没有关键词需要更新")
        return 0

    try:
        total = len(keyword_dept_pairs)
        logger.info(f"开始更新 {total} 个关键词-部门组合到数据库...")
        logger.info("采用逐条查询+更新模式,只更新存在的关键词")

        # One shared filter, so the existence check and the UPDATE always
        # target exactly the same rows.
        row_filter = (
            "keyword = %s AND seed_id = 9999 "
            "AND created_at > '2026-01-28 12:00:00' AND created_at < '2026-01-28 19:53:00' "
            "AND query_status = 'manual_review'"
        )
        check_sql = f"SELECT COUNT(*) FROM baidu_keyword WHERE {row_filter}"
        update_sql = f"""
            UPDATE baidu_keyword
            SET department = %s, department_id = %s, author_id = %s, author_name = %s
            WHERE {row_filter}
        """

        # batch_size == 0 would raise ZeroDivisionError in the modulo below,
        # after the UPDATE had already been issued, and the record would be
        # mis-counted as failed. Fall back to a single report at the end.
        report_every = batch_size or total
        # time.sleep() raises ValueError for negative values.
        pause = max(sleep_seconds, 0)

        success_count = 0
        skip_count = 0
        failed_count = 0

        for idx, (keyword, department) in enumerate(keyword_dept_pairs, 1):
            update_completed = False
            try:
                logger.debug(f'[调试] 处理第 {idx}/{total} 条: {keyword}, 部门: {department}')

                # 1. Does the keyword exist under the fixed filter?
                result = db_manager.execute_query(check_sql, (keyword,), fetch_one=True)
                exists = result[0] > 0 if result else False

                if not exists:
                    skip_count += 1
                    logger.debug(f'[调试] 关键词不存在于指定条件中,跳过: {keyword}')
                else:
                    # 2. Department ID is mandatory; a missing one raises ValueError.
                    dept_id = get_department_id(db_manager, department)

                    # 3. Author lookup falls back to (0, '') when none is found.
                    author_id, author_name = get_author_info_by_department(db_manager, dept_id)

                    # 4. Update the matching rows.
                    logger.debug(f'[调试] 准备更新: {keyword}, 部门: {department}, 部门ID: {dept_id}, 作者ID: {author_id}, 作者名: {author_name}')
                    affected = db_manager.execute_update(
                        update_sql,
                        (department, dept_id, author_id, author_name, keyword),
                        autocommit=True
                    )
                    update_completed = True

                    if affected > 0:
                        success_count += 1
                        logger.debug(f'[调试] 更新成功: {keyword}, 部门: {department}, 部门ID: {dept_id}, 作者ID: {author_id}, 作者名: {author_name}')
                    else:
                        # Make the otherwise invisible "nothing changed" case traceable.
                        logger.debug(f'[调试] 更新未影响任何行: {keyword}')

            except ValueError as ve:
                # Typically a department missing from ai_departments: skip this record.
                logger.error(f'[错误] 第 {idx} 条记录遇到错误: {ve}')
                failed_count += 1
            except Exception as e:
                failed_count += 1
                logger.warning(f'[调试] 处理失败 [{idx}/{total}]: keyword={keyword}, 部门={department},错误:{e}')

            # 5. Progress is reported for every record, including skipped and
            #    failed ones, so the final line is never lost.
            if idx % report_every == 0 or idx == total:
                progress = (idx / total) * 100
                logger.info(f'[更新进度] {idx}/{total} ({progress:.1f}%) | 成功: {success_count} | 跳过: {skip_count} | 失败: {failed_count}')

            # 6. Throttle only after an update statement actually completed.
            if update_completed:
                time.sleep(pause)

        logger.info(f"更新完成!成功更新: {success_count} | 跳过不存在: {skip_count} | 失败: {failed_count}")
        return success_count

    except Exception as e:
        logger.error(f"更新关键词失败: {e}", exc_info=True)
        raise
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """
    Script entry point: read the Excel file, confirm with the operator, update.

    Flow:
        1. Read (keyword, department) pairs from the Excel file.
        2. Ask whether to process everything (A) or only the first N pairs (B).
        3. Log a preview of up to 10 pairs and ask for a y/n confirmation.
        4. Run update_keywords_from_excel and log the result.

    Any exception raised during these steps is logged and not re-raised.
    """
    # Path of the Excel file to import.
    excel_path = '/home/work/ai_improt_quary/副本query表-0128第一批.xlsx'

    # NOTE(review): these credentials are hard-coded in source control. They
    # are kept only as fallbacks so existing runs keep working; prefer the
    # AI_ARTICLE_DB_* environment variables, then rotate the password and
    # delete the literal defaults.
    db_config = {
        'host': os.environ.get('AI_ARTICLE_DB_HOST', '8.149.233.36'),
        'port': int(os.environ.get('AI_ARTICLE_DB_PORT', 3306)),
        'user': os.environ.get('AI_ARTICLE_DB_USER', 'ai_article_read'),
        'password': os.environ.get('AI_ARTICLE_DB_PASSWORD', '7aK_H2yvokVumr84lLNDt8fDBp6P'),
        'database': os.environ.get('AI_ARTICLE_DB_NAME', 'ai_article'),
        'charset': 'utf8mb4'
    }

    logger.info("=" * 60)
    logger.info("开始根据Excel更新baidu_keyword表中符合条件的记录")
    # Log the configured port rather than a literal 3306.
    logger.info(f"数据库配置: {db_config['user']}@{db_config['host']}:{db_config['port']}/{db_config['database']}")
    logger.info("=" * 60)

    db_manager = DatabaseManager(db_config)

    try:
        # 1. Read the Excel file.
        keyword_dept_pairs = read_excel_keywords_with_department(excel_path, 'query', '科室')

        if not keyword_dept_pairs:
            logger.warning("没有可更新的关键词,程序退出")
            return

        # Ask whether to run on everything or on a test subset.
        total = len(keyword_dept_pairs)
        print(f"\nExcel中共有 {total} 条数据")
        while True:
            choice = input("请选择更新方式: A) 全部更新 B) 测试模式(输入前N条数据): ").strip().upper()
            if choice == 'A':
                break
            if choice != 'B':
                print("请输入 A 或 B")
                continue

            try:
                test_count = int(input(f"请输入要测试的条数 (1-{total}): "))
            except ValueError:
                print("请输入有效的数字")
                continue

            if 1 <= test_count <= total:
                # test_count >= 1, so the selection can never be empty.
                keyword_dept_pairs = keyword_dept_pairs[:test_count]
                print(f"已选择更新前 {test_count} 条数据进行测试")
                break
            print(f"输入超出范围,请输入1到{total}之间的数字")

        # Preview the first 10 pairs.
        logger.info("\n关键词-部门预览(前10个):")
        for i, (keyword, department) in enumerate(keyword_dept_pairs[:10], 1):
            logger.info(f" {i}. {keyword} (部门: {department})")

        if len(keyword_dept_pairs) > 10:
            logger.info(f" ... 还有 {len(keyword_dept_pairs) - 10} 个关键词-部门组合")

        # 2. Confirm. The condition text mirrors the strict > / < bounds used
        #    by the SQL in update_keywords_from_excel (not an inclusive BETWEEN).
        print("\n" + "=" * 60)
        print(f"即将更新 {len(keyword_dept_pairs)} 个关键词-部门组合到 baidu_keyword 表")
        print("条件: seed_id=9999 AND created_at > '2026-01-28 12:00:00' AND created_at < '2026-01-28 19:53:00' AND query_status='manual_review'")
        confirm = input("确认更新? (y/n): ").strip().lower()

        if confirm != 'y':
            logger.info("用户取消更新")
            return

        # 3. Run the update; batch_size=1 logs progress after every record.
        success_count = update_keywords_from_excel(
            db_manager=db_manager,
            keyword_dept_pairs=keyword_dept_pairs,
            batch_size=1,
            sleep_seconds=0.1
        )

        logger.info("=" * 60)
        logger.info(f"✓ 更新完成!共成功更新 {success_count} 个关键词")
        logger.info("=" * 60)

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        logger.error(f"✗ 更新过程出错: {e}", exc_info=True)
        logger.info("=" * 60)
        logger.info("✗ 更新失败")
        logger.info("=" * 60)


if __name__ == '__main__':
    main()
|