# Files
# ai_image_quary/update_keywords_from_excel.py
#
# 320 lines
# 13 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
根据Excel文件更新baidu_keyword表中符合条件的记录
"""
import pandas as pd
import logging
import os
import time
from database_config import DatabaseManager
from datetime import datetime
# Logging setup: emit DEBUG and above, each record stamped with time and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
def read_excel_keywords_with_department(excel_path, query_column='query', department_column='科室'):
    """Read the unique (keyword, department) pairs from an Excel sheet.

    Every pair is built from the two cells of a single spreadsheet row.
    Rows where either cell is missing, or is empty after
    ``str(value).strip()``, are ignored. Repeated pairs are collapsed to
    their first occurrence.

    Args:
        excel_path: Path of the Excel file, handed to ``pandas.read_excel``.
        query_column: Name of the keyword column (default ``'query'``).
        department_column: Name of the department column (default ``'科室'``).

    Returns:
        A list of ``(keyword, department)`` string tuples in first-seen
        order. An empty list when either column is absent from the sheet.

    Raises:
        Exception: Anything raised while reading or processing the file is
            logged with its traceback and re-raised unchanged.
    """
    try:
        df = pd.read_excel(excel_path)
        logger.info(f"成功读取Excel文件: {excel_path}")
        logger.info(f"Excel文件包含 {len(df)} 行数据")
        logger.info(f"Excel列名: {df.columns.tolist()}")

        # Both columns must be present before anything can be paired.
        if query_column not in df.columns:
            logger.error(f"未找到query列: {query_column}")
            return []
        if department_column not in df.columns:
            logger.error(f"未找到department列: {department_column}")
            return []

        # Drop incomplete rows as a unit. Dropping NaNs from each column
        # separately and re-pairing by position would shift every keyword
        # after a blank cell onto the wrong department.
        complete_rows = df[[query_column, department_column]].dropna()

        seen = set()
        unique_keyword_dept_pairs = []
        for raw_keyword, raw_department in complete_rows.itertuples(index=False, name=None):
            keyword = str(raw_keyword).strip()
            department = str(raw_department).strip()
            if not keyword or not department:
                # Whitespace-only cells survive dropna(); skip them here.
                continue
            pair = (keyword, department)
            if pair in seen:
                # Keep only the first occurrence of each combination.
                continue
            seen.add(pair)
            unique_keyword_dept_pairs.append(pair)

        logger.info(f"提取到 {len(unique_keyword_dept_pairs)} 个唯一的关键词-部门组合")
        return unique_keyword_dept_pairs
    except Exception as e:
        logger.error(f"读取Excel文件失败: {e}", exc_info=True)
        raise
def get_department_id(db_manager, department_name):
    """Look up the ``ai_departments.id`` of the department with the given name.

    Args:
        db_manager: Object exposing
            ``execute_query(sql, params, fetch_one=True)``.
        department_name: Value matched against
            ``ai_departments.department_name``.

    Returns:
        The first field of the row returned by the query.

    Raises:
        ValueError: If the query returns no row for ``department_name``.
        Exception: Any error raised while running the query or reading the
            row is logged with its traceback and re-raised unchanged.
    """
    sql = "SELECT id FROM ai_departments WHERE department_name = %s"
    try:
        result = db_manager.execute_query(sql, (department_name,), fetch_one=True)
        if result:
            return result[0]
    except Exception as e:
        logger.error(f"查询科室ID失败: {e}", exc_info=True)
        raise

    # Raised outside the try block on purpose: inside it, the blanket handler
    # above would catch this ValueError and log it a second time as a query
    # failure, although the query itself succeeded.
    error_msg = f"未找到科室 '{department_name}' 的ID请先在ai_departments表中添加该科室"
    logger.error(error_msg)
    raise ValueError(error_msg)
def get_author_info_by_department(db_manager, department_id):
    """Fetch one eligible author for a department from ``ai_authors``.

    An author is eligible when ``status = 'active'`` and
    ``daily_post_max > 0``; the query takes a single row (``LIMIT 1``).

    Args:
        db_manager: Object exposing
            ``execute_query(sql, params, fetch_one=True)``.
        department_id: Value matched against ``ai_authors.department_id``.

    Returns:
        ``(author_id, author_name)`` taken from the first two fields of the
        row, or ``(0, '')`` when no row is returned or the lookup raises.
    """
    no_author = (0, '')
    author_sql = (
        "SELECT id, author_name FROM ai_authors "
        "WHERE department_id = %s AND status = 'active' AND daily_post_max > 0 LIMIT 1"
    )
    try:
        row = db_manager.execute_query(author_sql, (department_id,), fetch_one=True)
        if not row:
            logger.warning(f"未找到科室ID {department_id} 下符合条件的活跃作者")
            # Fall back to placeholder values rather than None.
            return no_author
        return row[0], row[1]
    except Exception as e:
        # Best-effort lookup: log the failure and fall back to the placeholder.
        logger.error(f"查询作者信息失败: {e}", exc_info=True)
        return no_author
def update_keywords_from_excel(db_manager, keyword_dept_pairs, batch_size=100, sleep_seconds=0.1):
    """Fill department and author columns of matching ``baidu_keyword`` rows.

    For each ``(keyword, department)`` pair the function:

    1. counts the rows with that keyword, ``seed_id = 9999``,
       ``query_status = 'manual_review'`` and ``created_at`` strictly between
       ``2026-01-28 12:00:00`` and ``2026-01-28 19:53:00``; the pair is
       skipped when there are none;
    2. resolves the department ID via ``get_department_id``;
    3. picks an author via ``get_author_info_by_department``;
    4. updates ``department``, ``department_id``, ``author_id`` and
       ``author_name`` on the matching rows, then sleeps ``sleep_seconds``.

    A failure on one pair is logged and counted, and processing continues
    with the next pair.

    Args:
        db_manager: Object exposing ``execute_query(sql, params,
            fetch_one=True)`` and ``execute_update(sql, params,
            autocommit=True)``.
        keyword_dept_pairs: List of ``(keyword, department)`` tuples.
        batch_size: Emit a progress line every ``batch_size`` pairs. A value
            of zero or ``None`` is treated as 1.
        sleep_seconds: Pause after each update attempt.

    Returns:
        The number of pairs whose update reported at least one affected row.

    Raises:
        Exception: Anything escaping the per-pair handling is logged with its
            traceback and re-raised unchanged.
    """
    if not keyword_dept_pairs:
        logger.warning("没有关键词需要更新")
        return 0
    try:
        total = len(keyword_dept_pairs)
        # Guard the modulo below: with batch_size == 0 it would raise after a
        # successful update, and the same pair would then also be counted as
        # a failure.
        log_every = batch_size if batch_size and batch_size > 0 else 1

        logger.info(f"开始更新 {total} 个关键词-部门组合到数据库...")
        logger.info("采用逐条查询+更新模式,只更新存在的关键词")

        # Both statements restrict themselves to the same set of rows.
        check_sql = "SELECT COUNT(*) FROM baidu_keyword WHERE keyword = %s AND seed_id = 9999 AND created_at > '2026-01-28 12:00:00' AND created_at < '2026-01-28 19:53:00' AND query_status = 'manual_review'"
        update_sql = """
UPDATE baidu_keyword
SET department = %s, department_id = %s, author_id = %s, author_name = %s
WHERE keyword = %s AND seed_id = 9999 AND created_at > '2026-01-28 12:00:00' AND created_at < '2026-01-28 19:53:00' AND query_status = 'manual_review'
"""
        success_count = 0
        skip_count = 0
        failed_count = 0

        for idx, (keyword, department) in enumerate(keyword_dept_pairs, 1):
            try:
                logger.debug(f'[调试] 处理第 {idx}/{total} 条: {keyword}, 部门: {department}')

                # 1. Does the keyword exist among the targeted rows?
                result = db_manager.execute_query(check_sql, (keyword,), fetch_one=True)
                exists = result[0] > 0 if result else False
                if not exists:
                    skip_count += 1
                    logger.debug(f'[调试] 关键词不存在于指定条件中,跳过: {keyword}')
                    continue

                # 2. The department must exist; otherwise ValueError is raised.
                dept_id = get_department_id(db_manager, department)

                # 3. Author lookup falls back to (0, '') when nothing is found.
                author_id, author_name = get_author_info_by_department(db_manager, dept_id)

                # 4. Apply the update.
                logger.debug(f'[调试] 准备更新: {keyword}, 部门: {department}, 部门ID: {dept_id}, 作者ID: {author_id}, 作者名: {author_name}')
                affected = db_manager.execute_update(
                    update_sql,
                    (department, dept_id, author_id, author_name, keyword),
                    autocommit=True
                )
                # NOTE(review): a pair whose update reports 0 affected rows is
                # counted in none of the three counters.
                if affected > 0:
                    success_count += 1
                    logger.debug(f'[调试] 更新成功: {keyword}, 部门: {department}, 部门ID: {dept_id}, 作者ID: {author_id}, 作者名: {author_name}')

                # 5. Throttle after each update attempt.
                time.sleep(sleep_seconds)
            except ValueError as ve:
                # Unknown department: record the failure and move on.
                logger.error(f'[错误] 第 {idx} 条记录遇到错误: {ve}')
                failed_count += 1
            except Exception as e:
                failed_count += 1
                logger.warning(f'[调试] 处理失败 [{idx}/{total}]: keyword={keyword}, 部门={department},错误:{e}')
            finally:
                # Report progress for every pair, including skipped and failed
                # ones, so the final progress line is always emitted.
                if idx % log_every == 0 or idx == total:
                    progress = (idx / total) * 100
                    logger.info(f'[更新进度] {idx}/{total} ({progress:.1f}%) | 成功: {success_count} | 跳过: {skip_count} | 失败: {failed_count}')

        logger.info(f"更新完成!成功更新: {success_count} | 跳过不存在: {skip_count} | 失败: {failed_count}")
        return success_count
    except Exception as e:
        logger.error(f"更新关键词失败: {e}", exc_info=True)
        raise
def _choose_pairs_to_update(keyword_dept_pairs):
    """Ask on stdin whether to update every pair or only the first N.

    Keeps prompting until the answer is valid, then returns the full list
    (choice ``A``) or its first N items (choice ``B``).
    """
    total = len(keyword_dept_pairs)
    while True:
        choice = input("请选择更新方式: A) 全部更新 B) 测试模式(输入前N条数据): ").strip().upper()
        if choice == 'A':
            return keyword_dept_pairs
        if choice != 'B':
            print("请输入 A 或 B")
            continue
        try:
            test_count = int(input(f"请输入要测试的条数 (1-{total}): "))
        except ValueError:
            print("请输入有效的数字")
            continue
        if 1 <= test_count <= total:
            print(f"已选择更新前 {test_count} 条数据进行测试")
            return keyword_dept_pairs[:test_count]
        print(f"输入超出范围请输入1到{total}之间的数字")


def main():
    """Interactive entry point: read the Excel sheet, confirm, then update.

    Steps: load the keyword/department pairs, let the operator pick all pairs
    or a leading subset, preview up to ten pairs, ask for a final ``y``
    confirmation, and run ``update_keywords_from_excel``. Any exception from
    these steps is logged and not propagated.

    The connection settings can be overridden with the environment variables
    ``AI_ARTICLE_DB_HOST``, ``AI_ARTICLE_DB_PORT``, ``AI_ARTICLE_DB_USER``,
    ``AI_ARTICLE_DB_PASSWORD`` and ``AI_ARTICLE_DB_NAME``; when unset, the
    values built into this script are used.
    """
    excel_path = '/home/work/ai_improt_quary/副本query表-0128第一批.xlsx'

    # SECURITY: the fallback credentials below are committed in source.
    # TODO: rotate them and supply them through the environment only.
    db_config = {
        'host': os.environ.get('AI_ARTICLE_DB_HOST', '8.149.233.36'),
        'port': int(os.environ.get('AI_ARTICLE_DB_PORT', '3306')),
        'user': os.environ.get('AI_ARTICLE_DB_USER', 'ai_article_read'),
        'password': os.environ.get('AI_ARTICLE_DB_PASSWORD', '7aK_H2yvokVumr84lLNDt8fDBp6P'),
        'database': os.environ.get('AI_ARTICLE_DB_NAME', 'ai_article'),
        'charset': 'utf8mb4'
    }
    logger.info("=" * 60)
    logger.info("开始根据Excel更新baidu_keyword表中符合条件的记录")
    # Take the port from the config instead of hard-coding 3306, so the log
    # always matches the connection actually used.
    logger.info(f"数据库配置: {db_config['user']}@{db_config['host']}:{db_config['port']}/{db_config['database']}")
    logger.info("=" * 60)

    db_manager = DatabaseManager(db_config)
    try:
        # 1. Load the pairs from the Excel sheet.
        keyword_dept_pairs = read_excel_keywords_with_department(excel_path, 'query', '科室')
        if not keyword_dept_pairs:
            logger.warning("没有可更新的关键词,程序退出")
            return

        # Let the operator choose between a full run and a first-N test run.
        # The result is never empty: the list is non-empty here and the
        # helper only accepts N >= 1.
        print(f"\nExcel中共有 {len(keyword_dept_pairs)} 条数据")
        keyword_dept_pairs = _choose_pairs_to_update(keyword_dept_pairs)

        # Preview the first ten pairs.
        logger.info("\n关键词-部门预览前10个:")
        for i, (keyword, department) in enumerate(keyword_dept_pairs[:10], 1):
            logger.info(f" {i}. {keyword} (部门: {department})")
        if len(keyword_dept_pairs) > 10:
            logger.info(f" ... 还有 {len(keyword_dept_pairs) - 10} 个关键词-部门组合")

        # 2. Final confirmation before touching the database.
        print("\n" + "=" * 60)
        print(f"即将更新 {len(keyword_dept_pairs)} 个关键词-部门组合到 baidu_keyword 表")
        print("条件: seed_id=9999 AND created_at BETWEEN '2026-01-28 12:00:00' AND '2026-01-28 19:53:00' AND query_status='manual_review'")
        confirm = input("确认更新? (y/n): ").strip().lower()
        if confirm != 'y':
            logger.info("用户取消更新")
            return

        # 3. Run the update; batch_size=1 logs progress after every pair.
        success_count = update_keywords_from_excel(
            db_manager=db_manager,
            keyword_dept_pairs=keyword_dept_pairs,
            batch_size=1,
            sleep_seconds=0.1
        )
        logger.info("=" * 60)
        logger.info(f"✓ 更新完成!共成功更新 {success_count} 个关键词")
        logger.info("=" * 60)
    except Exception as e:
        logger.error(f"✗ 更新过程出错: {e}", exc_info=True)
        logger.info("=" * 60)
        logger.info("✗ 更新失败")
        logger.info("=" * 60)


if __name__ == '__main__':
    main()