feat: 完善代理重试机制,添加数据验证告警,新增README文档
This commit is contained in:
548
batch_import_history.py
Normal file
548
batch_import_history.py
Normal file
@@ -0,0 +1,548 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
批量历史数据导入脚本
|
||||
|
||||
功能:
|
||||
1. 按日期范围循环抓取百家号数据
|
||||
2. 每次抓取后自动导出CSV
|
||||
3. 自动导入数据库
|
||||
4. 记录执行日志和错误信息
|
||||
5. 自动重试机制(针对网络、代理等临时性错误)
|
||||
|
||||
使用方法:
|
||||
# 基本用法
|
||||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25
|
||||
|
||||
# 跳过失败的日期继续执行
|
||||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed
|
||||
|
||||
# 自定义重试次数(默认3次)
|
||||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --max-retries 5
|
||||
|
||||
# 组合使用
|
||||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed --max-retries 5
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Tuple, Optional
|
||||
import json
|
||||
import time
|
||||
|
||||
# On Windows, re-wrap stdout/stderr as UTF-8 text streams unless the current
# stream is already a TextIOWrapper whose encoding is exactly 'utf-8'.
if sys.platform == 'win32':
    import io

    if not (isinstance(sys.stdout, io.TextIOWrapper) and sys.stdout.encoding == 'utf-8'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

    if not (isinstance(sys.stderr, io.TextIOWrapper) and sys.stderr.encoding == 'utf-8'):
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
|
||||
|
||||
|
||||
class BatchImporter:
    """Run the fetch -> CSV export -> database import pipeline for each day.

    For every date in the configured range, three helper scripts located next
    to this file (``bjh_analytics_date.py``, ``export_to_csv.py`` and
    ``import_csv_to_database.py``) are executed as child processes. Output is
    echoed to the console, appended to a timestamped log file under ``logs/``
    and a per-date result record is kept in ``self.results``.
    """

    # Wall-clock limit (seconds) for a single child process.
    COMMAND_TIMEOUT_SECONDS = 600

    # Substrings that mark an error message as transient. They are compared
    # against the lower-cased message, so every entry must be lower-case.
    RETRYABLE_KEYWORDS = (
        '超时',
        'timeout',
        '连接',
        'connection',
        '代理',
        'proxy',
        '网络',
        'network',
        'remotedisconnected',
        'connectionerror',
        'proxyerror',
    )

    # How many trailing output lines are attached to a failure message so the
    # retry classifier can see what the child process actually reported.
    ERROR_TAIL_LINES = 5

    def __init__(self, start_date: str, end_date: str, skip_failed: bool = False, max_retries: int = 3):
        """Set up paths, the log file and the (empty) result list.

        Args:
            start_date: First date to process (YYYY-MM-DD).
            end_date: Last date to process, inclusive (YYYY-MM-DD).
            skip_failed: Keep going with the next date after a failure.
            max_retries: Maximum number of retries per step (default: 3).

        Raises:
            ValueError: If a date does not match YYYY-MM-DD.
            FileNotFoundError: If one of the helper scripts is missing.
        """
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = datetime.strptime(end_date, '%Y-%m-%d')
        self.skip_failed = skip_failed
        self.max_retries = max_retries

        # Helper scripts that make up the pipeline.
        self.analytics_script = os.path.join(self.script_dir, 'bjh_analytics_date.py')
        self.export_script = os.path.join(self.script_dir, 'export_to_csv.py')
        self.import_script = os.path.join(self.script_dir, 'import_csv_to_database.py')

        # Log directory and per-run log file; exist_ok avoids a
        # check-then-create race when two runs start at the same time.
        self.log_dir = os.path.join(self.script_dir, 'logs')
        os.makedirs(self.log_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.log_file = os.path.join(self.log_dir, f'batch_import_{timestamp}.log')

        # One dict per processed date, appended by process_date().
        self.results = []

        self._validate_scripts()

    def _validate_scripts(self):
        """Check that all helper scripts exist.

        Raises:
            FileNotFoundError: If at least one script is missing; the missing
                names are printed first.
        """
        scripts = {
            'bjh_analytics_date.py': self.analytics_script,
            'export_to_csv.py': self.export_script,
            'import_csv_to_database.py': self.import_script,
        }

        missing_scripts = [name for name, path in scripts.items() if not os.path.exists(path)]

        if missing_scripts:
            print("[X] 缺少必要的脚本文件:")
            for script in missing_scripts:
                print(f" - {script}")
            raise FileNotFoundError("脚本文件缺失")

    def log(self, message: str, level: str = 'INFO'):
        """Print a timestamped line and append it to the log file.

        Args:
            message: Text to record.
            level: Severity label (INFO, WARNING, ERROR).
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_line = f"[{timestamp}] [{level}] {message}"

        print(log_line)

        # Logging is best-effort: a failed write must not abort the batch.
        try:
            with open(self.log_file, 'a', encoding='utf-8') as f:
                f.write(log_line + '\n')
        except Exception as e:
            print(f"[!] 写入日志文件失败: {e}")

    def get_date_list(self) -> List[str]:
        """Return every date from start_date to end_date inclusive.

        Returns:
            Date strings formatted as YYYY-MM-DD; empty when end_date is
            earlier than start_date.
        """
        total_days = (self.end_date - self.start_date).days
        return [
            (self.start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
            for offset in range(total_days + 1)
        ]

    @classmethod
    def _is_retryable_error(cls, error) -> bool:
        """Return True when the error text contains a transient-failure keyword."""
        lowered = str(error).lower()
        return any(keyword in lowered for keyword in cls.RETRYABLE_KEYWORDS)

    def run_command_with_retry(self, cmd: List[str], step_name: str, max_retries: Optional[int] = None) -> Tuple[bool, str]:
        """Run a command, retrying on transient (network/proxy/timeout) errors.

        Args:
            cmd: Command as an argument list.
            step_name: Human-readable step name used in log lines.
            max_retries: Retry limit; defaults to the instance setting.
                Negative values are treated as 0 so the command always runs
                at least once.

        Returns:
            (success, error message); the message is empty on success.
        """
        if max_retries is None:
            max_retries = self.max_retries
        max_retries = max(0, max_retries)

        last_error = ""

        for attempt in range(max_retries + 1):
            if attempt > 0:
                # Linear back-off before each retry: 5s, 10s, 15s, ...
                wait_time = attempt * 5
                self.log(f"{step_name} 第{attempt}次重试,等待 {wait_time} 秒...", level='WARNING')
                time.sleep(wait_time)

            success, error = self.run_command(cmd, step_name)

            if success:
                if attempt > 0:
                    self.log(f"{step_name} 重试成功!(第{attempt}次重试)", level='INFO')
                return True, ""

            last_error = error

            if attempt == max_retries:
                # Retry budget exhausted; fall through to the final log line.
                break

            if self._is_retryable_error(error):
                self.log(f"{step_name} 出现可重试错误: {error}", level='WARNING')
            else:
                # Permanent error: retrying would only waste time.
                self.log(f"{step_name} 出现不可重试错误,停止重试: {error}", level='ERROR')
                return False, error

        self.log(f"{step_name} 失败,已达最大重试次数 ({max_retries})", level='ERROR')
        return False, last_error

    def run_command(self, cmd: List[str], step_name: str, timeout: Optional[float] = None) -> Tuple[bool, str]:
        """Run one child process, streaming its output to console and log.

        Args:
            cmd: Command as an argument list.
            step_name: Human-readable step name used in log lines.
            timeout: Wall-clock limit in seconds; defaults to
                COMMAND_TIMEOUT_SECONDS (10 minutes).

        Returns:
            (success, error message). On a non-zero exit the message holds the
            return code followed by the last few output lines, so callers can
            tell transient failures from permanent ones.
        """
        import threading  # local import, like the other lazy imports in this file

        if timeout is None:
            timeout = self.COMMAND_TIMEOUT_SECONDS

        try:
            self.log(f"执行命令: {' '.join(cmd)}")

            process = subprocess.Popen(
                cmd,
                cwd=self.script_dir,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr into stdout
                text=True,
                encoding='utf-8',
                errors='replace',  # an undecodable byte must not abort the reader
                bufsize=1,  # line buffered
            )

            # Reading stdout blocks until the child closes it, so a timeout on
            # wait() alone would never fire for a hung child. The watchdog
            # kills the child, which ends the read loop.
            timed_out = threading.Event()

            def kill_on_timeout():
                timed_out.set()
                process.kill()

            watchdog = threading.Timer(timeout, kill_on_timeout)
            watchdog.daemon = True
            watchdog.start()

            output_lines = []
            try:
                # The context manager closes the pipe and reaps the child.
                with process:
                    try:
                        for raw_line in process.stdout:
                            line = raw_line.rstrip()
                            if not line:
                                continue
                            print(f" {line}")
                            output_lines.append(line)
                            # Progress marker every 10 lines keeps the log compact.
                            if len(output_lines) % 10 == 0:
                                self.log(f"{step_name} 运行中... (已输出{len(output_lines)}行)")
                    except Exception as e:
                        self.log(f"读取输出异常: {e}", level='WARNING')

                    return_code = process.wait()
            finally:
                watchdog.cancel()

            full_output = '\n'.join(output_lines)
            if full_output:
                self.log(f"{step_name} 输出:\n{full_output}")

            if timed_out.is_set():
                error_msg = f"命令执行超时(>{timeout / 60:g}分钟)"
                self.log(f"[X] {step_name} 失败: {error_msg}", level='ERROR')
                return False, error_msg

            if return_code == 0:
                self.log(f"[✓] {step_name} 执行成功", level='INFO')
                return True, ""

            error_msg = f"返回码: {return_code}"
            tail = output_lines[-self.ERROR_TAIL_LINES:]
            if tail:
                # Attach what the child printed last; the return code alone
                # never matches a retryable keyword.
                error_msg += " | 输出末尾: " + " / ".join(tail)
            self.log(f"[X] {step_name} 执行失败: {error_msg}", level='ERROR')
            return False, error_msg

        except Exception as e:
            error_msg = str(e)
            self.log(f"[X] {step_name} 异常: {error_msg}", level='ERROR')
            import traceback
            self.log(f"异常堆栈:\n{traceback.format_exc()}", level='ERROR')
            return False, error_msg

    def process_date(self, date_str: str) -> bool:
        """Run the three pipeline steps for one date and record the outcome.

        Args:
            date_str: Date to process (YYYY-MM-DD).

        Returns:
            True when all three steps succeeded, False otherwise. In both
            cases a result dict is appended to ``self.results``.
        """
        self.log("="*70)
        self.log(f"开始处理日期: {date_str}")
        self.log("="*70)

        result = {
            'date': date_str,
            'start_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'steps': {},
            'success': False,
            'error': None
        }

        # (result key, banner log line, step label, command)
        steps = [
            (
                'analytics',
                f"\n[步骤 1/3] 抓取 {date_str} 的数据...",
                "数据抓取",
                [
                    sys.executable,
                    self.analytics_script,
                    date_str,
                    '--proxy',
                    '--database',
                    '--no-confirm',  # skip the interactive prompt
                ],
            ),
            (
                'export',
                "\n[步骤 2/3] 导出CSV文件...",
                "CSV导出",
                [
                    sys.executable,
                    self.export_script,
                    '--mode', 'csv',
                    '--no-confirm',  # skip the interactive prompt
                ],
            ),
            (
                'import',
                "\n[步骤 3/3] 导入数据库...",
                "数据库导入",
                [
                    sys.executable,
                    self.import_script,
                ],
            ),
        ]

        for position, (key, banner, label, cmd) in enumerate(steps):
            if position > 0:
                # Short pause between steps so the previous step's files are
                # fully written before the next one starts.
                time.sleep(2)

            self.log(banner)
            success, error = self.run_command_with_retry(cmd, f"{label} ({date_str})")
            result['steps'][key] = {'success': success, 'error': error}

            if not success:
                result['error'] = f"{label}失败: {error}"
                break
        else:
            result['success'] = True

        result['end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.results.append(result)

        if not result['success']:
            return False

        self.log(f"\n[✓] {date_str} 处理完成!")
        self.log("="*70 + "\n")
        return True

    def run(self):
        """Ask for confirmation, process every date, then print a summary."""
        dates = self.get_date_list()

        print("\n" + "="*70)
        print("批量历史数据导入")
        print("="*70)
        print(f"开始日期: {self.start_date.strftime('%Y-%m-%d')}")
        print(f"结束日期: {self.end_date.strftime('%Y-%m-%d')}")
        print(f"总天数: {len(dates)} 天")
        print(f"跳过失败: {'是' if self.skip_failed else '否'}")
        print(f"最大重试次数: {self.max_retries}")
        print(f"日志文件: {self.log_file}")
        print("="*70)

        # Interactive confirmation before anything is executed.
        confirm = input("\n是否开始执行? (y/n): ").strip().lower()
        if confirm != 'y':
            print("已取消")
            return

        self.log(f"开始批量导入: {len(dates)} 个日期")
        start_time = datetime.now()

        success_count = 0
        failed_count = 0

        for idx, date_str in enumerate(dates, 1):
            print(f"\n{'='*70}")
            print(f"进度: [{idx}/{len(dates)}] {date_str}")
            print(f"{'='*70}")

            if self.process_date(date_str):
                success_count += 1
            else:
                failed_count += 1

                if not self.skip_failed:
                    # Default mode: the first failure stops the whole batch.
                    self.log(f"[X] 日期 {date_str} 处理失败,停止执行", level='ERROR')
                    break

                self.log(f"[!] 日期 {date_str} 处理失败,跳过继续", level='WARNING')

            # Pause between dates to avoid hammering the remote service.
            if idx < len(dates):
                delay = 5
                self.log(f"等待 {delay} 秒后处理下一个日期...")
                time.sleep(delay)

        duration = datetime.now() - start_time

        print("\n" + "="*70)
        print("批量导入完成")
        print("="*70)
        print(f"总耗时: {duration}")
        print(f"成功: {success_count} 天")
        print(f"失败: {failed_count} 天")
        print(f"日志文件: {self.log_file}")
        print("="*70)

        self.log("="*70)
        self.log(f"批量导入完成: 成功 {success_count} 天, 失败 {failed_count} 天")
        self.log(f"总耗时: {duration}")
        self.log("="*70)

        self._save_results()

        # List the failed dates with their recorded error.
        if failed_count > 0:
            print("\n失败的日期:")
            for r in self.results:
                if not r['success']:
                    print(f" - {r['date']}: {r.get('error', '未知错误')}")

    def _save_results(self):
        """Write a JSON summary of all processed dates into the log directory.

        Failures while saving are logged and otherwise ignored.
        """
        try:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            result_file = os.path.join(self.log_dir, f'batch_result_{timestamp}.json')

            summary = {
                'start_date': self.start_date.strftime('%Y-%m-%d'),
                'end_date': self.end_date.strftime('%Y-%m-%d'),
                # Number of dates actually processed (may be fewer than the
                # requested range when the batch stopped early).
                'total_dates': len(self.results),
                'success_count': sum(1 for r in self.results if r['success']),
                'failed_count': sum(1 for r in self.results if not r['success']),
                'results': self.results
            }

            with open(result_file, 'w', encoding='utf-8') as f:
                json.dump(summary, f, ensure_ascii=False, indent=2)

            self.log(f"执行结果已保存: {result_file}")

        except Exception as e:
            self.log(f"保存执行结果失败: {e}", level='ERROR')
|
||||
|
||||
|
||||
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 when every processed date succeeded (or the run
        was cancelled at the prompt), 1 on invalid arguments, an unexpected
        error, or when at least one date failed.
    """
    parser = argparse.ArgumentParser(
        description='批量历史数据导入脚本',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
python batch_import_history.py --start 2025-12-01 --end 2025-12-25
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed
"""
    )

    parser.add_argument(
        '--start',
        type=str,
        required=True,
        help='开始日期 (格式: YYYY-MM-DD)'
    )

    parser.add_argument(
        '--end',
        type=str,
        required=True,
        help='结束日期 (格式: YYYY-MM-DD)'
    )

    parser.add_argument(
        '--skip-failed',
        action='store_true',
        help='跳过失败的日期继续执行(默认:遇到失败停止)'
    )

    parser.add_argument(
        '--max-retries',
        type=int,
        default=3,
        help='每个步骤的最大重试次数(默认:3)'
    )

    args = parser.parse_args(argv)

    # Validate the date format before doing any work.
    try:
        start = datetime.strptime(args.start, '%Y-%m-%d')
        end = datetime.strptime(args.end, '%Y-%m-%d')
    except ValueError as e:
        print(f"[X] 日期格式错误: {e}")
        print(" 正确格式: YYYY-MM-DD (例如: 2025-12-01)")
        return 1

    if start > end:
        print("[X] 开始日期不能晚于结束日期")
        return 1

    # A negative retry budget has no meaning; reject it up front.
    if args.max_retries < 0:
        print("[X] 最大重试次数不能为负数")
        return 1

    try:
        importer = BatchImporter(
            start_date=args.start,
            end_date=args.end,
            skip_failed=args.skip_failed,
            max_retries=args.max_retries
        )
        importer.run()
    except Exception as e:
        print(f"\n[X] 程序执行出错: {e}")
        import traceback
        traceback.print_exc()
        return 1

    # Report failed dates through the exit status so schedulers and shell
    # scripts can detect an incomplete import.
    if any(not r['success'] for r in importer.results):
        return 1
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user