549 lines
18 KiB
Python
549 lines
18 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
批量历史数据导入脚本
|
||
|
||
功能:
|
||
1. 按日期范围循环抓取百家号数据
|
||
2. 每次抓取后自动导出CSV
|
||
3. 自动导入数据库
|
||
4. 记录执行日志和错误信息
|
||
5. 自动重试机制(针对网络、代理等临时性错误)
|
||
|
||
使用方法:
|
||
# 基本用法
|
||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25
|
||
|
||
# 跳过失败的日期继续执行
|
||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed
|
||
|
||
# 自定义重试次数(默认3次)
|
||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --max-retries 5
|
||
|
||
# 组合使用
|
||
python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed --max-retries 5
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
import subprocess
|
||
import argparse
|
||
from datetime import datetime, timedelta
|
||
from typing import List, Tuple, Optional
|
||
import json
|
||
import time
|
||
|
||
# 设置UTF-8编码
|
||
# On Windows, rewrap stdout/stderr so console output is UTF-8 (the default
# code page often cannot print CJK text). No-op on other platforms.
if sys.platform == 'win32':
    import io
    for _name in ('stdout', 'stderr'):
        _stream = getattr(sys, _name)
        if not isinstance(_stream, io.TextIOWrapper) or _stream.encoding != 'utf-8':
            setattr(sys, _name, io.TextIOWrapper(_stream.buffer, encoding='utf-8'))
|
||
|
||
|
||
class BatchImporter:
    """Batch importer for historical analytics data.

    For each date in [start_date, end_date] it runs three helper scripts in
    sequence (data fetch -> CSV export -> database import), retrying each
    step on transient errors, and records a log file plus a JSON summary of
    the per-date results.
    """

    def __init__(self, start_date: str, end_date: str, skip_failed: bool = False, max_retries: int = 3):
        """Initialize the importer.

        Args:
            start_date: Start date (YYYY-MM-DD), inclusive.
            end_date: End date (YYYY-MM-DD), inclusive.
            skip_failed: Continue with the next date when one date fails.
            max_retries: Maximum retries per step (default: 3).

        Raises:
            ValueError: If a date string is not in YYYY-MM-DD format.
            FileNotFoundError: If a required helper script is missing.
        """
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = datetime.strptime(end_date, '%Y-%m-%d')
        self.skip_failed = skip_failed
        self.max_retries = max_retries

        # Helper scripts are expected to live next to this script.
        self.analytics_script = os.path.join(self.script_dir, 'bjh_analytics_date.py')
        self.export_script = os.path.join(self.script_dir, 'export_to_csv.py')
        self.import_script = os.path.join(self.script_dir, 'import_csv_to_database.py')

        # Log file: logs/batch_import_<timestamp>.log
        self.log_dir = os.path.join(self.script_dir, 'logs')
        # exist_ok avoids the exists()/makedirs() race of the original code.
        os.makedirs(self.log_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.log_file = os.path.join(self.log_dir, f'batch_import_{timestamp}.log')

        # Per-date execution records, appended by process_date().
        self.results = []

        # Fail fast if any helper script is missing.
        self._validate_scripts()

    def _validate_scripts(self):
        """Verify that all required helper scripts exist.

        Raises:
            FileNotFoundError: If one or more scripts are missing.
        """
        scripts = {
            'bjh_analytics_date.py': self.analytics_script,
            'export_to_csv.py': self.export_script,
            'import_csv_to_database.py': self.import_script
        }

        missing_scripts = [name for name, path in scripts.items()
                           if not os.path.exists(path)]

        if missing_scripts:
            print("[X] 缺少必要的脚本文件:")
            for script in missing_scripts:
                print(f"  - {script}")
            raise FileNotFoundError("脚本文件缺失")

    def log(self, message: str, level: str = 'INFO'):
        """Print a timestamped log line and append it to the log file.

        Args:
            message: Log message.
            level: Log level (INFO, WARNING, ERROR).
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_line = f"[{timestamp}] [{level}] {message}"

        # Console output.
        print(log_line)

        # Best-effort file logging: a disk error must not kill the run.
        try:
            with open(self.log_file, 'a', encoding='utf-8') as f:
                f.write(log_line + '\n')
        except Exception as e:
            print(f"[!] 写入日志文件失败: {e}")

    def get_date_list(self) -> List[str]:
        """Return every date in [start_date, end_date] as YYYY-MM-DD strings.

        Returns:
            List of date strings; empty if end_date precedes start_date.
        """
        span = (self.end_date - self.start_date).days
        # range() over a negative span is empty, matching the original loop.
        return [(self.start_date + timedelta(days=i)).strftime('%Y-%m-%d')
                for i in range(span + 1)]

    def run_command_with_retry(self, cmd: List[str], step_name: str, max_retries: Optional[int] = None) -> Tuple[bool, str]:
        """Run a command, retrying transient failures with a growing delay.

        Args:
            cmd: Command argv list.
            step_name: Human-readable step name for log messages.
            max_retries: Maximum retries; defaults to the instance setting.

        Returns:
            (success, error message); the error message is "" on success.
        """
        if max_retries is None:
            max_retries = self.max_retries

        # Error substrings regarded as transient (network/proxy/timeout) and
        # therefore worth retrying. Matched case-insensitively below.
        retryable_errors = [
            '超时',
            'timeout',
            '连接',
            'connection',
            '代理',
            'proxy',
            '网络',
            'network',
            'RemoteDisconnected',
            'ConnectionError',
            'ProxyError'
        ]

        retry_count = 0
        last_error = ""

        while retry_count <= max_retries:
            if retry_count > 0:
                # Linear backoff before each retry: 5s, 10s, 15s, ...
                wait_time = retry_count * 5
                self.log(f"{step_name} 第{retry_count}次重试,等待 {wait_time} 秒...", level='WARNING')
                time.sleep(wait_time)

            success, error = self.run_command(cmd, step_name)

            if success:
                if retry_count > 0:
                    self.log(f"{step_name} 重试成功!(第{retry_count}次重试)", level='INFO')
                return True, ""

            last_error = error
            retry_count += 1

            if retry_count <= max_retries:
                # BUG FIX: the error text is lowercased, so the keywords must
                # be lowercased too — previously the mixed-case keywords
                # ('RemoteDisconnected', 'ConnectionError', 'ProxyError')
                # could never match and those errors were never retried.
                error_lower = str(error).lower()
                is_retryable = any(keyword.lower() in error_lower
                                   for keyword in retryable_errors)

                if is_retryable:
                    self.log(f"{step_name} 出现可重试错误: {error}", level='WARNING')
                else:
                    # Non-transient error: give up immediately.
                    self.log(f"{step_name} 出现不可重试错误,停止重试: {error}", level='ERROR')
                    return False, error

        # Exhausted all retries.
        self.log(f"{step_name} 失败,已达最大重试次数 ({max_retries})", level='ERROR')
        return False, last_error

    def run_command(self, cmd: List[str], step_name: str) -> Tuple[bool, str]:
        """Run a command once, streaming merged stdout/stderr to the console.

        Args:
            cmd: Command argv list.
            step_name: Human-readable step name for log messages.

        Returns:
            (success, error message); the error message is "" on success.
        """
        process = None
        try:
            self.log(f"执行命令: {' '.join(cmd)}")

            process = subprocess.Popen(
                cmd,
                cwd=self.script_dir,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr into stdout
                text=True,
                encoding='utf-8',
                bufsize=1,  # line-buffered
            )

            # Stream the child's output line by line.
            output_lines = []
            if process.stdout:
                try:
                    for line in process.stdout:
                        line = line.rstrip()
                        if line:  # only echo non-empty lines
                            print(f"  {line}")
                            output_lines.append(line)
                            # Log a heartbeat every 10 lines to keep the
                            # log file small.
                            if len(output_lines) % 10 == 0:
                                self.log(f"{step_name} 运行中... (已输出{len(output_lines)}行)")
                except Exception as e:
                    self.log(f"读取输出异常: {e}", level='WARNING')

            # NOTE(review): stdout is fully drained above, so this wait()
            # normally returns at once; the 10-minute timeout only guards a
            # child that closed stdout but has not exited.
            return_code = process.wait(timeout=600)

            # Record the complete captured output.
            full_output = '\n'.join(output_lines)
            if full_output:
                self.log(f"{step_name} 输出:\n{full_output}")

            if return_code == 0:
                self.log(f"[✓] {step_name} 执行成功", level='INFO')
                return True, ""
            else:
                error_msg = f"返回码: {return_code}"
                self.log(f"[X] {step_name} 执行失败: {error_msg}", level='ERROR')
                return False, error_msg

        except subprocess.TimeoutExpired:
            if process:
                process.kill()
            error_msg = "命令执行超时(>10分钟)"
            self.log(f"[X] {step_name} 失败: {error_msg}", level='ERROR')
            return False, error_msg

        except Exception as e:
            error_msg = str(e)
            self.log(f"[X] {step_name} 异常: {error_msg}", level='ERROR')
            import traceback
            self.log(f"异常堆栈:\n{traceback.format_exc()}", level='ERROR')
            return False, error_msg

    def process_date(self, date_str: str) -> bool:
        """Run the fetch -> export -> import pipeline for one date.

        Args:
            date_str: Date string (YYYY-MM-DD).

        Returns:
            True if all three steps succeeded. The outcome (per-step status,
            timings, error) is appended to self.results either way.
        """
        self.log("="*70)
        self.log(f"开始处理日期: {date_str}")
        self.log("="*70)

        result = {
            'date': date_str,
            'start_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'steps': {},
            'success': False,
            'error': None
        }

        # Step 1: fetch the analytics data (with retries).
        self.log(f"\n[步骤 1/3] 抓取 {date_str} 的数据...")
        cmd_analytics = [
            sys.executable,
            self.analytics_script,
            date_str,
            '--proxy',
            '--database',
            '--no-confirm'  # skip the interactive confirmation prompt
        ]

        success, error = self.run_command_with_retry(cmd_analytics, f"数据抓取 ({date_str})")
        result['steps']['analytics'] = {'success': success, 'error': error}

        if not success:
            result['error'] = f"数据抓取失败: {error}"
            result['end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.results.append(result)
            return False

        # Give the fetch step a moment to finish flushing files to disk.
        time.sleep(2)

        # Step 2: export to CSV (with retries).
        self.log("\n[步骤 2/3] 导出CSV文件...")
        cmd_export = [
            sys.executable,
            self.export_script,
            '--mode', 'csv',
            '--no-confirm'  # skip the interactive confirmation prompt
        ]

        success, error = self.run_command_with_retry(cmd_export, f"CSV导出 ({date_str})")
        result['steps']['export'] = {'success': success, 'error': error}

        if not success:
            result['error'] = f"CSV导出失败: {error}"
            result['end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.results.append(result)
            return False

        # Short pause between steps.
        time.sleep(2)

        # Step 3: import into the database (with retries).
        self.log("\n[步骤 3/3] 导入数据库...")
        cmd_import = [
            sys.executable,
            self.import_script
        ]

        success, error = self.run_command_with_retry(cmd_import, f"数据库导入 ({date_str})")
        result['steps']['import'] = {'success': success, 'error': error}

        if not success:
            result['error'] = f"数据库导入失败: {error}"
            result['end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.results.append(result)
            return False

        # All three steps succeeded.
        result['success'] = True
        result['end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.results.append(result)

        self.log(f"\n[✓] {date_str} 处理完成!")
        self.log("="*70 + "\n")

        return True

    def run(self):
        """Interactively confirm, then process every date in the range."""
        dates = self.get_date_list()

        print("\n" + "="*70)
        print("批量历史数据导入")
        print("="*70)
        print(f"开始日期: {self.start_date.strftime('%Y-%m-%d')}")
        print(f"结束日期: {self.end_date.strftime('%Y-%m-%d')}")
        print(f"总天数: {len(dates)} 天")
        print(f"跳过失败: {'是' if self.skip_failed else '否'}")
        print(f"最大重试次数: {self.max_retries}")
        print(f"日志文件: {self.log_file}")
        print("="*70)

        # Ask for confirmation before starting a potentially long run.
        confirm = input("\n是否开始执行? (y/n): ").strip().lower()
        if confirm != 'y':
            print("已取消")
            return

        self.log(f"开始批量导入: {len(dates)} 个日期")
        start_time = datetime.now()

        success_count = 0
        failed_count = 0

        for idx, date_str in enumerate(dates, 1):
            print(f"\n{'='*70}")
            print(f"进度: [{idx}/{len(dates)}] {date_str}")
            print(f"{'='*70}")

            success = self.process_date(date_str)

            if success:
                success_count += 1
            else:
                failed_count += 1

                # Stop at the first failure unless --skip-failed was given.
                if not self.skip_failed:
                    self.log(f"[X] 日期 {date_str} 处理失败,停止执行", level='ERROR')
                    break
                else:
                    self.log(f"[!] 日期 {date_str} 处理失败,跳过继续", level='WARNING')

            # Pause between dates to avoid hammering the remote service.
            if idx < len(dates):
                delay = 5
                self.log(f"等待 {delay} 秒后处理下一个日期...")
                time.sleep(delay)

        # Final summary.
        end_time = datetime.now()
        duration = end_time - start_time

        print("\n" + "="*70)
        print("批量导入完成")
        print("="*70)
        print(f"总耗时: {duration}")
        print(f"成功: {success_count} 天")
        print(f"失败: {failed_count} 天")
        print(f"日志文件: {self.log_file}")
        print("="*70)

        self.log("="*70)
        self.log(f"批量导入完成: 成功 {success_count} 天, 失败 {failed_count} 天")
        self.log(f"总耗时: {duration}")
        self.log("="*70)

        # Persist the per-date results.
        self._save_results()

        # List the dates that failed, if any.
        if failed_count > 0:
            print("\n失败的日期:")
            for r in self.results:
                if not r['success']:
                    print(f"  - {r['date']}: {r.get('error', '未知错误')}")

    def _save_results(self):
        """Write the per-date results plus a summary to a timestamped JSON file."""
        try:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            result_file = os.path.join(self.log_dir, f'batch_result_{timestamp}.json')

            summary = {
                'start_date': self.start_date.strftime('%Y-%m-%d'),
                'end_date': self.end_date.strftime('%Y-%m-%d'),
                'total_dates': len(self.results),
                'success_count': sum(1 for r in self.results if r['success']),
                'failed_count': sum(1 for r in self.results if not r['success']),
                'results': self.results
            }

            with open(result_file, 'w', encoding='utf-8') as f:
                json.dump(summary, f, ensure_ascii=False, indent=2)

            self.log(f"执行结果已保存: {result_file}")

        except Exception as e:
            # Best-effort: failing to persist results must not crash the run.
            self.log(f"保存执行结果失败: {e}", level='ERROR')
|
||
|
||
|
||
def main():
    """CLI entry point: parse arguments, validate the date range, run the import.

    Returns:
        Process exit code: 0 on success, 1 on bad arguments or runtime error.
    """
    parser = argparse.ArgumentParser(
        description='批量历史数据导入脚本',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
  python batch_import_history.py --start 2025-12-01 --end 2025-12-25
  python batch_import_history.py --start 2025-12-01 --end 2025-12-25 --skip-failed
""",
    )
    parser.add_argument('--start', type=str, required=True,
                        help='开始日期 (格式: YYYY-MM-DD)')
    parser.add_argument('--end', type=str, required=True,
                        help='结束日期 (格式: YYYY-MM-DD)')
    parser.add_argument('--skip-failed', action='store_true',
                        help='跳过失败的日期继续执行(默认:遇到失败停止)')
    parser.add_argument('--max-retries', type=int, default=3,
                        help='每个步骤的最大重试次数(默认:3)')
    args = parser.parse_args()

    # Validate the date range before doing any work.
    try:
        first_day = datetime.strptime(args.start, '%Y-%m-%d')
        last_day = datetime.strptime(args.end, '%Y-%m-%d')

        if first_day > last_day:
            print("[X] 开始日期不能晚于结束日期")
            return 1

    except ValueError as exc:
        print(f"[X] 日期格式错误: {exc}")
        print(" 正确格式: YYYY-MM-DD (例如: 2025-12-01)")
        return 1

    try:
        # Build the importer and execute the batch run.
        importer = BatchImporter(
            start_date=args.start,
            end_date=args.end,
            skip_failed=args.skip_failed,
            max_retries=args.max_retries,
        )
        importer.run()
        return 0

    except Exception as exc:
        print(f"\n[X] 程序执行出错: {exc}")
        import traceback
        traceback.print_exc()
        return 1
|
||
|
||
|
||
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    raise SystemExit(main())
|