feat: 完善代理重试机制，添加数据验证告警，新增README文档

2026-01-16 18:36:52 +08:00
parent 322ac74336
commit b518e6aacf
55 changed files with 13202 additions and 34781 deletions
--- a/import_csv_to_database.py
+++ b/import_csv_to_database.py
@@ -154,10 +154,8 @@ class CSVImporter:
                continue
            
            try:
-                # 处理slide_ratio值
+                # 处理slide_ratio值（CSV中已是小数格式）
                slide_ratio_value = float(self.convert_value(row.get('slide_ratio', '0'), 'float') or 0.0)
-                if slide_ratio_value > 10:
-                    slide_ratio_value = slide_ratio_value / 100
                slide_ratio_value = min(slide_ratio_value, 9.9999)
                
                # 获取channel
@@ -271,9 +269,8 @@ class CSVImporter:
                continue
            
            try:
+                # 处理avg_slide_ratio值（CSV中已是小数格式）
                avg_slide_ratio_value = float(self.convert_value(row.get('avg_slide_ratio', '0'), 'float') or 0.0)
-                if avg_slide_ratio_value > 10:
-                    avg_slide_ratio_value = avg_slide_ratio_value / 100
                avg_slide_ratio_value = min(avg_slide_ratio_value, 9.9999)
                
                # 获取channel并查询author_id
@@ -348,13 +345,14 @@ class CSVImporter:
        return success_count > 0
    
    def import_ai_statistics_days(self, batch_size: int = 50) -> bool:
-        """导入 ai_statistics_days 表数据（使用批量提交）
+        """导入 ai_statistics_days 表数据（仅当日数据：day_revenue）
+        同时自动拆分数据到 ai_statistics_weekly 和 ai_statistics_monthly 表
        
        Args:
            batch_size: 批量提交大小，默认50条
        """
        print("\n" + "="*70)
-        print("开始导入 ai_statistics_days 表数据")
+        print("开始导入 ai_statistics_days 表数据（拆分到3个表）")
        print("="*70)
        
        csv_file = self.csv_files['ai_statistics_days']
@@ -365,14 +363,27 @@ class CSVImporter:
            self.logger.warning("ai_statistics_days表没有数据可导入")
            return False
        
-        self.logger.info(f"开始导入ai_statistics_days表数据，共 {len(rows)} 条记录，批量大小: {batch_size}")
-        print(f"\n总计 {len(rows)} 条记录，分批导入（每批 {batch_size} 条）\n")
+        self.logger.info(f"开始导入数据，共 {len(rows)} 条记录，批量大小: {batch_size}")
+        print(f"\n总计 {len(rows)} 条记录，将拆分到3个表\n")
        
-        success_count = 0
+        # 三个表的统计
+        days_success = 0
+        weekly_success = 0
+        monthly_success = 0
        failed_count = 0
-        batch_params = []
-        first_record_keys = None
-        sql_template = None
+        
+        # 批量参数
+        days_batch = []
+        weekly_batch = []
+        monthly_batch = []
+        
+        # SQL模板
+        days_sql = None
+        weekly_sql = None
+        monthly_sql = None
+        days_keys = None
+        weekly_keys = None
+        monthly_keys = None
        
        for idx, row in enumerate(rows, 1):
            author_name = row.get('author_name', '').strip()
@@ -388,68 +399,153 @@ class CSVImporter:
                    failed_count += 1
                    continue
                
-                # 处理day_revenue字段（每日收益）
-                day_revenue_value = self.convert_value(row.get('day_revenue', '0'), 'decimal')
-                if day_revenue_value is None:
-                    day_revenue_value = Decimal('0')
+                stat_date = row.get('stat_date', '').strip()
                
-                record = {
+                # 1. ai_statistics_days 表数据（仅当日数据）
+                day_revenue = self.convert_value(row.get('day_revenue', '0'), 'decimal') or Decimal('0')
+                daily_published_count = self.convert_value(row.get('daily_published_count', '0'), 'int') or 0
+                cumulative_published_count = self.convert_value(row.get('cumulative_published_count', '0'), 'int') or 0
+                
+                days_record = {
                    'author_id': author_id,
                    'author_name': author_name,
                    'channel': channel,
-                    'stat_date': row.get('stat_date', '').strip(),
-                    'daily_published_count': self.convert_value(row.get('daily_published_count', '0'), 'int') or 0,
-                    'cumulative_published_count': self.convert_value(row.get('cumulative_published_count', '0'), 'int') or 0,
-                    'day_revenue': day_revenue_value,  # 每日收益
-                    'monthly_revenue': self.convert_value(row.get('monthly_revenue', '0'), 'decimal') or Decimal('0'),
-                    'weekly_revenue': self.convert_value(row.get('weekly_revenue', '0'), 'decimal') or Decimal('0'),
-                    'revenue_mom_growth_rate': self.convert_value(row.get('revenue_mom_growth_rate', '0'), 'decimal') or Decimal('0'),
-                    'revenue_wow_growth_rate': self.convert_value(row.get('revenue_wow_growth_rate', '0'), 'decimal') or Decimal('0'),
-                    'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # 添加更新时间戳，强制更新
+                    'stat_date': stat_date,
+                    'daily_published_count': daily_published_count,
+                    'day_revenue': day_revenue,
+                    'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                }
                
-                if sql_template is None:
-                    first_record_keys = list(record.keys())
-                    columns = ', '.join(first_record_keys)
-                    placeholders = ', '.join(['%s'] * len(first_record_keys))
-                    update_parts = [f"{key} = VALUES({key})" for key in first_record_keys if key not in ['author_name', 'channel', 'stat_date']]
-                    sql_template = f"""
+                # 2. ai_statistics_weekly 表数据
+                weekly_revenue = self.convert_value(row.get('weekly_revenue', '0'), 'decimal') or Decimal('0')
+                revenue_wow_growth_rate = self.convert_value(row.get('revenue_wow_growth_rate', '0'), 'decimal') or Decimal('0')
+                
+                # 计算该日期所在周次（格式：WW，如51）
+                from datetime import datetime as dt, timedelta
+                date_obj = dt.strptime(stat_date, '%Y-%m-%d')
+                # 使用isocalendar()获取ISO周数（周一为一周开始）
+                year, week_num, _ = date_obj.isocalendar()
+                stat_weekly = week_num  # 直接使用数字
+                
+                weekly_record = {
+                    'author_id': author_id,
+                    'author_name': author_name,
+                    'channel': channel,
+                    'stat_weekly': stat_weekly,
+                    'weekly_revenue': weekly_revenue,
+                    'revenue_wow_growth_rate': revenue_wow_growth_rate,
+                    'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                }
+                
+                # 3. ai_statistics_monthly 表数据
+                monthly_revenue = self.convert_value(row.get('monthly_revenue', '0'), 'decimal') or Decimal('0')
+                revenue_mom_growth_rate = self.convert_value(row.get('revenue_mom_growth_rate', '0'), 'decimal') or Decimal('0')
+                
+                # 计算该日期所在月份（格式：YYYY-MM，如2025-12）
+                stat_monthly = date_obj.strftime('%Y-%m')
+                
+                monthly_record = {
+                    'author_id': author_id,
+                    'author_name': author_name,
+                    'channel': channel,
+                    'stat_monthly': stat_monthly,
+                    'monthly_revenue': monthly_revenue,
+                    'revenue_mom_growth_rate': revenue_mom_growth_rate,
+                    'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                }
+                
+                # 构建SQL模板（首次）
+                if days_sql is None:
+                    days_keys = list(days_record.keys())
+                    columns = ', '.join(days_keys)
+                    placeholders = ', '.join(['%s'] * len(days_keys))
+                    update_parts = [f"{key} = VALUES({key})" for key in days_keys if key not in ['author_name', 'channel', 'stat_date']]
+                    days_sql = f"""
                        INSERT INTO ai_statistics_days ({columns})
                        VALUES ({placeholders})
                        ON DUPLICATE KEY UPDATE {', '.join(update_parts)}
                    """
                
-                if first_record_keys is not None:
-                    batch_params.append(tuple(record[key] for key in first_record_keys))
+                if weekly_sql is None:
+                    weekly_keys = list(weekly_record.keys())
+                    columns = ', '.join(weekly_keys)
+                    placeholders = ', '.join(['%s'] * len(weekly_keys))
+                    update_parts = [f"{key} = VALUES({key})" for key in weekly_keys if key not in ['author_name', 'channel', 'stat_weekly']]
+                    weekly_sql = f"""
+                        INSERT INTO ai_statistics_weekly ({columns})
+                        VALUES ({placeholders})
+                        ON DUPLICATE KEY UPDATE {', '.join(update_parts)}
+                    """
                
-                if len(batch_params) >= batch_size or idx == len(rows):
+                if monthly_sql is None:
+                    monthly_keys = list(monthly_record.keys())
+                    columns = ', '.join(monthly_keys)
+                    placeholders = ', '.join(['%s'] * len(monthly_keys))
+                    update_parts = [f"{key} = VALUES({key})" for key in monthly_keys if key not in ['author_name', 'channel', 'stat_monthly']]
+                    monthly_sql = f"""
+                        INSERT INTO ai_statistics_monthly ({columns})
+                        VALUES ({placeholders})
+                        ON DUPLICATE KEY UPDATE {', '.join(update_parts)}
+                    """
+                
+                # 添加到批量参数
+                days_batch.append(tuple(days_record[key] for key in days_keys))
+                weekly_batch.append(tuple(weekly_record[key] for key in weekly_keys))
+                monthly_batch.append(tuple(monthly_record[key] for key in monthly_keys))
+                
+                # 批量提交
+                if len(days_batch) >= batch_size or idx == len(rows):
                    try:
-                        result_count = self.db_manager.execute_many(sql_template, batch_params, autocommit=True)
-                        success_count += result_count
-                        print(f"[批次提交] 已导入 {success_count} 条记录（本批: {result_count}/{len(batch_params)}）")
-                        self.logger.info(f"ai_statistics_days表批量提交: {result_count}/{len(batch_params)} 条记录")
-                        batch_params = []
-                    except Exception as batch_error:
-                        failed_count += len(batch_params)
-                        print(f"  [X] 批次提交失败: {batch_error}")
-                        self.logger.error(f"ai_statistics_days表批量提交失败: {batch_error}")
-                        batch_params = []
+                        # 提交 ai_statistics_days
+                        result = self.db_manager.execute_many(days_sql, days_batch, autocommit=True)
+                        days_success += result
+                        print(f"[days] 已导入 {days_success} 条")
+                        days_batch = []
+                    except Exception as e:
+                        print(f"  [X] days表提交失败: {e}")
+                        self.logger.error(f"ai_statistics_days批量提交失败: {e}")
+                        failed_count += len(days_batch)
+                        days_batch = []
+                    
+                    try:
+                        # 提交 ai_statistics_weekly
+                        result = self.db_manager.execute_many(weekly_sql, weekly_batch, autocommit=True)
+                        weekly_success += result
+                        print(f"[weekly] 已导入 {weekly_success} 条")
+                        weekly_batch = []
+                    except Exception as e:
+                        print(f"  [X] weekly表提交失败: {e}")
+                        self.logger.error(f"ai_statistics_weekly批量提交失败: {e}")
+                        weekly_batch = []
+                    
+                    try:
+                        # 提交 ai_statistics_monthly
+                        result = self.db_manager.execute_many(monthly_sql, monthly_batch, autocommit=True)
+                        monthly_success += result
+                        print(f"[monthly] 已导入 {monthly_success} 条")
+                        monthly_batch = []
+                    except Exception as e:
+                        print(f"  [X] monthly表提交失败: {e}")
+                        self.logger.error(f"ai_statistics_monthly批量提交失败: {e}")
+                        monthly_batch = []
                        
            except Exception as e:
                failed_count += 1
                print(f"  [X] 处理失败 ({author_name}): {e}")
-                self.logger.error(f"ai_statistics_days表处理失败: {author_name}, 错误: {e}")
+                self.logger.error(f"数据处理失败: {author_name}, 错误: {e}")
                continue
        
        print("\n" + "="*70)
-        print(f"[OK] ai_statistics_days 表数据导入完成")
-        print(f"     成功: {success_count} 条记录")
+        print(f"[OK] 数据导入完成（拆分到3个表）")
+        print(f"     ai_statistics_days: {days_success} 条")
+        print(f"     ai_statistics_weekly: {weekly_success} 条")
+        print(f"     ai_statistics_monthly: {monthly_success} 条")
        if failed_count > 0:
-            print(f"     失败: {failed_count} 条记录")
+            print(f"     失败: {failed_count} 条")
        print("="*70)
        
-        self.logger.info(f"ai_statistics_days表数据导入完成: 成功 {success_count} 条，失败 {failed_count} 条")
-        return success_count > 0
+        self.logger.info(f"数据导入完成: days={days_success}, weekly={weekly_success}, monthly={monthly_success}, failed={failed_count}")
+        return days_success > 0
    
    def import_all(self) -> bool:
        """导入所有CSV文件"""