Files
baijiahao_data_crawl/data_validation.py

770 lines
28 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据比对验证脚本
功能
1. 顺序验证验证不同数据源中记录的顺序一致性
2. 交叉验证对比数据内容识别缺失新增或不匹配的记录
支持的数据源
- JSON文件 (bjh_integrated_data.json)
- CSV文件 (ai_statistics_*.csv)
- MySQL数据库 (ai_statistics_* )
使用方法
# 验证JSON和CSV的一致性
python data_validation.py --source json csv --date 2025-12-29
# 验证CSV和数据库的一致性
python data_validation.py --source csv database --date 2025-12-29
# 完整验证(三个数据源)
python data_validation.py --source json csv database --date 2025-12-29
# 验证特定表
python data_validation.py --source csv database --table ai_statistics_day --date 2025-12-29
"""
import sys
import os
import json
import csv
import argparse
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional, Any, Set
from collections import OrderedDict
import hashlib

# Force UTF-8 on Windows consoles: the default code page (e.g. cp936/GBK)
# cannot encode the check-mark characters this script prints.
if sys.platform == 'win32':
    import io
    # Re-wrap stdout/stderr only when they are not already UTF-8 text wrappers,
    # so a second import / re-run does not double-wrap the streams.
    if not isinstance(sys.stdout, io.TextIOWrapper) or sys.stdout.encoding != 'utf-8':
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    if not isinstance(sys.stderr, io.TextIOWrapper) or sys.stderr.encoding != 'utf-8':
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Import the database layer. Database-backed validation is optional, so a
# missing database_config module degrades gracefully (DatabaseManager = None
# disables it) instead of crashing the whole script.
try:
    from database_config import DatabaseManager
except ImportError:
    print("[X] 无法导入 database_config.py数据库验证功能将不可用")
    DatabaseManager = None
class DataValidator:
    """Cross-source data validator.

    Compares the same logical records across up to three sources (the
    integrated JSON file, exported CSV files, and MySQL tables) and reports
    ordering differences, missing/extra records, and field-level mismatches.
    """

    def __init__(self, date_str: Optional[str] = None):
        """Initialize the validator.

        Args:
            date_str: Target date in YYYY-MM-DD form; defaults to yesterday.
        """
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        # Target date: explicit argument wins, otherwise default to yesterday.
        if date_str:
            self.target_date = datetime.strptime(date_str, '%Y-%m-%d')
        else:
            self.target_date = datetime.now() - timedelta(days=1)
        self.date_str = self.target_date.strftime('%Y-%m-%d')
        # Database manager is optional: stays None when database_config is
        # missing or the connection fails, which disables DB-backed checks.
        self.db_manager = None
        if DatabaseManager:
            try:
                self.db_manager = DatabaseManager()
                print("[OK] 数据库连接成功")
            except Exception as e:
                print(f"[!] 数据库连接失败: {e}")
        # Accumulated results grouped by validation kind. The keys are also
        # read by generate_report, so keep them stable.
        self.validation_results = {
            '顺序验证': [],
            '交叉验证': [],
            '差异统计': {}
        }

    def load_json_data(self, file_path: Optional[str] = None) -> Optional[Any]:
        """Load the integrated JSON data file.

        Args:
            file_path: JSON file path; defaults to bjh_integrated_data.json
                located next to this script.

        Returns:
            The parsed JSON data, or None on failure.
        """
        if not file_path:
            file_path = os.path.join(self.script_dir, 'bjh_integrated_data.json')
        try:
            if not os.path.exists(file_path):
                print(f"[X] JSON文件不存在: {file_path}")
                return None
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"[OK] 加载JSON文件: {file_path}")
            print(f" 账号数量: {len(data) if isinstance(data, list) else 1}")
            return data
        except Exception as e:
            print(f"[X] 加载JSON文件失败: {e}")
            return None

    def load_csv_data(self, csv_file: str) -> Optional[List[Dict]]:
        """Load a CSV file located next to this script.

        Args:
            csv_file: CSV file name.

        Returns:
            List of row dicts, or None on failure.
        """
        csv_path = os.path.join(self.script_dir, csv_file)
        try:
            if not os.path.exists(csv_path):
                print(f"[X] CSV文件不存在: {csv_path}")
                return None
            # utf-8-sig transparently strips the BOM that Excel exports write.
            with open(csv_path, 'r', encoding='utf-8-sig') as f:
                rows = list(csv.DictReader(f))
            print(f"[OK] 加载CSV文件: {csv_file}")
            print(f" 记录数量: {len(rows)}")
            return rows
        except Exception as e:
            print(f"[X] 加载CSV文件失败: {e}")
            return None

    def load_database_data(self, table_name: str, date_filter: Optional[str] = None) -> Optional[List[Dict]]:
        """Load records from a database table.

        Args:
            table_name: Table to query. NOTE: interpolated into the SQL text,
                so it must come from trusted code (callers pass literal names).
            date_filter: Name of the date column to filter on (e.g. 'date',
                'stat_date'); None loads the whole table.

        Returns:
            List of row dicts ([] when empty), or None on failure.
        """
        if not self.db_manager:
            print("[X] 数据库管理器未初始化")
            return None
        try:
            # ORDER BY mirrors the export order so order validation stays
            # meaningful; the date value itself is passed as a bound parameter.
            if date_filter:
                sql = f"SELECT * FROM {table_name} WHERE {date_filter} = %s ORDER BY author_name, channel"
                params = (self.date_str,)
            else:
                sql = f"SELECT * FROM {table_name} ORDER BY author_name, channel"
                params = None
            rows = self.db_manager.execute_query(sql, params)
            print(f"[OK] 加载数据库表: {table_name}")
            if date_filter:
                print(f" 过滤条件: {date_filter} = {self.date_str}")
            print(f" 记录数量: {len(rows) if rows else 0}")
            return rows if rows else []
        except Exception as e:
            print(f"[X] 加载数据库数据失败: {e}")
            import traceback
            traceback.print_exc()
            return None

    def generate_record_key(self, record: Dict, key_fields: List[str]) -> str:
        """Build a unique key for a record.

        Args:
            record: Data record.
            key_fields: Primary-key field names.

        Returns:
            Key string of the form "v1|v2|...", with each value stringified
            and stripped so int/str sources compare equal.
        """
        key_values = []
        for field in key_fields:
            value = record.get(field, '')
            # Normalize to a stripped string so e.g. 1 and '1' produce
            # the same key across JSON / CSV / database sources.
            key_values.append(str(value).strip())
        return '|'.join(key_values)

    def calculate_record_hash(self, record: Dict, exclude_fields: Optional[Set[str]] = None) -> str:
        """Compute a content hash of a record (for whole-record comparison).

        Args:
            record: Data record.
            exclude_fields: Field names to skip (defaults to volatile
                timestamp fields).

        Returns:
            MD5 hex digest of the sorted field=value pairs.
        """
        if exclude_fields is None:
            exclude_fields = {'updated_at', 'created_at', 'fetch_time'}
        # Sort keys to get a stable serialization regardless of dict order.
        sorted_items = []
        for key in sorted(record.keys()):
            if key not in exclude_fields:
                value = record.get(key, '')
                # Round floats to 4 decimals so storage precision differences
                # between sources don't change the hash.
                if isinstance(value, float):
                    value = f"{value:.4f}"
                sorted_items.append(f"{key}={value}")
        content = '|'.join(sorted_items)
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def validate_order(self, source1_data: List[Dict], source2_data: List[Dict],
                       source1_name: str, source2_name: str,
                       key_fields: List[str]) -> Dict:
        """Order validation: check that records appear in the same sequence.

        Args:
            source1_data: Records from source 1.
            source2_data: Records from source 2.
            source1_name: Display name of source 1.
            source2_name: Display name of source 2.
            key_fields: Primary-key field names.

        Returns:
            Result dict with counts, an 'order_match' flag, and per-position
            mismatches (compared only over the shorter source's length).
        """
        print(f"\n{'='*70}")
        print(f"顺序验证: {source1_name} vs {source2_name}")
        print(f"{'='*70}")
        result = {
            'source1': source1_name,
            'source2': source2_name,
            'source1_count': len(source1_data),
            'source2_count': len(source2_data),
            'order_match': True,
            'mismatches': []
        }
        # Build the positional key sequences for both sources.
        source1_keys = [self.generate_record_key(r, key_fields) for r in source1_data]
        source2_keys = [self.generate_record_key(r, key_fields) for r in source2_data]
        # Compare position by position up to the shorter length.
        min_len = min(len(source1_keys), len(source2_keys))
        for i in range(min_len):
            if source1_keys[i] != source2_keys[i]:
                result['order_match'] = False
                result['mismatches'].append({
                    'position': i,
                    'source1_key': source1_keys[i],
                    'source2_key': source2_keys[i]
                })
        # Report: "in order" only when sequences match AND lengths agree.
        if result['order_match'] and len(source1_keys) == len(source2_keys):
            print(f"[✓] 顺序一致,记录数相同: {len(source1_keys)}")
        else:
            print("[X] 顺序不一致")
            print(f" {source1_name} 记录数: {len(source1_keys)}")
            print(f" {source2_name} 记录数: {len(source2_keys)}")
            if result['mismatches']:
                print(f" 不匹配位置数: {len(result['mismatches'])}")
                # Show only the first 5 mismatches to keep output readable.
                for mismatch in result['mismatches'][:5]:
                    print(f" 位置{mismatch['position']}: {mismatch['source1_key']} != {mismatch['source2_key']}")
        return result

    def validate_cross(self, source1_data: List[Dict], source2_data: List[Dict],
                       source1_name: str, source2_name: str,
                       key_fields: List[str],
                       compare_fields: Optional[List[str]] = None) -> Dict:
        """Cross validation: find missing, extra, and mismatching records.

        Args:
            source1_data: Records from source 1.
            source2_data: Records from source 2.
            source1_name: Display name of source 1.
            source2_name: Display name of source 2.
            key_fields: Primary-key field names.
            compare_fields: Fields to compare; None compares all fields
                common to both records.

        Returns:
            Result dict with counts, keys only in either source, and
            per-record field mismatches.
        """
        print(f"\n{'='*70}")
        print(f"交叉验证: {source1_name} vs {source2_name}")
        print(f"{'='*70}")
        # Index both sources by record key. Duplicate keys keep the last
        # record seen, matching the original behavior.
        source1_dict = {}
        for record in source1_data:
            key = self.generate_record_key(record, key_fields)
            source1_dict[key] = record
        source2_dict = {}
        for record in source2_data:
            key = self.generate_record_key(record, key_fields)
            source2_dict[key] = record
        # Set arithmetic on the key spaces gives the three populations.
        only_in_source1 = set(source1_dict.keys()) - set(source2_dict.keys())
        only_in_source2 = set(source2_dict.keys()) - set(source1_dict.keys())
        common_keys = set(source1_dict.keys()) & set(source2_dict.keys())
        # Compare field values for records present in both sources.
        field_mismatches = []
        for key in common_keys:
            record1 = source1_dict[key]
            record2 = source2_dict[key]
            # Explicit field list wins; otherwise compare the intersection.
            if compare_fields:
                fields_to_compare = compare_fields
            else:
                fields_to_compare = set(record1.keys()) & set(record2.keys())
            mismatches_in_record = {}
            for field in fields_to_compare:
                val1 = record1.get(field, '')
                val2 = record2.get(field, '')
                # Normalize so numeric/string representations compare equal.
                val1_normalized = self._normalize_value(val1)
                val2_normalized = self._normalize_value(val2)
                if val1_normalized != val2_normalized:
                    mismatches_in_record[field] = {
                        source1_name: val1,
                        source2_name: val2
                    }
            if mismatches_in_record:
                field_mismatches.append({
                    'key': key,
                    'fields': mismatches_in_record
                })
        result = {
            'source1': source1_name,
            'source2': source2_name,
            'source1_count': len(source1_data),
            'source2_count': len(source2_data),
            'only_in_source1': list(only_in_source1),
            'only_in_source2': list(only_in_source2),
            'common_count': len(common_keys),
            'field_mismatches': field_mismatches
        }
        print("记录数统计:")
        print(f" {source1_name}: {len(source1_data)}")
        print(f" {source2_name}: {len(source2_data)}")
        print(f" 共同记录: {len(common_keys)}")
        print(f" 仅在{source1_name}: {len(only_in_source1)}")
        print(f" 仅在{source2_name}: {len(only_in_source2)}")
        print(f" 字段不匹配: {len(field_mismatches)}")
        # Detailed diff output, truncated to keep the console readable.
        if only_in_source1:
            print(f"\n仅在{source1_name}中的记录前5条:")
            for key in list(only_in_source1)[:5]:
                print(f" - {key}")
        if only_in_source2:
            print(f"\n仅在{source2_name}中的记录前5条:")
            for key in list(only_in_source2)[:5]:
                print(f" - {key}")
        if field_mismatches:
            print(f"\n字段值不匹配的记录前3条:")
            for mismatch in field_mismatches[:3]:
                print(f" 记录: {mismatch['key']}")
                # Show at most 5 differing fields per record.
                for field, values in list(mismatch['fields'].items())[:5]:
                    print(f" 字段 {field}:")
                    print(f" {source1_name}: {values[source1_name]}")
                    print(f" {source2_name}: {values[source2_name]}")
        return result

    def _normalize_value(self, value: Any) -> str:
        """Normalize a value to a canonical string for comparison.

        Args:
            value: Raw value.

        Returns:
            '' for None/empty; floats rounded to 4 decimals; ints as str;
            everything else stringified and stripped.
        """
        if value is None or value == '':
            return ''
        # Round floats so precision differences across sources don't count
        # as mismatches.
        if isinstance(value, float):
            return f"{value:.4f}"
        if isinstance(value, int):
            return str(value)
        return str(value).strip()

    def validate_ai_statistics(self, sources: List[str]) -> bool:
        """Validate the ai_statistics table across the requested sources.

        Args:
            sources: Source names, any of ['json', 'csv', 'database'].

        Returns:
            True when all pairwise validations pass.
        """
        print(f"\n{'#'*70}")
        print("# 验证 ai_statistics 表数据")
        print(f"# 日期: {self.date_str}")
        print(f"{'#'*70}")
        # Records are keyed by author + channel.
        key_fields = ['author_name', 'channel']
        # Metric fields whose values must agree across sources.
        compare_fields = [
            'submission_count', 'read_count', 'comment_count', 'comment_rate',
            'like_count', 'like_rate', 'favorite_count', 'favorite_rate',
            'share_count', 'share_rate', 'slide_ratio', 'baidu_search_volume'
        ]
        # Load every requested source; sources that fail to load are skipped.
        data_sources = {}
        if 'json' in sources:
            json_data = self.load_json_data()
            if json_data:
                # Single-account files parse as a dict; wrap for uniformity.
                if not isinstance(json_data, list):
                    json_data = [json_data]
                json_records = self._extract_ai_statistics_from_json(json_data)
                data_sources['json'] = json_records
        if 'csv' in sources:
            csv_data = self.load_csv_data('ai_statistics.csv')
            if csv_data:
                data_sources['csv'] = csv_data
        if 'database' in sources:
            db_data = self.load_database_data('ai_statistics', date_filter='date')
            if db_data:
                data_sources['database'] = db_data
        if len(data_sources) < 2:
            print("[X] 数据源不足至少需要2个数据源进行比对")
            return False
        # Validate every pair of successfully loaded sources.
        source_names = list(data_sources.keys())
        all_passed = True
        for i in range(len(source_names)):
            for j in range(i + 1, len(source_names)):
                source1_name = source_names[i]
                source2_name = source_names[j]
                # Order validation only makes sense for json vs csv (the
                # database result set has its own ORDER BY).
                if {source1_name, source2_name} == {'json', 'csv'}:
                    order_result = self.validate_order(
                        data_sources[source1_name],
                        data_sources[source2_name],
                        source1_name,
                        source2_name,
                        key_fields
                    )
                    self.validation_results['顺序验证'].append(order_result)
                    if not order_result['order_match']:
                        all_passed = False
                # Cross validation runs for every pair.
                cross_result = self.validate_cross(
                    data_sources[source1_name],
                    data_sources[source2_name],
                    source1_name,
                    source2_name,
                    key_fields,
                    compare_fields
                )
                self.validation_results['交叉验证'].append(cross_result)
                # Any missing/extra record or field mismatch fails the run.
                if cross_result['only_in_source1'] or \
                   cross_result['only_in_source2'] or \
                   cross_result['field_mismatches']:
                    all_passed = False
        return all_passed

    def validate_ai_statistics_day(self, sources: List[str]) -> bool:
        """Validate the ai_statistics_day table across the requested sources.

        Args:
            sources: Source names (only 'csv' and 'database' are supported
                for this table).

        Returns:
            True when all pairwise validations pass.
        """
        print(f"\n{'#'*70}")
        print("# 验证 ai_statistics_day 表数据")
        print(f"# 日期: {self.date_str}")
        print(f"{'#'*70}")
        key_fields = ['author_name', 'channel', 'stat_date']
        compare_fields = [
            'total_submission_count', 'total_read_count', 'total_comment_count',
            'total_like_count', 'total_favorite_count', 'total_share_count',
            'avg_comment_rate', 'avg_like_rate', 'avg_favorite_rate',
            'avg_share_rate', 'avg_slide_ratio', 'total_baidu_search_volume'
        ]
        data_sources = {}
        if 'csv' in sources:
            csv_data = self.load_csv_data('ai_statistics_day.csv')
            if csv_data:
                data_sources['csv'] = csv_data
        if 'database' in sources:
            db_data = self.load_database_data('ai_statistics_day', date_filter='stat_date')
            if db_data:
                data_sources['database'] = db_data
        if len(data_sources) < 2:
            print("[X] 数据源不足")
            return False
        source_names = list(data_sources.keys())
        all_passed = True
        for i in range(len(source_names)):
            for j in range(i + 1, len(source_names)):
                source1_name = source_names[i]
                source2_name = source_names[j]
                # Daily aggregates need no order validation; cross only.
                cross_result = self.validate_cross(
                    data_sources[source1_name],
                    data_sources[source2_name],
                    source1_name,
                    source2_name,
                    key_fields,
                    compare_fields
                )
                self.validation_results['交叉验证'].append(cross_result)
                if cross_result['only_in_source1'] or \
                   cross_result['only_in_source2'] or \
                   cross_result['field_mismatches']:
                    all_passed = False
        return all_passed

    def _extract_ai_statistics_from_json(self, json_data: List[Dict]) -> List[Dict]:
        """Flatten raw per-account JSON into ai_statistics-shaped records.

        Args:
            json_data: List of per-account dicts from the integrated JSON.

        Returns:
            List of records matching the ai_statistics schema.
        """
        records = []
        for account_data in json_data:
            account_id = account_data.get('account_id', '')
            if not account_id:
                continue
            analytics = account_data.get('analytics', {})
            apis = analytics.get('apis', [])
            # Only the first API payload is used, and only on success
            # (errno == 0 is the API's success code).
            if apis:
                api_data = apis[0].get('data', {})
                if api_data.get('errno') == 0:
                    total_info = api_data.get('data', {}).get('total_info', {})
                    # `or 0` guards against explicit nulls in the payload;
                    # rates arrive as percentages and are scaled to ratios.
                    record = {
                        'author_name': account_id,
                        'channel': 1,
                        'submission_count': int(total_info.get('publish_count', 0) or 0),
                        'read_count': int(total_info.get('view_count', 0) or 0),
                        'comment_count': int(total_info.get('comment_count', 0) or 0),
                        'comment_rate': float(total_info.get('comment_rate', 0) or 0) / 100,
                        'like_count': int(total_info.get('likes_count', 0) or 0),
                        'like_rate': float(total_info.get('likes_rate', 0) or 0) / 100,
                        'favorite_count': int(total_info.get('collect_count', 0) or 0),
                        'favorite_rate': float(total_info.get('collect_rate', 0) or 0) / 100,
                        'share_count': int(total_info.get('share_count', 0) or 0),
                        'share_rate': float(total_info.get('share_rate', 0) or 0) / 100,
                        'slide_ratio': float(total_info.get('pic_slide_rate', 0) or 0) / 100,
                        'baidu_search_volume': int(total_info.get('disp_pv', 0) or 0)
                    }
                    records.append(record)
        return records

    def generate_report(self, output_file: Optional[str] = None) -> None:
        """Write a plain-text validation report.

        Args:
            output_file: Output path; defaults to a timestamped file next
                to this script.
        """
        if not output_file:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            output_file = os.path.join(self.script_dir, f'validation_report_{timestamp}.txt')
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("数据验证报告\n")
                f.write(f"{'='*70}\n")
                f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"目标日期: {self.date_str}\n\n")
                # Order-validation section.
                f.write("\n顺序验证结果\n")
                f.write(f"{'-'*70}\n")
                for result in self.validation_results['顺序验证']:
                    f.write(f"{result['source1']} vs {result['source2']}\n")
                    # BUG FIX: both branches used to be empty strings, so the
                    # verdict line was always blank; mirror the console marks.
                    f.write(f" 顺序匹配: {'✓' if result['order_match'] else '✗'}\n")
                    f.write(f" {result['source1']} 记录数: {result['source1_count']}\n")
                    f.write(f" {result['source2']} 记录数: {result['source2_count']}\n")
                    if result['mismatches']:
                        f.write(f" 不匹配数: {len(result['mismatches'])}\n")
                    f.write("\n")
                # Cross-validation section.
                f.write("\n交叉验证结果\n")
                f.write(f"{'-'*70}\n")
                for result in self.validation_results['交叉验证']:
                    f.write(f"{result['source1']} vs {result['source2']}\n")
                    f.write(f" 共同记录: {result['common_count']}\n")
                    f.write(f" 仅在{result['source1']}: {len(result['only_in_source1'])}\n")
                    f.write(f" 仅在{result['source2']}: {len(result['only_in_source2'])}\n")
                    f.write(f" 字段不匹配: {len(result['field_mismatches'])}\n")
                    f.write("\n")
            print(f"\n[OK] 验证报告已生成: {output_file}")
        except Exception as e:
            print(f"[X] 生成报告失败: {e}")
def main():
    """Command-line entry point: parse args, run validation, emit a report."""
    epilog_text = """
示例用法:
# 验证JSON和CSV
python data_validation.py --source json csv --date 2025-12-29
# 验证CSV和数据库
python data_validation.py --source csv database --date 2025-12-29
# 完整验证(三个数据源)
python data_validation.py --source json csv database --date 2025-12-29
# 验证特定表
python data_validation.py --source csv database --table ai_statistics_day --date 2025-12-29
"""
    parser = argparse.ArgumentParser(
        description='数据比对验证脚本',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )
    parser.add_argument('--source', nargs='+',
                        choices=['json', 'csv', 'database'],
                        default=['json', 'csv', 'database'],
                        help='数据源列表至少2个')
    yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    parser.add_argument('--date', type=str, default=yesterday,
                        help='目标日期 (YYYY-MM-DD),默认为昨天')
    parser.add_argument('--table', type=str,
                        choices=['ai_statistics', 'ai_statistics_day', 'ai_statistics_days'],
                        default='ai_statistics',
                        help='要验证的表名')
    parser.add_argument('--report', type=str, help='输出报告文件路径')
    args = parser.parse_args()

    # A comparison needs at least two sides.
    if len(args.source) < 2:
        print("[X] 至少需要指定2个数据源进行比对")
        return 1

    validator = DataValidator(date_str=args.date)
    try:
        # Dispatch table validation by name; unknown tables fall through
        # to the "not implemented" branch and count as a failure.
        handlers = {
            'ai_statistics': validator.validate_ai_statistics,
            'ai_statistics_day': validator.validate_ai_statistics_day,
        }
        handler = handlers.get(args.table)
        if handler is not None:
            passed = handler(args.source)
        else:
            print(f"[!] 表 {args.table} 的验证功能暂未实现")
            passed = False

        # The report is written regardless of the outcome.
        validator.generate_report(args.report)

        print(f"\n{'='*70}")
        if passed:
            print(f"[✓] 验证通过:所有数据源数据一致")
        else:
            print(f"[X] 验证失败:发现数据差异")
        print(f"{'='*70}")
        return 0 if passed else 1
    except Exception as e:
        print(f"\n[X] 验证过程出错: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == '__main__':
    sys.exit(main())