681 lines
25 KiB
Python
681 lines
25 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
指定日期统计数据获取脚本
|
|||
|
|
功能:获取指定日期的百家号统计数据并填充到数据库三个统计表
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import argparse
|
|||
|
|
import requests
|
|||
|
|
import time
|
|||
|
|
from datetime import datetime, timedelta
|
|||
|
|
from typing import List, Dict, Optional
|
|||
|
|
from decimal import Decimal
|
|||
|
|
|
|||
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|||
|
|
|
|||
|
|
from database_config import DatabaseManager
|
|||
|
|
from export_to_csv import DataExporter
|
|||
|
|
|
|||
|
|
# Tianqi proxy provider configuration: this endpoint returns one plain-text
# "IP:PORT" proxy per request (credentials are embedded in the query string).
PROXY_API_URL = 'http://api.tianqiip.com/getip?secret=tmcrmh3q&num=1&type=txt&port=1&mr=1&sign=5451e454a54b9f1f06222606c418e12f'
|
|||
|
|
|
|||
|
|
|
|||
|
|
class DateStatisticsFetcher:
|
|||
|
|
"""指定日期统计数据获取器"""
|
|||
|
|
|
|||
|
|
def __init__(self, target_date: str, use_proxy: bool = True):
|
|||
|
|
"""初始化
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
target_date: 目标日期 (YYYY-MM-DD)
|
|||
|
|
use_proxy: 是否使用代理(默认True)
|
|||
|
|
"""
|
|||
|
|
self.target_date = datetime.strptime(target_date, '%Y-%m-%d')
|
|||
|
|
self.target_date_str = target_date
|
|||
|
|
self.db_manager = DatabaseManager()
|
|||
|
|
self.script_dir = os.path.dirname(os.path.abspath(__file__))
|
|||
|
|
self.use_proxy = use_proxy
|
|||
|
|
self.current_proxy = None
|
|||
|
|
|
|||
|
|
# 创建临时数据目录
|
|||
|
|
self.temp_dir = os.path.join(self.script_dir, 'temp_data')
|
|||
|
|
os.makedirs(self.temp_dir, exist_ok=True)
|
|||
|
|
|
|||
|
|
# 创建请求会话
|
|||
|
|
self.session = requests.Session()
|
|||
|
|
self.session.verify = False
|
|||
|
|
|
|||
|
|
# 禁用SSL警告
|
|||
|
|
import urllib3
|
|||
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|||
|
|
|
|||
|
|
print(f"[初始化] 目标日期: {target_date}")
|
|||
|
|
print(f"[初始化] 代理模式: {'启用' if use_proxy else '禁用'}")
|
|||
|
|
print(f"[初始化] 临时数据目录: {self.temp_dir}")
|
|||
|
|
|
|||
|
|
def get_all_authors(self) -> List[Dict]:
|
|||
|
|
"""获取所有活跃账号
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
账号列表
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
sql = """
|
|||
|
|
SELECT id as author_id, author_name, toutiao_cookie
|
|||
|
|
FROM ai_authors
|
|||
|
|
WHERE channel = 1
|
|||
|
|
AND status = 'active'
|
|||
|
|
AND toutiao_cookie IS NOT NULL
|
|||
|
|
AND toutiao_cookie != ''
|
|||
|
|
ORDER BY id
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
accounts = self.db_manager.execute_query(sql, fetch_one=False, dict_cursor=True)
|
|||
|
|
|
|||
|
|
if accounts:
|
|||
|
|
print(f"[数据库] 找到 {len(accounts)} 个活跃账号")
|
|||
|
|
return accounts
|
|||
|
|
else:
|
|||
|
|
print("[!] 未找到任何活跃账号")
|
|||
|
|
return []
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"[X] 查询账号失败: {e}")
|
|||
|
|
return []
|
|||
|
|
|
|||
|
|
def get_daily_article_count(self, author_id: int, date_str: str) -> int:
|
|||
|
|
"""从ai_articles表获取指定日期的发文量
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
author_id: 作者ID
|
|||
|
|
date_str: 日期字符串 (YYYY-MM-DD)
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
发文量
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
sql = """
|
|||
|
|
SELECT COUNT(*) as count
|
|||
|
|
FROM ai_articles
|
|||
|
|
WHERE author_id = %s
|
|||
|
|
AND DATE(publish_time) = %s
|
|||
|
|
AND status = 'published'
|
|||
|
|
AND channel = 1
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
result = self.db_manager.execute_query(
|
|||
|
|
sql,
|
|||
|
|
(author_id, date_str),
|
|||
|
|
fetch_one=True,
|
|||
|
|
dict_cursor=True
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
return result['count'] if result else 0
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f" [!] 查询发文量失败: {e}")
|
|||
|
|
return 0
|
|||
|
|
|
|||
|
|
    def fetch_daily_income(self, cookie_string: str, date_timestamp: int, max_retries: int = 3) -> Optional[Dict]:
        """Fetch one day's income figures from the Baijiahao income API, with retries.

        Args:
            cookie_string: Raw ``k=v; k2=v2`` cookie header for the account.
            date_timestamp: Target day as a Unix timestamp in seconds; used for
                both start_date and end_date, i.e. a one-day window.
            max_retries: Maximum number of retry attempts after the first try.

        Returns:
            Parsed JSON response dict on success (errno == 0), otherwise None.
        """
        api_url = "https://baijiahao.baidu.com/author/eco/income4/overviewhomelist"

        # Replace the session's cookies with this account's cookie string.
        self.session.cookies.clear()
        for item in cookie_string.split(';'):
            item = item.strip()
            if '=' in item:
                key, value = item.split('=', 1)
                self.session.cookies.set(key.strip(), value.strip())

        # The API expects the session token echoed back in a 'token' header;
        # it lives in either the bjhStoken or devStoken cookie.
        token_cookie = self.session.cookies.get('bjhStoken') or self.session.cookies.get('devStoken')

        # Same timestamp for both ends of the range: query exactly one day.
        params = {
            'start_date': date_timestamp,
            'end_date': date_timestamp
        }

        # Browser-like headers so the request matches normal console traffic.
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://baijiahao.baidu.com/builder/rc/incomecenter',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
        }

        if token_cookie:
            headers['token'] = token_cookie

        retry_count = 0
        while retry_count <= max_retries:
            try:
                # Linear backoff before each retry: 3s, 6s, 9s, ...
                if retry_count > 0:
                    wait_time = retry_count * 3
                    print(f" [重试 {retry_count}/{max_retries}] 等待 {wait_time} 秒...")
                    time.sleep(wait_time)

                # A fresh proxy is fetched per attempt when proxying is on.
                proxies = self.fetch_proxy() if self.use_proxy else None

                response = self.session.get(
                    api_url,
                    headers=headers,
                    params=params,
                    proxies=proxies,
                    timeout=15
                )

                if response.status_code == 200:
                    data = response.json()
                    if data.get('errno') == 0:
                        return data
                    else:
                        error_msg = data.get('errmsg', '')
                        errno = data.get('errno')
                        print(f" [!] API返回错误: errno={errno}, errmsg={error_msg}")

                        # errno 10000015 marks an "abnormal request" — presumably
                        # transient rate-limiting, so it is worth a retry.
                        if errno == 10000015 and retry_count < max_retries:
                            retry_count += 1
                            continue
                        return None
                else:
                    # Non-200 responses are treated as fatal (no retry).
                    print(f" [!] HTTP错误: {response.status_code}")
                    return None

            except Exception as e:
                error_type = type(e).__name__
                print(f" [!] 请求异常: {error_type} - {e}")

                # Retry only on network-flavored failures (connection resets,
                # timeouts, proxy errors), matched by exception class name.
                is_retry_error = any([
                    'Connection' in error_type,
                    'Timeout' in error_type,
                    'ProxyError' in error_type,
                ])

                if is_retry_error and retry_count < max_retries:
                    retry_count += 1
                    continue
                return None

        # Retry budget exhausted without a successful response.
        return None
|
|||
|
|
|
|||
|
|
    def fetch_analytics_api(self, cookie_string: str, target_date: str, max_retries: int = 3) -> Optional[Dict]:
        """Call the Baijiahao publish-statistics API for views, comments, etc.

        Args:
            cookie_string: Raw ``k=v; k2=v2`` cookie header for the account.
            target_date: Target day (YYYY-MM-DD); queried as a one-day range.
            max_retries: Maximum number of retry attempts after the first try.

        Returns:
            Parsed JSON response dict on success (errno == 0), otherwise None.
        """
        # Replace the session's cookies with this account's cookie string.
        # Unlike fetch_daily_income, cookies are pinned to the .baidu.com domain.
        self.session.cookies.clear()
        for item in cookie_string.split(';'):
            item = item.strip()
            if '=' in item:
                key, value = item.split('=', 1)
                self.session.cookies.set(key.strip(), value.strip(), domain='.baidu.com')

        # Session token for the 'token' header, from bjhStoken or devStoken.
        token_cookie = self.session.cookies.get('bjhStoken') or self.session.cookies.get('devStoken')

        # The API takes compact YYYYMMDD day bounds; start == end == target day.
        date_obj = datetime.strptime(target_date, '%Y-%m-%d')
        start_day = date_obj.strftime('%Y%m%d')
        end_day = start_day

        # appStatisticV3 endpoint (per-content statistics).
        api_url = "https://baijiahao.baidu.com/author/eco/statistics/appStatisticV3"

        params = {
            'type': 'event',
            'start_day': start_day,
            'end_day': end_day,
            'stat': '0',
            'special_filter_days': '1'
        }

        # Browser-like headers so the request matches normal console traffic.
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://baijiahao.baidu.com/builder/rc/analysiscontent',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
        }

        if token_cookie:
            headers['token'] = token_cookie

        retry_count = 0
        while retry_count <= max_retries:
            try:
                # Linear backoff before each retry: 3s, 6s, 9s, ...
                if retry_count > 0:
                    wait_time = retry_count * 3
                    print(f" [重试 {retry_count}/{max_retries}] 等待 {wait_time} 秒...")
                    time.sleep(wait_time)

                # A fresh proxy is fetched per attempt when proxying is on.
                proxies = self.fetch_proxy() if self.use_proxy else None

                response = self.session.get(
                    api_url,
                    headers=headers,
                    params=params,
                    proxies=proxies,
                    timeout=15
                )

                if response.status_code == 200:
                    data = response.json()
                    errno = data.get('errno', -1)

                    if errno == 0:
                        # Log the headline totals before returning the payload.
                        data_content = data.get('data', {})
                        total_info = data_content.get('total_info', {})
                        daily_list = data_content.get('list', [])

                        print(f" [发文统计] 阅读量: {total_info.get('view_count', 0)}")
                        print(f" [发文统计] 评论量: {total_info.get('comment_count', 0)}")

                        return data
                    else:
                        error_msg = data.get('errmsg', '')
                        print(f" [!] 发文统计API错误: errno={errno}, errmsg={error_msg}")

                        # errno 10000015 ("abnormal request") is retried.
                        if errno == 10000015 and retry_count < max_retries:
                            retry_count += 1
                            continue
                        return None
                else:
                    # Non-200 responses are treated as fatal (no retry).
                    print(f" [!] HTTP错误: {response.status_code}")
                    return None

            except Exception as e:
                error_type = type(e).__name__
                print(f" [!] 请求异常: {error_type} - {e}")

                # Retry only on network-flavored failures, matched by class name.
                is_retry_error = any([
                    'Connection' in error_type,
                    'Timeout' in error_type,
                    'ProxyError' in error_type,
                ])

                if is_retry_error and retry_count < max_retries:
                    retry_count += 1
                    continue
                return None

        # Retry budget exhausted without a successful response.
        return None
|
|||
|
|
|
|||
|
|
def get_cumulative_article_count(self, author_id: int, start_date: str, end_date: str) -> int:
|
|||
|
|
"""从ai_articles表获取累计发文量
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
author_id: 作者ID
|
|||
|
|
start_date: 开始日期 (YYYY-MM-DD)
|
|||
|
|
end_date: 结束日期 (YYYY-MM-DD)
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
累计发文量
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
sql = """
|
|||
|
|
SELECT COUNT(*) as count
|
|||
|
|
FROM ai_articles
|
|||
|
|
WHERE author_id = %s
|
|||
|
|
AND DATE(publish_time) >= %s
|
|||
|
|
AND DATE(publish_time) <= %s
|
|||
|
|
AND status = 'published'
|
|||
|
|
AND channel = 1
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
result = self.db_manager.execute_query(
|
|||
|
|
sql,
|
|||
|
|
(author_id, start_date, end_date),
|
|||
|
|
fetch_one=True,
|
|||
|
|
dict_cursor=True
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
return result['count'] if result else 0
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f" [!] 查询累计发文量失败: {e}")
|
|||
|
|
return 0
|
|||
|
|
|
|||
|
|
    def fetch_proxy(self) -> Optional[Dict]:
        """Request a fresh proxy from the Tianqi API and cache it on the instance.

        Returns:
            A requests-style proxies mapping ({'http': ..., 'https': ...}),
            or None when proxying is disabled or no proxy could be obtained.
        """
        if not self.use_proxy:
            return None

        try:
            # Deliberately a plain requests.get, not self.session, so the
            # proxy lookup never goes through a previously assigned proxy.
            resp = requests.get(PROXY_API_URL, timeout=10)
            resp.raise_for_status()

            text = resp.text.strip()

            # The API reports failures as a plain-text body starting with
            # "ERROR" rather than a non-200 status.
            if text.upper().startswith('ERROR'):
                print(f" [!] 代理API返回错误: {text}")
                return None

            # Expected body: one "IP:PORT" per line; take the first parseable one.
            lines = text.split('\n')
            for line in lines:
                line = line.strip()
                if ':' in line and line.count(':') == 1:
                    # Drop any trailing whitespace-separated annotation.
                    ip_port = line.split()[0] if ' ' in line else line
                    host, port = ip_port.split(':', 1)
                    proxy_url = f'http://{host}:{port}'
                    # Same HTTP proxy is used for both schemes; kept on the
                    # instance so the most recent proxy can be inspected.
                    self.current_proxy = {
                        'http': proxy_url,
                        'https': proxy_url,
                    }
                    print(f" [代理] 使用天启IP: {ip_port}")
                    return self.current_proxy

            # Nothing in the body looked like IP:PORT — log a truncated sample.
            print(f" [!] 无法解析代理API返回: {text[:100]}")
            return None

        except Exception as e:
            # Best-effort: callers fall back to a direct connection on None.
            print(f" [!] 获取代理失败: {e}")
            return None
|
|||
|
|
|
|||
|
|
    def build_integrated_data(self, author_id: int, author_name: str, cookie_string: str) -> Dict:
        """Assemble one account's integrated payload for the target date.

        Combines database-derived publish counts with the analytics and income
        APIs, shaped to mimic the structure DataExporter expects (the same
        structure BaijiahaoAnalytics would produce).

        Args:
            author_id: Author primary key.
            author_name: Display name; doubles as the payload's account_id.
            cookie_string: Raw cookie header used for the API calls.

        Returns:
            Integrated data dict ready to be serialized for DataExporter.
        """
        print(f"\n [构建] 账号 {author_name} 的整合数据...")

        # First day of the target month — the cumulative publish count is
        # month-to-date (month start through target date).
        month_first = self.target_date.replace(day=1).strftime('%Y-%m-%d')

        # Publish counts come from our own database, not the remote API.
        daily_count = self.get_daily_article_count(author_id, self.target_date_str)
        cumulative_count = self.get_cumulative_article_count(author_id, month_first, self.target_date_str)

        print(f" 单日发文量: {daily_count}")
        print(f" 累计发文量: {cumulative_count} (从{month_first}至{self.target_date_str})")

        # Content statistics (views, comments, ...) from the analytics API.
        print(f" [API] 获取发文统计数据...")
        analytics_data = self.fetch_analytics_api(cookie_string, self.target_date_str)

        # Pull total_info and the per-day list out of the response; both stay
        # empty when the API call failed (placeholders are built below).
        total_info = {}
        daily_list = []
        if analytics_data:
            data_content = analytics_data.get('data', {})
            total_info = data_content.get('total_info', {})
            daily_list = data_content.get('list', [])

        # Income: target date at midnight as a Unix timestamp (seconds).
        day_revenue = 0.0
        date_timestamp = int(self.target_date.replace(hour=0, minute=0, second=0, microsecond=0).timestamp())

        print(f" [API] 获取收入数据...")
        income_data = self.fetch_daily_income(cookie_string, date_timestamp)

        if income_data and income_data.get('data', {}).get('list'):
            income_list = income_data['data']['list']
            if income_list and len(income_list) > 0:
                # First list entry holds the requested day's total income.
                total_income = income_list[0].get('total_income', 0)
                day_revenue = float(total_income)
                print(f" 当日收益: ¥{day_revenue:.2f}")
            else:
                print(f" 当日收益: ¥0.00 (无收入数据)")
        else:
            print(f" 当日收益: ¥0.00 (API调用失败)")

        # Shape the payload like a BaijiahaoAnalytics result so DataExporter
        # can consume it unchanged.
        integrated_data = {
            'account_id': author_name,
            'author_id': author_id,
            'fetch_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'target_date': self.target_date_str,
            'status': 'success',
            'analytics': {
                'apis': [  # NOTE: the exporter expects responses wrapped in an 'apis' array
                    {
                        'data': {
                            'errno': 0,
                            'data': {
                                # Fall back to a single synthetic day entry
                                # when the analytics API returned nothing.
                                'list': daily_list if daily_list else [
                                    {
                                        'event_day': self.target_date_str.replace('-', ''),  # compact form, e.g. 20251225
                                        'date': self.target_date_str,
                                        'publish_count': daily_count,
                                        'daily_published_count': daily_count,
                                        'cumulative_published_count': cumulative_count,
                                    }
                                ],
                                'latest_event_day': self.target_date_str.replace('-', ''),  # compact form, e.g. 20251225
                                # Zeroed placeholder totals when the API failed;
                                # only the publish count is known locally.
                                'total_info': total_info if total_info else {
                                    'publish_count': daily_count,
                                    'view_count': 0,
                                    'comment_count': 0,
                                    'comment_rate': 0,
                                    'likes_count': 0,
                                    'likes_rate': 0,
                                    'collect_count': 0,
                                    'collect_rate': 0,
                                    'share_count': 0,
                                    'share_rate': 0,
                                    'pic_slide_rate': 0,
                                    'disp_pv': 0,
                                }
                            }
                        }
                    }
                ]
            },
            'income': {
                'errno': 0,  # marks the income API call as successful for the exporter
                'data': {
                    'income': {
                        'yesterday': {
                            'income': day_revenue  # exporter reads the 'income' key (not 'value')
                        },
                        'currentMonth': {
                            'income': 0  # month-to-date revenue is unavailable for historical dates
                        }
                    }
                }
            }
        }

        return integrated_data
|
|||
|
|
|
|||
|
|
    def process_single_date(self) -> bool:
        """Process the target date for every account and export to the database.

        Pipeline: load accounts → build each account's integrated payload
        (with jittered delays between accounts) → dump the list to a temp
        JSON file → hand it to DataExporter for the three statistics tables.

        Returns:
            True when the export succeeded, False otherwise.
        """
        print(f"\n{'='*70}")
        print(f"开始处理 {self.target_date_str} 的数据")
        print(f"{'='*70}")

        # Nothing to do without at least one usable account.
        accounts = self.get_all_authors()
        if not accounts:
            print("[X] 没有可用的账号,退出")
            return False

        # Build integrated payloads for every account; failures are skipped.
        integrated_data_list = []

        for idx, account in enumerate(accounts, 1):
            author_id = account.get('author_id')
            author_name = account.get('author_name', '')
            cookie_string = account.get('toutiao_cookie', '')

            if not author_id:
                print(f"\n[{idx}/{len(accounts)}] 跳过: {author_name} (缺少author_id)")
                continue

            if not cookie_string:
                print(f"\n[{idx}/{len(accounts)}] 跳过: {author_name} (缺少Cookie)")
                continue

            print(f"\n[{idx}/{len(accounts)}] 处理账号: {author_name} (ID: {author_id})")

            try:
                integrated_data = self.build_integrated_data(author_id, author_name, cookie_string)
                integrated_data_list.append(integrated_data)
                print(f" [OK] 数据构建成功")

                # Jittered 3–5s pause between accounts to avoid rate limiting
                # (skipped after the last account).
                if idx < len(accounts):
                    import random
                    delay = random.uniform(3, 5)
                    print(f" [延迟] 等待 {delay:.1f} 秒...")
                    time.sleep(delay)

            except Exception as e:
                # One failing account must not abort the whole run.
                print(f" [X] 数据构建失败: {e}")
                import traceback
                traceback.print_exc()
                continue

        if not integrated_data_list:
            print("[!] 没有成功构建任何数据")
            return False

        # Persist the integrated payloads so DataExporter can read them.
        integrated_file = os.path.join(self.temp_dir, f'integrated_{self.target_date_str}.json')
        try:
            with open(integrated_file, 'w', encoding='utf-8') as f:
                json.dump(integrated_data_list, f, ensure_ascii=False, indent=2)
            print(f"\n[保存] 整合数据: {integrated_file}")
        except Exception as e:
            print(f"[X] 保存整合数据失败: {e}")
            return False

        # Export into the three statistics tables via DataExporter.
        print(f"\n[导出] 开始导出到数据库...")
        try:
            exporter = DataExporter(use_database=False)

            # Temporarily point the exporter at our per-date file, then
            # restore its original path afterwards.
            original_file = exporter.integrated_file
            exporter.integrated_file = integrated_file

            result = exporter.export_all_tables()

            exporter.integrated_file = original_file

            if result:
                print(f"\n[OK] {self.target_date_str} 数据处理完成")
                return True
            else:
                print(f"\n[!] {self.target_date_str} 数据导出失败")
                return False

        except Exception as e:
            print(f"[X] 导出数据失败: {e}")
            import traceback
            traceback.print_exc()
            return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """CLI entry point: parse arguments, validate the date, run the fetcher.

    Returns:
        Process exit status: 0 on success, 1 on any failure.
    """
    parser = argparse.ArgumentParser(
        description='获取指定日期的百家号统计数据',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
  python fetch_date_statistics.py 2025-12-01
  python fetch_date_statistics.py 2025-12-15

注意事项:
  1. 由于百家号API限制,无法获取历史日期的收入数据
  2. 脚本会从ai_articles表统计发文量数据
  3. 收入字段将被设置为0(需要在数据产生当天运行才能获取真实收入)
"""
    )
    parser.add_argument('date', type=str, help='目标日期 (格式: YYYY-MM-DD)')
    parser.add_argument('--no-proxy', action='store_true', help='禁用代理(默认启用天启代理)')
    args = parser.parse_args()

    # Reject malformed dates before touching the database or network.
    try:
        datetime.strptime(args.date, '%Y-%m-%d')
    except ValueError:
        print(f"[X] 日期格式错误: {args.date}")
        print(" 正确格式: YYYY-MM-DD (例如: 2025-12-01)")
        return 1

    banner = "=" * 70
    print("\n" + banner)
    print("百家号指定日期统计数据获取工具")
    print(banner)
    print(f"目标日期: {args.date}")
    print(banner)

    try:
        fetcher = DateStatisticsFetcher(args.date, use_proxy=not args.no_proxy)
        return 0 if fetcher.process_single_date() else 1
    except Exception as e:
        print(f"\n[X] 程序执行出错: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: propagate main()'s status code to the shell.
if __name__ == '__main__':
    sys.exit(main())
|