Files
baijiahao_data_crawl/mitmproxy_capture.py
“shengyudong” 322ac74336 2025-12-25 upload
2025-12-25 11:16:59 +08:00

223 lines
8.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
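"""
mitmproxy cookie-capture script.

Automatically captures the cookies of Baijiahao accounts and saves them to a
JSON file.

Usage:
    mitmdump -s mitmproxy_capture.py -p 8888

Configure the browser to use the proxy:
    HTTP proxy:  127.0.0.1:8888
    HTTPS proxy: 127.0.0.1:8888

Then visit a Baijiahao page; the script captures the cookies automatically.
"""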
import json
import os
from datetime import datetime
from mitmproxy import http
import requests
class BaijiahaoCapture:
"""百家号Cookie捕获器"""
def __init__(self):
self.script_dir = os.path.dirname(os.path.abspath(__file__))
self.output_file = os.path.join(self.script_dir, "captured_account_cookies.json")
self.captured_cookies = self.load_existing_cookies()
self.base_url = "https://baijiahao.baidu.com"
self.processed_cookies = set() # 记录已处理的Cookie避免重复
print("\n" + "="*70)
print("百家号Cookie捕获脚本已启动")
print("="*70)
print(f"监听端口: 8888")
print(f"目标域名: baijiahao.baidu.com")
print(f"保存路径: {self.output_file}")
print("\n提示:")
print(" - 如果在浏览器中使用,请手动设置代理")
print(" - 如果在小火花客户端中使用,请设置系统代理")
print(" - Windows系统代理: 127.0.0.1:8888")
print("="*70 + "\n")
def load_existing_cookies(self):
"""加载已有的Cookie数据"""
try:
if os.path.exists(self.output_file):
with open(self.output_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"[!] 加载已有Cookie失败: {e}")
return {}
def save_cookies(self):
"""保存Cookie到文件"""
try:
with open(self.output_file, 'w', encoding='utf-8') as f:
json.dump(self.captured_cookies, f, ensure_ascii=False, indent=2)
print(f"[OK] Cookie已保存到: {self.output_file}")
return True
except Exception as e:
print(f"[X] 保存Cookie失败: {e}")
return False
def parse_cookies(self, cookie_header: str) -> dict:
"""解析Cookie字符串为字典"""
cookies = {}
if not cookie_header:
return cookies
for item in cookie_header.split(';'):
item = item.strip()
if '=' in item:
name, value = item.split('=', 1)
cookies[name.strip()] = value.strip()
return cookies
def get_account_info(self, cookies: dict) -> dict | None:
"""调用百家号API获取账号信息"""
try:
# 获取token
token = cookies.get('bjhStoken') or cookies.get('devStoken')
if not token:
print(" [!] Cookie中未找到 bjhStoken 或 devStoken")
return None
# 构建请求
api_url = f"{self.base_url}/builder/app/appinfo"
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Referer': f'{self.base_url}/builder/rc/incomecenter',
'token': token,
}
# 构建Cookie字符串
cookie_str = '; '.join([f"{k}={v}" for k, v in cookies.items()])
headers['Cookie'] = cookie_str
# 创建一个新的session不使用代理
import requests
session = requests.Session()
session.trust_env = False # 不使用系统代理
# 发送请求(直连,不走代理)
response = session.get(api_url, headers=headers, timeout=30, proxies={})
if response.status_code == 200:
data = response.json()
if data.get('errno') == 0:
return data.get('data', {})
return None
except Exception as e:
print(f" [!] 获取账号信息失败: {e}")
return None
def request(self, flow: http.HTTPFlow) -> None:
"""处理HTTP请求"""
# 调试:显示所有请求
print(f"[ALL] {flow.request.method} {flow.request.pretty_url[:100]}")
# 只处理百家号域名的请求
if "baijiahao.baidu.com" not in flow.request.pretty_host:
return
# 调试:显示所有百家号请求
print(f"[DEBUG] 检测到百家号请求: {flow.request.pretty_url}")
# 获取Cookie
cookie_header = flow.request.headers.get("Cookie", "")
if not cookie_header:
print("[DEBUG] 请求中没有Cookie")
return
# 解析Cookie
cookies = self.parse_cookies(cookie_header)
print(f"[DEBUG] Cookie数量: {len(cookies)}")
# 检查是否包含关键CookiebjhStoken或devStoken
if 'bjhStoken' not in cookies and 'devStoken' not in cookies:
print("[DEBUG] 缺少关键Cookie: bjhStoken或devStoken")
return
# 检查是否包含BDUSS百度登录凭证
if 'BDUSS' not in cookies:
print("[DEBUG] 缺少BDUSS登录凭证")
return
# 生成Cookie指纹避免重复处理
cookie_fingerprint = cookies.get('BDUSS', '')[:20] # 使用BDUSS前20位作为指纹
if cookie_fingerprint in self.processed_cookies:
print(f"[DEBUG] Cookie已处理过跳过 (指纹: {cookie_fingerprint})")
return
print("\n" + "-"*70)
print(f"[√] 捕获到Cookie")
print(f" URL: {flow.request.pretty_url}")
print(f" Cookie数量: {len(cookies)}")
print(f" 来源: {flow.request.headers.get('User-Agent', 'Unknown')[:50]}...")
# 获取账号信息
print(" 正在获取账号信息...")
account_info = self.get_account_info(cookies)
if account_info:
user_info = account_info.get('user', {})
account_id = user_info.get('name', '') or user_info.get('username', '')
if not account_id:
print(" [!] 无法获取账号昵称,跳过保存")
return
# 标记为已处理
self.processed_cookies.add(cookie_fingerprint)
# 准备保存的数据
account_data = {
'cookies': cookies,
'capture_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'capture_domain': 'baijiahao.baidu.com',
'nick': user_info.get('name', ''),
'username': user_info.get('username', ''),
'app_id': str(user_info.get('shoubai_c_appid', '')),
'user_id': user_info.get('userid', 0),
'level': user_info.get('status_cn', ''),
'domain': user_info.get('domain', ''),
'wishes': user_info.get('wishes', ''),
'media_type': user_info.get('media_type', ''),
}
# 保存到字典
self.captured_cookies[account_id] = account_data
# 保存到文件
self.save_cookies()
print(f"\n [OK] 账号信息已保存")
print(f" 账号ID: {account_id}")
print(f" 昵称: {account_data['nick']}")
print(f" 用户名: {account_data['username']}")
print(f" App ID: {account_data['app_id']}")
print(f" 等级: {account_data['level']}")
print(f" 领域: {account_data['domain']}")
else:
print(" [!] 获取账号信息失败Cookie未保存")
print(" 原因可能是Cookie已失效或网络问题")
print(" 建议:")
print(" 1. 确认已在小火花客户端中成功登录账号")
print(" 2. 刷新页面或重新切换账号")
print(" 3. 检查网络连接是否正常")
print("-"*70 + "\n")
# Create the capture instance and expose it through the module-level `addons`
# list, the name mitmproxy looks up when a script is loaded with `-s`.
# Instantiating here runs BaijiahaoCapture.__init__ at load time.
addons = [BaijiahaoCapture()]