719 lines
27 KiB
Python
719 lines
27 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
百家号文章数据抓取工具
|
|||
|
|
抓取当天发布的所有文章,生成CSV文件
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
import requests
|
|||
|
|
import csv
|
|||
|
|
import time
|
|||
|
|
from datetime import datetime, timedelta
|
|||
|
|
from typing import Dict, List, Optional
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
# Force UTF-8 on stdout/stderr so Chinese text prints correctly on Windows consoles.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# The session runs with verify=False; silence the resulting HTTPS warnings.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|||
|
|
# 导入日志和数据库配置
|
|||
|
|
from log_config import setup_logger
|
|||
|
|
from database_config import DatabaseManager, DB_CONFIG
|
|||
|
|
|
|||
|
|
|
|||
|
|
class BaijiahaoArticlesCrawler:
|
|||
|
|
"""百家号文章数据抓取器"""
|
|||
|
|
|
|||
|
|
def __init__(self, load_from_db: bool = True, load_from_json: bool = False):
|
|||
|
|
"""初始化
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
load_from_db: 是否从数据库加载账号Cookie,默认True
|
|||
|
|
load_from_json: 是否从JSON文件加载Cookie(用于测试),默认False
|
|||
|
|
"""
|
|||
|
|
self.base_url = "https://baijiahao.baidu.com"
|
|||
|
|
self.session = requests.Session()
|
|||
|
|
self.session.verify = False
|
|||
|
|
|
|||
|
|
# 初始化日志
|
|||
|
|
self.logger = setup_logger(
|
|||
|
|
name='bjh_articles_crawler',
|
|||
|
|
log_file='logs/bjh_articles_crawler.log',
|
|||
|
|
error_log_file='logs/bjh_articles_crawler_error.log',
|
|||
|
|
level=20, # INFO
|
|||
|
|
backup_count=30,
|
|||
|
|
console_output=False # 不输出到控制台,避免与print重复
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 数据库配置
|
|||
|
|
self.load_from_db = load_from_db
|
|||
|
|
self.load_from_json = load_from_json
|
|||
|
|
self.db_manager = None
|
|||
|
|
|
|||
|
|
if load_from_json:
|
|||
|
|
print("[配置] 已启用JSON文件加载模式(测试模式)")
|
|||
|
|
self.logger.info("已启用JSON文件加载模式(测试模式)")
|
|||
|
|
elif load_from_db:
|
|||
|
|
self.db_manager = DatabaseManager(DB_CONFIG)
|
|||
|
|
print("[配置] 已启用数据库加载模式")
|
|||
|
|
self.logger.info("已启用数据库加载模式")
|
|||
|
|
|
|||
|
|
# 获取脚本所在目录
|
|||
|
|
self.script_dir = os.path.dirname(os.path.abspath(__file__))
|
|||
|
|
|
|||
|
|
# 加载Cookie
|
|||
|
|
self.captured_cookies_file = os.path.join(self.script_dir, "captured_account_cookies.json")
|
|||
|
|
|
|||
|
|
if self.load_from_json:
|
|||
|
|
# 测试模式:从JSON文件加载
|
|||
|
|
self.account_cookies = self.load_cookies_from_json_file()
|
|||
|
|
elif self.load_from_db:
|
|||
|
|
# 生产模式:从数据库加载
|
|||
|
|
self.account_cookies = self.load_cookies_from_database()
|
|||
|
|
else:
|
|||
|
|
# 兼容旧模式
|
|||
|
|
self.account_cookies = self.load_captured_cookies()
|
|||
|
|
|
|||
|
|
# 输出CSV文件
|
|||
|
|
self.output_csv = os.path.join(self.script_dir, f"bjh_articles_{datetime.now().strftime('%Y%m%d')}.csv")
|
|||
|
|
|
|||
|
|
def cookie_string_to_dict(self, cookie_string: str) -> Dict:
|
|||
|
|
"""将Cookie字符串转换为字典格式"""
|
|||
|
|
cookie_dict = {}
|
|||
|
|
if not cookie_string:
|
|||
|
|
return cookie_dict
|
|||
|
|
|
|||
|
|
for item in cookie_string.split(';'):
|
|||
|
|
item = item.strip()
|
|||
|
|
if '=' in item:
|
|||
|
|
key, value = item.split('=', 1)
|
|||
|
|
cookie_dict[key.strip()] = value.strip()
|
|||
|
|
|
|||
|
|
return cookie_dict
|
|||
|
|
|
|||
|
|
def load_cookies_from_database(self) -> Dict:
|
|||
|
|
"""从数据库加载Cookie数据"""
|
|||
|
|
try:
|
|||
|
|
if not self.db_manager:
|
|||
|
|
print("[X] 数据库管理器未初始化")
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
sql = """
|
|||
|
|
SELECT id, author_name, app_id, toutiao_cookie, department_name
|
|||
|
|
FROM ai_authors
|
|||
|
|
WHERE channel = 1
|
|||
|
|
AND status = 'active'
|
|||
|
|
AND toutiao_cookie IS NOT NULL
|
|||
|
|
AND toutiao_cookie != ''
|
|||
|
|
ORDER BY id
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
results = self.db_manager.execute_query(sql)
|
|||
|
|
|
|||
|
|
if not results:
|
|||
|
|
print("[X] 数据库中未找到可用的账号Cookie")
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
account_cookies = {}
|
|||
|
|
|
|||
|
|
for row in results:
|
|||
|
|
author_id = row['id']
|
|||
|
|
author_name = row['author_name']
|
|||
|
|
app_id = row['app_id'] or ''
|
|||
|
|
cookie_string = row['toutiao_cookie']
|
|||
|
|
domain = row['department_name'] or '其它'
|
|||
|
|
|
|||
|
|
cookies = self.cookie_string_to_dict(cookie_string)
|
|||
|
|
|
|||
|
|
if not cookies:
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
account_cookies[author_name] = {
|
|||
|
|
'author_id': author_id,
|
|||
|
|
'app_id': app_id,
|
|||
|
|
'nick': author_name,
|
|||
|
|
'domain': domain,
|
|||
|
|
'cookies': cookies,
|
|||
|
|
'source': 'database'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
self.logger.info(f"从数据库加载了 {len(account_cookies)} 个账号的Cookie")
|
|||
|
|
print(f"[OK] 从数据库加载了 {len(account_cookies)} 个账号的Cookie")
|
|||
|
|
|
|||
|
|
return account_cookies
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.error(f"从数据库加载Cookie失败: {e}", exc_info=True)
|
|||
|
|
print(f"[X] 从数据库加载Cookie失败: {e}")
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
def load_captured_cookies(self) -> Dict:
|
|||
|
|
"""从本地JSON文件加载已捕获的Cookie数据(兼容旧模式)"""
|
|||
|
|
try:
|
|||
|
|
with open(self.captured_cookies_file, 'r', encoding='utf-8') as f:
|
|||
|
|
data = json.load(f)
|
|||
|
|
print(f"[OK] 从文件加载了 {len(data)} 个账号的Cookie")
|
|||
|
|
return data
|
|||
|
|
except FileNotFoundError:
|
|||
|
|
print(f"[X] 未找到Cookie文件: {self.captured_cookies_file}")
|
|||
|
|
return {}
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"[X] 加载Cookie失败: {e}")
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
def load_cookies_from_json_file(self) -> Dict:
|
|||
|
|
"""从JSON文件加载Cookie数据(测试模式)
|
|||
|
|
|
|||
|
|
从captured_account_cookies.json文件加载Cookie,
|
|||
|
|
并转换为与数据库加载模式相同的格式,确保author_id字段存在。
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
账号Cookie字典
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
with open(self.captured_cookies_file, 'r', encoding='utf-8') as f:
|
|||
|
|
data = json.load(f)
|
|||
|
|
|
|||
|
|
if not data:
|
|||
|
|
print("[X] JSON文件中没有账号数据")
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
# 转换格式,添加author_id字段
|
|||
|
|
account_cookies = {}
|
|||
|
|
|
|||
|
|
for account_name, account_info in data.items():
|
|||
|
|
# 从JSON文件中提取数据
|
|||
|
|
nick = account_info.get('nick', account_name)
|
|||
|
|
app_id = account_info.get('app_id', '')
|
|||
|
|
cookies = account_info.get('cookies', {})
|
|||
|
|
domain = account_info.get('domain', '其它')
|
|||
|
|
|
|||
|
|
if not cookies:
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 尝试从数据库查询author_id(如果启用了数据库)
|
|||
|
|
author_id = 0
|
|||
|
|
if self.db_manager:
|
|||
|
|
try:
|
|||
|
|
sql = "SELECT id FROM ai_authors WHERE author_name = %s AND channel = 1 LIMIT 1"
|
|||
|
|
result = self.db_manager.execute_query(sql, (nick,), fetch_one=True)
|
|||
|
|
if result:
|
|||
|
|
author_id = result['id']
|
|||
|
|
self.logger.info(f"[{nick}] 从数据库查询到 author_id={author_id}")
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.warning(f"[{nick}] 查询author_id失败: {e}")
|
|||
|
|
|
|||
|
|
# 构建账号数据
|
|||
|
|
account_cookies[account_name] = {
|
|||
|
|
'author_id': author_id, # 从数据库查询或默认为0
|
|||
|
|
'app_id': app_id,
|
|||
|
|
'nick': nick,
|
|||
|
|
'domain': domain,
|
|||
|
|
'cookies': cookies,
|
|||
|
|
'source': 'json_file' # 标记数据来源
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
self.logger.info(f"从JSON文件加载了 {len(account_cookies)} 个账号的Cookie")
|
|||
|
|
print(f"[OK] 从JSON文件加载了 {len(account_cookies)} 个账号的Cookie")
|
|||
|
|
|
|||
|
|
# 显示账号列表
|
|||
|
|
print("\n可用账号列表:")
|
|||
|
|
for idx, (name, info) in enumerate(account_cookies.items(), 1):
|
|||
|
|
author_id_str = f"ID:{info['author_id']}" if info['author_id'] > 0 else "ID:未查询"
|
|||
|
|
print(f" {idx}. {info['nick']} ({author_id_str}) - {info['domain']}")
|
|||
|
|
print()
|
|||
|
|
|
|||
|
|
return account_cookies
|
|||
|
|
|
|||
|
|
except FileNotFoundError:
|
|||
|
|
print(f"[X] 未找到Cookie文件: {self.captured_cookies_file}")
|
|||
|
|
print(" 请先运行一键捕获Cookie工具")
|
|||
|
|
return {}
|
|||
|
|
except json.JSONDecodeError as e:
|
|||
|
|
print(f"[X] JSON文件格式错误: {e}")
|
|||
|
|
self.logger.error(f"JSON文件格式错误: {e}")
|
|||
|
|
return {}
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"[X] 加载JSON文件失败: {e}")
|
|||
|
|
self.logger.error(f"加载JSON文件失败: {e}", exc_info=True)
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
def set_account_cookies(self, account_data: Dict) -> bool:
|
|||
|
|
"""设置当前账号的Cookie"""
|
|||
|
|
try:
|
|||
|
|
cookies = account_data.get('cookies', {})
|
|||
|
|
if not cookies:
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
self.session.cookies.clear()
|
|||
|
|
for key, value in cookies.items():
|
|||
|
|
self.session.cookies.set(key, value, domain='.baidu.com')
|
|||
|
|
|
|||
|
|
return True
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.error(f"设置Cookie失败: {e}")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
def fetch_articles_list(self, account_data: Dict, page: int = 1, page_size: int = 100,
|
|||
|
|
filter_status: Optional[str] = None) -> Optional[Dict]:
|
|||
|
|
"""获取文章列表
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
account_data: 账号数据
|
|||
|
|
page: 页码,从1开始
|
|||
|
|
page_size: 每页数量,默认100
|
|||
|
|
filter_status: 状态筛选(可选),例如'published'、'draft'等
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
文章列表数据
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
author_name = account_data.get('nick', '未知')
|
|||
|
|
|
|||
|
|
# 设置Cookie
|
|||
|
|
if not self.set_account_cookies(account_data):
|
|||
|
|
print(f" [X] 设置Cookie失败")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
# 获取今天的日期范围
|
|||
|
|
today = datetime.now().date()
|
|||
|
|
start_date = today.strftime('%Y-%m-%d')
|
|||
|
|
end_date = (today + timedelta(days=1)).strftime('%Y-%m-%d') # 结束日期为明天(不包含)
|
|||
|
|
|
|||
|
|
# 构建API URL
|
|||
|
|
api_url = f"{self.base_url}/pcui/article/lists"
|
|||
|
|
|
|||
|
|
params = {
|
|||
|
|
'currentPage': page,
|
|||
|
|
'pageSize': page_size,
|
|||
|
|
'search': '',
|
|||
|
|
'type': '', # 类型筛选:空=全部
|
|||
|
|
'collection': '',
|
|||
|
|
'startDate': start_date, # 今天开始
|
|||
|
|
'endDate': end_date, # 明天开始(不包含)
|
|||
|
|
'clearBeforeFetch': 'false',
|
|||
|
|
'dynamic': '0'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 构建请求头
|
|||
|
|
headers = {
|
|||
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
|
|||
|
|
'Accept': 'application/json, text/plain, */*',
|
|||
|
|
'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108"',
|
|||
|
|
'sec-ch-ua-mobile': '?0',
|
|||
|
|
'sec-ch-ua-platform': '"Windows"',
|
|||
|
|
'Sec-Fetch-Site': 'same-origin',
|
|||
|
|
'Sec-Fetch-Mode': 'cors',
|
|||
|
|
'Sec-Fetch-Dest': 'empty',
|
|||
|
|
'Referer': f'{self.base_url}/builder/rc/content',
|
|||
|
|
'Accept-Language': 'zh-CN'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 如果有token,添加到请求头
|
|||
|
|
if 'token' in account_data:
|
|||
|
|
headers['token'] = account_data['token']
|
|||
|
|
|
|||
|
|
print(f" [>] 正在获取第{page}页文章列表...")
|
|||
|
|
self.logger.info(f"[{author_name}] 请求文章列表 - 第{page}页,日期范围: {start_date} ~ {end_date}")
|
|||
|
|
|
|||
|
|
response = self.session.get(
|
|||
|
|
api_url,
|
|||
|
|
headers=headers,
|
|||
|
|
params=params,
|
|||
|
|
timeout=15
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
if response.status_code == 200:
|
|||
|
|
data = response.json()
|
|||
|
|
|
|||
|
|
# 检查响应
|
|||
|
|
errno = data.get('errno', -1)
|
|||
|
|
if errno == 0:
|
|||
|
|
# 保存返回的token供下次使用
|
|||
|
|
if 'Token' in response.headers:
|
|||
|
|
account_data['token'] = response.headers['Token']
|
|||
|
|
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
errmsg = data.get('errmsg', '未知错误')
|
|||
|
|
print(f" [X] API错误: {errmsg}")
|
|||
|
|
self.logger.error(f"[{author_name}] API错误: errno={errno}, errmsg={errmsg}")
|
|||
|
|
return None
|
|||
|
|
else:
|
|||
|
|
print(f" [X] HTTP错误: {response.status_code}")
|
|||
|
|
self.logger.error(f"[{author_name}] HTTP错误: {response.status_code}")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.error(f"获取文章列表异常: {e}", exc_info=True)
|
|||
|
|
print(f" [X] 获取文章列表异常: {e}")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def fetch_article_detail(self, account_data: Dict, article_id: str, max_retries: int = 3) -> Optional[Dict]:
|
|||
|
|
"""获取文章详情
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
account_data: 账号数据
|
|||
|
|
article_id: 文章ID (feed_id)
|
|||
|
|
max_retries: 最大重试次数,默认3次
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
文章详情数据
|
|||
|
|
"""
|
|||
|
|
author_name = account_data.get('nick', '未知')
|
|||
|
|
|
|||
|
|
for retry in range(max_retries):
|
|||
|
|
try:
|
|||
|
|
# 设置Cookie
|
|||
|
|
if not self.set_account_cookies(account_data):
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
# 构建API URL
|
|||
|
|
api_url = f"{self.base_url}/pcui/article/edit"
|
|||
|
|
|
|||
|
|
params = {
|
|||
|
|
'type': 'events',
|
|||
|
|
'feed_id': article_id,
|
|||
|
|
'copy_new_nid': article_id
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 构建请求头
|
|||
|
|
headers = {
|
|||
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
|
|||
|
|
'Accept': 'application/json, text/plain, */*',
|
|||
|
|
'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108"',
|
|||
|
|
'sec-ch-ua-mobile': '?0',
|
|||
|
|
'sec-ch-ua-platform': '"Windows"',
|
|||
|
|
'Sec-Fetch-Site': 'same-origin',
|
|||
|
|
'Sec-Fetch-Mode': 'cors',
|
|||
|
|
'Sec-Fetch-Dest': 'empty',
|
|||
|
|
'Referer': f'{self.base_url}/builder/rc/edit?type=events&app_id={account_data.get("app_id", "")}&feed_id={article_id}',
|
|||
|
|
'Accept-Language': 'zh-CN'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 如果有token,添加到请求头
|
|||
|
|
if 'token' in account_data:
|
|||
|
|
headers['token'] = account_data['token']
|
|||
|
|
|
|||
|
|
self.logger.info(f"[{author_name}] 获取文章详情: article_id={article_id}")
|
|||
|
|
|
|||
|
|
response = self.session.get(
|
|||
|
|
api_url,
|
|||
|
|
headers=headers,
|
|||
|
|
params=params,
|
|||
|
|
timeout=10 # 设置10秒超时
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
if response.status_code == 200:
|
|||
|
|
data = response.json()
|
|||
|
|
|
|||
|
|
errno = data.get('errno', -1)
|
|||
|
|
if errno == 0:
|
|||
|
|
# 保存返回的token
|
|||
|
|
if 'Token' in response.headers:
|
|||
|
|
account_data['token'] = response.headers['Token']
|
|||
|
|
|
|||
|
|
return data.get('data', {}).get('article', {})
|
|||
|
|
else:
|
|||
|
|
errmsg = data.get('errmsg', '未知错误')
|
|||
|
|
self.logger.error(f"[{author_name}] 文章详情API错误: errno={errno}, errmsg={errmsg}")
|
|||
|
|
return None
|
|||
|
|
else:
|
|||
|
|
self.logger.error(f"[{author_name}] 文章详情HTTP错误: {response.status_code}")
|
|||
|
|
|
|||
|
|
# 如果是HTTP错误且还有重试机会,等待后重试
|
|||
|
|
if retry < max_retries - 1:
|
|||
|
|
wait_time = (retry + 1) * 2
|
|||
|
|
self.logger.warning(f"[{author_name}] HTTP错误,{wait_time}秒后重试...")
|
|||
|
|
time.sleep(wait_time)
|
|||
|
|
continue
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
except requests.exceptions.Timeout:
|
|||
|
|
self.logger.warning(f"[{author_name}] 请求超时,第{retry + 1}次尝试")
|
|||
|
|
if retry < max_retries - 1:
|
|||
|
|
time.sleep(2)
|
|||
|
|
continue
|
|||
|
|
else:
|
|||
|
|
self.logger.error(f"[{author_name}] 请求超时,已达最大重试次数")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
except requests.exceptions.ConnectionError as e:
|
|||
|
|
self.logger.warning(f"[{author_name}] 连接错误: {e},第{retry + 1}次尝试")
|
|||
|
|
if retry < max_retries - 1:
|
|||
|
|
time.sleep(2)
|
|||
|
|
continue
|
|||
|
|
else:
|
|||
|
|
self.logger.error(f"[{author_name}] 连接错误,已达最大重试次数")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.error(f"获取文章详情异常: {e}", exc_info=True)
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def extract_title_from_content(self, content: str) -> str:
|
|||
|
|
"""从正文中提取标题(第一行)"""
|
|||
|
|
if not content:
|
|||
|
|
return ""
|
|||
|
|
|
|||
|
|
lines = content.strip().split('\n')
|
|||
|
|
if lines:
|
|||
|
|
title = lines[0].strip()
|
|||
|
|
# 移除可能的标签
|
|||
|
|
title = re.sub(r'#.*?#', '', title).strip()
|
|||
|
|
return title
|
|||
|
|
|
|||
|
|
return ""
|
|||
|
|
|
|||
|
|
def parse_article_data(self, article: Dict, account_data: Dict) -> Optional[Dict]:
|
|||
|
|
"""解析文章数据,映射到ai_articles表结构
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
article: 文章详情数据
|
|||
|
|
account_data: 账号数据
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
解析后的文章数据字典
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
# 提取基础信息
|
|||
|
|
article_id = article.get('article_id', '')
|
|||
|
|
content = article.get('content', '')
|
|||
|
|
title = self.extract_title_from_content(content)
|
|||
|
|
|
|||
|
|
# 提取作者信息
|
|||
|
|
author_name = account_data.get('nick', '')
|
|||
|
|
app_id = account_data.get('app_id', '')
|
|||
|
|
author_id = account_data.get('author_id', 0)
|
|||
|
|
|
|||
|
|
# 提取分类
|
|||
|
|
category = article.get('category_v4', '')
|
|||
|
|
|
|||
|
|
# 提取时间
|
|||
|
|
created_at = article.get('created_at', '')
|
|||
|
|
commit_at = article.get('commit_at', '')
|
|||
|
|
publish_time = commit_at or created_at
|
|||
|
|
|
|||
|
|
# 提取封面图片
|
|||
|
|
cover_images_str = article.get('cover_images', '[]')
|
|||
|
|
try:
|
|||
|
|
cover_images = json.loads(cover_images_str)
|
|||
|
|
image_count = len(cover_images) if isinstance(cover_images, list) else 0
|
|||
|
|
except:
|
|||
|
|
image_count = 0
|
|||
|
|
|
|||
|
|
# 计算字数
|
|||
|
|
word_count = len(content) if content else 0
|
|||
|
|
|
|||
|
|
# 提取审核状态
|
|||
|
|
audit_status = article.get('audit_status_info', {})
|
|||
|
|
status_map = {
|
|||
|
|
0: 'draft', # 草稿
|
|||
|
|
1: 'published', # 已发布
|
|||
|
|
2: 'pending_review', # 待审核
|
|||
|
|
3: 'rejected', # 审核拒绝
|
|||
|
|
4: 'failed' # 发布失败
|
|||
|
|
}
|
|||
|
|
auditing_status = article.get('auditing_status', 0)
|
|||
|
|
status = status_map.get(auditing_status, 'draft')
|
|||
|
|
|
|||
|
|
# 构建返回数据
|
|||
|
|
return {
|
|||
|
|
'baijiahao_id': article_id,
|
|||
|
|
'author_id': author_id,
|
|||
|
|
'author_name': author_name,
|
|||
|
|
'title': title,
|
|||
|
|
'content': content,
|
|||
|
|
'category': category,
|
|||
|
|
'channel': 1, # 1=百家号
|
|||
|
|
'status': status,
|
|||
|
|
'word_count': word_count,
|
|||
|
|
'image_count': image_count,
|
|||
|
|
'publish_time': publish_time,
|
|||
|
|
'created_at': created_at,
|
|||
|
|
'baijiahao_status': audit_status.get('quality_status', ''),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
self.logger.error(f"解析文章数据失败: {e}", exc_info=True)
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def crawl_account_articles(self, account_name: str, account_data: Dict) -> List[Dict]:
|
|||
|
|
"""抓取单个账号的所有今日文章
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
account_name: 账号名称
|
|||
|
|
account_data: 账号数据
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
文章数据列表
|
|||
|
|
"""
|
|||
|
|
print(f"\n[账号] {account_name}")
|
|||
|
|
print("="*70)
|
|||
|
|
|
|||
|
|
all_articles = []
|
|||
|
|
page = 1
|
|||
|
|
|
|||
|
|
while True:
|
|||
|
|
# 获取文章列表
|
|||
|
|
result = self.fetch_articles_list(account_data, page=page, page_size=100)
|
|||
|
|
|
|||
|
|
if not result:
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
data = result.get('data', {})
|
|||
|
|
articles_list = data.get('list', [])
|
|||
|
|
|
|||
|
|
if not articles_list:
|
|||
|
|
print(f" [i] 第{page}页无数据,停止翻页")
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
print(f" [OK] 第{page}页获取到 {len(articles_list)} 篇文章")
|
|||
|
|
|
|||
|
|
# 遍历文章,获取详情
|
|||
|
|
for idx, article_brief in enumerate(articles_list, 1):
|
|||
|
|
article_id = article_brief.get('article_id', '')
|
|||
|
|
|
|||
|
|
if not article_id:
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
print(f" [{idx}/{len(articles_list)}] 正在获取文章详情: {article_id}")
|
|||
|
|
|
|||
|
|
# 获取文章详情
|
|||
|
|
article_detail = self.fetch_article_detail(account_data, article_id)
|
|||
|
|
|
|||
|
|
if article_detail:
|
|||
|
|
# 解析文章数据
|
|||
|
|
parsed_data = self.parse_article_data(article_detail, account_data)
|
|||
|
|
|
|||
|
|
if parsed_data:
|
|||
|
|
all_articles.append(parsed_data)
|
|||
|
|
print(f" 标题: {parsed_data['title']}")
|
|||
|
|
print(f" 状态: {parsed_data['status']}")
|
|||
|
|
|
|||
|
|
# 避免请求过快
|
|||
|
|
time.sleep(0.5)
|
|||
|
|
|
|||
|
|
# 检查是否还有下一页
|
|||
|
|
has_more = data.get('has_more', False)
|
|||
|
|
if not has_more:
|
|||
|
|
print(f" [i] 已到最后一页")
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
page += 1
|
|||
|
|
time.sleep(1)
|
|||
|
|
|
|||
|
|
print(f"\n[OK] 账号 {account_name} 共抓取 {len(all_articles)} 篇文章")
|
|||
|
|
return all_articles
|
|||
|
|
|
|||
|
|
def save_to_csv(self, articles: List[Dict]) -> bool:
|
|||
|
|
"""保存文章数据到CSV文件
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
articles: 文章数据列表
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
是否成功
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
if not articles:
|
|||
|
|
print("[!] 没有文章数据需要保存")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# CSV表头(对应ai_articles表的主要字段)
|
|||
|
|
fieldnames = [
|
|||
|
|
'baijiahao_id',
|
|||
|
|
'author_id',
|
|||
|
|
'author_name',
|
|||
|
|
'title',
|
|||
|
|
'content',
|
|||
|
|
'category',
|
|||
|
|
'channel',
|
|||
|
|
'status',
|
|||
|
|
'word_count',
|
|||
|
|
'image_count',
|
|||
|
|
'publish_time',
|
|||
|
|
'created_at',
|
|||
|
|
'baijiahao_status'
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
print(f"\n[>] 正在保存到CSV文件: {self.output_csv}")
|
|||
|
|
|
|||
|
|
with open(self.output_csv, 'w', encoding='utf-8-sig', newline='') as csvfile:
|
|||
|
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
|||
|
|
writer.writeheader()
|
|||
|
|
|
|||
|
|
for article in articles:
|
|||
|
|
# 只写入指定的字段
|
|||
|
|
row = {key: article.get(key, '') for key in fieldnames}
|
|||
|
|
writer.writerow(row)
|
|||
|
|
|
|||
|
|
print(f"[OK] 成功保存 {len(articles)} 篇文章到CSV文件")
|
|||
|
|
self.logger.info(f"成功保存 {len(articles)} 篇文章到 {self.output_csv}")
|
|||
|
|
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"[X] 保存CSV失败: {e}")
|
|||
|
|
self.logger.error(f"保存CSV失败: {e}", exc_info=True)
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
def run(self):
|
|||
|
|
"""运行爬虫"""
|
|||
|
|
print("\n" + "="*70)
|
|||
|
|
print("百家号文章数据抓取工具")
|
|||
|
|
print("="*70)
|
|||
|
|
print(f"抓取日期: {datetime.now().strftime('%Y-%m-%d')}")
|
|||
|
|
print(f"账号数量: {len(self.account_cookies)}")
|
|||
|
|
print("="*70)
|
|||
|
|
|
|||
|
|
if not self.account_cookies:
|
|||
|
|
print("[X] 没有可用的账号Cookie,退出")
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
all_articles = []
|
|||
|
|
|
|||
|
|
# 遍历所有账号
|
|||
|
|
for account_name, account_data in self.account_cookies.items():
|
|||
|
|
try:
|
|||
|
|
articles = self.crawl_account_articles(account_name, account_data)
|
|||
|
|
all_articles.extend(articles)
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"[X] 抓取账号 {account_name} 失败: {e}")
|
|||
|
|
self.logger.error(f"抓取账号 {account_name} 失败: {e}", exc_info=True)
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 保存到CSV
|
|||
|
|
if all_articles:
|
|||
|
|
self.save_to_csv(all_articles)
|
|||
|
|
else:
|
|||
|
|
print("\n[!] 今日暂无文章数据")
|
|||
|
|
|
|||
|
|
print("\n" + "="*70)
|
|||
|
|
print(f"抓取完成!共抓取 {len(all_articles)} 篇文章")
|
|||
|
|
print("="*70)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Default mode: load cookies from the database.
    # crawler = BaijiahaoArticlesCrawler(load_from_db=True)

    # Test mode: load cookies from the JSON file (resolves author_id when possible).
    crawler = BaijiahaoArticlesCrawler(load_from_json=True)

    # Legacy mode: JSON file without author_id lookup.
    # crawler = BaijiahaoArticlesCrawler(load_from_db=False)

    crawler.run()