526 lines
18 KiB
Python
526 lines
18 KiB
Python
|
|
import json
|
|||
|
|
import random
|
|||
|
|
import time
|
|||
|
|
from typing import Dict, Any, Optional
|
|||
|
|
import logging
|
|||
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|||
|
|
from fake_useragent import UserAgent
|
|||
|
|
import requests
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
# Module-wide logging: INFO level, with timestamp / logger-name / level prefix.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Module-level logger used by the spider and the helper functions below.
logger = logging.getLogger(__name__)
|
class BaiduBJHSpider:
    """Scraper for Baidu Baijiahao author feeds.

    Tries several strategies (direct JSONP request, headless-browser
    capture, plain AJAX, backup endpoints) to fetch an author's article
    list from mbd.baidu.com / baijiahao.baidu.com.
    """

    def __init__(self, use_proxy: bool = False):
        """Create the spider.

        Args:
            use_proxy: when True, outgoing requests are routed through a
                proxy randomly chosen from ``self.proxies``.
        """
        # Random User-Agent generator; a fresh UA is drawn per request.
        self.ua = UserAgent()
        self.use_proxy = use_proxy
        self.proxies = []  # If you need proxies, put your proxy list here
        # Cookie header string captured by init_browser(); None until then.
        self.session_cookie = None
        self.session = requests.Session()
        # Configure automatic retries (3 attempts) for both schemes.
        self.session.mount('http://', requests.adapters.HTTPAdapter(max_retries=3))
        self.session.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
|||
|
|
    def init_browser(self, timeout: int = 15000):
        """Launch a headless Chromium session to harvest Baidu cookies.

        Visits the Baidu homepage and then baijiahao.baidu.com, then copies
        the resulting cookies into ``self.session`` and flattens them into
        ``self.session_cookie`` (a ready-to-use Cookie header string).

        Args:
            timeout: browser launch / navigation timeout in milliseconds.

        Returns:
            The list of cookie dicts captured from the browser context, or
            ``None`` if browser initialisation failed.
        """
        playwright = sync_playwright().start()

        try:
            # Flags that reduce automation fingerprinting and make headless
            # Chromium viable in containers (no sandbox, no GPU, small /dev/shm).
            browser_args = [
                '--disable-blink-features=AutomationControlled',
                '--disable-web-security',
                '--disable-features=IsolateOrigins,site-per-process',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
            ]

            # Launch the browser (headless mode is faster).
            browser = playwright.chromium.launch(
                headless=True,
                args=browser_args,
                timeout=timeout
            )

            # Create a context that looks like a real zh-CN desktop browser.
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=self.ua.random,
                locale='zh-CN',
                timezone_id='Asia/Shanghai',
                # Navigation timeout for all pages in this context.
                navigation_timeout=timeout,
                java_script_enabled=True,
                bypass_csp=True
            )

            # Extra HTTP headers sent with every request from this context.
            context.set_extra_http_headers({
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            })

            page = context.new_page()

            # 1. Visit the Baidu homepage first to pick up baseline cookies.
            logger.info("访问百度首页...")
            try:
                page.goto('https://www.baidu.com', wait_until='domcontentloaded', timeout=10000)
                time.sleep(random.uniform(1, 2))
            except PlaywrightTimeoutError:
                logger.warning("百度首页加载超时,继续执行...")

            # 2. Visit the Baijiahao page.
            logger.info("访问百家号页面...")
            try:
                # 'domcontentloaded' is a looser (hence faster) wait condition.
                page.goto('https://baijiahao.baidu.com/',
                          wait_until='domcontentloaded',
                          timeout=10000)
                time.sleep(random.uniform(2, 3))
            except PlaywrightTimeoutError:
                logger.warning("百家号页面加载超时,尝试继续...")
                # Even on timeout, still try to read whatever cookies exist.

            # Collect the cookies and flatten them into a header string.
            cookies = context.cookies()
            self.session_cookie = '; '.join([f"{c['name']}={c['value']}" for c in cookies])

            # Mirror the cookies into the requests session for direct calls.
            for cookie in cookies:
                self.session.cookies.set(cookie['name'], cookie['value'])

            if cookies:
                logger.info(f"成功获取到 {len(cookies)} 个Cookie")
            else:
                logger.warning("未获取到Cookie")

            browser.close()
            return cookies

        except Exception as e:
            logger.error(f"初始化浏览器失败: {e}")
            return None
        finally:
            playwright.stop()
|||
|
|
def build_headers(self, referer: str = "https://baijiahao.baidu.com/") -> Dict:
|
|||
|
|
"""构建请求头"""
|
|||
|
|
headers = {
|
|||
|
|
'User-Agent': self.ua.random,
|
|||
|
|
'Accept': '*/*',
|
|||
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
|||
|
|
'Accept-Encoding': 'gzip, deflate',
|
|||
|
|
'Referer': referer,
|
|||
|
|
'Connection': 'keep-alive',
|
|||
|
|
'Pragma': 'no-cache',
|
|||
|
|
'Cache-Control': 'no-cache',
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if self.session_cookie:
|
|||
|
|
headers['Cookie'] = self.session_cookie
|
|||
|
|
|
|||
|
|
return headers
|
|||
|
|
|
|||
|
|
def generate_callback_name(self) -> str:
|
|||
|
|
"""生成随机的callback函数名"""
|
|||
|
|
timestamp = int(time.time() * 1000)
|
|||
|
|
return f"__jsonp{timestamp}"
|
|||
|
|
|
|||
|
|
def fetch_data_directly(self, uk: str = "ntHidnLhrlfclJar2z8wBg") -> Optional[Dict]:
|
|||
|
|
"""直接请求接口(可能需要多次尝试)"""
|
|||
|
|
# 先初始化浏览器获取Cookie
|
|||
|
|
logger.info("初始化浏览器获取Cookie...")
|
|||
|
|
cookies = self.init_browser()
|
|||
|
|
|
|||
|
|
if not cookies:
|
|||
|
|
logger.warning("未能获取到Cookie,尝试继续请求...")
|
|||
|
|
|
|||
|
|
for attempt in range(3): # 尝试3次
|
|||
|
|
try:
|
|||
|
|
callback_name = self.generate_callback_name()
|
|||
|
|
timestamp = int(time.time() * 1000)
|
|||
|
|
|
|||
|
|
# 构建URL参数 - 使用更简单的参数
|
|||
|
|
params = {
|
|||
|
|
'tab': 'main',
|
|||
|
|
'num': '10',
|
|||
|
|
'uk': uk,
|
|||
|
|
'source': 'pc',
|
|||
|
|
'type': 'newhome',
|
|||
|
|
'action': 'dynamic',
|
|||
|
|
'format': 'jsonp',
|
|||
|
|
'callback': callback_name,
|
|||
|
|
'_': str(timestamp) # 时间戳参数
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
url = "https://mbd.baidu.com/webpage"
|
|||
|
|
|
|||
|
|
headers = self.build_headers()
|
|||
|
|
|
|||
|
|
logger.info(f"尝试第{attempt + 1}次请求...")
|
|||
|
|
|
|||
|
|
# 随机延迟
|
|||
|
|
time.sleep(random.uniform(1, 2))
|
|||
|
|
|
|||
|
|
# 设置代理(如果需要)
|
|||
|
|
proxies = None
|
|||
|
|
if self.use_proxy and self.proxies:
|
|||
|
|
proxy = random.choice(self.proxies)
|
|||
|
|
proxies = {
|
|||
|
|
'http': proxy,
|
|||
|
|
'https': proxy
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
response = self.session.get(
|
|||
|
|
url,
|
|||
|
|
params=params,
|
|||
|
|
headers=headers,
|
|||
|
|
timeout=15, # 缩短超时时间
|
|||
|
|
proxies=proxies
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 提取JSONP数据
|
|||
|
|
text = response.text
|
|||
|
|
if text.startswith(callback_name + '(') and text.endswith(')'):
|
|||
|
|
json_str = text[len(callback_name) + 1:-1]
|
|||
|
|
data = json.loads(json_str)
|
|||
|
|
logger.info(f"成功获取JSON数据")
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
# 尝试直接解析为JSON(可能是JSON格式)
|
|||
|
|
try:
|
|||
|
|
data = json.loads(text)
|
|||
|
|
logger.info("直接解析JSON成功")
|
|||
|
|
return data
|
|||
|
|
except:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
except requests.exceptions.Timeout:
|
|||
|
|
logger.error(f"请求超时 (尝试{attempt + 1})")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error(f"请求失败 (尝试{attempt + 1}): {e}")
|
|||
|
|
|
|||
|
|
# 等待后重试
|
|||
|
|
if attempt < 2: # 如果不是最后一次尝试
|
|||
|
|
time.sleep(random.uniform(2, 3))
|
|||
|
|
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
    def fetch_via_browser(self, uk: str = "ntHidnLhrlfclJar2z8wBg", timeout: int = 15000) -> Optional[Dict]:
        """Drive a headless browser and sniff the feed response off the wire.

        Loads the author's Baijiahao page, listens for the JSONP request the
        page itself issues to ``mbd.baidu.com/webpage``, and decodes its
        payload from the captured network response.

        Args:
            uk: the author's "uk" identifier (used as ``app_id`` in the URL
                — assumed interchangeable here; TODO confirm).
            timeout: browser launch / navigation timeout in milliseconds.

        Returns:
            The first decoded payload captured, or ``None``.
        """
        playwright = sync_playwright().start()

        try:
            browser = playwright.chromium.launch(
                headless=True,  # headless mode
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--no-sandbox',
                    '--disable-dev-shm-usage'
                ],
                timeout=timeout
            )

            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=self.ua.random,
                locale='zh-CN',
                navigation_timeout=timeout
            )

            page = context.new_page()

            # Decoded payloads collected by the network-response listener.
            results = []

            def handle_response(response):
                # Only the JSONP feed endpoint is of interest.
                url = response.url
                if "mbd.baidu.com/webpage" in url and "format=jsonp" in url:
                    try:
                        # Read the raw response body.
                        text = response.text()
                        logger.info(f"捕获到请求: {url}")

                        # Recover the callback name from the request URL so
                        # the JSONP envelope can be stripped.
                        import urllib.parse
                        parsed_url = urllib.parse.urlparse(url)
                        query_params = urllib.parse.parse_qs(parsed_url.query)

                        if 'callback' in query_params:
                            callback = query_params['callback'][0]
                            if text.startswith(callback + '(') and text.endswith(')'):
                                json_str = text[len(callback) + 1:-1]
                                data = json.loads(json_str)
                                results.append(data)
                                logger.info("成功解析JSONP数据")
                    except Exception as e:
                        # Best-effort sniffing: a bad response is just skipped.
                        logger.debug(f"处理响应失败: {e}")

            page.on("response", handle_response)

            # Navigate to the author's page.
            target_url = f"https://baijiahao.baidu.com/u?app_id={uk}"
            logger.info(f"访问页面: {target_url}")

            try:
                page.goto(target_url, wait_until='domcontentloaded', timeout=10000)
                time.sleep(random.uniform(2, 3))

                # Scroll a little to trigger lazy-loaded feed requests.
                page.evaluate("window.scrollBy(0, 500)")
                time.sleep(1)
                page.evaluate("window.scrollBy(0, 500)")
                time.sleep(1)

                # Give in-flight responses time to arrive.
                time.sleep(2)

            except PlaywrightTimeoutError:
                logger.warning("页面加载超时,继续处理已捕获的数据...")

            browser.close()

            if results:
                logger.info(f"通过浏览器捕获到 {len(results)} 个结果")
                return results[0]

        except Exception as e:
            logger.error(f"浏览器方式获取失败: {e}")
        finally:
            playwright.stop()

        return None
|||
|
|
def fetch_with_ajax(self, uk: str = "ntHidnLhrlfclJar2z8wBg") -> Optional[Dict]:
|
|||
|
|
"""使用简化参数直接请求"""
|
|||
|
|
try:
|
|||
|
|
timestamp = int(time.time() * 1000)
|
|||
|
|
|
|||
|
|
# 使用更简单的参数
|
|||
|
|
params = {
|
|||
|
|
'action': 'dynamic',
|
|||
|
|
'uk': uk,
|
|||
|
|
'type': 'newhome',
|
|||
|
|
'num': '10',
|
|||
|
|
'format': 'json',
|
|||
|
|
'_': str(timestamp)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
url = "https://mbd.baidu.com/webpage"
|
|||
|
|
|
|||
|
|
headers = {
|
|||
|
|
'User-Agent': self.ua.random,
|
|||
|
|
'Referer': 'https://baijiahao.baidu.com/',
|
|||
|
|
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
|||
|
|
'X-Requested-With': 'XMLHttpRequest'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
logger.info("尝试AJAX方式请求...")
|
|||
|
|
|
|||
|
|
response = self.session.get(
|
|||
|
|
url,
|
|||
|
|
params=params,
|
|||
|
|
headers=headers,
|
|||
|
|
timeout=10
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
logger.info(f"AJAX响应状态: {response.status_code}")
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
data = json.loads(response.text)
|
|||
|
|
logger.info("AJAX方式成功获取数据")
|
|||
|
|
return data
|
|||
|
|
except json.JSONDecodeError as e:
|
|||
|
|
logger.error(f"JSON解析失败: {e}")
|
|||
|
|
logger.info(f"响应内容: {response.text[:200]}")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error(f"AJAX方式失败: {e}")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def fetch_all_methods(self, uk: str = "ntHidnLhrlfclJar2z8wBg") -> Optional[Dict]:
|
|||
|
|
"""尝试所有方法获取数据"""
|
|||
|
|
logger.info("=" * 50)
|
|||
|
|
logger.info(f"开始获取百家号数据,UK: {uk}")
|
|||
|
|
logger.info("=" * 50)
|
|||
|
|
|
|||
|
|
# 方法1:直接请求
|
|||
|
|
logger.info("\n方法1:直接请求接口...")
|
|||
|
|
data = self.fetch_data_directly(uk)
|
|||
|
|
|
|||
|
|
if data and data.get("errno") == "0" and data.get("data", {}).get("list") is not None:
|
|||
|
|
logger.info(f"✓ 方法1成功,获取到 {len(data['data']['list'])} 条数据")
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
logger.info("✗ 方法1失败或数据为空")
|
|||
|
|
|
|||
|
|
# 方法2:通过浏览器获取
|
|||
|
|
logger.info("\n方法2:浏览器模拟获取...")
|
|||
|
|
data = self.fetch_via_browser(uk)
|
|||
|
|
|
|||
|
|
if data and data.get("errno") == "0" and data.get("data", {}).get("list") is not None:
|
|||
|
|
logger.info(f"✓ 方法2成功,获取到 {len(data['data']['list'])} 条数据")
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
logger.info("✗ 方法2失败或数据为空")
|
|||
|
|
|
|||
|
|
# 方法3:AJAX请求
|
|||
|
|
logger.info("\n方法3:AJAX请求...")
|
|||
|
|
data = self.fetch_with_ajax(uk)
|
|||
|
|
|
|||
|
|
if data and data.get("errno") == "0" and data.get("data", {}).get("list") is not None:
|
|||
|
|
logger.info(f"✓ 方法3成功,获取到 {len(data['data']['list'])} 条数据")
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
logger.info("✗ 方法3失败或数据为空")
|
|||
|
|
|
|||
|
|
# 方法4:备用请求
|
|||
|
|
logger.info("\n方法4:尝试备用请求方式...")
|
|||
|
|
data = self.try_backup_method(uk)
|
|||
|
|
|
|||
|
|
if data:
|
|||
|
|
logger.info("✓ 方法4成功获取数据")
|
|||
|
|
return data
|
|||
|
|
else:
|
|||
|
|
logger.error("所有方法都失败了")
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
def try_backup_method(self, uk: str) -> Optional[Dict]:
|
|||
|
|
"""备用方法:尝试不同的URL和参数"""
|
|||
|
|
backup_urls = [
|
|||
|
|
"https://author.baidu.com/rest/2.0/ugc/dynamic",
|
|||
|
|
"https://mbd.baidu.com/dynamic/api",
|
|||
|
|
"https://baijiahao.baidu.com/builder/api"
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
for url in backup_urls:
|
|||
|
|
try:
|
|||
|
|
params = {
|
|||
|
|
'action': 'list',
|
|||
|
|
'uk': uk,
|
|||
|
|
'page': '1',
|
|||
|
|
'page_size': '10',
|
|||
|
|
'_': str(int(time.time() * 1000))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
headers = {
|
|||
|
|
'User-Agent': self.ua.random,
|
|||
|
|
'Referer': 'https://baijiahao.baidu.com/'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
response = requests.get(url, params=params, headers=headers, timeout=10)
|
|||
|
|
|
|||
|
|
if response.status_code == 200:
|
|||
|
|
try:
|
|||
|
|
data = response.json()
|
|||
|
|
if data:
|
|||
|
|
logger.info(f"备用URL {url} 成功")
|
|||
|
|
return data
|
|||
|
|
except:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.debug(f"备用URL {url} 失败: {e}")
|
|||
|
|
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def display_simple_data(data):
    """Pretty-print a fetched feed payload to stdout.

    Expects the shape produced by the Baijiahao dynamic endpoint:
    ``{"data": {"list": [{"itemData": {...}}, ...]}}``.  Prints at most
    the first 10 articles; missing fields fall back to placeholder text.

    Args:
        data: the decoded payload dict, or None.
    """
    if not data or "data" not in data or "list" not in data["data"]:
        print("没有有效的数据")
        return

    articles = data["data"]["list"]
    print(f"\n获取到 {len(articles)} 篇文章:")

    for idx, article in enumerate(articles[:10]):  # show at most 10 entries
        print(f"\n{'=' * 60}")
        print(f"文章 {idx + 1}:")

        item_data = article.get("itemData", {})

        # Title: strip embedded newlines; fall back to origin_title.
        title = item_data.get("title", "无标题")
        title = title.replace('\n', ' ').strip()
        if not title or title == "无标题":
            title = item_data.get("origin_title", "无标题").replace('\n', ' ').strip()
        print(f"标题: {title[:100]}{'...' if len(title) > 100 else ''}")

        # Author: displaytype_exinfo carries a JSON string; when it is
        # malformed, scrape the name out with a regex instead.
        display_info = item_data.get("displaytype_exinfo", "")
        author = "未知作者"
        if display_info:
            try:
                info = json.loads(display_info)
                author = info.get("name", info.get("display_name", "未知作者"))
            except (json.JSONDecodeError, TypeError, AttributeError):
                # Narrowed from a bare `except:` — only parse/shape failures
                # should trigger the regex fallback.
                name_match = re.search(r'"name":"([^"]+)"', display_info)
                if name_match:
                    author = name_match.group(1)
        print(f"作者: {author}")

        # Publication time (either "time" or "cst_time").
        time_str = item_data.get("time", item_data.get("cst_time", "未知时间"))
        print(f"发布时间: {time_str}")

        # Article id lives either on itemData or on the outer entry.
        thread_id = item_data.get("thread_id", article.get("thread_id", "未知"))
        print(f"文章ID: {thread_id}")

        # Image count, if any.
        img_src = item_data.get("imgSrc", [])
        if img_src:
            print(f"包含图片: {len(img_src)} 张")

        # Tags / topics.
        targets = item_data.get("target", [])
        if targets:
            tags = [t.get("key", "") for t in targets if t.get("key")]
            if tags:
                print(f"标签: {', '.join(tags)}")
|||
|
|
def main():
    """Entry point: fetch the feed, persist it, and print a summary."""
    spider = BaiduBJHSpider()

    # Fetch using whichever strategy succeeds first.
    data = spider.fetch_all_methods()

    if data:
        # Persist the raw payload under a timestamped filename.
        filename = f'baijiahao_data_{int(time.time())}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Bug fix: the original log line never interpolated the filename,
        # so the saved path was not reported.
        logger.info(f"完整数据已保存到 {filename}")

        # Print a human-readable summary.
        display_simple_data(data)

    else:
        print("未能获取到数据,建议:")
        print("1. 检查网络连接")
        print("2. 尝试使用代理")
        print("3. 等待一段时间后重试")
        print("4. 检查目标页面是否可正常访问")
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Quiet the noisy third-party loggers before running.
    logging.getLogger("playwright").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    main()