This commit is contained in:
sjk
2026-01-06 19:36:42 +08:00
parent 15b579d64a
commit 19942144fb
261 changed files with 24034 additions and 5477 deletions

553
backend/browser_pool.py Normal file
View File

@@ -0,0 +1,553 @@
"""
浏览器池管理模块
管理Playwright浏览器实例的生命周期支持复用以提升性能
"""
import asyncio
import time
from typing import Optional, Dict, Any
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
import sys
class BrowserPool:
    """Manage one long-lived "main" Chromium instance plus per-session temporary ones.

    The main browser (``browser`` / ``context`` / ``page``) is created lazily,
    kept resident (idle clean-up is intentionally disabled) and shared by every
    caller that does not pass a ``session_id``.  Callers that do pass a
    ``session_id`` get a dedicated temporary browser, tracked in
    ``temp_browsers`` until ``release_temp_browser`` is called.

    The class itself does not enforce uniqueness; it is meant to be obtained
    through the module-level ``get_browser_pool`` accessor.
    """

    # Viewport applied to every browser context created by the pool.
    _DEFAULT_VIEWPORT = {'width': 1280, 'height': 720}

    # User-Agent used when the caller does not supply one.
    _DEFAULT_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    # Chromium command-line flags shared by the main and the temporary browsers
    # (performance-first configuration).
    # NOTE(review): '--no-sandbox' and '--disable-web-security' trade browser
    # security for compatibility/speed; only safe when visiting trusted sites.
    _LAUNCH_ARGS = (
        '--disable-blink-features=AutomationControlled',  # hide automation fingerprint
        '--no-sandbox',  # required in the Linux environment
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',  # use /tmp instead of /dev/shm
        # Performance
        '--disable-web-security',  # disables the same-origin policy
        '--disable-features=IsolateOrigins,site-per-process',  # no site isolation
        '--disable-site-isolation-trials',
        '--enable-features=NetworkService,NetworkServiceInProcess',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-background-networking',
        # Cache and storage
        '--disk-cache-size=268435456',  # 256 MB disk cache
        '--media-cache-size=134217728',  # 128 MB media cache
        # Rendering (GPU support kept on)
        '--enable-gpu-rasterization',
        '--enable-zero-copy',
        '--ignore-gpu-blocklist',
        '--enable-accelerated-2d-canvas',
        # Networking
        '--enable-quic',
        '--enable-tcp-fast-open',
        '--max-connections-per-host=10',
        # Trim unneeded features
        '--disable-extensions',
        '--disable-breakpad',  # no crash reporting
        '--disable-component-extensions-with-background-pages',
        '--disable-ipc-flooding-protection',
        '--disable-hang-monitor',
        '--disable-prompt-on-repost',
        '--disable-domain-reliability',
        '--disable-component-update',
        # UI
        '--hide-scrollbars',
        '--mute-audio',
        '--no-first-run',
        '--no-default-browser-check',
        '--metrics-recording-only',
        '--force-color-profile=srgb',
    )

    def __init__(self, idle_timeout: int = 1800, max_instances: int = 5, headless: bool = True):
        """Create an empty pool; no browser is launched here.

        Args:
            idle_timeout: Idle timeout in seconds (default 30 minutes).  Only
                stored and reported by ``get_stats``; idle clean-up is disabled.
            max_instances: Maximum number of browser instances (default 5).
                One slot is reserved for the main browser.
            headless: Whether the main browser runs headless; ``False`` shows
                a window, which is convenient for debugging.
        """
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.last_used_time = 0
        self.idle_timeout = idle_timeout
        self.max_instances = max_instances
        self.headless = headless
        self.is_initializing = False
        self.init_lock = asyncio.Lock()
        self.is_preheated = False  # whether the main page has been pre-navigated
        # Temporary browsers for concurrent requests:
        # {session_id: {browser, context, page, created_at}}
        self.temp_browsers: Dict[str, Dict] = {}
        self.temp_lock = asyncio.Lock()
        print(f"[浏览器池] 已创建,常驻模式(不自动清理),最大实例数: {max_instances}", file=sys.stderr)

    # ------------------------------------------------------------------ helpers

    @staticmethod
    async def _close_resource(resource: Any, failure_prefix: str, method: str = "close") -> bool:
        """Await ``resource.<method>()``, logging instead of raising on failure.

        Returns:
            True when the resource was absent or closed cleanly, False when the
            close call raised.
        """
        if resource is None:
            return True
        try:
            await getattr(resource, method)()
            return True
        except Exception as e:
            print(f"{failure_prefix}: {str(e)}", file=sys.stderr)
            return False

    async def _ensure_playwright(self, log_tag: str, announce: bool = False) -> None:
        """Start the shared Playwright driver if it is not running yet.

        Args:
            log_tag: Bracketed tag used in the warning message.
            announce: Whether to log a success line after starting the driver.
        """
        if self.playwright:
            return
        if sys.platform == 'win32':
            # On Windows the proactor event loop policy is requested first.
            try:
                asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
            except Exception as e:
                print(f"[{log_tag}] 警告: 设置事件循环策略失败: {str(e)}", file=sys.stderr)
        self.playwright = await async_playwright().start()
        if announce:
            print("[浏览器池] Playwright启动成功", file=sys.stderr)

    def _build_launch_kwargs(self, headless: bool, proxy: Optional[str]) -> Dict[str, Any]:
        """Return the keyword arguments for ``chromium.launch``."""
        launch_kwargs: Dict[str, Any] = {
            "headless": headless,
            "args": list(self._LAUNCH_ARGS),
        }
        if proxy:
            launch_kwargs["proxy"] = {"server": proxy}
        return launch_kwargs

    def _build_context_kwargs(self, user_agent: Optional[str]) -> Dict[str, Any]:
        """Return the keyword arguments for ``browser.new_context``."""
        return {
            "viewport": dict(self._DEFAULT_VIEWPORT),
            "user_agent": user_agent or self._DEFAULT_USER_AGENT,
        }

    # --------------------------------------------------------------- public API

    async def get_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                          user_agent: Optional[str] = None, session_id: Optional[str] = None,
                          headless: Optional[bool] = None) -> "tuple[Browser, BrowserContext, Page]":
        """Return a browser, reusing an existing one when possible.

        Without ``session_id`` the shared main browser is returned (and created
        if needed).  With ``session_id`` a temporary browser dedicated to that
        session is returned, creating it on first use.

        Args:
            cookies: Optional list of cookies.  Injected into the main context
                on every call; for a temporary browser only when it is created.
            proxy: Optional proxy server address, applied only at launch time.
            user_agent: Optional custom User-Agent, applied only at creation.
            session_id: Session id distinguishing concurrent requests.
            headless: Headless override for a *temporary* browser; ``None``
                means use the pool default.  The main browser always uses the
                pool's own ``headless`` setting.

        Returns:
            A ``(browser, context, page)`` triple.

        Raises:
            Exception: When the temporary-browser limit has been reached, or
                when launching a browser fails.
        """
        if headless is None:
            headless = self.headless

        if not session_id:
            async with self.init_lock:
                if await self._is_browser_alive():
                    print("[浏览器池] 复用主浏览器实例", file=sys.stderr)
                    self.last_used_time = time.time()
                    # Add cookies to the existing context, not a new one.
                    if cookies:
                        print(f"[浏览器池] 在现有context中注入 {len(cookies)} 个Cookie", file=sys.stderr)
                        await self.context.add_cookies(cookies)
                else:
                    print("[浏览器池] 创建主浏览器实例", file=sys.stderr)
                    await self._init_browser(cookies, proxy, user_agent)
                    self.last_used_time = time.time()
                return self.browser, self.context, self.page

        async with self.temp_lock:
            existing = self.temp_browsers.get(session_id)
            if existing is not None:
                print(f"[浏览器池] 复用会话 {session_id} 的临时浏览器", file=sys.stderr)
                return existing["browser"], existing["context"], existing["page"]

            # One slot of max_instances is reserved for the main browser.
            if len(self.temp_browsers) >= self.max_instances - 1:
                print(f"[浏览器池] ⚠️ 已达最大实例数 ({self.max_instances}),等待释放...", file=sys.stderr)
                # TODO: a wait queue could be implemented; for now fail fast.
                raise Exception("浏览器实例数已满,请稍后再试")

            print(f"[浏览器池] 为会话 {session_id} 创建临时浏览器 ({len(self.temp_browsers)+1}/{self.max_instances-1})", file=sys.stderr)
            browser, context, page = await self._create_temp_browser(cookies, proxy, user_agent, headless)
            self.temp_browsers[session_id] = {
                "browser": browser,
                "context": context,
                "page": page,
                "created_at": time.time(),
            }
            return browser, context, page

    async def _is_browser_alive(self) -> bool:
        """Check whether the main browser is still usable.

        The idle timeout is deliberately not checked so the browser stays
        resident.  If the page no longer responds, the pool is closed as a
        side effect so the next caller recreates it.
        """
        if not self.browser or not self.context or not self.page:
            return False
        try:
            # Fetching the title is used as a cheap connection probe.
            await self.page.title()
            return True
        except Exception as e:
            print(f"[浏览器池] 浏览器连接失效: {str(e)}", file=sys.stderr)
            await self.close()
            return False

    async def _init_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                            user_agent: Optional[str] = None):
        """Launch the main browser and create its context and page.

        Any main browser that is still referenced is closed first so that a
        relaunch does not orphan the previous Chromium process.

        Raises:
            Exception: Re-raised after logging and closing the pool when any
                step fails.
        """
        try:
            await self._ensure_playwright("浏览器池", announce=True)

            previous = self.browser
            if previous is not None:
                self.page = self.context = self.browser = None
                self.is_preheated = False
                await self._close_resource(previous, "[浏览器池] 关闭浏览器异常")

            self.browser = await self.playwright.chromium.launch(
                **self._build_launch_kwargs(self.headless, proxy)
            )
            print("[浏览器池] Chromium浏览器启动成功", file=sys.stderr)
            await self._create_new_context(cookies, proxy, user_agent)
        except Exception as e:
            print(f"[浏览器池] 初始化浏览器失败: {str(e)}", file=sys.stderr)
            await self.close()
            raise

    async def _create_new_context(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                                  user_agent: Optional[str] = None):
        """Replace the main context and page with fresh ones.

        Args:
            cookies: Optional cookies injected into the new context.
            proxy: Accepted for signature compatibility; not used here.
            user_agent: Optional custom User-Agent for the new context.

        Raises:
            Exception: Re-raised after logging when any step fails.
        """
        try:
            if self.context:
                stale_context = self.context
                # Drop references first so a failure below never leaves the
                # pool pointing at a closed context/page.
                self.context = None
                self.page = None
                await stale_context.close()
                print("[浏览器池] 已关闭旧上下文", file=sys.stderr)

            self.context = await self.browser.new_context(**self._build_context_kwargs(user_agent))
            if cookies:
                await self.context.add_cookies(cookies)
                print(f"[浏览器池] 已注入 {len(cookies)} 个Cookie", file=sys.stderr)

            self.page = await self.context.new_page()
            print("[浏览器池] 新页面创建成功", file=sys.stderr)
        except Exception as e:
            print(f"[浏览器池] 创建上下文失败: {str(e)}", file=sys.stderr)
            raise

    async def close(self):
        """Close the main page, context, browser and the Playwright driver.

        Every resource is closed independently: a failure on one is logged and
        does not prevent the others from being closed.  All references are
        cleared up front, so the pool always ends in the "not initialised"
        state.  This method does not raise ``Exception``.

        NOTE(review): the Playwright driver is shared with the temporary
        browsers; stopping it presumably invalidates them too - confirm.
        """
        page, context, browser, playwright = self.page, self.context, self.browser, self.playwright
        self.page = self.context = self.browser = self.playwright = None
        self.is_preheated = False

        failure_prefix = "[浏览器池] 关闭浏览器异常"
        page_ok = await self._close_resource(page, failure_prefix)
        context_ok = await self._close_resource(context, failure_prefix)
        browser_ok = await self._close_resource(browser, failure_prefix)
        driver_ok = await self._close_resource(playwright, failure_prefix, method="stop")
        if page_ok and context_ok and browser_ok and driver_ok:
            print("[浏览器池] 浏览器已关闭", file=sys.stderr)

    async def cleanup_if_idle(self):
        """Hook for a periodic idle clean-up task; intentionally a no-op.

        The browser is kept resident, so nothing is closed here even when the
        idle timeout has elapsed.
        """
        return None

    async def preheat(self, target_url: str = "https://creator.xiaohongshu.com/login"):
        """Launch the main browser ahead of time and open ``target_url``.

        Failures are logged, not raised; the outcome is reported through
        ``is_preheated``.

        Args:
            target_url: Page to warm up; defaults to the Xiaohongshu creator
                login page.
        """
        try:
            print("[浏览器预热] 开始预热浏览器...", file=sys.stderr)
            await self._init_browser()
            self.last_used_time = time.time()

            print(f"[浏览器预热] 正在访问: {target_url}", file=sys.stderr)
            await self.page.goto(target_url, wait_until='domcontentloaded', timeout=45000)
            # Give the page a moment to finish loading.
            await asyncio.sleep(1)

            self.is_preheated = True
            print("[浏览器预热] ✅ 预热完成,浏览器已就绪!", file=sys.stderr)
            print(f"[浏览器预热] 当前页面: {self.page.url}", file=sys.stderr)
        except Exception as e:
            print(f"[浏览器预热] ⚠️ 预热失败: {str(e)}", file=sys.stderr)
            print("[浏览器预热] 将在首次使用时再初始化", file=sys.stderr)
            self.is_preheated = False

    async def repreheat(self, target_url: str = "https://creator.xiaohongshu.com/login"):
        """Navigate the main browser back to ``target_url`` after it was used.

        Skipped when any temporary browser exists or when the main browser was
        used within the last 10 seconds, to avoid disturbing work in progress.
        ``is_preheated`` reflects the real outcome: it is only True when the
        page actually reached the target.

        Args:
            target_url: Page to warm up; defaults to the Xiaohongshu creator
                login page.
        """
        if len(self.temp_browsers) > 0:
            print(f"[浏览器补充预热] 检测到 {len(self.temp_browsers)} 个临时浏览器正在使用,跳过预热避免干扰", file=sys.stderr)
            return

        # Recent use is taken as a sign the main browser may still be busy.
        time_since_last_use = time.time() - self.last_used_time
        if time_since_last_use < 10:
            print(f"[浏览器补充预热] 主浏览器最近 {time_since_last_use:.1f}秒前被使用,可能还在操作中,跳过预热", file=sys.stderr)
            return

        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                if not await self._is_browser_alive():
                    print(f"[浏览器补充预热] 浏览器未初始化,执行完整预热 (尝试 {retry_count + 1}/{max_retries})", file=sys.stderr)
                    # preheat() records success/failure in is_preheated itself
                    # and never raises, so its verdict must not be overwritten.
                    await self.preheat(target_url)
                    return

                current_url = self.page.url if self.page else ""
                if target_url in current_url:
                    print(f"[浏览器补充预热] 已在目标页面,无需补充预热: {current_url}", file=sys.stderr)
                    self.is_preheated = True
                    return

                print(f"[浏览器补充预热] 开始补充预热... (尝试 {retry_count + 1}/{max_retries})", file=sys.stderr)
                print(f"[浏览器补充预热] 当前页面: {current_url}", file=sys.stderr)

                # Re-check: a temporary browser may have started meanwhile.
                if len(self.temp_browsers) > 0:
                    print("[浏览器补充预热] 检测到新的临时浏览器启动,取消预热", file=sys.stderr)
                    return

                print(f"[浏览器补充预热] 正在访问: {target_url}", file=sys.stderr)
                await self.page.goto(target_url, wait_until='domcontentloaded', timeout=45000)
                # Extra wait so the page can finish loading.
                await asyncio.sleep(2)

                current_page_url = self.page.url
                if target_url in current_page_url or 'creator.xiaohongshu.com' in current_page_url:
                    self.is_preheated = True
                    self.last_used_time = time.time()
                    print("[浏览器补充预热] ✅ 补充预热完成!", file=sys.stderr)
                    print(f"[浏览器补充预热] 当前页面: {current_page_url}", file=sys.stderr)
                    return

                print(f"[浏览器补充预热] 页面未正确加载,期望: {target_url}, 实际: {current_page_url}", file=sys.stderr)
                raise Exception("页面未正确加载到目标地址")
            except Exception as e:
                retry_count += 1
                print(f"[浏览器补充预热] ⚠️ 补充预热失败 (尝试 {retry_count}/{max_retries}): {str(e)}", file=sys.stderr)

                if retry_count < max_retries:
                    await asyncio.sleep(2)
                    # Discard the possibly broken browser; the next iteration
                    # performs a full preheat.  close() logs its own failures.
                    await self.close()
                    continue

                print("[浏览器补充预热] ❌ 所有重试都失败了,将尝试完整预热", file=sys.stderr)
                await self.close()
                await self.preheat(target_url)
                if self.is_preheated:
                    return

                print("[浏览器补充预热] ❌ 最终预热也失败", file=sys.stderr)
                # Best effort: still try to leave a usable main browser behind.
                if not (self.browser and self.context and self.page):
                    try:
                        await self._init_browser()
                    except Exception:
                        # _init_browser already logged the error and closed
                        # the pool; the first real request will retry.
                        pass

    async def _create_temp_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                                   user_agent: Optional[str] = None,
                                   headless: bool = True) -> "tuple[Browser, BrowserContext, Page]":
        """Launch a temporary browser for a concurrent request.

        The shared Playwright driver is reused (and started if necessary).  If
        setting up the context or page fails after Chromium was launched, the
        browser is closed again so no process is leaked.

        Args:
            cookies: Optional cookies injected into the new context.
            proxy: Optional proxy server address.
            user_agent: Optional custom User-Agent.
            headless: Whether to run headless.

        Returns:
            A ``(browser, context, page)`` triple.

        Raises:
            Exception: Re-raised after logging when any step fails.
        """
        try:
            await self._ensure_playwright("临时浏览器")
            browser = await self.playwright.chromium.launch(
                **self._build_launch_kwargs(headless, proxy)
            )
            try:
                context = await browser.new_context(**self._build_context_kwargs(user_agent))
                if cookies:
                    await context.add_cookies(cookies)
                page = await context.new_page()
            except BaseException:
                # Covers cancellation too: never leave an orphaned Chromium.
                await self._close_resource(browser, "[浏览器池] 释放临时浏览器异常")
                raise
            return browser, context, page
        except Exception as e:
            print(f"[临时浏览器] 创建失败: {str(e)}", file=sys.stderr)
            raise

    async def release_temp_browser(self, session_id: str):
        """Close and forget the temporary browser of ``session_id``.

        Unknown session ids are ignored.  Page, context and browser are closed
        independently so a failure on one does not keep the others open; the
        session is removed from the pool in every case.
        """
        async with self.temp_lock:
            browser_info = self.temp_browsers.pop(session_id, None)
            if browser_info is None:
                return

            released_cleanly = True
            for key in ("page", "context", "browser"):
                closed = await self._close_resource(browser_info[key], "[浏览器池] 释放临时浏览器异常")
                released_cleanly = released_cleanly and closed
            if released_cleanly:
                print(f"[浏览器池] 已释放会话 {session_id} 的临时浏览器", file=sys.stderr)

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of the pool state.

        The ``*_alive`` flags only report whether a reference is held; no
        connection probe is performed.  ``idle_seconds`` is 0 until the main
        browser has been used at least once.
        """
        idle_seconds = int(time.time() - self.last_used_time) if self.last_used_time > 0 else 0
        return {
            "browser_alive": self.browser is not None,
            "context_alive": self.context is not None,
            "page_alive": self.page is not None,
            "is_preheated": self.is_preheated,
            "temp_browsers_count": len(self.temp_browsers),
            "max_instances": self.max_instances,
            "last_used_time": self.last_used_time,
            "idle_seconds": idle_seconds,
            "idle_timeout": self.idle_timeout,
        }
# Process-wide singleton, created lazily by get_browser_pool().
_browser_pool: Optional[BrowserPool] = None


def get_browser_pool(idle_timeout: int = 1800, headless: bool = True) -> BrowserPool:
    """Return the shared BrowserPool, creating it on first use.

    Args:
        idle_timeout: Idle timeout in seconds; only applied when the pool is
            created by this call.
        headless: Whether to run headless (``False`` shows a window, handy for
            debugging).  If the pool already exists with a different setting,
            its ``headless`` attribute is updated in place.

    Returns:
        The single module-level BrowserPool instance.
    """
    global _browser_pool

    pool = _browser_pool
    if pool is not None:
        if pool.headless != headless:
            # The requested mode differs from the stored one: record it.
            print(f"[浏览器池] 检测到headless配置变更: {pool.headless} -> {headless}", file=sys.stderr)
            pool.headless = headless
        return pool

    mode_label = 'headless' if headless else 'headed'
    print(f"[浏览器池] 创建单例,模式: {mode_label}", file=sys.stderr)
    _browser_pool = BrowserPool(idle_timeout=idle_timeout, headless=headless)
    return _browser_pool