Files
ai_wht_wechat/backend/browser_pool.py
2026-01-06 19:36:42 +08:00

554 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
浏览器池管理模块
管理Playwright浏览器实例的生命周期支持复用以提升性能
"""
import asyncio
import time
from typing import Optional, Dict, Any
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
import sys
class BrowserPool:
    """Manage one long-lived "main" browser plus per-session temporary browsers.

    The main browser (``self.browser`` / ``self.context`` / ``self.page``) is
    reused by every request that does not carry a ``session_id``.  Requests
    with a ``session_id`` get their own temporary browser, tracked in
    ``self.temp_browsers`` and bounded by ``max_instances - 1``.

    All browsers are launched from a single shared Playwright driver
    (``self.playwright``).  The class is meant to be used through the
    module-level ``get_browser_pool()`` accessor.
    """

    # Chromium flags shared by the main browser and the temporary browsers.
    _LAUNCH_ARGS = (
        '--disable-blink-features=AutomationControlled',  # hide automation fingerprint
        '--no-sandbox',  # required on Linux
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',  # use /tmp instead of /dev/shm
        # Performance tuning.
        # NOTE(review): --disable-web-security turns off the same-origin
        # policy; it was added for load speed, confirm this is acceptable.
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-site-isolation-trials',
        '--enable-features=NetworkService,NetworkServiceInProcess',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-background-networking',
        # Cache sizes.
        '--disk-cache-size=268435456',  # 256 MiB disk cache
        '--media-cache-size=134217728',  # 128 MiB media cache
        # Rendering (GPU support kept on).
        '--enable-gpu-rasterization',
        '--enable-zero-copy',
        '--ignore-gpu-blocklist',
        '--enable-accelerated-2d-canvas',
        # Networking.
        '--enable-quic',
        '--enable-tcp-fast-open',
        '--max-connections-per-host=10',
        # Features that are not needed.
        '--disable-extensions',
        '--disable-breakpad',
        '--disable-component-extensions-with-background-pages',
        '--disable-ipc-flooding-protection',
        '--disable-hang-monitor',
        '--disable-prompt-on-repost',
        '--disable-domain-reliability',
        '--disable-component-update',
        # UI.
        '--hide-scrollbars',
        '--mute-audio',
        '--no-first-run',
        '--no-default-browser-check',
        '--metrics-recording-only',
        '--force-color-profile=srgb',
    )

    # User-Agent used when the caller does not supply one.
    _DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    def __init__(self, idle_timeout: int = 1800, max_instances: int = 5, headless: bool = True):
        """Create an empty pool; no browser is launched here.

        Args:
            idle_timeout: Idle timeout in seconds.  Stored and reported by
                ``get_stats()`` but not enforced: idle cleanup is disabled so
                the main browser stays resident.
            max_instances: Maximum number of browsers, counting the main one
                (so at most ``max_instances - 1`` temporary browsers).
            headless: Default headless mode; ``False`` opens visible windows.
        """
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.last_used_time = 0
        self.idle_timeout = idle_timeout
        self.max_instances = max_instances
        self.headless = headless
        self.is_initializing = False
        self.init_lock = asyncio.Lock()
        self.is_preheated = False  # True once the main page sits on the preheat URL
        # Temporary browsers for concurrent requests:
        # {session_id: {browser, context, page, created_at}}
        self.temp_browsers: Dict[str, Dict] = {}
        self.temp_lock = asyncio.Lock()
        print(f"[浏览器池] 已创建,常驻模式(不自动清理),最大实例数: {max_instances}", file=sys.stderr)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    async def _close_quietly(resource: Any, error_label: str) -> bool:
        """Await ``resource.close()`` and log instead of raising on failure.

        Returns:
            True if the resource was ``None`` or closed cleanly, else False.
        """
        if resource is None:
            return True
        try:
            await resource.close()
            return True
        except Exception as e:
            print(f"{error_label}: {str(e)}", file=sys.stderr)
            return False

    async def _close_main_browser(self) -> bool:
        """Close the main page, context and browser, keeping Playwright alive.

        References are cleared before closing and every resource is closed
        independently, so one failing ``close()`` can neither leave stale
        handles behind nor stop the remaining resources from being released.

        Returns:
            True if every close succeeded.
        """
        page, context, browser = self.page, self.context, self.browser
        self.page = None
        self.context = None
        self.browser = None
        all_closed = True
        for resource in (page, context, browser):
            all_closed = await self._close_quietly(resource, "[浏览器池] 关闭浏览器异常") and all_closed
        return all_closed

    async def _ensure_playwright(self, log_tag: str, announce: bool = False) -> None:
        """Start the shared Playwright driver if it is not running yet.

        Args:
            log_tag: Prefix for the Windows event-loop-policy warning.
            announce: Whether to log the successful start.
        """
        if self.playwright:
            return
        if sys.platform == 'win32':
            # Subprocess support on Windows needs the Proactor event loop.
            try:
                asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
            except Exception as e:
                print(f"{log_tag} 警告: 设置事件循环策略失败: {str(e)}", file=sys.stderr)
        self.playwright = await async_playwright().start()
        if announce:
            print("[浏览器池] Playwright启动成功", file=sys.stderr)

    def _build_launch_kwargs(self, headless: bool, proxy: Optional[str]) -> Dict[str, Any]:
        """Return keyword arguments for ``chromium.launch``."""
        launch_kwargs: Dict[str, Any] = {
            "headless": headless,
            "args": list(self._LAUNCH_ARGS),
        }
        if proxy:
            launch_kwargs["proxy"] = {"server": proxy}
        return launch_kwargs

    def _build_context_kwargs(self, user_agent: Optional[str]) -> Dict[str, Any]:
        """Return keyword arguments for ``browser.new_context``."""
        return {
            "viewport": {'width': 1280, 'height': 720},
            "user_agent": user_agent or self._DEFAULT_USER_AGENT,
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                          user_agent: Optional[str] = None, session_id: Optional[str] = None,
                          headless: Optional[bool] = None) -> "tuple[Browser, BrowserContext, Page]":
        """Return a ``(browser, context, page)`` triple, reusing one if possible.

        Without ``session_id`` the main browser is returned (launched on
        demand).  With ``session_id`` the temporary browser registered for
        that session is returned, or a new one is created.

        Args:
            cookies: Optional cookies.  Added to the existing context when
                the main browser is reused; ignored when an existing
                temporary browser is reused.
            proxy: Optional proxy server; only applied when a browser is
                launched by this call.
            user_agent: Optional User-Agent; only applied when a context is
                created by this call.
            session_id: Key that isolates a concurrent request in its own
                temporary browser.
            headless: Headless mode for a newly created temporary browser;
                ``None`` means the pool default.
                NOTE(review): the main browser always uses ``self.headless``
                and ignores this argument -- confirm that is intended.

        Returns:
            The ``(browser, context, page)`` triple.

        Raises:
            Exception: If a new temporary browser is needed but
                ``max_instances - 1`` are already registered, or if
                launching a browser fails.
        """
        if headless is None:
            headless = self.headless

        if not session_id:
            async with self.init_lock:
                if await self._is_browser_alive():
                    print("[浏览器池] 复用主浏览器实例", file=sys.stderr)
                    self.last_used_time = time.time()
                    # Inject cookies into the live context rather than
                    # creating a new one.
                    if cookies:
                        print(f"[浏览器池] 在现有context中注入 {len(cookies)} 个Cookie", file=sys.stderr)
                        await self.context.add_cookies(cookies)
                    return self.browser, self.context, self.page

                print("[浏览器池] 创建主浏览器实例", file=sys.stderr)
                await self._init_browser(cookies, proxy, user_agent)
                self.last_used_time = time.time()
                return self.browser, self.context, self.page

        # Concurrent request: reuse or create the session's temporary browser.
        async with self.temp_lock:
            if session_id in self.temp_browsers:
                print(f"[浏览器池] 复用会话 {session_id} 的临时浏览器", file=sys.stderr)
                browser_info = self.temp_browsers[session_id]
                return browser_info["browser"], browser_info["context"], browser_info["page"]

            # One slot is reserved for the main browser.
            if len(self.temp_browsers) >= self.max_instances - 1:
                print(f"[浏览器池] ⚠️ 已达最大实例数 ({self.max_instances}),等待释放...", file=sys.stderr)
                # TODO: a wait queue could be implemented; for now fail fast.
                raise Exception("浏览器实例数已满,请稍后再试")

            print(f"[浏览器池] 为会话 {session_id} 创建临时浏览器 ({len(self.temp_browsers)+1}/{self.max_instances-1})", file=sys.stderr)
            browser, context, page = await self._create_temp_browser(cookies, proxy, user_agent, headless)
            self.temp_browsers[session_id] = {
                "browser": browser,
                "context": context,
                "page": page,
                "created_at": time.time(),
            }
            return browser, context, page

    async def _is_browser_alive(self) -> bool:
        """Return True if the main browser still responds.

        The idle timeout is deliberately not checked (resident mode).  When
        the probe fails the main browser is closed so that the caller can
        rebuild it.
        """
        if not self.browser or not self.context or not self.page:
            return False
        try:
            # Fetching the title is a cheap round-trip that proves the
            # connection to the page still works.
            await self.page.title()
            return True
        except Exception as e:
            print(f"[浏览器池] 浏览器连接失效: {str(e)}", file=sys.stderr)
            await self.close()
            return False

    async def _init_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                            user_agent: Optional[str] = None):
        """Launch the main browser and create its context and page.

        Any main browser that is still referenced is closed first, so calling
        this repeatedly (for example via ``preheat``) does not orphan a
        running Chromium process.

        Args:
            cookies: Optional cookies to inject into the new context.
            proxy: Optional proxy server for the browser.
            user_agent: Optional User-Agent for the new context.

        Raises:
            Exception: Re-raised after ``close()`` if any step fails.
        """
        try:
            if self.browser or self.context or self.page:
                await self._close_main_browser()

            await self._ensure_playwright("[浏览器池]", announce=True)

            launch_kwargs = self._build_launch_kwargs(self.headless, proxy)
            self.browser = await self.playwright.chromium.launch(**launch_kwargs)
            print("[浏览器池] Chromium浏览器启动成功", file=sys.stderr)

            await self._create_new_context(cookies, proxy, user_agent)
        except Exception as e:
            print(f"[浏览器池] 初始化浏览器失败: {str(e)}", file=sys.stderr)
            await self.close()
            raise

    async def _create_new_context(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                                  user_agent: Optional[str] = None):
        """Replace the main context and page with fresh ones.

        Args:
            cookies: Optional cookies to inject into the new context.
            proxy: Accepted for signature compatibility; not used here
                (the proxy is applied when the browser is launched).
            user_agent: Optional User-Agent for the new context.

        Raises:
            Exception: Re-raised if the context or page cannot be created.
        """
        try:
            if self.context:
                # Drop the references first: the old page dies with its
                # context, and a failed close must not block the new context.
                stale_context = self.context
                self.context = None
                self.page = None
                if await self._close_quietly(stale_context, "[浏览器池] 关闭浏览器异常"):
                    print("[浏览器池] 已关闭旧上下文", file=sys.stderr)

            self.context = await self.browser.new_context(**self._build_context_kwargs(user_agent))

            if cookies:
                await self.context.add_cookies(cookies)
                print(f"[浏览器池] 已注入 {len(cookies)} 个Cookie", file=sys.stderr)

            self.page = await self.context.new_page()
            print("[浏览器池] 新页面创建成功", file=sys.stderr)
        except Exception as e:
            print(f"[浏览器池] 创建上下文失败: {str(e)}", file=sys.stderr)
            raise

    async def close(self):
        """Close the main browser and, when nothing else needs it, Playwright.

        Never raises for ordinary errors: every resource is closed on a
        best-effort basis and all main-browser references are reset.

        The Playwright driver is shared with the temporary browsers, so it is
        only stopped when no temporary browser is registered.  Release the
        temporary sessions first for a complete shutdown.
        """
        all_closed = await self._close_main_browser()

        if self.playwright:
            if self.temp_browsers:
                # Stopping the driver would take the temporary browsers that
                # were launched from it down as well.
                print(f"[浏览器池] 仍有 {len(self.temp_browsers)} 个临时浏览器在使用,保留Playwright实例", file=sys.stderr)
            else:
                driver = self.playwright
                self.playwright = None
                try:
                    await driver.stop()
                except Exception as e:
                    all_closed = False
                    print(f"[浏览器池] 关闭浏览器异常: {str(e)}", file=sys.stderr)

        if all_closed:
            print("[浏览器池] 浏览器已关闭", file=sys.stderr)

    async def cleanup_if_idle(self):
        """Idle cleanup hook for a periodic task; intentionally a no-op.

        Automatic cleanup is disabled so the main browser stays resident.
        """
        pass

    async def preheat(self, target_url: str = "https://creator.xiaohongshu.com/login"):
        """Launch the main browser and navigate it to ``target_url``.

        Failures are logged and not raised; the outcome is recorded in
        ``self.is_preheated``.

        Args:
            target_url: Page to open ahead of the first real request.
        """
        try:
            print("[浏览器预热] 开始预热浏览器...", file=sys.stderr)
            await self._init_browser()
            self.last_used_time = time.time()

            print(f"[浏览器预热] 正在访问: {target_url}", file=sys.stderr)
            await self.page.goto(target_url, wait_until='domcontentloaded', timeout=45000)
            # Give the page a moment to settle after DOMContentLoaded.
            await asyncio.sleep(1)

            self.is_preheated = True
            print("[浏览器预热] ✅ 预热完成,浏览器已就绪!", file=sys.stderr)
            print(f"[浏览器预热] 当前页面: {self.page.url}", file=sys.stderr)
        except Exception as e:
            print(f"[浏览器预热] ⚠️ 预热失败: {str(e)}", file=sys.stderr)
            print("[浏览器预热] 将在首次使用时再初始化", file=sys.stderr)
            self.is_preheated = False

    async def repreheat(self, target_url: str = "https://creator.xiaohongshu.com/login"):
        """Bring the main browser back to ``target_url`` after it was used.

        Skipped while temporary browsers are registered or when the main
        browser was used within the last 10 seconds, to avoid disturbing work
        in progress.  ``self.is_preheated`` reflects the real outcome: it is
        only True when the page actually ended up on the target site.

        Args:
            target_url: Page the main browser should be parked on.
        """
        if len(self.temp_browsers) > 0:
            print(f"[浏览器补充预热] 检测到 {len(self.temp_browsers)} 个临时浏览器正在使用,跳过预热避免干扰", file=sys.stderr)
            return

        # A very recent use suggests the caller may still be driving the page.
        time_since_last_use = time.time() - self.last_used_time
        if time_since_last_use < 10:
            print(f"[浏览器补充预热] 主浏览器最近 {time_since_last_use:.1f}秒前被使用,可能还在操作中,跳过预热", file=sys.stderr)
            return

        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                if not await self._is_browser_alive():
                    print(f"[浏览器补充预热] 浏览器未初始化,执行完整预热 (尝试 {retry_count + 1}/{max_retries})", file=sys.stderr)
                    # preheat() swallows its own errors and sets
                    # is_preheated itself, so the flag must not be forced
                    # to True here.
                    await self.preheat(target_url)
                    return

                current_url = self.page.url if self.page else ""
                if target_url in current_url:
                    print(f"[浏览器补充预热] 已在目标页面,无需补充预热: {current_url}", file=sys.stderr)
                    self.is_preheated = True
                    return

                print(f"[浏览器补充预热] 开始补充预热... (尝试 {retry_count + 1}/{max_retries})", file=sys.stderr)
                print(f"[浏览器补充预热] 当前页面: {current_url}", file=sys.stderr)

                # Re-check: a temporary browser may have started meanwhile.
                if len(self.temp_browsers) > 0:
                    print(f"[浏览器补充预热] 检测到新的临时浏览器启动,取消预热", file=sys.stderr)
                    return

                print(f"[浏览器补充预热] 正在访问: {target_url}", file=sys.stderr)
                await self.page.goto(target_url, wait_until='domcontentloaded', timeout=45000)
                # Extra wait so the page can finish loading.
                await asyncio.sleep(2)

                current_page_url = self.page.url
                if target_url in current_page_url or 'creator.xiaohongshu.com' in current_page_url:
                    self.is_preheated = True
                    self.last_used_time = time.time()
                    print("[浏览器补充预热] ✅ 补充预热完成!", file=sys.stderr)
                    print(f"[浏览器补充预热] 当前页面: {current_page_url}", file=sys.stderr)
                    return

                print(f"[浏览器补充预热] 页面未正确加载,期望: {target_url}, 实际: {current_page_url}", file=sys.stderr)
                raise Exception("页面未正确加载到目标地址")
            except Exception as e:
                retry_count += 1
                print(f"[浏览器补充预热] ⚠️ 补充预热失败 (尝试 {retry_count}/{max_retries}): {str(e)}", file=sys.stderr)

                if retry_count < max_retries:
                    await asyncio.sleep(2)
                    # Drop the possibly broken browser; the next iteration
                    # rebuilds it.  close() is best-effort and does not raise.
                    await self.close()
                    continue

                print(f"[浏览器补充预热] ❌ 所有重试都失败了,将尝试完整预热", file=sys.stderr)
                await self.close()
                try:
                    await self.preheat(target_url)
                except Exception as final_error:
                    print(f"[浏览器补充预热] ❌ 最终预热也失败: {str(final_error)}", file=sys.stderr)
                    self.is_preheated = False
                if self.is_preheated:
                    return

                # Even without a preheated page, try to leave a usable
                # browser behind.
                if not (self.browser and self.context and self.page):
                    try:
                        await self._init_browser()
                    except Exception:
                        pass  # already logged by _init_browser

    async def _create_temp_browser(self, cookies: Optional[list] = None, proxy: Optional[str] = None,
                                   user_agent: Optional[str] = None,
                                   headless: bool = True) -> "tuple[Browser, BrowserContext, Page]":
        """Launch a standalone browser for one concurrent session.

        Args:
            cookies: Optional cookies to inject into the new context.
            proxy: Optional proxy server for the browser.
            user_agent: Optional User-Agent for the new context.
            headless: Whether to launch without a visible window.

        Returns:
            The ``(browser, context, page)`` triple.  The caller is
            responsible for registering and later releasing it.

        Raises:
            Exception: Re-raised if any step fails; a browser that was
                already launched is closed first so it does not leak.
        """
        browser = None
        try:
            # The Playwright driver is shared with the main browser.
            await self._ensure_playwright("[临时浏览器]")

            launch_kwargs = self._build_launch_kwargs(headless, proxy)
            browser = await self.playwright.chromium.launch(**launch_kwargs)

            context = await browser.new_context(**self._build_context_kwargs(user_agent))
            if cookies:
                await context.add_cookies(cookies)
            page = await context.new_page()
            return browser, context, page
        except Exception as e:
            print(f"[临时浏览器] 创建失败: {str(e)}", file=sys.stderr)
            if browser is not None:
                await self._close_quietly(browser, "[临时浏览器] 创建失败")
            raise

    async def release_temp_browser(self, session_id: str):
        """Close and unregister the temporary browser of ``session_id``.

        Unknown session ids are ignored.  Page, context and browser are each
        closed on a best-effort basis, so a failing page close cannot leave
        the browser process running.  The entry is always removed.

        Args:
            session_id: Key the temporary browser was registered under.
        """
        async with self.temp_lock:
            browser_info = self.temp_browsers.get(session_id)
            if browser_info is None:
                return
            try:
                released = True
                for key in ("page", "context", "browser"):
                    released = await self._close_quietly(
                        browser_info.get(key), "[浏览器池] 释放临时浏览器异常"
                    ) and released
                if released:
                    print(f"[浏览器池] 已释放会话 {session_id} 的临时浏览器", file=sys.stderr)
            finally:
                self.temp_browsers.pop(session_id, None)

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of the pool state.

        The ``*_alive`` entries only report whether a reference is held; no
        liveness probe is performed.  ``idle_seconds`` is 0 until the main
        browser has been used at least once.
        """
        return {
            "browser_alive": self.browser is not None,
            "context_alive": self.context is not None,
            "page_alive": self.page is not None,
            "is_preheated": self.is_preheated,
            "temp_browsers_count": len(self.temp_browsers),
            "max_instances": self.max_instances,
            "last_used_time": self.last_used_time,
            "idle_seconds": int(time.time() - self.last_used_time) if self.last_used_time > 0 else 0,
            "idle_timeout": self.idle_timeout,
        }
# Module-level singleton; stays None until get_browser_pool() creates the pool.
_browser_pool: Optional[BrowserPool] = None
def get_browser_pool(idle_timeout: int = 1800, headless: bool = True,
                     max_instances: int = 5) -> BrowserPool:
    """Return the module-wide BrowserPool, creating it on first use.

    Args:
        idle_timeout: Idle timeout in seconds.  Only used when the pool is
            created by this call.
        headless: Whether browsers run headless; ``False`` shows windows,
            which is convenient for debugging.  If the pool already exists
            with a different value, its ``headless`` attribute is updated;
            this call does not restart a browser that is already running.
        max_instances: Maximum number of browser instances.  Only used when
            the pool is created by this call.

    Returns:
        The shared BrowserPool instance.
    """
    global _browser_pool
    if _browser_pool is None:
        print(f"[浏览器池] 创建单例,模式: {'headless' if headless else 'headed'}", file=sys.stderr)
        _browser_pool = BrowserPool(idle_timeout=idle_timeout, max_instances=max_instances,
                                    headless=headless)
    elif _browser_pool.headless != headless:
        # The headless setting changed since the pool was created: record it
        # on the existing pool.
        print(f"[浏览器池] 检测到headless配置变更: {_browser_pool.headless} -> {headless}", file=sys.stderr)
        _browser_pool.headless = headless
    return _browser_pool