ai_wht_wechat/backend/xhs_publish.py

"""
小红书笔记发布脚本
提供Cookie、文案（标题、内容、标签、图片）完成发布操作
支持本地图片路径和网络URL图片
"""
import sys
import json
import asyncio
import io
import os
import re
import aiohttp
import hashlib
import unicodedata
from typing import List, Dict, Any, Union
from pathlib import Path
from xhs_login import XHSLoginService


class XHSPublishService:
    """小红书笔记发布服务"""

    def __init__(self, cookies: Union[List[Dict[str, Any]], Dict[str, str]], proxy: str | None = None, user_agent: str | None = None):
        """
        初始化发布服务

        Args:
            cookies: Cookie数据，支持两种格式：
                1. Playwright格式（列表）: [{"name": "a1", "value": "xxx", "domain": "...", ...}]
                2. 键值对格式（字典）: {"a1": "xxx", "webId": "yyy", ...}
            proxy: 可选的代理地址（例如 http://user:pass@ip:port）
            user_agent: 可选的自定义User-Agent
        """
        # 转换Cookie格式
        self.cookies = self._normalize_cookies(cookies)
        self.proxy = proxy
        self.user_agent = user_agent
        self.service = XHSLoginService()
        self.temp_dir = "temp_downloads"  # 临时下载目录
        self.downloaded_files = []  # 记录下载的文件，用于清理

    def _normalize_cookies(self, cookies: Union[List[Dict[str, Any]], Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        将Cookie标准化为Playwright格式

        Args:
            cookies: 输入的Cookie（支持两种格式）

        Returns:
            Playwright格式的Cookie列表
        """
        # 如果已经是列表格式（Playwright格式）
        if isinstance(cookies, list):
            # 检查是否包含必要字段
            if cookies and 'name' in cookies[0] and 'value' in cookies[0]:
                print("✅ 使用 Playwright 格式的 Cookie", file=sys.stderr)
                return cookies

        # 如果是字典格式（键值对格式），转换为Playwright格式
        if isinstance(cookies, dict):
            print("✅ 检测到键值对格式的 Cookie，转换为 Playwright 格式", file=sys.stderr)
            playwright_cookies = []
            for name, value in cookies.items():
                cookie = {
                    "name": name,
                    "value": str(value),
                    "domain": ".xiaohongshu.com",
                    "path": "/",
                    "expires": -1,  # 会话Cookie
                    "httpOnly": False,
                    "secure": False,
                    "sameSite": "Lax"
                }

                # 特殊处理某些Cookie的属性
                if name == "web_session":
                    cookie["httpOnly"] = True
                    cookie["secure"] = True
                elif name in ["acw_tc"]:
                    cookie["httpOnly"] = True

                playwright_cookies.append(cookie)

            print(f"  转换了 {len(playwright_cookies)} 个 Cookie", file=sys.stderr)
            return playwright_cookies

        # 如果格式不支持，抛出异常
        raise ValueError(f"不支持的Cookie格式: {type(cookies)}。请使用列表或字典格式。")

    def _calculate_title_width(self, title: str) -> int:
        width = 0
        for ch in title:
            if unicodedata.east_asian_width(ch) in ("F", "W"):
                width += 2
            else:
                width += 1
        return width

    def is_url(self, path: str) -> bool:
        """
        判断是否为网络URL

        Args:
            path: 图片路径或URL

        Returns:
            是否为URL
        """
        url_pattern = re.compile(r'^https?://', re.IGNORECASE)
        return bool(url_pattern.match(path))

    async def download_image(self, url: str, index: int = 0) -> str:
        """
        下载网络图片到本地临时目录

        Args:
            url: 图片URL
            index: 图片索引（用于命名）

        Returns:
            本地文件路径
        """
        try:
            print(f"  正在下载图片 [{index + 1}]: {url}", file=sys.stderr)

            # 创建临时目录
            Path(self.temp_dir).mkdir(exist_ok=True)

            # 生成文件名（使用URL的hash值）
            url_hash = hashlib.md5(url.encode()).hexdigest()[:10]

            # 从URL提取文件扩展名
            ext = '.jpg'  # 默认扩展名
            url_path = url.split('?')[0]  # 去除URL参数
            if '.' in url_path:
                ext = '.' + url_path.split('.')[-1].lower()
                if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp']:
                    ext = '.jpg'

            filename = f"image_{index}_{url_hash}{ext}"
            filepath = os.path.join(self.temp_dir, filename)

            # 下载图片
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status == 200:
                        content = await response.read()

                        # 保存文件
                        with open(filepath, 'wb') as f:
                            f.write(content)

                        # 记录已下载文件
                        self.downloaded_files.append(filepath)

                        # 获取文件大小
                        file_size = len(content) / 1024  # KB
                        print(f"  ✅ 下载成功: {filename} ({file_size:.1f}KB)", file=sys.stderr)

                        return os.path.abspath(filepath)
                    else:
                        raise Exception(f"下载失败，HTTP状态码: {response.status}")

        except asyncio.TimeoutError:
            raise Exception(f"下载超时: {url}")
        except Exception as e:
            raise Exception(f"下载图片失败 ({url}): {str(e)}")

    async def process_images(self, images: List[str]) -> List[str]:
        """
        处理图片列表，将网络URL下载到本地

        Args:
            images: 图片路径列表（可以是本地路径或网络URL）

        Returns:
            本地图片路径列表
        """
        if not images:
            return []

        local_images = []

        # OSS域名前缀（用于补充不完整的图片路径）
        oss_prefix = "https://bxmkb-beijing.oss-cn-beijing.aliyuncs.com/Images/"

        print(f"\n正在处理 {len(images)} 张图片...", file=sys.stderr)

        for i, img in enumerate(images):
            # 检查是否需要补充OSS前缀
            original_img = img
            print(f"  [调试] 处理图片 {i+1}: '{img}'", file=sys.stderr)
            print(f"  [调试] is_url={self.is_url(img)}, isabs={os.path.isabs(img)}", file=sys.stderr)

            if not self.is_url(img) and not os.path.isabs(img):
                # 不是URL也不是绝对路径，检查是否需要补充OSS前缀
                print(f"  [调试] 不是URL也不是绝对路径", file=sys.stderr)
                # 如果路径不包含协议且不以/开头，可能是相对OSS路径
                if '/' in img and not img.startswith('/'):
                    # 可能是OSS相对路径，补充前缀
                    img = oss_prefix + img
                    print(f"  ✅ 检测到相对路径，补充OSS前缀: {original_img} -> {img}", file=sys.stderr)
                else:
                    print(f"  [调试] 不满足补充条件: '/' in img={('/' in img)}, not startswith('/')={not img.startswith('/')}", file=sys.stderr)

            if self.is_url(img):
                # 网络URL，需要下载
                try:
                    local_path = await self.download_image(img, i)
                    local_images.append(local_path)
                except Exception as e:
                    print(f"  ⚠️ 图片下载失败: {str(e)}", file=sys.stderr)
                    # 继续处理其他图片
                    continue
            else:
                # 本地路径
                # 先尝试直接使用，如果不存在则尝试相对路径
                abs_path = None

                # 1. 尝试作为绝对路径
                if os.path.isabs(img) and os.path.exists(img):
                    abs_path = img
                # 2. 尝试相对于当前工作目录
                elif os.path.exists(img):
                    abs_path = os.path.abspath(img)
                # 3. 尝试相对于 static 目录
                elif os.path.exists(os.path.join('static', img)):
                    abs_path = os.path.abspath(os.path.join('static', img))
                # 4. 尝试相对于 ../go_backend/static 目录
                elif os.path.exists(os.path.join('..', 'go_backend', 'static', img)):
                    abs_path = os.path.abspath(os.path.join('..', 'go_backend', 'static', img))

                if abs_path:
                    local_images.append(abs_path)
                    print(f"  ✅ 本地图片 [{i + 1}]: {os.path.basename(abs_path)} ({abs_path})", file=sys.stderr)
                else:
                    print(f"  ⚠️ 本地图片不存在: {img}", file=sys.stderr)

        print(f"\n成功处理 {len(local_images)}/{len(images)} 张图片", file=sys.stderr)
        return local_images

    def cleanup_temp_files(self):
        """
        清理临时下载的文件
        """
        if not self.downloaded_files:
            return

        print(f"\n清理 {len(self.downloaded_files)} 个临时文件...", file=sys.stderr)
        for filepath in self.downloaded_files:
            try:
                if os.path.exists(filepath):
                    os.remove(filepath)
                    print(f"  已删除: {os.path.basename(filepath)}", file=sys.stderr)
            except Exception as e:
                print(f"  删除失败 {filepath}: {e}", file=sys.stderr)

        # 清空记录
        self.downloaded_files = []

    async def publish(
        self,
        title: str,
        content: str,
        images: List[str] = None,
        tags: List[str] = None,
        cleanup: bool = True
    ) -> Dict[str, Any]:
        """
        发布笔记

        Args:
            title: 笔记标题
            content: 笔记内容
            images: 图片路径列表（支持本地文件路径或网络URL）
            tags: 标签列表（例如：["美食", "探店"]）
            cleanup: 是否清理临时下载的图片文件（默认True）

        Returns:
            Dict containing success status, message, and publish result
        """
        try:
            print("\n========== 开始发布小红书笔记 ==========", file=sys.stderr)
            print(f"标题: {title}", file=sys.stderr)
            print(f"内容: {content[:100]}{'...' if len(content) > 100 else ''}", file=sys.stderr)
            print(f"图片: {len(images) if images else 0} 张", file=sys.stderr)
            print(f"标签: {tags if tags else []}", file=sys.stderr)

            width = self._calculate_title_width(title)
            if width > 40:
                return {
                    "success": False,
                    "error": f"标题长度超过限制（当前宽度 {width}，平台限制 40）"
                }

            if tags:
                if len(tags) > 10:
                    tags = tags[:10]
                    print("⚠️ 标签数量超过10，已截取前10个标签", file=sys.stderr)

            local_images = None
            if images:
                local_images = await self.process_images(images)
                if not local_images:
                    print("⚠️ 警告：没有可用的图片", file=sys.stderr)
                    return {
                        "success": False,
                        "error": "没有可用的图片，无法发布笔记"
                    }

            # 初始化浏览器并注入Cookie
            print("\n1. 初始化浏览器...", file=sys.stderr)
            await self.service.init_browser(cookies=self.cookies, proxy=self.proxy, user_agent=self.user_agent)

            # 验证登录状态
            print("\n2. 验证登录状态...", file=sys.stderr)
            verify_result = await self.service.verify_login_status()

            if not verify_result.get('logged_in'):
                return {
                    "success": False,
                    "error": "Cookie已失效或未登录",
                    "details": verify_result
                }

            print("✅ 登录状态有效", file=sys.stderr)

            # 发布笔记
            print("\n3. 开始发布笔记...", file=sys.stderr)
            result = await self.service.publish_note(
                title=title,
                content=content,
                images=local_images,
                topics=tags
            )

            print("\n========== 发布完成 ==========", file=sys.stderr)
            return result

        except Exception as e:
            print(f"\n发布异常: {str(e)}", file=sys.stderr)
            return {
                "success": False,
                "error": str(e)
            }

        finally:
            # 关闭浏览器
            await self.service.close_browser()

            # 清理临时文件
            if cleanup:
                self.cleanup_temp_files()


async def publish_from_config(config_file: str) -> Dict[str, Any]:
    """
    Read publish parameters from a JSON config file and publish the note.

    The file must contain a JSON object. Recognized keys: ``cookies``,
    ``title``, ``content`` (all required and non-empty), plus optional
    ``images``, ``tags``, ``proxy`` and ``user_agent``.

    Args:
        config_file: Path to the JSON config file.

    Returns:
        The publish result dict. Failures are reported as
        ``{"success": False, "error": ...}`` rather than raised.
    """
    # Keep this try limited to reading/parsing so that only genuine config
    # file problems are reported as such.
    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {
            "success": False,
            "error": f"读取配置文件失败: {str(e)}"
        }

    if not isinstance(config, dict):
        return {
            "success": False,
            "error": "读取配置文件失败: 配置文件内容必须是JSON对象"
        }

    cookies = config.get('cookies', [])
    title = config.get('title', '')
    content = config.get('content', '')
    images = config.get('images', [])
    tags = config.get('tags', [])
    proxy = config.get('proxy')
    user_agent = config.get('user_agent')

    if not cookies:
        return {
            "success": False,
            "error": "缺少Cookie参数"
        }

    if not title or not content:
        return {
            "success": False,
            "error": "标题和内容不能为空"
        }

    # Image existence is not checked here because entries may be URLs;
    # XHSPublishService.process_images handles that.
    try:
        publisher = XHSPublishService(cookies, proxy=proxy, user_agent=user_agent)
        return await publisher.publish(
            title=title,
            content=content,
            images=images,
            tags=tags
        )
    except Exception as e:
        # e.g. an unsupported cookie format rejected by the constructor.
        return {
            "success": False,
            "error": str(e)
        }


async def publish_from_params(
    cookies_json: str,
    title: str,
    content: str,
    images_json: str = None,
    tags_json: str = None
) -> Dict[str, Any]:
    """
    Publish a note from command-line style string parameters.

    Args:
        cookies_json: Either a cookie JSON string or the path of a file
            containing cookie JSON.
        title: Note title.
        content: Note body.
        images_json: JSON array of image paths/URLs (optional).
        tags_json: JSON array of tags (optional).

    Returns:
        The publish result dict. Failures are reported as
        ``{"success": False, "error": ...}`` rather than raised.
    """
    try:
        cookies = None

        # An existing file path wins over interpreting the value as JSON.
        if os.path.isfile(cookies_json):
            try:
                with open(cookies_json, 'r', encoding='utf-8') as f:
                    cookies = json.load(f)
                # Log to stderr: stdout is reserved for the JSON result that
                # main() prints.
                print(f"✅ 从文件加载 Cookie: {cookies_json}", file=sys.stderr)
            except Exception as e:
                return {
                    "success": False,
                    "error": f"读取 Cookie 文件失败: {str(e)}"
                }
        else:
            try:
                cookies = json.loads(cookies_json)
                print("✅ 从 JSON 字符串解析 Cookie", file=sys.stderr)
            except json.JSONDecodeError as e:
                return {
                    "success": False,
                    "error": f"Cookie 参数既不是有效文件路径，也不是有效 JSON 字符串: {str(e)}"
                }

        if not cookies:
            return {
                "success": False,
                "error": "Cookie 为空"
            }

        # Both optional lists must decode to JSON arrays; a bare JSON string
        # would otherwise be iterated character by character downstream.
        images = []
        if images_json:
            images = json.loads(images_json)
            if not isinstance(images, list):
                return {
                    "success": False,
                    "error": "images 参数必须是JSON数组"
                }

        tags = []
        if tags_json:
            tags = json.loads(tags_json)
            if not isinstance(tags, list):
                return {
                    "success": False,
                    "error": "tags 参数必须是JSON数组"
                }

        # Command-line mode does not yet accept a proxy or custom User-Agent.
        publisher = XHSPublishService(cookies)
        result = await publisher.publish(
            title=title,
            content=content,
            images=images,
            tags=tags
        )

        return result

    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"JSON解析失败: {str(e)}"
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }


def main():
    """
    Command-line entry point.

    Usage:
    1. Publish from a config file:
       python xhs_publish.py --config publish_config.json

    2. Publish from command-line parameters:
       python xhs_publish.py --cookies '<cookies_json>' --title 'title' --content 'content' [--images '<images_json>'] [--tags '<tags_json>']

    The result is printed to stdout as JSON. Usage errors and unexpected
    exceptions print a ``{"success": false, "error": ...}`` object and exit
    with status 1.
    """
    # Force UTF-8 on the standard streams on Windows.
    if sys.platform == 'win32':
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    def abort(message):
        # Emit a single-line JSON error and terminate with exit status 1.
        print(json.dumps({"success": False, "error": message}, ensure_ascii=False))
        sys.exit(1)

    if len(sys.argv) < 2:
        abort("缺少参数，请使用 --config 或 --cookies")

    try:
        argv = sys.argv[1:]
        mode = argv[0]

        if mode == '--config':
            # Mode 1: everything comes from a JSON config file.
            if len(argv) < 2:
                abort("缺少配置文件路径")

            outcome = asyncio.run(publish_from_config(argv[1]))
            print(json.dumps(outcome, ensure_ascii=False, indent=2))

        elif mode == '--cookies':
            # Mode 2: individual flags, each followed by its value.
            flag_to_key = {
                '--cookies': 'cookies',
                '--title': 'title',
                '--content': 'content',
                '--images': 'images',
                '--tags': 'tags',
            }
            parsed = {}
            pos = 0
            total = len(argv)
            while pos < total:
                key = flag_to_key.get(argv[pos])
                if key is not None and pos + 1 < total:
                    parsed[key] = argv[pos + 1]
                    pos += 2
                else:
                    # Unknown token, or a flag with no value after it.
                    pos += 1

            if 'cookies' not in parsed:
                abort("缺少 --cookies 参数")

            if not ('title' in parsed and 'content' in parsed):
                abort("缺少 --title 或 --content 参数")

            outcome = asyncio.run(publish_from_params(
                cookies_json=parsed['cookies'],
                title=parsed['title'],
                content=parsed['content'],
                images_json=parsed.get('images'),
                tags_json=parsed.get('tags')
            ))
            print(json.dumps(outcome, ensure_ascii=False, indent=2))

        else:
            abort(f"未知参数: {mode}，请使用 --config 或 --cookies")

    except Exception as exc:
        # SystemExit raised by abort() is not an Exception subclass, so it
        # passes through this handler untouched.
        abort(str(exc))


if __name__ == "__main__":
    main()