Files
ai_crawler_tiktok/core/curl.py
2025-12-08 15:20:22 +08:00

74 lines
2.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import json
from urllib.request import Request, urlopen
def _split_curl_blocks(text):
"""按出现的 `curl ` 关键字切分文本为多个命令块"""
blocks = []
indices = [m.start() for m in re.finditer(r"\bcurl\s", text)]
if not indices:
return blocks
for i, start in enumerate(indices):
end = indices[i + 1] if i + 1 < len(indices) else len(text)
blocks.append(text[start:end])
return blocks
def _parse_block(block):
"""从单个 curl 命令块中解析 URL 与头部
返回:`{'url': str, 'headers': dict}`,若无法解析 URL 返回 None
"""
url_m = re.search(r"curl\s+['\"](.*?)['\"]", block, re.S)
if not url_m:
return None
url = url_m.group(1)
headers = {}
for hm in re.finditer(r"-H\s+['\"]([^:]+):\s*(.*?)['\"]", block):
k = hm.group(1).strip()
v = hm.group(2).strip()
headers[k.lower()] = v
cm = re.search(r"-b\s+['\"](.*?)['\"]", block, re.S)
if cm:
headers['cookie'] = cm.group(1)
return {'url': url, 'headers': headers}
def parse_curl_file(file_path):
    """Read a curl text file and parse it into a list of request specs.

    Args:
        file_path: path of the UTF-8 text file containing curl commands.

    Returns:
        A list of ``{'url': ..., 'headers': ...}`` dicts, one per curl
        block that yields a URL; blocks without a parseable URL are
        silently dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    parsed_blocks = map(_parse_block, _split_curl_blocks(content))
    return [spec for spec in parsed_blocks if spec]
def fetch_from_curl(file_path, index=0, timeout=30):
    """Select the ``index``-th parsed curl request and issue a GET.

    Args:
        file_path: curl text file to parse.
        index: which parsed curl block to send (0-based).
        timeout: request timeout in seconds.

    Returns:
        The decoded object when the response body parses as JSON, the
        raw response bytes otherwise, or ``None`` when no request could
        be parsed or ``index`` is out of range.
    """
    requests = parse_curl_file(file_path)
    if not requests or not 0 <= index < len(requests):
        return None
    chosen = requests[index]
    request = Request(chosen['url'], headers=chosen['headers'], method='GET')
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    # Best effort: prefer JSON, fall back to the raw bytes on any failure.
    try:
        return json.loads(body.decode('utf-8', errors='ignore'))
    except Exception:
        return body
"""curl 文本解析与请求发送工具
职责:
- 将包含多个 curl 命令的文本切分为块
- 从每个块解析 URL 与请求头(含 Cookie
- 基于解析结果发起 GET 请求并尝试返回 JSON
"""