Files
ai_crawler_tiktok/core/curl.py
2025-12-08 15:20:22 +08:00

74 lines
2.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import json
from urllib.request import Request, urlopen
def _split_curl_blocks(text):
"""按出现的 `curl ` 关键字切分文本为多个命令块"""
blocks = []
indices = [m.start() for m in re.finditer(r"\bcurl\s", text)]
if not indices:
return blocks
for i, start in enumerate(indices):
end = indices[i + 1] if i + 1 < len(indices) else len(text)
blocks.append(text[start:end])
return blocks
def _parse_block(block):
"""从单个 curl 命令块中解析 URL 与头部
返回:`{'url': str, 'headers': dict}`,若无法解析 URL 返回 None
"""
url_m = re.search(r"curl\s+['\"](.*?)['\"]", block, re.S)
if not url_m:
return None
url = url_m.group(1)
headers = {}
for hm in re.finditer(r"-H\s+['\"]([^:]+):\s*(.*?)['\"]", block):
k = hm.group(1).strip()
v = hm.group(2).strip()
headers[k.lower()] = v
cm = re.search(r"-b\s+['\"](.*?)['\"]", block, re.S)
if cm:
headers['cookie'] = cm.group(1)
return {'url': url, 'headers': headers}
def parse_curl_file(file_path):
    """Read a curl text file and parse it into a list of request specs.

    Args:
        file_path: path of the UTF-8 text file containing curl commands.

    Returns:
        A list of ``{'url': ..., 'headers': ...}`` dicts, one per curl
        block that yields a URL; blocks without a parseable URL are
        silently dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    parsed_blocks = map(_parse_block, _split_curl_blocks(content))
    return [spec for spec in parsed_blocks if spec]
def fetch_from_curl(file_path, index=0, timeout=30):
    """Select the ``index``-th parsed curl request and issue a GET.

    Args:
        file_path: curl text file to parse.
        index: which parsed curl block to send (0-based).
        timeout: request timeout in seconds.

    Returns:
        The decoded object when the response body parses as JSON, the
        raw response bytes otherwise, or ``None`` when no request could
        be parsed or ``index`` is out of range.
    """
    requests = parse_curl_file(file_path)
    if not requests or not 0 <= index < len(requests):
        return None
    chosen = requests[index]
    request = Request(chosen['url'], headers=chosen['headers'], method='GET')
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    # Best effort: prefer JSON, fall back to the raw bytes on any failure.
    try:
        return json.loads(body.decode('utf-8', errors='ignore'))
    except Exception:
        return body
"""curl 文本解析与请求发送工具
职责:
- 将包含多个 curl 命令的文本切分为块
- 从每个块解析 URL 与请求头(含 Cookie
- 基于解析结果发起 GET 请求并尝试返回 JSON
"""