Initial commit

This commit is contained in:
徐微
2025-12-08 15:20:22 +08:00
commit 1d0077510a
28 changed files with 9050234 additions and 0 deletions

Binary file not shown.

Binary file not shown.

208
tiktok/comments.py Normal file
View File

@@ -0,0 +1,208 @@
import json
import re
import threading
import time
from urllib.parse import urlparse, parse_qs, urlencode
from urllib.request import Request, urlopen
from core.curl import parse_curl_file
from utils.io import ensure_csv_header, append_csv_rows
from data.store import save_comments_snapshot
def _extract_aweme_id(link):
"""从视频链接中提取 aweme_id/video/<id>"""
m = re.search(r"/video/(\d+)", link)
return m.group(1) if m else None
def fetch_comments_aweme(aweme_id, file_path, count=20, max_pages=50, timeout=30, total_limit=None, referer=None):
    """Fetch the comments of one video, page by page.

    Parameters:
        aweme_id: video id (string or int).
        file_path: curl text file; the first parsed request supplies the
            baseline comment-endpoint URL and headers.
        count / max_pages / timeout: page size, page cap and socket timeout.
        total_limit: optional hard cap on the total number of comments.
        referer: optional page URL placed in the ``referer`` header.

    Behaviour: each page is fetched with up to 3 attempts and a linear
    backoff; when a page yields no comments, a fallback endpoint is tried;
    pagination follows ``has_more`` / ``cursor`` / ``next_cursor``.

    Returns a list of raw comment objects. Fix: when ``total_limit`` is set,
    the result is now truncated to exactly that many entries (previously the
    last page could push the list past the limit).
    """
    reqs = parse_curl_file(file_path)
    if not reqs:
        return []
    base = reqs[0]
    headers = dict(base['headers'])
    if referer:
        headers['referer'] = referer

    def _get_json(url, attempts):
        # Best-effort GET + JSON decode with linear backoff; {} on failure.
        for i in range(attempts):
            try:
                req = Request(url, headers=headers, method='GET')
                with urlopen(req, timeout=timeout) as resp:
                    return json.loads(resp.read().decode('utf-8', errors='ignore'))
            except Exception:
                time.sleep(0.5 * (i + 1))
        return {}

    cursor = 0
    all_comments = []
    for _ in range(max_pages):
        parts = urlparse(base['url'])
        q = parse_qs(parts.query)
        q['aweme_id'] = [str(aweme_id)]
        q['count'] = [str(count)]
        q['cursor'] = [str(cursor)]
        page_url = parts._replace(query=urlencode(q, doseq=True)).geturl()
        obj = _get_json(page_url, 3)
        if not obj.get('comments'):
            # Fallback endpoint when the baseline request yields nothing.
            alt_url = 'https://www.tiktok.com/api/comment/list/?' + urlencode(
                {'aid': 1988, 'aweme_id': aweme_id, 'count': count, 'cursor': cursor})
            for _attempt in range(2):
                alt = _get_json(alt_url, 1)
                if alt.get('comments'):
                    obj = alt
                    break
        comments = obj.get('comments') or []
        all_comments.extend(comments)
        if isinstance(total_limit, int) and 0 < total_limit <= len(all_comments):
            del all_comments[total_limit:]  # enforce the cap exactly
            break
        has_more = obj.get('has_more')
        next_cursor = obj.get('cursor') or obj.get('next_cursor')
        if has_more in (True, 1) and isinstance(next_cursor, int):
            cursor = next_cursor
            continue
        if comments and isinstance(next_cursor, int):
            # Some responses omit has_more but still advance the cursor.
            cursor = next_cursor
            continue
        break
    return all_comments
def fetch_replies(comment_id, aweme_id, file_path, count=20, max_pages=50, timeout=30, total_limit=None):
    """Fetch the second-level replies of one comment, page by page.

    Parameters:
        comment_id / aweme_id: identify the comment and its video.
        file_path: curl text file; headers of the first parsed request are
            reused against the reply endpoint.
        count / max_pages / timeout: page size, page cap and socket timeout.
        total_limit: optional hard cap on the total number of replies.

    Returns a list of raw reply objects. Fix: when ``total_limit`` is set,
    the result is now truncated to exactly that many entries (previously the
    last page could push the list past the limit).
    """
    reqs = parse_curl_file(file_path)
    if not reqs:
        return []
    headers = reqs[0]['headers']
    endpoint = 'https://www.tiktok.com/api/comment/list/reply/'

    def _get_json(url):
        # Best-effort GET + JSON decode with linear backoff; {} on failure.
        for i in range(3):
            try:
                req = Request(url, headers=headers, method='GET')
                with urlopen(req, timeout=timeout) as resp:
                    return json.loads(resp.read().decode('utf-8', errors='ignore'))
            except Exception:
                time.sleep(0.5 * (i + 1))
        return {}

    cursor = 0
    replies = []
    for _ in range(max_pages):
        params = {'aid': 1988, 'aweme_id': aweme_id, 'comment_id': comment_id, 'count': count, 'cursor': cursor}
        obj = _get_json(endpoint + '?' + urlencode(params))
        replies.extend(obj.get('comments') or [])
        if isinstance(total_limit, int) and 0 < total_limit <= len(replies):
            del replies[total_limit:]  # enforce the cap exactly
            break
        has_more = obj.get('has_more')
        next_cursor = obj.get('cursor')
        if has_more in (True, 1) and isinstance(next_cursor, int):
            cursor = next_cursor
            continue
        break
    return replies
# Module-level locks shared by the worker threads in save_comments_from_links:
_csv_lock = threading.Lock()      # serialises CSV appends
_print_lock = threading.Lock()    # keeps progress lines from interleaving
_results_lock = threading.Lock()  # guards the shared results list
def save_comments_from_links(links, out_path, file_path, count=20, pages=50, timeout=30, reply_count=20, reply_pages=50, total_limit=None, reply_total_limit=None, csv_path=None, workers=None):
    """Concurrently fetch comments + replies per video link and save a snapshot.

    Concurrency: one daemon thread per link; an optional semaphore
    (``workers``) bounds how many run at once.
    CSV: when ``csv_path`` is given, appends ``username,text`` rows for both
    top-level comments and their replies.
    Output: writes ``{'items': [{link, count, comments: [...]}, ...]}`` via
    ``save_comments_snapshot`` and returns ``out_path``.
    """
    ensure_csv_header(csv_path, ['username', 'text'])
    results = []
    sem = None
    if isinstance(workers, int) and workers > 0:
        sem = threading.Semaphore(workers)
    def _process(link):
        # Worker body: fetch the link's comments, enrich each with its
        # replies, then record CSV rows / results under their locks.
        if sem:
            sem.acquire()
        with _print_lock:
            print(f"[START] {link}", flush=True)
        try:
            cs = fetch_comments_aweme(_extract_aweme_id(link), file_path=file_path, count=count, max_pages=pages, timeout=timeout, total_limit=total_limit, referer=link)
            enriched = []
            for c in cs:
                cid = c.get('cid')
                if cid:
                    rs = fetch_replies(cid, _extract_aweme_id(link), file_path=file_path, count=reply_count, max_pages=reply_pages, timeout=timeout, total_limit=reply_total_limit)
                    # Copy before mutating so the objects in `cs` stay untouched.
                    c = dict(c)
                    c['replies'] = rs
                    c['reply_count'] = len(rs)
                enriched.append(c)
                try:
                    with _print_lock:
                        print(f"{link} | cid={c.get('cid')} | create_time={c.get('create_time')} | reply_count={c.get('reply_count', 0)} | text={c.get('text')}", flush=True)
                except Exception:
                    pass  # progress printing is best-effort
                if csv_path:
                    # Username preference: unique_id, then nickname, then uid.
                    u = c.get('user') or {}
                    uname = u.get('unique_id') or u.get('nickname') or u.get('uid') or ''
                    rows = [[uname, c.get('text')]]
                    for r in c.get('replies', []) or []:
                        ru = r.get('user') or {}
                        runame = ru.get('unique_id') or ru.get('nickname') or ru.get('uid') or ''
                        rows.append([runame, r.get('text')])
                    with _csv_lock:
                        append_csv_rows(csv_path, rows)
            with _results_lock:
                results.append({'link': link, 'count': len(cs), 'comments': enriched})
            reply_total = sum(len(c.get('replies') or []) for c in enriched)
            with _print_lock:
                print(f"[DONE] {link} comments={len(cs)} replies={reply_total}", flush=True)
        except Exception as e:
            # Best-effort per link: report and move on so one bad link does
            # not abort the whole batch.
            with _print_lock:
                print(f"[ERROR] {link} {e}", flush=True)
        finally:
            if sem:
                sem.release()
    threads = []
    for link in links:
        t = threading.Thread(target=_process, args=(link,))
        t.daemon = True
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    save_comments_snapshot(out_path, results)
    return out_path
"""TikTok 评论与回复抓取模块
能力:
- 根据视频链接提取 aweme_id
- 通过评论接口分页拉取评论(支持兜底接口)
- 针对每条评论抓取二级回复并汇总
- 可选写入 CSV 与打印进度日志
"""

171
tiktok/search.py Normal file
View File

@@ -0,0 +1,171 @@
import json
import re
import threading
import time
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from urllib.request import Request, urlopen
from core.curl import parse_curl_file
from data.store import save_links_snapshot
def _update_query(url, updates):
"""在原始 URL 上用 `updates` 更新查询参数并返回新 URL"""
p = urlparse(url)
q = parse_qs(p.query)
for k, v in updates.items():
q[k] = [str(v)]
new_q = urlencode(q, doseq=True)
return urlunparse((p.scheme, p.netloc, p.path, p.params, new_q, p.fragment))
def _extract_links(obj):
"""从返回对象中提取视频链接
优先从 `data -> item -> author.uniqueId + item.id` 组合;
同时遍历字符串字段,用正则匹配 tiktok 链接作为兜底。
返回:链接列表(可能包含重复,外层负责去重)。
"""
links = []
data = obj.get('data') if isinstance(obj, dict) else None
if isinstance(data, list):
for e in data:
if isinstance(e, dict) and e.get('type') == 1 and isinstance(e.get('item'), dict):
it = e['item']
author = it.get('author') or {}
uid = author.get('uniqueId')
vid = it.get('id')
if uid and vid:
links.append(f"https://www.tiktok.com/@{uid}/video/{vid}")
patterns = [
r"https?://www\.tiktok\.com/[\w@._-]+/video/\d+",
r"https?://www\.tiktok\.com/video/\d+",
r"https?://vm\.tiktok\.com/[\w-]+",
r"https?://vt\.tiktok\.com/[\w-]+",
]
def rec(x):
if isinstance(x, dict):
for v in x.values():
rec(v)
elif isinstance(x, list):
for v in x:
rec(v)
elif isinstance(x, str):
s = x
for pat in patterns:
for m in re.finditer(pat, s):
links.append(m.group(0))
rec(obj)
return links
def search_video_links(keyword, file_path, max_pages=50, timeout=30, count=None, on_link=None):
    """Page through the search endpoint and collect video links for *keyword*.

    Input: the 2nd curl request parsed from ``file_path`` supplies the base
    URL and headers.
    Behaviour: fetch each page with retries, extract links, and invoke
    ``on_link`` for each link not seen before in this call.
    Returns the discovered links in first-seen order (de-duplicated within
    this call via ``seen``; callers may de-duplicate again across keywords).
    """
    reqs = parse_curl_file(file_path)
    if len(reqs) < 2:
        # The search baseline is expected as the 2nd request in the curl file.
        return []
    base = reqs[1]
    headers = base['headers']
    parsed = urlparse(base['url'])
    q = parse_qs(parsed.query)
    if count is None:
        # Default the page size from the recorded request's query, else 12.
        if 'count' in q:
            try:
                count = int(q['count'][0])
            except Exception:
                count = 12
        else:
            count = 12
    all_links = []
    seen = set()
    offset = 0
    cursor = None
    for _ in range(max_pages):
        params = {'keyword': keyword, 'count': count}
        # Prefer the server-provided cursor once one arrives; otherwise use
        # the locally-advanced offset. NOTE(review): after switching to
        # cursor paging, `offset` is never advanced again — if a later page
        # omits the cursor, the stale offset re-fetches an old page and the
        # `new == 0` check below terminates the loop. Confirm this is the
        # intended stop condition.
        if cursor is not None:
            params['offset'] = cursor
        else:
            params['offset'] = offset
        u = _update_query(base['url'], params)
        data = None
        for i in range(3):
            # Up to 3 attempts with linear backoff (0.5s, 1.0s, 1.5s).
            try:
                req = Request(u, headers=headers, method='GET')
                with urlopen(req, timeout=timeout) as resp:
                    data = resp.read()
                break
            except Exception:
                time.sleep(0.5 * (i + 1))
                data = None
        try:
            obj = json.loads(data.decode('utf-8', errors='ignore'))
        except Exception:
            # Covers both failed downloads (data is None) and malformed JSON.
            obj = {}
        links = _extract_links(obj)
        has_more = obj.get('has_more')
        next_cursor = obj.get('cursor')
        new = 0
        for l in links:
            if l not in seen:
                seen.add(l)
                all_links.append(l)
                new += 1
                if on_link:
                    try:
                        on_link(l)
                    except Exception:
                        pass  # callbacks are best-effort
        if has_more in (True, 1) and isinstance(next_cursor, int):
            cursor = next_cursor
            continue
        if new == 0:
            # No pagination hint and nothing new on this page: stop.
            break
        offset += count
    return all_links
# Serialises per-link progress printing across the search worker threads.
_print_lock = threading.Lock()
def save_links_multi(keywords, out_path, file_path, max_pages=50, timeout=30, count=None, workers=5):
    """Search several keywords concurrently and save one combined snapshot.

    Concurrency: one daemon thread per keyword, bounded by a semaphore of
    ``workers`` slots; links are de-duplicated globally across keywords.
    Output: ``save_links_snapshot`` receives the keywords, per-keyword items
    (``{'keyword', 'count', 'links'}``) and the global link list; returns
    ``out_path``.
    """
    all_links = []
    seen = set()
    items = []
    # One lock guards all shared collections (seen / all_links / items).
    state_lock = threading.Lock()
    sem = threading.Semaphore(max(1, int(workers)))
    def worker(kw):
        with sem:
            item_links = []
            def on_new(link):
                # Record only globally-unseen links; echo them as they arrive.
                with state_lock:
                    if link in seen:
                        return
                    seen.add(link)
                    all_links.append(link)
                    item_links.append(link)
                with _print_lock:
                    print(link, flush=True)
            search_video_links(kw, file_path=file_path, max_pages=max_pages, timeout=timeout, count=count, on_link=on_new)
            # Fix: this append runs in a worker thread, so take the lock
            # (previously `items` was mutated without synchronisation).
            with state_lock:
                items.append({'keyword': kw, 'count': len(item_links), 'links': item_links})
    threads = []
    for kw in keywords:
        t = threading.Thread(target=worker, args=(kw,))
        t.daemon = True
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    save_links_snapshot(out_path, keywords, items, all_links)
    return out_path
"""TikTok 视频链接搜索模块
核心能力:
- 构造查询 URL更新 keyword/offset/count 等参数)
- 发起请求并解析返回中的视频链接(结构化 + 正则兜底)
- 对多个关键词并发搜索、统一去重与快照保存
"""