init commit

This commit is contained in:
徐微
2025-12-08 15:20:22 +08:00
commit 1d0077510a
28 changed files with 9050234 additions and 0 deletions

Binary file not shown.

55
utils/filter_comments.py Normal file
View File

@@ -0,0 +1,55 @@
import argparse
import csv
import os
def filter_comments(csv_in, csv_out, keywords):
    """Copy the rows of ``csv_in`` whose second column mentions a keyword.

    Matching is a case-insensitive substring test of every non-empty keyword
    against the second column of each row. Blank rows are skipped, and a
    leading row whose first cell is ``username`` is treated as a header and
    not matched. The output always starts with a ``username,text`` header,
    followed by the matching rows copied unchanged (all of their columns).

    Args:
        csv_in: Path of the CSV file to read.
        csv_out: Path of the CSV file to write; a missing parent directory
            is created.
        keywords: Iterable of keyword strings; falsy entries are ignored.

    A one-line summary (input, keyword count, match count, output) is
    printed to stdout when done.
    """
    ks = {k.lower() for k in keywords if k}
    rows_out = []
    # utf-8-sig transparently drops a leading BOM (common in CSVs exported on
    # Windows) so the header check below still sees a clean 'username' cell;
    # files without a BOM decode exactly as with plain utf-8.
    with open(csv_in, 'r', encoding='utf-8-sig', newline='') as src:
        for index, row in enumerate(csv.reader(src)):
            if not row:
                continue
            # Only the very first row of the file can be the header.
            if index == 0 and row[0].lower() == 'username':
                continue
            text = row[1].lower() if len(row) > 1 else ''
            if any(k in text for k in ks):
                rows_out.append(row)
    # dirname() is '' for a bare file name, and os.makedirs('') raises,
    # so only create the directory when there actually is one.
    out_dir = os.path.dirname(csv_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(csv_out, 'w', encoding='utf-8', newline='') as dst:
        writer = csv.writer(dst)
        writer.writerow(['username', 'text'])
        writer.writerows(rows_out)
    print(f"input={csv_in} keywords={len(ks)} matched_rows={len(rows_out)} out={csv_out}")
def main():
    """Command-line entry point: gather keywords and filter the comments CSV.

    Keywords are read from two text files (one keyword per line, blank lines
    ignored), the keyword ``'pen'`` is always appended, and the combined list
    is handed to ``filter_comments`` together with the input/output CSV paths.

    Options (all optional):
        --extern-keywords  keyword file outside the project
        --local-keywords   keyword file inside the project
        --csv-in           CSV of comments to filter
        --csv-out          CSV that receives the matching rows
    """
    import sys  # local import: only needed for the warning below

    def _load(path):
        """Return the stripped, non-empty lines of ``path``, or [] if unreadable."""
        try:
            # utf-8-sig drops a leading BOM, which would otherwise be glued
            # to the first keyword and keep it from ever matching.
            with open(path, 'r', encoding='utf-8-sig') as f:
                return [s for s in (line.strip() for line in f) if s]
        except (OSError, ValueError) as exc:
            # Best effort: a missing or undecodable keyword file contributes
            # no keywords, but say so instead of failing silently.
            print(f"warning: could not read keywords from {path}: {exc}", file=sys.stderr)
            return []

    p = argparse.ArgumentParser()
    # NOTE(review): this default is an absolute, machine-specific Windows path;
    # it is kept as-is because there is no portable equivalent to infer.
    p.add_argument('--extern-keywords', default=r'd:\work\test\test\all_keywords.txt')
    # os.path.join yields the same value as before on Windows and a usable
    # path on other platforms.
    p.add_argument('--local-keywords', default=os.path.join('data', 'keyword.txt'))
    p.add_argument('--csv-in', default=os.path.join('data', 'comments.csv'))
    p.add_argument('--csv-out', default=os.path.join('data', 'key_comment.csv'))
    args = p.parse_args()

    kws = [
        *_load(args.extern_keywords),
        *_load(args.local_keywords),
        'pen',
    ]
    filter_comments(args.csv_in, args.csv_out, kws)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

54
utils/io.py Normal file
View File

@@ -0,0 +1,54 @@
import json
import os
import csv
def load_keywords_from_file(path):
    """Read a keyword file line by line, skipping blank lines, and return a list.

    Each line is stripped of surrounding whitespace. The call is best-effort:
    an empty path, a file that cannot be opened or read, or a file that is
    not valid UTF-8 all yield an empty list rather than an exception.

    Args:
        path: Path of the keyword file (one keyword per line).

    Returns:
        list[str]: The non-empty, stripped lines in file order.
    """
    if not path:
        return []
    try:
        # utf-8-sig drops a leading BOM; with plain utf-8 it would stay glued
        # to the first keyword because str.strip() does not remove U+FEFF.
        with open(path, 'r', encoding='utf-8-sig') as f:
            return [s for s in (line.strip() for line in f) if s]
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path such as an embedded NUL.
        return []
def write_json(path, obj):
    """Write ``obj`` to ``path`` as UTF-8 JSON, keeping non-ASCII text and indenting.

    Non-ASCII characters are written as-is (not ``\\uXXXX`` escaped) and the
    output is indented by two spaces.

    Args:
        path: Destination file path; an existing file is overwritten.
        obj: JSON-serializable object.

    Raises:
        TypeError, ValueError: If ``obj`` cannot be serialized. The
            destination file is left untouched in that case.
        OSError: If the file cannot be opened or written.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error after that point would destroy the previous content
    # and leave a half-written document behind.
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
def read_json(path):
    """Read a JSON file and return its content, or an empty dict on failure.

    The call is best-effort: an empty path, a missing or unreadable file,
    undecodable bytes, or malformed JSON all yield ``{}``.

    Args:
        path: Path of the JSON file.

    Returns:
        The decoded JSON value, or ``{}`` if it could not be read.
    """
    if not path:
        return {}
    try:
        # utf-8-sig drops a leading BOM; with plain utf-8 the json module
        # rejects BOM-prefixed text and the file would silently read as {}.
        with open(path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: malformed JSON
        # (JSONDecodeError) or undecodable bytes. RecursionError: nesting
        # deeper than the decoder can handle.
        return {}
def ensure_csv_header(path, headers):
    """Create the CSV at ``path`` with a header row unless it already exists.

    Nothing happens when ``path`` is empty or when something already exists
    at ``path``; an existing file is never modified.

    Args:
        path: Path of the CSV file.
        headers: Sequence of column names written as the first row.
    """
    if not path or os.path.exists(path):
        return
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(headers)
def append_csv_rows(path, rows):
    """Append ``rows`` to the CSV at ``path``; do nothing for an empty path.

    The file is opened in append mode, so it is created if it does not exist.

    Args:
        path: Path of the CSV file.
        rows: Iterable of rows, each given as a sequence of cell values.
    """
    if path:
        with open(path, 'a', newline='', encoding='utf-8') as handle:
            csv.writer(handle).writerows(rows)
"""通用 IO 工具
提供:
- 关键词文件加载
- JSON 读写
- CSV 文件写入(确保表头、追加行)
"""