// 2025-11-26 18:48:12 +08:00 (stray VCS timestamp artifact, kept as a comment)
|
|
|
|
package wechat
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"crypto/tls"
|
|
|
|
|
|
"encoding/json"
|
|
|
|
|
|
"fmt"
|
|
|
|
|
|
"net/url"
|
|
|
|
|
|
"os"
|
|
|
|
|
|
"regexp"
|
|
|
|
|
|
"strings"
|
|
|
|
|
|
"time"
|
|
|
|
|
|
"unicode"
|
|
|
|
|
|
|
|
|
|
|
|
"github.com/go-resty/resty/v2"
|
|
|
|
|
|
"github.com/wechat-crawler/configs"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// ArticleDetail aggregates everything scraped for a single article:
// timing metadata, identity, extracted body text, engagement counters
// and the comment thread. Counter fields are kept as strings, matching
// how they are scraped/serialized elsewhere in this package.
type ArticleDetail struct {
	// LocalTime — presumably the local crawl timestamp; TODO confirm against the writer.
	LocalTime string `json:"local_time"`

	// CreateTime is the article's publish time (rendered as a decimal string by GetNextList).
	CreateTime string `json:"create_time"`

	// Title is the article headline.
	Title string `json:"title"`

	// OfficialName is the publishing official account's display name.
	OfficialName string `json:"official_name"`

	// Link is the article's URL.
	Link string `json:"link"`

	// Content holds the extracted body text, one paragraph per element.
	Content []string `json:"content"`

	// ReadCount / LikeCount / ShareCount are engagement counters as strings.
	ReadCount string `json:"read_count"`

	LikeCount string `json:"like_count"`

	ShareCount string `json:"share_count"`

	// Comments holds comment texts; CommentLikes the corresponding like
	// counts — presumably index-aligned, confirm with the comment fetcher.
	Comments []string `json:"comments"`

	CommentLikes []string `json:"comment_likes"`

	// CommentID identifies the article's comment thread (see the
	// comment_id extraction in ExtractArticleInfo).
	CommentID string `json:"comment_id"`
}
|
|
|
|
|
|
|
|
|
|
|
|
// WechatCrawler scrapes WeChat official-account pages. It is constructed
// either with full session credentials (NewWechatCrawler) or anonymously
// (NewSimpleCrawler), in which case Config and the session fields are empty.
type WechatCrawler struct {
	// client is the shared HTTP client (TLS verification disabled, fixed timeout).
	client *resty.Client

	// Config carries application configuration; nil for simple crawlers.
	Config *configs.Config

	// uin, key and passTicket are WeChat web-session credentials captured
	// from a logged-in browser URL; they are embedded in cookies and URLs.
	uin string

	key string

	passTicket string

	// biz is the target account's __biz identifier used in every profile URL.
	biz string
}
|
|
|
|
|
|
|
|
|
|
|
|
// NewWechatCrawler 创建新的微信爬虫实例
|
|
|
|
|
|
func NewWechatCrawler(biz string, uin string, key string, passTicket string, cfg *configs.Config) (*WechatCrawler, error) {
|
|
|
|
|
|
client := resty.New()
|
|
|
|
|
|
client.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
|
|
|
|
|
|
client.SetTimeout(10 * time.Second)
|
|
|
|
|
|
|
|
|
|
|
|
// 设置默认headers
|
|
|
|
|
|
headers := map[string]string{
|
|
|
|
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
|
|
|
|
|
"Cookie": fmt.Sprintf("uin=%s; key=%s; pass_ticket=%s;", uin, key, passTicket),
|
|
|
|
|
|
}
|
|
|
|
|
|
client.SetHeaders(headers)
|
|
|
|
|
|
|
|
|
|
|
|
return &WechatCrawler{
|
|
|
|
|
|
client: client,
|
|
|
|
|
|
Config: cfg,
|
|
|
|
|
|
uin: uin,
|
|
|
|
|
|
key: key,
|
|
|
|
|
|
passTicket: passTicket,
|
|
|
|
|
|
biz: biz,
|
|
|
|
|
|
}, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// NewSimpleCrawler 创建一个简单的微信爬虫实例,不需要cookie信息,仅用于获取文章链接
|
|
|
|
|
|
func NewSimpleCrawler() *WechatCrawler {
|
|
|
|
|
|
// 初始化 HTTP 客户端
|
|
|
|
|
|
client := resty.New()
|
|
|
|
|
|
client.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
|
|
|
|
|
|
client.SetTimeout(15 * time.Second)
|
|
|
|
|
|
|
|
|
|
|
|
// 设置默认headers
|
|
|
|
|
|
headers := map[string]string{
|
|
|
|
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
|
|
|
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
|
|
|
|
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
|
|
|
|
|
|
"Connection": "keep-alive",
|
|
|
|
|
|
}
|
|
|
|
|
|
client.SetHeaders(headers)
|
|
|
|
|
|
|
|
|
|
|
|
return &WechatCrawler{
|
|
|
|
|
|
client: client,
|
|
|
|
|
|
Config: nil,
|
|
|
|
|
|
biz: "",
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetOfficialAccountName 获取公众号名称
|
|
|
|
|
|
func (w *WechatCrawler) GetOfficialAccountName() (string, error) {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 如果有登录凭证,使用带认证的请求(更可靠)
|
|
|
|
|
|
var url string
|
|
|
|
|
|
if w.uin != "" && w.key != "" && w.passTicket != "" {
|
|
|
|
|
|
// 带登录信息的请求,可以绕过验证页面
|
|
|
|
|
|
url = fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124&uin=%s&key=%s&pass_ticket=%s",
|
|
|
|
|
|
w.biz, w.uin, w.key, w.passTicket)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 不带登录信息的请求(可能会遇到验证页面)
|
|
|
|
|
|
url = fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 设置更完整的请求头,模拟真实浏览器
|
|
|
|
|
|
resp, err := w.client.R().
|
|
|
|
|
|
SetHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8").
|
|
|
|
|
|
SetHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8").
|
|
|
|
|
|
SetHeader("Cache-Control", "max-age=0").
|
|
|
|
|
|
SetHeader("Upgrade-Insecure-Requests", "1").
|
|
|
|
|
|
SetHeader("Referer", "https://mp.weixin.qq.com/").
|
|
|
|
|
|
Get(url)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", fmt.Errorf("获取公众号信息失败: %v", err)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 检查 HTTP 状态码
|
|
|
|
|
|
if resp.StatusCode() != 200 {
|
|
|
|
|
|
return "", fmt.Errorf("获取公众号信息失败: HTTP状态码 %d", resp.StatusCode())
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
content := resp.String()
|
|
|
|
|
|
|
|
|
|
|
|
// 调试:检查响应内容的前500字符
|
|
|
|
|
|
if len(content) < 100 {
|
|
|
|
|
|
return "", fmt.Errorf("响应内容过短,可能是请求失败: %s", content)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 尝试多种正则表达式模式来提取公众号名称
|
|
|
|
|
|
// 优先级顺序:var nickname > JSON格式 > HTML title
|
|
|
|
|
|
patterns := []struct {
|
|
|
|
|
|
pattern string
|
|
|
|
|
|
desc string
|
|
|
|
|
|
}{
|
|
|
|
|
|
{`var nickname\s*=\s*['"](. +?)['']`, "var nickname变量"},
|
|
|
|
|
|
{`var nickname = "([^"]+)"\.html\(false\)\s*\|\|\s*""`, "var nickname(带html方法)"},
|
|
|
|
|
|
{`var nickname = "(.*?)";`, "var nickname原始模式"},
|
|
|
|
|
|
{`nickname\s*:\s*"([^"]+)"`, "JSON格式nickname"},
|
|
|
|
|
|
{`"nickname":"([^"]+)"`, "字符串格式nickname"},
|
|
|
|
|
|
{`<title>([^<]+)<\/title>`, "HTML标题"},
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for _, p := range patterns {
|
|
|
|
|
|
re := regexp.MustCompile(p.pattern)
|
|
|
|
|
|
match := re.FindStringSubmatch(content)
|
|
|
|
|
|
if len(match) >= 2 {
|
|
|
|
|
|
nickname := match[1]
|
|
|
|
|
|
// 如果是从 HTML title 提取的,需要清理
|
|
|
|
|
|
if p.desc == "HTML标题" {
|
|
|
|
|
|
// 清理标题,移除"- 微信公众号"等后缀
|
|
|
|
|
|
if idx := strings.Index(nickname, "-"); idx > 0 {
|
|
|
|
|
|
nickname = strings.TrimSpace(nickname[:idx])
|
|
|
|
|
|
}
|
|
|
|
|
|
// 如果提取到的是"验证",说明遇到了验证页面
|
|
|
|
|
|
// 返回更详细的错误信息,包括可能的解决方案
|
|
|
|
|
|
if nickname == "验证" {
|
|
|
|
|
|
return "", fmt.Errorf("遇到验证页面,Cookie可能已过期\n" +
|
|
|
|
|
|
"解决方案:\n" +
|
|
|
|
|
|
"1. 在浏览器中重新登录微信公众号平台\n" +
|
|
|
|
|
|
"2. 访问目标公众号主页\n" +
|
|
|
|
|
|
"3. 向下滚动加载文章列表\n" +
|
|
|
|
|
|
"4. 在Fiddler中重新抓取包含所有参数的URL")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// 成功提取,返回结果
|
|
|
|
|
|
return nickname, nil
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 如果所有模式都失败,检查是否是验证页面
|
|
|
|
|
|
if strings.Contains(content, "当前环境异常") || strings.Contains(content, "完成验证后即可继续访问") {
|
|
|
|
|
|
return "", fmt.Errorf("遇到人机验证页面,请在浏览器中完成验证后重新获取Cookie")
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 如果所有模式都失败,尝试从biz生成一个有意义的名称
|
|
|
|
|
|
if w.biz != "" {
|
|
|
|
|
|
return "公众号_" + w.biz[:8], nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return "未知公众号", nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetNextList 获取下一页文章列表
|
|
|
|
|
|
func (w *WechatCrawler) GetNextList(offset int) (map[string]interface{}, error) {
|
|
|
|
|
|
// 检查是否有必要的登录参数
|
|
|
|
|
|
if w.uin == "" || w.key == "" || w.passTicket == "" {
|
|
|
|
|
|
return nil, fmt.Errorf("no session: 需要提供微信登录状态的cookies\n请在浏览器中登录微信公众号平台后,从URL中获取uin、key和pass_ticket参数")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
url := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=%s&f=json&offset=%d&count=10&is_ok=1&scene=124&uin=%s&key=%s&pass_ticket=%s&wxtoken=&appmsg_token=&x5=0&f=json",
|
2025-11-26 18:48:12 +08:00
|
|
|
|
w.biz, offset*10, w.uin, w.key, w.passTicket)
|
|
|
|
|
|
|
|
|
|
|
|
resp, err := w.client.R().SetHeader("Referer", fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz)).Get(url)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查响应状态
|
|
|
|
|
|
if resp.StatusCode() != 200 {
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: HTTP状态码 %d\n可能需要更新cookies或登录状态已过期", resp.StatusCode())
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 解析响应内容
|
|
|
|
|
|
content := resp.String()
|
|
|
|
|
|
// 清理JSON格式
|
|
|
|
|
|
content = strings.Replace(content, "\x22", "\"", -1)
|
|
|
|
|
|
content = strings.Replace(content, "\x5c", "\\", -1)
|
|
|
|
|
|
|
|
|
|
|
|
// 检查是否存在明显的错误信息
|
|
|
|
|
|
if strings.Contains(content, "请先登录") || strings.Contains(content, "登录超时") {
|
|
|
|
|
|
return nil, fmt.Errorf("no session: 登录状态已过期或无效\n请在浏览器中重新登录微信公众号平台并更新cookies")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
var result map[string]interface{}
|
|
|
|
|
|
err = json.Unmarshal([]byte(content), &result)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
// 尝试更宽松的错误处理
|
|
|
|
|
|
if strings.Contains(content, "no session") {
|
|
|
|
|
|
return nil, fmt.Errorf("no session: 需要有效的微信登录状态\n请在浏览器中登录微信公众号平台后,从URL中获取登录参数")
|
|
|
|
|
|
}
|
|
|
|
|
|
return nil, fmt.Errorf("解析文章列表失败: %v\n响应内容: %s", err, content[:100])
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查是否有错误
|
|
|
|
|
|
if ret, ok := result["ret"].(float64); ok {
|
|
|
|
|
|
switch ret {
|
|
|
|
|
|
case 4001:
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: 登录状态已过期\n请更新cookies")
|
|
|
|
|
|
case -200013:
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: 需要验证\n请在浏览器中先访问公众号页面进行验证")
|
|
|
|
|
|
case -200015:
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: 操作过于频繁\n请稍后再试")
|
|
|
|
|
|
default:
|
|
|
|
|
|
if ret != 0 {
|
|
|
|
|
|
errMsg, _ := result["errmsg"].(string)
|
|
|
|
|
|
return nil, fmt.Errorf("获取文章列表失败: 错误码 %v, 信息: %s", ret, errMsg)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 解析文章列表
|
|
|
|
|
|
generalMsgList, ok := result["general_msg_list"].(string)
|
|
|
|
|
|
if !ok {
|
|
|
|
|
|
return nil, fmt.Errorf("解析文章列表格式错误")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 调试:打印原始 JSON 的前 500 字符
|
|
|
|
|
|
if len(generalMsgList) > 0 {
|
|
|
|
|
|
preview := generalMsgList
|
|
|
|
|
|
if len(preview) > 500 {
|
|
|
|
|
|
preview = preview[:500]
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf("\n调试 - general_msg_list 前500字符:\n%s...\n\n", preview)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
var msgList struct {
|
|
|
|
|
|
List []struct {
|
|
|
|
|
|
CommMsgInfo struct {
|
|
|
|
|
|
ID int64 `json:"id"`
|
|
|
|
|
|
Type int `json:"type"`
|
2025-11-27 18:40:08 +08:00
|
|
|
|
DateTime int64 `json:"datetime"` // 微信使用datetime字段,不是create_time
|
|
|
|
|
|
CreateTime int64 `json:"create_time"` // 保留兼容性
|
2025-11-26 18:48:12 +08:00
|
|
|
|
SourceMsgID int64 `json:"source_msg_id"`
|
|
|
|
|
|
} `json:"comm_msg_info"`
|
|
|
|
|
|
AppMsgExtInfo struct {
|
|
|
|
|
|
Title string `json:"title"`
|
|
|
|
|
|
Digest string `json:"digest"`
|
|
|
|
|
|
ContentURL string `json:"content_url"`
|
|
|
|
|
|
Cover string `json:"cover"`
|
|
|
|
|
|
Author string `json:"author"`
|
|
|
|
|
|
FileID int64 `json:"fileid"`
|
|
|
|
|
|
Content string `json:"content"`
|
|
|
|
|
|
UrlList []string `json:"url_list"`
|
|
|
|
|
|
} `json:"app_msg_ext_info"`
|
|
|
|
|
|
MultiAppMsgItemList []struct {
|
|
|
|
|
|
Title string `json:"title"`
|
|
|
|
|
|
Digest string `json:"digest"`
|
|
|
|
|
|
ContentURL string `json:"content_url"`
|
|
|
|
|
|
Cover string `json:"cover"`
|
|
|
|
|
|
Author string `json:"author"`
|
|
|
|
|
|
} `json:"multi_app_msg_item_list"`
|
|
|
|
|
|
} `json:"list"`
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
err = json.Unmarshal([]byte(generalMsgList), &msgList)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, fmt.Errorf("解析文章列表内容失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 调试:打印第一篇文章的原始数据
|
|
|
|
|
|
if len(msgList.List) > 0 {
|
|
|
|
|
|
fmt.Printf("\n调试 - 第一篇文章的原始JSON数据:\n")
|
|
|
|
|
|
firstItem := msgList.List[0]
|
|
|
|
|
|
fmt.Printf(" Type: %d\n", firstItem.CommMsgInfo.Type)
|
|
|
|
|
|
fmt.Printf(" DateTime: %d\n", firstItem.CommMsgInfo.DateTime)
|
|
|
|
|
|
fmt.Printf(" CreateTime: %d\n", firstItem.CommMsgInfo.CreateTime)
|
|
|
|
|
|
fmt.Printf(" ID: %d\n", firstItem.CommMsgInfo.ID)
|
|
|
|
|
|
fmt.Printf(" Title: %s\n", firstItem.AppMsgExtInfo.Title)
|
|
|
|
|
|
fmt.Printf(" Author: %s\n", firstItem.AppMsgExtInfo.Author)
|
|
|
|
|
|
|
|
|
|
|
|
// 显示实际使用的时间戳
|
|
|
|
|
|
timestamp := firstItem.CommMsgInfo.DateTime
|
|
|
|
|
|
if timestamp == 0 {
|
|
|
|
|
|
timestamp = firstItem.CommMsgInfo.CreateTime
|
|
|
|
|
|
}
|
|
|
|
|
|
if timestamp > 0 {
|
|
|
|
|
|
fmt.Printf(" 实际使用的时间戳: %d (%s)\n", timestamp, time.Unix(timestamp, 0).Format("2006-01-02 15:04:05"))
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Println()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 构建返回数据
|
|
|
|
|
|
response := make(map[string]interface{})
|
|
|
|
|
|
response["m_flag"] = 1
|
|
|
|
|
|
|
|
|
|
|
|
var passageList [][]string
|
|
|
|
|
|
for _, item := range msgList.List {
|
|
|
|
|
|
if item.CommMsgInfo.Type == 49 {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 获取时间戳,优先使用DateTime,如果为0则使用CreateTime
|
|
|
|
|
|
timestamp := item.CommMsgInfo.DateTime
|
|
|
|
|
|
if timestamp == 0 {
|
|
|
|
|
|
timestamp = item.CommMsgInfo.CreateTime
|
|
|
|
|
|
}
|
|
|
|
|
|
createTime := fmt.Sprintf("%d", timestamp)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
title := item.AppMsgExtInfo.Title
|
|
|
|
|
|
link := item.AppMsgExtInfo.ContentURL
|
|
|
|
|
|
passageList = append(passageList, []string{"", createTime, title, link})
|
|
|
|
|
|
|
|
|
|
|
|
// 多图文消息
|
|
|
|
|
|
for _, multiItem := range item.MultiAppMsgItemList {
|
|
|
|
|
|
passageList = append(passageList, []string{"", createTime, multiItem.Title, multiItem.ContentURL})
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
response["passage_list"] = passageList
|
|
|
|
|
|
|
|
|
|
|
|
// 如果没有更多文章,设置m_flag为0
|
|
|
|
|
|
if len(passageList) == 0 {
|
|
|
|
|
|
response["m_flag"] = 0
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return response, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetOneArticle 获取单篇文章内容
|
|
|
|
|
|
func (w *WechatCrawler) GetOneArticle(link string) (string, error) {
|
|
|
|
|
|
resp, err := w.client.R().Get(link)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", fmt.Errorf("请求文章失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
return resp.String(), nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractOfficialAccountName 从文章内容中提取公众号名称
|
|
|
|
|
|
func (w *WechatCrawler) ExtractOfficialAccountName(content string) string {
|
|
|
|
|
|
accountName := ""
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 参考 Python 版本,优先从 var nickname 提取公众号名称
|
2025-11-26 18:48:12 +08:00
|
|
|
|
patterns := []string{
|
2025-11-27 18:40:08 +08:00
|
|
|
|
`var nickname\s*=\s*['"](.+?)['"]`, // nickname变量(Python版本的主要模式)
|
|
|
|
|
|
`var nickname.*"(.*?)"`, // nickname变量备用模式
|
|
|
|
|
|
`"nickname"\s*:\s*['"](.+?)['"]`, // JSON中的nickname字段
|
|
|
|
|
|
`window\.appmsg\s*=\s*\{[^}]*"author"\s*:\s*['"](.+?)['']`, // window.appmsg.author
|
|
|
|
|
|
`var ct\s*=\s*['"](.+?)['"]`, // ct变量(有时用于存储公众号名称)
|
|
|
|
|
|
`<meta[^>]*name=["']?author["']?[^>]*content=["'](.+?)["]`, // meta标签中的作者信息
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for _, pattern := range patterns {
|
|
|
|
|
|
regex := regexp.MustCompile(pattern)
|
|
|
|
|
|
if match := regex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
accountName = match[1]
|
|
|
|
|
|
// 清理和转义
|
|
|
|
|
|
accountName = strings.ReplaceAll(accountName, """, "\"")
|
|
|
|
|
|
accountName = strings.ReplaceAll(accountName, "&", "&")
|
|
|
|
|
|
accountName = strings.ReplaceAll(accountName, "<", "<")
|
|
|
|
|
|
accountName = strings.ReplaceAll(accountName, ">", ">")
|
|
|
|
|
|
// 多次URL解码
|
|
|
|
|
|
for i := 0; i < 3; i++ {
|
|
|
|
|
|
if decoded, err := url.QueryUnescape(accountName); err == nil && decoded != accountName {
|
|
|
|
|
|
accountName = decoded
|
|
|
|
|
|
} else {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 去除可能存在的空格和特殊字符
|
|
|
|
|
|
accountName = strings.TrimSpace(accountName)
|
|
|
|
|
|
if accountName != "" {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return accountName
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractArticleInfo 从文章内容中提取关键信息
|
|
|
|
|
|
func (w *WechatCrawler) ExtractArticleInfo(content string) (string, string, string, string, string, []string) {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 首先提取公众号名称,用于后续标题验证
|
|
|
|
|
|
accountName := w.ExtractOfficialAccountName(content)
|
|
|
|
|
|
|
|
|
|
|
|
// 提取创建时间 - 参考 Python 版本
|
2025-11-26 18:48:12 +08:00
|
|
|
|
createTime := ""
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 模式1: 标准createTime变量(Python版本的主要模式)
|
|
|
|
|
|
createTimeRegex := regexp.MustCompile(`var createTime = '(.+?)'`)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if match := createTimeRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
createTime = match[1]
|
|
|
|
|
|
} else {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 模式2: 双引号格式
|
|
|
|
|
|
createTimeRegex2 := regexp.MustCompile(`var createTime\s*=\s*['"](.+?)['"]`)
|
|
|
|
|
|
if match := createTimeRegex2.FindStringSubmatch(content); len(match) > 1 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
createTime = match[1]
|
2025-11-27 18:40:08 +08:00
|
|
|
|
} else {
|
|
|
|
|
|
// 模式3: ori_create_time变量(在之前的文件中发现)
|
|
|
|
|
|
oriCreateTimeRegex := regexp.MustCompile(`ori_create_time\s*:\s*['"](.+?)['"]`)
|
|
|
|
|
|
if match := oriCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
createTime = match[1]
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 模式4: JSON对象中的create_time字段
|
|
|
|
|
|
jsonCreateTimeRegex := regexp.MustCompile(`"create_time"\s*:\s*(.+?)(?:,|\})`)
|
|
|
|
|
|
if match := jsonCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
createTime = match[1]
|
|
|
|
|
|
// 去除引号
|
|
|
|
|
|
createTime = strings.Trim(createTime, `"'`)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 提取标题 - 参考 Python 版本,支持单引号和双引号
|
2025-11-26 18:48:12 +08:00
|
|
|
|
title := ""
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 优先级顺序:
|
|
|
|
|
|
// 1. var msg_title - 微信文章真正的标题字段(最高优先级)
|
|
|
|
|
|
// 2. meta 标签中的 og:title 或 twitter:title
|
|
|
|
|
|
// 3. var title - 可能是公众号名称
|
2025-11-26 18:48:12 +08:00
|
|
|
|
titlePatterns := []string{
|
2025-11-27 18:40:08 +08:00
|
|
|
|
`var msg_title\s*=\s*['"](.+?)['"]`, // msg_title是真正的文章标题!
|
|
|
|
|
|
`<meta\s+property=["']og:title["']\s+content=["'](.+?)["']`, // Open Graph标题
|
|
|
|
|
|
`<meta\s+property=["']twitter:title["']\s+content=["'](.+?)["']`, // Twitter标题
|
|
|
|
|
|
`var title\s*=\s*['"](.+?)['"]`, // 直接变量赋值(可能是公众号名称)
|
|
|
|
|
|
`window\.appmsg\s*=\s*\{[^}]*"title"\s*:\s*['"](.+?)['']`, // window.appmsg对象中的title
|
|
|
|
|
|
`"title"\s*:\s*['"](.+?)['"]`, // JSON对象中的title字段
|
|
|
|
|
|
`window\.title\s*=\s*['"](.+?)['"]`, // window.title赋值
|
|
|
|
|
|
`title\s*=\s*JsDecode\(['"](.+?)['"]\)`, // title变量的JsDecode赋值
|
|
|
|
|
|
`JsDecode\(['"]([^'"]*?title[^'"]*)['"]\)`, // 包含title的JsDecode调用
|
|
|
|
|
|
`<title[^>]*>(.+?)</title>`, // HTML title标签(最低优先级)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for _, pattern := range titlePatterns {
|
|
|
|
|
|
titleRegex := regexp.MustCompile(pattern)
|
|
|
|
|
|
if match := titleRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
title = match[1]
|
|
|
|
|
|
// 尝试解码HTML实体和URL编码
|
|
|
|
|
|
title = strings.ReplaceAll(title, """, "\"")
|
|
|
|
|
|
title = strings.ReplaceAll(title, "&", "&")
|
|
|
|
|
|
title = strings.ReplaceAll(title, "<", "<")
|
|
|
|
|
|
title = strings.ReplaceAll(title, ">", ">")
|
|
|
|
|
|
// 多次URL解码,处理嵌套编码
|
|
|
|
|
|
for i := 0; i < 3; i++ { // 最多解码3次
|
|
|
|
|
|
if decoded, err := url.QueryUnescape(title); err == nil && decoded != title {
|
|
|
|
|
|
title = decoded
|
|
|
|
|
|
} else {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
|
|
|
|
|
|
// 验证:如果提取的标题与公众号名称相同,继续尝试下一个模式
|
|
|
|
|
|
// 这是因为HTML title标签通常包含公众号名称
|
|
|
|
|
|
if title != accountName && title != "" {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取comment_id - 增强版,增加JsDecode支持
|
|
|
|
|
|
commentID := ""
|
|
|
|
|
|
// 模式1: 标准comment_id变量
|
|
|
|
|
|
commentIDRegex := regexp.MustCompile(`var comment_id\s*=\s*['"](\d+)['"]`)
|
|
|
|
|
|
if match := commentIDRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
commentID = match[1]
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 模式2: comment_id变量带JsDecode(在文件中发现)
|
|
|
|
|
|
commentIDJsDecodeRegex := regexp.MustCompile(`comment_id\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`)
|
|
|
|
|
|
if match := commentIDJsDecodeRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
commentID = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
// 模式3: JSON对象中的comment_id字段
|
|
|
|
|
|
jsonCommentIDRegex := regexp.MustCompile(`"comment_id"\s*:\s*['"]([^'"]+)['"]`)
|
|
|
|
|
|
if match := jsonCommentIDRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
commentID = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取req_id - 增强版
|
|
|
|
|
|
reqID := ""
|
|
|
|
|
|
// 模式1: 标准req_id变量
|
|
|
|
|
|
reqIDRegex := regexp.MustCompile(`var req_id\s*=\s*['"](\d+)['"]`)
|
|
|
|
|
|
if match := reqIDRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
reqID = match[1]
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 模式2: req_id变量的其他格式
|
|
|
|
|
|
reqIDAltRegex := regexp.MustCompile(`req_id\s*=\s*['"]([^'"]+)['"]`)
|
|
|
|
|
|
if match := reqIDAltRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
reqID = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
// 模式3: JSON对象中的req_id字段
|
|
|
|
|
|
jsonReqIDRegex := regexp.MustCompile(`"req_id"\s*:\s*['"]([^'"]+)['"]`)
|
|
|
|
|
|
if match := jsonReqIDRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
reqID = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取文章文本内容 - 全新策略,专注于微信文章核心内容结构
|
|
|
|
|
|
textContent := []string{}
|
|
|
|
|
|
|
|
|
|
|
|
// 改进内容提取策略 - 全新的优先级顺序,专注于微信文章特有的内容结构
|
|
|
|
|
|
var rawContent string
|
|
|
|
|
|
|
|
|
|
|
|
// 方法1: 从微信文章特定的数据结构提取(最高优先级)
|
|
|
|
|
|
// 1.1 尝试从var content变量直接提取(微信文章常用的内容存储方式)
|
|
|
|
|
|
varContentRegex := regexp.MustCompile(`var\s+content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
|
|
|
|
|
|
if match := varContentRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
rawContent = match[1]
|
|
|
|
|
|
// 移除引号
|
|
|
|
|
|
if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
|
|
|
|
|
|
rawContent = rawContent[1 : len(rawContent)-1]
|
|
|
|
|
|
}
|
|
|
|
|
|
} else if rawContent == "" {
|
|
|
|
|
|
// 1.2 尝试从rich_media_content变量提取
|
|
|
|
|
|
richMediaVarRegex := regexp.MustCompile(`var\s+rich_media_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
|
|
|
|
|
|
if match := richMediaVarRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
rawContent = match[1]
|
|
|
|
|
|
// 移除引号
|
|
|
|
|
|
if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
|
|
|
|
|
|
rawContent = rawContent[1 : len(rawContent)-1]
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
} else if rawContent == "" {
|
|
|
|
|
|
// 1.3 尝试从js_content变量提取
|
|
|
|
|
|
jsContentVarRegex := regexp.MustCompile(`var\s+js_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
|
|
|
|
|
|
if match := jsContentVarRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
rawContent = match[1]
|
|
|
|
|
|
// 移除引号
|
|
|
|
|
|
if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
|
|
|
|
|
|
rawContent = rawContent[1 : len(rawContent)-1]
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 方法2: 从 HTML DOM 结构中直接提取(次优先级)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if rawContent == "" {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 2.1 优先查找 id=img-content 的div(微信新版本文章容器)
|
|
|
|
|
|
imgContentIdRegex := regexp.MustCompile(`(?s)<div\s+id=["']img-content["'][^>]*>([\s\S]*?)</div>`)
|
|
|
|
|
|
if match := imgContentIdRegex.FindStringSubmatch(content); len(match) > 1 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
rawContent = match[1]
|
|
|
|
|
|
} else if rawContent == "" {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 2.2 查找rich_media_content类的div(微信文章核心内容容器)
|
|
|
|
|
|
richMediaClassRegex := regexp.MustCompile(`(?s)<div\s+class=["']rich_media_content["'][^>]*>([\s\S]*?)</div>`)
|
|
|
|
|
|
if match := richMediaClassRegex.FindStringSubmatch(content); len(match) > 1 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
rawContent = match[1]
|
2025-11-27 18:40:08 +08:00
|
|
|
|
} else if rawContent == "" {
|
|
|
|
|
|
// 2.3 尝试查找id为js_content的元素
|
|
|
|
|
|
jsContentIdRegex := regexp.MustCompile(`(?s)<div\s+id=["']js_content["'][^>]*>([\s\S]*?)</div>`)
|
|
|
|
|
|
if match := jsContentIdRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
rawContent = match[1]
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法3: 从window.appmsg对象中提取(微信文章标准数据结构)
|
|
|
|
|
|
if rawContent == "" {
|
|
|
|
|
|
appmsgRegex := regexp.MustCompile(`window\.appmsg\s*=\s*(\{[\s\S]+?\});`)
|
|
|
|
|
|
if match := appmsgRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
appmsgData := match[1]
|
|
|
|
|
|
// 尝试提取content字段(多种格式)
|
|
|
|
|
|
contentPatterns := []string{
|
|
|
|
|
|
`"content"\s*:\s*(['"](?:\\.|[^'"])*['"])`,
|
|
|
|
|
|
`content\s*=\s*(['"](?:\\.|[^'"])*['"])`,
|
|
|
|
|
|
`"content"\s*:\s*JsDecode\(['"]([^'"]+)['"]\)`,
|
|
|
|
|
|
`content\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`,
|
|
|
|
|
|
}
|
|
|
|
|
|
for _, pattern := range contentPatterns {
|
|
|
|
|
|
contentRegex := regexp.MustCompile(pattern)
|
|
|
|
|
|
if contentMatch := contentRegex.FindStringSubmatch(appmsgData); len(contentMatch) > 1 {
|
|
|
|
|
|
rawContent = contentMatch[1]
|
|
|
|
|
|
// 移除引号
|
|
|
|
|
|
if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
|
|
|
|
|
|
rawContent = rawContent[1 : len(rawContent)-1]
|
|
|
|
|
|
}
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法4: 从JSON格式的文章数据中提取
|
|
|
|
|
|
if rawContent == "" {
|
|
|
|
|
|
// 查找可能包含文章内容的JSON数据块
|
|
|
|
|
|
jsonDataRegex := regexp.MustCompile(`(?:\{"content"|\"content\")[^}]*\}`)
|
|
|
|
|
|
jsonMatches := jsonDataRegex.FindAllString(content, -1)
|
|
|
|
|
|
|
|
|
|
|
|
for _, jsonMatch := range jsonMatches {
|
|
|
|
|
|
// 尝试解析JSON
|
|
|
|
|
|
var jsonObj map[string]interface{}
|
|
|
|
|
|
if err := json.Unmarshal([]byte(jsonMatch), &jsonObj); err == nil {
|
|
|
|
|
|
if contentStr, ok := jsonObj["content"].(string); ok && contentStr != "" {
|
|
|
|
|
|
rawContent = contentStr
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法5: 尝试从微信文章特有的段落结构提取
|
|
|
|
|
|
if rawContent == "" {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// Python版本使用 BeautifulSoup 的 getText() 方法提取所有文本
|
|
|
|
|
|
// 这里我们直接提取所有段落,然后过滤JavaScript
|
|
|
|
|
|
// 查找带有data-pm-slice或js_darkmode类的p标签(微信文章特有样式)
|
|
|
|
|
|
specialPTagsRegex := regexp.MustCompile(`(?s)<p\s+[^>]*(?:data-pm-slice|js_darkmode)[^>]*>([\s\S]*?)</p>`)
|
|
|
|
|
|
if matches := specialPTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 0 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 如果找到多个p标签,合并它们的内容
|
|
|
|
|
|
var combinedContent strings.Builder
|
|
|
|
|
|
for _, match := range matches {
|
|
|
|
|
|
if len(match) > 1 {
|
|
|
|
|
|
combinedContent.WriteString(match[1])
|
|
|
|
|
|
combinedContent.WriteString("\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
rawContent = combinedContent.String()
|
2025-11-27 18:40:08 +08:00
|
|
|
|
} else if rawContent == "" {
|
|
|
|
|
|
// 查找带有rich_media_p类的p标签(微信文章特有的段落样式)
|
|
|
|
|
|
pTagsRegex := regexp.MustCompile(`(?s)<p\s+class=["']rich_media_p["'].*?>([\s\S]*?)</p>`)
|
|
|
|
|
|
if matches := pTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 0 {
|
|
|
|
|
|
// 如果找到多个p标签,合并它们的内容
|
2025-11-26 18:48:12 +08:00
|
|
|
|
var combinedContent strings.Builder
|
|
|
|
|
|
for _, match := range matches {
|
|
|
|
|
|
if len(match) > 1 {
|
|
|
|
|
|
combinedContent.WriteString(match[1])
|
|
|
|
|
|
combinedContent.WriteString("\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
rawContent = combinedContent.String()
|
2025-11-27 18:40:08 +08:00
|
|
|
|
} else {
|
|
|
|
|
|
// 尝试一般的p标签,这是微信文章的备用段落格式
|
|
|
|
|
|
generalPTagsRegex := regexp.MustCompile(`(?s)<p[^>]*>([\s\S]*?)</p>`)
|
|
|
|
|
|
if matches := generalPTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 10 { // 至少10个p标签才可能是文章内容
|
|
|
|
|
|
var combinedContent strings.Builder
|
|
|
|
|
|
for _, match := range matches {
|
|
|
|
|
|
if len(match) > 1 {
|
|
|
|
|
|
// 过滤JavaScript代码:如果段落包含function、var、window等关键词,跳过
|
|
|
|
|
|
paragraph := match[1]
|
|
|
|
|
|
// 简单过滤:如果段落中包含大量的JavaScript关键词,跳过
|
|
|
|
|
|
if !strings.Contains(paragraph, "function") &&
|
|
|
|
|
|
!strings.Contains(paragraph, "var ") &&
|
|
|
|
|
|
!strings.Contains(paragraph, "window.") &&
|
|
|
|
|
|
!strings.Contains(paragraph, ".length") {
|
|
|
|
|
|
combinedContent.WriteString(paragraph)
|
|
|
|
|
|
combinedContent.WriteString("\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if combinedContent.Len() > 100 { // 只有当合并后的内容超过100字符才认为有效
|
|
|
|
|
|
rawContent = combinedContent.String()
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法6: 从article或section标签提取(HTML5标准内容容器)
|
|
|
|
|
|
if rawContent == "" {
|
|
|
|
|
|
// 分别处理article和section标签
|
|
|
|
|
|
articleRegex := regexp.MustCompile(`(?s)<article[^>]*>([\s\S]*?)<\/article>`)
|
|
|
|
|
|
if match := articleRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
// 检查提取的内容是否真的包含文章正文(而不是JavaScript代码)
|
|
|
|
|
|
articleContent := match[1]
|
|
|
|
|
|
if w.calculateChineseDensity(articleContent) > 0.2 {
|
|
|
|
|
|
rawContent = articleContent
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
sectionRegex := regexp.MustCompile(`(?s)<section[^>]*>([\s\S]*?)<\/section>`)
|
|
|
|
|
|
if match := sectionRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
// 检查提取的内容是否真的包含文章正文
|
|
|
|
|
|
sectionContent := match[1]
|
|
|
|
|
|
if w.calculateChineseDensity(sectionContent) > 0.2 {
|
|
|
|
|
|
rawContent = sectionContent
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法7: 基于中文密度的段落提取(备用方法)
|
|
|
|
|
|
if rawContent == "" {
|
|
|
|
|
|
// 将内容分成较小的块进行检查,使用更简单的正则表达式
|
|
|
|
|
|
contentBlocks := regexp.MustCompile(`[\S\s]+?`).FindAllString(content, -1)
|
|
|
|
|
|
|
|
|
|
|
|
var bestContent string
|
|
|
|
|
|
var bestScore float64
|
|
|
|
|
|
|
|
|
|
|
|
for _, block := range contentBlocks {
|
|
|
|
|
|
// 计算中文密度(使用现有的calculateChineseDensity方法)
|
|
|
|
|
|
density := w.calculateChineseDensity(block)
|
|
|
|
|
|
// 计算JavaScript关键词数量
|
|
|
|
|
|
jsCount := w.jsKeywordCount(block)
|
|
|
|
|
|
// 计算中文字符总数
|
|
|
|
|
|
chineseCount := 0
|
|
|
|
|
|
for _, char := range block {
|
|
|
|
|
|
if char >= 0x4e00 && char <= 0x9fa5 {
|
|
|
|
|
|
chineseCount++
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 计算综合评分:中文密度高且JavaScript关键词少的内容得分更高
|
|
|
|
|
|
score := density * float64(chineseCount) / (float64(jsCount) + 1.0)
|
|
|
|
|
|
|
|
|
|
|
|
// 只有中文密度足够高且JavaScript关键词较少的内容才考虑
|
|
|
|
|
|
if density > 0.4 && jsCount < 10 && chineseCount > 100 && score > bestScore {
|
|
|
|
|
|
bestScore = score
|
|
|
|
|
|
bestContent = block
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if bestContent != "" {
|
|
|
|
|
|
rawContent = bestContent
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法8: 从JavaScript字符串中提取HTML内容(备用方法)
|
|
|
|
|
|
if rawContent == "" {
|
|
|
|
|
|
// 查找可能包含HTML内容的长字符串
|
|
|
|
|
|
longStringRegex := regexp.MustCompile(`['"]([^'"]{200,})['"]`)
|
|
|
|
|
|
matches := longStringRegex.FindAllStringSubmatch(content, -1)
|
|
|
|
|
|
|
|
|
|
|
|
for _, match := range matches {
|
|
|
|
|
|
if len(match) > 1 {
|
|
|
|
|
|
// 先进行预检查,排除明显的JavaScript代码
|
|
|
|
|
|
candidate := match[1]
|
|
|
|
|
|
if w.jsKeywordCount(candidate) > 20 {
|
|
|
|
|
|
continue // 跳过JavaScript代码过多的候选内容
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 尝试解码可能的URL编码内容
|
|
|
|
|
|
decoded := candidate
|
|
|
|
|
|
for i := 0; i < 3; i++ { // 最多解码3次
|
|
|
|
|
|
if d, err := url.QueryUnescape(decoded); err == nil && d != decoded {
|
|
|
|
|
|
decoded = d
|
|
|
|
|
|
} else {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查是否包含常见的HTML标签且中文密度足够高
|
|
|
|
|
|
hasHTMLTags := strings.Contains(decoded, "<p>") || strings.Contains(decoded, "<div") ||
|
|
|
|
|
|
strings.Contains(decoded, "<br>") || strings.Contains(decoded, "<p>") ||
|
|
|
|
|
|
strings.Contains(decoded, "<div") || strings.Contains(decoded, "<br>")
|
|
|
|
|
|
|
|
|
|
|
|
// 计算解码后的中文密度
|
|
|
|
|
|
density := w.calculateChineseDensity(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
// 同时满足有HTML标签和足够的中文密度
|
|
|
|
|
|
if hasHTMLTags && density > 0.3 {
|
|
|
|
|
|
rawContent = decoded
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 预处理rawContent(如果已找到)
|
|
|
|
|
|
if rawContent != "" {
|
|
|
|
|
|
// 首先进行多次URL解码,处理嵌套编码
|
|
|
|
|
|
for i := 0; i < 3; i++ { // 最多解码3次
|
|
|
|
|
|
if decoded, err := url.QueryUnescape(rawContent); err == nil && decoded != rawContent {
|
|
|
|
|
|
rawContent = decoded
|
|
|
|
|
|
} else {
|
|
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 替换HTML实体
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "<", "<")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, ">", ">")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, """, "\"")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "&", "&")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "\\n", "")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "\\r", "")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "\\t", "")
|
|
|
|
|
|
rawContent = strings.ReplaceAll(rawContent, "\\\"", "\"") // 处理转义的双引号
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 如果找到了内容,进行清理
|
|
|
|
|
|
if rawContent != "" {
|
|
|
|
|
|
// 移除HTML标签
|
|
|
|
|
|
tagRegex := regexp.MustCompile(`<[^>]*>`)
|
|
|
|
|
|
cleanText := tagRegex.ReplaceAllString(rawContent, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 应用JavaScript大段过滤
|
|
|
|
|
|
cleanText = w.filterJavaScriptBlocks(cleanText)
|
|
|
|
|
|
|
|
|
|
|
|
// 移除多余的空白字符
|
|
|
|
|
|
spaceRegex := regexp.MustCompile(`\s+`)
|
|
|
|
|
|
cleanText = spaceRegex.ReplaceAllString(cleanText, " ")
|
|
|
|
|
|
cleanText = strings.TrimSpace(cleanText)
|
|
|
|
|
|
|
|
|
|
|
|
// 检查是否包含过多的JavaScript代码特征
|
|
|
|
|
|
jsCount := w.jsKeywordCount(cleanText)
|
|
|
|
|
|
chineseDensity := w.calculateChineseDensity(cleanText)
|
|
|
|
|
|
|
|
|
|
|
|
// 移除明显的JavaScript代码块 - 增强版,特别针对微信平台代码
|
|
|
|
|
|
// 1. 移除WX_BJ_REPORT相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*if\s*\(WX_BJ_REPORT\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 2. 移除BadJs相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*if\s*\(BadJs\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 3. 移除window.logs相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*\..*?;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 4. 移除__moon_initcallback相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\.__moon_initcallback\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 5. 移除try-catch块
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 6. 移除函数定义
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 7. 移除IIFE函数
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 8. 移除变量声明
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 9. 移除控制流语句
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*for\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*while\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 10. 移除JSON和数组
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\{\s*"[^"]*"\s*:\s*[^}]*\}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\[\s*[^\]]*\]\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 11. 移除网络请求相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*new\s+XMLHttpRequest\(\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*xmlobj\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*fetch\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*axios\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 12. 移除正则表达式和调试代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*new\s+RegExp\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\/[^/]*\/[gimuy]*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*console\.[a-z]+\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 13. 移除事件处理相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*document\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*window\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*on\$1\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 14. 移除定时器相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*setTimeout\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*setInterval\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 15. 移除微信特有的API调用
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*WeixinJSBridge\s*\..*?;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*wx\.\w+\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 16. 移除logsPagetime相关代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*logsPagetime\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*var\s+logsPagetime\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 17. 移除特定的微信错误处理代码
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\.error\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\.warn\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*\.info\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 18. 移除微信平台特定的方法调用
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*document\.write\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
cleanText = regexp.MustCompile(`(?s)\s*document\.writeln\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 如果JavaScript关键词较少且中文密度较高,可能是有效的文章内容
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 降低要求:只要中文密度 > 5% 或 长度 > 100 就认为有效
|
|
|
|
|
|
if (jsCount < 10 || chineseDensity > 0.05) && len(cleanText) > 50 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 按句子或段落分割,避免一行过长
|
|
|
|
|
|
if len(cleanText) > 0 {
|
|
|
|
|
|
// 首先尝试按段落分割
|
|
|
|
|
|
paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(cleanText, -1)
|
|
|
|
|
|
|
|
|
|
|
|
// 重组段落,保留标点符号
|
|
|
|
|
|
punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(cleanText, -1)
|
|
|
|
|
|
|
|
|
|
|
|
for i := 0; i < len(paragraphs); i++ {
|
|
|
|
|
|
if paragraphs[i] != "" {
|
|
|
|
|
|
if i < len(punctuations) {
|
|
|
|
|
|
paragraphs[i] += punctuations[i]
|
|
|
|
|
|
}
|
|
|
|
|
|
// 只添加非空且长度合理的段落(避免添加JavaScript片段)
|
|
|
|
|
|
paragraph := strings.TrimSpace(paragraphs[i])
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 降低过滤条件,增强中文密度考虑
|
2025-11-26 18:48:12 +08:00
|
|
|
|
paraDensity := w.calculateChineseDensity(paragraph)
|
|
|
|
|
|
paraJsCount := w.jsKeywordCount(paragraph)
|
2025-11-27 18:40:08 +08:00
|
|
|
|
if len(paragraph) > 10 && (paraJsCount < 3 || paraDensity > 0.1) {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
textContent = append(textContent, paragraph)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 如果没有成功分割成段落,直接添加整个文本
|
2025-11-27 18:40:08 +08:00
|
|
|
|
if len(textContent) == 0 && len(cleanText) > 50 && (w.jsKeywordCount(cleanText) < 5 || chineseDensity > 0.1) {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
textContent = append(textContent, cleanText)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 最后的备选方案:尝试从整个页面中提取非JavaScript的文本内容
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【修改】参考Python版本,直接提取所有文本,然后过滤
|
|
|
|
|
|
if len(textContent) < 5 { // 如果提取的段落很少,说明前面的方法都失败了
|
|
|
|
|
|
fmt.Printf(" [调试] 前面提取方法只得到%d个段落,尝试简单提取方法\n", len(textContent))
|
|
|
|
|
|
|
|
|
|
|
|
// 方法1:优先尝试从 id="js_content" 容器中提取
|
|
|
|
|
|
contentRegex := regexp.MustCompile(`(?s)<div[^>]*id=["']js_content["'][^>]*>(.*?)</div>\s*<script`)
|
|
|
|
|
|
if match := contentRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
fmt.Printf(" [调试] 找到 js_content 容器\n")
|
|
|
|
|
|
contentHTML := match[1]
|
|
|
|
|
|
|
|
|
|
|
|
// 移除HTML标签,提取文本
|
|
|
|
|
|
tagRegex := regexp.MustCompile(`<[^>]*>`)
|
|
|
|
|
|
plainText := tagRegex.ReplaceAllString(contentHTML, "\n")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除HTML实体
|
|
|
|
|
|
plainText = strings.ReplaceAll(plainText, "<", "<")
|
|
|
|
|
|
plainText = strings.ReplaceAll(plainText, ">", ">")
|
|
|
|
|
|
plainText = strings.ReplaceAll(plainText, """, "\"")
|
|
|
|
|
|
plainText = strings.ReplaceAll(plainText, "&", "&")
|
|
|
|
|
|
plainText = strings.ReplaceAll(plainText, " ", " ")
|
|
|
|
|
|
|
|
|
|
|
|
// 按行分割,过滤空行
|
|
|
|
|
|
lines := strings.Split(plainText, "\n")
|
|
|
|
|
|
for _, line := range lines {
|
|
|
|
|
|
line = strings.TrimSpace(line)
|
|
|
|
|
|
if len(line) > 0 {
|
|
|
|
|
|
textContent = append(textContent, line)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf(" [调试] 从 js_content 提取到 %d 个段落\n", len(textContent))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 方法2:如果仍然很少,尝试提取所有可见文本
|
|
|
|
|
|
if len(textContent) < 10 {
|
|
|
|
|
|
fmt.Printf(" [调试] js_content 提取不足,尝试全局提取\n")
|
|
|
|
|
|
// 移除script和style标签
|
|
|
|
|
|
scriptRegex := regexp.MustCompile(`(?s)<script[^>]*>.*?</script>`)
|
|
|
|
|
|
styleRegex := regexp.MustCompile(`(?s)<style[^>]*>.*?</style>`)
|
|
|
|
|
|
allText := scriptRegex.ReplaceAllString(content, "")
|
|
|
|
|
|
allText = styleRegex.ReplaceAllString(allText, "")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除所有HTML标签
|
|
|
|
|
|
tagRegex := regexp.MustCompile(`<[^>]*>`)
|
|
|
|
|
|
allText = tagRegex.ReplaceAllString(allText, "\n")
|
|
|
|
|
|
|
|
|
|
|
|
// 移除HTML实体
|
|
|
|
|
|
allText = strings.ReplaceAll(allText, "<", "<")
|
|
|
|
|
|
allText = strings.ReplaceAll(allText, ">", ">")
|
|
|
|
|
|
allText = strings.ReplaceAll(allText, """, "\"")
|
|
|
|
|
|
allText = strings.ReplaceAll(allText, "&", "&")
|
|
|
|
|
|
allText = strings.ReplaceAll(allText, " ", " ")
|
|
|
|
|
|
|
|
|
|
|
|
// 按行分割,过滤空行和JS代码
|
|
|
|
|
|
textContent = []string{} // 重置
|
|
|
|
|
|
lines := strings.Split(allText, "\n")
|
|
|
|
|
|
for _, line := range lines {
|
|
|
|
|
|
line = strings.TrimSpace(line)
|
|
|
|
|
|
// 基础过滤:只保留有中文的行,且不是明显JS代码
|
|
|
|
|
|
if len(line) > 0 &&
|
|
|
|
|
|
!strings.HasPrefix(line, "var ") &&
|
|
|
|
|
|
!strings.HasPrefix(line, "function") &&
|
|
|
|
|
|
!strings.Contains(line, "window.") &&
|
|
|
|
|
|
w.calculateChineseDensity(line) > 0.1 {
|
|
|
|
|
|
textContent = append(textContent, line)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf(" [调试] 全局提取到 %d 个段落\n", len(textContent))
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 对提取的内容应用最终过滤,确保只保留真正的文章正文
|
|
|
|
|
|
filteredContent := w.finalContentFilter(textContent)
|
2025-11-27 18:40:08 +08:00
|
|
|
|
|
|
|
|
|
|
// 【调试】输出过滤前后的对比
|
|
|
|
|
|
fmt.Printf(" [调试] 过滤前段落数: %d, 过滤后段落数: %d\n", len(textContent), len(filteredContent))
|
|
|
|
|
|
if len(filteredContent) == 0 && len(textContent) > 0 {
|
|
|
|
|
|
fmt.Printf(" [调试] ⚠️ finalContentFilter 过滤掉了所有内容!\n")
|
|
|
|
|
|
fmt.Printf(" [调试] 过滤前第一段示例: %s\n", textContent[0][:min(len(textContent[0]), 200)])
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
return createTime, title, commentID, reqID, w.extractAuthor(content), filteredContent
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// min reports the smaller of the two integers a and b.
// Kept as a local helper for compatibility with Go versions before 1.21,
// which do not provide the built-in min.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// calculateChineseDensity 计算文本中中文字符的密度
|
|
|
|
|
|
func (w *WechatCrawler) calculateChineseDensity(text string) float64 {
|
|
|
|
|
|
if len(text) == 0 {
|
|
|
|
|
|
return 0
|
|
|
|
|
|
}
|
|
|
|
|
|
// 使用正确的Go语言Unicode范围表示法
|
|
|
|
|
|
chineseCount := 0
|
|
|
|
|
|
for _, char := range text {
|
|
|
|
|
|
if char >= 0x4e00 && char <= 0x9fa5 {
|
|
|
|
|
|
chineseCount++
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return float64(chineseCount) / float64(len(text))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 过滤大段JavaScript代码
|
|
|
|
|
|
func (w *WechatCrawler) filterJavaScriptBlocks(text string) string {
|
|
|
|
|
|
// 移除常见的JavaScript代码块模式
|
|
|
|
|
|
patterns := []string{
|
|
|
|
|
|
// 移除JavaScript函数声明
|
|
|
|
|
|
`(?s)function\s+[a-zA-Z_$][\w$]*\s*\([^)]*\)\s*{[^}]*}`,
|
|
|
|
|
|
// 移除匿名函数
|
|
|
|
|
|
`(?s)\(\s*function\s*\([^)]*\)\s*{[^}]*}\s*\)\s*\(\s*\)`,
|
|
|
|
|
|
// 移除对象字面量
|
|
|
|
|
|
`(?s)\{[^}]*\}`,
|
|
|
|
|
|
// 移除数组字面量
|
|
|
|
|
|
`(?s)\[[^\]]*\]`,
|
|
|
|
|
|
// 移除注释
|
|
|
|
|
|
`//[^\n]*`,
|
|
|
|
|
|
`/\*[^*]*\*/`,
|
|
|
|
|
|
// 移除微信特定错误报告代码
|
|
|
|
|
|
`(?s)WX_BJ_REPORT[^;]*;`,
|
|
|
|
|
|
`(?s)BadJs[^;]*;`,
|
|
|
|
|
|
`(?s)window\.[a-zA-Z_$][\w$]*[^;]*;`,
|
|
|
|
|
|
// 移除XMLHttpRequest相关代码
|
|
|
|
|
|
`(?s)xmlobj[^;]*;`,
|
|
|
|
|
|
`(?s)new\s+Image\([^)]*\)`,
|
|
|
|
|
|
`(?s)setRequestHeader[^;]*;`,
|
|
|
|
|
|
// 移除正则表达式
|
|
|
|
|
|
`/[^/]*\/[gimuy]*`,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
result := text
|
|
|
|
|
|
for _, pattern := range patterns {
|
|
|
|
|
|
regex, err := regexp.Compile(pattern)
|
|
|
|
|
|
if err == nil {
|
|
|
|
|
|
result = regex.ReplaceAllString(result, "")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取纯中文文本
|
|
|
|
|
|
func (w *WechatCrawler) extractChineseText(text string) string {
|
|
|
|
|
|
var result []rune
|
|
|
|
|
|
for _, char := range text {
|
|
|
|
|
|
// 保留中文、标点符号、数字和英文字母,去除特殊字符
|
|
|
|
|
|
if (char >= 0x4e00 && char <= 0x9fa5) ||
|
|
|
|
|
|
unicode.IsPunct(char) ||
|
|
|
|
|
|
unicode.IsDigit(char) ||
|
|
|
|
|
|
unicode.IsLetter(char) ||
|
|
|
|
|
|
char == '\n' || char == ' ' {
|
|
|
|
|
|
result = append(result, char)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return string(result)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// finalContentFilter 最终内容过滤,确保只保留真正的文章正文
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 修改:大幅降低过滤门槛,参考Python版本的简单逻辑
|
|
|
|
|
|
func (w *WechatCrawler) finalContentFilter(textContent []string) []string {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
var validParagraphs []string
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【修改】如果提取的段落很少,说明可能是提取阶段的问题,直接返回
|
|
|
|
|
|
if len(textContent) <= 3 {
|
|
|
|
|
|
fmt.Printf(" [调试] 提取的段落太少(%d个),可能提取逻辑有问题,跳过过滤\n", len(textContent))
|
|
|
|
|
|
// 简单过滤:只去掉纯标题行和过短的内容
|
|
|
|
|
|
for _, text := range textContent {
|
|
|
|
|
|
text = strings.TrimSpace(text)
|
|
|
|
|
|
// 去掉明显的JavaScript关键词行
|
|
|
|
|
|
if len(text) > 5 &&
|
|
|
|
|
|
!strings.Contains(text, "function(") &&
|
|
|
|
|
|
!strings.Contains(text, "window.") &&
|
|
|
|
|
|
!strings.Contains(text, "var ") {
|
|
|
|
|
|
validParagraphs = append(validParagraphs, text)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
return validParagraphs
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【修改】降低过滤标准,参考Python版本
|
|
|
|
|
|
for _, text := range textContent {
|
|
|
|
|
|
// 基础清理
|
|
|
|
|
|
text = strings.TrimSpace(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 计算中文字符数
|
|
|
|
|
|
chineseCount := 0
|
2025-11-26 18:48:12 +08:00
|
|
|
|
for _, char := range text {
|
|
|
|
|
|
if char >= 0x4e00 && char <= 0x9fa5 {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
chineseCount++
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 计算中文密度
|
|
|
|
|
|
paraDensity := w.calculateChineseDensity(text)
|
|
|
|
|
|
paraJsCount := w.jsKeywordCount(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 【大幅降低门槛】:
|
|
|
|
|
|
// - 长度 > 10(原来25)
|
|
|
|
|
|
// - 中文字符 > 3(原来15)
|
|
|
|
|
|
// - 中文密度 > 0.15(原来0.4)
|
|
|
|
|
|
// - JavaScript关键词 < 5(原来3)
|
|
|
|
|
|
if len(text) > 10 &&
|
|
|
|
|
|
!strings.Contains(text, "function(") &&
|
|
|
|
|
|
!strings.Contains(text, "window.") &&
|
|
|
|
|
|
!strings.Contains(text, "WX_BJ_REPORT") &&
|
|
|
|
|
|
!strings.Contains(text, "BadJs") &&
|
|
|
|
|
|
chineseCount > 3 &&
|
|
|
|
|
|
paraDensity > 0.15 &&
|
|
|
|
|
|
paraJsCount < 5 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
validParagraphs = append(validParagraphs, text)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【新增】如果过滤后还是空的,使用最宽松的规则
|
|
|
|
|
|
if len(validParagraphs) == 0 && len(textContent) > 0 {
|
|
|
|
|
|
fmt.Printf(" [调试] 标准过滤后仍为空,使用最宽松规则\n")
|
|
|
|
|
|
for _, text := range textContent {
|
|
|
|
|
|
text = strings.TrimSpace(text)
|
|
|
|
|
|
// 只要有中文字符且不是明显的JS代码就保留
|
|
|
|
|
|
overallDensity := w.calculateChineseDensity(text)
|
|
|
|
|
|
overallJsCount := w.jsKeywordCount(text)
|
|
|
|
|
|
|
|
|
|
|
|
if len(text) > 5 && overallDensity > 0.1 && overallJsCount < 10 {
|
|
|
|
|
|
validParagraphs = append(validParagraphs, text)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return validParagraphs
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// jsKeywordCount 计算文本中JavaScript关键词的数量 - 增强版
|
|
|
|
|
|
func (w *WechatCrawler) jsKeywordCount(text string) int {
|
|
|
|
|
|
count := 0
|
|
|
|
|
|
// 新增加的高优先级过滤关键词
|
|
|
|
|
|
highPriorityKeywords := []string{
|
|
|
|
|
|
"logs = ", "window.", "LANG = ", "extInfo:", "pagetime[",
|
|
|
|
|
|
"BadJs;", "sec_open=", "xmlobj = ", "addEventListener",
|
|
|
|
|
|
"new Image()", "setRequestHeader", "onreadystatechange",
|
|
|
|
|
|
"var ", "let ", "const ", "function ", "return ",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 基础JavaScript关键词
|
|
|
|
|
|
basicKeywords := []string{
|
|
|
|
|
|
"function", "var", "let", "const", "if(", "else", "for(", "while(",
|
|
|
|
|
|
"return", "setTimeout", "setInterval", "WeixinJSBridge", "JSON",
|
|
|
|
|
|
"console", "document", "window", "try{", "catch(", "throw",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 微信平台特定关键词
|
|
|
|
|
|
wechatKeywords := []string{
|
|
|
|
|
|
"WX_BJ_REPORT", "BadJs", "__moon_initcallback", "logsPagetime",
|
|
|
|
|
|
"WeixinJSBridge", "wx.", "document.write", "document.writeln",
|
|
|
|
|
|
// 错误处理关键词
|
|
|
|
|
|
".error(", ".warn(", ".info(", ".debug(",
|
|
|
|
|
|
// 网络请求关键词
|
|
|
|
|
|
"XMLHttpRequest", "fetch(", "axios.", "xmlobj.",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
lowerText := strings.ToLower(text)
|
|
|
|
|
|
// 计算高优先级关键词数量(权重更高)
|
|
|
|
|
|
for _, keyword := range highPriorityKeywords {
|
|
|
|
|
|
count += strings.Count(lowerText, strings.ToLower(keyword)) * 3
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 计算微信平台特定关键词数量
|
|
|
|
|
|
for _, keyword := range wechatKeywords {
|
|
|
|
|
|
count += strings.Count(lowerText, strings.ToLower(keyword)) * 2
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 计算基础JavaScript关键词数量
|
|
|
|
|
|
for _, keyword := range basicKeywords {
|
|
|
|
|
|
count += strings.Count(lowerText, strings.ToLower(keyword))
|
|
|
|
|
|
}
|
|
|
|
|
|
return count
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// extractAuthor 提取文章作者信息
|
|
|
|
|
|
func (w *WechatCrawler) extractAuthor(content string) string {
|
|
|
|
|
|
authorPatterns := []string{
|
|
|
|
|
|
`var author\s*=\s*['"](.*?)['"]`,
|
|
|
|
|
|
`"author"\s*:\s*['"](.*?)['"]`,
|
|
|
|
|
|
`window\.author\s*=\s*['"](.*?)['"]`,
|
|
|
|
|
|
`<meta name=["']author["'] content=["'](.*?)['"]`,
|
|
|
|
|
|
`window\.appmsg\s*=\s*\{[^}]*"author"\s*:\s*['"](.*?)['"]`,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for _, pattern := range authorPatterns {
|
|
|
|
|
|
authorRegex := regexp.MustCompile(pattern)
|
|
|
|
|
|
if match := authorRegex.FindStringSubmatch(content); len(match) > 1 {
|
|
|
|
|
|
author := match[1]
|
|
|
|
|
|
// 尝试解码HTML实体和URL编码
|
|
|
|
|
|
author = strings.ReplaceAll(author, """, "\"")
|
|
|
|
|
|
author = strings.ReplaceAll(author, "&", "&")
|
|
|
|
|
|
author = strings.ReplaceAll(author, "<", "<")
|
|
|
|
|
|
author = strings.ReplaceAll(author, ">", ">")
|
|
|
|
|
|
if decoded, err := url.QueryUnescape(author); err == nil {
|
|
|
|
|
|
author = decoded
|
|
|
|
|
|
}
|
|
|
|
|
|
return author
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return ""
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetArticleStats 获取文章统计信息
|
|
|
|
|
|
func (w *WechatCrawler) GetArticleStats(link string, title string, commentID string, reqID string, createTime string) (map[string]string, error) {
|
|
|
|
|
|
// 解析链接参数
|
|
|
|
|
|
mid := ""
|
|
|
|
|
|
sn := ""
|
|
|
|
|
|
idx := ""
|
|
|
|
|
|
|
|
|
|
|
|
// 尝试从链接中提取参数
|
|
|
|
|
|
midRegex := regexp.MustCompile(`mid=(.*?)&`)
|
|
|
|
|
|
if match := midRegex.FindStringSubmatch(link); len(match) > 1 {
|
|
|
|
|
|
mid = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
snRegex := regexp.MustCompile(`sn=(.*?)&`)
|
|
|
|
|
|
if match := snRegex.FindStringSubmatch(link); len(match) > 1 {
|
|
|
|
|
|
sn = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
idxRegex := regexp.MustCompile(`idx=(.*?)&`)
|
|
|
|
|
|
if match := idxRegex.FindStringSubmatch(link); len(match) > 1 {
|
|
|
|
|
|
idx = match[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 生成随机r值
|
|
|
|
|
|
r := fmt.Sprintf("0.%d", time.Now().UnixNano()%10000000000000000)
|
|
|
|
|
|
|
|
|
|
|
|
// 构建请求URL
|
|
|
|
|
|
detailURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&fasttmplajax=1&f=json&uin=%s&key=%s&pass_ticket=%s&__biz=%s",
|
|
|
|
|
|
w.uin, w.key, w.passTicket, w.biz)
|
|
|
|
|
|
|
|
|
|
|
|
// 构建请求数据
|
|
|
|
|
|
data := map[string]string{
|
|
|
|
|
|
"r": r,
|
|
|
|
|
|
"sn": sn,
|
|
|
|
|
|
"mid": mid,
|
|
|
|
|
|
"idx": idx,
|
|
|
|
|
|
"req_id": reqID,
|
|
|
|
|
|
"title": title,
|
|
|
|
|
|
"comment_id": commentID,
|
|
|
|
|
|
"appmsg_type": "9",
|
|
|
|
|
|
"__biz": w.biz,
|
|
|
|
|
|
"pass_ticket": w.passTicket,
|
|
|
|
|
|
"abtest_cookie": "",
|
|
|
|
|
|
"devicetype": "Windows 7 x64",
|
|
|
|
|
|
"version": "63090b13",
|
|
|
|
|
|
"is_need_ticket": "0",
|
|
|
|
|
|
"is_need_ad": "0",
|
|
|
|
|
|
"is_need_reward": "0",
|
|
|
|
|
|
"both_ad": "0",
|
|
|
|
|
|
"reward_uin_count": "0",
|
|
|
|
|
|
"send_time": "",
|
|
|
|
|
|
"msg_daily_idx": "1",
|
|
|
|
|
|
"is_original": "0",
|
|
|
|
|
|
"is_only_read": "1",
|
|
|
|
|
|
"scene": "38",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 发送POST请求
|
|
|
|
|
|
resp, err := w.client.R().SetFormData(data).Post(detailURL)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, fmt.Errorf("请求统计信息失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 解析响应
|
|
|
|
|
|
var result map[string]interface{}
|
|
|
|
|
|
err = json.Unmarshal([]byte(resp.String()), &result)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, fmt.Errorf("解析统计信息失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取统计数据
|
|
|
|
|
|
stats := map[string]string{
|
|
|
|
|
|
"read_num": "0",
|
|
|
|
|
|
"old_like_num": "0",
|
|
|
|
|
|
"share_num": "0",
|
|
|
|
|
|
"show_read": "0",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 从返回的JSON中提取所需数据
|
|
|
|
|
|
if appMsgExtInfo, ok := result["appmsgstat"].(map[string]interface{}); ok {
|
|
|
|
|
|
if readNum, ok := appMsgExtInfo["read_num"].(float64); ok {
|
|
|
|
|
|
stats["read_num"] = fmt.Sprintf("%.0f", readNum)
|
|
|
|
|
|
}
|
|
|
|
|
|
if likeNum, ok := appMsgExtInfo["old_like_num"].(float64); ok {
|
|
|
|
|
|
stats["old_like_num"] = fmt.Sprintf("%.0f", likeNum)
|
|
|
|
|
|
}
|
|
|
|
|
|
if shareNum, ok := appMsgExtInfo["share_num"].(float64); ok {
|
|
|
|
|
|
stats["share_num"] = fmt.Sprintf("%.0f", shareNum)
|
|
|
|
|
|
}
|
|
|
|
|
|
if showRead, ok := appMsgExtInfo["show_read"].(float64); ok {
|
|
|
|
|
|
stats["show_read"] = fmt.Sprintf("%.0f", showRead)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return stats, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetArticleComments 获取文章评论
|
|
|
|
|
|
func (w *WechatCrawler) GetArticleComments(commentID string) ([]string, []string, error) {
|
|
|
|
|
|
if commentID == "" {
|
|
|
|
|
|
return []string{}, []string{}, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 构建评论请求URL
|
|
|
|
|
|
commentURL := fmt.Sprintf(
|
|
|
|
|
|
"https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=%s&appmsgid=2247491372&idx=1&comment_id=%s&offset=0&limit=100&uin=%s&key=%s&pass_ticket=%s&wxtoken=&devicetype=Windows+10&clientversion=62060833&appmsg_token=",
|
|
|
|
|
|
w.biz, commentID, w.uin, w.key, w.passTicket)
|
|
|
|
|
|
|
|
|
|
|
|
// 发送请求
|
|
|
|
|
|
resp, err := w.client.R().Get(commentURL)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return []string{}, []string{}, fmt.Errorf("获取评论失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 解析响应
|
|
|
|
|
|
var result map[string]interface{}
|
|
|
|
|
|
err = json.Unmarshal([]byte(resp.String()), &result)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return []string{}, []string{}, fmt.Errorf("解析评论失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取评论和点赞数
|
|
|
|
|
|
var comments []string
|
|
|
|
|
|
var commentLikes []string
|
|
|
|
|
|
|
|
|
|
|
|
// 简化实现,在实际项目中需要根据返回的JSON结构正确提取数据
|
|
|
|
|
|
return comments, commentLikes, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetOfficialAccountLinkFromArticle 通过文章链接获取公众号主页链接
|
|
|
|
|
|
func (w *WechatCrawler) GetOfficialAccountLinkFromArticle(articleURL string) (string, error) {
|
|
|
|
|
|
// 首先尝试从URL中提取__biz参数(兼容旧格式)
|
|
|
|
|
|
bizRegex := regexp.MustCompile(`__biz=([^&]+)`)
|
|
|
|
|
|
match := bizRegex.FindStringSubmatch(articleURL)
|
|
|
|
|
|
if len(match) >= 2 {
|
|
|
|
|
|
biz := match[1]
|
|
|
|
|
|
// 更新当前实例的biz值
|
|
|
|
|
|
w.biz = biz
|
|
|
|
|
|
|
|
|
|
|
|
// 构建公众号主页链接
|
|
|
|
|
|
homePageURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", biz)
|
|
|
|
|
|
return homePageURL, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 如果从URL中提取失败,尝试从文章内容中提取
|
|
|
|
|
|
content, err := w.GetOneArticle(articleURL)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", fmt.Errorf("获取文章内容失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 从文章内容中提取biz
|
|
|
|
|
|
contentBizRegex := regexp.MustCompile(`var biz = "(.*?);`)
|
|
|
|
|
|
contentMatch := contentBizRegex.FindStringSubmatch(content)
|
|
|
|
|
|
if len(contentMatch) < 2 {
|
|
|
|
|
|
// 尝试其他可能的biz格式
|
|
|
|
|
|
contentBizRegex2 := regexp.MustCompile(`__biz=(.*?)&`)
|
|
|
|
|
|
contentMatch = contentBizRegex2.FindStringSubmatch(content)
|
|
|
|
|
|
if len(contentMatch) < 2 {
|
|
|
|
|
|
return "", fmt.Errorf("无法从文章链接和内容中提取公众号信息")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 清理biz值,移除可能的额外引号
|
|
|
|
|
|
biz := contentMatch[1]
|
|
|
|
|
|
biz = strings.ReplaceAll(biz, " || ", "")
|
|
|
|
|
|
biz = strings.ReplaceAll(biz, "\"", "")
|
|
|
|
|
|
|
|
|
|
|
|
// 更新当前实例的biz值
|
|
|
|
|
|
w.biz = biz
|
|
|
|
|
|
|
|
|
|
|
|
// 构建公众号主页链接
|
|
|
|
|
|
homePageURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", biz)
|
|
|
|
|
|
return homePageURL, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetArticleList 获取公众号所有文章列表
|
|
|
|
|
|
func (w *WechatCrawler) GetArticleList() ([][]string, error) {
|
|
|
|
|
|
var allArticles [][]string
|
|
|
|
|
|
offset := 0
|
|
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
|
fmt.Printf("正在获取第%d页文章...\n", offset/10+1)
|
|
|
|
|
|
result, err := w.GetNextList(offset)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return allArticles, fmt.Errorf("获取文章列表失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查是否还有更多文章
|
2025-11-27 18:40:08 +08:00
|
|
|
|
mFlag, ok := result["m_flag"].(int)
|
|
|
|
|
|
if !ok {
|
|
|
|
|
|
// 尝试转换为float64(JSON反序列化可能将数字解析为float64)
|
|
|
|
|
|
if mFlagFloat, ok := result["m_flag"].(float64); ok {
|
|
|
|
|
|
mFlag = int(mFlagFloat)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
mFlag = 0
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if mFlag == 0 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
break
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取当前页的文章列表
|
|
|
|
|
|
passageList, ok := result["passage_list"].([][]string)
|
|
|
|
|
|
if !ok {
|
|
|
|
|
|
return allArticles, fmt.Errorf("文章列表格式错误")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 添加到总列表
|
|
|
|
|
|
allArticles = append(allArticles, passageList...)
|
|
|
|
|
|
|
|
|
|
|
|
// 增加偏移量
|
|
|
|
|
|
offset += 10
|
|
|
|
|
|
|
|
|
|
|
|
// 随机延迟,避免被封禁
|
|
|
|
|
|
time.Sleep(time.Duration(2000+offset) * time.Millisecond)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 转换链接
|
|
|
|
|
|
transformedArticles := w.TransformLinks(allArticles)
|
|
|
|
|
|
|
|
|
|
|
|
fmt.Printf("共获取到%d篇文章\n", len(transformedArticles))
|
|
|
|
|
|
return transformedArticles, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// SaveArticleListToExcel 保存文章列表到Excel
|
|
|
|
|
|
func (w *WechatCrawler) SaveArticleListToExcel(officialPath string, articleList [][]string, nickname string) error {
|
|
|
|
|
|
// 确保目录存在
|
|
|
|
|
|
if err := os.MkdirAll(officialPath, 0755); err != nil {
|
|
|
|
|
|
return fmt.Errorf("创建目录失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 保存转换后的链接文件
|
|
|
|
|
|
filePath := fmt.Sprintf("%s/文章列表(article_list)_直连链接.txt", officialPath)
|
|
|
|
|
|
var content strings.Builder
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 添加 UTF-8 BOM 头,确保 Excel 正确识别编码
|
|
|
|
|
|
content.WriteString("\xEF\xBB\xBF")
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 写入标题行
|
|
|
|
|
|
content.WriteString("序号,创建时间,标题,链接\n")
|
|
|
|
|
|
|
|
|
|
|
|
// 写入文章列表
|
|
|
|
|
|
for i, article := range articleList {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
if len(article) < 4 {
|
|
|
|
|
|
continue // 跳过不完整的数据
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 转换时间戳为可读格式(如果是时间戳)
|
|
|
|
|
|
createTime := article[1]
|
|
|
|
|
|
|
|
|
|
|
|
// 调试输出:查看原始时间戳
|
|
|
|
|
|
if i == 0 { // 只打印第一篇文章,避免输出过多
|
|
|
|
|
|
fmt.Printf("调试信息 - 第1篇文章\n")
|
|
|
|
|
|
fmt.Printf(" article[0]: '%s'\n", article[0])
|
|
|
|
|
|
fmt.Printf(" article[1] (时间戳): '%s'\n", article[1])
|
|
|
|
|
|
fmt.Printf(" article[2] (标题): '%s'\n", article[2])
|
|
|
|
|
|
fmt.Printf(" 时间戳长度: %d\n", len(article[1]))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if createTime != "" && createTime != "0" {
|
|
|
|
|
|
// 尝试将字符串转换为时间戳
|
|
|
|
|
|
var ts int64
|
|
|
|
|
|
n, err := fmt.Sscanf(createTime, "%d", &ts)
|
|
|
|
|
|
if i == 0 {
|
|
|
|
|
|
fmt.Printf(" Sscanf 结果: n=%d, err=%v, ts=%d\n", n, err, ts)
|
|
|
|
|
|
}
|
|
|
|
|
|
if err == nil && n == 1 && ts > 0 {
|
|
|
|
|
|
// 转换为可读的日期时间格式
|
|
|
|
|
|
createTime = time.Unix(ts, 0).Format("2006-01-02 15:04:05")
|
|
|
|
|
|
if i == 0 {
|
|
|
|
|
|
fmt.Printf(" 转换后的时间: %s\n", createTime)
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// 如果转换失败,保留原始值
|
|
|
|
|
|
if i == 0 {
|
|
|
|
|
|
fmt.Printf(" 转换失败,保留原始值: %s\n", createTime)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
if i == 0 {
|
|
|
|
|
|
fmt.Printf(" 时间戳为空或为0,设置为'未知时间'\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
createTime = "未知时间"
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 清理和转义标题(移除换行符、制表符等)
|
|
|
|
|
|
title := strings.TrimSpace(article[2])
|
|
|
|
|
|
title = strings.ReplaceAll(title, "\n", " ")
|
|
|
|
|
|
title = strings.ReplaceAll(title, "\r", " ")
|
|
|
|
|
|
title = strings.ReplaceAll(title, "\t", " ")
|
|
|
|
|
|
|
|
|
|
|
|
// 如果标题包含逗号或引号,需要用双引号包裹并转义内部引号
|
|
|
|
|
|
if strings.Contains(title, ",") || strings.Contains(title, "\"") || strings.Contains(title, "\n") {
|
|
|
|
|
|
title = "\"" + strings.ReplaceAll(title, "\"", "\"\"") + "\""
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 清理链接
|
|
|
|
|
|
link := strings.TrimSpace(article[3])
|
|
|
|
|
|
|
|
|
|
|
|
// 写入CSV行
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, createTime, title, link))
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 写入文件
|
|
|
|
|
|
err := os.WriteFile(filePath, []byte(content.String()), 0644)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return fmt.Errorf("保存文章列表失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fmt.Printf("文章列表已保存到: %s\n", filePath)
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("共保存 %d 篇文章\n", len(articleList))
|
2025-11-26 18:48:12 +08:00
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// TransformLinks 转换文章链接,将带有amp;的链接转换为直接可访问的链接
|
|
|
|
|
|
func (w *WechatCrawler) TransformLinks(articleList [][]string) [][]string {
|
|
|
|
|
|
transformedList := make([][]string, 0, len(articleList))
|
|
|
|
|
|
|
|
|
|
|
|
for _, article := range articleList {
|
|
|
|
|
|
if len(article) >= 4 {
|
|
|
|
|
|
// 转换链接,移除amp;
|
|
|
|
|
|
transformedLink := strings.Replace(article[3], "amp;", "", -1)
|
|
|
|
|
|
transformedArticle := []string{article[0], article[1], article[2], transformedLink}
|
|
|
|
|
|
transformedList = append(transformedList, transformedArticle)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return transformedList
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ReadArticleLinksFromExcel 从Excel读取文章链接
|
|
|
|
|
|
func (w *WechatCrawler) ReadArticleLinksFromExcel(filePath string) ([]string, error) {
|
|
|
|
|
|
// 简化实现,返回空列表
|
|
|
|
|
|
return []string{}, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetArticleDetail 获取单篇文章的详细信息
|
|
|
|
|
|
func (w *WechatCrawler) GetArticleDetail(link string) (*ArticleDetail, error) {
|
|
|
|
|
|
// 获取文章内容
|
|
|
|
|
|
content, err := w.GetOneArticle(link)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return nil, err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【调试】保存原始HTML到文件,用于分析内容提取问题
|
|
|
|
|
|
debugPath := "./debug_article_raw.html"
|
|
|
|
|
|
if err := os.WriteFile(debugPath, []byte(content), 0644); err == nil {
|
|
|
|
|
|
fmt.Printf(" [调试] 原始HTML已保存: %s (长度: %d 字节)\n", debugPath, len(content))
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 提取文章信息
|
|
|
|
|
|
createTime, title, commentID, reqID, _, textContent := w.ExtractArticleInfo(content)
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【调试】输出内容提取详情
|
|
|
|
|
|
fmt.Printf(" [调试] 提取结果 - 标题: %s, 段落数: %d\n", title, len(textContent))
|
|
|
|
|
|
if len(textContent) > 0 {
|
|
|
|
|
|
firstPara := textContent[0]
|
|
|
|
|
|
if len(firstPara) > 100 {
|
|
|
|
|
|
firstPara = firstPara[:100] + "..."
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf(" [调试] 第一段: %s\n", firstPara)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fmt.Printf(" [调试] ⚠️ ExtractArticleInfo 未提取到任何内容!\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 提取公众号名称
|
|
|
|
|
|
accountName := w.ExtractOfficialAccountName(content)
|
|
|
|
|
|
|
|
|
|
|
|
// 获取统计信息
|
|
|
|
|
|
stats, err := w.GetArticleStats(link, title, commentID, reqID, createTime)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
// 如果获取统计信息失败,使用默认值
|
|
|
|
|
|
stats = map[string]string{
|
|
|
|
|
|
"read_num": "0",
|
|
|
|
|
|
"old_like_num": "0",
|
|
|
|
|
|
"share_num": "0",
|
|
|
|
|
|
"show_read": "0",
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取评论信息
|
|
|
|
|
|
comments, commentLikes, _ := w.GetArticleComments(commentID)
|
|
|
|
|
|
|
|
|
|
|
|
// 构建文章详情
|
|
|
|
|
|
detail := &ArticleDetail{
|
|
|
|
|
|
LocalTime: time.Now().Format("2006-01-02 15:04:05"),
|
|
|
|
|
|
CreateTime: createTime,
|
|
|
|
|
|
Title: title,
|
|
|
|
|
|
OfficialName: accountName,
|
|
|
|
|
|
Link: link,
|
|
|
|
|
|
Content: textContent,
|
|
|
|
|
|
ReadCount: stats["read_num"],
|
|
|
|
|
|
LikeCount: stats["old_like_num"],
|
|
|
|
|
|
ShareCount: stats["share_num"],
|
|
|
|
|
|
Comments: comments,
|
|
|
|
|
|
CommentLikes: commentLikes,
|
|
|
|
|
|
CommentID: commentID,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return detail, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetDetailList 批量获取文章详情
|
|
|
|
|
|
func (w *WechatCrawler) GetDetailList(articleList [][]string, officialPath string) error {
|
|
|
|
|
|
// 确保目录存在
|
|
|
|
|
|
if err := os.MkdirAll(officialPath, 0755); err != nil {
|
|
|
|
|
|
return fmt.Errorf("创建目录失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
successCount := 0
|
|
|
|
|
|
errorCount := 0
|
|
|
|
|
|
errorLinks := [][]string{}
|
|
|
|
|
|
|
|
|
|
|
|
for i, article := range articleList {
|
|
|
|
|
|
if len(article) < 4 {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
link := article[3]
|
|
|
|
|
|
title := article[2]
|
|
|
|
|
|
|
|
|
|
|
|
fmt.Printf("正在处理第%d篇文章: %s\n", i+1, title)
|
|
|
|
|
|
|
|
|
|
|
|
// 获取文章详情
|
|
|
|
|
|
detail, err := w.GetArticleDetail(link)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
fmt.Printf("获取文章详情失败: %v\n", err)
|
|
|
|
|
|
errorCount++
|
|
|
|
|
|
errorLinks = append(errorLinks, article)
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 保存文章详情 - 确保使用文章标题作为文件名,并清理非法字符
|
|
|
|
|
|
// 清理标题中的非法字符
|
|
|
|
|
|
cleanTitle := detail.Title
|
|
|
|
|
|
invalidChars := []string{"\\", "/", ":", "*", "?", "\"", "<", ">", "|"}
|
|
|
|
|
|
for _, char := range invalidChars {
|
|
|
|
|
|
cleanTitle = strings.ReplaceAll(cleanTitle, char, "_")
|
|
|
|
|
|
}
|
|
|
|
|
|
// 限制文件名长度,避免路径过长
|
|
|
|
|
|
if len(cleanTitle) > 100 {
|
|
|
|
|
|
cleanTitle = cleanTitle[:100]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
filePath := fmt.Sprintf("%s/%s_文章详情.txt", officialPath, cleanTitle)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
|
|
|
|
|
|
fmt.Printf("保存文章详情失败: %v\n", err)
|
|
|
|
|
|
errorCount++
|
|
|
|
|
|
errorLinks = append(errorLinks, article)
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
successCount++
|
|
|
|
|
|
fmt.Printf("文章详情保存成功: %s\n", detail.Title)
|
|
|
|
|
|
|
|
|
|
|
|
// 随机延迟,避免被封禁
|
|
|
|
|
|
delayTime := 3000 + i*100 // 3秒基础延迟,递增
|
|
|
|
|
|
time.Sleep(time.Duration(delayTime) * time.Millisecond)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 保存错误链接
|
|
|
|
|
|
if len(errorLinks) > 0 {
|
|
|
|
|
|
errorPath := fmt.Sprintf("%s/问题链接(error_links).txt", officialPath)
|
|
|
|
|
|
var content strings.Builder
|
|
|
|
|
|
content.WriteString("序号,创建时间,标题,链接\n")
|
|
|
|
|
|
for i, link := range errorLinks {
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, link[1], link[2], link[3]))
|
|
|
|
|
|
}
|
|
|
|
|
|
err := os.WriteFile(errorPath, []byte(content.String()), 0644)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
fmt.Printf("保存错误链接失败: %v\n", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fmt.Printf("文章详情获取完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// SaveArticleDetailToExcel 保存文章详情到Excel
|
|
|
|
|
|
func (c *WechatCrawler) SaveArticleDetailToExcel(article *ArticleDetail, filePath string) error {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 【修复】不要清理整个路径!只需要确保目录存在即可
|
|
|
|
|
|
// filePath 已经在调用处清理过了文件名部分
|
|
|
|
|
|
// 这里直接使用即可
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
var content strings.Builder
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 添加 UTF-8 BOM 头,确保正确显示中文
|
|
|
|
|
|
content.WriteString("\xEF\xBB\xBF")
|
|
|
|
|
|
|
|
|
|
|
|
content.WriteString("=")
|
|
|
|
|
|
content.WriteString(strings.Repeat("=", 80))
|
|
|
|
|
|
content.WriteString("\n")
|
2025-11-26 18:48:12 +08:00
|
|
|
|
content.WriteString(fmt.Sprintf("本地创建时间: %s\n", article.LocalTime))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("文章发布时间: %s\n", article.CreateTime))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("公众号名称: %s\n", article.OfficialName))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("文章标题: %s\n", article.Title))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("文章链接: %s\n", article.Link))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("阅读量: %s\n", article.ReadCount))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("点赞数: %s\n", article.LikeCount))
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("转发数: %s\n", article.ShareCount))
|
2025-11-27 18:40:08 +08:00
|
|
|
|
content.WriteString(strings.Repeat("=", 80))
|
|
|
|
|
|
content.WriteString("\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
content.WriteString("文章内容:\n")
|
|
|
|
|
|
content.WriteString(strings.Repeat("-", 80))
|
|
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
|
|
|
|
|
|
for i, line := range article.Content {
|
|
|
|
|
|
// 清理内容,移除多余的空白字符
|
|
|
|
|
|
cleanLine := strings.TrimSpace(line)
|
|
|
|
|
|
if cleanLine != "" {
|
|
|
|
|
|
content.WriteString(cleanLine)
|
|
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
|
|
|
|
|
|
// 每个段落后添加空行,提高可读性
|
|
|
|
|
|
if i < len(article.Content)-1 {
|
|
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 如果有评论,添加评论区
|
|
|
|
|
|
if len(article.Comments) > 0 {
|
2025-11-26 18:48:12 +08:00
|
|
|
|
content.WriteString("\n")
|
2025-11-27 18:40:08 +08:00
|
|
|
|
content.WriteString(strings.Repeat("=", 80))
|
|
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("评论区 (共 %d 条评论):\n", len(article.Comments)))
|
|
|
|
|
|
content.WriteString(strings.Repeat("-", 80))
|
|
|
|
|
|
content.WriteString("\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
for i, comment := range article.Comments {
|
|
|
|
|
|
content.WriteString(fmt.Sprintf("%d. %s", i+1, comment))
|
|
|
|
|
|
if i < len(article.CommentLikes) && article.CommentLikes[i] != "" {
|
|
|
|
|
|
content.WriteString(fmt.Sprintf(" (点赞: %s)", article.CommentLikes[i]))
|
|
|
|
|
|
}
|
|
|
|
|
|
content.WriteString("\n\n")
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
content.WriteString(strings.Repeat("=", 80))
|
|
|
|
|
|
content.WriteString("\n")
|
|
|
|
|
|
content.WriteString("文件结束\n")
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
// 写入文件
|
2025-11-27 18:40:08 +08:00
|
|
|
|
err := os.WriteFile(filePath, []byte(content.String()), 0644)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return fmt.Errorf("保存文章详情失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return nil
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// GetListArticleFromFile 根据公众号名称或文章链接,从文件中读取文章列表并下载内容
|
|
|
|
|
|
func (w *WechatCrawler) GetListArticleFromFile(nameLink string, imgSaveFlag bool, contentSaveFlag bool) error {
|
|
|
|
|
|
// 1. 判断输入类型并获取公众号名称
|
|
|
|
|
|
nickname := ""
|
|
|
|
|
|
if strings.Contains(nameLink, "http") {
|
|
|
|
|
|
fmt.Println("检测到输入为链接,开始获取公众号名称")
|
|
|
|
|
|
// 从文章链接获取公众号信息
|
2025-11-27 18:40:08 +08:00
|
|
|
|
content, err := w.GetOneArticle(nameLink)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if err != nil {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
return fmt.Errorf("获取文章内容失败: %v", err)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 从内容中提取公众号名称
|
|
|
|
|
|
nickname = w.ExtractOfficialAccountName(content)
|
|
|
|
|
|
if nickname == "" {
|
|
|
|
|
|
return fmt.Errorf("无法从文章中提取公众号名称")
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf("获取到公众号名称: %s\n", nickname)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fmt.Println("检测到输入为公众号名称")
|
|
|
|
|
|
nickname = nameLink
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 2. 构建文件路径
|
|
|
|
|
|
rootPath := "./data/"
|
2025-11-27 18:40:08 +08:00
|
|
|
|
officialPath := rootPath + nickname
|
|
|
|
|
|
// 【新增】创建"文章详细"子目录
|
|
|
|
|
|
articleDetailPath := officialPath + "/文章详细"
|
2025-11-26 18:48:12 +08:00
|
|
|
|
articleListPath := officialPath + "/文章列表(article_list)_直连链接.txt"
|
|
|
|
|
|
|
|
|
|
|
|
// 3. 检查文件是否存在
|
|
|
|
|
|
if _, err := os.Stat(articleListPath); os.IsNotExist(err) {
|
|
|
|
|
|
return fmt.Errorf("文件不存在,请检查目录文件: %s", articleListPath)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 4. 读取文章链接列表
|
|
|
|
|
|
fileContent, err := os.ReadFile(articleListPath)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return fmt.Errorf("读取文章列表文件失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
lines := strings.Split(string(fileContent), "\n")
|
|
|
|
|
|
var articleLinks []string
|
2025-11-27 18:40:08 +08:00
|
|
|
|
var articleTitles []string
|
|
|
|
|
|
var articleTimes []string
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 跳过BOM头和标题行,提取链接
|
2025-11-26 18:48:12 +08:00
|
|
|
|
for i, line := range lines {
|
|
|
|
|
|
if i == 0 || line == "" {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 移除可能的BOM头
|
|
|
|
|
|
line = strings.TrimPrefix(line, "\xEF\xBB\xBF")
|
|
|
|
|
|
line = strings.TrimSpace(line)
|
|
|
|
|
|
if line == "" {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 解析CSV行(处理带引号的字段)
|
|
|
|
|
|
var parts []string
|
|
|
|
|
|
inQuote := false
|
|
|
|
|
|
currentPart := ""
|
|
|
|
|
|
for _, char := range line {
|
|
|
|
|
|
if char == '"' {
|
|
|
|
|
|
inQuote = !inQuote
|
|
|
|
|
|
} else if char == ',' && !inQuote {
|
|
|
|
|
|
parts = append(parts, currentPart)
|
|
|
|
|
|
currentPart = ""
|
|
|
|
|
|
} else {
|
|
|
|
|
|
currentPart += string(char)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
parts = append(parts, currentPart) // 添加最后一个字段
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if len(parts) >= 4 {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 序号,创建时间,标题,链接
|
|
|
|
|
|
time := strings.TrimSpace(parts[1])
|
|
|
|
|
|
title := strings.TrimSpace(parts[2])
|
|
|
|
|
|
link := strings.TrimSpace(parts[3])
|
|
|
|
|
|
// 清理引号
|
2025-11-26 18:48:12 +08:00
|
|
|
|
link = strings.Trim(link, "\"")
|
2025-11-27 18:40:08 +08:00
|
|
|
|
title = strings.Trim(title, "\"")
|
|
|
|
|
|
|
|
|
|
|
|
if link != "" && link != "链接" { // 跳过标题行
|
|
|
|
|
|
articleLinks = append(articleLinks, link)
|
|
|
|
|
|
articleTitles = append(articleTitles, title)
|
|
|
|
|
|
articleTimes = append(articleTimes, time)
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("成功读取到 %d 篇文章链接\n", len(articleLinks))
|
|
|
|
|
|
if len(articleLinks) == 0 {
|
|
|
|
|
|
return fmt.Errorf("未能从文件中提取到有效的文章链接")
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
|
|
|
|
|
// 5. 遍历下载每篇文章
|
|
|
|
|
|
successCount := 0
|
|
|
|
|
|
errorCount := 0
|
2025-11-27 18:40:08 +08:00
|
|
|
|
errorLinks := [][]string{} // 保存失败的文章信息
|
|
|
|
|
|
|
|
|
|
|
|
// 【新增】确保"文章详细"目录存在
|
|
|
|
|
|
if err := os.MkdirAll(articleDetailPath, 0755); err != nil {
|
|
|
|
|
|
return fmt.Errorf("创建文章详细目录失败: %v", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf("文章详细将保存到: %s\n", articleDetailPath)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
|
|
|
|
|
for i, link := range articleLinks {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
title := ""
|
|
|
|
|
|
if i < len(articleTitles) {
|
|
|
|
|
|
title = articleTitles[i]
|
|
|
|
|
|
}
|
|
|
|
|
|
creatTime := ""
|
|
|
|
|
|
if i < len(articleTimes) {
|
|
|
|
|
|
creatTime = articleTimes[i]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fmt.Printf("\n正在处理第 %d/%d 篇文章\n", i+1, len(articleLinks))
|
|
|
|
|
|
fmt.Printf("标题: %s\n", title)
|
|
|
|
|
|
fmt.Printf("链接: %s\n", link)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
|
|
|
|
|
|
// 获取文章详情
|
|
|
|
|
|
detail, err := w.GetArticleDetail(link)
|
|
|
|
|
|
if err != nil {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("❌ 获取文章详情失败: %v\n", err)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
errorCount++
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 记录失败的文章
|
|
|
|
|
|
errorLinks = append(errorLinks, []string{
|
|
|
|
|
|
fmt.Sprintf("%d", i+1),
|
|
|
|
|
|
creatTime,
|
|
|
|
|
|
title,
|
|
|
|
|
|
link,
|
|
|
|
|
|
})
|
2025-11-26 18:48:12 +08:00
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 保存文章内容
|
|
|
|
|
|
if contentSaveFlag {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
// 清理标题中的非法字符
|
|
|
|
|
|
cleanTitle := detail.Title
|
|
|
|
|
|
invalidChars := []string{"\\", "/", ":", "*", "?", "\"", "<", ">", "|"}
|
|
|
|
|
|
for _, char := range invalidChars {
|
|
|
|
|
|
cleanTitle = strings.ReplaceAll(cleanTitle, char, "_")
|
|
|
|
|
|
}
|
|
|
|
|
|
// 限制文件名长度
|
|
|
|
|
|
if len(cleanTitle) > 100 {
|
|
|
|
|
|
cleanTitle = cleanTitle[:100]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 【修改】生成文件路径,保存到"文章详细"子目录中
|
|
|
|
|
|
filePath := fmt.Sprintf("%s/%s_文章详情.txt", articleDetailPath, cleanTitle)
|
|
|
|
|
|
|
|
|
|
|
|
// 调试:打印文件保存路径和内容长度
|
|
|
|
|
|
fmt.Printf(" 保存路径: %s\n", filePath)
|
|
|
|
|
|
fmt.Printf(" 内容段落数: %d\n", len(detail.Content))
|
|
|
|
|
|
if len(detail.Content) > 0 {
|
|
|
|
|
|
previewLen := 50
|
|
|
|
|
|
if len(detail.Content[0]) < previewLen {
|
|
|
|
|
|
previewLen = len(detail.Content[0])
|
|
|
|
|
|
}
|
|
|
|
|
|
fmt.Printf(" 第一段内容预览: %s...\n", detail.Content[0][:previewLen])
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fmt.Printf(" ⚠️ 警告:文章内容为空!\n")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-26 18:48:12 +08:00
|
|
|
|
if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("❌ 保存文章详情失败: %v\n", err)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
errorCount++
|
2025-11-27 18:40:08 +08:00
|
|
|
|
errorLinks = append(errorLinks, []string{
|
|
|
|
|
|
fmt.Sprintf("%d", i+1),
|
|
|
|
|
|
creatTime,
|
|
|
|
|
|
title,
|
|
|
|
|
|
link,
|
|
|
|
|
|
})
|
2025-11-26 18:48:12 +08:00
|
|
|
|
continue
|
|
|
|
|
|
}
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("✅ 文章保存成功: %s\n", detail.Title)
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// TODO: 保存图片功能(如果需要)
|
|
|
|
|
|
if imgSaveFlag {
|
|
|
|
|
|
fmt.Println("图片保存功能暂未实现")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
successCount++
|
|
|
|
|
|
|
|
|
|
|
|
// 添加延迟,避免被封
|
2025-11-27 18:40:08 +08:00
|
|
|
|
if i < len(articleLinks)-1 { // 不是最后一篇
|
|
|
|
|
|
delayTime := 3 + i/10 // 基础延迟3秒,每10篇增加1秒
|
|
|
|
|
|
fmt.Printf("为预防被封禁,延时 %d 秒...\n", delayTime)
|
|
|
|
|
|
time.Sleep(time.Duration(delayTime) * time.Second)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 6. 保存失败的文章链接
|
|
|
|
|
|
if len(errorLinks) > 0 {
|
|
|
|
|
|
errorPath := officialPath + "/问题链接(error_links).txt"
|
|
|
|
|
|
var errorContent strings.Builder
|
|
|
|
|
|
// 添加 BOM 头
|
|
|
|
|
|
errorContent.WriteString("\xEF\xBB\xBF")
|
|
|
|
|
|
errorContent.WriteString("序号,创建时间,标题,链接\n")
|
|
|
|
|
|
for _, errorLink := range errorLinks {
|
|
|
|
|
|
// 处理标题中的逗号和引号
|
|
|
|
|
|
title := errorLink[2]
|
|
|
|
|
|
if strings.Contains(title, ",") || strings.Contains(title, "\"") {
|
|
|
|
|
|
title = "\"" + strings.ReplaceAll(title, "\"", "\"\"") + "\""
|
|
|
|
|
|
}
|
|
|
|
|
|
errorContent.WriteString(fmt.Sprintf("%s,%s,%s,%s\n",
|
|
|
|
|
|
errorLink[0], errorLink[1], title, errorLink[3]))
|
|
|
|
|
|
}
|
|
|
|
|
|
err := os.WriteFile(errorPath, []byte(errorContent.String()), 0644)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
fmt.Printf("⚠️ 保存错误链接失败: %v\n", err)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fmt.Printf("\n已保存失败的文章链接到: %s\n", errorPath)
|
|
|
|
|
|
}
|
2025-11-26 18:48:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-27 18:40:08 +08:00
|
|
|
|
fmt.Printf("\n" + strings.Repeat("=", 60) + "\n")
|
|
|
|
|
|
fmt.Printf("文章列表处理完成!\n")
|
|
|
|
|
|
fmt.Printf(" 成功: %d 篇\n", successCount)
|
|
|
|
|
|
fmt.Printf(" 失败: %d 篇\n", errorCount)
|
|
|
|
|
|
fmt.Printf(" 总计: %d 篇\n", len(articleLinks))
|
|
|
|
|
|
fmt.Printf(strings.Repeat("=", 60) + "\n")
|
2025-11-26 18:48:12 +08:00
|
|
|
|
return nil
|
|
|
|
|
|
}
|