From 51096cc21d5f3b6e6bb397ea06ece93a834b738a Mon Sep 17 00:00:00 2001 From: shengyudong Date: Wed, 26 Nov 2025 18:48:12 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20backend/pkg/wechat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pkg/wechat/access_articles.go | 1589 +++++++++++++++++++++++++ 1 file changed, 1589 insertions(+) create mode 100644 backend/pkg/wechat/access_articles.go diff --git a/backend/pkg/wechat/access_articles.go b/backend/pkg/wechat/access_articles.go new file mode 100644 index 0000000..9838bcc --- /dev/null +++ b/backend/pkg/wechat/access_articles.go @@ -0,0 +1,1589 @@ +package wechat + +import ( + "crypto/tls" + "encoding/json" + "fmt" + "net/url" + "os" + "regexp" + "strings" + "time" + "unicode" + + "github.com/go-resty/resty/v2" + "github.com/wechat-crawler/configs" +) + +type ArticleDetail struct { + LocalTime string `json:"local_time"` + CreateTime string `json:"create_time"` + Title string `json:"title"` + OfficialName string `json:"official_name"` + Link string `json:"link"` + Content []string `json:"content"` + ReadCount string `json:"read_count"` + LikeCount string `json:"like_count"` + ShareCount string `json:"share_count"` + ShowRead string `json:"show_read"` + Comments []string `json:"comments"` + CommentLikes []string `json:"comment_likes"` + CommentID string `json:"comment_id"` +} + +type WechatCrawler struct { + client *resty.Client + Config *configs.Config + uin string + key string + passTicket string + biz string +} + +// NewWechatCrawler 创建新的微信爬虫实例 +func NewWechatCrawler(biz string, uin string, key string, passTicket string, cfg *configs.Config) (*WechatCrawler, error) { + client := resty.New() + client.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true}) + client.SetTimeout(10 * time.Second) + + // 设置默认headers + headers := map[string]string{ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Cookie": fmt.Sprintf("uin=%s; key=%s; pass_ticket=%s;", uin, key, passTicket), + } + client.SetHeaders(headers) + + return &WechatCrawler{ + client: client, + Config: cfg, + uin: uin, + key: key, + passTicket: passTicket, + biz: biz, + }, nil +} + +// NewSimpleCrawler 创建一个简单的微信爬虫实例,不需要cookie信息,仅用于获取文章链接 +func NewSimpleCrawler() *WechatCrawler { + // 初始化 HTTP 客户端 + client := resty.New() + client.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true}) + client.SetTimeout(15 * time.Second) + + // 设置默认headers + headers := map[string]string{ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Connection": "keep-alive", + } + client.SetHeaders(headers) + + return &WechatCrawler{ + client: client, + Config: nil, + biz: "", + } +} + +// GetOfficialAccountName 获取公众号名称 +func (w *WechatCrawler) GetOfficialAccountName() (string, error) { + url := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz) + resp, err := w.client.R().Get(url) + if err != nil { + return "", fmt.Errorf("获取公众号信息失败: %v", err) + } + + content := resp.String() + + // 尝试多种正则表达式模式来提取公众号名称 + // 模式1: 匹配格式: var nickname = "公众号名称".html(false) || ""; + nicknameRegex := regexp.MustCompile(`var nickname = "([^"]+)"\.html\(false\)\s*\|\|\s*""`) + match := nicknameRegex.FindStringSubmatch(content) + if len(match) >= 2 { + return match[1], nil + } + // 模式2: 原始模式 + nicknameRegex2 := regexp.MustCompile(`var nickname = "(.*?)";`) + match = nicknameRegex2.FindStringSubmatch(content) + if len(match) >= 2 { + return match[1], nil + } + + // 模式3: JSON格式 + nicknameRegex3 := regexp.MustCompile(`nickname\s*:\s*"([^"]+)"`) + match = nicknameRegex3.FindStringSubmatch(content) + if 
len(match) >= 2 { + return match[1], nil + } + + // 模式4: 字符串格式 + nicknameRegex4 := regexp.MustCompile(`"nickname":"([^"]+)"`) + match = nicknameRegex4.FindStringSubmatch(content) + if len(match) >= 2 { + return match[1], nil + } + + // 模式5: HTML标题 + nicknameRegex5 := regexp.MustCompile(`([^<]+)<\/title>`) + match = nicknameRegex5.FindStringSubmatch(content) + if len(match) >= 2 { + // 清理标题,移除"- 微信公众号"等后缀 + title := match[1] + if idx := strings.Index(title, "-"); idx > 0 { + title = strings.TrimSpace(title[:idx]) + } + return title, nil + } + + // 如果所有模式都失败,尝试从biz生成一个有意义的名称 + if w.biz != "" { + return "公众号_" + w.biz[:8], nil + } + + return "未知公众号", nil +} + +// GetNextList 获取下一页文章列表 +func (w *WechatCrawler) GetNextList(offset int) (map[string]interface{}, error) { + // 检查是否有必要的登录参数 + if w.uin == "" || w.key == "" || w.passTicket == "" { + return nil, fmt.Errorf("no session: 需要提供微信登录状态的cookies\n请在浏览器中登录微信公众号平台后,从URL中获取uin、key和pass_ticket参数") + } + + url := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=%s&offset=%d&count=10&f=json&uin=%s&key=%s&pass_ticket=%s&appmsg_token=999999999&x5=0&f=json", + w.biz, offset*10, w.uin, w.key, w.passTicket) + + resp, err := w.client.R().SetHeader("Referer", fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz)).Get(url) + if err != nil { + return nil, fmt.Errorf("获取文章列表失败: %v", err) + } + + // 检查响应状态 + if resp.StatusCode() != 200 { + return nil, fmt.Errorf("获取文章列表失败: HTTP状态码 %d\n可能需要更新cookies或登录状态已过期", resp.StatusCode()) + } + + // 解析响应内容 + content := resp.String() + // 清理JSON格式 + content = strings.Replace(content, "\x22", "\"", -1) + content = strings.Replace(content, "\x5c", "\\", -1) + + // 检查是否存在明显的错误信息 + if strings.Contains(content, "请先登录") || strings.Contains(content, "登录超时") { + return nil, fmt.Errorf("no session: 登录状态已过期或无效\n请在浏览器中重新登录微信公众号平台并更新cookies") + } + + var result map[string]interface{} + err = json.Unmarshal([]byte(content), &result) + if err != nil 
{ + // 尝试更宽松的错误处理 + if strings.Contains(content, "no session") { + return nil, fmt.Errorf("no session: 需要有效的微信登录状态\n请在浏览器中登录微信公众号平台后,从URL中获取登录参数") + } + return nil, fmt.Errorf("解析文章列表失败: %v\n响应内容: %s", err, content[:100]) + } + + // 检查是否有错误 + if ret, ok := result["ret"].(float64); ok { + switch ret { + case 4001: + return nil, fmt.Errorf("获取文章列表失败: 登录状态已过期\n请更新cookies") + case -200013: + return nil, fmt.Errorf("获取文章列表失败: 需要验证\n请在浏览器中先访问公众号页面进行验证") + case -200015: + return nil, fmt.Errorf("获取文章列表失败: 操作过于频繁\n请稍后再试") + default: + if ret != 0 { + errMsg, _ := result["errmsg"].(string) + return nil, fmt.Errorf("获取文章列表失败: 错误码 %v, 信息: %s", ret, errMsg) + } + } + } + + // 解析文章列表 + generalMsgList, ok := result["general_msg_list"].(string) + if !ok { + return nil, fmt.Errorf("解析文章列表格式错误") + } + + var msgList struct { + List []struct { + CommMsgInfo struct { + ID int64 `json:"id"` + Type int `json:"type"` + CreateTime int64 `json:"create_time"` + SourceMsgID int64 `json:"source_msg_id"` + } `json:"comm_msg_info"` + AppMsgExtInfo struct { + Title string `json:"title"` + Digest string `json:"digest"` + ContentURL string `json:"content_url"` + Cover string `json:"cover"` + Author string `json:"author"` + FileID int64 `json:"fileid"` + Content string `json:"content"` + UrlList []string `json:"url_list"` + } `json:"app_msg_ext_info"` + MultiAppMsgItemList []struct { + Title string `json:"title"` + Digest string `json:"digest"` + ContentURL string `json:"content_url"` + Cover string `json:"cover"` + Author string `json:"author"` + } `json:"multi_app_msg_item_list"` + } `json:"list"` + } + + err = json.Unmarshal([]byte(generalMsgList), &msgList) + if err != nil { + return nil, fmt.Errorf("解析文章列表内容失败: %v", err) + } + + // 构建返回数据 + response := make(map[string]interface{}) + response["m_flag"] = 1 + + var passageList [][]string + for _, item := range msgList.List { + if item.CommMsgInfo.Type == 49 { + // 单图文消息 + createTime := fmt.Sprintf("%d", item.CommMsgInfo.CreateTime) + title := 
item.AppMsgExtInfo.Title + link := item.AppMsgExtInfo.ContentURL + passageList = append(passageList, []string{"", createTime, title, link}) + + // 多图文消息 + for _, multiItem := range item.MultiAppMsgItemList { + passageList = append(passageList, []string{"", createTime, multiItem.Title, multiItem.ContentURL}) + } + } + } + + response["passage_list"] = passageList + + // 如果没有更多文章,设置m_flag为0 + if len(passageList) == 0 { + response["m_flag"] = 0 + } + + return response, nil +} + +// GetOneArticle 获取单篇文章内容 +func (w *WechatCrawler) GetOneArticle(link string) (string, error) { + resp, err := w.client.R().Get(link) + if err != nil { + return "", fmt.Errorf("请求文章失败: %v", err) + } + return resp.String(), nil +} + +// ExtractOfficialAccountName 从文章内容中提取公众号名称 +func (w *WechatCrawler) ExtractOfficialAccountName(content string) string { + accountName := "" + // 优先从微信文章特定的字段提取公众号名称 + patterns := []string{ + `window\.appmsg\s*=\s*\{[^}]*"author"\s*:\s*['"](.*?)['"]`, // window.appmsg.author + `var nickname\s*=\s*['"](.*?)['"]`, // nickname变量 + `"nickname"\s*:\s*['"](.*?)['"]`, // JSON中的nickname字段 + `var ct\s*=\s*['"](.*?)['"]`, // ct变量(有时用于存储公众号名称) + `<meta[^>]*name=["']?author["']?[^>]*content=["'](.*?)["']`, // meta标签中的作者信息 + } + + for _, pattern := range patterns { + regex := regexp.MustCompile(pattern) + if match := regex.FindStringSubmatch(content); len(match) > 1 { + accountName = match[1] + // 清理和转义 + accountName = strings.ReplaceAll(accountName, """, "\"") + accountName = strings.ReplaceAll(accountName, "&", "&") + accountName = strings.ReplaceAll(accountName, "<", "<") + accountName = strings.ReplaceAll(accountName, ">", ">") + // 多次URL解码 + for i := 0; i < 3; i++ { + if decoded, err := url.QueryUnescape(accountName); err == nil && decoded != accountName { + accountName = decoded + } else { + break + } + } + break + } + } + + return accountName +} + +// ExtractArticleInfo 从文章内容中提取关键信息 +func (w *WechatCrawler) ExtractArticleInfo(content string) (string, string, string, 
string, string, []string) { + // 提取创建时间 - 增强版,增加对ori_create_time的支持 + createTime := "" + // 模式1: 标准createTime变量 + createTimeRegex := regexp.MustCompile(`var createTime\s*=\s*['"](\d+)['"]`) + if match := createTimeRegex.FindStringSubmatch(content); len(match) > 1 { + createTime = match[1] + } else { + // 模式2: ori_create_time变量(在之前的文件中发现) + oriCreateTimeRegex := regexp.MustCompile(`ori_create_time\s*:\s*['"](\d+)['"]`) + if match := oriCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 { + createTime = match[1] + } + // 模式3: JSON对象中的create_time字段 + jsonCreateTimeRegex := regexp.MustCompile(`"create_time"\s*:\s*(\d+)`) + if match := jsonCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 { + createTime = match[1] + } + } + + // 提取标题 - 增强版,优化标题提取逻辑,确保正确区分公众号名称和文章标题 + title := "" + // 优先从微信文章特有的结构提取标题(window.appmsg.title优先级最高) + titlePatterns := []string{ + `window\.appmsg\s*=\s*\{[^}]*"title"\s*:\s*['"](.*?)['"]`, // window.appmsg对象中的title(微信文章标准标题位置) + `var title\s*=\s*['"](.*?)['"]`, // 直接变量赋值 + `"title"\s*:\s*['"](.*?)['"]`, // JSON对象中的title字段 + `window\.title\s*=\s*['"](.*?)['"]`, // window.title赋值 + // 增加JsDecode函数支持(在文件中发现) + `title\s*=\s*JsDecode\(['"](.*?)['"]\)`, // title变量的JsDecode赋值 + `JsDecode\(['"]([^'"]*?title[^'"]*)['"]\)`, // 包含title的JsDecode调用 + // HTML title标签优先级降低,因为可能包含公众号名称 + `<title[^>]*>(.*?)`, + } + + for _, pattern := range titlePatterns { + titleRegex := regexp.MustCompile(pattern) + if match := titleRegex.FindStringSubmatch(content); len(match) > 1 { + title = match[1] + // 尝试解码HTML实体和URL编码 + title = strings.ReplaceAll(title, """, "\"") + title = strings.ReplaceAll(title, "&", "&") + title = strings.ReplaceAll(title, "<", "<") + title = strings.ReplaceAll(title, ">", ">") + // 多次URL解码,处理嵌套编码 + for i := 0; i < 3; i++ { // 最多解码3次 + if decoded, err := url.QueryUnescape(title); err == nil && decoded != title { + title = decoded + } else { + break + } + } + break + } + } + + // 提取comment_id - 增强版,增加JsDecode支持 + commentID := "" + 
// 模式1: 标准comment_id变量 + commentIDRegex := regexp.MustCompile(`var comment_id\s*=\s*['"](\d+)['"]`) + if match := commentIDRegex.FindStringSubmatch(content); len(match) > 1 { + commentID = match[1] + } else { + // 模式2: comment_id变量带JsDecode(在文件中发现) + commentIDJsDecodeRegex := regexp.MustCompile(`comment_id\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`) + if match := commentIDJsDecodeRegex.FindStringSubmatch(content); len(match) > 1 { + commentID = match[1] + } + // 模式3: JSON对象中的comment_id字段 + jsonCommentIDRegex := regexp.MustCompile(`"comment_id"\s*:\s*['"]([^'"]+)['"]`) + if match := jsonCommentIDRegex.FindStringSubmatch(content); len(match) > 1 { + commentID = match[1] + } + } + + // 提取req_id - 增强版 + reqID := "" + // 模式1: 标准req_id变量 + reqIDRegex := regexp.MustCompile(`var req_id\s*=\s*['"](\d+)['"]`) + if match := reqIDRegex.FindStringSubmatch(content); len(match) > 1 { + reqID = match[1] + } else { + // 模式2: req_id变量的其他格式 + reqIDAltRegex := regexp.MustCompile(`req_id\s*=\s*['"]([^'"]+)['"]`) + if match := reqIDAltRegex.FindStringSubmatch(content); len(match) > 1 { + reqID = match[1] + } + // 模式3: JSON对象中的req_id字段 + jsonReqIDRegex := regexp.MustCompile(`"req_id"\s*:\s*['"]([^'"]+)['"]`) + if match := jsonReqIDRegex.FindStringSubmatch(content); len(match) > 1 { + reqID = match[1] + } + } + + // 提取文章文本内容 - 全新策略,专注于微信文章核心内容结构 + textContent := []string{} + + // 改进内容提取策略 - 全新的优先级顺序,专注于微信文章特有的内容结构 + var rawContent string + + // 方法1: 从微信文章特定的数据结构提取(最高优先级) + // 1.1 尝试从var content变量直接提取(微信文章常用的内容存储方式) + varContentRegex := regexp.MustCompile(`var\s+content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`) + if match := varContentRegex.FindStringSubmatch(content); len(match) > 1 { + rawContent = match[1] + // 移除引号 + if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' { + rawContent = rawContent[1 : len(rawContent)-1] + } + } else if rawContent == "" { + // 1.2 尝试从rich_media_content变量提取 + richMediaVarRegex := 
regexp.MustCompile(`var\s+rich_media_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`) + if match := richMediaVarRegex.FindStringSubmatch(content); len(match) > 1 { + rawContent = match[1] + // 移除引号 + if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' { + rawContent = rawContent[1 : len(rawContent)-1] + } + } + } else if rawContent == "" { + // 1.3 尝试从js_content变量提取 + jsContentVarRegex := regexp.MustCompile(`var\s+js_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`) + if match := jsContentVarRegex.FindStringSubmatch(content); len(match) > 1 { + rawContent = match[1] + // 移除引号 + if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' { + rawContent = rawContent[1 : len(rawContent)-1] + } + } + } + + // 方法2: 从HTML DOM结构中直接提取(次优先级) + if rawContent == "" { + // 2.1 优先查找rich_media_content类的div(微信文章核心内容容器) + richMediaClassRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/div>`) + if match := richMediaClassRegex.FindStringSubmatch(content); len(match) > 1 { + rawContent = match[1] + } else if rawContent == "" { + // 2.2 尝试查找id为js_content的元素 + jsContentIdRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/div>`) + if match := jsContentIdRegex.FindStringSubmatch(content); len(match) > 1 { + rawContent = match[1] + } + } + } + + // 方法3: 从window.appmsg对象中提取(微信文章标准数据结构) + if rawContent == "" { + appmsgRegex := regexp.MustCompile(`window\.appmsg\s*=\s*(\{[\s\S]+?\});`) + if match := appmsgRegex.FindStringSubmatch(content); len(match) > 1 { + appmsgData := match[1] + // 尝试提取content字段(多种格式) + contentPatterns := []string{ + `"content"\s*:\s*(['"](?:\\.|[^'"])*['"])`, + `content\s*=\s*(['"](?:\\.|[^'"])*['"])`, + `"content"\s*:\s*JsDecode\(['"]([^'"]+)['"]\)`, + `content\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`, + } + for _, pattern := range contentPatterns { + contentRegex := regexp.MustCompile(pattern) + if contentMatch := contentRegex.FindStringSubmatch(appmsgData); len(contentMatch) > 1 { + rawContent = contentMatch[1] + // 移除引号 + 
if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' { + rawContent = rawContent[1 : len(rawContent)-1] + } + break + } + } + } + } + + // 方法4: 从JSON格式的文章数据中提取 + if rawContent == "" { + // 查找可能包含文章内容的JSON数据块 + jsonDataRegex := regexp.MustCompile(`(?:\{"content"|\"content\")[^}]*\}`) + jsonMatches := jsonDataRegex.FindAllString(content, -1) + + for _, jsonMatch := range jsonMatches { + // 尝试解析JSON + var jsonObj map[string]interface{} + if err := json.Unmarshal([]byte(jsonMatch), &jsonObj); err == nil { + if contentStr, ok := jsonObj["content"].(string); ok && contentStr != "" { + rawContent = contentStr + break + } + } + } + } + + // 方法5: 尝试从微信文章特有的段落结构提取 + if rawContent == "" { + // 查找带有rich_media_p类的p标签(微信文章特有的段落样式) + pTagsRegex := regexp.MustCompile(`(?s)([\s\S]*?)<\/p>`) + if matches := pTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 0 { + // 如果找到多个p标签,合并它们的内容 + var combinedContent strings.Builder + for _, match := range matches { + if len(match) > 1 { + combinedContent.WriteString(match[1]) + combinedContent.WriteString("\n") + } + } + rawContent = combinedContent.String() + } else { + // 尝试一般的p标签,这是微信文章的备用段落格式 + generalPTagsRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/p>`) + if matches := generalPTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 10 { // 至少10个p标签才可能是文章内容 + var combinedContent strings.Builder + for _, match := range matches { + if len(match) > 1 { + combinedContent.WriteString(match[1]) + combinedContent.WriteString("\n") + } + } + rawContent = combinedContent.String() + } + } + } + + // 方法6: 从article或section标签提取(HTML5标准内容容器) + if rawContent == "" { + // 分别处理article和section标签 + articleRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/article>`) + if match := articleRegex.FindStringSubmatch(content); len(match) > 1 { + // 检查提取的内容是否真的包含文章正文(而不是JavaScript代码) + articleContent := match[1] + if w.calculateChineseDensity(articleContent) > 0.2 { + rawContent = articleContent + } + } else { + 
sectionRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/section>`) + if match := sectionRegex.FindStringSubmatch(content); len(match) > 1 { + // 检查提取的内容是否真的包含文章正文 + sectionContent := match[1] + if w.calculateChineseDensity(sectionContent) > 0.2 { + rawContent = sectionContent + } + } + } + } + + // 方法7: 基于中文密度的段落提取(备用方法) + if rawContent == "" { + // 将内容分成较小的块进行检查,使用更简单的正则表达式 + contentBlocks := regexp.MustCompile(`[\S\s]+?`).FindAllString(content, -1) + + var bestContent string + var bestScore float64 + + for _, block := range contentBlocks { + // 计算中文密度(使用现有的calculateChineseDensity方法) + density := w.calculateChineseDensity(block) + // 计算JavaScript关键词数量 + jsCount := w.jsKeywordCount(block) + // 计算中文字符总数 + chineseCount := 0 + for _, char := range block { + if char >= 0x4e00 && char <= 0x9fa5 { + chineseCount++ + } + } + + // 计算综合评分:中文密度高且JavaScript关键词少的内容得分更高 + score := density * float64(chineseCount) / (float64(jsCount) + 1.0) + + // 只有中文密度足够高且JavaScript关键词较少的内容才考虑 + if density > 0.4 && jsCount < 10 && chineseCount > 100 && score > bestScore { + bestScore = score + bestContent = block + } + } + + if bestContent != "" { + rawContent = bestContent + } + } + + // 方法8: 从JavaScript字符串中提取HTML内容(备用方法) + if rawContent == "" { + // 查找可能包含HTML内容的长字符串 + longStringRegex := regexp.MustCompile(`['"]([^'"]{200,})['"]`) + matches := longStringRegex.FindAllStringSubmatch(content, -1) + + for _, match := range matches { + if len(match) > 1 { + // 先进行预检查,排除明显的JavaScript代码 + candidate := match[1] + if w.jsKeywordCount(candidate) > 20 { + continue // 跳过JavaScript代码过多的候选内容 + } + + // 尝试解码可能的URL编码内容 + decoded := candidate + for i := 0; i < 3; i++ { // 最多解码3次 + if d, err := url.QueryUnescape(decoded); err == nil && d != decoded { + decoded = d + } else { + break + } + } + + // 检查是否包含常见的HTML标签且中文密度足够高 + hasHTMLTags := strings.Contains(decoded, "

") || strings.Contains(decoded, "") || strings.Contains(decoded, "<p>") || + strings.Contains(decoded, "<div") || strings.Contains(decoded, "<br>") + + // 计算解码后的中文密度 + density := w.calculateChineseDensity(decoded) + + // 同时满足有HTML标签和足够的中文密度 + if hasHTMLTags && density > 0.3 { + rawContent = decoded + break + } + } + } + } + + // 预处理rawContent(如果已找到) + if rawContent != "" { + // 首先进行多次URL解码,处理嵌套编码 + for i := 0; i < 3; i++ { // 最多解码3次 + if decoded, err := url.QueryUnescape(rawContent); err == nil && decoded != rawContent { + rawContent = decoded + } else { + break + } + } + + // 替换HTML实体 + rawContent = strings.ReplaceAll(rawContent, "<", "<") + rawContent = strings.ReplaceAll(rawContent, ">", ">") + rawContent = strings.ReplaceAll(rawContent, """, "\"") + rawContent = strings.ReplaceAll(rawContent, "&", "&") + rawContent = strings.ReplaceAll(rawContent, "\\n", "") + rawContent = strings.ReplaceAll(rawContent, "\\r", "") + rawContent = strings.ReplaceAll(rawContent, "\\t", "") + rawContent = strings.ReplaceAll(rawContent, "\\\"", "\"") // 处理转义的双引号 + } + + // 如果找到了内容,进行清理 + if rawContent != "" { + // 移除HTML标签 + tagRegex := regexp.MustCompile(`<[^>]*>`) + cleanText := tagRegex.ReplaceAllString(rawContent, "") + + // 应用JavaScript大段过滤 + cleanText = w.filterJavaScriptBlocks(cleanText) + + // 移除多余的空白字符 + spaceRegex := regexp.MustCompile(`\s+`) + cleanText = spaceRegex.ReplaceAllString(cleanText, " ") + cleanText = strings.TrimSpace(cleanText) + + // 检查是否包含过多的JavaScript代码特征 + jsCount := w.jsKeywordCount(cleanText) + chineseDensity := w.calculateChineseDensity(cleanText) + + // 移除明显的JavaScript代码块 - 增强版,特别针对微信平台代码 + // 1. 
移除WX_BJ_REPORT相关代码 + cleanText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*if\s*\(WX_BJ_REPORT\)[^;]*;`).ReplaceAllString(cleanText, "") + + // 2. 移除BadJs相关代码 + cleanText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*if\s*\(BadJs\)[^;]*;`).ReplaceAllString(cleanText, "") + + // 3. 移除window.logs相关代码 + cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*\..*?;`).ReplaceAllString(cleanText, "") + + // 4. 移除__moon_initcallback相关代码 + cleanText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*\.__moon_initcallback\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + + // 5. 移除try-catch块 + cleanText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + + // 6. 移除函数定义 + cleanText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + + // 7. 移除IIFE函数 + cleanText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(cleanText, "") + + // 8. 
移除变量声明 + cleanText = regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "") + + // 9. 移除控制流语句 + cleanText = regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*for\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*while\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + + // 10. 移除JSON和数组 + cleanText = regexp.MustCompile(`(?s)\s*\{\s*"[^"]*"\s*:\s*[^}]*\}\s*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*\[\s*[^\]]*\]\s*`).ReplaceAllString(cleanText, "") + + // 11. 移除网络请求相关代码 + cleanText = regexp.MustCompile(`(?s)\s*new\s+XMLHttpRequest\(\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*xmlobj\s*\.[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*fetch\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*axios\s*\.[^;]*;`).ReplaceAllString(cleanText, "") + + // 12. 移除正则表达式和调试代码 + cleanText = regexp.MustCompile(`(?s)\s*new\s+RegExp\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*\/[^/]*\/[gimuy]*`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*console\.[a-z]+\([^)]*\);`).ReplaceAllString(cleanText, "") + + // 13. 
移除事件处理相关代码 + cleanText = regexp.MustCompile(`(?s)\s*document\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*window\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*on\$1\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + + // 14. 移除定时器相关代码 + cleanText = regexp.MustCompile(`(?s)\s*setTimeout\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*setInterval\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + + // 15. 移除微信特有的API调用 + cleanText = regexp.MustCompile(`(?s)\s*WeixinJSBridge\s*\..*?;`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*wx\.\w+\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "") + + // 16. 移除logsPagetime相关代码 + cleanText = regexp.MustCompile(`(?s)\s*logsPagetime\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*var\s+logsPagetime\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "") + + // 17. 移除特定的微信错误处理代码 + cleanText = regexp.MustCompile(`(?s)\s*\.error\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*\.warn\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*\.info\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + + // 18. 
移除微信平台特定的方法调用 + cleanText = regexp.MustCompile(`(?s)\s*document\.write\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + cleanText = regexp.MustCompile(`(?s)\s*document\.writeln\s*\([^)]*\);`).ReplaceAllString(cleanText, "") + + // 如果JavaScript关键词较少且中文密度较高,可能是有效的文章内容 + if (jsCount < 5 || chineseDensity > 0.3) && len(cleanText) > 50 { + // 按句子或段落分割,避免一行过长 + if len(cleanText) > 0 { + // 首先尝试按段落分割 + paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(cleanText, -1) + + // 重组段落,保留标点符号 + punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(cleanText, -1) + + for i := 0; i < len(paragraphs); i++ { + if paragraphs[i] != "" { + if i < len(punctuations) { + paragraphs[i] += punctuations[i] + } + // 只添加非空且长度合理的段落(避免添加JavaScript片段) + paragraph := strings.TrimSpace(paragraphs[i]) + // 增强过滤条件,避免JavaScript片段,同时考虑中文密度 + paraDensity := w.calculateChineseDensity(paragraph) + paraJsCount := w.jsKeywordCount(paragraph) + if len(paragraph) > 15 && + !strings.Contains(paragraph, "{") && + !strings.Contains(paragraph, "}") && + !strings.Contains(paragraph, "function") && + !strings.Contains(paragraph, "var") && + !strings.Contains(paragraph, "window.") && + !strings.Contains(paragraph, "WX_BJ_REPORT") && + !strings.Contains(paragraph, "BadJs") && + (paraJsCount < 2 || paraDensity > 0.4) { // 根据中文密度调整JavaScript关键词容忍度 + textContent = append(textContent, paragraph) + } + } + } + + // 如果没有成功分割成段落,直接添加整个文本 + if len(textContent) == 0 && len(cleanText) > 50 && (w.jsKeywordCount(cleanText) < 3 || chineseDensity > 0.5) { + textContent = append(textContent, cleanText) + } + } + } + } + + // 最后的备选方案:尝试从整个页面中提取非JavaScript的文本内容 + if len(textContent) == 0 { + // 移除所有HTML标签 + allText := regexp.MustCompile(`<[^>]*>`).ReplaceAllString(content, "") + + // 应用增强的JavaScript代码块过滤 + allText = w.filterJavaScriptBlocks(allText) + + // 进一步清理特定模式 + allText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(allText, "") + allText = 
regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "") + allText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(allText, "") + + // 使用中文文本提取作为最后手段 + allText = w.extractChineseText(allText) + + // 清理空白字符 + spaceRegex := regexp.MustCompile(`\s+`) + allText = spaceRegex.ReplaceAllString(allText, " ") + allText = strings.TrimSpace(allText) + + // 尝试按句子分割 + if allText != "" && len(allText) > 100 { + sentences := regexp.MustCompile(`[。!?.!?]\s*`).Split(allText, -1) + punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(allText, -1) + + for i := 0; i < len(sentences); i++ { + if sentences[i] != "" { + if i < len(punctuations) { + sentences[i] += punctuations[i] + } + paragraph := strings.TrimSpace(sentences[i]) + // 过滤掉JavaScript代码和过短的内容,同时考虑中文密度 + if len(paragraph) > 20 && (w.jsKeywordCount(paragraph) < 3 || w.calculateChineseDensity(paragraph) > 0.4) { + textContent = append(textContent, paragraph) + } + } + } + } + } + + // 对提取的内容应用最终过滤,确保只保留真正的文章正文 + filteredContent := w.finalContentFilter(textContent) + return createTime, title, commentID, reqID, w.extractAuthor(content), filteredContent +} + +// calculateChineseDensity 计算文本中中文字符的密度 +func (w *WechatCrawler) calculateChineseDensity(text string) float64 { + if len(text) == 0 { + return 0 + } + // 使用正确的Go语言Unicode范围表示法 + chineseCount := 0 + for _, char := range text { + if 
char >= 0x4e00 && char <= 0x9fa5 {
			chineseCount++
		}
	}
	// Density is measured per character, not per byte. A CJK character takes
	// three bytes in UTF-8, so dividing by len(text) capped an all-Chinese
	// string at roughly 0.33 and made the 0.4/0.5/0.6 thresholds used by the
	// content filters unreachable.
	return float64(chineseCount) / float64(len([]rune(text)))
}

// filterJavaScriptBlocks removes large JavaScript fragments from text.
//
// The patterns are applied in order, each one to the output of the previous
// one. A pattern that fails to compile is skipped rather than reported.
func (w *WechatCrawler) filterJavaScriptBlocks(text string) string {
	patterns := []string{
		// Named function declarations.
		`(?s)function\s+[a-zA-Z_$][\w$]*\s*\([^)]*\)\s*{[^}]*}`,
		// Immediately invoked anonymous functions.
		`(?s)\(\s*function\s*\([^)]*\)\s*{[^}]*}\s*\)\s*\(\s*\)`,
		// Object literals.
		`(?s)\{[^}]*\}`,
		// Array literals.
		`(?s)\[[^\]]*\]`,
		// Line and block comments.
		`//[^\n]*`,
		`/\*[^*]*\*/`,
		// WeChat-specific error-reporting code.
		`(?s)WX_BJ_REPORT[^;]*;`,
		`(?s)BadJs[^;]*;`,
		`(?s)window\.[a-zA-Z_$][\w$]*[^;]*;`,
		// XMLHttpRequest-related code.
		`(?s)xmlobj[^;]*;`,
		`(?s)new\s+Image\([^)]*\)`,
		`(?s)setRequestHeader[^;]*;`,
		// Regular-expression literals.
		`/[^/]*\/[gimuy]*`,
	}

	cleaned := text
	for _, pattern := range patterns {
		re, err := regexp.Compile(pattern)
		if err != nil {
			continue
		}
		cleaned = re.ReplaceAllString(cleaned, "")
	}
	return cleaned
}

// extractChineseText keeps CJK ideographs, punctuation, digits, letters,
// newlines and spaces, and drops every other character.
func (w *WechatCrawler) extractChineseText(text string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 0x4e00 && r <= 0x9fa5,
			unicode.IsPunct(r),
			unicode.IsDigit(r),
			unicode.IsLetter(r),
			r == '\n', r == ' ':
			return r
		}
		return -1 // a negative rune tells strings.Map to drop the character
	}, text)
}

// finalContentFilter is the last filtering pass over extracted content. It
// strips remaining JavaScript, splits the text into sentences, and keeps only
// the sentences that look like article prose. The kept sentences are joined
// with blank lines; the result is "" when nothing qualifies.
//
// NOTE(review): the visible caller in ExtractArticleInfo appears to pass a
// []string here while the parameter is a string — confirm the intended type.
func (w *WechatCrawler) finalContentFilter(text string) string {
	// Step 1: remove obvious JavaScript. Each pattern runs on the output of
	// the previous one, so the order is significant.
	scriptPatterns := []string{
		// WX_BJ_REPORT calls, definitions and guards.
		`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);|\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(WX_BJ_REPORT\)[^;]*;`,
		// BadJs calls, definitions and guards.
		`(?s)\s*BadJs\s*\([^)]*\);|\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(BadJs\)[^;]*;`,
		// window.logs assignments and member accesses.
		`(?s)\s*window\.logs\s*=\s*\[.*?\];|\s*window\.logs\s*\..*?;`,
		// Function definitions.
		`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*|\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`,
		// Variable declarations.
		`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?|\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`,
		// Control-flow statements.
		`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*|\s*for\s*\([^)]*\)\s*{[^}]*}\s*|\s*while\s*\([^)]*\)\s*{[^}]*}\s*`,
	}
	for _, pattern := range scriptPatterns {
		text = regexp.MustCompile(pattern).ReplaceAllString(text, "")
	}

	countChinese := func(s string) int {
		n := 0
		for _, r := range s {
			if r >= 0x4e00 && r <= 0x9fa5 {
				n++
			}
		}
		return n
	}
	// Substrings that mark a sentence as leftover script.
	scriptMarkers := []string{"{", "}", "function", "var", "window.", "WX_BJ_REPORT", "BadJs"}
	hasScriptMarker := func(s string) bool {
		for _, marker := range scriptMarkers {
			if strings.Contains(s, marker) {
				return true
			}
		}
		return false
	}

	// Step 2: split into sentences. Full-width "!" and "?" are written as
	// \x{FF01} and \x{FF1F} so they cannot be folded to their ASCII forms.
	sentenceEnd := regexp.MustCompile(`[。!?.\x{FF01}\x{FF1F}]\s*`)
	pieces := sentenceEnd.Split(text, -1)
	terminators := sentenceEnd.FindAllString(text, -1)

	var validParagraphs []string
	for i, piece := range pieces {
		if piece == "" {
			continue
		}
		// Split drops the terminator; piece i is followed by terminator i.
		if i < len(terminators) {
			piece += terminators[i]
		}
		paragraph := strings.TrimSpace(piece)

		// Strict rule: long enough, no script markers, mostly Chinese, and
		// almost no JavaScript keywords.
		if len(paragraph) > 25 &&
			!hasScriptMarker(paragraph) &&
			countChinese(paragraph) > 15 &&
			w.calculateChineseDensity(paragraph) > 0.4 &&
			w.jsKeywordCount(paragraph) < 3 {
			validParagraphs = append(validParagraphs, paragraph)
		}
	}

	// Step 3: if no sentence qualified, accept the whole text under a looser
	// rule: very high Chinese density and few JavaScript keywords.
	if len(validParagraphs) == 0 &&
		w.calculateChineseDensity(text) > 0.6 &&
		w.jsKeywordCount(text) < 5 &&
		countChinese(text) > 100 {
		validParagraphs = append(validParagraphs, text)
	}

	return strings.Join(validParagraphs, "\n\n")
}

// jsKeywordCount returns a weighted count of JavaScript-looking keywords in
// text. Matching is case-insensitive. A keyword that appears in several groups
// is counted once per group, and overlapping keywords (for example "var " and
// "var") each contribute.
func (w *WechatCrawler) jsKeywordCount(text string) int {
	groups := []struct {
		weight   int
		keywords []string
	}{
		{
			// High-priority markers.
			weight: 3,
			keywords: []string{
				"logs = ", "window.", "LANG = ", "extInfo:", "pagetime[",
				"BadJs;", "sec_open=", "xmlobj = ", "addEventListener",
				"new Image()", "setRequestHeader", "onreadystatechange",
				"var ", "let ", "const ", "function ", "return ",
			},
		},
		{
			// WeChat-platform markers, logging calls and network requests.
			weight: 2,
			keywords: []string{
				"WX_BJ_REPORT", "BadJs", "__moon_initcallback", "logsPagetime",
				"WeixinJSBridge", "wx.", "document.write", "document.writeln",
				".error(", ".warn(", ".info(", ".debug(",
				"XMLHttpRequest", "fetch(", "axios.", "xmlobj.",
			},
		},
		{
			// Basic JavaScript keywords.
			weight: 1,
			keywords: []string{
				"function", "var", "let", "const", "if(", "else", "for(", "while(",
				"return", "setTimeout", "setInterval", "WeixinJSBridge", "JSON",
				"console", "document", "window", "try{", "catch(", "throw",
			},
		},
	}

	lowerText := strings.ToLower(text)
	count := 0
	for _, group := range groups {
		for _, keyword := range group.keywords {
			count += strings.Count(lowerText, strings.ToLower(keyword)) * group.weight
		}
	}
	return count
}

// extractAuthor returns the article author found in the page source, or ""
// when no pattern matches. The first pattern that matches wins. Its captured
// value has common HTML entities decoded and is then URL-unescaped when that
// succeeds.
func (w *WechatCrawler) extractAuthor(content string) string {
	authorPatterns := []string{
		`var author\s*=\s*['"](.*?)['"]`,
		`"author"\s*:\s*['"](.*?)['"]`,
		`window\.author\s*=\s*['"](.*?)['"]`,
		// NOTE(review): this fourth pattern is a reconstruction of an
		// HTML-tag pattern whose original text is not recoverable — confirm.
		`<meta\s+name=['"]author['"]\s+content=['"](.*?)['"]`,
	}
	// Decode in a single pass so "&amp;lt;" becomes "&lt;" and not "<".
	entityDecoder := strings.NewReplacer(
		"&quot;", "\"",
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
	)

	for _, pattern := range authorPatterns {
		match := regexp.MustCompile(pattern).FindStringSubmatch(content)
		if len(match) <= 1 {
			continue
		}
		author := entityDecoder.Replace(match[1])
		if decoded, err := url.QueryUnescape(author); err == nil {
			author = decoded
		}
		return author
	}
	return ""
}

// GetArticleStats requests the read/like/share counters of an article.
//
// mid, sn and idx are taken from the query string of link. title, commentID
// and reqID are forwarded as form fields; createTime is currently unused.
// The returned map always holds the keys "read_num", "old_like_num",
// "share_num" and "show_read"; a counter missing from the response stays "0".
// An error is returned when the request fails or the response is not JSON.
func (w *WechatCrawler) GetArticleStats(link string, title string, commentID string, reqID string, createTime string) (map[string]string, error) {
	// queryParam reads one query parameter from link. The value ends at the
	// next '&', at '#', or at the end of the link, so a parameter in last
	// position is found too. ';' is accepted as a separator so links that
	// still contain "&amp;" keep working.
	queryParam := func(name string) string {
		re := regexp.MustCompile(`(?:^|[?&;])` + regexp.QuoteMeta(name) + `=([^&#]*)`)
		if match := re.FindStringSubmatch(link); len(match) > 1 {
			return match[1]
		}
		return ""
	}
	mid := queryParam("mid")
	sn := queryParam("sn")
	idx := queryParam("idx")

	// Pseudo-random "r" value derived from the clock.
	r := fmt.Sprintf("0.%d", time.Now().UnixNano()%10000000000000000)

	detailURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&fasttmplajax=1&f=json&uin=%s&key=%s&pass_ticket=%s&__biz=%s",
		w.uin, w.key, w.passTicket, w.biz)

	data := map[string]string{
		"r":                r,
		"sn":               sn,
		"mid":              mid,
		"idx":              idx,
		"req_id":           reqID,
		"title":            title,
		"comment_id":       commentID,
		"appmsg_type":      "9",
		"__biz":            w.biz,
		"pass_ticket":      w.passTicket,
		"abtest_cookie":    "",
		"devicetype":       "Windows 7 x64",
		"version":          "63090b13",
		"is_need_ticket":   "0",
		"is_need_ad":       "0",
		"is_need_reward":   "0",
		"both_ad":          "0",
		"reward_uin_count": "0",
		"send_time":        "",
		"msg_daily_idx":    "1",
		"is_original":      "0",
		"is_only_read":     "1",
		"scene":            "38",
	}

	resp, err := w.client.R().SetFormData(data).Post(detailURL)
	if err != nil {
		return nil, fmt.Errorf("请求统计信息失败: %w", err)
	}

	var result map[string]interface{}
	if err := json.Unmarshal([]byte(resp.String()), &result); err != nil {
		return nil, fmt.Errorf("解析统计信息失败: %w", err)
	}

	// Start from zeros and overwrite whatever the response provides.
	statKeys := []string{"read_num", "old_like_num", "share_num", "show_read"}
	stats := make(map[string]string, len(statKeys))
	for _, key := range statKeys {
		stats[key] = "0"
	}
	if appMsgStat, ok := result["appmsgstat"].(map[string]interface{}); ok {
		for _, key := range statKeys {
			// encoding/json decodes every JSON number into float64.
			if value, ok := appMsgStat[key].(float64); ok {
				stats[key] = fmt.Sprintf("%.0f", value)
			}
		}
	}

	return stats, nil
}

// GetArticleComments requests the comments of an article. It returns the
// comment texts and their like counts as two parallel slices.
//
// This is a simplified implementation: the response is decoded and checked
// for validity, but no comments are extracted from it yet, so both slices are
// always empty. An empty commentID short-circuits without a request.
//
// NOTE(review): the request URL carries a fixed appmsgid and idx — confirm
// whether they should come from the article being queried.
func (w *WechatCrawler) GetArticleComments(commentID string) ([]string, []string, error) {
	if commentID == "" {
		return []string{}, []string{}, nil
	}

	commentURL := fmt.Sprintf(
		"https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=%s&appmsgid=2247491372&idx=1&comment_id=%s&offset=0&limit=100&uin=%s&key=%s&pass_ticket=%s&wxtoken=&devicetype=Windows+10&clientversion=62060833&appmsg_token=",
		w.biz, commentID, w.uin, w.key, w.passTicket)

	resp, err := w.client.R().Get(commentURL)
	if err != nil {
		return []string{}, []string{}, fmt.Errorf("获取评论失败: %w", err)
	}

	var result map[string]interface{}
	if err := json.Unmarshal([]byte(resp.String()), &result); err != nil {
		return []string{}, []string{}, fmt.Errorf("解析评论失败: %w", err)
	}

	// Extraction from the decoded JSON is not implemented yet.
	var comments []string
	var commentLikes []string
	return comments, commentLikes, nil
}

// GetOfficialAccountLinkFromArticle derives the home-page URL of the official
// account that published articleURL. The __biz identifier is read from the
// URL itself when present; otherwise the article is downloaded and its source
// searched. On success the identifier is also stored in w.biz.
func (w *WechatCrawler) GetOfficialAccountLinkFromArticle(articleURL string) (string, error) {
	homePage := func(biz string) string {
		return fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", biz)
	}

	// Fast path: __biz is part of the URL.
	if match := regexp.MustCompile(`__biz=([^&]+)`).FindStringSubmatch(articleURL); len(match) >= 2 {
		w.biz = match[1]
		return homePage(w.biz), nil
	}

	// Slow path: search the article source.
	content, err := w.GetOneArticle(articleURL)
	if err != nil {
		return "", fmt.Errorf("获取文章内容失败: %w", err)
	}

	contentMatch := regexp.MustCompile(`var biz = "(.*?);`).FindStringSubmatch(content)
	if len(contentMatch) < 2 {
		// Fall back to a __biz query parameter embedded in the page.
		contentMatch = regexp.MustCompile(`__biz=(.*?)&`).FindStringSubmatch(content)
		if len(contentMatch) < 2 {
			return "", fmt.Errorf("无法从文章链接和内容中提取公众号信息")
		}
	}

	// Strip " || " separators and stray quotes from the captured value.
	biz := strings.ReplaceAll(contentMatch[1], " || ", "")
	biz = strings.ReplaceAll(biz, "\"", "")

	w.biz = biz
	return homePage(biz), nil
}

// GetArticleList pages through the account's article list, ten entries per
// request, until GetNextList reports no further pages. It returns the
// collected rows with their links passed through TransformLinks. When a page
// request fails, the rows gathered so far are returned with the error.
func (w *WechatCrawler) GetArticleList() ([][]string, error) {
	var allArticles [][]string

	for offset := 0; ; offset += 10 {
		fmt.Printf("正在获取第%d页文章...\n", offset/10+1)
		result, err := w.GetNextList(offset)
		if err != nil {
			return allArticles, fmt.Errorf("获取文章列表失败: %w", err)
		}

		// m_flag missing or zero means there is nothing more to fetch.
		if mFlag, ok := result["m_flag"].(float64); !ok || mFlag == 0 {
			break
		}

		passageList, ok := result["passage_list"].([][]string)
		if !ok {
			return allArticles, fmt.Errorf("文章列表格式错误")
		}
		allArticles = append(allArticles, passageList...)

		// Throttle to avoid being blocked. The pause grows with the page
		// number and uses the offset of the next page, as before.
		time.Sleep(time.Duration(2000+offset+10) * time.Millisecond)
	}

	transformedArticles := w.TransformLinks(allArticles)

	fmt.Printf("共获取到%d篇文章\n", len(transformedArticles))
	return transformedArticles, nil
}

// SaveArticleListToExcel writes articleList as comma-separated text to
// "<officialPath>/文章列表(article_list)_直连链接.txt", creating the directory
// when needed. Despite the name, the output is a plain text file. Rows with
// fewer than four fields are skipped. nickname is currently unused.
func (w *WechatCrawler) SaveArticleListToExcel(officialPath string, articleList [][]string, nickname string) error {
	if err := os.MkdirAll(officialPath, 0755); err != nil {
		return fmt.Errorf("创建目录失败: %w", err)
	}

	filePath := fmt.Sprintf("%s/文章列表(article_list)_直连链接.txt", officialPath)

	var content strings.Builder
	content.WriteString("序号,创建时间,标题,链接\n")
	for i, article := range articleList {
		// Fields 1-3 are read below; a short row would otherwise panic.
		if len(article) < 4 {
			continue
		}
		fmt.Fprintf(&content, "%d,%s,%s,%s\n", i+1, article[1], article[2], article[3])
	}

	if err := os.WriteFile(filePath, []byte(content.String()), 0644); err != nil {
		return fmt.Errorf("保存文章列表失败: %w", err)
	}

	fmt.Printf("文章列表已保存到: %s\n", filePath)
	return nil
}

// TransformLinks returns a copy of articleList in which every "amp;" has been
// removed from the link in field 3, making the link directly usable. Rows
// with fewer than four fields are dropped, and fields beyond the fourth are
// not copied.
func (w *WechatCrawler) TransformLinks(articleList [][]string) [][]string {
	transformedList := make([][]string, 0, len(articleList))
	for _, article := range articleList {
		if len(article) < 4 {
			continue
		}
		directLink := strings.ReplaceAll(article[3], "amp;", "")
		transformedList = append(transformedList, []string{article[0], article[1], article[2], directLink})
	}
	return transformedList
}

// ReadArticleLinksFromExcel is a placeholder: it ignores filePath and always
// returns an empty list.
func (w *WechatCrawler) ReadArticleLinksFromExcel(filePath string) ([]string, error) {
	return []string{}, nil
}

// GetArticleDetail downloads one article and assembles its metadata, text,
// statistics and comments. Only a failed download is returned as an error.
// A failed statistics request is replaced by zero counters.
func (w *WechatCrawler) GetArticleDetail(link string) (*ArticleDetail, error) {
	content, err := w.GetOneArticle(link)
	if err != nil {
		return nil, err
	}

	createTime, title, commentID, reqID, _, textContent := w.ExtractArticleInfo(content)
	accountName := w.ExtractOfficialAccountName(content)

	stats, err := w.GetArticleStats(link, title, commentID, reqID, createTime)
	if err != nil {
		// Statistics are optional: fall back to zeros.
		stats = map[string]string{
			"read_num":     "0",
			"old_like_num": "0",
			"share_num":    "0",
			"show_read":    "0",
		}
	}

	// Comments are best-effort as well, so the error is deliberately
	// ignored and the detail is still returned.
	comments, commentLikes, _ := w.GetArticleComments(commentID)

	return &ArticleDetail{
		LocalTime:    time.Now().Format("2006-01-02 15:04:05"),
		CreateTime:   createTime,
		Title:        title,
		OfficialName: accountName,
		Link:         link,
		Content:      textContent,
		ReadCount:    stats["read_num"],
		LikeCount:    stats["old_like_num"],
		ShareCount:   stats["share_num"],
		ShowRead:     stats["show_read"],
		Comments:     comments,
		CommentLikes: commentLikes,
		CommentID:    commentID,
	}, nil
}

// articleDetailFilePath returns "<dir>/<title>_文章详情.txt". Path separators,
// characters that are invalid in file names on common file systems, and
// control characters in title are replaced by '_', so a title can neither
// point into another directory nor produce an invalid name.
func articleDetailFilePath(dir, title string) string {
	safeTitle := strings.Map(func(r rune) rune {
		if strings.ContainsRune(`/\:*?"<>|`, r) || unicode.IsControl(r) {
			return '_'
		}
		return r
	}, title)
	return fmt.Sprintf("%s/%s_文章详情.txt", dir, safeTitle)
}

// GetDetailList downloads every article in articleList and saves each one as
// a text file under officialPath. Rows with fewer than four fields are
// skipped. Rows that fail are collected and written to
// "<officialPath>/问题链接(error_links).txt". Individual failures are logged
// and counted, not returned; only a failure to create the directory is.
func (w *WechatCrawler) GetDetailList(articleList [][]string, officialPath string) error {
	if err := os.MkdirAll(officialPath, 0755); err != nil {
		return fmt.Errorf("创建目录失败: %w", err)
	}

	successCount := 0
	errorCount := 0
	var errorLinks [][]string

	for i, article := range articleList {
		if len(article) < 4 {
			continue
		}
		title, link := article[2], article[3]

		fmt.Printf("正在处理第%d篇文章: %s\n", i+1, title)

		detail, err := w.GetArticleDetail(link)
		if err != nil {
			fmt.Printf("获取文章详情失败: %v\n", err)
			errorCount++
			errorLinks = append(errorLinks, article)
			continue
		}

		// The file is named after the (sanitized) article title.
		filePath := articleDetailFilePath(officialPath, detail.Title)
		if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
			fmt.Printf("保存文章详情失败: %v\n", err)
			errorCount++
			errorLinks = append(errorLinks, article)
			continue
		}

		successCount++
		fmt.Printf("文章详情保存成功: %s\n", detail.Title)

		// Throttle to avoid being blocked: 3 s base, plus 100 ms per index.
		time.Sleep(time.Duration(3000+i*100) * time.Millisecond)
	}

	if len(errorLinks) > 0 {
		errorPath := fmt.Sprintf("%s/问题链接(error_links).txt", officialPath)
		var content strings.Builder
		content.WriteString("序号,创建时间,标题,链接\n")
		for i, link := range errorLinks {
			fmt.Fprintf(&content, "%d,%s,%s,%s\n", i+1, link[1], link[2], link[3])
		}
		if err := os.WriteFile(errorPath, []byte(content.String()), 0644); err != nil {
			fmt.Printf("保存错误链接失败: %v\n", err)
		}
	}

	fmt.Printf("文章详情获取完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
	return nil
}

// SaveArticleDetailToExcel writes article to filePath as plain text: a header
// of labelled fields followed by the content lines. Despite the name, the
// output is a text file.
func (w *WechatCrawler) SaveArticleDetailToExcel(article *ArticleDetail, filePath string) error {
	var content strings.Builder

	fmt.Fprintf(&content, "本地创建时间: %s\n", article.LocalTime)
	fmt.Fprintf(&content, "文章发布时间: %s\n", article.CreateTime)
	fmt.Fprintf(&content, "公众号名称: %s\n", article.OfficialName)
	fmt.Fprintf(&content, "文章标题: %s\n", article.Title)
	fmt.Fprintf(&content, "文章链接: %s\n", article.Link)
	fmt.Fprintf(&content, "阅读量: %s\n", article.ReadCount)
	fmt.Fprintf(&content, "点赞数: %s\n", article.LikeCount)
	fmt.Fprintf(&content, "转发数: %s\n", article.ShareCount)
	fmt.Fprintf(&content, "在看数: %s\n", article.ShowRead)
	content.WriteString("\n文章内容:\n")

	for _, line := range article.Content {
		content.WriteString(line)
		content.WriteString("\n")
	}

	return os.WriteFile(filePath, []byte(content.String()), 0644)
}

// GetListArticleFromFile downloads every article listed in a previously saved
// article-list file.
//
// nameLink is either an official-account name or an article link; any value
// containing "http" is treated as a link and the account name is resolved
// from it. The list is read from
// "./data/公众号----<name>/文章列表(article_list)_直连链接.txt". When
// contentSaveFlag is set, each article is saved next to the list.
// imgSaveFlag only prints a notice because image saving is not implemented.
// Per-article failures are logged and counted, not returned.
func (w *WechatCrawler) GetListArticleFromFile(nameLink string, imgSaveFlag bool, contentSaveFlag bool) error {
	// 1. Resolve the account name.
	nickname := nameLink
	if strings.Contains(nameLink, "http") {
		fmt.Println("检测到输入为链接,开始获取公众号名称")
		// Called for its side effect of storing the account id in w.biz.
		if _, err := w.GetOfficialAccountLinkFromArticle(nameLink); err != nil {
			return fmt.Errorf("获取公众号信息失败: %w", err)
		}
		name, err := w.GetOfficialAccountName()
		if err != nil {
			return fmt.Errorf("获取公众号名称失败: %w", err)
		}
		nickname = name
		fmt.Printf("获取到公众号名称: %s\n", nickname)
	} else {
		fmt.Println("检测到输入为公众号名称")
	}

	// 2. Build the paths.
	rootPath := "./data/"
	officialNamesHead := "公众号----"
	officialPath := rootPath + officialNamesHead + nickname
	articleListPath := officialPath + "/文章列表(article_list)_直连链接.txt"

	// 3. The list must already exist.
	if _, err := os.Stat(articleListPath); os.IsNotExist(err) {
		return fmt.Errorf("文件不存在,请检查目录文件: %s", articleListPath)
	}

	// 4. Read the links.
	fileContent, err := os.ReadFile(articleListPath)
	if err != nil {
		return fmt.Errorf("读取文章列表文件失败: %w", err)
	}

	var articleLinks []string
	for i, line := range strings.Split(string(fileContent), "\n") {
		// Line 0 is the header row.
		if i == 0 || line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		if len(parts) < 4 {
			continue
		}
		// The list is written without quoting, so a title that contains
		// commas adds fields. The link is always the last one.
		link := strings.TrimSpace(parts[len(parts)-1])
		link = strings.Trim(link, "\"")
		articleLinks = append(articleLinks, link)
	}

	fmt.Printf("成功读取到%d篇文章链接\n", len(articleLinks))

	// 5. Download each article.
	successCount := 0
	errorCount := 0

	for i, link := range articleLinks {
		fmt.Printf("正在处理第%d篇文章,链接: %s\n", i+1, link)

		detail, err := w.GetArticleDetail(link)
		if err != nil {
			fmt.Printf("获取文章详情失败: %v\n", err)
			errorCount++
			continue
		}

		if contentSaveFlag {
			filePath := articleDetailFilePath(officialPath, detail.Title)
			if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
				fmt.Printf("保存文章详情失败: %v\n", err)
				errorCount++
				continue
			}
		}

		// TODO: image saving is not implemented.
		if imgSaveFlag {
			fmt.Println("图片保存功能暂未实现")
		}

		successCount++
		fmt.Printf("第%d篇文章处理成功: %s\n", i+1, detail.Title)

		// Throttle to avoid being blocked.
		time.Sleep(3 * time.Second)
	}

	fmt.Printf("文章列表处理完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
	return nil
}