// NOTE(review): removed GitHub web-UI residue ("Files", file path,
// "Raw Blame History", line/size counters and the ambiguous-Unicode
// warning) that was accidentally pasted above the Go source. It is not
// valid Go and prevented the file from compiling.
package wechat
import (
"crypto/tls"
"encoding/json"
"fmt"
"net/url"
"os"
"regexp"
"strings"
"time"
"unicode"
"github.com/go-resty/resty/v2"
"github.com/wechat-crawler/configs"
)
// ArticleDetail aggregates everything extracted for one WeChat article:
// timestamps, title, source account, link, body paragraphs, engagement
// counters and the comment thread. Counters are kept as strings because
// the upstream pages expose them as text.
type ArticleDetail struct {
    LocalTime    string   `json:"local_time"`    // local wall-clock time (presumably when crawled — confirm against writer)
    CreateTime   string   `json:"create_time"`   // article publish time (unix seconds, as string)
    Title        string   `json:"title"`         // article title
    OfficialName string   `json:"official_name"` // official-account (公众号) display name
    Link         string   `json:"link"`          // article URL
    Content      []string `json:"content"`       // extracted body paragraphs
    ReadCount    string   `json:"read_count"`    // read counter
    LikeCount    string   `json:"like_count"`    // like counter
    ShareCount   string   `json:"share_count"`   // share counter
    ShowRead     string   `json:"show_read"`     // whether the read count is displayed (raw value)
    Comments     []string `json:"comments"`      // comment texts
    CommentLikes []string `json:"comment_likes"` // per-comment like counts, parallel to Comments
    CommentID    string   `json:"comment_id"`    // id used to fetch the comment thread
}
// WechatCrawler wraps an HTTP client plus the WeChat session parameters
// (uin/key/pass_ticket cookie values and the target account's __biz id)
// needed to call the mp.weixin.qq.com endpoints.
type WechatCrawler struct {
    client     *resty.Client   // shared HTTP client (TLS verification disabled — see constructors)
    Config     *configs.Config // application config; nil for instances built by NewSimpleCrawler
    uin        string          // "uin" cookie value from a logged-in WeChat session
    key        string          // "key" cookie value
    passTicket string          // "pass_ticket" cookie value
    biz        string          // target official account id (__biz query parameter)
}
// NewWechatCrawler 创建新的微信爬虫实例
// NewWechatCrawler builds a crawler bound to one official account (biz)
// using the caller's WeChat session credentials (uin, key, passTicket).
// The underlying client skips TLS certificate verification, times out
// after 10 seconds, and sends the credentials as a Cookie header on
// every request. The error result is always nil in the current
// implementation but kept for interface stability.
func NewWechatCrawler(biz string, uin string, key string, passTicket string, cfg *configs.Config) (*WechatCrawler, error) {
    httpClient := resty.New()
    httpClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
    httpClient.SetTimeout(10 * time.Second)

    // Default headers: a desktop-Chrome UA plus the session cookies.
    httpClient.SetHeaders(map[string]string{
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Cookie":     fmt.Sprintf("uin=%s; key=%s; pass_ticket=%s;", uin, key, passTicket),
    })

    crawler := &WechatCrawler{
        client:     httpClient,
        Config:     cfg,
        uin:        uin,
        key:        key,
        passTicket: passTicket,
        biz:        biz,
    }
    return crawler, nil
}
// NewSimpleCrawler 创建一个简单的微信爬虫实例不需要cookie信息仅用于获取文章链接
// NewSimpleCrawler returns a crawler with no session cookies. It can only
// fetch public pages (e.g. resolving article links); the authenticated
// list/detail endpoints require NewWechatCrawler instead. Its Config is
// nil and its biz id is empty.
func NewSimpleCrawler() *WechatCrawler {
    // Plain HTTP client: skip TLS verification, 15-second timeout.
    httpClient := resty.New()
    httpClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
    httpClient.SetTimeout(15 * time.Second)

    // Browser-like defaults; no Cookie header since there is no session.
    httpClient.SetHeaders(map[string]string{
        "User-Agent":      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Connection":      "keep-alive",
    })

    return &WechatCrawler{
        client: httpClient,
        Config: nil,
        biz:    "",
    }
}
// GetOfficialAccountName 获取公众号名称
// GetOfficialAccountName fetches the account's profile page and tries a
// series of increasingly permissive patterns to extract its display name.
// If nothing matches it falls back to a synthetic name derived from the
// biz id (or a fixed placeholder); the only hard error is a failed HTTP
// request.
func (w *WechatCrawler) GetOfficialAccountName() (string, error) {
    url := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz)
    resp, err := w.client.R().Get(url)
    if err != nil {
        return "", fmt.Errorf("获取公众号信息失败: %v", err)
    }
    content := resp.String()
    // Try multiple regex patterns to locate the account name.
    // Pattern 1: var nickname = "name".html(false) || "";
    nicknameRegex := regexp.MustCompile(`var nickname = "([^"]+)"\.html\(false\)\s*\|\|\s*""`)
    match := nicknameRegex.FindStringSubmatch(content)
    if len(match) >= 2 {
        return match[1], nil
    }
    // Pattern 2: the original plain assignment.
    nicknameRegex2 := regexp.MustCompile(`var nickname = "(.*?)";`)
    match = nicknameRegex2.FindStringSubmatch(content)
    if len(match) >= 2 {
        return match[1], nil
    }
    // Pattern 3: object-literal style (nickname: "name").
    nicknameRegex3 := regexp.MustCompile(`nickname\s*:\s*"([^"]+)"`)
    match = nicknameRegex3.FindStringSubmatch(content)
    if len(match) >= 2 {
        return match[1], nil
    }
    // Pattern 4: JSON field ("nickname":"name").
    nicknameRegex4 := regexp.MustCompile(`"nickname":"([^"]+)"`)
    match = nicknameRegex4.FindStringSubmatch(content)
    if len(match) >= 2 {
        return match[1], nil
    }
    // Pattern 5: the HTML <title> element.
    nicknameRegex5 := regexp.MustCompile(`<title>([^<]+)<\/title>`)
    match = nicknameRegex5.FindStringSubmatch(content)
    if len(match) >= 2 {
        // Trim suffixes such as "- 微信公众号" from the title.
        title := match[1]
        if idx := strings.Index(title, "-"); idx > 0 {
            title = strings.TrimSpace(title[:idx])
        }
        return title, nil
    }
    // All patterns failed: derive a readable name from the biz id.
    if w.biz != "" {
        // Bug fix: w.biz may be shorter than 8 bytes, in which case the
        // previous unconditional w.biz[:8] slice would panic.
        prefix := w.biz
        if len(prefix) > 8 {
            prefix = prefix[:8]
        }
        return "公众号_" + prefix, nil
    }
    return "未知公众号", nil
}
// GetNextList 获取下一页文章列表
// GetNextList fetches one page of the account's article list (the
// endpoint serves 10 entries per page; offset is a page index and is
// multiplied by 10 for the request). The result map contains:
//   - "passage_list": [][]string rows of {"", createTime, title, link}
//   - "m_flag": 1 normally, 0 when the page returned no articles
// A valid logged-in session (uin/key/passTicket) is required.
func (w *WechatCrawler) GetNextList(offset int) (map[string]interface{}, error) {
    // The list endpoint requires a logged-in WeChat session.
    if w.uin == "" || w.key == "" || w.passTicket == "" {
        return nil, fmt.Errorf("no session: 需要提供微信登录状态的cookies\n请在浏览器中登录微信公众号平台后从URL中获取uin、key和pass_ticket参数")
    }
    // NOTE(review): appmsg_token is a fixed placeholder here; real tokens
    // are normally session-bound — confirm the endpoint tolerates it.
    url := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=%s&offset=%d&count=10&f=json&uin=%s&key=%s&pass_ticket=%s&appmsg_token=999999999&x5=0&f=json",
        w.biz, offset*10, w.uin, w.key, w.passTicket)
    resp, err := w.client.R().SetHeader("Referer", fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", w.biz)).Get(url)
    if err != nil {
        return nil, fmt.Errorf("获取文章列表失败: %v", err)
    }
    // Check the HTTP status before touching the body.
    if resp.StatusCode() != 200 {
        return nil, fmt.Errorf("获取文章列表失败: HTTP状态码 %d\n可能需要更新cookies或登录状态已过期", resp.StatusCode())
    }
    // Parse the response body.
    content := resp.String()
    // Normalize quoting. NOTE(review): both replacements are identity
    // no-ops ("\x22" IS `"` and "\x5c" IS `\`); kept for fidelity.
    content = strings.Replace(content, "\x22", "\"", -1)
    content = strings.Replace(content, "\x5c", "\\", -1)
    // Obvious "please log in" error pages.
    if strings.Contains(content, "请先登录") || strings.Contains(content, "登录超时") {
        return nil, fmt.Errorf("no session: 登录状态已过期或无效\n请在浏览器中重新登录微信公众号平台并更新cookies")
    }
    var result map[string]interface{}
    err = json.Unmarshal([]byte(content), &result)
    if err != nil {
        // Looser handling for non-JSON error bodies.
        if strings.Contains(content, "no session") {
            return nil, fmt.Errorf("no session: 需要有效的微信登录状态\n请在浏览器中登录微信公众号平台后从URL中获取登录参数")
        }
        // Bug fix: the response may be shorter than 100 bytes; the
        // previous unconditional content[:100] slice could panic.
        snippet := content
        if len(snippet) > 100 {
            snippet = snippet[:100]
        }
        return nil, fmt.Errorf("解析文章列表失败: %v\n响应内容: %s", err, snippet)
    }
    // Check the API-level return code.
    if ret, ok := result["ret"].(float64); ok {
        switch ret {
        case 4001:
            return nil, fmt.Errorf("获取文章列表失败: 登录状态已过期\n请更新cookies")
        case -200013:
            return nil, fmt.Errorf("获取文章列表失败: 需要验证\n请在浏览器中先访问公众号页面进行验证")
        case -200015:
            return nil, fmt.Errorf("获取文章列表失败: 操作过于频繁\n请稍后再试")
        default:
            if ret != 0 {
                errMsg, _ := result["errmsg"].(string)
                return nil, fmt.Errorf("获取文章列表失败: 错误码 %v, 信息: %s", ret, errMsg)
            }
        }
    }
    // The article list itself is a JSON string embedded in the response.
    generalMsgList, ok := result["general_msg_list"].(string)
    if !ok {
        return nil, fmt.Errorf("解析文章列表格式错误")
    }
    var msgList struct {
        List []struct {
            CommMsgInfo struct {
                ID          int64 `json:"id"`
                Type        int   `json:"type"`
                CreateTime  int64 `json:"create_time"`
                SourceMsgID int64 `json:"source_msg_id"`
            } `json:"comm_msg_info"`
            AppMsgExtInfo struct {
                Title      string   `json:"title"`
                Digest     string   `json:"digest"`
                ContentURL string   `json:"content_url"`
                Cover      string   `json:"cover"`
                Author     string   `json:"author"`
                FileID     int64    `json:"fileid"`
                Content    string   `json:"content"`
                UrlList    []string `json:"url_list"`
            } `json:"app_msg_ext_info"`
            MultiAppMsgItemList []struct {
                Title      string `json:"title"`
                Digest     string `json:"digest"`
                ContentURL string `json:"content_url"`
                Cover      string `json:"cover"`
                Author     string `json:"author"`
            } `json:"multi_app_msg_item_list"`
        } `json:"list"`
    }
    err = json.Unmarshal([]byte(generalMsgList), &msgList)
    if err != nil {
        return nil, fmt.Errorf("解析文章列表内容失败: %v", err)
    }
    // Build the result payload.
    response := make(map[string]interface{})
    response["m_flag"] = 1
    var passageList [][]string
    for _, item := range msgList.List {
        if item.CommMsgInfo.Type == 49 { // type 49 = image-text (app) message
            // Primary article of the message.
            createTime := fmt.Sprintf("%d", item.CommMsgInfo.CreateTime)
            title := item.AppMsgExtInfo.Title
            link := item.AppMsgExtInfo.ContentURL
            passageList = append(passageList, []string{"", createTime, title, link})
            // Secondary articles of a multi-article message.
            for _, multiItem := range item.MultiAppMsgItemList {
                passageList = append(passageList, []string{"", createTime, multiItem.Title, multiItem.ContentURL})
            }
        }
    }
    response["passage_list"] = passageList
    // No articles returned: signal that there are no more pages.
    if len(passageList) == 0 {
        response["m_flag"] = 0
    }
    return response, nil
}
// GetOneArticle 获取单篇文章内容
// GetOneArticle downloads the raw HTML of a single article page and
// returns it as a string.
func (w *WechatCrawler) GetOneArticle(link string) (string, error) {
    response, err := w.client.R().Get(link)
    if err != nil {
        return "", fmt.Errorf("请求文章失败: %v", err)
    }
    return response.String(), nil
}
// ExtractOfficialAccountName 从文章内容中提取公众号名称
// ExtractOfficialAccountName pulls the official-account name out of an
// article's HTML by probing a list of known locations (script variables,
// JSON fields, meta tags), most reliable first. The first match wins; it
// is HTML-entity unescaped and URL-decoded (up to three nested rounds)
// before being returned. Returns "" when nothing matches.
func (w *WechatCrawler) ExtractOfficialAccountName(content string) string {
    // Candidate locations in priority order.
    namePatterns := []string{
        `window\.appmsg\s*=\s*\{[^}]*"author"\s*:\s*['"](.*?)['"]`,  // window.appmsg.author
        `var nickname\s*=\s*['"](.*?)['"]`,                          // nickname variable
        `"nickname"\s*:\s*['"](.*?)['"]`,                            // nickname JSON field
        `var ct\s*=\s*['"](.*?)['"]`,                                // ct variable (sometimes holds the name)
        `<meta[^>]*name=["']?author["']?[^>]*content=["'](.*?)["']`, // author meta tag
    }
    for _, p := range namePatterns {
        m := regexp.MustCompile(p).FindStringSubmatch(content)
        if len(m) <= 1 {
            continue
        }
        name := m[1]
        // Undo HTML entity escaping (order matters: &amp; after &quot;).
        name = strings.ReplaceAll(name, "&quot;", "\"")
        name = strings.ReplaceAll(name, "&amp;", "&")
        name = strings.ReplaceAll(name, "&lt;", "<")
        name = strings.ReplaceAll(name, "&gt;", ">")
        // The value may be URL-encoded, possibly nested; decode up to 3x.
        for attempt := 0; attempt < 3; attempt++ {
            decoded, err := url.QueryUnescape(name)
            if err != nil || decoded == name {
                break
            }
            name = decoded
        }
        return name
    }
    return ""
}
// ExtractArticleInfo 从文章内容中提取关键信息
// ExtractArticleInfo pulls the key fields out of a raw article HTML page
// and returns, in order: create time, title, comment id, request id,
// author, and the extracted body paragraphs. Fields that cannot be
// located are returned empty. Body extraction runs an 8-method pipeline
// (script variables → DOM containers → window.appmsg → JSON blobs →
// paragraph tags → article/section → density scan → long JS strings)
// followed by aggressive JavaScript-pattern cleanup.
func (w *WechatCrawler) ExtractArticleInfo(content string) (string, string, string, string, string, []string) {
    // --- Create time --- enhanced version with ori_create_time support.
    createTime := ""
    // Pattern 1: the standard createTime variable.
    createTimeRegex := regexp.MustCompile(`var createTime\s*=\s*['"](\d+)['"]`)
    if match := createTimeRegex.FindStringSubmatch(content); len(match) > 1 {
        createTime = match[1]
    } else {
        // Pattern 2: the ori_create_time variable (seen in captured pages).
        oriCreateTimeRegex := regexp.MustCompile(`ori_create_time\s*:\s*['"](\d+)['"]`)
        if match := oriCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 {
            createTime = match[1]
        }
        // Pattern 3: create_time field in embedded JSON; note this runs
        // unconditionally and overrides pattern 2 when both match.
        jsonCreateTimeRegex := regexp.MustCompile(`"create_time"\s*:\s*(\d+)`)
        if match := jsonCreateTimeRegex.FindStringSubmatch(content); len(match) > 1 {
            createTime = match[1]
        }
    }
    // --- Title --- enhanced logic that tries to distinguish the article
    // title from the official-account name.
    title := ""
    // WeChat-specific structures first; window.appmsg.title is most reliable.
    titlePatterns := []string{
        `window\.appmsg\s*=\s*\{[^}]*"title"\s*:\s*['"](.*?)['"]`, // title inside window.appmsg (standard location)
        `var title\s*=\s*['"](.*?)['"]`,                           // direct variable assignment
        `"title"\s*:\s*['"](.*?)['"]`,                             // title field in JSON
        `window\.title\s*=\s*['"](.*?)['"]`,                       // window.title assignment
        // JsDecode-wrapped variants (seen in captured pages).
        `title\s*=\s*JsDecode\(['"](.*?)['"]\)`,    // title assigned via JsDecode
        `JsDecode\(['"]([^'"]*?title[^'"]*)['"]\)`, // JsDecode call mentioning title
        // HTML <title> last: it may contain the account name instead.
        `<title[^>]*>(.*?)</title>`,
    }
    for _, pattern := range titlePatterns {
        titleRegex := regexp.MustCompile(pattern)
        if match := titleRegex.FindStringSubmatch(content); len(match) > 1 {
            title = match[1]
            // Decode HTML entities.
            title = strings.ReplaceAll(title, "&quot;", "\"")
            title = strings.ReplaceAll(title, "&amp;", "&")
            title = strings.ReplaceAll(title, "&lt;", "<")
            title = strings.ReplaceAll(title, "&gt;", ">")
            // URL-decode repeatedly to handle nested encoding.
            for i := 0; i < 3; i++ { // at most 3 rounds
                if decoded, err := url.QueryUnescape(title); err == nil && decoded != title {
                    title = decoded
                } else {
                    break
                }
            }
            break
        }
    }
    // --- comment_id --- enhanced version with JsDecode support.
    commentID := ""
    // Pattern 1: the standard comment_id variable.
    commentIDRegex := regexp.MustCompile(`var comment_id\s*=\s*['"](\d+)['"]`)
    if match := commentIDRegex.FindStringSubmatch(content); len(match) > 1 {
        commentID = match[1]
    } else {
        // Pattern 2: comment_id wrapped in JsDecode (seen in captured pages).
        commentIDJsDecodeRegex := regexp.MustCompile(`comment_id\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`)
        if match := commentIDJsDecodeRegex.FindStringSubmatch(content); len(match) > 1 {
            commentID = match[1]
        }
        // Pattern 3: comment_id field in embedded JSON (overrides pattern 2).
        jsonCommentIDRegex := regexp.MustCompile(`"comment_id"\s*:\s*['"]([^'"]+)['"]`)
        if match := jsonCommentIDRegex.FindStringSubmatch(content); len(match) > 1 {
            commentID = match[1]
        }
    }
    // --- req_id --- enhanced version.
    reqID := ""
    // Pattern 1: the standard req_id variable (digits only).
    reqIDRegex := regexp.MustCompile(`var req_id\s*=\s*['"](\d+)['"]`)
    if match := reqIDRegex.FindStringSubmatch(content); len(match) > 1 {
        reqID = match[1]
    } else {
        // Pattern 2: other req_id assignment forms.
        reqIDAltRegex := regexp.MustCompile(`req_id\s*=\s*['"]([^'"]+)['"]`)
        if match := reqIDAltRegex.FindStringSubmatch(content); len(match) > 1 {
            reqID = match[1]
        }
        // Pattern 3: req_id field in embedded JSON (overrides pattern 2).
        jsonReqIDRegex := regexp.MustCompile(`"req_id"\s*:\s*['"]([^'"]+)['"]`)
        if match := jsonReqIDRegex.FindStringSubmatch(content); len(match) > 1 {
            reqID = match[1]
        }
    }
    // --- Body text --- a multi-strategy pipeline focused on the content
    // structures WeChat articles actually use.
    textContent := []string{}
    // The raw (pre-cleanup) content candidate found by the methods below.
    var rawContent string
    // Method 1: WeChat-specific data structures (highest priority).
    // 1.1 A `var content = "..."` assignment (a common storage form).
    varContentRegex := regexp.MustCompile(`var\s+content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
    if match := varContentRegex.FindStringSubmatch(content); len(match) > 1 {
        rawContent = match[1]
        // Strip the surrounding quotes.
        if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
            rawContent = rawContent[1 : len(rawContent)-1]
        }
    } else if rawContent == "" {
        // 1.2 A `var rich_media_content = "..."` assignment.
        richMediaVarRegex := regexp.MustCompile(`var\s+rich_media_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
        if match := richMediaVarRegex.FindStringSubmatch(content); len(match) > 1 {
            rawContent = match[1]
            // Strip the surrounding quotes.
            if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
                rawContent = rawContent[1 : len(rawContent)-1]
            }
        }
    } else if rawContent == "" {
        // 1.3 A `var js_content = "..."` assignment.
        // NOTE(review): this branch is unreachable — it is only evaluated
        // when the preceding `rawContent == ""` condition was false, i.e.
        // when rawContent is already non-empty, so the same condition can
        // never hold here. Method 2.2 below partially covers js_content
        // via the DOM instead.
        jsContentVarRegex := regexp.MustCompile(`var\s+js_content\s*=\s*(['"](?:\\.|[^'"])*['"])\s*;`)
        if match := jsContentVarRegex.FindStringSubmatch(content); len(match) > 1 {
            rawContent = match[1]
            // Strip the surrounding quotes.
            if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
                rawContent = rawContent[1 : len(rawContent)-1]
            }
        }
    }
    // Method 2: extract directly from the HTML DOM (second priority).
    if rawContent == "" {
        // 2.1 The div with class rich_media_content (core content container).
        richMediaClassRegex := regexp.MustCompile(`(?s)<div\s+class=["']rich_media_content["'][^>]*>([\s\S]*?)<\/div>`)
        if match := richMediaClassRegex.FindStringSubmatch(content); len(match) > 1 {
            rawContent = match[1]
        } else if rawContent == "" {
            // 2.2 The element with id js_content.
            jsContentIdRegex := regexp.MustCompile(`(?s)<div\s+id=["']js_content["'][^>]*>([\s\S]*?)<\/div>`)
            if match := jsContentIdRegex.FindStringSubmatch(content); len(match) > 1 {
                rawContent = match[1]
            }
        }
    }
    // Method 3: the window.appmsg object (WeChat's standard data structure).
    if rawContent == "" {
        appmsgRegex := regexp.MustCompile(`window\.appmsg\s*=\s*(\{[\s\S]+?\});`)
        if match := appmsgRegex.FindStringSubmatch(content); len(match) > 1 {
            appmsgData := match[1]
            // Try to pull the content field out of it (several formats).
            contentPatterns := []string{
                `"content"\s*:\s*(['"](?:\\.|[^'"])*['"])`,
                `content\s*=\s*(['"](?:\\.|[^'"])*['"])`,
                `"content"\s*:\s*JsDecode\(['"]([^'"]+)['"]\)`,
                `content\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`,
            }
            for _, pattern := range contentPatterns {
                contentRegex := regexp.MustCompile(pattern)
                if contentMatch := contentRegex.FindStringSubmatch(appmsgData); len(contentMatch) > 1 {
                    rawContent = contentMatch[1]
                    // Strip the surrounding quotes.
                    if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
                        rawContent = rawContent[1 : len(rawContent)-1]
                    }
                    break
                }
            }
        }
    }
    // Method 4: JSON blobs that carry the article content.
    if rawContent == "" {
        // Look for JSON objects that contain a content field.
        jsonDataRegex := regexp.MustCompile(`(?:\{"content"|\"content\")[^}]*\}`)
        jsonMatches := jsonDataRegex.FindAllString(content, -1)
        for _, jsonMatch := range jsonMatches {
            // Try to parse each candidate as JSON.
            var jsonObj map[string]interface{}
            if err := json.Unmarshal([]byte(jsonMatch), &jsonObj); err == nil {
                if contentStr, ok := jsonObj["content"].(string); ok && contentStr != "" {
                    rawContent = contentStr
                    break
                }
            }
        }
    }
    // Method 5: WeChat-specific paragraph markup.
    if rawContent == "" {
        // <p class="rich_media_p"> paragraphs (WeChat's own paragraph style).
        pTagsRegex := regexp.MustCompile(`(?s)<p\s+class=["']rich_media_p["'].*?>([\s\S]*?)<\/p>`)
        if matches := pTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 0 {
            // Several paragraphs found: concatenate their contents.
            var combinedContent strings.Builder
            for _, match := range matches {
                if len(match) > 1 {
                    combinedContent.WriteString(match[1])
                    combinedContent.WriteString("\n")
                }
            }
            rawContent = combinedContent.String()
        } else {
            // Fall back to generic <p> tags (the backup paragraph format).
            generalPTagsRegex := regexp.MustCompile(`(?s)<p[^>]*>([\s\S]*?)<\/p>`)
            if matches := generalPTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 10 { // need 10+ <p> tags to plausibly be body text
                var combinedContent strings.Builder
                for _, match := range matches {
                    if len(match) > 1 {
                        combinedContent.WriteString(match[1])
                        combinedContent.WriteString("\n")
                    }
                }
                rawContent = combinedContent.String()
            }
        }
    }
    // Method 6: <article>/<section> containers (HTML5 standard).
    if rawContent == "" {
        // Handle article and section separately.
        articleRegex := regexp.MustCompile(`(?s)<article[^>]*>([\s\S]*?)<\/article>`)
        if match := articleRegex.FindStringSubmatch(content); len(match) > 1 {
            // Verify the capture is prose, not JavaScript.
            articleContent := match[1]
            if w.calculateChineseDensity(articleContent) > 0.2 {
                rawContent = articleContent
            }
        } else {
            sectionRegex := regexp.MustCompile(`(?s)<section[^>]*>([\s\S]*?)<\/section>`)
            if match := sectionRegex.FindStringSubmatch(content); len(match) > 1 {
                // Verify the capture is prose, not JavaScript.
                sectionContent := match[1]
                if w.calculateChineseDensity(sectionContent) > 0.2 {
                    rawContent = sectionContent
                }
            }
        }
    }
    // Method 7: Chinese-density based block scoring (fallback).
    // NOTE(review): `[\S\s]+?` with FindAllString produces one-character
    // matches, so every "block" is a single character and the
    // `chineseCount > 100` filter below can never pass — this method is
    // effectively dead code; it was presumably meant to split the page
    // into larger chunks. Confirm before relying on it.
    if rawContent == "" {
        // Split the page into blocks using a simple pattern.
        contentBlocks := regexp.MustCompile(`[\S\s]+?`).FindAllString(content, -1)
        var bestContent string
        var bestScore float64
        for _, block := range contentBlocks {
            // Chinese-character density of the block.
            density := w.calculateChineseDensity(block)
            // Number of JavaScript keywords in the block.
            jsCount := w.jsKeywordCount(block)
            // Absolute count of Chinese characters.
            chineseCount := 0
            for _, char := range block {
                if char >= 0x4e00 && char <= 0x9fa5 {
                    chineseCount++
                }
            }
            // Composite score: dense Chinese with few JS keywords wins.
            score := density * float64(chineseCount) / (float64(jsCount) + 1.0)
            // Consider only blocks that clear all thresholds.
            if density > 0.4 && jsCount < 10 && chineseCount > 100 && score > bestScore {
                bestScore = score
                bestContent = block
            }
        }
        if bestContent != "" {
            rawContent = bestContent
        }
    }
    // Method 8: long JavaScript string literals holding HTML (fallback).
    if rawContent == "" {
        // Any quoted string of 200+ characters is a candidate.
        longStringRegex := regexp.MustCompile(`['"]([^'"]{200,})['"]`)
        matches := longStringRegex.FindAllStringSubmatch(content, -1)
        for _, match := range matches {
            if len(match) > 1 {
                // Pre-filter candidates that are obviously JavaScript.
                candidate := match[1]
                if w.jsKeywordCount(candidate) > 20 {
                    continue // too much JS; skip this candidate
                }
                // URL-decode the candidate (possibly nested encoding).
                decoded := candidate
                for i := 0; i < 3; i++ { // at most 3 rounds
                    if d, err := url.QueryUnescape(decoded); err == nil && d != decoded {
                        decoded = d
                    } else {
                        break
                    }
                }
                // Require both common HTML tags and enough Chinese text.
                hasHTMLTags := strings.Contains(decoded, "<p>") || strings.Contains(decoded, "<div") ||
                    strings.Contains(decoded, "<br>") || strings.Contains(decoded, "&lt;p&gt;") ||
                    strings.Contains(decoded, "&lt;div") || strings.Contains(decoded, "&lt;br&gt;")
                // Chinese density of the decoded candidate.
                density := w.calculateChineseDensity(decoded)
                // Accept only when both conditions hold.
                if hasHTMLTags && density > 0.3 {
                    rawContent = decoded
                    break
                }
            }
        }
    }
    // Pre-process rawContent if something was found.
    if rawContent != "" {
        // URL-decode repeatedly first, to unwrap nested encoding.
        for i := 0; i < 3; i++ { // at most 3 rounds
            if decoded, err := url.QueryUnescape(rawContent); err == nil && decoded != rawContent {
                rawContent = decoded
            } else {
                break
            }
        }
        // Replace HTML entities and literal JS escape sequences.
        rawContent = strings.ReplaceAll(rawContent, "&lt;", "<")
        rawContent = strings.ReplaceAll(rawContent, "&gt;", ">")
        rawContent = strings.ReplaceAll(rawContent, "&quot;", "\"")
        rawContent = strings.ReplaceAll(rawContent, "&amp;", "&")
        rawContent = strings.ReplaceAll(rawContent, "\\n", "")
        rawContent = strings.ReplaceAll(rawContent, "\\r", "")
        rawContent = strings.ReplaceAll(rawContent, "\\t", "")
        rawContent = strings.ReplaceAll(rawContent, "\\\"", "\"") // escaped double quotes
    }
    // Clean up whatever content was found.
    if rawContent != "" {
        // Strip HTML tags.
        tagRegex := regexp.MustCompile(`<[^>]*>`)
        cleanText := tagRegex.ReplaceAllString(rawContent, "")
        // Remove large JavaScript blocks.
        cleanText = w.filterJavaScriptBlocks(cleanText)
        // Collapse whitespace.
        spaceRegex := regexp.MustCompile(`\s+`)
        cleanText = spaceRegex.ReplaceAllString(cleanText, " ")
        cleanText = strings.TrimSpace(cleanText)
        // Measure JS/Chinese characteristics BEFORE the per-pattern pass
        // below; these pre-cleanup values gate the paragraph split later.
        jsCount := w.jsKeywordCount(cleanText)
        chineseDensity := w.calculateChineseDensity(cleanText)
        // Remove obvious JavaScript code blocks — enhanced pass targeting
        // WeChat platform boilerplate.
        // 1. WX_BJ_REPORT telemetry.
        cleanText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*if\s*\(WX_BJ_REPORT\)[^;]*;`).ReplaceAllString(cleanText, "")
        // 2. BadJs error reporting.
        cleanText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*if\s*\(BadJs\)[^;]*;`).ReplaceAllString(cleanText, "")
        // 3. window.logs bookkeeping.
        cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*\..*?;`).ReplaceAllString(cleanText, "")
        // 4. __moon_initcallback hooks.
        cleanText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*\.__moon_initcallback\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        // 5. try-catch blocks.
        cleanText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        // 6. Function definitions.
        cleanText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        // 7. IIFEs.
        cleanText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(cleanText, "")
        // 8. Variable declarations.
        cleanText = regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
        // 9. Control-flow statements.
        cleanText = regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*for\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*while\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        // 10. JSON objects and arrays.
        cleanText = regexp.MustCompile(`(?s)\s*\{\s*"[^"]*"\s*:\s*[^}]*\}\s*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*\[\s*[^\]]*\]\s*`).ReplaceAllString(cleanText, "")
        // 11. Network-request code.
        cleanText = regexp.MustCompile(`(?s)\s*new\s+XMLHttpRequest\(\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*xmlobj\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*fetch\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*axios\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
        // 12. Regex literals and debug calls.
        cleanText = regexp.MustCompile(`(?s)\s*new\s+RegExp\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*\/[^/]*\/[gimuy]*`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*console\.[a-z]+\([^)]*\);`).ReplaceAllString(cleanText, "")
        // 13. Event-handler wiring.
        cleanText = regexp.MustCompile(`(?s)\s*document\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*window\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*on\$1\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        // 14. Timers.
        cleanText = regexp.MustCompile(`(?s)\s*setTimeout\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*setInterval\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        // 15. WeChat-specific API calls.
        cleanText = regexp.MustCompile(`(?s)\s*WeixinJSBridge\s*\..*?;`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*wx\.\w+\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
        // 16. logsPagetime bookkeeping.
        cleanText = regexp.MustCompile(`(?s)\s*logsPagetime\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*var\s+logsPagetime\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
        // 17. WeChat error-handling calls.
        cleanText = regexp.MustCompile(`(?s)\s*\.error\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*\.warn\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*\.info\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        // 18. Platform-specific document writes.
        cleanText = regexp.MustCompile(`(?s)\s*document\.write\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        cleanText = regexp.MustCompile(`(?s)\s*document\.writeln\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
        // Accept the text when JS keywords are few or Chinese density is high.
        if (jsCount < 5 || chineseDensity > 0.3) && len(cleanText) > 50 {
            // Split into sentences/paragraphs to avoid one huge line.
            if len(cleanText) > 0 {
                // Split on sentence-ending punctuation (CJK and ASCII).
                paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(cleanText, -1)
                // Recover the punctuation so it can be re-attached.
                punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(cleanText, -1)
                for i := 0; i < len(paragraphs); i++ {
                    if paragraphs[i] != "" {
                        if i < len(punctuations) {
                            paragraphs[i] += punctuations[i]
                        }
                        // Keep only non-empty, reasonably long paragraphs
                        // that do not look like JavaScript fragments.
                        paragraph := strings.TrimSpace(paragraphs[i])
                        // Density-aware JS filtering.
                        paraDensity := w.calculateChineseDensity(paragraph)
                        paraJsCount := w.jsKeywordCount(paragraph)
                        if len(paragraph) > 15 &&
                            !strings.Contains(paragraph, "{") &&
                            !strings.Contains(paragraph, "}") &&
                            !strings.Contains(paragraph, "function") &&
                            !strings.Contains(paragraph, "var") &&
                            !strings.Contains(paragraph, "window.") &&
                            !strings.Contains(paragraph, "WX_BJ_REPORT") &&
                            !strings.Contains(paragraph, "BadJs") &&
                            (paraJsCount < 2 || paraDensity > 0.4) { // JS tolerance scales with Chinese density
                            textContent = append(textContent, paragraph)
                        }
                    }
                }
                // If splitting produced nothing, keep the whole text.
                if len(textContent) == 0 && len(cleanText) > 50 && (w.jsKeywordCount(cleanText) < 3 || chineseDensity > 0.5) {
                    textContent = append(textContent, cleanText)
                }
            }
        }
    }
    // Last resort: mine non-JavaScript text out of the entire page.
    if len(textContent) == 0 {
        // Strip all HTML tags.
        allText := regexp.MustCompile(`<[^>]*>`).ReplaceAllString(content, "")
        // Apply the enhanced JavaScript-block filter.
        allText = w.filterJavaScriptBlocks(allText)
        // Remove a few additional known patterns.
        allText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
        allText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(allText, "")
        // Keep only "safe" characters as a final measure.
        allText = w.extractChineseText(allText)
        // Collapse whitespace.
        spaceRegex := regexp.MustCompile(`\s+`)
        allText = spaceRegex.ReplaceAllString(allText, " ")
        allText = strings.TrimSpace(allText)
        // Split into sentences.
        if allText != "" && len(allText) > 100 {
            sentences := regexp.MustCompile(`[。!?.!?]\s*`).Split(allText, -1)
            punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(allText, -1)
            for i := 0; i < len(sentences); i++ {
                if sentences[i] != "" {
                    if i < len(punctuations) {
                        sentences[i] += punctuations[i]
                    }
                    paragraph := strings.TrimSpace(sentences[i])
                    // Drop JavaScript-looking or too-short sentences, with
                    // tolerance scaled by Chinese density.
                    if len(paragraph) > 20 && (w.jsKeywordCount(paragraph) < 3 || w.calculateChineseDensity(paragraph) > 0.4) {
                        textContent = append(textContent, paragraph)
                    }
                }
            }
        }
    }
    // Final filter: keep only what really looks like article prose.
    // NOTE(review): the finalContentFilter visible in this file takes a
    // string and returns a string, but here it is called with []string
    // and the result is returned as []string — this looks inconsistent
    // with that signature; confirm against the full file (another
    // variant may exist beyond the visible chunk).
    filteredContent := w.finalContentFilter(textContent)
    return createTime, title, commentID, reqID, w.extractAuthor(content), filteredContent
}
// calculateChineseDensity 计算文本中中文字符的密度
// calculateChineseDensity returns the fraction (0..1) of characters in
// text that are CJK ideographs in the common range U+4E00..U+9FA5.
//
// Bug fix: the previous version divided the CJK rune count by len(text),
// which is the BYTE length. Each CJK character occupies 3 bytes in
// UTF-8, so all-Chinese text scored only ~0.33 and caller thresholds
// such as >0.4 and >0.6 (see finalContentFilter) could never be met.
// Dividing by the total RUNE count yields the character density the
// callers' comments describe ("Chinese density above 40%").
func (w *WechatCrawler) calculateChineseDensity(text string) float64 {
    if len(text) == 0 {
        return 0
    }
    chinese, total := 0, 0
    for _, r := range text {
        total++
        if r >= 0x4e00 && r <= 0x9fa5 {
            chinese++
        }
    }
    return float64(chinese) / float64(total)
}
// 过滤大段JavaScript代码
func (w *WechatCrawler) filterJavaScriptBlocks(text string) string {
// 移除常见的JavaScript代码块模式
patterns := []string{
// 移除JavaScript函数声明
`(?s)function\s+[a-zA-Z_$][\w$]*\s*\([^)]*\)\s*{[^}]*}`,
// 移除匿名函数
`(?s)\(\s*function\s*\([^)]*\)\s*{[^}]*}\s*\)\s*\(\s*\)`,
// 移除对象字面量
`(?s)\{[^}]*\}`,
// 移除数组字面量
`(?s)\[[^\]]*\]`,
// 移除注释
`//[^\n]*`,
`/\*[^*]*\*/`,
// 移除微信特定错误报告代码
`(?s)WX_BJ_REPORT[^;]*;`,
`(?s)BadJs[^;]*;`,
`(?s)window\.[a-zA-Z_$][\w$]*[^;]*;`,
// 移除XMLHttpRequest相关代码
`(?s)xmlobj[^;]*;`,
`(?s)new\s+Image\([^)]*\)`,
`(?s)setRequestHeader[^;]*;`,
// 移除正则表达式
`/[^/]*\/[gimuy]*`,
}
result := text
for _, pattern := range patterns {
regex, err := regexp.Compile(pattern)
if err == nil {
result = regex.ReplaceAllString(result, "")
}
}
return result
}
// 提取纯中文文本
func (w *WechatCrawler) extractChineseText(text string) string {
var result []rune
for _, char := range text {
// 保留中文、标点符号、数字和英文字母,去除特殊字符
if (char >= 0x4e00 && char <= 0x9fa5) ||
unicode.IsPunct(char) ||
unicode.IsDigit(char) ||
unicode.IsLetter(char) ||
char == '\n' || char == ' ' {
result = append(result, char)
}
}
return string(result)
}
// finalContentFilter 最终内容过滤,确保只保留真正的文章正文
func (w *WechatCrawler) finalContentFilter(text string) string {
// 1. 移除明显的JavaScript代码块
// 移除WX_BJ_REPORT相关代码
wxCodeRegex := regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);|\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(WX_BJ_REPORT\)[^;]*;`)
text = wxCodeRegex.ReplaceAllString(text, "")
// 移除BadJs相关代码
badJsRegex := regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);|\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(BadJs\)[^;]*;`)
text = badJsRegex.ReplaceAllString(text, "")
// 移除window.logs相关代码
logsRegex := regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];|\s*window\.logs\s*\..*?;`)
text = logsRegex.ReplaceAllString(text, "")
// 移除函数定义
funcRegex := regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*|\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`)
text = funcRegex.ReplaceAllString(text, "")
// 移除变量声明
varRegex := regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?|\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`)
text = varRegex.ReplaceAllString(text, "")
// 移除控制流语句
flowRegex := regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*|\s*for\s*\([^)]*\)\s*{[^}]*}\s*|\s*while\s*\([^)]*\)\s*{[^}]*}\s*`)
text = flowRegex.ReplaceAllString(text, "")
// 2. 提取真正的文章段落
paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(text, -1)
punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(text, -1)
var validParagraphs []string
for i := 0; i < len(paragraphs); i++ {
if paragraphs[i] != "" {
paragraph := paragraphs[i]
if i < len(punctuations) {
paragraph += punctuations[i]
}
paragraph = strings.TrimSpace(paragraph)
// 计算段落特征
paraDensity := w.calculateChineseDensity(paragraph)
paraJsCount := w.jsKeywordCount(paragraph)
chineseCount := 0
for _, char := range paragraph {
if char >= 0x4e00 && char <= 0x9fa5 {
chineseCount++
}
}
// 严格的过滤规则
if len(paragraph) > 25 && // 足够长的段落
!strings.Contains(paragraph, "{") &&
!strings.Contains(paragraph, "}") &&
!strings.Contains(paragraph, "function") &&
!strings.Contains(paragraph, "var") &&
!strings.Contains(paragraph, "window.") &&
!strings.Contains(paragraph, "WX_BJ_REPORT") &&
!strings.Contains(paragraph, "BadJs") &&
chineseCount > 15 && // 至少15个中文字符
paraDensity > 0.4 && // 中文密度大于40%
paraJsCount < 3 { // JavaScript关键词少于3个
validParagraphs = append(validParagraphs, paragraph)
}
}
}
// 3. 如果没有找到有效的段落,尝试使用更宽松的规则
if len(validParagraphs) == 0 {
// 直接检查整个文本
overallDensity := w.calculateChineseDensity(text)
overallJsCount := w.jsKeywordCount(text)
overallChineseCount := 0
for _, char := range text {
if char >= 0x4e00 && char <= 0x9fa5 {
overallChineseCount++
}
}
// 宽松条件如果中文密度很高且JavaScript关键词较少
if overallDensity > 0.6 && overallJsCount < 5 && overallChineseCount > 100 {
validParagraphs = append(validParagraphs, text)
}
}
return strings.Join(validParagraphs, "\n\n")
}
// jsKeywordCount returns a weighted count of JavaScript-related keywords
// in text (higher means more code-like). Matching is case-insensitive;
// high-priority tokens count triple, WeChat-platform tokens double, and
// basic JS keywords single.
func (w *WechatCrawler) jsKeywordCount(text string) int {
	// Keyword groups paired with their weights.
	groups := []struct {
		weight   int
		keywords []string
	}{
		{3, []string{ // high-priority markers of injected page scripts
			"logs = ", "window.", "LANG = ", "extInfo:", "pagetime[",
			"BadJs;", "sec_open=", "xmlobj = ", "addEventListener",
			"new Image()", "setRequestHeader", "onreadystatechange",
			"var ", "let ", "const ", "function ", "return ",
		}},
		{2, []string{ // WeChat-platform specific tokens
			"WX_BJ_REPORT", "BadJs", "__moon_initcallback", "logsPagetime",
			"WeixinJSBridge", "wx.", "document.write", "document.writeln",
			// error-handling calls
			".error(", ".warn(", ".info(", ".debug(",
			// network-request calls
			"XMLHttpRequest", "fetch(", "axios.", "xmlobj.",
		}},
		{1, []string{ // basic JavaScript keywords
			"function", "var", "let", "const", "if(", "else", "for(", "while(",
			"return", "setTimeout", "setInterval", "WeixinJSBridge", "JSON",
			"console", "document", "window", "try{", "catch(", "throw",
		}},
	}
	haystack := strings.ToLower(text)
	total := 0
	for _, g := range groups {
		for _, kw := range g.keywords {
			total += strings.Count(haystack, strings.ToLower(kw)) * g.weight
		}
	}
	return total
}
// extractAuthor pulls the article author out of the raw page content by
// trying several known embedding patterns; returns "" when none match.
// HTML entities and URL percent-encoding in the match are decoded.
func (w *WechatCrawler) extractAuthor(content string) string {
	patterns := []string{
		`var author\s*=\s*['"](.*?)['"]`,
		`"author"\s*:\s*['"](.*?)['"]`,
		`window\.author\s*=\s*['"](.*?)['"]`,
		`<meta name=["']author["'] content=["'](.*?)['"]`,
		`window\.appmsg\s*=\s*\{[^}]*"author"\s*:\s*['"](.*?)['"]`,
	}
	// HTML entity → literal replacements applied to a matched value.
	entities := [][2]string{
		{"&quot;", "\""},
		{"&amp;", "&"},
		{"&lt;", "<"},
		{"&gt;", ">"},
	}
	for _, pat := range patterns {
		m := regexp.MustCompile(pat).FindStringSubmatch(content)
		if len(m) <= 1 {
			continue
		}
		name := m[1]
		for _, e := range entities {
			name = strings.ReplaceAll(name, e[0], e[1])
		}
		// Best effort: also undo URL percent-encoding.
		if unescaped, err := url.QueryUnescape(name); err == nil {
			name = unescaped
		}
		return name
	}
	return ""
}
// GetArticleStats fetches the engagement counters for one article via the
// getappmsgext endpoint.
//
// link is the article URL (mid/sn/idx are parsed out of its query string);
// title, commentID and reqID are forwarded as form fields. createTime is
// currently unused but kept for interface stability.
// On success it returns a map with keys "read_num", "old_like_num",
// "share_num" and "show_read" — decimal strings defaulting to "0" when a
// field is absent from the response.
func (w *WechatCrawler) GetArticleStats(link string, title string, commentID string, reqID string, createTime string) (map[string]string, error) {
	// Extract a query parameter from the link.
	// FIX: the previous `name=(.*?)&` patterns required a trailing '&', so
	// a parameter at the end of the URL was silently missed; `[^&]+` works
	// with or without a following '&'.
	extractParam := func(name string) string {
		re := regexp.MustCompile(name + `=([^&]+)`)
		if m := re.FindStringSubmatch(link); len(m) > 1 {
			return m[1]
		}
		return ""
	}
	mid := extractParam("mid")
	sn := extractParam("sn")
	idx := extractParam("idx")
	// Random r value mimicking the web client's cache-buster.
	r := fmt.Sprintf("0.%d", time.Now().UnixNano()%10000000000000000)
	// Build the request URL with the session credentials.
	detailURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&fasttmplajax=1&f=json&uin=%s&key=%s&pass_ticket=%s&__biz=%s",
		w.uin, w.key, w.passTicket, w.biz)
	// Form payload expected by the endpoint.
	data := map[string]string{
		"r":               r,
		"sn":              sn,
		"mid":             mid,
		"idx":             idx,
		"req_id":          reqID,
		"title":           title,
		"comment_id":      commentID,
		"appmsg_type":     "9",
		"__biz":           w.biz,
		"pass_ticket":     w.passTicket,
		"abtest_cookie":   "",
		"devicetype":      "Windows 7 x64",
		"version":         "63090b13",
		"is_need_ticket":  "0",
		"is_need_ad":      "0",
		"is_need_reward":  "0",
		"both_ad":         "0",
		"reward_uin_count": "0",
		"send_time":       "",
		"msg_daily_idx":   "1",
		"is_original":     "0",
		"is_only_read":    "1",
		"scene":           "38",
	}
	// POST the form and decode the JSON response.
	resp, err := w.client.R().SetFormData(data).Post(detailURL)
	if err != nil {
		return nil, fmt.Errorf("请求统计信息失败: %v", err)
	}
	var result map[string]interface{}
	// Unmarshal directly from the response bytes (avoids the extra
	// []byte(resp.String()) copy).
	if err := json.Unmarshal(resp.Body(), &result); err != nil {
		return nil, fmt.Errorf("解析统计信息失败: %v", err)
	}
	// Defaults returned when a counter is missing from the response.
	stats := map[string]string{
		"read_num":     "0",
		"old_like_num": "0",
		"share_num":    "0",
		"show_read":    "0",
	}
	// Numeric JSON fields arrive as float64 through interface{} decoding.
	if appMsgStat, ok := result["appmsgstat"].(map[string]interface{}); ok {
		for _, field := range []string{"read_num", "old_like_num", "share_num", "show_read"} {
			if v, ok := appMsgStat[field].(float64); ok {
				stats[field] = fmt.Sprintf("%.0f", v)
			}
		}
	}
	return stats, nil
}
// GetArticleComments fetches the comment thread identified by commentID.
// An empty commentID short-circuits to empty results. Returns parallel
// slices of comment texts and their like counts.
// NOTE: extraction of the comment payload is not implemented yet, so both
// slices are currently always empty even on a successful request.
func (w *WechatCrawler) GetArticleComments(commentID string) ([]string, []string, error) {
	if commentID == "" {
		return []string{}, []string{}, nil
	}
	// Build the comment endpoint URL.
	// NOTE(review): appmsgid is hard-coded here — presumably a leftover
	// from testing; confirm whether it should come from the article link.
	commentURL := fmt.Sprintf(
		"https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=%s&appmsgid=2247491372&idx=1&comment_id=%s&offset=0&limit=100&uin=%s&key=%s&pass_ticket=%s&wxtoken=&devicetype=Windows+10&clientversion=62060833&appmsg_token=",
		w.biz, commentID, w.uin, w.key, w.passTicket)
	// Fire the request and decode the JSON body.
	resp, err := w.client.R().Get(commentURL)
	if err != nil {
		return []string{}, []string{}, fmt.Errorf("获取评论失败: %v", err)
	}
	var payload map[string]interface{}
	if err := json.Unmarshal([]byte(resp.String()), &payload); err != nil {
		return []string{}, []string{}, fmt.Errorf("解析评论失败: %v", err)
	}
	// Simplified implementation: a full one must walk payload's structure
	// to pull out each comment and its like count.
	var comments []string
	var likes []string
	return comments, likes, nil
}
// GetOfficialAccountLinkFromArticle resolves an article URL to its
// official account's home-page URL, updating w.biz as a side effect.
// It first looks for a __biz query parameter in the URL itself (legacy
// format); failing that, it downloads the article and scrapes the biz
// value from the page body.
func (w *WechatCrawler) GetOfficialAccountLinkFromArticle(articleURL string) (string, error) {
	const homeFmt = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124"
	// Fast path: __biz embedded directly in the URL.
	if m := regexp.MustCompile(`__biz=([^&]+)`).FindStringSubmatch(articleURL); len(m) >= 2 {
		w.biz = m[1]
		return fmt.Sprintf(homeFmt, m[1]), nil
	}
	// Slow path: fetch the article and scrape the biz value out of it.
	body, err := w.GetOneArticle(articleURL)
	if err != nil {
		return "", fmt.Errorf("获取文章内容失败: %v", err)
	}
	m := regexp.MustCompile(`var biz = "(.*?);`).FindStringSubmatch(body)
	if len(m) < 2 {
		// Fall back to an alternative biz embedding in the page.
		m = regexp.MustCompile(`__biz=(.*?)&`).FindStringSubmatch(body)
		if len(m) < 2 {
			return "", fmt.Errorf("无法从文章链接和内容中提取公众号信息")
		}
	}
	// Clean the captured value: drop " || " fallbacks and stray quotes.
	biz := strings.ReplaceAll(m[1], " || ", "")
	biz = strings.ReplaceAll(biz, "\"", "")
	w.biz = biz
	return fmt.Sprintf(homeFmt, biz), nil
}
// GetArticleList pages through the account's article list until the
// server reports no more pages, then normalizes all links.
// Each returned entry has the shape [index, createTime, title, link].
func (w *WechatCrawler) GetArticleList() ([][]string, error) {
	var collected [][]string
	for offset := 0; ; offset += 10 {
		fmt.Printf("正在获取第%d页文章...\n", offset/10+1)
		page, err := w.GetNextList(offset)
		if err != nil {
			return collected, fmt.Errorf("获取文章列表失败: %v", err)
		}
		// Stop when the server signals no further pages.
		// NOTE(review): m_flag is checked before appending, so any rows
		// returned alongside m_flag == 0 are dropped — confirm against
		// GetNextList's contract.
		if flag, ok := page["m_flag"].(float64); !ok || flag == 0 {
			break
		}
		batch, ok := page["passage_list"].([][]string)
		if !ok {
			return collected, fmt.Errorf("文章列表格式错误")
		}
		collected = append(collected, batch...)
		// Throttle to avoid getting banned; the delay grows with the
		// offset of the next page (hence the +10).
		time.Sleep(time.Duration(2000+offset+10) * time.Millisecond)
	}
	// Normalize the links before returning.
	normalized := w.TransformLinks(collected)
	fmt.Printf("共获取到%d篇文章\n", len(normalized))
	return normalized, nil
}
// SaveArticleListToExcel writes the article list to a CSV-style text file
// under officialPath; each row is "index,createTime,title,link".
// nickname is currently unused but kept for interface stability.
func (w *WechatCrawler) SaveArticleListToExcel(officialPath string, articleList [][]string, nickname string) error {
	// Make sure the target directory exists.
	if err := os.MkdirAll(officialPath, 0755); err != nil {
		return fmt.Errorf("创建目录失败: %v", err)
	}
	// Target file for the normalized ("direct") links.
	filePath := fmt.Sprintf("%s/文章列表article_list_直连链接.txt", officialPath)
	var content strings.Builder
	// Header row.
	content.WriteString("序号,创建时间,标题,链接\n")
	for i, article := range articleList {
		// FIX: skip malformed rows instead of panicking with an index out
		// of range when a row has fewer than four fields.
		if len(article) < 4 {
			continue
		}
		content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, article[1], article[2], article[3]))
	}
	if err := os.WriteFile(filePath, []byte(content.String()), 0644); err != nil {
		return fmt.Errorf("保存文章列表失败: %v", err)
	}
	fmt.Printf("文章列表已保存到: %s\n", filePath)
	return nil
}
// TransformLinks rewrites each article's link by stripping "amp;" so the
// URL is directly accessible, dropping rows with fewer than four fields.
func (w *WechatCrawler) TransformLinks(articleList [][]string) [][]string {
	out := make([][]string, 0, len(articleList))
	for _, row := range articleList {
		// Skip malformed rows.
		if len(row) < 4 {
			continue
		}
		cleaned := strings.ReplaceAll(row[3], "amp;", "")
		out = append(out, []string{row[0], row[1], row[2], cleaned})
	}
	return out
}
// ReadArticleLinksFromExcel reads article links from the Excel file at
// filePath.
// Simplified implementation: parsing is not implemented yet, so it always
// returns an empty slice and a nil error regardless of filePath.
func (w *WechatCrawler) ReadArticleLinksFromExcel(filePath string) ([]string, error) {
	// Simplified implementation: return an empty list.
	return []string{}, nil
}
// GetArticleDetail downloads one article and assembles its full detail
// record: metadata scraped from the page, engagement stats, and comments.
// Stats fall back to zeros when the stats endpoint fails; comment-fetch
// errors are deliberately ignored (best effort).
func (w *WechatCrawler) GetArticleDetail(link string) (*ArticleDetail, error) {
	body, err := w.GetOneArticle(link)
	if err != nil {
		return nil, err
	}
	// Scrape metadata and the cleaned text paragraphs out of the page.
	createTime, title, commentID, reqID, _, paragraphs := w.ExtractArticleInfo(body)
	accountName := w.ExtractOfficialAccountName(body)
	// Engagement stats; substitute zeros if the request fails.
	stats, err := w.GetArticleStats(link, title, commentID, reqID, createTime)
	if err != nil {
		stats = map[string]string{
			"read_num":     "0",
			"old_like_num": "0",
			"share_num":    "0",
			"show_read":    "0",
		}
	}
	// Comments are best effort; the error is intentionally discarded.
	comments, commentLikes, _ := w.GetArticleComments(commentID)
	return &ArticleDetail{
		LocalTime:    time.Now().Format("2006-01-02 15:04:05"),
		CreateTime:   createTime,
		Title:        title,
		OfficialName: accountName,
		Link:         link,
		Content:      paragraphs,
		ReadCount:    stats["read_num"],
		LikeCount:    stats["old_like_num"],
		ShareCount:   stats["share_num"],
		ShowRead:     stats["show_read"],
		Comments:     comments,
		CommentLikes: commentLikes,
		CommentID:    commentID,
	}, nil
}
// GetDetailList fetches the detail of every article in articleList and
// saves each one as a text file under officialPath, recording failures in
// an error file. Rows shorter than four fields are skipped. Returns nil
// even when individual articles fail; per-article errors are logged.
func (w *WechatCrawler) GetDetailList(articleList [][]string, officialPath string) error {
	// Make sure the target directory exists.
	if err := os.MkdirAll(officialPath, 0755); err != nil {
		return fmt.Errorf("创建目录失败: %v", err)
	}
	// FIX: article titles are used as file names; replace characters that
	// are illegal or dangerous in paths so a title like "A/B: C" cannot
	// escape officialPath or make the save fail.
	sanitize := strings.NewReplacer(
		"/", "_", "\\", "_", ":", "_", "*", "_",
		"?", "_", "\"", "_", "<", "_", ">", "_", "|", "_",
	)
	successCount := 0
	errorCount := 0
	var errorLinks [][]string
	for i, article := range articleList {
		if len(article) < 4 {
			continue
		}
		link := article[3]
		title := article[2]
		fmt.Printf("正在处理第%d篇文章: %s\n", i+1, title)
		detail, err := w.GetArticleDetail(link)
		if err != nil {
			fmt.Printf("获取文章详情失败: %v\n", err)
			errorCount++
			errorLinks = append(errorLinks, article)
			continue
		}
		// Save using the (sanitized) article title as the file name.
		filePath := fmt.Sprintf("%s/%s_文章详情.txt", officialPath, sanitize.Replace(detail.Title))
		if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
			fmt.Printf("保存文章详情失败: %v\n", err)
			errorCount++
			errorLinks = append(errorLinks, article)
			continue
		}
		successCount++
		fmt.Printf("文章详情保存成功: %s\n", detail.Title)
		// Throttle: 3s base delay that grows with the article index.
		time.Sleep(time.Duration(3000+i*100) * time.Millisecond)
	}
	// Persist any failing rows so they can be retried later.
	if len(errorLinks) > 0 {
		errorPath := fmt.Sprintf("%s/问题链接error_links.txt", officialPath)
		var content strings.Builder
		content.WriteString("序号,创建时间,标题,链接\n")
		for i, row := range errorLinks {
			content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, row[1], row[2], row[3]))
		}
		if err := os.WriteFile(errorPath, []byte(content.String()), 0644); err != nil {
			fmt.Printf("保存错误链接失败: %v\n", err)
		}
	}
	fmt.Printf("文章详情获取完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
	return nil
}
// SaveArticleDetailToExcel writes a single article's detail record to a
// plain-text file at filePath (the "Excel" in the name is historical;
// this is the simplified text implementation).
// FIX: receiver renamed c → w for consistency with every other method of
// WechatCrawler.
func (w *WechatCrawler) SaveArticleDetailToExcel(article *ArticleDetail, filePath string) error {
	var content strings.Builder
	// Metadata header, one field per line.
	content.WriteString(fmt.Sprintf("本地创建时间: %s\n", article.LocalTime))
	content.WriteString(fmt.Sprintf("文章发布时间: %s\n", article.CreateTime))
	content.WriteString(fmt.Sprintf("公众号名称: %s\n", article.OfficialName))
	content.WriteString(fmt.Sprintf("文章标题: %s\n", article.Title))
	content.WriteString(fmt.Sprintf("文章链接: %s\n", article.Link))
	content.WriteString(fmt.Sprintf("阅读量: %s\n", article.ReadCount))
	content.WriteString(fmt.Sprintf("点赞数: %s\n", article.LikeCount))
	content.WriteString(fmt.Sprintf("转发数: %s\n", article.ShareCount))
	content.WriteString(fmt.Sprintf("在看数: %s\n", article.ShowRead))
	content.WriteString("\n文章内容:\n")
	// Body paragraphs, one per line.
	for _, line := range article.Content {
		content.WriteString(line)
		content.WriteString("\n")
	}
	return os.WriteFile(filePath, []byte(content.String()), 0644)
}
// GetListArticleFromFile takes either an official-account name or an
// article URL, locates the previously saved article-list file for that
// account under ./data/, and downloads/saves every article listed in it.
// imgSaveFlag toggles image saving (not implemented yet); contentSaveFlag
// toggles saving the article text.
func (w *WechatCrawler) GetListArticleFromFile(nameLink string, imgSaveFlag bool, contentSaveFlag bool) error {
	// 1. Work out the account nickname from the input.
	nickname := ""
	if strings.Contains(nameLink, "http") {
		fmt.Println("检测到输入为链接,开始获取公众号名称")
		// Resolve the account from the article link (updates w.biz).
		_, err := w.GetOfficialAccountLinkFromArticle(nameLink)
		if err != nil {
			return fmt.Errorf("获取公众号信息失败: %v", err)
		}
		nickname, err = w.GetOfficialAccountName()
		if err != nil {
			return fmt.Errorf("获取公众号名称失败: %v", err)
		}
		fmt.Printf("获取到公众号名称: %s\n", nickname)
	} else {
		fmt.Println("检测到输入为公众号名称")
		nickname = nameLink
	}
	// 2. Build the expected article-list file path.
	rootPath := "./data/"
	officialNamesHead := "公众号----"
	officialPath := rootPath + officialNamesHead + nickname
	articleListPath := officialPath + "/文章列表article_list_直连链接.txt"
	// 3. Make sure the list file exists.
	if _, err := os.Stat(articleListPath); os.IsNotExist(err) {
		return fmt.Errorf("文件不存在,请检查目录文件: %s", articleListPath)
	}
	// 4. Read the link column out of the CSV-style list file.
	fileContent, err := os.ReadFile(articleListPath)
	if err != nil {
		return fmt.Errorf("读取文章列表文件失败: %v", err)
	}
	lines := strings.Split(string(fileContent), "\n")
	var articleLinks []string
	for i, line := range lines {
		// Skip the header row and blank lines.
		if i == 0 || line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		if len(parts) < 4 {
			continue
		}
		// FIX: the link is the LAST field — titles may themselves contain
		// commas, in which case parts[3] would point into the title
		// instead of the URL.
		link := strings.TrimSpace(parts[len(parts)-1])
		link = strings.Trim(link, "\"")
		articleLinks = append(articleLinks, link)
	}
	fmt.Printf("成功读取到%d篇文章链接\n", len(articleLinks))
	// FIX: titles are used as file names; neutralize path-hostile
	// characters so a title with '/' cannot break or escape officialPath.
	sanitize := strings.NewReplacer(
		"/", "_", "\\", "_", ":", "_", "*", "_",
		"?", "_", "\"", "_", "<", "_", ">", "_", "|", "_",
	)
	// 5. Download and save each article.
	successCount := 0
	errorCount := 0
	for i, link := range articleLinks {
		fmt.Printf("正在处理第%d篇文章链接: %s\n", i+1, link)
		detail, err := w.GetArticleDetail(link)
		if err != nil {
			fmt.Printf("获取文章详情失败: %v\n", err)
			errorCount++
			continue
		}
		// Save the article text if requested.
		if contentSaveFlag {
			filePath := fmt.Sprintf("%s/%s_文章详情.txt", officialPath, sanitize.Replace(detail.Title))
			if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
				fmt.Printf("保存文章详情失败: %v\n", err)
				errorCount++
				continue
			}
		}
		// TODO: image saving is not implemented yet.
		if imgSaveFlag {
			fmt.Println("图片保存功能暂未实现")
		}
		successCount++
		fmt.Printf("第%d篇文章处理成功: %s\n", i+1, detail.Title)
		// Throttle to avoid getting banned.
		time.Sleep(3 * time.Second)
	}
	fmt.Printf("文章列表处理完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
	return nil
}