]*>([\s\S]*?)<\/div>`)
+ if match := richMediaClassRegex.FindStringSubmatch(content); len(match) > 1 {
+ rawContent = match[1]
+ } else if rawContent == "" {
+ // 2.2 尝试查找id为js_content的元素
+ jsContentIdRegex := regexp.MustCompile(`(?s)
]*>([\s\S]*?)<\/div>`)
+ if match := jsContentIdRegex.FindStringSubmatch(content); len(match) > 1 {
+ rawContent = match[1]
+ }
+ }
+ }
+
+ // 方法3: 从window.appmsg对象中提取(微信文章标准数据结构)
+ if rawContent == "" {
+ appmsgRegex := regexp.MustCompile(`window\.appmsg\s*=\s*(\{[\s\S]+?\});`)
+ if match := appmsgRegex.FindStringSubmatch(content); len(match) > 1 {
+ appmsgData := match[1]
+ // 尝试提取content字段(多种格式)
+ contentPatterns := []string{
+ `"content"\s*:\s*(['"](?:\\.|[^'"])*['"])`,
+ `content\s*=\s*(['"](?:\\.|[^'"])*['"])`,
+ `"content"\s*:\s*JsDecode\(['"]([^'"]+)['"]\)`,
+ `content\s*=\s*JsDecode\(['"]([^'"]+)['"]\)`,
+ }
+ for _, pattern := range contentPatterns {
+ contentRegex := regexp.MustCompile(pattern)
+ if contentMatch := contentRegex.FindStringSubmatch(appmsgData); len(contentMatch) > 1 {
+ rawContent = contentMatch[1]
+ // 移除引号
+ if len(rawContent) > 1 && rawContent[0] == '"' && rawContent[len(rawContent)-1] == '"' {
+ rawContent = rawContent[1 : len(rawContent)-1]
+ }
+ break
+ }
+ }
+ }
+ }
+
+ // 方法4: 从JSON格式的文章数据中提取
+ if rawContent == "" {
+ // 查找可能包含文章内容的JSON数据块
+ jsonDataRegex := regexp.MustCompile(`(?:\{"content"|\"content\")[^}]*\}`)
+ jsonMatches := jsonDataRegex.FindAllString(content, -1)
+
+ for _, jsonMatch := range jsonMatches {
+ // 尝试解析JSON
+ var jsonObj map[string]interface{}
+ if err := json.Unmarshal([]byte(jsonMatch), &jsonObj); err == nil {
+ if contentStr, ok := jsonObj["content"].(string); ok && contentStr != "" {
+ rawContent = contentStr
+ break
+ }
+ }
+ }
+ }
+
+ // 方法5: 尝试从微信文章特有的段落结构提取
+ if rawContent == "" {
+ // 查找带有rich_media_p类的p标签(微信文章特有的段落样式)
+ pTagsRegex := regexp.MustCompile(`(?s)
([\s\S]*?)<\/p>`)
+ if matches := pTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 0 {
+ // 如果找到多个p标签,合并它们的内容
+ var combinedContent strings.Builder
+ for _, match := range matches {
+ if len(match) > 1 {
+ combinedContent.WriteString(match[1])
+ combinedContent.WriteString("\n")
+ }
+ }
+ rawContent = combinedContent.String()
+ } else {
+ // 尝试一般的p标签,这是微信文章的备用段落格式
+ generalPTagsRegex := regexp.MustCompile(`(?s)
]*>([\s\S]*?)<\/p>`)
+ if matches := generalPTagsRegex.FindAllStringSubmatch(content, -1); len(matches) > 10 { // 至少10个p标签才可能是文章内容
+ var combinedContent strings.Builder
+ for _, match := range matches {
+ if len(match) > 1 {
+ combinedContent.WriteString(match[1])
+ combinedContent.WriteString("\n")
+ }
+ }
+ rawContent = combinedContent.String()
+ }
+ }
+ }
+
+ // 方法6: 从article或section标签提取(HTML5标准内容容器)
+ if rawContent == "" {
+ // 分别处理article和section标签
+ articleRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/article>`)
+ if match := articleRegex.FindStringSubmatch(content); len(match) > 1 {
+ // 检查提取的内容是否真的包含文章正文(而不是JavaScript代码)
+ articleContent := match[1]
+ if w.calculateChineseDensity(articleContent) > 0.2 {
+ rawContent = articleContent
+ }
+ } else {
+ sectionRegex := regexp.MustCompile(`(?s)]*>([\s\S]*?)<\/section>`)
+ if match := sectionRegex.FindStringSubmatch(content); len(match) > 1 {
+ // 检查提取的内容是否真的包含文章正文
+ sectionContent := match[1]
+ if w.calculateChineseDensity(sectionContent) > 0.2 {
+ rawContent = sectionContent
+ }
+ }
+ }
+ }
+
+ // 方法7: 基于中文密度的段落提取(备用方法)
+ if rawContent == "" {
+ // 将内容分成较小的块进行检查,使用更简单的正则表达式
+ contentBlocks := regexp.MustCompile(`[\S\s]+?`).FindAllString(content, -1)
+
+ var bestContent string
+ var bestScore float64
+
+ for _, block := range contentBlocks {
+ // 计算中文密度(使用现有的calculateChineseDensity方法)
+ density := w.calculateChineseDensity(block)
+ // 计算JavaScript关键词数量
+ jsCount := w.jsKeywordCount(block)
+ // 计算中文字符总数
+ chineseCount := 0
+ for _, char := range block {
+ if char >= 0x4e00 && char <= 0x9fa5 {
+ chineseCount++
+ }
+ }
+
+ // 计算综合评分:中文密度高且JavaScript关键词少的内容得分更高
+ score := density * float64(chineseCount) / (float64(jsCount) + 1.0)
+
+ // 只有中文密度足够高且JavaScript关键词较少的内容才考虑
+ if density > 0.4 && jsCount < 10 && chineseCount > 100 && score > bestScore {
+ bestScore = score
+ bestContent = block
+ }
+ }
+
+ if bestContent != "" {
+ rawContent = bestContent
+ }
+ }
+
+ // 方法8: 从JavaScript字符串中提取HTML内容(备用方法)
+ if rawContent == "" {
+ // 查找可能包含HTML内容的长字符串
+ longStringRegex := regexp.MustCompile(`['"]([^'"]{200,})['"]`)
+ matches := longStringRegex.FindAllStringSubmatch(content, -1)
+
+ for _, match := range matches {
+ if len(match) > 1 {
+ // 先进行预检查,排除明显的JavaScript代码
+ candidate := match[1]
+ if w.jsKeywordCount(candidate) > 20 {
+ continue // 跳过JavaScript代码过多的候选内容
+ }
+
+ // 尝试解码可能的URL编码内容
+ decoded := candidate
+ for i := 0; i < 3; i++ { // 最多解码3次
+ if d, err := url.QueryUnescape(decoded); err == nil && d != decoded {
+ decoded = d
+ } else {
+ break
+ }
+ }
+
+ // 检查是否包含常见的HTML标签且中文密度足够高
+ hasHTMLTags := strings.Contains(decoded, "") || strings.Contains(decoded, "
") || strings.Contains(decoded, "<p>") ||
+ strings.Contains(decoded, "<div") || strings.Contains(decoded, "<br>")
+
+ // 计算解码后的中文密度
+ density := w.calculateChineseDensity(decoded)
+
+ // 同时满足有HTML标签和足够的中文密度
+ if hasHTMLTags && density > 0.3 {
+ rawContent = decoded
+ break
+ }
+ }
+ }
+ }
+
+ // 预处理rawContent(如果已找到)
+ if rawContent != "" {
+ // 首先进行多次URL解码,处理嵌套编码
+ for i := 0; i < 3; i++ { // 最多解码3次
+ if decoded, err := url.QueryUnescape(rawContent); err == nil && decoded != rawContent {
+ rawContent = decoded
+ } else {
+ break
+ }
+ }
+
+ // 替换HTML实体
+ rawContent = strings.ReplaceAll(rawContent, "<", "<")
+ rawContent = strings.ReplaceAll(rawContent, ">", ">")
+ rawContent = strings.ReplaceAll(rawContent, """, "\"")
+ rawContent = strings.ReplaceAll(rawContent, "&", "&")
+ rawContent = strings.ReplaceAll(rawContent, "\\n", "")
+ rawContent = strings.ReplaceAll(rawContent, "\\r", "")
+ rawContent = strings.ReplaceAll(rawContent, "\\t", "")
+ rawContent = strings.ReplaceAll(rawContent, "\\\"", "\"") // 处理转义的双引号
+ }
+
+ // 如果找到了内容,进行清理
+ if rawContent != "" {
+ // 移除HTML标签
+ tagRegex := regexp.MustCompile(`<[^>]*>`)
+ cleanText := tagRegex.ReplaceAllString(rawContent, "")
+
+ // 应用JavaScript大段过滤
+ cleanText = w.filterJavaScriptBlocks(cleanText)
+
+ // 移除多余的空白字符
+ spaceRegex := regexp.MustCompile(`\s+`)
+ cleanText = spaceRegex.ReplaceAllString(cleanText, " ")
+ cleanText = strings.TrimSpace(cleanText)
+
+ // 检查是否包含过多的JavaScript代码特征
+ jsCount := w.jsKeywordCount(cleanText)
+ chineseDensity := w.calculateChineseDensity(cleanText)
+
+ // 移除明显的JavaScript代码块 - 增强版,特别针对微信平台代码
+ // 1. 移除WX_BJ_REPORT相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*if\s*\(WX_BJ_REPORT\)[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 2. 移除BadJs相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*if\s*\(BadJs\)[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 3. 移除window.logs相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*window\.logs\s*\..*?;`).ReplaceAllString(cleanText, "")
+
+ // 4. 移除__moon_initcallback相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*\.__moon_initcallback\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+
+ // 5. 移除try-catch块
+ cleanText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+
+ // 6. 移除函数定义
+ cleanText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+
+ // 7. 移除IIFE函数
+ cleanText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(cleanText, "")
+
+ // 8. 移除变量声明
+ cleanText = regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 9. 移除控制流语句
+ cleanText = regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*for\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*while\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+
+ // 10. 移除JSON和数组
+ cleanText = regexp.MustCompile(`(?s)\s*\{\s*"[^"]*"\s*:\s*[^}]*\}\s*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*\[\s*[^\]]*\]\s*`).ReplaceAllString(cleanText, "")
+
+ // 11. 移除网络请求相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*new\s+XMLHttpRequest\(\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*xmlobj\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*fetch\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*axios\s*\.[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 12. 移除正则表达式和调试代码
+ cleanText = regexp.MustCompile(`(?s)\s*new\s+RegExp\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*\/[^/]*\/[gimuy]*`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*console\.[a-z]+\([^)]*\);`).ReplaceAllString(cleanText, "")
+
+ // 13. 移除事件处理相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*document\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*window\.addEventListener\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*on\$1\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+
+ // 14. 移除定时器相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*setTimeout\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*setInterval\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 15. 移除微信特有的API调用
+ cleanText = regexp.MustCompile(`(?s)\s*WeixinJSBridge\s*\..*?;`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*wx\.\w+\s*\([^)]*\)[^;]*;`).ReplaceAllString(cleanText, "")
+
+ // 16. 移除logsPagetime相关代码
+ cleanText = regexp.MustCompile(`(?s)\s*logsPagetime\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*var\s+logsPagetime\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(cleanText, "")
+
+ // 17. 移除特定的微信错误处理代码
+ cleanText = regexp.MustCompile(`(?s)\s*\.error\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*\.warn\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*\.info\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+
+ // 18. 移除微信平台特定的方法调用
+ cleanText = regexp.MustCompile(`(?s)\s*document\.write\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+ cleanText = regexp.MustCompile(`(?s)\s*document\.writeln\s*\([^)]*\);`).ReplaceAllString(cleanText, "")
+
+ // 如果JavaScript关键词较少且中文密度较高,可能是有效的文章内容
+ if (jsCount < 5 || chineseDensity > 0.3) && len(cleanText) > 50 {
+ // 按句子或段落分割,避免一行过长
+ if len(cleanText) > 0 {
+ // 首先尝试按段落分割
+ paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(cleanText, -1)
+
+ // 重组段落,保留标点符号
+ punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(cleanText, -1)
+
+ for i := 0; i < len(paragraphs); i++ {
+ if paragraphs[i] != "" {
+ if i < len(punctuations) {
+ paragraphs[i] += punctuations[i]
+ }
+ // 只添加非空且长度合理的段落(避免添加JavaScript片段)
+ paragraph := strings.TrimSpace(paragraphs[i])
+ // 增强过滤条件,避免JavaScript片段,同时考虑中文密度
+ paraDensity := w.calculateChineseDensity(paragraph)
+ paraJsCount := w.jsKeywordCount(paragraph)
+ if len(paragraph) > 15 &&
+ !strings.Contains(paragraph, "{") &&
+ !strings.Contains(paragraph, "}") &&
+ !strings.Contains(paragraph, "function") &&
+ !strings.Contains(paragraph, "var") &&
+ !strings.Contains(paragraph, "window.") &&
+ !strings.Contains(paragraph, "WX_BJ_REPORT") &&
+ !strings.Contains(paragraph, "BadJs") &&
+ (paraJsCount < 2 || paraDensity > 0.4) { // 根据中文密度调整JavaScript关键词容忍度
+ textContent = append(textContent, paragraph)
+ }
+ }
+ }
+
+ // 如果没有成功分割成段落,直接添加整个文本
+ if len(textContent) == 0 && len(cleanText) > 50 && (w.jsKeywordCount(cleanText) < 3 || chineseDensity > 0.5) {
+ textContent = append(textContent, cleanText)
+ }
+ }
+ }
+ }
+
+ // 最后的备选方案:尝试从整个页面中提取非JavaScript的文本内容
+ if len(textContent) == 0 {
+ // 移除所有HTML标签
+ allText := regexp.MustCompile(`<[^>]*>`).ReplaceAllString(content, "")
+
+ // 应用增强的JavaScript代码块过滤
+ allText = w.filterJavaScriptBlocks(allText)
+
+ // 进一步清理特定模式
+ allText = regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*__moon_initcallback\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*try\s*{[^}]*}\s*catch\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`).ReplaceAllString(allText, "")
+ allText = regexp.MustCompile(`(?s)\s*\(function\s*\([^)]*\)\s*{[^}]*}\)\s*\(\);`).ReplaceAllString(allText, "")
+
+ // 使用中文文本提取作为最后手段
+ allText = w.extractChineseText(allText)
+
+ // 清理空白字符
+ spaceRegex := regexp.MustCompile(`\s+`)
+ allText = spaceRegex.ReplaceAllString(allText, " ")
+ allText = strings.TrimSpace(allText)
+
+ // 尝试按句子分割
+ if allText != "" && len(allText) > 100 {
+ sentences := regexp.MustCompile(`[。!?.!?]\s*`).Split(allText, -1)
+ punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(allText, -1)
+
+ for i := 0; i < len(sentences); i++ {
+ if sentences[i] != "" {
+ if i < len(punctuations) {
+ sentences[i] += punctuations[i]
+ }
+ paragraph := strings.TrimSpace(sentences[i])
+ // 过滤掉JavaScript代码和过短的内容,同时考虑中文密度
+ if len(paragraph) > 20 && (w.jsKeywordCount(paragraph) < 3 || w.calculateChineseDensity(paragraph) > 0.4) {
+ textContent = append(textContent, paragraph)
+ }
+ }
+ }
+ }
+ }
+
+ // 对提取的内容应用最终过滤,确保只保留真正的文章正文
+ filteredContent := w.finalContentFilter(textContent)
+ return createTime, title, commentID, reqID, w.extractAuthor(content), filteredContent
+}
+
+// calculateChineseDensity 计算文本中中文字符的密度
+func (w *WechatCrawler) calculateChineseDensity(text string) float64 {
+ if len(text) == 0 {
+ return 0
+ }
+ // 使用正确的Go语言Unicode范围表示法
+ chineseCount := 0
+ for _, char := range text {
+ if char >= 0x4e00 && char <= 0x9fa5 {
+ chineseCount++
+ }
+ }
+ return float64(chineseCount) / float64(len(text))
+}
+
+// 过滤大段JavaScript代码
+func (w *WechatCrawler) filterJavaScriptBlocks(text string) string {
+ // 移除常见的JavaScript代码块模式
+ patterns := []string{
+ // 移除JavaScript函数声明
+ `(?s)function\s+[a-zA-Z_$][\w$]*\s*\([^)]*\)\s*{[^}]*}`,
+ // 移除匿名函数
+ `(?s)\(\s*function\s*\([^)]*\)\s*{[^}]*}\s*\)\s*\(\s*\)`,
+ // 移除对象字面量
+ `(?s)\{[^}]*\}`,
+ // 移除数组字面量
+ `(?s)\[[^\]]*\]`,
+ // 移除注释
+ `//[^\n]*`,
+ `/\*[^*]*\*/`,
+ // 移除微信特定错误报告代码
+ `(?s)WX_BJ_REPORT[^;]*;`,
+ `(?s)BadJs[^;]*;`,
+ `(?s)window\.[a-zA-Z_$][\w$]*[^;]*;`,
+ // 移除XMLHttpRequest相关代码
+ `(?s)xmlobj[^;]*;`,
+ `(?s)new\s+Image\([^)]*\)`,
+ `(?s)setRequestHeader[^;]*;`,
+ // 移除正则表达式
+ `/[^/]*\/[gimuy]*`,
+ }
+
+ result := text
+ for _, pattern := range patterns {
+ regex, err := regexp.Compile(pattern)
+ if err == nil {
+ result = regex.ReplaceAllString(result, "")
+ }
+ }
+
+ return result
+}
+
+// 提取纯中文文本
+func (w *WechatCrawler) extractChineseText(text string) string {
+ var result []rune
+ for _, char := range text {
+ // 保留中文、标点符号、数字和英文字母,去除特殊字符
+ if (char >= 0x4e00 && char <= 0x9fa5) ||
+ unicode.IsPunct(char) ||
+ unicode.IsDigit(char) ||
+ unicode.IsLetter(char) ||
+ char == '\n' || char == ' ' {
+ result = append(result, char)
+ }
+ }
+ return string(result)
+}
+
// finalContentFilter performs a last-pass cleanup on extracted text: it
// strips residual JavaScript, splits the remainder into sentences, and
// keeps only paragraphs that look like genuine article prose (long enough,
// Chinese-dense, few JS keywords). Falls back to accepting the whole text
// under looser rules when no individual paragraph qualifies.
func (w *WechatCrawler) finalContentFilter(text string) string {
	// 1. Remove obvious JavaScript code blocks.
	// Remove WX_BJ_REPORT-related code (WeChat error-reporting beacon).
	wxCodeRegex := regexp.MustCompile(`(?s)\s*WX_BJ_REPORT\s*\([^)]*\);|\s*var\s+WX_BJ_REPORT\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(WX_BJ_REPORT\)[^;]*;`)
	text = wxCodeRegex.ReplaceAllString(text, "")

	// Remove BadJs-related code (WeChat JS error reporter).
	badJsRegex := regexp.MustCompile(`(?s)\s*BadJs\s*\([^)]*\);|\s*var\s+BadJs\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*if\s*\(BadJs\)[^;]*;`)
	text = badJsRegex.ReplaceAllString(text, "")

	// Remove window.logs assignments and method calls.
	logsRegex := regexp.MustCompile(`(?s)\s*window\.logs\s*=\s*\[.*?\];|\s*window\.logs\s*\..*?;`)
	text = logsRegex.ReplaceAllString(text, "")

	// Remove function definitions (named, var-assigned, bare-assigned).
	funcRegex := regexp.MustCompile(`(?s)\s*function\s+[^(]*\([^)]*\)\s*{[^}]*}\s*|\s*var\s+[^=]*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*|\s*[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*function\s*\([^)]*\)\s*{[^}]*}\s*`)
	text = funcRegex.ReplaceAllString(text, "")

	// Remove variable declarations (var/let/const/window.*).
	varRegex := regexp.MustCompile(`(?s)\s*var\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*{[^}]*}\s*;?|\s*let\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*const\s+[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;|\s*window\.[a-zA-Z_$][a-zA-Z0-9_$]*\s*=\s*[^;]*;`)
	text = varRegex.ReplaceAllString(text, "")

	// Remove control-flow statements (if/for/while with simple bodies).
	flowRegex := regexp.MustCompile(`(?s)\s*if\s*\([^)]*\)\s*{[^}]*}\s*|\s*for\s*\([^)]*\)\s*{[^}]*}\s*|\s*while\s*\([^)]*\)\s*{[^}]*}\s*`)
	text = flowRegex.ReplaceAllString(text, "")

	// 2. Extract real article paragraphs: split on CJK/ASCII sentence
	// terminators, and capture the matched punctuation separately so it
	// can be re-attached to each sentence below.
	paragraphs := regexp.MustCompile(`[。!?.!?]\s*`).Split(text, -1)
	punctuations := regexp.MustCompile(`[。!?.!?]\s*`).FindAllString(text, -1)

	var validParagraphs []string
	for i := 0; i < len(paragraphs); i++ {
		if paragraphs[i] != "" {
			paragraph := paragraphs[i]
			if i < len(punctuations) {
				paragraph += punctuations[i]
			}
			paragraph = strings.TrimSpace(paragraph)

			// Per-paragraph features driving the filter below.
			paraDensity := w.calculateChineseDensity(paragraph)
			paraJsCount := w.jsKeywordCount(paragraph)
			chineseCount := 0
			for _, char := range paragraph {
				if char >= 0x4e00 && char <= 0x9fa5 {
					chineseCount++
				}
			}

			// Strict filter rules.
			// NOTE(review): if calculateChineseDensity divides by byte
			// length, pure-Chinese UTF-8 text caps near 1/3 and the 0.4
			// threshold below is unreachable — confirm it counts runes.
			if len(paragraph) > 25 && // long enough to be prose (bytes)
				!strings.Contains(paragraph, "{") &&
				!strings.Contains(paragraph, "}") &&
				!strings.Contains(paragraph, "function") &&
				!strings.Contains(paragraph, "var") &&
				!strings.Contains(paragraph, "window.") &&
				!strings.Contains(paragraph, "WX_BJ_REPORT") &&
				!strings.Contains(paragraph, "BadJs") &&
				chineseCount > 15 && // at least 15 Chinese characters
				paraDensity > 0.4 && // Chinese density above 40%
				paraJsCount < 3 { // fewer than 3 JS keywords
				validParagraphs = append(validParagraphs, paragraph)
			}
		}
	}

	// 3. If no paragraph survived, fall back to a looser whole-text check.
	if len(validParagraphs) == 0 {
		// Evaluate the entire text as one candidate.
		overallDensity := w.calculateChineseDensity(text)
		overallJsCount := w.jsKeywordCount(text)
		overallChineseCount := 0
		for _, char := range text {
			if char >= 0x4e00 && char <= 0x9fa5 {
				overallChineseCount++
			}
		}

		// Loose rule: very Chinese-dense text with few JS keywords.
		if overallDensity > 0.6 && overallJsCount < 5 && overallChineseCount > 100 {
			validParagraphs = append(validParagraphs, text)
		}
	}

	return strings.Join(validParagraphs, "\n\n")
}
+
+// jsKeywordCount 计算文本中JavaScript关键词的数量 - 增强版
+func (w *WechatCrawler) jsKeywordCount(text string) int {
+ count := 0
+ // 新增加的高优先级过滤关键词
+ highPriorityKeywords := []string{
+ "logs = ", "window.", "LANG = ", "extInfo:", "pagetime[",
+ "BadJs;", "sec_open=", "xmlobj = ", "addEventListener",
+ "new Image()", "setRequestHeader", "onreadystatechange",
+ "var ", "let ", "const ", "function ", "return ",
+ }
+
+ // 基础JavaScript关键词
+ basicKeywords := []string{
+ "function", "var", "let", "const", "if(", "else", "for(", "while(",
+ "return", "setTimeout", "setInterval", "WeixinJSBridge", "JSON",
+ "console", "document", "window", "try{", "catch(", "throw",
+ }
+
+ // 微信平台特定关键词
+ wechatKeywords := []string{
+ "WX_BJ_REPORT", "BadJs", "__moon_initcallback", "logsPagetime",
+ "WeixinJSBridge", "wx.", "document.write", "document.writeln",
+ // 错误处理关键词
+ ".error(", ".warn(", ".info(", ".debug(",
+ // 网络请求关键词
+ "XMLHttpRequest", "fetch(", "axios.", "xmlobj.",
+ }
+
+ lowerText := strings.ToLower(text)
+ // 计算高优先级关键词数量(权重更高)
+ for _, keyword := range highPriorityKeywords {
+ count += strings.Count(lowerText, strings.ToLower(keyword)) * 3
+ }
+
+ // 计算微信平台特定关键词数量
+ for _, keyword := range wechatKeywords {
+ count += strings.Count(lowerText, strings.ToLower(keyword)) * 2
+ }
+
+ // 计算基础JavaScript关键词数量
+ for _, keyword := range basicKeywords {
+ count += strings.Count(lowerText, strings.ToLower(keyword))
+ }
+ return count
+}
+
+// extractAuthor 提取文章作者信息
+func (w *WechatCrawler) extractAuthor(content string) string {
+ authorPatterns := []string{
+ `var author\s*=\s*['"](.*?)['"]`,
+ `"author"\s*:\s*['"](.*?)['"]`,
+ `window\.author\s*=\s*['"](.*?)['"]`,
+ ` 1 {
+ author := match[1]
+ // 尝试解码HTML实体和URL编码
+ author = strings.ReplaceAll(author, """, "\"")
+ author = strings.ReplaceAll(author, "&", "&")
+ author = strings.ReplaceAll(author, "<", "<")
+ author = strings.ReplaceAll(author, ">", ">")
+ if decoded, err := url.QueryUnescape(author); err == nil {
+ author = decoded
+ }
+ return author
+ }
+ }
+ return ""
+}
+
+// GetArticleStats 获取文章统计信息
+func (w *WechatCrawler) GetArticleStats(link string, title string, commentID string, reqID string, createTime string) (map[string]string, error) {
+ // 解析链接参数
+ mid := ""
+ sn := ""
+ idx := ""
+
+ // 尝试从链接中提取参数
+ midRegex := regexp.MustCompile(`mid=(.*?)&`)
+ if match := midRegex.FindStringSubmatch(link); len(match) > 1 {
+ mid = match[1]
+ }
+
+ snRegex := regexp.MustCompile(`sn=(.*?)&`)
+ if match := snRegex.FindStringSubmatch(link); len(match) > 1 {
+ sn = match[1]
+ }
+
+ idxRegex := regexp.MustCompile(`idx=(.*?)&`)
+ if match := idxRegex.FindStringSubmatch(link); len(match) > 1 {
+ idx = match[1]
+ }
+
+ // 生成随机r值
+ r := fmt.Sprintf("0.%d", time.Now().UnixNano()%10000000000000000)
+
+ // 构建请求URL
+ detailURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&fasttmplajax=1&f=json&uin=%s&key=%s&pass_ticket=%s&__biz=%s",
+ w.uin, w.key, w.passTicket, w.biz)
+
+ // 构建请求数据
+ data := map[string]string{
+ "r": r,
+ "sn": sn,
+ "mid": mid,
+ "idx": idx,
+ "req_id": reqID,
+ "title": title,
+ "comment_id": commentID,
+ "appmsg_type": "9",
+ "__biz": w.biz,
+ "pass_ticket": w.passTicket,
+ "abtest_cookie": "",
+ "devicetype": "Windows 7 x64",
+ "version": "63090b13",
+ "is_need_ticket": "0",
+ "is_need_ad": "0",
+ "is_need_reward": "0",
+ "both_ad": "0",
+ "reward_uin_count": "0",
+ "send_time": "",
+ "msg_daily_idx": "1",
+ "is_original": "0",
+ "is_only_read": "1",
+ "scene": "38",
+ }
+
+ // 发送POST请求
+ resp, err := w.client.R().SetFormData(data).Post(detailURL)
+ if err != nil {
+ return nil, fmt.Errorf("请求统计信息失败: %v", err)
+ }
+
+ // 解析响应
+ var result map[string]interface{}
+ err = json.Unmarshal([]byte(resp.String()), &result)
+ if err != nil {
+ return nil, fmt.Errorf("解析统计信息失败: %v", err)
+ }
+
+ // 提取统计数据
+ stats := map[string]string{
+ "read_num": "0",
+ "old_like_num": "0",
+ "share_num": "0",
+ "show_read": "0",
+ }
+
+ // 从返回的JSON中提取所需数据
+ if appMsgExtInfo, ok := result["appmsgstat"].(map[string]interface{}); ok {
+ if readNum, ok := appMsgExtInfo["read_num"].(float64); ok {
+ stats["read_num"] = fmt.Sprintf("%.0f", readNum)
+ }
+ if likeNum, ok := appMsgExtInfo["old_like_num"].(float64); ok {
+ stats["old_like_num"] = fmt.Sprintf("%.0f", likeNum)
+ }
+ if shareNum, ok := appMsgExtInfo["share_num"].(float64); ok {
+ stats["share_num"] = fmt.Sprintf("%.0f", shareNum)
+ }
+ if showRead, ok := appMsgExtInfo["show_read"].(float64); ok {
+ stats["show_read"] = fmt.Sprintf("%.0f", showRead)
+ }
+ }
+
+ return stats, nil
+}
+
+// GetArticleComments 获取文章评论
+func (w *WechatCrawler) GetArticleComments(commentID string) ([]string, []string, error) {
+ if commentID == "" {
+ return []string{}, []string{}, nil
+ }
+
+ // 构建评论请求URL
+ commentURL := fmt.Sprintf(
+ "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=%s&appmsgid=2247491372&idx=1&comment_id=%s&offset=0&limit=100&uin=%s&key=%s&pass_ticket=%s&wxtoken=&devicetype=Windows+10&clientversion=62060833&appmsg_token=",
+ w.biz, commentID, w.uin, w.key, w.passTicket)
+
+ // 发送请求
+ resp, err := w.client.R().Get(commentURL)
+ if err != nil {
+ return []string{}, []string{}, fmt.Errorf("获取评论失败: %v", err)
+ }
+
+ // 解析响应
+ var result map[string]interface{}
+ err = json.Unmarshal([]byte(resp.String()), &result)
+ if err != nil {
+ return []string{}, []string{}, fmt.Errorf("解析评论失败: %v", err)
+ }
+
+ // 提取评论和点赞数
+ var comments []string
+ var commentLikes []string
+
+ // 简化实现,在实际项目中需要根据返回的JSON结构正确提取数据
+ return comments, commentLikes, nil
+}
+
// GetOfficialAccountLinkFromArticle derives the official-account home-page
// URL from an article URL. It first looks for a __biz query parameter in
// the URL itself; failing that, it downloads the article and scrapes biz
// from the page content. As a side effect it stores the biz on w.biz.
func (w *WechatCrawler) GetOfficialAccountLinkFromArticle(articleURL string) (string, error) {
	// First try extracting __biz from the URL (compatible with old format).
	bizRegex := regexp.MustCompile(`__biz=([^&]+)`)
	match := bizRegex.FindStringSubmatch(articleURL)
	if len(match) >= 2 {
		biz := match[1]
		// Remember the biz on this crawler instance.
		w.biz = biz

		// Build the official-account home-page link.
		homePageURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", biz)
		return homePageURL, nil
	}

	// URL extraction failed: fall back to scraping the article body.
	content, err := w.GetOneArticle(articleURL)
	if err != nil {
		return "", fmt.Errorf("获取文章内容失败: %v", err)
	}

	// Scrape biz from the page's inline script.
	// NOTE(review): the pattern has no closing quote before `;`, so the
	// capture can include a trailing `"` or a ` || ""` JS fallback — the
	// cleanup below strips both; confirm this matches real page markup.
	contentBizRegex := regexp.MustCompile(`var biz = "(.*?);`)
	contentMatch := contentBizRegex.FindStringSubmatch(content)
	if len(contentMatch) < 2 {
		// Try an alternative, query-string-style occurrence of biz.
		contentBizRegex2 := regexp.MustCompile(`__biz=(.*?)&`)
		contentMatch = contentBizRegex2.FindStringSubmatch(content)
		if len(contentMatch) < 2 {
			return "", fmt.Errorf("无法从文章链接和内容中提取公众号信息")
		}
	}

	// Clean the captured biz value: drop JS fallbacks and stray quotes.
	biz := contentMatch[1]
	biz = strings.ReplaceAll(biz, " || ", "")
	biz = strings.ReplaceAll(biz, "\"", "")

	// Remember the biz on this crawler instance.
	w.biz = biz

	// Build the official-account home-page link.
	homePageURL := fmt.Sprintf("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124", biz)
	return homePageURL, nil
}
+
+// GetArticleList 获取公众号所有文章列表
+func (w *WechatCrawler) GetArticleList() ([][]string, error) {
+ var allArticles [][]string
+ offset := 0
+
+ for {
+ fmt.Printf("正在获取第%d页文章...\n", offset/10+1)
+ result, err := w.GetNextList(offset)
+ if err != nil {
+ return allArticles, fmt.Errorf("获取文章列表失败: %v", err)
+ }
+
+ // 检查是否还有更多文章
+ mFlag, ok := result["m_flag"].(float64)
+ if !ok || mFlag == 0 {
+ break
+ }
+
+ // 获取当前页的文章列表
+ passageList, ok := result["passage_list"].([][]string)
+ if !ok {
+ return allArticles, fmt.Errorf("文章列表格式错误")
+ }
+
+ // 添加到总列表
+ allArticles = append(allArticles, passageList...)
+
+ // 增加偏移量
+ offset += 10
+
+ // 随机延迟,避免被封禁
+ time.Sleep(time.Duration(2000+offset) * time.Millisecond)
+ }
+
+ // 转换链接
+ transformedArticles := w.TransformLinks(allArticles)
+
+ fmt.Printf("共获取到%d篇文章\n", len(transformedArticles))
+ return transformedArticles, nil
+}
+
+// SaveArticleListToExcel 保存文章列表到Excel
+func (w *WechatCrawler) SaveArticleListToExcel(officialPath string, articleList [][]string, nickname string) error {
+ // 确保目录存在
+ if err := os.MkdirAll(officialPath, 0755); err != nil {
+ return fmt.Errorf("创建目录失败: %v", err)
+ }
+
+ // 保存转换后的链接文件
+ filePath := fmt.Sprintf("%s/文章列表(article_list)_直连链接.txt", officialPath)
+ var content strings.Builder
+
+ // 写入标题行
+ content.WriteString("序号,创建时间,标题,链接\n")
+
+ // 写入文章列表
+ for i, article := range articleList {
+ content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, article[1], article[2], article[3]))
+ }
+
+ // 写入文件
+ err := os.WriteFile(filePath, []byte(content.String()), 0644)
+ if err != nil {
+ return fmt.Errorf("保存文章列表失败: %v", err)
+ }
+
+ fmt.Printf("文章列表已保存到: %s\n", filePath)
+ return nil
+}
+
+// TransformLinks 转换文章链接,将带有amp;的链接转换为直接可访问的链接
+func (w *WechatCrawler) TransformLinks(articleList [][]string) [][]string {
+ transformedList := make([][]string, 0, len(articleList))
+
+ for _, article := range articleList {
+ if len(article) >= 4 {
+ // 转换链接,移除amp;
+ transformedLink := strings.Replace(article[3], "amp;", "", -1)
+ transformedArticle := []string{article[0], article[1], article[2], transformedLink}
+ transformedList = append(transformedList, transformedArticle)
+ }
+ }
+
+ return transformedList
+}
+
+// ReadArticleLinksFromExcel 从Excel读取文章链接
+func (w *WechatCrawler) ReadArticleLinksFromExcel(filePath string) ([]string, error) {
+ // 简化实现,返回空列表
+ return []string{}, nil
+}
+
+// GetArticleDetail 获取单篇文章的详细信息
+func (w *WechatCrawler) GetArticleDetail(link string) (*ArticleDetail, error) {
+ // 获取文章内容
+ content, err := w.GetOneArticle(link)
+ if err != nil {
+ return nil, err
+ }
+
+ // 提取文章信息
+ createTime, title, commentID, reqID, _, textContent := w.ExtractArticleInfo(content)
+
+ // 提取公众号名称
+ accountName := w.ExtractOfficialAccountName(content)
+
+ // 获取统计信息
+ stats, err := w.GetArticleStats(link, title, commentID, reqID, createTime)
+ if err != nil {
+ // 如果获取统计信息失败,使用默认值
+ stats = map[string]string{
+ "read_num": "0",
+ "old_like_num": "0",
+ "share_num": "0",
+ "show_read": "0",
+ }
+ }
+
+ // 获取评论信息
+ comments, commentLikes, _ := w.GetArticleComments(commentID)
+
+ // 构建文章详情
+ detail := &ArticleDetail{
+ LocalTime: time.Now().Format("2006-01-02 15:04:05"),
+ CreateTime: createTime,
+ Title: title,
+ OfficialName: accountName,
+ Link: link,
+ Content: textContent,
+ ReadCount: stats["read_num"],
+ LikeCount: stats["old_like_num"],
+ ShareCount: stats["share_num"],
+ ShowRead: stats["show_read"],
+ Comments: comments,
+ CommentLikes: commentLikes,
+ CommentID: commentID,
+ }
+
+ return detail, nil
+}
+
+// GetDetailList 批量获取文章详情
+func (w *WechatCrawler) GetDetailList(articleList [][]string, officialPath string) error {
+ // 确保目录存在
+ if err := os.MkdirAll(officialPath, 0755); err != nil {
+ return fmt.Errorf("创建目录失败: %v", err)
+ }
+
+ successCount := 0
+ errorCount := 0
+ errorLinks := [][]string{}
+
+ for i, article := range articleList {
+ if len(article) < 4 {
+ continue
+ }
+
+ link := article[3]
+ title := article[2]
+
+ fmt.Printf("正在处理第%d篇文章: %s\n", i+1, title)
+
+ // 获取文章详情
+ detail, err := w.GetArticleDetail(link)
+ if err != nil {
+ fmt.Printf("获取文章详情失败: %v\n", err)
+ errorCount++
+ errorLinks = append(errorLinks, article)
+ continue
+ }
+
+ // 保存文章详情 - 确保使用文章标题作为文件名
+ filePath := fmt.Sprintf("%s/%s_文章详情.txt", officialPath, detail.Title)
+ if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
+ fmt.Printf("保存文章详情失败: %v\n", err)
+ errorCount++
+ errorLinks = append(errorLinks, article)
+ continue
+ }
+
+ successCount++
+ fmt.Printf("文章详情保存成功: %s\n", detail.Title)
+
+ // 随机延迟,避免被封禁
+ delayTime := 3000 + i*100 // 3秒基础延迟,递增
+ time.Sleep(time.Duration(delayTime) * time.Millisecond)
+ }
+
+ // 保存错误链接
+ if len(errorLinks) > 0 {
+ errorPath := fmt.Sprintf("%s/问题链接(error_links).txt", officialPath)
+ var content strings.Builder
+ content.WriteString("序号,创建时间,标题,链接\n")
+ for i, link := range errorLinks {
+ content.WriteString(fmt.Sprintf("%d,%s,%s,%s\n", i+1, link[1], link[2], link[3]))
+ }
+ err := os.WriteFile(errorPath, []byte(content.String()), 0644)
+ if err != nil {
+ fmt.Printf("保存错误链接失败: %v\n", err)
+ }
+ }
+
+ fmt.Printf("文章详情获取完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
+ return nil
+}
+
+// SaveArticleDetailToExcel 保存文章详情到Excel
+func (c *WechatCrawler) SaveArticleDetailToExcel(article *ArticleDetail, filePath string) error {
+ // 简化实现,保存为文本文件
+ var content strings.Builder
+
+ content.WriteString(fmt.Sprintf("本地创建时间: %s\n", article.LocalTime))
+ content.WriteString(fmt.Sprintf("文章发布时间: %s\n", article.CreateTime))
+ content.WriteString(fmt.Sprintf("公众号名称: %s\n", article.OfficialName))
+ content.WriteString(fmt.Sprintf("文章标题: %s\n", article.Title))
+ content.WriteString(fmt.Sprintf("文章链接: %s\n", article.Link))
+ content.WriteString(fmt.Sprintf("阅读量: %s\n", article.ReadCount))
+ content.WriteString(fmt.Sprintf("点赞数: %s\n", article.LikeCount))
+ content.WriteString(fmt.Sprintf("转发数: %s\n", article.ShareCount))
+ content.WriteString(fmt.Sprintf("在看数: %s\n", article.ShowRead))
+ content.WriteString("\n文章内容:\n")
+
+ for _, line := range article.Content {
+ content.WriteString(line)
+ content.WriteString("\n")
+ }
+
+ // 写入文件
+ return os.WriteFile(filePath, []byte(content.String()), 0644)
+}
+
+// GetListArticleFromFile 根据公众号名称或文章链接,从文件中读取文章列表并下载内容
+func (w *WechatCrawler) GetListArticleFromFile(nameLink string, imgSaveFlag bool, contentSaveFlag bool) error {
+ // 1. 判断输入类型并获取公众号名称
+ nickname := ""
+ if strings.Contains(nameLink, "http") {
+ fmt.Println("检测到输入为链接,开始获取公众号名称")
+ // 从文章链接获取公众号信息
+ _, err := w.GetOfficialAccountLinkFromArticle(nameLink)
+ if err != nil {
+ return fmt.Errorf("获取公众号信息失败: %v", err)
+ }
+ // 获取公众号名称
+ nickname, err = w.GetOfficialAccountName()
+ if err != nil {
+ return fmt.Errorf("获取公众号名称失败: %v", err)
+ }
+ fmt.Printf("获取到公众号名称: %s\n", nickname)
+ } else {
+ fmt.Println("检测到输入为公众号名称")
+ nickname = nameLink
+ }
+
+ // 2. 构建文件路径
+ rootPath := "./data/"
+ officialNamesHead := "公众号----"
+ officialPath := rootPath + officialNamesHead + nickname
+ articleListPath := officialPath + "/文章列表(article_list)_直连链接.txt"
+
+ // 3. 检查文件是否存在
+ if _, err := os.Stat(articleListPath); os.IsNotExist(err) {
+ return fmt.Errorf("文件不存在,请检查目录文件: %s", articleListPath)
+ }
+
+ // 4. 读取文章链接列表
+ fileContent, err := os.ReadFile(articleListPath)
+ if err != nil {
+ return fmt.Errorf("读取文章列表文件失败: %v", err)
+ }
+
+ lines := strings.Split(string(fileContent), "\n")
+ var articleLinks []string
+
+ // 跳过标题行,提取链接
+ for i, line := range lines {
+ if i == 0 || line == "" {
+ continue
+ }
+ parts := strings.Split(line, ",")
+ if len(parts) >= 4 {
+ link := parts[3]
+ // 清理链接中的引号
+ link = strings.TrimSpace(link)
+ link = strings.Trim(link, "\"")
+ articleLinks = append(articleLinks, link)
+ }
+ }
+
+ fmt.Printf("成功读取到%d篇文章链接\n", len(articleLinks))
+
+ // 5. 遍历下载每篇文章
+ successCount := 0
+ errorCount := 0
+
+ for i, link := range articleLinks {
+ fmt.Printf("正在处理第%d篇文章,链接: %s\n", i+1, link)
+
+ // 获取文章详情
+ detail, err := w.GetArticleDetail(link)
+ if err != nil {
+ fmt.Printf("获取文章详情失败: %v\n", err)
+ errorCount++
+ continue
+ }
+
+ // 保存文章内容
+ if contentSaveFlag {
+ filePath := fmt.Sprintf("%s/%s_文章详情.txt", officialPath, detail.Title)
+ if err := w.SaveArticleDetailToExcel(detail, filePath); err != nil {
+ fmt.Printf("保存文章详情失败: %v\n", err)
+ errorCount++
+ continue
+ }
+ }
+
+ // TODO: 保存图片功能(如果需要)
+ if imgSaveFlag {
+ fmt.Println("图片保存功能暂未实现")
+ }
+
+ successCount++
+ fmt.Printf("第%d篇文章处理成功: %s\n", i+1, detail.Title)
+
+ // 添加延迟,避免被封
+ time.Sleep(3 * time.Second)
+ }
+
+ fmt.Printf("文章列表处理完成: 成功%d篇, 失败%d篇\n", successCount, errorCount)
+ return nil
+}