2025-12-2genxin

This commit is contained in:
2025-12-02 14:58:52 +08:00
parent 4fef65bd93
commit be0954828c
36 changed files with 3352 additions and 1638 deletions

View File

@@ -1,15 +1,21 @@
package main
import (
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/wechat-crawler/pkg/wechat"
)
// Response 统一响应结构
@@ -17,6 +23,7 @@ type Response struct {
Success bool `json:"success"`
Message string `json:"message"`
Data interface{} `json:"data,omitempty"`
Code int `json:"code,omitempty"`
}
// 任务状态
@@ -27,7 +34,28 @@ type TaskStatus struct {
Error string `json:"error,omitempty"`
}
// LoginRequest is the JSON body decoded by loginHandler.
type LoginRequest struct {
// Username is the account name; loginHandler rejects an empty value.
Username string `json:"username"`
// Password is forwarded unchanged to the user_cli.py "verify" command.
Password string `json:"password"`
}
// RegisterRequest is the JSON body decoded by registerHandler.
// All three fields are required; registerHandler rejects the request if any
// of them is empty.
type RegisterRequest struct {
Username string `json:"username"`
// Password is forwarded unchanged to the user_cli.py "create" command.
Password string `json:"password"`
Email string `json:"email"`
}
// Session is one logged-in user's entry in the in-memory session table.
type Session struct {
// Token is the value returned by generateToken at login; it is also the
// key under which the session is stored.
Token string
// UserID is taken from the "user_id" field of the verify script's output.
UserID int
// Expiry is set to 24 hours after login; handlers treat the session as
// invalid once this time has passed.
Expiry time.Time
}
// currentTask is the shared status record that the crawl handlers update
// (Running, Progress, Message) while they work.
// NOTE(review): the crawl handlers visible in this file mutate it without
// locking — confirm whether concurrent crawl requests are expected.
var currentTask = &TaskStatus{Running: false}
// sessions maps a login token to the Session created for it.
var sessions = make(map[string]*Session)
func main() {
// 启用CORS
@@ -36,10 +64,18 @@ func main() {
http.HandleFunc("/api/article/download", corsMiddleware(downloadArticleHandler))
http.HandleFunc("/api/article/list", corsMiddleware(getArticleListHandler))
http.HandleFunc("/api/article/batch", corsMiddleware(batchDownloadHandler))
http.HandleFunc("/api/article/detail", corsMiddleware(getArticleDetailHandler))
http.HandleFunc("/api/data/list", corsMiddleware(getDataListHandler))
http.HandleFunc("/api/task/status", corsMiddleware(getTaskStatusHandler))
http.HandleFunc("/api/download/", corsMiddleware(downloadFileHandler))
// 用户认证接口
http.HandleFunc("/api/user/register", corsMiddleware(registerHandler))
http.HandleFunc("/api/user/login", corsMiddleware(loginHandler))
http.HandleFunc("/api/user/logout", corsMiddleware(logoutHandler))
http.HandleFunc("/api/user/info", corsMiddleware(getUserInfoHandler))
http.HandleFunc("/api/user/update", corsMiddleware(updateUserHandler))
port := ":8080"
fmt.Println("===============================================")
fmt.Println(" 🚀 微信公众号文章爬虫 API 服务器")
@@ -58,7 +94,7 @@ func corsMiddleware(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Access-Control-Allow-Origin", "*")
w.Header().Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
w.Header().Set("Access-Control-Allow-Headers", "Content-Type")
w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization")
if r.Method == "OPTIONS" {
w.WriteHeader(http.StatusOK)
@@ -98,6 +134,9 @@ func handleRoot(w http.ResponseWriter, r *http.Request) {
<div class="endpoint">
<span class="method">POST</span> /api/article/list - 获取文章列表
</div>
<div class="endpoint">
<span class="method">POST</span> /api/article/detail - 获取文章详情(阅读量、点赞数、评论等)
</div>
<div class="endpoint">
<span class="method">POST</span> /api/article/batch - 批量下载文章
</div>
@@ -216,12 +255,12 @@ func getArticleListHandler(w http.ResponseWriter, r *http.Request) {
currentTask.Progress = 0
currentTask.Message = "正在获取文章列表..."
// 同步执行爬虫程序(功能3
// 同步执行爬虫程序(功能2获取文章列表
exePath := filepath.Join("..", "wechat-crawler.exe")
absPath, _ := filepath.Abs(exePath)
workDir, _ := filepath.Abs("..")
log.Printf("启动功能3: %s, 工作目录: %s", absPath, workDir)
log.Printf("启动功能2: %s, 工作目录: %s", absPath, workDir)
cmd := exec.Command(absPath)
cmd.Dir = workDir
@@ -242,8 +281,8 @@ func getArticleListHandler(w http.ResponseWriter, r *http.Request) {
return
}
// 发送选项"3"(功能3通过access_token获取文章列表
fmt.Fprintln(stdin, "3")
// 发送选项"2"(功能2通过access_token获取文章列表
fmt.Fprintln(stdin, "2")
fmt.Fprintln(stdin, req.AccessToken)
if req.Pages > 0 {
fmt.Fprintf(stdin, "%d\n", req.Pages)
@@ -445,6 +484,304 @@ func batchDownloadHandler(w http.ResponseWriter, r *http.Request) {
})
}
// getArticleDetailHandler serves /api/article/detail ("function 4"). It
// resolves the official account behind the supplied access token, fetches its
// article list and then asks the crawler to collect per-article details (read
// count, likes, comments, ...), passing ../data/<account name> as the output
// directory.
//
// Request body (JSON):
//   - access_token: URL or query string carrying __biz, uin, key and pass_ticket
//   - pages: number of list pages to fetch; values <= 0 fetch the full list
//
// The call is synchronous. Progress is published through currentTask, and
// every failure is answered with a Response whose Success field is false.
func getArticleDetailHandler(w http.ResponseWriter, r *http.Request) {
	var req struct {
		AccessToken string `json:"access_token"`
		Pages       int    `json:"pages"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		log.Printf("❌ 解析请求失败: %v", err)
		writeJSON(w, Response{Success: false, Message: "请求参数错误: " + err.Error()})
		return
	}
	if req.AccessToken == "" {
		log.Printf("❌ Access Token 为空")
		writeJSON(w, Response{Success: false, Message: "请输入Access Token URL"})
		return
	}

	// log.Print rather than log.Printf: the separator is built at run time and
	// must not be interpreted as a format string.
	log.Print("\n" + strings.Repeat("=", 60))
	log.Printf("📊 开始获取文章详情功能")
	log.Printf("接收到的 Access Token: %s", req.AccessToken[:min(100, len(req.AccessToken))])
	log.Printf("获取页数: %d (0表示全部)", req.Pages)

	currentTask.Running = true
	currentTask.Progress = 0
	currentTask.Message = "正在解析Access Token参数..."
	// One deferred reset instead of a copy on every exit path, so a future
	// early return cannot leave the task marked as running.
	defer func() { currentTask.Running = false }()

	// fail logs err under logMsg and sends userMsg plus the error text to the client.
	fail := func(logMsg, userMsg string, err error) {
		log.Printf("%s: %v", logMsg, err)
		writeJSON(w, Response{Success: false, Message: userMsg + ": " + err.Error()})
	}

	// Extract the crawler credentials from the access token.
	params, err := parseAccessToken(req.AccessToken)
	if err != nil {
		fail("❌ 解析Access Token失败", "Access Token 参数格式错误", err)
		return
	}
	// Only short prefixes of the credentials are logged.
	log.Printf("✅ 参数解析成功:")
	log.Printf(" - biz: %s", params["biz"][:min(20, len(params["biz"]))])
	log.Printf(" - uin: %s", params["uin"])
	log.Printf(" - key: %s", params["key"][:min(20, len(params["key"]))])
	log.Printf(" - pass_ticket: %s", params["pass_ticket"][:min(20, len(params["pass_ticket"]))])

	log.Printf("🔧 创建爬虫实例...")
	crawler, err := wechat.NewWechatCrawler(
		params["biz"],
		params["uin"],
		params["key"],
		params["pass_ticket"],
		nil,
	)
	if err != nil {
		fail("❌ 创建爬虫实例失败", "创建爬虫实例失败", err)
		return
	}
	log.Printf("✅ 爬虫实例创建成功")

	currentTask.Progress = 20
	currentTask.Message = "正在获取公众号名称..."

	log.Printf("📱 获取公众号名称...")
	officialName, err := crawler.GetOfficialAccountName()
	if err != nil {
		fail("❌ 获取公众号名称失败", "获取公众号名称失败", err)
		return
	}
	log.Printf("✅ 公众号名称: %s", officialName)

	currentTask.Progress = 40
	currentTask.Message = "正在获取文章列表..."

	log.Printf("📋 获取文章列表...")
	var articleList [][]string
	if req.Pages > 0 {
		// Fetch only the requested number of pages, one request per page.
		log.Printf("📄 限制获取前 %d 页", req.Pages)
		for offset := 0; offset < req.Pages; offset++ {
			result, pageErr := crawler.GetNextList(offset)
			if pageErr != nil {
				log.Printf("❌ 获取第 %d 页失败: %v", offset+1, pageErr)
				err = pageErr
				break
			}

			// m_flag may arrive as int or as float64; a missing or zero
			// value means there is no further data.
			var mFlag int
			switch v := result["m_flag"].(type) {
			case int:
				mFlag = v
			case float64:
				mFlag = int(v)
			}
			if mFlag == 0 {
				log.Printf(" 第 %d 页无更多数据", offset+1)
				break
			}

			log.Printf("📝 尝试从 result 中提取 passage_list...")
			articleList = appendPassageList(articleList, result["passage_list"])
			log.Printf("✅ 已获取第 %d/%d 页,当前累计 %d 篇文章", offset+1, req.Pages, len(articleList))

			// Pause between page requests, but not after the last one.
			if offset < req.Pages-1 {
				time.Sleep(2 * time.Second)
			}
		}
		// A failed page aborts the request below, so converting the links of
		// a partial list would be wasted work.
		if err == nil {
			log.Printf("🔗 转换文章链接...转换前共 %d 篇", len(articleList))
			articleList = crawler.TransformLinks(articleList)
			log.Printf("✅ 链接转换完成,共 %d 篇文章", len(articleList))
		}
	} else {
		log.Printf("📄 获取全部文章")
		articleList, err = crawler.GetArticleList()
	}
	if err != nil {
		fail("❌ 获取文章列表失败", "获取文章列表失败", err)
		return
	}
	if len(articleList) == 0 {
		log.Printf("⚠️ 文章列表为空")
		writeJSON(w, Response{Success: false, Message: "公众号文章列表为空,可能是 Access Token 无效或公众号无文章"})
		return
	}
	log.Printf("✅ 获取到 %d 篇文章", len(articleList))

	currentTask.Progress = 60
	currentTask.Message = fmt.Sprintf("正在获取文章详情 (0/%d)...", len(articleList))

	// Output directory: ../data/<account name>.
	dataDir := "../data"
	officialPath := filepath.Join(dataDir, officialName)
	log.Printf("📁 创建保存目录: %s", officialPath)
	if err := os.MkdirAll(officialPath, 0755); err != nil {
		fail("❌ 创建保存目录失败", "创建保存目录失败", err)
		return
	}

	log.Printf("📊 开始获取文章详情数据...")
	if err := crawler.GetDetailList(articleList, officialPath); err != nil {
		fail("❌ 获取文章详情失败", "获取文章详情失败", err)
		return
	}
	log.Printf("✅ 文章详情获取完成")

	currentTask.Running = false
	currentTask.Progress = 100
	currentTask.Message = "文章详情获取完成"

	// Count the detail files: look in the "文章详细" subdirectory first and
	// fall back to the account directory itself.
	detailFiles := detailFilesIn(filepath.Join(officialPath, "文章详细"))
	if len(detailFiles) == 0 {
		log.Printf("⚠️ 文章详细目录下未找到文件,检查主目录...")
		detailFiles = detailFilesIn(officialPath)
	}
	log.Printf("✅ 找到 %d 个文章详情文件", len(detailFiles))
	log.Print(strings.Repeat("=", 60) + "\n")

	writeJSON(w, Response{
		Success: true,
		Message: fmt.Sprintf("文章详情获取成功,共 %d 篇文章", len(detailFiles)),
		Data: map[string]interface{}{
			"account":      officialName,
			"articleCount": len(detailFiles),
			"path":         officialPath,
		},
	})
}

// appendPassageList appends the article rows found in raw (the "passage_list"
// value of one list page) to dst and returns the extended slice. Rows may be
// typed as [][]string or as []interface{} of []interface{}; in the latter
// form non-string cells become empty strings and non-slice rows are skipped.
// Any other type is logged and leaves dst unchanged.
func appendPassageList(dst [][]string, raw interface{}) [][]string {
	switch list := raw.(type) {
	case [][]string:
		log.Printf("✅ passage_list 提取成功([][]string包含 %d 个元素", len(list))
		for idx, row := range list {
			dst = append(dst, row)
			log.Printf("✅ 添加第 %d 篇文章: %v", idx+1, row)
		}
	case []interface{}:
		log.Printf("✅ passage_list 提取成功([]interface{}),包含 %d 个元素", len(list))
		for idx, item := range list {
			cells, ok := item.([]interface{})
			if !ok {
				log.Printf("❌ 第 %d 个 item 不是 []interface{} 类型,实际类型: %T", idx+1, item)
				continue
			}
			row := make([]string, len(cells))
			for i, cell := range cells {
				if s, ok := cell.(string); ok {
					row[i] = s
				}
			}
			dst = append(dst, row)
			log.Printf("✅ 添加第 %d 篇文章: %v", idx+1, row)
		}
	default:
		log.Printf("❌ passage_list 类型断言失败,实际类型: %T", raw)
	}
	return dst
}

// detailFilesIn returns the names of the regular entries in dir whose name
// ends in "_文章详情.txt". The scan is best effort: an unreadable or missing
// directory yields no names.
func detailFilesIn(dir string) []string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil
	}
	var names []string
	for _, entry := range entries {
		if !entry.IsDir() && strings.HasSuffix(entry.Name(), "_文章详情.txt") {
			names = append(names, entry.Name())
		}
	}
	return names
}
// min reports the smaller of a and b; when both are equal that shared value
// is returned.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// accessTokenParams lists, in validation order, the credentials that
// parseAccessToken extracts. The patterns are compiled once and start with a
// word boundary so that, for example, "exportkey=" is never mistaken for
// "key=" and "wxuin=" is never mistaken for "uin=".
var accessTokenParams = []struct {
	name    string         // key in the returned map
	query   string         // parameter name in the URL / query string
	pattern *regexp.Regexp // extractor used for input that is not a full URL
}{
	{"biz", "__biz", regexp.MustCompile(`\b__biz=([^&]+)`)},
	{"uin", "uin", regexp.MustCompile(`\buin=([^&]+)`)},
	{"key", "key", regexp.MustCompile(`\bkey=([^&]+)`)},
	{"pass_ticket", "pass_ticket", regexp.MustCompile(`\bpass_ticket=([^&]+)`)},
}

// parseAccessToken extracts the crawler credentials from accessToken and
// returns them under the keys "biz", "uin", "key" and "pass_ticket".
//
// Surrounding whitespace is ignored. Input that starts with http:// or
// https:// is parsed as a URL and the values are read (URL-decoded) from its
// query string. Any other input is scanned for "name=value" pairs, and the raw
// text up to the next '&' is taken as the value.
//
// An error is returned if the URL cannot be parsed or if any of the four
// parameters is missing or empty.
func parseAccessToken(accessToken string) (map[string]string, error) {
	// Pasted tokens often carry a trailing newline, which url.Parse rejects.
	accessToken = strings.TrimSpace(accessToken)
	params := make(map[string]string, len(accessTokenParams))

	if strings.HasPrefix(accessToken, "http://") || strings.HasPrefix(accessToken, "https://") {
		parsedURL, err := url.Parse(accessToken)
		if err != nil {
			return nil, fmt.Errorf("URL格式错误: %w", err)
		}
		query := parsedURL.Query()
		for _, p := range accessTokenParams {
			params[p.name] = query.Get(p.query)
		}
	} else {
		for _, p := range accessTokenParams {
			if match := p.pattern.FindStringSubmatch(accessToken); len(match) > 1 {
				params[p.name] = match[1]
			}
		}
	}

	// All four parameters are required; report the first one that is missing.
	for _, p := range accessTokenParams {
		if params[p.name] == "" {
			return nil, fmt.Errorf("缺少%s参数", p.query)
		}
	}
	return params, nil
}
// 获取数据列表
func getDataListHandler(w http.ResponseWriter, r *http.Request) {
dataDir := "../data"
@@ -541,3 +878,348 @@ func writeJSON(w http.ResponseWriter, data interface{}) {
w.Header().Set("Content-Type", "application/json; charset=utf-8")
json.NewEncoder(w).Encode(data)
}
// generateToken returns a new session token: 32 bytes read from crypto/rand,
// hex-encoded into a 64-character lowercase string.
//
// It panics if the random source reports an error. Continuing would hand out
// a token built from an unfilled (all-zero) buffer, which is predictable.
func generateToken() string {
	b := make([]byte, 32)
	if _, err := rand.Read(b); err != nil {
		panic("generateToken: reading random bytes: " + err.Error())
	}
	return hex.EncodeToString(b)
}
// callPythonScript runs "python <scriptPath> <args...>" with ../../database
// (resolved against the process working directory) as the command's working
// directory, and returns the script's combined stdout and stderr.
//
// When the command fails, the returned error wraps the underlying exec error
// and includes the captured output. The output is also returned as the first
// result so callers can inspect what the script printed.
//
// NOTE(review): args are passed on the command line, so secrets such as
// passwords may be visible in the process list — consider stdin instead.
func callPythonScript(scriptPath string, args ...string) (string, error) {
	dbDir, err := filepath.Abs(filepath.Join("..", "..", "database"))
	if err != nil {
		return "", fmt.Errorf("resolving database directory: %w", err)
	}

	cmdArgs := append([]string{scriptPath}, args...)
	cmd := exec.Command("python", cmdArgs...)
	cmd.Dir = dbDir

	output, err := cmd.CombinedOutput()
	if err != nil {
		return string(output), fmt.Errorf("%w: %s", err, output)
	}
	return string(output), nil
}
// registerHandler serves /api/user/register. It accepts a POST with a JSON
// RegisterRequest and creates the user by running `user_cli.py create`.
//
// The outcome is reported in the Code field of the JSON Response:
//   - 200 user created
//   - 400 malformed body, missing field, or validation error from the script
//   - 405 method other than POST
//   - 409 user name or e-mail already registered
//   - 500 any other script failure
//
// NOTE(review): the password is handed to the script as a command-line
// argument — confirm this is acceptable for the deployment.
func registerHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != "POST" {
		writeJSON(w, Response{Success: false, Message: "仅支持POST请求", Code: 405})
		return
	}

	var req RegisterRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeJSON(w, Response{Success: false, Message: "请求参数错误", Code: 400})
		return
	}
	if req.Username == "" || req.Password == "" || req.Email == "" {
		writeJSON(w, Response{Success: false, Message: "用户名、密码和邮箱不能为空", Code: 400})
		return
	}

	output, err := callPythonScript("user_cli.py", "create", req.Username, req.Password, req.Email)
	if err != nil {
		log.Printf("注册失败: %v, 输出: %s", err, output)

		// The script's message may arrive in output or only inside the error
		// text (formatted "<exec error>: <script output>"), so fall back to
		// the part of the error after the first ": ".
		detail := strings.TrimSpace(output)
		if detail == "" {
			detail = err.Error()
			if i := strings.Index(detail, ": "); i >= 0 {
				detail = strings.TrimSpace(detail[i+2:])
			}
		}

		switch {
		case strings.Contains(detail, "用户名已存在") || strings.Contains(detail, "邮箱已被注册"):
			writeJSON(w, Response{Success: false, Message: "用户名或邮箱已存在", Code: 409})
		case strings.Contains(detail, "验证错误"):
			writeJSON(w, Response{Success: false, Message: detail, Code: 400})
		default:
			writeJSON(w, Response{Success: false, Message: "注册失败", Code: 500})
		}
		return
	}

	log.Printf("用户注册成功: %s", req.Username)
	writeJSON(w, Response{
		Success: true,
		Message: "注册成功",
		Code:    200,
		Data: map[string]interface{}{
			"username": req.Username,
		},
	})
}
// 用户登录处理
func loginHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != "POST" {
writeJSON(w, Response{Success: false, Message: "仅支持POST请求", Code: 405})
return
}
var req LoginRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJSON(w, Response{Success: false, Message: "请求参数错误", Code: 400})
return
}
// 验证输入
if req.Username == "" || req.Password == "" {
writeJSON(w, Response{Success: false, Message: "用户名和密码不能为空", Code: 400})
return
}
// 调用Python脚本验证用户
scriptPath := "user_cli.py"
args := []string{"verify", req.Username, req.Password}
output, err := callPythonScript(scriptPath, args...)
log.Printf("🔍 Python输出: %s", output)
if err != nil {
log.Printf("❌ 登录失败: %v", err)
writeJSON(w, Response{Success: false, Message: "用户名或密码错误", Code: 401})
return
}
// 生成token
token := generateToken()
// 从输出中解析user_id和用户信息
var userData map[string]interface{}
if err := json.Unmarshal([]byte(output), &userData); err != nil {
log.Printf("❌ 解析用户数据失败: %v, 输出: %s", err, output)
writeJSON(w, Response{Success: false, Message: "服务器内部错误", Code: 500})
return
}
// 检查是否成功
if success, ok := userData["success"].(bool); !ok || !success {
log.Printf("❌ 用户验证失败: %v", userData)
writeJSON(w, Response{Success: false, Message: "用户名或密码错误", Code: 401})
return
}
userID := 0
if uid, ok := userData["user_id"].(float64); ok {
userID = int(uid)
}
// 存储session
sessions[token] = &Session{
Token: token,
UserID: userID,
Expiry: time.Now().Add(24 * time.Hour), // 24小时过期
}
log.Printf("✅ 用户登录成功: %s, token: %s", req.Username, token)
// 构建user_info不包含密码相关和success标记
userInfo := make(map[string]interface{})
for k, v := range userData {
if k != "password_hash" && k != "success" {
userInfo[k] = v
}
}
writeJSON(w, Response{
Success: true,
Message: "登录成功",
Code: 200,
Data: map[string]interface{}{
"token": token,
"user_id": userID,
"user_info": userInfo,
},
})
}
// 用户登出处理
func logoutHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != "POST" {
writeJSON(w, Response{Success: false, Message: "仅支持POST请求", Code: 405})
return
}
// 从请求头中获取token
token := r.Header.Get("Authorization")
if token == "" {
var req struct {
Token string `json:"token"`
}
json.NewDecoder(r.Body).Decode(&req)
token = req.Token
}
if token == "" {
writeJSON(w, Response{Success: false, Message: "Token不能为空", Code: 400})
return
}
// 删除session
delete(sessions, token)
log.Printf("用户登出成功, token: %s", token)
writeJSON(w, Response{
Success: true,
Message: "登出成功",
Code: 200,
})
}
// 获取用户信息处理
func getUserInfoHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != "GET" {
writeJSON(w, Response{Success: false, Message: "仅支持GET请求", Code: 405})
return
}
// 从请求头中获取token
token := r.Header.Get("Authorization")
if token == "" {
token = r.URL.Query().Get("token")
}
if token == "" {
writeJSON(w, Response{Success: false, Message: "Token不能为空", Code: 401})
return
}
// 验证session
session, ok := sessions[token]
if !ok || session.Expiry.Before(time.Now()) {
if ok {
delete(sessions, token) // 删除过期session
}
writeJSON(w, Response{Success: false, Message: "Token无效或已过期", Code: 401})
return
}
// 调用Python脚本获取用户信息
scriptPath := "user_cli.py"
args := []string{"get", fmt.Sprintf("%d", session.UserID)}
output, err := callPythonScript(scriptPath, args...)
if err != nil {
log.Printf("获取用户信息失败: %v", err)
writeJSON(w, Response{Success: false, Message: "获取用户信息失败", Code: 500})
return
}
// 解析用户信息
var userData map[string]interface{}
if err := json.Unmarshal([]byte(output), &userData); err != nil {
log.Printf("解析用户信息失败: %v", err)
writeJSON(w, Response{Success: false, Message: "解析用户信息失败", Code: 500})
return
}
// 删除密码哈希
delete(userData, "password_hash")
writeJSON(w, Response{
Success: true,
Message: "获取成功",
Code: 200,
Data: userData,
})
}
// 更新用户信息处理
func updateUserHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != "POST" {
writeJSON(w, Response{Success: false, Message: "仅支持POST请求", Code: 405})
return
}
// 从请求头中获取token
token := r.Header.Get("Authorization")
if token == "" {
writeJSON(w, Response{Success: false, Message: "Token不能为空", Code: 401})
return
}
// 验证session
session, ok := sessions[token]
if !ok || session.Expiry.Before(time.Now()) {
if ok {
delete(sessions, token) // 删除过期session
}
writeJSON(w, Response{Success: false, Message: "Token无效或已过期", Code: 401})
return
}
// 解析请求体
var req struct {
UserID int `json:"user_id"`
Email string `json:"email"`
Bio string `json:"bio"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
log.Printf("❌ 解析请求体失败: %v", err)
writeJSON(w, Response{Success: false, Message: "请求参数错误", Code: 400})
return
}
log.Printf("🔍 更新用户信息: user_id=%d, email=%s", req.UserID, req.Email)
// 验证用户ID与session一致
if req.UserID != session.UserID {
log.Printf("❌ 用户ID不匹配: req=%d, session=%d", req.UserID, session.UserID)
writeJSON(w, Response{Success: false, Message: "无权操作", Code: 403})
return
}
// 调用Python脚本更新用户信息
scriptPath := "user_cli.py"
args := []string{"update", fmt.Sprintf("%d", req.UserID)}
// 添加需要更新的字段
if req.Email != "" {
args = append(args, "--email", req.Email)
}
if req.Bio != "" {
args = append(args, "--bio", req.Bio)
}
output, err := callPythonScript(scriptPath, args...)
log.Printf("🔍 Python输出: %s", output)
if err != nil {
log.Printf("❌ 更新用户信息失败: %v", err)
writeJSON(w, Response{Success: false, Message: "更新失败", Code: 500})
return
}
// 解析响应
var result map[string]interface{}
if err := json.Unmarshal([]byte(output), &result); err != nil {
log.Printf("❌ 解析响应失败: %v", err)
writeJSON(w, Response{Success: false, Message: "服务器内部错误", Code: 500})
return
}
// 检查是否成功
if success, ok := result["success"].(bool); !ok || !success {
errMsg := "更新失败"
if msg, ok := result["error"].(string); ok {
errMsg = msg
}
writeJSON(w, Response{Success: false, Message: errMsg, Code: 500})
return
}
log.Printf("✅ 用户信息更新成功: user_id=%d", req.UserID)
writeJSON(w, Response{
Success: true,
Message: "更新成功",
Code: 200,
})
}