package tools

import (
	"encoding/json"
	"io/ioutil"
	"log"
	"os"
	"os/signal"
	"path/filepath"
	"sync"
	"syscall"
	"time"

	"ai_xhs/service"
)

const (
	// monitorHeartbeatInterval is how often the heartbeat file is refreshed.
	monitorHeartbeatInterval = 30 * time.Second
	// monitorAbnormalThreshold is the minimum age of the previous heartbeat
	// for a non-graceful previous run to be reported as an abnormal shutdown;
	// anything younger is treated as a quick restart.
	monitorAbnormalThreshold = 2 * time.Minute
	// monitorAlertGracePeriod is how long the signal handler waits after
	// requesting the alert before it terminates the process.
	monitorAlertGracePeriod = 2 * time.Second
)

// ServiceMonitor watches the process for termination signals, maintains a
// heartbeat file on disk, and sends an SMS alert through smsService when the
// service goes down.
//
// All mutable fields (isRunning, alertSent, lastHeartbeat, startTime) are
// guarded by mutex. The mutex also serialises writes to the heartbeat file.
type ServiceMonitor struct {
	alertPhone    string
	serviceName   string
	smsService    *service.SmsService
	isRunning     bool
	mutex         sync.Mutex
	shutdownChan  chan os.Signal
	alertSent     bool      // set once an alert has been sent, to avoid duplicates
	heartbeatFile string    // path of the heartbeat file
	lastHeartbeat time.Time // time of the most recent heartbeat
	startTime     time.Time // time the monitor was created; reported as StartTime
}

// HeartbeatData is the JSON document stored in the heartbeat file.
type HeartbeatData struct {
	ServiceName   string    `json:"service_name"`
	LastHeartbeat time.Time `json:"last_heartbeat"`
	PID           int       `json:"pid"`
	StartTime     time.Time `json:"start_time"`
	GracefulShut  bool      `json:"graceful_shutdown"` // true when the process exited via the signal handler
}

var (
	monitorInstance *ServiceMonitor
	monitorOnce     sync.Once
)

// GetServiceMonitor returns the process-wide ServiceMonitor singleton.
//
// The instance is created on the first call only; the alertPhone and
// serviceName arguments of later calls are ignored. The heartbeat file lives
// in the OS temporary directory.
func GetServiceMonitor(alertPhone string, serviceName string) *ServiceMonitor {
	monitorOnce.Do(func() {
		now := time.Now()
		monitorInstance = &ServiceMonitor{
			alertPhone:    alertPhone,
			serviceName:   serviceName,
			smsService:    service.GetSmsService(),
			isRunning:     true,
			shutdownChan:  make(chan os.Signal, 1),
			alertSent:     false,
			heartbeatFile: filepath.Join(os.TempDir(), "ai_xhs_service_heartbeat.json"),
			lastHeartbeat: now,
			startTime:     now,
		}
	})
	return monitorInstance
}

// StartMonitoring starts the service monitor.
//
// It first inspects the heartbeat file left by the previous run, then starts
// the periodic heartbeat, and finally installs a handler for termination
// signals. When a signal arrives (or Shutdown is called) the handler marks the
// shutdown as graceful in the heartbeat file, sends the service-down alert
// unless one was already sent, waits briefly, and exits the process with
// status 0.
func (m *ServiceMonitor) StartMonitoring() {
	// Report an abnormal shutdown of the previous run, if any.
	m.checkLastShutdown()

	// Begin refreshing the heartbeat file.
	m.startHeartbeat()

	// Subscribe to termination signals.
	signal.Notify(m.shutdownChan,
		os.Interrupt,    // Ctrl+C
		syscall.SIGTERM, // kill
		syscall.SIGQUIT, // Ctrl+\
		syscall.SIGABRT, // abort
	)

	go func() {
		sig := <-m.shutdownChan
		log.Printf("[服务监控] 捕获到退出信号: %v", sig)

		m.mutex.Lock()
		m.isRunning = false
		m.mutex.Unlock()

		// Record that this exit went through the signal handler.
		m.markGracefulShutdown()

		// Send the service-down alert unless one has already gone out.
		if !m.isAlertSent() {
			_ = m.sendAlert("服务接收到退出信号") // failure is already logged by sendAlert
		}

		// Give the SMS request some time before the process terminates.
		time.Sleep(monitorAlertGracePeriod)

		os.Exit(0)
	}()

	log.Printf("[服务监控] 服务监控已启动,监控电话: %s", m.alertPhone)
	log.Printf("[服务监控] 心跳文件: %s", m.heartbeatFile)
}

// SetAlertSent marks the alert as already sent so that the monitor does not
// send a duplicate. It is intended for callers that deliver the alert
// themselves.
func (m *ServiceMonitor) SetAlertSent() {
	m.mutex.Lock()
	m.alertSent = true
	m.mutex.Unlock()
}

// SendManualAlert sends the service-down alert on demand. reason is only
// written to the log. It returns the error from the SMS service, or nil if the
// alert was sent or had already been sent earlier.
func (m *ServiceMonitor) SendManualAlert(reason string) error {
	return m.sendAlert(reason)
}

// isAlertSent reports whether an alert has already been sent.
func (m *ServiceMonitor) isAlertSent() bool {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	return m.alertSent
}

// sendAlert sends the service-down alert unless one was already sent.
//
// The mutex guards only the alertSent flag; it is deliberately not held while
// the SMS service is called so that a slow send cannot stall IsRunning or the
// heartbeat.
func (m *ServiceMonitor) sendAlert(reason string) error {
	if m.isAlertSent() {
		log.Printf("[服务监控] 宕机通知已发送,跳过重复发送")
		return nil
	}

	log.Printf("[服务监控] 服务宕机,原因: %s", reason)

	if err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName); err != nil {
		log.Printf("[服务监控] 发送宕机通知失败: %v", err)
		return err
	}

	m.mutex.Lock()
	m.alertSent = true
	m.mutex.Unlock()

	log.Printf("[服务监控] 宕机通知已发送到 %s", m.alertPhone)
	return nil
}

// IsRunning reports whether the service is still considered running, i.e. no
// termination signal has been handled yet.
func (m *ServiceMonitor) IsRunning() bool {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	return m.isRunning
}

// Shutdown requests a graceful shutdown by injecting SIGTERM into the channel
// the signal handler listens on.
//
// The send is non-blocking: if a shutdown signal is already pending in the
// channel buffer the call is a no-op instead of blocking the caller.
func (m *ServiceMonitor) Shutdown() {
	if m.shutdownChan == nil {
		return
	}
	select {
	case m.shutdownChan <- syscall.SIGTERM:
	default:
		// A shutdown signal is already queued; nothing more to do.
	}
}

// startHeartbeat writes the heartbeat file once immediately and then starts a
// goroutine that refreshes it every monitorHeartbeatInterval until the monitor
// stops running.
func (m *ServiceMonitor) startHeartbeat() {
	// Write once right away so the file exists before the first tick.
	m.updateHeartbeat()

	ticker := time.NewTicker(monitorHeartbeatInterval)
	go func() {
		// Release the ticker's resources when the loop ends.
		defer ticker.Stop()
		for range ticker.C {
			if !m.IsRunning() {
				return
			}
			m.updateHeartbeat()
		}
	}()

	log.Printf("[服务监控] 心跳任务已启动,每30秒更新一次")
}

// updateHeartbeat records the current time as the latest heartbeat and
// rewrites the heartbeat file with GracefulShut set to false.
//
// The mutex is held for the whole update, and nothing is written once the
// monitor has stopped running, so a late tick can never overwrite the
// graceful-shutdown marker written by markGracefulShutdown.
func (m *ServiceMonitor) updateHeartbeat() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	if !m.isRunning {
		return
	}

	now := time.Now()
	m.lastHeartbeat = now
	if m.startTime.IsZero() {
		// Monitor was not built by GetServiceMonitor; fall back to first heartbeat.
		m.startTime = now
	}

	data := HeartbeatData{
		ServiceName:   m.serviceName,
		LastHeartbeat: now,
		PID:           os.Getpid(),
		StartTime:     m.startTime,
		GracefulShut:  false, // stays false until the signal handler marks the exit
	}

	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		log.Printf("[服务监控] 序列化心跳数据失败: %v", err)
		return
	}

	if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
		log.Printf("[服务监控] 写入心跳文件失败: %v", err)
	}
}

// markGracefulShutdown rewrites the heartbeat file with GracefulShut set to
// true so that the next start does not report this exit as abnormal. The
// success message is logged only when the file was actually written.
func (m *ServiceMonitor) markGracefulShutdown() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	data := HeartbeatData{
		ServiceName:   m.serviceName,
		LastHeartbeat: time.Now(),
		PID:           os.Getpid(),
		StartTime:     m.startTime,
		GracefulShut:  true,
	}

	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		log.Printf("[服务监控] 序列化关闭数据失败: %v", err)
		return
	}

	if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
		log.Printf("[服务监控] 写入关闭标记失败: %v", err)
		return
	}

	log.Printf("[服务监控] 已标记为正常关闭")
}

// checkLastShutdown reads the heartbeat file left by the previous run and
// sends a service-down alert when that run did not shut down gracefully and
// its last heartbeat is older than monitorAbnormalThreshold. A younger
// heartbeat is treated as a quick restart and no alert is sent. A missing,
// unreadable, or unparsable file is logged and otherwise ignored.
func (m *ServiceMonitor) checkLastShutdown() {
	fileData, err := ioutil.ReadFile(m.heartbeatFile)
	if err != nil {
		if os.IsNotExist(err) {
			log.Printf("[服务监控] 未找到历史心跳文件,可能是首次启动")
			return
		}
		log.Printf("[服务监控] 读取心跳文件失败: %v", err)
		return
	}

	var lastData HeartbeatData
	if err := json.Unmarshal(fileData, &lastData); err != nil {
		log.Printf("[服务监控] 解析心跳数据失败: %v", err)
		return
	}

	log.Printf("[服务监控] 上次心跳: %v, PID: %d, 正常关闭: %v",
		lastData.LastHeartbeat.Format("2006-01-02 15:04:05"),
		lastData.PID,
		lastData.GracefulShut)

	if lastData.GracefulShut {
		return
	}

	timeSinceLastHeartbeat := time.Since(lastData.LastHeartbeat)
	if timeSinceLastHeartbeat <= monitorAbnormalThreshold {
		log.Printf("[服务监控] 距离上次心跳仅%v,可能是快速重启,不发送通知", timeSinceLastHeartbeat)
		return
	}

	log.Printf("[服务监控] 检测到上次服务异常关闭(%v前),发送通知", timeSinceLastHeartbeat)
	if err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName); err != nil {
		log.Printf("[服务监控] 发送异常关闭通知失败: %v", err)
		return
	}
	log.Printf("[服务监控] 已发送异常关闭通知")
}