Files
ai_wht_wechat/go_backend/tools/service_monitor.go
2026-01-06 19:36:42 +08:00

258 lines
6.7 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package tools
import (
"ai_xhs/service"
"encoding/json"
"io/ioutil"
"log"
"os"
"os/signal"
"path/filepath"
"sync"
"syscall"
"time"
)
// ServiceMonitor tracks the lifetime of a service process. It keeps a
// heartbeat file up to date and sends an SMS alert when the service goes down.
type ServiceMonitor struct {
	// Static configuration, filled in when the monitor is created.
	serviceName   string // service name used in alerts and heartbeat records
	alertPhone    string // phone number the alert is sent to
	heartbeatFile string // path of the heartbeat file

	// smsService is the client used to send the service-down alert.
	smsService *service.SmsService

	// shutdownChan receives the signals that trigger shutdown handling.
	shutdownChan chan os.Signal

	// mutex is taken around updates to isRunning, alertSent (in SetAlertSent)
	// and lastHeartbeat.
	mutex         sync.Mutex
	isRunning     bool
	alertSent     bool      // set once an alert has gone out, to avoid sending it again
	lastHeartbeat time.Time // time of the most recent heartbeat
}
// HeartbeatData is the record stored as JSON in the heartbeat file.
type HeartbeatData struct {
	// ServiceName is the name of the service that wrote the record.
	ServiceName string `json:"service_name"`

	// LastHeartbeat is the time at which the record was written.
	LastHeartbeat time.Time `json:"last_heartbeat"`

	// PID is the process ID of the writer.
	PID int `json:"pid"`

	// StartTime is the start time reported by the writer.
	StartTime time.Time `json:"start_time"`

	// GracefulShut reports whether the record was written as part of a
	// normal shutdown.
	GracefulShut bool `json:"graceful_shutdown"`
}
// monitorInstance is the process-wide ServiceMonitor returned by
// GetServiceMonitor.
var monitorInstance *ServiceMonitor

// monitorOnce ensures monitorInstance is created only once.
var monitorOnce sync.Once
// GetServiceMonitor returns the singleton ServiceMonitor, creating it on the
// first call.
//
// alertPhone and serviceName are used only by the call that creates the
// instance; later calls return the existing monitor and ignore their
// arguments. The heartbeat file is placed in the OS temporary directory.
func GetServiceMonitor(alertPhone string, serviceName string) *ServiceMonitor {
	monitorOnce.Do(func() {
		mon := new(ServiceMonitor)

		mon.alertPhone = alertPhone
		mon.serviceName = serviceName
		mon.smsService = service.GetSmsService()

		// The monitor starts in the running state; alertSent keeps its zero
		// value (false).
		mon.isRunning = true
		mon.shutdownChan = make(chan os.Signal, 1)

		mon.heartbeatFile = filepath.Join(os.TempDir(), "ai_xhs_service_heartbeat.json")
		mon.lastHeartbeat = time.Now()

		monitorInstance = mon
	})
	return monitorInstance
}
// StartMonitoring starts service monitoring.
//
// It first checks whether the previous run ended abnormally, then starts the
// heartbeat task, and finally installs a signal handler. When one of the
// watched signals arrives the handler marks the monitor as stopped, records a
// graceful shutdown in the heartbeat file, sends the service-down alert unless
// one has already been sent, and terminates the process with exit code 0.
func (m *ServiceMonitor) StartMonitoring() {
	// Check whether the previous run was shut down abnormally.
	m.checkLastShutdown()

	// Start the heartbeat task.
	m.startHeartbeat()

	// Listen for termination signals.
	signal.Notify(m.shutdownChan,
		os.Interrupt,    // Ctrl+C
		syscall.SIGTERM, // kill command
		syscall.SIGQUIT, // Ctrl+\
		syscall.SIGABRT, // abort
	)

	go func() {
		sig := <-m.shutdownChan
		log.Printf("[服务监控] 捕获到退出信号: %v", sig)

		// Flip the running flag and snapshot alertSent under the same lock so
		// the read cannot race with SetAlertSent.
		m.mutex.Lock()
		m.isRunning = false
		alreadyAlerted := m.alertSent
		m.mutex.Unlock()

		// Record that this was a graceful shutdown.
		m.markGracefulShutdown()

		// Send the service-down alert.
		if !alreadyAlerted {
			// sendAlert logs its own failures, and the process is exiting
			// anyway, so the returned error is deliberately dropped here.
			_ = m.sendAlert("服务接收到退出信号")
		}

		// Give the SMS some time to go out.
		time.Sleep(2 * time.Second)

		// Terminate the process.
		os.Exit(0)
	}()

	log.Printf("[服务监控] 服务监控已启动,监控电话: %s", m.alertPhone)
	log.Printf("[服务监控] 心跳文件: %s", m.heartbeatFile)
}
// SetAlertSent marks the alert as already sent. It is meant to be called by
// external code so that the monitor does not send the same alert again.
func (m *ServiceMonitor) SetAlertSent() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.alertSent = true
}
// SendManualAlert sends the service-down alert on request.
//
// reason is passed through to sendAlert, and the error it returns is handed
// back to the caller unchanged.
func (m *ServiceMonitor) SendManualAlert(reason string) error {
	return m.sendAlert(reason)
}
// sendAlert sends the service-down alert at most once.
//
// reason is only written to the log. If an alert has already been sent the
// call is a no-op and returns nil. If the SMS service reports an error, that
// error is logged and returned, and alertSent stays false so a later call can
// try again.
func (m *ServiceMonitor) sendAlert(reason string) error {
	// Hold the mutex across check, send and set: alertSent is also written by
	// SetAlertSent under this mutex, and without the lock two concurrent
	// callers could both pass the check and send the alert twice.
	m.mutex.Lock()
	defer m.mutex.Unlock()

	if m.alertSent {
		log.Printf("[服务监控] 宕机通知已发送,跳过重复发送")
		return nil
	}

	log.Printf("[服务监控] 服务宕机,原因: %s", reason)

	if err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName); err != nil {
		log.Printf("[服务监控] 发送宕机通知失败: %v", err)
		return err
	}

	m.alertSent = true
	log.Printf("[服务监控] 宕机通知已发送到 %s", m.alertPhone)
	return nil
}
// IsRunning reports whether the monitor still considers the service to be
// running. The flag is read while holding the mutex.
func (m *ServiceMonitor) IsRunning() bool {
	m.mutex.Lock()
	running := m.isRunning
	m.mutex.Unlock()

	return running
}
// Shutdown requests a graceful shutdown by placing SIGTERM on the monitor's
// shutdown channel.
//
// The send never blocks: if the channel is nil, or its buffer already holds a
// pending signal, the call returns without doing anything.
func (m *ServiceMonitor) Shutdown() {
	if m.shutdownChan == nil {
		return
	}

	select {
	case m.shutdownChan <- syscall.SIGTERM:
	default:
		// A signal is already queued; a plain send here would block the
		// caller indefinitely if nothing drains the channel.
	}
}
// startHeartbeat writes one heartbeat immediately and then starts a goroutine
// that refreshes it every 30 seconds. The goroutine exits at the first tick on
// which IsRunning reports false.
func (m *ServiceMonitor) startHeartbeat() {
	const interval = 30 * time.Second

	// Write one heartbeat right away.
	m.updateHeartbeat()

	// Start the periodic task.
	ticker := time.NewTicker(interval)
	go func() {
		// Release the ticker once the loop ends.
		defer ticker.Stop()

		for range ticker.C {
			if !m.IsRunning() {
				return
			}
			m.updateHeartbeat()
		}
	}()

	log.Printf("[服务监控] 心跳任务已启动每30秒更新一次")
}
// heartbeatProcessStart is captured once when the package is initialised and
// is reported as StartTime in every heartbeat record this process writes.
var heartbeatProcessStart = time.Now()

// encodeHeartbeat builds the JSON heartbeat record for this process.
// at becomes LastHeartbeat and graceful becomes GracefulShut.
func (m *ServiceMonitor) encodeHeartbeat(at time.Time, graceful bool) ([]byte, error) {
	record := HeartbeatData{
		ServiceName:   m.serviceName,
		LastHeartbeat: at,
		PID:           os.Getpid(),
		StartTime:     heartbeatProcessStart,
		GracefulShut:  graceful,
	}
	return json.MarshalIndent(record, "", " ")
}

// updateHeartbeat refreshes lastHeartbeat and rewrites the heartbeat file with
// a record that is not marked as a graceful shutdown.
//
// The whole update runs under the mutex and is skipped once isRunning is
// false, so a heartbeat that fires during shutdown cannot overwrite the marker
// written by markGracefulShutdown. Failures are logged, not returned.
func (m *ServiceMonitor) updateHeartbeat() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	if !m.isRunning {
		return
	}

	now := time.Now()
	m.lastHeartbeat = now

	jsonData, err := m.encodeHeartbeat(now, false)
	if err != nil {
		log.Printf("[服务监控] 序列化心跳数据失败: %v", err)
		return
	}

	if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
		log.Printf("[服务监控] 写入心跳文件失败: %v", err)
	}
}

// markGracefulShutdown rewrites the heartbeat file with a record flagged as a
// graceful shutdown. The success message is logged only when the file was
// actually written; failures are logged, not returned.
func (m *ServiceMonitor) markGracefulShutdown() {
	// Take the mutex so this write is ordered against updateHeartbeat.
	m.mutex.Lock()
	defer m.mutex.Unlock()

	jsonData, err := m.encodeHeartbeat(time.Now(), true)
	if err != nil {
		log.Printf("[服务监控] 序列化关闭数据失败: %v", err)
		return
	}

	if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
		log.Printf("[服务监控] 写入关闭标记失败: %v", err)
		return
	}

	log.Printf("[服务监控] 已标记为正常关闭")
}
// checkLastShutdown inspects the heartbeat file left by the previous run and
// sends a service-down alert when that run appears to have ended abnormally.
//
// Nothing is sent when the file is missing, unreadable or unparsable, when the
// last record is flagged as a graceful shutdown, or when the last heartbeat is
// no more than two minutes old (treated as a quick restart). All outcomes are
// logged; no error is returned.
func (m *ServiceMonitor) checkLastShutdown() {
	// A non-graceful record older than this counts as an abnormal shutdown.
	const abnormalAfter = 2 * time.Minute

	// Read the file directly instead of calling os.Stat first: a single call
	// covers both the missing-file and the read-error case.
	fileData, err := ioutil.ReadFile(m.heartbeatFile)
	if err != nil {
		if os.IsNotExist(err) {
			log.Printf("[服务监控] 未找到历史心跳文件,可能是首次启动")
			return
		}
		log.Printf("[服务监控] 读取心跳文件失败: %v", err)
		return
	}

	var lastData HeartbeatData
	if err := json.Unmarshal(fileData, &lastData); err != nil {
		log.Printf("[服务监控] 解析心跳数据失败: %v", err)
		return
	}

	log.Printf("[服务监控] 上次心跳: %v, PID: %d, 正常关闭: %v",
		lastData.LastHeartbeat.Format("2006-01-02 15:04:05"),
		lastData.PID,
		lastData.GracefulShut)

	// A graceful shutdown needs no alert.
	if lastData.GracefulShut {
		return
	}

	timeSinceLastHeartbeat := time.Since(lastData.LastHeartbeat)
	if timeSinceLastHeartbeat <= abnormalAfter {
		log.Printf("[服务监控] 距离上次心跳仅%v可能是快速重启不发送通知", timeSinceLastHeartbeat)
		return
	}

	log.Printf("[服务监控] 检测到上次服务异常关闭(%v前发送通知", timeSinceLastHeartbeat)
	if err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName); err != nil {
		log.Printf("[服务监控] 发送异常关闭通知失败: %v", err)
		return
	}
	log.Printf("[服务监控] 已发送异常关闭通知")
}