258 lines
6.7 KiB
Go
258 lines
6.7 KiB
Go
package tools
|
||
|
||
import (
|
||
"ai_xhs/service"
|
||
"encoding/json"
|
||
"io/ioutil"
|
||
"log"
|
||
"os"
|
||
"os/signal"
|
||
"path/filepath"
|
||
"sync"
|
||
"syscall"
|
||
"time"
|
||
)
|
||
|
||
// ServiceMonitor 服务监控器
|
||
type ServiceMonitor struct {
|
||
alertPhone string
|
||
serviceName string
|
||
smsService *service.SmsService
|
||
isRunning bool
|
||
mutex sync.Mutex
|
||
shutdownChan chan os.Signal
|
||
alertSent bool // 标记是否已发送通知,避免重复发送
|
||
heartbeatFile string // 心跳文件路径
|
||
lastHeartbeat time.Time // 最后心跳时间
|
||
}
|
||
|
||
// HeartbeatData is the JSON document stored in the heartbeat file.
type HeartbeatData struct {
	// ServiceName is the name of the service that wrote the record.
	ServiceName string `json:"service_name"`
	// LastHeartbeat is the time the record was written.
	LastHeartbeat time.Time `json:"last_heartbeat"`
	// PID is the process ID of the writer.
	PID int `json:"pid"`
	// StartTime is meant to hold the service start time.
	// NOTE(review): the writers in this file do not currently store the real
	// process start time here.
	StartTime time.Time `json:"start_time"`
	// GracefulShut is true when the record was written during a clean shutdown.
	GracefulShut bool `json:"graceful_shutdown"`
}
|
||
|
||
var (
|
||
monitorInstance *ServiceMonitor
|
||
monitorOnce sync.Once
|
||
)
|
||
|
||
// GetServiceMonitor 获取服务监控器单例
|
||
func GetServiceMonitor(alertPhone string, serviceName string) *ServiceMonitor {
|
||
monitorOnce.Do(func() {
|
||
heartbeatFile := filepath.Join(os.TempDir(), "ai_xhs_service_heartbeat.json")
|
||
monitorInstance = &ServiceMonitor{
|
||
alertPhone: alertPhone,
|
||
serviceName: serviceName,
|
||
smsService: service.GetSmsService(),
|
||
isRunning: true,
|
||
shutdownChan: make(chan os.Signal, 1),
|
||
alertSent: false,
|
||
heartbeatFile: heartbeatFile,
|
||
lastHeartbeat: time.Now(),
|
||
}
|
||
})
|
||
return monitorInstance
|
||
}
|
||
|
||
// StartMonitoring 启动服务监控
|
||
// 监听系统信号,在服务异常退出时发送短信通知
|
||
func (m *ServiceMonitor) StartMonitoring() {
|
||
// 检查上次启动是否异常关闭
|
||
m.checkLastShutdown()
|
||
|
||
// 启动心跳任务
|
||
m.startHeartbeat()
|
||
|
||
// 监听退出信号
|
||
signal.Notify(m.shutdownChan,
|
||
os.Interrupt, // Ctrl+C
|
||
syscall.SIGTERM, // kill命令
|
||
syscall.SIGQUIT, // Ctrl+\
|
||
syscall.SIGABRT, // abort
|
||
)
|
||
|
||
go func() {
|
||
sig := <-m.shutdownChan
|
||
log.Printf("[服务监控] 捕获到退出信号: %v", sig)
|
||
|
||
m.mutex.Lock()
|
||
m.isRunning = false
|
||
m.mutex.Unlock()
|
||
|
||
// 标记为正常关闭
|
||
m.markGracefulShutdown()
|
||
|
||
// 发送宕机通知
|
||
if !m.alertSent {
|
||
m.sendAlert("服务接收到退出信号")
|
||
}
|
||
|
||
// 给短信发送一些时间
|
||
time.Sleep(2 * time.Second)
|
||
|
||
// 退出程序
|
||
os.Exit(0)
|
||
}()
|
||
|
||
log.Printf("[服务监控] 服务监控已启动,监控电话: %s", m.alertPhone)
|
||
log.Printf("[服务监控] 心跳文件: %s", m.heartbeatFile)
|
||
}
|
||
|
||
// SetAlertSent 设置通知已发送标记(供外部调用,避免重复发送)
|
||
func (m *ServiceMonitor) SetAlertSent() {
|
||
m.mutex.Lock()
|
||
m.alertSent = true
|
||
m.mutex.Unlock()
|
||
}
|
||
|
||
// SendManualAlert 手动发送服务宕机通知
|
||
func (m *ServiceMonitor) SendManualAlert(reason string) error {
|
||
return m.sendAlert(reason)
|
||
}
|
||
|
||
// sendAlert 发送宕机通知
|
||
func (m *ServiceMonitor) sendAlert(reason string) error {
|
||
if m.alertSent {
|
||
log.Printf("[服务监控] 宕机通知已发送,跳过重复发送")
|
||
return nil
|
||
}
|
||
|
||
log.Printf("[服务监控] 服务宕机,原因: %s", reason)
|
||
|
||
err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName)
|
||
if err != nil {
|
||
log.Printf("[服务监控] 发送宕机通知失败: %v", err)
|
||
return err
|
||
}
|
||
|
||
m.alertSent = true
|
||
log.Printf("[服务监控] 宕机通知已发送到 %s", m.alertPhone)
|
||
return nil
|
||
}
|
||
|
||
// IsRunning 检查服务是否运行中
|
||
func (m *ServiceMonitor) IsRunning() bool {
|
||
m.mutex.Lock()
|
||
defer m.mutex.Unlock()
|
||
return m.isRunning
|
||
}
|
||
|
||
// Shutdown 优雅关闭
|
||
func (m *ServiceMonitor) Shutdown() {
|
||
if m.shutdownChan != nil {
|
||
m.shutdownChan <- syscall.SIGTERM
|
||
}
|
||
}
|
||
|
||
// startHeartbeat 启动心跳任务,每30秒更新一次
|
||
func (m *ServiceMonitor) startHeartbeat() {
|
||
// 立即写入一次
|
||
m.updateHeartbeat()
|
||
|
||
// 启动定时任务
|
||
ticker := time.NewTicker(30 * time.Second)
|
||
go func() {
|
||
for range ticker.C {
|
||
if !m.IsRunning() {
|
||
break
|
||
}
|
||
m.updateHeartbeat()
|
||
}
|
||
}()
|
||
log.Printf("[服务监控] 心跳任务已启动,每30秒更新一次")
|
||
}
|
||
|
||
// updateHeartbeat 更新心跳文件
|
||
func (m *ServiceMonitor) updateHeartbeat() {
|
||
m.mutex.Lock()
|
||
m.lastHeartbeat = time.Now()
|
||
m.mutex.Unlock()
|
||
|
||
data := HeartbeatData{
|
||
ServiceName: m.serviceName,
|
||
LastHeartbeat: m.lastHeartbeat,
|
||
PID: os.Getpid(),
|
||
StartTime: time.Now(), // 在实际应用中应该记录启动时间
|
||
GracefulShut: false, // 默认未正常关闭
|
||
}
|
||
|
||
jsonData, err := json.MarshalIndent(data, "", " ")
|
||
if err != nil {
|
||
log.Printf("[服务监控] 序列化心跳数据失败: %v", err)
|
||
return
|
||
}
|
||
|
||
if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
|
||
log.Printf("[服务监控] 写入心跳文件失败: %v", err)
|
||
}
|
||
}
|
||
|
||
// markGracefulShutdown 标记为正常关闭
|
||
func (m *ServiceMonitor) markGracefulShutdown() {
|
||
data := HeartbeatData{
|
||
ServiceName: m.serviceName,
|
||
LastHeartbeat: time.Now(),
|
||
PID: os.Getpid(),
|
||
StartTime: m.lastHeartbeat,
|
||
GracefulShut: true, // 标记为正常关闭
|
||
}
|
||
|
||
jsonData, err := json.MarshalIndent(data, "", " ")
|
||
if err != nil {
|
||
log.Printf("[服务监控] 序列化关闭数据失败: %v", err)
|
||
return
|
||
}
|
||
|
||
if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
|
||
log.Printf("[服务监控] 写入关闭标记失败: %v", err)
|
||
}
|
||
log.Printf("[服务监控] 已标记为正常关闭")
|
||
}
|
||
|
||
// checkLastShutdown 检查上次关闭是否异常
|
||
func (m *ServiceMonitor) checkLastShutdown() {
|
||
// 读取心跳文件
|
||
if _, err := os.Stat(m.heartbeatFile); os.IsNotExist(err) {
|
||
log.Printf("[服务监控] 未找到历史心跳文件,可能是首次启动")
|
||
return
|
||
}
|
||
|
||
fileData, err := ioutil.ReadFile(m.heartbeatFile)
|
||
if err != nil {
|
||
log.Printf("[服务监控] 读取心跳文件失败: %v", err)
|
||
return
|
||
}
|
||
|
||
var lastData HeartbeatData
|
||
if err := json.Unmarshal(fileData, &lastData); err != nil {
|
||
log.Printf("[服务监控] 解析心跳数据失败: %v", err)
|
||
return
|
||
}
|
||
|
||
log.Printf("[服务监控] 上次心跳: %v, PID: %d, 正常关闭: %v",
|
||
lastData.LastHeartbeat.Format("2006-01-02 15:04:05"),
|
||
lastData.PID,
|
||
lastData.GracefulShut)
|
||
|
||
// 如果上次不是正常关闭,发送通知
|
||
if !lastData.GracefulShut {
|
||
timeSinceLastHeartbeat := time.Since(lastData.LastHeartbeat)
|
||
// 如果距离上次心跳超过2分钟,认为是异常关闭
|
||
if timeSinceLastHeartbeat > 2*time.Minute {
|
||
log.Printf("[服务监控] 检测到上次服务异常关闭(%v前),发送通知", timeSinceLastHeartbeat)
|
||
err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName)
|
||
if err != nil {
|
||
log.Printf("[服务监控] 发送异常关闭通知失败: %v", err)
|
||
} else {
|
||
log.Printf("[服务监控] 已发送异常关闭通知")
|
||
}
|
||
} else {
|
||
log.Printf("[服务监控] 距离上次心跳仅%v,可能是快速重启,不发送通知", timeSinceLastHeartbeat)
|
||
}
|
||
}
|
||
}
|