258 lines
6.7 KiB
Go
258 lines
6.7 KiB
Go
|
|
package tools
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"ai_xhs/service"
|
|||
|
|
"encoding/json"
|
|||
|
|
"io/ioutil"
|
|||
|
|
"log"
|
|||
|
|
"os"
|
|||
|
|
"os/signal"
|
|||
|
|
"path/filepath"
|
|||
|
|
"sync"
|
|||
|
|
"syscall"
|
|||
|
|
"time"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// ServiceMonitor 服务监控器
|
|||
|
|
type ServiceMonitor struct {
|
|||
|
|
alertPhone string
|
|||
|
|
serviceName string
|
|||
|
|
smsService *service.SmsService
|
|||
|
|
isRunning bool
|
|||
|
|
mutex sync.Mutex
|
|||
|
|
shutdownChan chan os.Signal
|
|||
|
|
alertSent bool // 标记是否已发送通知,避免重复发送
|
|||
|
|
heartbeatFile string // 心跳文件路径
|
|||
|
|
lastHeartbeat time.Time // 最后心跳时间
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HeartbeatData is the JSON document stored in the heartbeat file.
type HeartbeatData struct {
	// ServiceName is the name of the service that wrote the record.
	ServiceName string `json:"service_name"`

	// LastHeartbeat is the time at which the record was written.
	LastHeartbeat time.Time `json:"last_heartbeat"`

	// PID is the process ID of the writer.
	PID int `json:"pid"`

	// StartTime is the start time recorded by the writer.
	StartTime time.Time `json:"start_time"`

	// GracefulShut reports whether the record was written as part of a
	// graceful shutdown.
	GracefulShut bool `json:"graceful_shutdown"`
}
|
|||
|
|
|
|||
|
|
var (
|
|||
|
|
monitorInstance *ServiceMonitor
|
|||
|
|
monitorOnce sync.Once
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// GetServiceMonitor 获取服务监控器单例
|
|||
|
|
func GetServiceMonitor(alertPhone string, serviceName string) *ServiceMonitor {
|
|||
|
|
monitorOnce.Do(func() {
|
|||
|
|
heartbeatFile := filepath.Join(os.TempDir(), "ai_xhs_service_heartbeat.json")
|
|||
|
|
monitorInstance = &ServiceMonitor{
|
|||
|
|
alertPhone: alertPhone,
|
|||
|
|
serviceName: serviceName,
|
|||
|
|
smsService: service.GetSmsService(),
|
|||
|
|
isRunning: true,
|
|||
|
|
shutdownChan: make(chan os.Signal, 1),
|
|||
|
|
alertSent: false,
|
|||
|
|
heartbeatFile: heartbeatFile,
|
|||
|
|
lastHeartbeat: time.Now(),
|
|||
|
|
}
|
|||
|
|
})
|
|||
|
|
return monitorInstance
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartMonitoring 启动服务监控
|
|||
|
|
// 监听系统信号,在服务异常退出时发送短信通知
|
|||
|
|
func (m *ServiceMonitor) StartMonitoring() {
|
|||
|
|
// 检查上次启动是否异常关闭
|
|||
|
|
m.checkLastShutdown()
|
|||
|
|
|
|||
|
|
// 启动心跳任务
|
|||
|
|
m.startHeartbeat()
|
|||
|
|
|
|||
|
|
// 监听退出信号
|
|||
|
|
signal.Notify(m.shutdownChan,
|
|||
|
|
os.Interrupt, // Ctrl+C
|
|||
|
|
syscall.SIGTERM, // kill命令
|
|||
|
|
syscall.SIGQUIT, // Ctrl+\
|
|||
|
|
syscall.SIGABRT, // abort
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
go func() {
|
|||
|
|
sig := <-m.shutdownChan
|
|||
|
|
log.Printf("[服务监控] 捕获到退出信号: %v", sig)
|
|||
|
|
|
|||
|
|
m.mutex.Lock()
|
|||
|
|
m.isRunning = false
|
|||
|
|
m.mutex.Unlock()
|
|||
|
|
|
|||
|
|
// 标记为正常关闭
|
|||
|
|
m.markGracefulShutdown()
|
|||
|
|
|
|||
|
|
// 发送宕机通知
|
|||
|
|
if !m.alertSent {
|
|||
|
|
m.sendAlert("服务接收到退出信号")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 给短信发送一些时间
|
|||
|
|
time.Sleep(2 * time.Second)
|
|||
|
|
|
|||
|
|
// 退出程序
|
|||
|
|
os.Exit(0)
|
|||
|
|
}()
|
|||
|
|
|
|||
|
|
log.Printf("[服务监控] 服务监控已启动,监控电话: %s", m.alertPhone)
|
|||
|
|
log.Printf("[服务监控] 心跳文件: %s", m.heartbeatFile)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SetAlertSent 设置通知已发送标记(供外部调用,避免重复发送)
|
|||
|
|
func (m *ServiceMonitor) SetAlertSent() {
|
|||
|
|
m.mutex.Lock()
|
|||
|
|
m.alertSent = true
|
|||
|
|
m.mutex.Unlock()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SendManualAlert 手动发送服务宕机通知
|
|||
|
|
func (m *ServiceMonitor) SendManualAlert(reason string) error {
|
|||
|
|
return m.sendAlert(reason)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// sendAlert 发送宕机通知
|
|||
|
|
func (m *ServiceMonitor) sendAlert(reason string) error {
|
|||
|
|
if m.alertSent {
|
|||
|
|
log.Printf("[服务监控] 宕机通知已发送,跳过重复发送")
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
log.Printf("[服务监控] 服务宕机,原因: %s", reason)
|
|||
|
|
|
|||
|
|
err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName)
|
|||
|
|
if err != nil {
|
|||
|
|
log.Printf("[服务监控] 发送宕机通知失败: %v", err)
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
m.alertSent = true
|
|||
|
|
log.Printf("[服务监控] 宕机通知已发送到 %s", m.alertPhone)
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// IsRunning 检查服务是否运行中
|
|||
|
|
func (m *ServiceMonitor) IsRunning() bool {
|
|||
|
|
m.mutex.Lock()
|
|||
|
|
defer m.mutex.Unlock()
|
|||
|
|
return m.isRunning
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Shutdown 优雅关闭
|
|||
|
|
func (m *ServiceMonitor) Shutdown() {
|
|||
|
|
if m.shutdownChan != nil {
|
|||
|
|
m.shutdownChan <- syscall.SIGTERM
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// startHeartbeat 启动心跳任务,每30秒更新一次
|
|||
|
|
func (m *ServiceMonitor) startHeartbeat() {
|
|||
|
|
// 立即写入一次
|
|||
|
|
m.updateHeartbeat()
|
|||
|
|
|
|||
|
|
// 启动定时任务
|
|||
|
|
ticker := time.NewTicker(30 * time.Second)
|
|||
|
|
go func() {
|
|||
|
|
for range ticker.C {
|
|||
|
|
if !m.IsRunning() {
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
m.updateHeartbeat()
|
|||
|
|
}
|
|||
|
|
}()
|
|||
|
|
log.Printf("[服务监控] 心跳任务已启动,每30秒更新一次")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// updateHeartbeat 更新心跳文件
|
|||
|
|
func (m *ServiceMonitor) updateHeartbeat() {
|
|||
|
|
m.mutex.Lock()
|
|||
|
|
m.lastHeartbeat = time.Now()
|
|||
|
|
m.mutex.Unlock()
|
|||
|
|
|
|||
|
|
data := HeartbeatData{
|
|||
|
|
ServiceName: m.serviceName,
|
|||
|
|
LastHeartbeat: m.lastHeartbeat,
|
|||
|
|
PID: os.Getpid(),
|
|||
|
|
StartTime: time.Now(), // 在实际应用中应该记录启动时间
|
|||
|
|
GracefulShut: false, // 默认未正常关闭
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonData, err := json.MarshalIndent(data, "", " ")
|
|||
|
|
if err != nil {
|
|||
|
|
log.Printf("[服务监控] 序列化心跳数据失败: %v", err)
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
|
|||
|
|
log.Printf("[服务监控] 写入心跳文件失败: %v", err)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// markGracefulShutdown 标记为正常关闭
|
|||
|
|
func (m *ServiceMonitor) markGracefulShutdown() {
|
|||
|
|
data := HeartbeatData{
|
|||
|
|
ServiceName: m.serviceName,
|
|||
|
|
LastHeartbeat: time.Now(),
|
|||
|
|
PID: os.Getpid(),
|
|||
|
|
StartTime: m.lastHeartbeat,
|
|||
|
|
GracefulShut: true, // 标记为正常关闭
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
jsonData, err := json.MarshalIndent(data, "", " ")
|
|||
|
|
if err != nil {
|
|||
|
|
log.Printf("[服务监控] 序列化关闭数据失败: %v", err)
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if err := ioutil.WriteFile(m.heartbeatFile, jsonData, 0644); err != nil {
|
|||
|
|
log.Printf("[服务监控] 写入关闭标记失败: %v", err)
|
|||
|
|
}
|
|||
|
|
log.Printf("[服务监控] 已标记为正常关闭")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// checkLastShutdown 检查上次关闭是否异常
|
|||
|
|
func (m *ServiceMonitor) checkLastShutdown() {
|
|||
|
|
// 读取心跳文件
|
|||
|
|
if _, err := os.Stat(m.heartbeatFile); os.IsNotExist(err) {
|
|||
|
|
log.Printf("[服务监控] 未找到历史心跳文件,可能是首次启动")
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
fileData, err := ioutil.ReadFile(m.heartbeatFile)
|
|||
|
|
if err != nil {
|
|||
|
|
log.Printf("[服务监控] 读取心跳文件失败: %v", err)
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
var lastData HeartbeatData
|
|||
|
|
if err := json.Unmarshal(fileData, &lastData); err != nil {
|
|||
|
|
log.Printf("[服务监控] 解析心跳数据失败: %v", err)
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
log.Printf("[服务监控] 上次心跳: %v, PID: %d, 正常关闭: %v",
|
|||
|
|
lastData.LastHeartbeat.Format("2006-01-02 15:04:05"),
|
|||
|
|
lastData.PID,
|
|||
|
|
lastData.GracefulShut)
|
|||
|
|
|
|||
|
|
// 如果上次不是正常关闭,发送通知
|
|||
|
|
if !lastData.GracefulShut {
|
|||
|
|
timeSinceLastHeartbeat := time.Since(lastData.LastHeartbeat)
|
|||
|
|
// 如果距离上次心跳超过2分钟,认为是异常关闭
|
|||
|
|
if timeSinceLastHeartbeat > 2*time.Minute {
|
|||
|
|
log.Printf("[服务监控] 检测到上次服务异常关闭(%v前),发送通知", timeSinceLastHeartbeat)
|
|||
|
|
err := m.smsService.SendServiceDownAlert(m.alertPhone, m.serviceName)
|
|||
|
|
if err != nil {
|
|||
|
|
log.Printf("[服务监控] 发送异常关闭通知失败: %v", err)
|
|||
|
|
} else {
|
|||
|
|
log.Printf("[服务监控] 已发送异常关闭通知")
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
log.Printf("[服务监控] 距离上次心跳仅%v,可能是快速重启,不发送通知", timeSinceLastHeartbeat)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|