123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- package main
- import (
- "fmt"
- "strconv"
- "strings"
- "time"
- "redisdog/model"
- )
// Default timing values, expressed in seconds. They are used when the
// matching syscfg entry is absent or does not hold a valid integer.
const (
	// MONITOR_DEFAULT_LOOP_INTERVAL is the default pause between two
	// monitoring passes; syscfg key "monitor_loop_interval" overrides it.
	MONITOR_DEFAULT_LOOP_INTERVAL = 30
	// MONITOR_DEFAULT_MAIL_INTERVAL is the default minimum gap between two
	// batches of alert mails for the same Redis instance; syscfg key
	// "monitor_mail_interval" overrides it.
	MONITOR_DEFAULT_MAIL_INTERVAL = 3600
)
- type RedisMonitor struct {
- redisCfg *model.RedisCfg
- monitorLogger *model.MonitorLog
- warnLogger *model.WarnLog
- processLogger *model.ProcessLog
- logs map[int64]*model.MonitorLogRow
- }
- func NewRedisMonitor() *RedisMonitor {
- return &RedisMonitor{
- redisCfg: model.NewRedisCfg(Db),
- monitorLogger: model.NewMonitorLog(Db),
- warnLogger: model.NewWarnLog(Db),
- processLogger: model.NewProcessLog(Db),
- logs: make(map[int64]*model.MonitorLogRow),
- }
- }
- func (m *RedisMonitor) Loop() {
- interval := MONITOR_DEFAULT_LOOP_INTERVAL
- for {
- if str, ok := SysCfg.Get("monitor_loop_interval"); ok {
- if num, err := strconv.Atoi(str); err == nil {
- interval = num
- } else {
- SYSLOG("DEBUG", fmt.Sprintf("表syscfg记录cfg_key=monitor_loop_interval的值不是数字:%s", err.Error()))
- }
- }
- time.Sleep(time.Second * time.Duration(interval))
- m.loopStep()
- }
- }
- func (m *RedisMonitor) loopStep() {
- interval := MONITOR_DEFAULT_MAIL_INTERVAL
- rows, err := m.redisCfg.GetAll(0)
- if err != nil {
- SYSLOG("WARN", fmt.Sprintf("函数model.RedisCfg.GetAll(0)调用失败:%s", err.Error()))
- return
- }
- for _, row := range rows {
- now, mails := time.Now(), make([]*MailRequest, 0)
- ts, tstr := now.Unix(), now.String()
- //取配置
- if str, ok := SysCfg.Get("monitor_mail_interval"); ok {
- if num, err := strconv.Atoi(str); err == nil {
- interval = num
- } else {
- SYSLOG("DEBUG", fmt.Sprintf("表syscfg记录cfg_key=monitor_mail_interval的值不是数字:%s", err.Error()))
- }
- }
- //初始化检测数据
- if _, ok := m.logs[row.Id]; !ok {
- m.logs[row.Id] = &model.MonitorLogRow{RedisId: row.Id}
- }
- m.logs[row.Id].LogTime = ts
- //查询Redis的状态信息
- info := queryRedisInfo(row)
- if info.Error != "" {
- //更新检测数据
- m.logs[row.Id].QueryStatus = false
- m.logs[row.Id].FailedCount++
- //判断状态异常检查是否连接出错次数达到上限
- if m.logs[row.Id].FailedCount >= row.MaxStatusFailed {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("报警:Redis[%s]状态检测失败", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]状态检测失败:<b>%s</b>,已连续失败:<b>%d</b>次</p>`, tstr, row.Id, row.Address, info.Error, m.logs[row.Id].FailedCount),
- })
- }
- //记录日志
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态查询失败:%s,连接第%d次", row.Id, row.Address, info.Error, m.logs[row.Id].FailedCount))
- } else {
- var usedMemory, maxMemory, systemMemory, connections, qps, evictedKeys, eviIncreased int
- //已用内存
- if usedMemory, err = strconv.Atoi(info.Data["used_memory"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段used_memory值无效:%s", row.Id, row.Address, err.Error()))
- }
- //最大可分配内存
- if maxMemory, err = strconv.Atoi(info.Data["maxmemory"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段maxmemory值无效:%s", row.Id, row.Address, err.Error()))
- }
- //系统内存
- if systemMemory, err = strconv.Atoi(info.Data["total_system_memory"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段total_system_memory值无效:%s", row.Id, row.Address, err.Error()))
- }
- //连接数
- if connections, err = strconv.Atoi(info.Data["connected_clients"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段connected_clients值无效:%s", row.Id, row.Address, err.Error()))
- }
- //QPS
- if qps, err = strconv.Atoi(info.Data["instantaneous_ops_per_sec"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段instantaneous_ops_per_sec值无效:%s", row.Id, row.Address, err.Error()))
- }
- //淘汰数
- if evictedKeys, err = strconv.Atoi(info.Data["evicted_keys"]); err != nil {
- SYSLOG("WARN", fmt.Sprintf("Redis#%d[%s]状态字段evicted_keys值无效:%s", row.Id, row.Address, err.Error()))
- }
- //新增淘汰数
- eviIncreased = evictedKeys - int(m.logs[row.Id].EvictedKeys)
- //检查连接的客户端数量是否达到上限
- if connections >= int(row.MaxConnection) {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("报警:Redis[%s]连接数达到上限", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]连接数已达到:<b>%d</b>,上限为:%d</p>`, tstr, row.Id, row.Address, connections, row.MaxConnection),
- })
- }
- //检查QPS是否达到上限
- if qps >= int(row.MaxQPS) {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("报警:Redis[%s]QPS达到上限", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]QPS已达到:<span color="red">%d</span>,上限为:%d</p>`, tstr, row.Id, row.Address, qps, row.MaxQPS),
- })
- }
- //检查新增淘汰数是否达到上限
- if eviIncreased >= int(row.MaxEviIncreased) {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("报警:Redis[%s]新增淘汰记录数达到上限", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]新增淘汰记录数已达到:<span color="red">%d</span>,上限为:%d</p>`, tstr, row.Id, row.Address, eviIncreased, row.MaxEviIncreased),
- })
- }
- //有内存使用限制,并且允许自动扩容
- if maxMemory > 0 {
- flag := false
- //检查当前使用内存是否达到了预警值
- freeMemory := maxMemory - usedMemory
- if freeMemory < int(row.MinMemoryFree) {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("报警:Redis[%s]可用内存不足", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]可用内存不足,分配:<b>%s</b>,剩余:<b>%s</b>,要求剩余:<b>%s</b></p>`, tstr, row.Id, row.Address, info.Data["maxmemory_human"], number2size(float64(freeMemory)), number2size(float64(row.MinMemoryFree))),
- })
- flag = true
- }
- //判断是否需要自动扩容
- if flag && row.StepMemoryIncrease > 0 && maxMemory < int(row.MaxMemoryUsage) {
- newMaxMemory := int64(maxMemory) + row.StepMemoryIncrease
- if newMaxMemory > row.MaxMemoryUsage {
- newMaxMemory = row.MaxMemoryUsage
- }
- ret, err := resetRedisConfig(row, newMaxMemory)
- //判断扩容结果
- if err != nil {
- SYSLOG("ERROR", fmt.Sprintf("Redis#%d[%s]扩容失败:%s", row.Id, row.Address, err.Error()))
- } else {
- if ret {
- mails = append(mails, &MailRequest{
- sendto: strings.Split(row.MailList, ";"),
- title: fmt.Sprintf("通知:Redis[%s]已自动扩容", row.Address),
- content: fmt.Sprintf(`<p>时间:%s</p><p>Redis#%d[%s]最大可用内存限制已由<b>%s</b>调整为<b>%s</b></p>`, tstr, row.Id, row.Address, info.Data["maxmemory_human"], number2size(float64(newMaxMemory))),
- })
- if _, err = m.processLogger.Add(&model.ProcessLogRow{RedisId: row.Id, MaxMemoryBefore: int64(maxMemory), MaxMemoryAfter: newMaxMemory}); err != nil {
- SYSLOG("ERROR", fmt.Sprintf("Redis#%d[%s]扩容日志写入DB失败:%s", row.Id, row.Address, err.Error()))
- }
- } else {
- SYSLOG("ERROR", fmt.Sprintf("Redis#%d[%s]扩容失败:无改变", row.Id, row.Address))
- }
- }
- }
- }
- //更新检测数据
- m.logs[row.Id].QueryStatus = true
- m.logs[row.Id].FailedCount = 0
- m.logs[row.Id].UsedMemory = int64(usedMemory)
- m.logs[row.Id].MaxMemory = int64(maxMemory)
- m.logs[row.Id].SystemMemory = int64(systemMemory)
- m.logs[row.Id].Connection = int64(connections)
- m.logs[row.Id].QPS = int64(qps)
- m.logs[row.Id].EvictedKeys = int64(evictedKeys)
- m.logs[row.Id].EviIncreased = int64(eviIncreased)
- }
- //添加任务到邮件发送队列
- if len(mails) > 0 && m.logs[row.Id].LastMailTime < ts-int64(interval) {
- m.logs[row.Id].LastMailTime = ts
- for _, mail := range mails {
- Sender.Push(mail)
- if _, err = m.warnLogger.Add(&model.WarnLogRow{RedisId: row.Id, WarnMsg: mail.content}); err != nil {
- SYSLOG("ERROR", fmt.Sprintf("Redis#%d[%s]报警日志写入DB失败:%s", row.Id, row.Address, err.Error()))
- }
- }
- }
- //记录检查日志
- if _, err = m.monitorLogger.Add(m.logs[row.Id]); err != nil {
- SYSLOG("ERROR", fmt.Sprintf("Redis#%d[%s]监控日志写入DB失败:%s", row.Id, row.Address, err.Error()))
- }
- }
- }
|