夜莺Nightingale V5 通过Exporter实现告警升级

夜莺Nightingale V5 通过Exporter实现告警升级

nightingale

由于夜莺监控本身没有告警升级功能,可以编写一个自定义 Exporter:定期查询夜莺数据库中的活跃告警,并将其暴露为 Prometheus 指标,再基于该指标配置告警规则,从而实现告警升级。代码如下:

夜莺版本:V5.15.0

开发环境:Golang1.18

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package main

import (
"database/sql"
_ "github.com/go-sql-driver/mysql"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"net/http"
"strconv"
"time"
)

var (
alertEventMetric = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "alert_event_trigger_time",
Help: "Trigger time of the alert events in UNIX timestamp",
},
[]string{"id", "cluster", "group_id", "severity", "description", "Group", "Region"}, #此处标签根据业务实际情况添加
)
)

// init makes alertEventMetric visible on the default /metrics handler by adding
// it to Prometheus's default registerer. It panics if registration fails.
func init() {
	prometheus.DefaultRegisterer.MustRegister(alertEventMetric)
}
// queryDatabaseAndUpdateMetrics reads the active alert events with severity < 3
// from the alert_cur_event table and republishes them through alertEventMetric.
//
// Each event becomes one gauge series whose value is 1 when the event was
// triggered more than escalateAfterSeconds ago and 0 otherwise.
//
// Any database failure is logged and the function returns without touching the
// previously published series, so a transient outage neither terminates the
// exporter nor wipes the last known state. The gauge is only reset once the
// whole result set has been read successfully.
func queryDatabaseAndUpdateMetrics() {
	// Escalation threshold: 8 hours. Change this to suit your own policy.
	const escalateAfterSeconds = 8 * 3600

	dsn := "xxx:xxx@tcp(xxxx:3306)/n9e_v5"
	db, err := sql.Open("mysql", dsn)
	if err != nil {
		log.Printf("Failed to connect to MySQL: %v", err)
		return
	}
	defer db.Close()

	// sql.Open does not necessarily dial; Ping verifies the connection is usable.
	if err = db.Ping(); err != nil {
		log.Printf("MySQL connection error: %v", err)
		return
	}

	// Query the alert_cur_event table.
	query := `SELECT id, cluster, group_id, severity, rule_name, trigger_time FROM alert_cur_event where severity < 3;`

	rows, err := db.Query(query)
	if err != nil {
		log.Printf("Failed to execute query: %v", err)
		return
	}
	defer rows.Close()

	// Buffer the samples first so the gauge is never left empty or half
	// populated when the iteration fails part-way through.
	type sample struct {
		labels []string
		value  float64
	}
	var samples []sample

	currentTime := time.Now().Unix()
	for rows.Next() {
		var (
			id          int
			cluster     string
			groupID     int
			severity    int
			ruleName    string
			triggerTime int64
		)
		if err := rows.Scan(&id, &cluster, &groupID, &severity, &ruleName, &triggerTime); err != nil {
			// A single bad row should not discard the rest of the result set.
			log.Printf("Error scanning row: %v", err)
			continue
		}

		// Fixed value for the "Region" label.
		region := "告警升级"

		var metricValue float64 // 0: still within the escalation threshold.
		if currentTime-triggerTime > escalateAfterSeconds {
			metricValue = 1 // Older than the threshold: escalate.
		}

		samples = append(samples, sample{
			// Order must match the label names declared on alertEventMetric.
			labels: []string{
				strconv.Itoa(id),
				cluster,
				strconv.Itoa(groupID),
				strconv.Itoa(severity),
				ruleName,
				// NOTE(review): the "Group" label is filled with cluster, as in
				// the original code; confirm this is the intended value.
				cluster,
				region,
			},
			value: metricValue,
		})
	}

	if err = rows.Err(); err != nil {
		log.Printf("Error during rows iteration: %v", err)
		return
	}

	// Drop series of alerts that are no longer active, then publish the fresh set.
	alertEventMetric.Reset()
	for _, s := range samples {
		alertEventMetric.WithLabelValues(s.labels...).Set(s.value)
	}
}

// main starts a background goroutine that refreshes the metrics from the
// database, pausing 30 seconds after each refresh, and then serves the
// Prometheus handler on :8080 at /metrics. It exits through log.Fatal if the
// HTTP server stops.
func main() {
	// Pause between two consecutive database queries.
	const queryInterval = 30 * time.Second

	refreshLoop := func() {
		for {
			queryDatabaseAndUpdateMetrics()
			time.Sleep(queryInterval)
		}
	}
	go refreshLoop()

	http.Handle("/metrics", promhttp.Handler())
	log.Println("Starting HTTP server on :8080")

	serveErr := http.ListenAndServe(":8080", nil)
	log.Fatal(serveErr)
}

效果如下:

image-20241112152213563

编译运行后,在 Prometheus 中添加该 Exporter 作为抓取目标,然后在夜莺中创建如下告警规则。指标值为 0 表示正常告警(触发未超过 8 小时),为 1 表示需要升级的告警(触发已超过 8 小时):

alert_event_trigger_time > 0

理论上,只需修改代码中的表名和字段名,即可兼容夜莺 v6 或 v7。


夜莺Nightingale V5 通过Exporter实现告警升级
https://www.starsfox.com/posts/1e80028d.html
作者
Flycat
发布于
2024年11月11日
许可协议