阿里云数据库慢日志监控

阿里云数据慢日志监控

需求说明

阿里云的RDS Redis Mongo云服务慢日志告警只能对接云监控,无法对接自有的监控系统,需要获取阿里云数据库慢日志相关数据并接入监控

准备

需要开通阿里云SLS日志服务

阿里云RDS慢日志接入SLS:

https://sls.console.aliyun.com/lognext/app/lens/redis&resource=/common-data-access

阿里云Redis慢日志接入SLS:

https://sls.console.aliyun.com/lognext/app/lens/redis&resource=/common-data-access

阿里云Mongo慢日志接入SLS:

https://help.aliyun.com/zh/sls/user-guide/enable-the-log-audit-feature

开通后在对应控制台查看目标存储库相关sls project logstore信息

image-20241114151552275

查看接入日志

image-20241114151742623

编写Exporter

使用Prometheus go client,Aliyun Go SDK 查询对应慢日志情况并输出Metrices

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package main

import (
"fmt"
"github.com/aliyun/alibaba-cloud-sdk-go/sdk"
"github.com/aliyun/alibaba-cloud-sdk-go/sdk/auth/credentials"
"github.com/aliyun/alibaba-cloud-sdk-go/sdk/requests"
"github.com/aliyun/alibaba-cloud-sdk-go/services/dds"
r_kvstore "github.com/aliyun/alibaba-cloud-sdk-go/services/r-kvstore"
"github.com/aliyun/alibaba-cloud-sdk-go/services/rds"
sls "github.com/aliyun/aliyun-log-go-sdk"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"net/http"
"strconv"
"time"
)

var accessKeyID = "xxx" //替换accessKeyID
var accessKeySecret = "xxx" //替换accessKeySecret
var region = "cn-shanghai"
var endpoint = "cn-shanghai.log.aliyuncs.com"
var queries = map[string]map[string]string{
"aliyun-product-data-xxx-cn-shanghai": { //RDS Mysql 日志 修改sls 对应的Project
"slow_error_log": "* and not db_name: \"\" | select instance_id as description,db_type,db_name,avg(query_time) AS \"query_time\" ,count(query_sql) AS \"count\" GROUP BY instance_id,db_name,db_type",
},
"nosql-xxx-cn-shanghai": { //RDS Mongo Redis 日志 修改sls 对应的Project
"redis_slow_run_log": "* and command:* and not db: -1 | select instanceid as description,count(command) as count,avg(latency)/1000000 as query_time ,db as db_name group by instanceid,db",
"mongo_slow_run_log": "* and audit_type : slowOp |select instanceid as \"description\", db as \"db_name\",avg(latency /1000) as \"query_time\",count(command) as \"count\" group by instanceid,db",
},
// 添加更多的项目和日志库及其自定义查询语句
}

var (
alertEventMetric = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "db_slow_logs_count",
Help: "db_slow_logs_count",
},
[]string{"db_name", "query_time", "db_type", "description"},
)
)

func init() {
// Register custom metrics with Prometheus's default registry.
prometheus.MustRegister(alertEventMetric)
}

func getAliyunRdsName() (maps map[string]string) {
config := sdk.NewConfig()
credential := credentials.NewAccessKeyCredential(accessKeyID, accessKeySecret)
client, err := rds.NewClientWithOptions(region, config, credential)
if err != nil {
log.Fatal("Aliyun new Client fail!", err)
return
}
request := rds.CreateDescribeDBInstancesRequest()
request.Scheme = "https"
request.PageSize = requests.NewInteger(100)
request.PageNumber = requests.NewInteger(1)
response, err := client.DescribeDBInstances(request)
maps = make(map[string]string)
if err != nil {
log.Fatal("Aliyun get Rds fail!", err)
}
for _, v := range response.Items.DBInstance {
maps[v.DBInstanceId] = v.DBInstanceDescription
}
return
}

func getAliyunRedisName() (maps map[string]string) {
config := sdk.NewConfig()
credential := credentials.NewAccessKeyCredential(accessKeyID, accessKeySecret)
client, err := r_kvstore.NewClientWithOptions(region, config, credential)
if err != nil {
log.Fatal("Aliyun new Client fail!", err)
return
}
request := r_kvstore.CreateDescribeInstancesRequest()
request.Scheme = "https"
request.PageSize = requests.NewInteger(100)
request.PageNumber = requests.NewInteger(1)
response, err := client.DescribeInstances(request)
maps = make(map[string]string)
if err != nil {
log.Fatal("Aliyun get Rds fail!", err)
}
for _, v := range response.Instances.KVStoreInstance {
maps[v.InstanceId] = v.InstanceName
}
return
}

func getAliyunMongoName() (maps map[string]string) {
config := sdk.NewConfig()
credential := credentials.NewAccessKeyCredential(accessKeyID, accessKeySecret)
client, err := dds.NewClientWithOptions(region, config, credential)
if err != nil {
log.Fatal("Aliyun new Client fail!", err)
return
}
request := dds.CreateDescribeDBInstancesRequest()
request.Scheme = "https"
request.PageSize = requests.NewInteger(100)
request.PageNumber = requests.NewInteger(1)
response, err := client.DescribeDBInstances(request)
maps = make(map[string]string)
if err != nil {
log.Fatal("Aliyun get Rds fail!", err)
}
for _, v := range response.DBInstances.DBInstance {
maps[v.DBInstanceId] = v.DBInstanceDescription
}
return
}

func getAliyunLogsClient() (client sls.ClientInterface) {
client = sls.CreateNormalInterface(endpoint, accessKeyID, accessKeySecret, "")
return
}

func getAliyunMysqlSlowLogsMetrics() {
alertEventMetric.Reset()
client := getAliyunLogsClient()
// 查询条件和时间范围
from := time.Now().Add(-time.Minute).Unix() // 查询1分钟前的日志
to := time.Now().Unix()
rdsNames := getAliyunRdsName()
redisNames := getAliyunRedisName()
mongoNames := getAliyunMongoName()
for project, logstores := range queries {
for logstore, query := range logstores {
// 这里可以设置 limit 参数来控制返回的日志数量
resp, err := client.GetLogs(project, logstore, "", from, to, query, 100, 0, true)
if err != nil {
log.Fatalf("获取日志失败: %v", err)
}
if len(resp.Logs) != 0 {
for _, logEntry := range resp.Logs {
db_name := logEntry["db_name"]
query_time := logEntry["query_time"]
db_type := logEntry["db_type"]
description := rdsNames[logEntry["description"]]
if logstore == "redis_slow_run_log" {
db_type = "redis"
description = redisNames[logEntry["description"]]
}
if logstore == "mongo_slow_run_log" {
db_type = "mongo"
description = mongoNames[logEntry["description"]]
}
count, _ := strconv.ParseFloat(logEntry["count"], 64)
alertEventMetric.WithLabelValues(db_name, query_time, db_type, description).Set(count)
}
} else {
fmt.Println("nil")
}
}
}
// 执行日志查询
}

func main() {
go func() {
for {
getAliyunMysqlSlowLogsMetrics()
time.Sleep(30 * time.Second) // Query interval
}
}()
http.Handle("/metrics", promhttp.Handler())
log.Println("Starting HTTP server on :8081")
log.Fatal(http.ListenAndServe(":8081", nil))
}

将代码中替换相应AK,SLS Project Logstore

1
2
#编译
GOOS=linux GOARCH=amd64 go build

查看效果

image-20241114152548782

标签:

db_name :数据库名

query_time :平均执行时间

db_type:数据库类型(Mongo,Redis,Mysql)

description:数据库实例名

指标值:慢日志数量(count)

配置告警

告警规则:db_slow_logs_count > xxx

可根据实际需求添加

对接Grafana

可使用Grafana对日志进行展示查询,方便告警后查看

Grafana安装阿里云SLS插件:

https://help.aliyun.com/zh/sls/developer-reference/connect-log-service-to-grafana

效果:

image-20241114153210408

image-20241114153228528


阿里云数据库慢日志监控
https://www.starsfox.com/posts/21707fcd.html
作者
Flycat
发布于
2024年11月14日
许可协议