1 Alertmanager安装及子路由配置 *****
- Alertmanager环境部署
1.什么是Alertmanager
Alertmanager是一款开源的告警工具包,可以和Prometheus集成。
2.下载Alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.28.1/alertmanager-0.28.1.linux-amd64.tar.gz
SVIP:
[root@prometheus-server33 ~]# wget http://192.168.17.253/Resources/Prometheus/softwares/Alertmanager/alertmanager-0.28.1.linux-amd64.tar.gz
3.解压安装包
[root@prometheus-server33 ~]# tar xf alertmanager-0.28.1.linux-amd64.tar.gz -C /usr/local/
[root@prometheus-server33 ~]#
4.修改Alertmanager的配置文件
[root@prometheus-server33 ~]# cd /usr/local/alertmanager-0.28.1.linux-amd64/
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]#
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ll
total 67932
drwxr-xr-x 2 1001 1002 4096 Mar 7 23:08 ./
drwxr-xr-x 11 root root 4096 May 14 15:03 ../
-rwxr-xr-x 1 1001 1002 38948743 Mar 7 23:06 alertmanager*
-rw-r--r-- 1 1001 1002 356 Mar 7 23:07 alertmanager.yml
-rwxr-xr-x 1 1001 1002 30582387 Mar 7 23:06 amtool*
-rw-r--r-- 1 1001 1002 11357 Mar 7 23:07 LICENSE
-rw-r--r-- 1 1001 1002 311 Mar 7 23:07 NOTICE
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]#
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1792702099@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1792702099@qq.com'
smtp_auth_password: 'efucaeallvybbcje'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: cmy_dba_exporter
# 建议将continue的值设置为true,表示当前路由匹配成功后,仍会继续向下匹配后续的同级路由
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: cmy_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '2457193694@qq.com'
send_resolved: true
- to: '3081449097@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '1138658498@qq.com'
send_resolved: true
- to: '3864762641@qq.com'
send_resolved: true
- name: 'sre_system'
email_configs:
- to: 'caidaopeng1201@163.com'
send_resolved: true
- to: '2859770173@qq.com'
send_resolved: true
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]#
5.检查配置文件语法
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 3 receivers
- 0 templates
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]#
6.启动Alertmanager服务
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./alertmanager
7.访问Alertmanager的WebUI
http://10.168.10.33:9093/#/status
1.1 Prometheus server集成Alertmanager
- Prometheus server集成Alertmanager实现告警功能
1. 修改配置文件
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# egrep -v "^#|^$" prometheus.yml
global:
scrape_interval: 3s
evaluation_interval: 3s
...
alerting:
alertmanagers:
- static_configs:
- targets:
- 10.168.10.33:9093
rule_files:
- "cmy-linux-rules.yml"
...
scrape_configs:
...
- job_name: "cmy_dba_exporter"
static_configs:
- targets: ["10.168.10.41:9100"]
- job_name: "cmy_k8s_exporter"
static_configs:
- targets: ["10.168.10.42:9100"]
- job_name: "cmy_bigdata_exporter"
static_configs:
- targets: ["10.168.10.43:9100"]
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
2 修改告警规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat cmy-linux-rules.yml
groups:
- name: cmy-linux-rules-alert
rules:
- alert: cmy-dba_exporter-alert
expr: up{job="cmy_dba_exporter"} == 0
for: 1s
labels:
school: cmy
class: linux97
apps: dba
annotations:
summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
- alert: cmy-k8s_exporter-alert
expr: up{job="cmy_k8s_exporter"} == 0
for: 1s
labels:
school: cmy
class: linux97
apps: k8s
annotations:
summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
- alert: cmy-bigdata_exporter-alert
expr: up{job="cmy_bigdata_exporter"} == 0
for: 1s
labels:
school: cmy
class: linux97
apps: bigdata
annotations:
summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
3.检查配置文件语法
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 1 rule files found
SUCCESS: prometheus.yml is valid prometheus config file syntax
Checking cmy-linux-rules.yml
SUCCESS: 3 rules found
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
4.重新加载prometheus的配置
curl -X POST http://10.168.10.31:9090/-/reload
5.查看prometheus server的WebUI验证是否生效
http://10.168.10.31:9090/config
http://10.168.10.31:9090/targets?search=
http://10.168.10.31:9090/alerts?search=
6.触发告警功能
[root@node-exporter41 ~]# systemctl stop node-exporter.service
[root@node-exporter41 ~]# ss -ntl | grep 9100
[root@node-exporter41 ~]#
[root@node-exporter42 ~]# systemctl stop node-exporter.service
[root@node-exporter42 ~]#
[root@node-exporter42 ~]# ss -ntl | grep 9100
[root@node-exporter42 ~]#
[root@node-exporter43 ~]# systemctl stop node-exporter.service
[root@node-exporter43 ~]#
[root@node-exporter43 ~]# ss -ntl | grep 9100
[root@node-exporter43 ~]#
7.查看Alertmanager的WebUI及邮箱接收者
略,见视频。
2 自定义告警模版
2.1 告警模板介绍
1 告警模板介绍
默认的告警信息界面有些简单,可以借助告警的模板信息,对告警信息进行丰富,需要借助于Alertmanager的模板功能来实现。
告警模板的使用流程如下:
- 分析关键信息
- 定制模板内容
- Alertmanager加载模板文件
- 告警信息使用模板内容属性
模板文件使用标准Go模板语法,并暴露一些包含时间序列的标签和值的变量。
- 标签引用: {{ $labels.<label_name> }}
- 指标样本值引用: {{ $value }}
为了显示效果,需要了解一些html相关技术,参考链接:
https://www.w3school.com.cn/html/index.asp
2.2 告警模板参考案例
2 Alertmanager节点自定义告警模板参考案例
2.1 创建邮件模板文件工作目录
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# mkdir -pv /cmy/softwares/alertmanager/tmpl
2.2 创建模板实例,工作中可以考虑嵌入公司的logo
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat /cmy/softwares/alertmanager/tmpl/email.tmpl
{{ define "test.html" }}
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<style type="text/css">
body {
font-family: 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 700px;
margin: 0 auto;
padding: 20px;
background-color: #f9f9f9;
}
.alert-card {
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.alert-critical {
background: linear-gradient(135deg, #FFF6F6 0%, #FFEBEB 100%);
border-left: 5px solid #FF5252;
}
.alert-resolved {
background: linear-gradient(135deg, #F6FFF6 0%, #EBFFEB 100%);
border-left: 5px solid #4CAF50;
}
.alert-title {
font-size: 18px;
font-weight: bold;
margin-bottom: 15px;
display: flex;
align-items: center;
}
.alert-icon {
width: 24px;
height: 24px;
margin-right: 10px;
}
.alert-field {
margin-bottom: 8px;
display: flex;
}
.field-label {
font-weight: bold;
min-width: 80px;
color: #555;
}
.field-value {
flex: 1;
}
.timestamp {
color: #666;
font-size: 13px;
margin-top: 15px;
text-align: right;
}
.divider {
height: 1px;
background: #eee;
margin: 15px 0;
}
</style>
</head>
<body>
{{- if gt (len .Alerts.Firing) 0 -}}
<div class="alert-header alert-critical">
告警触发 - 请立即处理!
</div>
<div>
<img src="https://img95.699pic.com/element/40114/9548.png_860.png" width="200px" height="200px">
</div>
{{- range $index, $alert := .Alerts -}}
<div class="alert-card alert-critical">
<div class="alert-field">
<span class="field-label">告警名称:</span>
<span class="field-value">{{ .Labels.alertname }}</span>
</div>
<div class="alert-field">
<span class="field-label">告警级别:</span>
<span class="field-value">{{ .Labels.severity }}</span>
</div>
<div class="alert-field">
<span class="field-label">目标机器:</span>
<span class="field-value">{{ .Labels.instance }}</span>
</div>
<div class="alert-field">
<span class="field-label">告警摘要:</span>
<span class="field-value">{{ .Annotations.summary }}</span>
</div>
<div class="alert-field">
<span class="field-label">触发时间:</span>
<span class="field-value">{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
</div>
{{- if .Annotations.description }}
<div class="divider"></div>
<div class="alert-field">
<span class="field-label">详细描述:</span>
<span class="field-value">{{ .Annotations.description }}</span>
</div>
{{- end }}
</div>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
<div class="alert-card alert-resolved">
<div class="alert-title">
告警恢复通知
</div>
<div>
<img src="https://tse2-mm.cn.bing.net/th/id/OIP-C.n7AyZv_wWXqFCc1mtlGhFgHaHa?rs=1&pid=ImgDetMain" width="300" height="300">
</div>
<div class="alert-field">
<span class="field-label">告警名称:</span>
<span class="field-value">{{ .Labels.alertname }}</span>
</div>
<div class="alert-field">
<span class="field-label">目标机器:</span>
<span class="field-value">{{ .Labels.instance }}</span>
</div>
<div class="alert-field">
<span class="field-label">告警摘要:</span>
<span class="field-value">[ {{ .Annotations.summary }}] 此告警已经恢复~</span>
</div>
<div class="alert-field">
<span class="field-label">恢复时间:</span>
<span class="field-value">{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
</div>
{{- if .Annotations.description }}
<div class="alert-field">
<span class="field-label">详细描述:</span>
<span class="field-value">{{ .Annotations.description }}</span>
</div>
{{- end }}
</div>
{{- end }}
{{- end }}
</body>
</html>
{{ end }}
{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= ERROR ==========<br>
这是告警邮件,请相关责任人及时处理!!!<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }} {{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= INFO ==========<br>
这是恢复邮件,问题已解决<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
2.3 alertmanager引用自定义模板文件
cat /cmy/softwares/alertmanager/tmpl/email.bak
{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= ERROR ==========<br>
这是告警邮件,请相关责任人及时处理!!!<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }} {{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= INFO ==========<br>
这是恢复邮件,问题已解决<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
[root@pm33 /usr/local/alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
global:
resolve_timeout: 5m
smtp_from: 'songfan0113@163.com'
smtp_smarthost: 'smtp.163.com:465'
smtp_auth_username: 'songfan0113@163.com'
smtp_auth_password: 'FVcjJQ8tw7xzGNP8'
smtp_require_tls: false
smtp_hello: '163.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: cmy_dba_exporter
# 建议将continue的值设置为true,表示当前路由匹配成功后,仍会继续向下匹配后续的同级路由
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: cmy_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '1258389502@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '1301118915@qq.com'
send_resolved: true
- name: 'sre_system'
email_configs:
- to: '2838466880@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX97报警邮件" }
html: '{{ template "test.html" . }}'
- to: '1792702099@qq.com'
send_resolved: true
templates:
- '/cmy/softwares/alertmanager/tmpl/*.tmpl'
2.4 alertmanager语法检查
[root@pm33 /usr/local/alertmanager-0.28.1.linux-amd64]# ./amtool check-config ./alertmanager.yml
Checking './alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 3 receivers
- 1 templates
SUCCESS
2.5 重启Alertmanager程序
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./alertmanager
2.6 查看WebUi观察配置是否生效
http://10.168.10.33:9093/#/status
2.7 再次触发告警
略,见视频。
2.8 Prometheus集成Alertmanager
cat prometheus.yml
...
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 10.168.10.33:9093
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "cmy-linux-rules.yml"
# - "first_rules.yml"
# - "second_rules.yml"
cat cmy-linux-rules.yml
groups:
- name: cmy-linux-rules-alert
rules:
- alert: ywz-dba_exporter-alert
expr: up{job="cmy_dba_exporter"} == 0
for: 1s
labels:
school: cmy
class: linux97
apps: dba
annotations:
summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
- alert: zjh-k8s_exporter-alert
expr: (node_memory_MemTotal_bytes{job="cmy_k8s_exporter"} - node_memory_MemAvailable_bytes{job="cmy_k8s_exporter"}) / node_memory_MemTotal_bytes{job="cmy_k8s_exporter"} * 100 > 90
for: 1s
labels:
school: cmy
class: linux97
apps: k8s
annotations:
summary: "{{ $labels.instance }} K8S服务器内存大于90%,请及时处理!"
description: "{{ $labels.instance }} 内存使用率已达到 {{ printf \"%.2f\" $value }}%。当前阈值: 90%"
- alert: cmy-bigdata_exporter-alert
expr: up{job="cmy_bigdata_exporter"} == 0
for: 1s
labels:
school: cmy
class: linux97
apps: bigdata
annotations:
summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
3 告警静默
- 静默的用途:
  - 计划维护期间避免告警噪音
  - 已知问题修复过程中减少干扰
  - 屏蔽预期内的非关键告警
- 静默的特点:
  - 不会阻止告警触发,只是阻止通知发送
  - 基于标签匹配规则工作
  - 可设置静默时间范围
- 使用amtool创建静默的示例:
amtool silence add \
alertname=~"Database.*" \
instance=~"db-.*" \
--comment="周五23:00-周六2:00数据库维护" \
--author="dba-team@example.com" \
--start="2023-07-21T23:00:00+08:00" \
--end="2023-07-22T02:00:00+08:00"
4 告警抑制
- 基本定义:当特定"源告警"触发时,自动抑制匹配条件的"目标告警"通知
- 设计目的:
  - 消除冗余告警(如"主机宕机"时抑制该主机上的所有服务告警)
  - 建立告警优先级层次(高优先级告警抑制低优先级告警)
  - 减少告警噪音,聚焦关键问题
- 关键特性:
  - 基于标签匹配规则
  - 单向抑制关系(源→目标)
  - 不阻止告警生成,只抑制通知发送
1.什么是告警抑制
说白了,就是抑制告警,和静默不同的是,抑制的应用场景一般用于抑制符合条件的告警。
举个例子:
一个数据中心有800台服务器,每台服务器有50个监控项,假设每个监控项对应一条告警,就意味着有4w个监控告警。
如果数据中心断电,理论上来说就会有4w条告警发送到你的手机,你是处理不过来的,所以我们只需要将数据中心断电的告警发出来即可。
2.Prometheus Server编写规则
2.1 修改Prometheus server的配置文件
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# vim prometheus.yml
...
rule_files:
# - "cmy-linux-rules.yml"
- "cmy-linux-rules-inhibit.yml"
...
2.2 编写告警规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat cmy-linux-rules-inhibit.yml
groups:
- name: cmy-linux-rules-alert-inhibit
rules:
- alert: cmy-dba_exporter-alert
expr: up{job="cmy_dba_exporter"} == 0
for: 3s
labels:
apps: dba
severity: critical
dc: beijing
annotations:
summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
# 这里在annotations部分增加了一个value属性,其值来自Prometheus内置的模板变量$value,即触发告警时表达式的样本值
value: "{{ $value }}"
- alert: cmy-k8s_exporter-alert
expr: up{job="cmy_k8s_exporter"} == 0
for: 3s
labels:
apps: k8s
severity: warning
dc: beijing
annotations:
summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
value: "{{ $value }}"
- alert: cmy-bigdata_exporter-alert
expr: up{job="cmy_bigdata_exporter"} == 0
for: 5s
labels:
apps: bigdata
severity: warning
dc: shenzhen
annotations:
summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
value: "{{ $value }}"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
2.3 热加载配置文件使得生效
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# curl -X POST 10.168.10.31:9090/-/reload
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]#
2.4 验证prometheus的WebUI配置是否生效
http://10.168.10.31:9090/alerts?search=
3.Alertmanager配置告警抑制规则
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
...
## 配置告警抑制规则
inhibit_rules:
# 如果"dc"的值相同的前提条件下。
# 则当触发了"severity: critical"告警,就会抑制"severity: warning"的告警信息。
- source_match:
severity: critical
target_match:
severity: warning
equal:
- dc
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]#
4.启动Alertmanager
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./alertmanager
5.检查Alertmanager的WebUI
http://10.168.10.33:9093/#/status
6.验证测试
略,见视频。
5 集成钉钉告警 *****
参考链接:
https://github.com/timonwong/prometheus-webhook-dingtalk/
0.注册钉钉账号并添加钉钉机器人
略,见视频。
1.部署钉钉插件
1.1 下载钉钉插件
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
svip:
[root@node-exporter43 ~]# wget http://192.168.17.253/Resources/Prometheus/softwares/Alertmanager/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
1.2 解压文件
[root@node-exporter43 ~]# tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /usr/local/
[root@node-exporter43 ~]#
[root@node-exporter43 ~]# cd /usr/local/prometheus-webhook-dingtalk-2.1.0.linux-amd64/
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18752
drwxr-xr-x 3 3434 3434 4096 Apr 21 2022 ./
drwxr-xr-x 12 root root 4096 Mar 30 17:47 ../
-rw-r--r-- 1 3434 3434 1299 Apr 21 2022 config.example.yml
drwxr-xr-x 4 3434 3434 4096 Apr 21 2022 contrib/
-rw-r--r-- 1 3434 3434 11358 Apr 21 2022 LICENSE
-rwxr-xr-x 1 3434 3434 19172733 Apr 21 2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
1.3 修改配置文件
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# cp config{.example,}.yml
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18756
drwxr-xr-x 3 3434 3434 4096 Mar 30 17:47 ./
drwxr-xr-x 12 root root 4096 Mar 30 17:47 ../
-rw-r--r-- 1 3434 3434 1299 Apr 21 2022 config.example.yml
-rw-r--r-- 1 root root 1299 Mar 30 17:47 config.yml
drwxr-xr-x 4 3434 3434 4096 Apr 21 2022 contrib/
-rw-r--r-- 1 3434 3434 11358 Apr 21 2022 LICENSE
-rwxr-xr-x 1 3434 3434 19172733 Apr 21 2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]#
egrep -v "^$|#" config.yml
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=237433845a1498aaa6dd77cc881589c19081473dc6aba4e34eeaf9db76007088
secret: SEC741046752ed7539be082da3021503a77963374f5ca75885b7832ec496dbbf6f0
1.4 启动钉钉插件
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ./prometheus-webhook-dingtalk --web.listen-address="10.168.10.43:8060"
2.Alertmanager集成钉钉插件
2.1 修改Alertmanager的配置文件
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
resolve_timeout: 5m
smtp_from: '1792702099@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: '1792702099@qq.com'
smtp_auth_password: 'efucaeallvybbcje'
smtp_require_tls: false
smtp_hello: 'qq.com'
# 定义路由信息
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'sre_system'
# 配置子路由
routes:
- receiver: 'sre_dba'
match_re:
job: cmy_dba_exporter
# 建议将continue的值设置为true,表示当前路由匹配成功后,仍会继续向下匹配后续的同级路由
# 这样做的目的是将消息发给最后的系统组(sre_system)
continue: true
- receiver: 'sre_k8s'
match_re:
job: cmy_k8s_exporter
continue: true
- receiver: 'sre_system'
match_re:
job: .*
continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
email_configs:
- to: '2457193694@qq.com'
send_resolved: true
# 添加此行,定制邮件的标题;"{}"是YAML的行内映射写法,不需要使用单引号括住,否则服务启动不成功。
headers: { Subject: "[WARN] LINUX97报警邮件" }
# 添加此行,调用模板显示邮件正文;"{{ }}"模板表达式需要使用单引号括住。
html: '{{ template "test.html" . }}'
- to: '3081449097@qq.com'
send_resolved: true
- name: 'sre_k8s'
email_configs:
- to: '1138658498@qq.com'
send_resolved: true
headers: { Subject: "[WARN] LINUX97报警邮件" }
html: '{{ template "test.html" . }}'
- to: '3864762641@qq.com'
send_resolved: true
- name: 'sre_system'
webhook_configs:
# 指向的是钉钉插件(prometheus-webhook-dingtalk)的地址
- url: 'http://10.168.10.43:8060/dingtalk/webhook1/send'
http_config: {}
max_alerts: 0
send_resolved: true
#email_configs:
#- to: 'caidaopeng1201@163.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX97报警邮件" }
# html: '{{ template "cmy.html" . }}'
#- to: '2859770173@qq.com'
# send_resolved: true
# headers: { Subject: "[WARN] LINUX97报警邮件" }
# html: '{{ template "cmy.html" . }}'
# 加载模板
templates:
- '/cmy/softwares/alertmanager/tmpl/*.tmpl'
## 配置告警抑制规则
inhibit_rules:
# 如果"dc"的值相同的前提条件下。
# 则当触发了"severity: critical"告警,就会抑制"severity: warning"的告警信息。
- source_match:
severity: critical
target_match:
severity: warning
equal:
- dc