Alertmanager

1 Alertmanager安装及子路由配置 *****


- Alertmanager环境部署
	1.什么是Alertmanager
Alertmanager是一款开源的告警工具包,可以和Prometheus集成。


	2.下载Alertmanager 
wget https://github.com/prometheus/alertmanager/releases/download/v0.28.1/alertmanager-0.28.1.linux-amd64.tar.gz

SVIP:
[root@prometheus-server33 ~]#  wget http://192.168.17.253/Resources/Prometheus/softwares/Alertmanager/alertmanager-0.28.1.linux-amd64.tar.gz 


	3.解压安装包 
[root@prometheus-server33 ~]#  tar xf alertmanager-0.28.1.linux-amd64.tar.gz  -C /usr/local/
[root@prometheus-server33 ~]#  

	4.修改Alertmanager的配置文件
[root@prometheus-server33 ~]# cd /usr/local/alertmanager-0.28.1.linux-amd64/
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# 
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ll
total 67932
drwxr-xr-x  2 1001 1002     4096 Mar  7 23:08 ./
drwxr-xr-x 11 root root     4096 May 14 15:03 ../
-rwxr-xr-x  1 1001 1002 38948743 Mar  7 23:06 alertmanager*
-rw-r--r--  1 1001 1002      356 Mar  7 23:07 alertmanager.yml
-rwxr-xr-x  1 1001 1002 30582387 Mar  7 23:06 amtool*
-rw-r--r--  1 1001 1002    11357 Mar  7 23:07 LICENSE
-rw-r--r--  1 1001 1002      311 Mar  7 23:07 NOTICE
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# 
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
# 通用配置
global:
  resolve_timeout: 5m
  smtp_from: '1792702099@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '1792702099@qq.com'
  smtp_auth_password: 'efucaeallvybbcje'
  smtp_require_tls: false
  smtp_hello: 'qq.com'
# 定义路由信息
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'sre_system'
  # 配置子路由
  routes:
    - receiver: 'sre_dba'
      match_re:
        job: cmy_dba_exporter
      # 建议将continue的值设置为true,表示当前路由匹配成功后,仍然继续向下匹配后续的同级路由规则
      # 这样做的目的是将消息发给最后的系统组(sre_system)
      continue: true
    - receiver: 'sre_k8s'
      match_re:
        job: cmy_k8s_exporter
      continue: true
    - receiver: 'sre_system'
      match_re:
        job: .*
      continue: true
# 定义接受者 
receivers:
- name: 'sre_dba'
  email_configs:
  - to: '2457193694@qq.com'
    send_resolved: true
  - to: '3081449097@qq.com'
    send_resolved: true
- name: 'sre_k8s'
  email_configs:
  - to: '1138658498@qq.com'
    send_resolved: true
  - to: '3864762641@qq.com'
    send_resolved: true
- name: 'sre_system'
  email_configs:
  - to: 'caidaopeng1201@163.com'
    send_resolved: true
  - to: '2859770173@qq.com'
    send_resolved: true
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# 



	5.检查配置文件语法 
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./amtool check-config alertmanager.yml 
Checking 'alertmanager.yml'  SUCCESS
Found:
 - global config
 - route
 - 0 inhibit rules
 - 3 receivers
 - 0 templates

[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# 


	6.启动Alertmanager服务 
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager 


	7.访问Alertmanager的WebUI
http://10.168.10.33:9093/#/status

1.1 Prometheus server集成Alertmanager

- Prometheus server集成Alertmanager实现告警功能
	1. 修改配置文件
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# egrep -v "^#|^$" prometheus.yml
global:
  scrape_interval: 3s
  evaluation_interval: 3s
...
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 10.168.10.33:9093
rule_files:
  - "cmy-linux-rules.yml"
  
...
scrape_configs:
  ...
  - job_name: "cmy_dba_exporter"
    static_configs:
      - targets: ["10.168.10.41:9100"]
  - job_name: "cmy_k8s_exporter"
    static_configs:
      - targets: ["10.168.10.42:9100"]
  - job_name: "cmy_bigdata_exporter"
    static_configs:
      - targets: ["10.168.10.43:9100"]
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# 


	2 修改告警规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat cmy-linux-rules.yml 
groups:
- name: cmy-linux-rules-alert
  rules:
  - alert: cmy-dba_exporter-alert
    expr: up{job="cmy_dba_exporter"} == 0
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: dba
    annotations:
      summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
  - alert: cmy-k8s_exporter-alert
    expr: up{job="cmy_k8s_exporter"} == 0
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: k8s
    annotations:
      summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
  - alert: cmy-bigdata_exporter-alert
    expr: up{job="cmy_bigdata_exporter"} == 0
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: bigdata
    annotations:
      summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# 



	3.检查配置文件语法
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# ./promtool check config prometheus.yml
Checking prometheus.yml
  SUCCESS: 1 rule files found
 SUCCESS: prometheus.yml is valid prometheus config file syntax

Checking cmy-linux-rules.yml
  SUCCESS: 3 rules found

[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# 




	4.重新加载prometheus的配置
curl -X POST http://10.168.10.31:9090/-/reload


	5.查看prometheus server的WebUI验证是否生效
http://10.168.10.31:9090/config

http://10.168.10.31:9090/targets?search=

http://10.168.10.31:9090/alerts?search=


	6.触发告警功能
[root@node-exporter41 ~]# systemctl stop node-exporter.service 
[root@node-exporter41 ~]# ss -ntl | grep 9100
[root@node-exporter41 ~]# 

	
[root@node-exporter42 ~]# systemctl stop  node-exporter.service 
[root@node-exporter42 ~]# 
[root@node-exporter42 ~]# ss -ntl | grep 9100
[root@node-exporter42 ~]# 


[root@node-exporter43 ~]# systemctl stop node-exporter.service 
[root@node-exporter43 ~]# 
[root@node-exporter43 ~]# ss -ntl | grep 9100
[root@node-exporter43 ~]# 

	
	7.查看Alertmanager的WebUI及邮箱接收者
略,见视频。

2 自定义告警模版

2.1 告警模板介绍

1 告警模板介绍
默认的告警信息界面有些简单,可以借助告警的模板信息,对告警信息进行丰富,需要借助于Alertmanager的模板功能来实现。

告警模板的使用流程如下:
    - 分析关键信息
    - 定制模板内容
    - Alertmanager加载模板文件
    - 告警信息使用模板内容属性


模板文件使用标准Go模板语法,并暴露一些包含时间标签和值的变量。
    - 标签引用: {{ $labels.<label_name> }}
    - 指标样本值引用: {{ $value }}
    
为了显示效果,需要了解一些HTML相关技术,参考链接:
    https://www.w3school.com.cn/html/index.asp

2.2 告警模板参考案例


	2 Alertmanager节点自定义告警模板参考案例
		2.1 创建邮件模板文件工作目录
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# mkdir -pv /cmy/softwares/alertmanager/tmpl

		2.2 创建模板实例,工作中可以考虑嵌入公司的logo
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat /cmy/softwares/alertmanager/tmpl/email.tmpl
{{ define "test.html" }}
<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <style type="text/css">
        body {
            font-family: 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 700px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        .alert-card {
            border-radius: 8px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .alert-critical {
            background: linear-gradient(135deg, #FFF6F6 0%, #FFEBEB 100%);
            border-left: 5px solid #FF5252;
        }
        .alert-resolved {
            background: linear-gradient(135deg, #F6FFF6 0%, #EBFFEB 100%);
            border-left: 5px solid #4CAF50;
        }
        .alert-title {
            font-size: 18px;
            font-weight: bold;
            margin-bottom: 15px;
            display: flex;
            align-items: center;
        }
        .alert-icon {
            width: 24px;
            height: 24px;
            margin-right: 10px;
        }
        .alert-field {
            margin-bottom: 8px;
            display: flex;
        }
        .field-label {
            font-weight: bold;
            min-width: 80px;
            color: #555;
        }
        .field-value {
            flex: 1;
        }
        .timestamp {
            color: #666;
            font-size: 13px;
            margin-top: 15px;
            text-align: right;
        }
        .divider {
            height: 1px;
            background: #eee;
            margin: 15px 0;
        }
    </style>
</head>
<body>
{{- if gt (len .Alerts.Firing) 0 -}}
    <div class="alert-header alert-critical">
        告警触发 - 请立即处理!
    </div>
    <div>
        <img src="https://img95.699pic.com/element/40114/9548.png_860.png" width="200px" height="200px">
    </div>
    {{- range $index, $alert := .Alerts -}}
        <div class="alert-card alert-critical">
            <div class="alert-field">
                <span class="field-label">告警名称:</span>
                <span class="field-value">{{ .Labels.alertname }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">告警级别:</span>
                <span class="field-value">{{ .Labels.severity }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">目标机器:</span>
                <span class="field-value">{{ .Labels.instance }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">告警摘要:</span>
                <span class="field-value">{{ .Annotations.summary }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">触发时间:</span>
                <span class="field-value">{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
            </div>
            {{- if .Annotations.description }}
            <div class="divider"></div>
            <div class="alert-field">
                <span class="field-label">详细描述:</span>
                <span class="field-value">{{ .Annotations.description }}</span>
            </div>
            {{- end }}
        </div>
    {{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    <div class="alert-card alert-resolved">
        <div class="alert-title">
            告警恢复通知
        </div>
        <div>
            <img src="https://tse2-mm.cn.bing.net/th/id/OIP-C.n7AyZv_wWXqFCc1mtlGhFgHaHa?rs=1&pid=ImgDetMain" width="300" height="300">
        </div>
        <div class="alert-field">
            <span class="field-label">告警名称:</span>
            <span class="field-value">{{ .Labels.alertname }}</span>
        </div>
        <div class="alert-field">
            <span class="field-label">目标机器:</span>
            <span class="field-value">{{ .Labels.instance }}</span>
        </div>
        <div class="alert-field">
            <span class="field-label">告警摘要:</span>
            <span class="field-value">[ {{ .Annotations.summary }}] 此告警已经恢复~</span>
        </div>
        <div class="alert-field">
            <span class="field-label">恢复时间:</span>
            <span class="field-value">{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
        </div>
        {{- if .Annotations.description }}
        <div class="alert-field">
            <span class="field-label">详细描述:</span>
            <span class="field-value">{{ .Annotations.description }}</span>
        </div>
        {{- end }}
    </div>
    {{- end }}
{{- end }}

</body>
</html>
{{ end }}




{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= ERROR ==========<br>
这是告警邮件,请相关责任人及时处理!!!<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }} {{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>

========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= INFO ==========<br>
这是恢复邮件,问题已解决<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>

========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}

2.3 alertmanager引用自定义模板文件

cat /cmy/softwares/alertmanager/tmpl/email.bak
{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= ERROR ==========<br>
这是告警邮件,请相关责任人及时处理!!!<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }} {{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>

========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= INFO ==========<br>
这是恢复邮件,问题已解决<br>
告警名称:{{ .Labels.alertname }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>

========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
[root@pm33 /usr/local/alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_from: 'songfan0113@163.com'
  smtp_smarthost: 'smtp.163.com:465'
  smtp_auth_username: 'songfan0113@163.com'
  smtp_auth_password: 'FVcjJQ8tw7xzGNP8'
  smtp_require_tls: false
  smtp_hello: '163.com'
# 定义路由信息
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'sre_system'
  # 配置子路由
  routes:
    - receiver: 'sre_dba'
      match_re:
        job: cmy_dba_exporter
      # 建议将continue的值设置为true,表示当前路由匹配成功后,仍然继续向下匹配后续的同级路由规则
      # 这样做的目的是将消息发给最后的系统组(sre_system)
      continue: true
    - receiver: 'sre_k8s'
      match_re:
        job: cmy_k8s_exporter
      continue: true
    - receiver: 'sre_system'
      match_re:
        job: .*
      continue: true
# 定义接受者
receivers:
- name: 'sre_dba'
  email_configs:
  - to: '1258389502@qq.com'
    send_resolved: true
- name: 'sre_k8s'
  email_configs:
  - to: '1301118915@qq.com'
    send_resolved: true
- name: 'sre_system'
  email_configs:
  - to: '2838466880@qq.com'
    send_resolved: true
    headers: { Subject: "[WARN] LINUX97报警邮件" }
    html: '{{ template "test.html". }}'
  - to: '1792702099@qq.com'
    send_resolved: true
templates:
  - '/cmy/softwares/alertmanager/tmpl/*.tmpl'

2.4 alertmanager语法检查

[root@pm33 /usr/local/alertmanager-0.28.1.linux-amd64]# ./amtool check-config ./alertmanager.yml
Checking './alertmanager.yml'  SUCCESS
Found:
 - global config
 - route
 - 0 inhibit rules
 - 3 receivers
 - 1 templates
  SUCCESS

2.5 重启Alertmanager


	2.5 重启Alertmanager程序
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# ./alertmanager 

	2.6 查看WebUi观察配置是否生效
http://10.168.10.33:9093/#/status


	2.7 再次触发告警,验证配置
略,见视频。

2.6 Prometheus集成Alertmanager

 cat prometheus.yml
 ...
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 10.168.10.31:9093
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "cmy-linux-rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

 cat cmy-linux-rules.yml
groups:
- name: cmy-linux-rules-alert
  rules:
  - alert: ywz-dba_exporter-alert
    expr: up{job="cmy_dba_exporter"} == 0
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: dba
    annotations:
      summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
  - alert: zjh-k8s_exporter-alert
    expr: (node_memory_MemTotal_bytes{job="cmy_k8s_exporter"} - node_memory_MemAvailable_bytes{job="cmy_k8s_exporter"}) / node_memory_MemTotal_bytes{job="cmy_k8s_exporter"} * 100 > 90
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: k8s
    annotations:
      summary: "{{ $labels.instance }} K8S服务器内存大于90%,请及时处理!"
      description: "{{ $labels.instance }} 内存使用率已达到 {{ printf \"%.2f\" $value }}%。当前阈值: 90%"
  - alert: cmy-bigdata_exporter-alert
    expr: up{job="cmy_bigdata_exporter"} == 0
    for: 1s
    labels:
      school: cmy
      class: linux97
      apps: bigdata
    annotations:
      summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"



3 告警静默

  1. 静默的用途

    • 计划维护期间避免告警噪音

    • 已知问题修复过程中减少干扰

    • 屏蔽预期内的非关键告警

  2. 静默的特点

    • 不会阻止告警触发,只是阻止通知发送

    • 基于标签匹配规则工作

    • 可设置静默时间范围

amtool silence add \
  alertname=~"Database.*" \
  instance=~"db-.*" \
  --comment="周五23:00-周六2:00数据库维护" \
  --author="dba-team@example.com" \
  --start="2023-07-21T23:00:00+08:00" \
  --end="2023-07-22T02:00:00+08:00"

4 告警抑制

  1. 基本定义:当特定"源告警"触发时,自动抑制匹配条件的"目标告警"通知

  2. 设计目的

    • 消除冗余告警(如"主机宕机"时抑制该主机上的所有服务告警)

    • 建立告警优先级层次(高优先级告警抑制低优先级告警)

    • 减少告警噪音,聚焦关键问题

  3. 关键特性

    • 基于标签匹配规则

    • 单向抑制关系(源→目标)

    • 不阻止告警生成,只抑制通知发送

	1.什么是告警抑制
说白了,就是抑制告警,和静默不同的是,抑制的应用场景一般用于抑制符合条件的告警。

举个例子: 
	一个数据中心有800台服务器,每台服务器有50个监控项,这就意味着总共有4w个监控项(800 × 50),也就可能产生4w条监控告警。
	如果数据中心断电,理论上来说就会有4w条告警发送到你的手机,你是处理不过来的,所以我们只需要将数据中心断电的告警发出来即可。
	
	 
	2.Prometheus Server编写规则
		2.1 修改Prometheus server的配置文件
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# vim prometheus.yml
...
rule_files:
   # - "cmy-linux-rules.yml"
   - "cmy-linux-rules-inhibit.yml"
...

		
		2.2 编写告警规则
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# cat cmy-linux-rules-inhibit.yml
groups:
- name: cmy-linux-rules-alert-inhibit
  rules:
  - alert: cmy-dba_exporter-alert
    expr: up{job="cmy_dba_exporter"} == 0
    for: 3s
    labels:
      apps: dba
      severity: critical
      dc: beijing
    annotations:
      summary: "{{ $labels.instance }} 数据库实例已停止运行超过 3s!"
       # 这里在annotations部分增加了一个value属性,其值取自告警表达式当前的计算结果({{ $value }}),而不是阈值
      value: "{{ $value }}"
  - alert: cmy-k8s_exporter-alert
    expr: up{job="cmy_k8s_exporter"} == 0
    for: 3s
    labels:
      apps: k8s
      severity: warning
      dc: beijing
    annotations:
      summary: "{{ $labels.instance }} K8S服务器已停止运行超过 3s!"
      value: "{{ $value }}"
  - alert: cmy-bigdata_exporter-alert
    expr: up{job="cmy_bigdata_exporter"} == 0
    for: 5s
    labels:
      apps: bigdata
      severity: warning
      dc: shenzhen
    annotations:
      summary: "{{ $labels.instance }} 大数据服务器已停止运行超过 5s!"
      value: "{{ $value }}"
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# 


		2.3 热加载配置文件使得生效 
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# curl -X POST 10.168.10.31:9090/-/reload
[root@prometheus-server31 prometheus-2.53.4.linux-amd64]# 


		2.4 验证prometheus的WebUI配置是否生效
http://10.168.10.31:9090/alerts?search=


		
	3.Alertmanager配置告警抑制规则 
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml 
...
 
## 配置告警抑制规则
inhibit_rules:
  # 如果"dc"的值相同的前提条件下。
  #	则当触发了"severity: critical"告警,就会抑制"severity: warning"的告警信息。
- source_match:
    severity: critical
  target_match:
    severity: warning
  equal:
  - dc
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# 


	4.启动Alertmanager  
[root@node-exporter43 alertmanager-0.28.1.linux-amd64]# ./alertmanager 
 


	5.检查Alertmanager的WebUI
http://10.168.10.33:9093/#/status


	6.验证测试
略,见视频。
 

5 集成钉钉告警 *****

参考链接:
	https://github.com/timonwong/prometheus-webhook-dingtalk/
	
 
	0.注册钉钉账号并添加钉钉机器人
略,见视频。


	1.部署钉钉插件
		1.1 下载钉钉插件 
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz

svip:
[root@node-exporter43 ~]#  wget http://192.168.17.253/Resources/Prometheus/softwares/Alertmanager/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz



		1.2 解压文件
[root@node-exporter43 ~]# tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz  -C /usr/local/
[root@node-exporter43 ~]# 
[root@node-exporter43 ~]# cd /usr/local/prometheus-webhook-dingtalk-2.1.0.linux-amd64/
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# 
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18752
drwxr-xr-x  3 3434 3434     4096 Apr 21  2022 ./
drwxr-xr-x 12 root root     4096 Mar 30 17:47 ../
-rw-r--r--  1 3434 3434     1299 Apr 21  2022 config.example.yml
drwxr-xr-x  4 3434 3434     4096 Apr 21  2022 contrib/
-rw-r--r--  1 3434 3434    11358 Apr 21  2022 LICENSE
-rwxr-xr-x  1 3434 3434 19172733 Apr 21  2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# 



		1.3 修改配置文件 
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# cp config{.example,}.yml
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# 
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ll
total 18756
drwxr-xr-x  3 3434 3434     4096 Mar 30 17:47 ./
drwxr-xr-x 12 root root     4096 Mar 30 17:47 ../
-rw-r--r--  1 3434 3434     1299 Apr 21  2022 config.example.yml
-rw-r--r--  1 root root     1299 Mar 30 17:47 config.yml
drwxr-xr-x  4 3434 3434     4096 Apr 21  2022 contrib/
-rw-r--r--  1 3434 3434    11358 Apr 21  2022 LICENSE
-rwxr-xr-x  1 3434 3434 19172733 Apr 21  2022 prometheus-webhook-dingtalk*
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# 
egrep -v "^$|#" config.yml
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=237433845a1498aaa6dd77cc881589c19081473dc6aba4e34eeaf9db76007088
    secret: SEC741046752ed7539be082da3021503a77963374f5ca75885b7832ec496dbbf6f0




		1.4 启动钉钉插件
[root@node-exporter43 prometheus-webhook-dingtalk-2.1.0.linux-amd64]# ./prometheus-webhook-dingtalk --web.listen-address="10.168.10.43:8060" 


	2.Alertmanager集成钉钉插件
		2.1 修改Alertmanager的配置文件
[root@prometheus-server33 alertmanager-0.28.1.linux-amd64]# cat alertmanager.yml 
# 通用配置
global:
  resolve_timeout: 5m
  smtp_from: '1792702099@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '1792702099@qq.com'
  smtp_auth_password: 'efucaeallvybbcje'
  smtp_require_tls: false
  smtp_hello: 'qq.com'
# 定义路由信息
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'sre_system'
  # 配置子路由
  routes:
    - receiver: 'sre_dba'
      match_re:
        job: cmy_dba_exporter
      # 建议将continue的值设置为true,表示当前路由匹配成功后,仍然继续向下匹配后续的同级路由规则
      # 这样做的目的是将消息发给最后的系统组(sre_system)
      continue: true
    - receiver: 'sre_k8s'
      match_re:
        job: cmy_k8s_exporter
      continue: true
    - receiver: 'sre_system'
      match_re:
        job: .*
      continue: true
# 定义接受者 
receivers:
- name: 'sre_dba'
  email_configs:
  - to: '2457193694@qq.com'
    send_resolved: true
    # 添加此行,定制邮件的标题,对于"{{}}"属性用于加载其他信息,需要使用单引号括住。
    headers: { Subject: "[WARN] LINUX97报警邮件" }
    # 添加此行,调用模板显示邮件正文;"{{ }}"表达式必须使用单引号括住,否则配置文件解析失败,服务启动不成功。
    html: '{{ template "cmy.html" . }}'
  - to: '3081449097@qq.com'
    send_resolved: true
- name: 'sre_k8s'
  email_configs:
  - to: '1138658498@qq.com'
    send_resolved: true
    headers: { Subject: "[WARN] LINUX97报警邮件" }
    html: '{{ template "cmy.html" . }}'
  - to: '3864762641@qq.com'
    send_resolved: true
- name: 'sre_system'
  webhook_configs:
      # 指向的是钉钉插件(prometheus-webhook-dingtalk)的监听地址
    - url: 'http://10.168.10.43:8060/dingtalk/webhook1/send'
      http_config: {}
      max_alerts: 0
      send_resolved: true
  #email_configs:
  #- to: 'caidaopeng1201@163.com'
  #  send_resolved: true
  #  headers: { Subject: "[WARN] LINUX97报警邮件" }
  #  html: '{{ template "cmy.html" . }}'
  #- to: '2859770173@qq.com'
  #  send_resolved: true
  #  headers: { Subject: "[WARN] LINUX97报警邮件" }
  #  html: '{{ template "cmy.html" . }}'

# 加载模板
templates:
  - '/cmy/softwares/alertmanager/tmpl/*.tmpl'


## 配置告警抑制规则
inhibit_rules:
  # 如果"dc"的值相同的前提条件下。
  #	则当触发了"severity: critical"告警,就会抑制"severity: warning"的告警信息。
- source_match:
    severity: critical
  target_match:
    severity: warning
  equal:
  - dc
上一篇
下一篇