> 文章列表 > docker-compose安装prometheus告警系统

docker-compose安装prometheus告警系统

docker-compose安装prometheus告警系统

docker-compose安装Prometheus

  • 一、概述
  • 一、docker-compose
  • 二、配置文件

一、概述

本文只有监控与告警的安装、告警发送、发送模版的配置。没有数据展示监控数据UI工具

一、docker-compose

1)docker-compose.yaml

version: '3.0'
services:#1.prometheusprometheus:image: prom/prometheuscontainer_name: prometheusrestart: alwaysvolumes:- $PWD/config/prometheus.yml:/etc/prometheus/prometheus.yml- $PWD/config/rules:/etc/prometheus/rules- $PWD/config/targets:/etc/prometheus/targetscommand:- --web.enable-lifecycle- --config.file=/etc/prometheus/prometheus.ymlports:- 9090:9090networks:- prometheusenvironment:- TZ=Asia/Shanghai#2.alertmanageralertmanager:image: prom/alertmanagercontainer_name: alertmanagerrestart: alwaysvolumes:- $PWD/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml- $PWD/config/template:/alertmanager/templateports:- 9093:9093networks:- prometheusenvironment:- TZ=Asia/Shanghainetworks:prometheus:driver: bridge

启动后prometheus访问地址:http://192.168.1.1:9090/graph?g0.expr=&g0.tab=1&g0.stacked=0&g0.range_input=1h
docker-compose安装prometheus告警系统
alertmanager访问地址:http://192.168.1.1:9093/#/alerts
docker-compose安装prometheus告警系统

二、配置文件

1)prometheus.yml

global:scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.# scrape_timeout is set to the global default (10s).# Alertmanager configuration
alerting:alertmanagers:- static_configs:- targets:- alertmanager:9093# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:- "rules/*"
# - "second_rules.yml"# remote_write:
#   - url: http://192.168.50.200:8080/prometheusscrape_configs:- job_name: prometheusstatic_configs:- targets: ['192.168.1.1:9090']labels:instance: prometheus- job_name: 'node_exporter'file_sd_configs:- files:- targets/*.yml

2)alertmanager.yml


global:# smtp_smarthost: 'smtp.exmail.qq.com:587'# smtp_from: '*'                # 谁发邮件# smtp_auth_username: '*'      # 邮箱用户# smtp_auth_password: '*'                 # 邮箱密码# smtp_require_tls: truesmtp_smarthost: 'smtp.163.com:465'smtp_from: '*'               # 谁发邮件smtp_auth_username: '*'       # 邮箱用户smtp_auth_password: '*'                   # 邮箱密码smtp_require_tls: falseroute:group_by: ["instance"]            # 分组名group_wait: 10s                   # 当收到告警的时候,等待三十秒看是否还有告警,如果有就一起发出去group_interval: 10s                # 发送警告间隔时间repeat_interval: 1h              # 重复报警的间隔时间receiver: mail                    # 全局报警组,这个参数是必选的,和下面报警组名要相同templates:- '/alertmanager/template/*.tmpl'receivers:
- name: 'mail'                      # 报警组名email_configs:- to: ''      # 发送给谁,多个逗号隔开html: '{{ template "alert.html" . }}'headers: { Subject: "[WARN] 监控报警" }

3)targets配置,建目标服务器配置文件node_targets.yml

- targets:- 192.168.1.*:9100labels:host: aaa
- targets:- 192.168.1.*:9100labels:host: bbb

4)常用告警规则,以node_exporter 为例node_down.yml

groups:
- name: 实例存活告警规则rules:- alert: 实例存活告警expr: up == 0for: 1mlabels:user: prometheusseverity: errorannotations:summary: "机器 {{ $labels.instance }} 挂了"description: "{{ $labels.instance }} 已停止超过1分钟"- name: CPU报警规则rules:- alert: CPU使用率告警expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90for: 1mlabels:user: prometheusseverity: warningannotations:description: "服务器: CPU使用超过90%!(当前值: {{ $value }}%)"summary: "机器 {{ $labels.instance }} CPU使用超过90%"- name: 内存报警规则rules:- alert: 内存使用率告警expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80for: 1mlabels:user: prometheusseverity: warningannotations:description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"summary: "机器 {{ $labels.instance }} 内存使用超过80%"- name: 磁盘报警规则rules:- alert: 磁盘使用率告警expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80for: 1mlabels:user: prometheusseverity: warningannotations:description: "服务器: 磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"summary: "机器 {{ $labels.instance }} 磁盘使用超过80%"

5)告警模版 alert.tmpl

{{ define "alert.html" }}
{{ range .Alerts }}
<table border="1"><tr><th> 告警通知 </th><th>  厂区prometheus监控告警通知 </th></tr><tr><td> 告警级别 </td><td>{{ .Labels.severity }}</td></tr><tr><td> 告警类型 </td><td>{{ .Labels.alertname }}</td></tr><tr><td> 故障主机 </td><td>{{ .Labels.instance }}</td></tr><tr><td> 告警主题 </td><td>{{ .Annotations.summary }}</td></tr><tr><td> 告警详情 </td><td>{{ .Annotations.description }}</td></tr><tr><td> 触发时间 </td><td>{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</td></tr></table>
{{ end }}
{{ end }}