当前位置: 首页 > news >正文

alertManager部署安装、告警规则配置详解及告警消息推送

 

​
java接受告警请求@RestController
@RequestMapping("/alert")
@Slf4j
public class TestApi {private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");@RequestMappingpublic void sendTemplate(HttpServletRequest request) throws Exception {String requestBody = StreamUtils.copyToString(request.getInputStream(), StandardCharsets.UTF_8);JSONObject jsonObject = JSONUtil.parseObj(requestBody);log.info("sendTemplate {}", jsonObject);// 遍历告警信息JSONArray alerts = jsonObject.getJSONArray("alerts");for (int i = 0; i < alerts.size(); i++) {JSONObject alert = alerts.getJSONObject(i);JSONObject labels = alert.getJSONObject("labels");JSONObject annotations = alert.getJSONObject("annotations");// 替换模板中的占位符Map<String, Object> templateData = new HashMap<>();templateData.put("sendTime", LocalDateTime.now().format(FORMATTER));templateData.put("alertname", labels.getStr("alertname"));templateData.put("instance", labels.getStr("instance"));templateData.put("severity", labels.getStr("severity"));templateData.put("status", alert.getStr("status"));templateData.put("startsAt", alert.getStr("startsAt"));templateData.put("description", annotations.getStr("description"));templateData.put("generatorURL", alert.getStr("generatorURL"));String alertMsg = TemplateUtils.renderTemplate("alert.ftl", templateData);// 调用企业微信机器人发送消息WeComBot.sendToWeComBot(alertMsg);}}
}​


       <!-- FreeMarker Template Engine --><dependency><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-freemarker</artifactId></dependency>import freemarker.template.Configuration;
import freemarker.template.Template;import java.io.StringWriter;
import java.util.Map;public class TemplateUtils {private static final Configuration freemarkerConfig;// 静态初始化 FreeMarker 配置static {freemarkerConfig = new Configuration(Configuration.VERSION_2_3_31);freemarkerConfig.setClassForTemplateLoading(TemplateUtils.class, "/templates");freemarkerConfig.setDefaultEncoding("UTF-8");}/*** 使用 FreeMarker 渲染模板** @param templateName 模板文件名(如 "alert.ftl")* @param data         数据模型(键值对)* @return 渲染后的字符串*/public static String renderTemplate(String templateName, Map<String, Object> data) {try {// 加载模板Template template = freemarkerConfig.getTemplate(templateName);// 渲染模板StringWriter writer = new StringWriter();template.process(data, writer);return writer.toString();} catch (Exception e) {throw new RuntimeException("模板渲染失败", e);}}
}public class WeComBot {private static final String WEBHOOK_URL = "https://qyapi.weixin.qq.com/cgi-binbbfc-4412c60ad031";/*** 发送消息到企业微信机器人** @param message 消息内容* @throws Exception 如果发送失败*/public static void sendToWeComBot(String message) throws Exception {// 构造 JSON 数据String jsonPayload = JSONUtil.createObj().put("msgtype", "markdown").put("markdown", JSONUtil.createObj().put("content", message)).toString();// 发送 HTTP POST 请求HttpResponse response = HttpRequest.post(WEBHOOK_URL).header("Content-Type", "application/json; utf-8") // 设置请求头.body(jsonPayload) // 设置请求体.timeout(5000) // 设置超时时间为 5 秒(单位:毫秒).execute(); // 执行请求// 检查响应状态码if (response.getStatus() != 200) {throw new RuntimeException("Failed to send message: HTTP error code " + response.getStatus());}}
}


  alertmanager:
    image: prom/alertmanager:v0.26.0
    environment:
      - TZ=Asia/Shanghai
    container_name: alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/templates:/etc/alertmanager/templates
      - ./alertmanager/data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
      - "--log.level=info"             # 设置日志级别(可选)
    ports:
      - "9093:9093"
      - "9094:9094"
    restart: always


docker-prometheus.yamlversion: '3.8'services:prometheus:image: bitnami/prometheus:3.0.0container_name: prometheushostname: prometheusports:- "9090:9090" # Prometheus Web UI 端口volumes:- ./prometheus.yml:/etc/prometheus/prometheus.yml- ./prometheus-data:/prometheus- ./rules:/rulescommand:- '--config.file=/etc/prometheus/prometheus.yml'- '--web.external-url=http://192.168.118.20:9090/'- '--web.enable-lifecycle'- '--storage.tsdb.retention.time=90d'- "--storage.tsdb.path=/prometheus"- "--web.enable-admin-api"restart: alwaysalertmanager:image: prom/alertmanager:v0.26.0environment:- TZ=Asia/Shanghaicontainer_name: alertmanagervolumes:- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml- ./alertmanager/templates:/etc/alertmanager/templates- ./alertmanager/data:/alertmanagercommand:- "--config.file=/etc/alertmanager/alertmanager.yml"- "--storage.path=/alertmanager"- "--log.level=info"             # 设置日志级别(可选)ports:- "9093:9093"- "9094:9094"restart: alwaysgrafana:image: grafana/grafana:11.3.3container_name: grafanahostname: grafanaports:- "3000:3000" # Grafana Web UI 端口environment:GF_SECURITY_ADMIN_PASSWORD: admin # 设置 Grafana 的管理员密码volumes:- ./grafana-storage:/var/lib/grafanarestart: alwaysnode-exporter:image: bitnami/node-exporter:1.8.1container_name: node-exporterrestart: unless-stoppedports:- "9100:9100"volumes:- /proc:/host/proc:ro- /sys:/host/sys:ro- /:/rootfs:roenvironment:IGNORE_MOUNT_POINTS: "^/(sys|proc|dev|host|etc)($$|/)"IGNORE_FS_TYPES: "^(sys|proc|auto)fs$$"command:- '--path.procfs=/host/proc'- '--path.sysfs=/host/sys'- '--path.rootfs=/rootfs'  # 修复了未闭合的引号- '--collector.filesystem.ignored-mount-points=${IGNORE_MOUNT_POINTS}'- '--collector.filesystem.ignored-fs-types=${IGNORE_FS_TYPES}'  # 修复了无效的 #{}


alertmanager.ymlglobal:resolve_timeout: 5m #表示如果告警在 5 分钟内没有被解决,则认为该告警已恢复route:receiver: 'default'group_by: ['instance'] #通过alertname(告警名称)的值对告警进行分类 ;按照实例(instance)对告警进行分组group_wait: 10s  #表示第一次触发告警时会等待 10 秒后再发送通group_interval: 20s #表示两次告警之间的最小间隔为 20 秒; 同一组内两次告警之间的最小间隔为 20 秒repeat_interval: 1m #如果告警持续存在,每隔 1 分钟重复发送一次通知routes:- receiver: "hook"  #webhook通知group_wait: 10s#match:# service: "test"#severity: "critical"# match_re:#   service: "pods|critical"#   severity: "warning"# matchers:#   - service =~ "test|pods|critical"#   - severity =~ "critical|warning"- receiver: "hook1"  #邮件通知group_wait: 25s#matchers:# - severity =~ "critical|warning|info"receivers:
- name: 'hook'webhook_configs:- url: 'http://192.168.118.47:7998/alert'- name: "hook1"webhook_configs:- url: 'https://xe88-864e-8a9e7c476a18'send_resolved: true #通知已经恢复的告警- name: "default"webhook_configs:- url: 'https://x4af1-bbfc-4412c60ad031'send_resolved: true #通知已经恢复的告警- name: 'wechat'webhook_configs:- url: 'https://x-bbfc-4412c60ad031'send_resolved: trueinhibit_rules: #抑制的规则
- source_match:severity: 'critical'target_match:severity: 'warning'equal: ['alertname', 'dev', 'instance']


alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.118.20:9093


rule_files:
  - "/rules/*_rules.yaml"

prometheus.ymlglobal:scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.# scrape_timeout is set to the global default (10s).# Alertmanager configuration
alerting:alertmanagers:- static_configs:- targets:- 192.168.118.20:9093rule_files:- "/rules/*_rules.yaml"scrape_configs:- job_name: 'kafka'static_configs:- targets:- '192.168.118.20:9101' # 对应 kafka1 的 JMX Exporter 端口- '192.168.118.20:9102' # 对应 kafka2 的 JMX Exporter 端口- '192.168.118.20:9103' # 对应 kafka3 的 JMX Exporter 端口- job_name: "node"static_configs:- targets: ["192.168.118.20:9100"]- job_name: 'prometheus'metrics_path: /actuator/prometheusstatic_configs:- targets: ['192.168.118.47:7998']- job_name: 'prometheus1'metrics_path: /actuator/prometheusstatic_configs:- targets: ['192.168.118.148:7998']

customer_rules.yamlgroups:- name: node-alertrules:- alert: NodeDownexpr: up == 0for: 5mlabels:severity: criticalinstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} down"description: "Instance: {{ $labels.instance }} 已经宕机 5分钟"value: "{{ $value }}"- alert: NodeCpuHighexpr: (1 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 10for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} cpu使用率过高"description: "CPU 使用率超过 80%"value: "{{ $value }}"- alert: NodeCpuIowaitHighexpr: avg by (instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 80for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"description: "CPU iowait 使用率超过 50%"value: "{{ $value }}"- alert: NodeLoad5Highexpr: node_load5 > (count by (instance) (node_cpu_seconds_total{mode='system'})) * 1.2for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} load(5m) 过高"description: "Load(5m) 过高,超出cpu核数 1.2倍"value: "{{ $value }}"- alert: NodeMemoryHighexpr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 10for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} memory 使用率过高"description: "Memory 使用率超过 10%"value: "{{ $value }}"- alert: NodeDiskRootHighexpr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 50for: 1mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/ 分区) 使用率过高"description: "Disk(/ 分区) 使用率超过 50%"value: "{{ $value }}"- alert: NodeDiskBootHighexpr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 50for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/boot 分区) 使用率过高"description: "Disk(/boot 分区) 使用率超过 50%"value: "{{ $value }}"- alert: NodeDiskReadHighexpr: irate(node_disk_read_bytes_total[5m]) > 20 * (1024 ^ 2)for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk 读取字节数 速率过高"description: "Disk 读取字节数 速率超过 20 MB/s"value: "{{ $value }}"- alert: NodeDiskWriteHighexpr: irate(node_disk_written_bytes_total[5m]) > 20 * (1024 ^ 2)for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk 写入字节数 速率过高"description: "Disk 写入字节数 速率超过 20 MB/s"value: "{{ $value }}"- alert: NodeDiskReadRateCountHighexpr: irate(node_disk_reads_completed_total[5m]) > 3000for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk iops 每秒读取速率过高"description: "Disk iops 每秒读取速率超过 3000 iops"value: "{{ $value }}"- alert: NodeDiskWriteRateCountHighexpr: irate(node_disk_writes_completed_total[5m]) > 3000for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk iops 每秒写入速率过高"description: "Disk iops 每秒写入速率超过 3000 iops"value: "{{ $value }}"- alert: NodeInodeRootUsedPercentHighexpr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/ 分区) inode 使用率过高"description: "Disk (/ 分区) inode 使用率超过 80%"value: "{{ $value }}"- alert: NodeInodeBootUsedPercentHighexpr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/boot 分区) inode 使用率过高"description: "Disk (/boot 分区) inode 使用率超过 80%"value: "{{ $value }}"- alert: NodeFilefdAllocatedPercentHighexpr: node_filefd_allocated / node_filefd_maximum * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} filefd 打开百分比过高"description: "Filefd 打开百分比 超过 80%"value: "{{ $value }}"- alert: NodeNetworkNetinBitRateHighexpr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} network 接收比特数 速率过高"description: "Network 接收比特数 速率超过 20MB/s"value: "{{ $value }}"- alert: NodeNetworkNetoutBitRateHighexpr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} network 发送比特数 速率过高"description: "Network 发送比特数 速率超过 20MB/s"value: "{{ $value }}"- alert: NodeNetworkNetinPacketErrorRateHighexpr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 接收错误包 速率过高"description: "Network 接收错误包 速率超过 15个/秒"value: "{{ $value }}"- alert: NodeNetworkNetoutPacketErrorRateHighexpr: avg by (instance) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 发送错误包 速率过高"description: "Network 发送错误包 速率超过 15个/秒"value: "{{ $value }}"- alert: NodeProcessBlockedHighexpr: node_procs_blocked{job="node"} > 10for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 当前被阻塞的任务的数量过多"description: "Process 当前被阻塞的任务的数量超过 10个"value: "{{ $value }}"- alert: NodeTimeOffsetHighexpr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60for: 2mlabels:severity: infoinstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 时间偏差过大"description: "Time 节点的时间偏差超过 3m"value: "{{ $value }}"

 

 


https://segmentfault.com/a/1190000043690204

prometheus结合consul+confd实现动态注册服务和动态更新配置告警规则_prometheus confd-CSDN博客





如若想动态修改下面规则内容;   可采用以下方案;
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/rules/*_rules.yaml"
 

# Download the binary
wget https://github.com/kelseyhightower/confd/releases/download/v0.16.0/confd-0.16.0-linux-amd64
 
# 重命名二进制文件,并移动到PATH的目录下
mv confd-0.16.0-linux-amd64 /usr/local/bin/confd
chmod +x /usr/local/bin/confd
 
# 验证是否安装成功
confd --help

sudo mkdir -p /etc/confd/{conf.d,templates,rules}


http://www.xdnf.cn/news/24121.html

相关文章:

  • 工厂方法模式详解及在自动驾驶场景代码示例(c++代码实现)
  • Linux根据 PID 进行性能分析
  • 三格电子——PROFIBUS DP设备长距离传输和干扰问题解决办法
  • ffprobe 输出 HEVC 码流 Level:标准的 “错位” 与分析的 “归位”
  • javaweb-servlet-继承关系以及service方法、servlet生命周期
  • LabelImg打标工具的下载和使用——YOLO格式篇
  • open CasCade下载
  • RVOS的任务调度优化
  • OJ笔试强训_1至24天
  • `peft`(Parameter-Efficient Fine-Tuning:高效微调)是什么
  • 接口测试的原则、用例与流程
  • Git学习之路(Updating)
  • 《软件设计师》复习笔记(11.3)——需求获取、分析、定义、验证、管理
  • 欧拉系统升级openssh 9.7p1
  • 【AI】实现中文文章摘要的AI模型
  • 【失败】Gnome将默认终端设置为 Kitty
  • 如何在Linux系统中部署C++ Web应用
  • Sa-Token使用指南
  • 1 Celery 简介
  • cpolar 内网穿透 实现公网可以访问本机
  • top100 (6-10)
  • 字符串循环拼接,不能用 + 连接, 需要用 StringBuilder 代替
  • 全球唯一电解方式除湿器 / 加湿器 RS1 ROSAHL 微型 易安装
  • Logisim数字逻辑实训——寄存器设计与应用
  • 稳态模型下的异步电机调速【运动控制系统】
  • 《软件设计师》复习笔记(13)——结构化开发方法
  • 2021-11-09 C++倍数11各位和为13
  • 哈电汽轮机携林重型燃机登陆2025涡轮展,5月苏州相见
  • 嵌入式通信协议与编程逻辑完全指南
  • 数据表示与运算