alertmanager matrix receiver

This commit is contained in:
Finn 2024-09-01 18:07:04 -07:00
parent ba35dc9b32
commit 8d7ab62a3a
8 changed files with 310 additions and 0 deletions

View file

@ -0,0 +1,43 @@
# Alertmanager configuration: every alert is delivered to the Matrix webhook
# receiver unless one of the child routes at the bottom claims it first.
global:
  resolve_timeout: 5m

# Inhibition rules: while an alert matching `source_matchers` is firing, alerts
# matching `target_matchers` are muted when the labels listed under `equal`
# carry the same values on both alerts.
inhibit_rules:
  # A critical alert mutes its warning/info counterparts.
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity =~ warning|info
    equal:
      - namespace
      - alertname
  # A warning alert mutes its info counterpart.
  - source_matchers:
      - severity = warning
    target_matchers:
      - severity = info
    equal:
      - namespace
      - alertname
  # InfoInhibitor mutes info alerts in the same namespace.
  - source_matchers:
      - alertname = InfoInhibitor
    target_matchers:
      - severity = info
    equal:
      - namespace

receivers:
  # Posts to the matrix-alertmanager-receiver Service on port 8080.
  - name: default
    webhook_configs:
      - url: http://matrix-alertmanager-receiver:8080/alerts/alerts
  # The next two receivers have no integrations configured, so alerts routed
  # to them are not delivered anywhere.
  - name: watchdog
  # Quoted so the name stays the string "null" rather than a YAML null.
  - name: "null"

route:
  receiver: default
  group_by:
    - namespace
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  routes:
    # Watchdog alerts go to the integration-less `watchdog` receiver.
    - receiver: watchdog
      matchers:
        - alertname = Watchdog
    # InfoInhibitor exists only to drive the inhibit rule above.
    - receiver: "null"
      matchers:
        - alertname = InfoInhibitor

View file

@ -8,6 +8,7 @@ resources:
- grafana-database.yaml
- thanos.yaml
- alerts-longhorn.yaml
- matrix-alertmanager-receiver.yaml
images:
- name: quay.io/thanos/thanos
newTag: v0.36.0
@ -17,3 +18,12 @@ secretGenerator:
disableNameSuffixHash: true
files:
- prom-scrape-configs/scrape-configs.yaml
- name: alertmanager-main
options:
disableNameSuffixHash: true
files:
- alertmanager-config/alertmanager.yaml
configMapGenerator:
- name: matrix-alertmanager-receiver
files:
- matrix-alertmanager-receiver/config.yaml

View file

@ -0,0 +1,61 @@
# Runs matrix-alertmanager-receiver. An init container renders the final
# config by substituting the Matrix access token (taken from a Secret) into
# the ConfigMap-provided template, and the main container reads the rendered
# copy from a shared emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: matrix-alertmanager-receiver
spec:
  selector:
    matchLabels:
      app: matrix-alertmanager-receiver
  template:
    metadata:
      labels:
        app: matrix-alertmanager-receiver
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
    spec:
      initContainers:
        - name: secret-replacer
          image: debian:stable
          command:
            - bash
            - -c
            - |
              set -euo pipefail
              # Fail fast: without this guard a missing Secret key would
              # silently render a config with an empty access token.
              if [ -z "${ACCESS_TOKEN:-}" ]; then
                echo "ACCESS_TOKEN is not set; refusing to render config" >&2
                exit 1
              fi
              # Escape the characters sed treats specially in the replacement
              # text (&, backslash, and the # delimiter) so that any token
              # value is inserted literally.
              escaped_token="$(printf '%s' "${ACCESS_TOKEN}" | sed 's/[&#\\]/\\&/g')"
              sed "s#ACCESS_TOKEN#${escaped_token}#" /config/config.yaml > /tempconfig/config.yaml
          volumeMounts:
            - name: tempconfig
              mountPath: /tempconfig
            - name: config
              mountPath: /config
          # Exposes the keys of the Secret as environment variables; the
          # script above expects one of them to be ACCESS_TOKEN.
          envFrom:
            - secretRef:
                name: matrix-alertmanager-receiver
      containers:
        - name: matrix-alertmanager-receiver
          # NOTE(review): `latest` is a floating tag, so rollouts are not
          # reproducible; consider pinning a specific release.
          image: docker.io/metio/matrix-alertmanager-receiver:latest
          args: ["--config-path", "/config/config.yaml"]
          resources:
            limits:
              memory: "128Mi"
              cpu: "500m"
          ports:
            - name: http
              containerPort: 8080
          volumeMounts:
            # The rendered config (token substituted) is mounted at /config.
            - name: tempconfig
              mountPath: /config
      volumes:
        # Config template containing the token placeholder.
        - name: config
          configMap:
            name: matrix-alertmanager-receiver
        # Scratch space shared between the init container and the receiver.
        - name: tempconfig
          emptyDir: {}
---
# Service in front of the receiver pods. The Alertmanager webhook URL
# addresses this Service by name on port 8080.
apiVersion: v1
kind: Service
metadata:
  name: matrix-alertmanager-receiver
spec:
  ports:
    - port: 8080
  # Selects the pods labelled by the matrix-alertmanager-receiver Deployment.
  selector:
    app: matrix-alertmanager-receiver

View file

@ -0,0 +1,100 @@
# HTTP server settings for the receiver.
http:
  # Bind address; may be left unspecified to bind on all interfaces.
  address: 0.0.0.0
  # Port the service listens on.
  port: 8080
  # URL path prefix of the webhook endpoint called by Alertmanager
  # (defaults to /alerts).
  alerts-path-prefix: /alerts
  # URL path from which metrics are collected (defaults to /metrics).
  metrics-path: /metrics
  # Whether metrics are exposed at all (defaults to false).
  metrics-enabled: true
# Matrix connection settings.
matrix:
  # Base URL of the homeserver, including the scheme.
  homeserver-url: https://janky.bot
  # Matrix user ID this service acts as.
  user-id: "@alerts:janky.bot"
  # Placeholder only: an init container substitutes the real access token for
  # the value below before the receiver starts.
  access-token: ACCESS_TOKEN
  # Short names for Matrix room IDs.
  room-mapping:
    alerts: "!nBmQwxtIfjWqYGDqLb:janky.solutions"
# Templating: URL rewrites, computed values, and the message templates used to
# render each alert.
templating:
# mapping of ExternalURL values
# NOTE(review): no entry below is active, so this key parses as null - confirm
# the receiver treats null the same as an empty mapping.
external-url-mapping:
# key is the original value taken from the Alertmanager payload
# value is the mapped value which will be available as '.ExternalURL' in templates
# "http://alertmanager:9093": https://alertmanager.example.com
# mapping of GeneratorURL values
# NOTE(review): also null while the example entry stays commented out.
generator-url-mapping:
# key is the original value taken from the Alertmanager payload
# value is the mapped value which will be available as '.GeneratorURL' in templates
# "http://prometheus:8080": https://prometheus.example.com
# computation of arbitrary values based on matching alert annotations, labels, or status
# values will be evaluated top to bottom, last entry wins
# The templates below read the result as '.ComputedValues.color'.
computed-values:
- values: # always set 'color' to 'yellow'
color: yellow
- values: # set 'color' to 'orange' when alert label 'severity' is 'warning'
color: orange
when-matching-labels:
severity: warning
- values: # set 'color' to 'red' when alert label 'severity' is 'critical'
color: red
when-matching-labels:
severity: critical
- values: # set 'color' to 'green' when alert status is 'resolved'
color: green
when-matching-status: resolved
# template for alerts in status 'firing'
# Single-quoted scalar: single line breaks inside it are folded to spaces by
# the YAML parser. Everything between the quotes is template text, so do not
# place YAML comments inside it.
firing-template: '
<p>
<strong><font color="{{ .ComputedValues.color }}">{{ .Alert.Status | ToUpper }}</font></strong>
{{ if .Alert.Labels.name }}
{{ .Alert.Labels.name }}
{{ else if .Alert.Labels.alertname }}
{{ .Alert.Labels.alertname }}
{{ end }}
>>
{{ if .Alert.Labels.severity }}
{{ .Alert.Labels.severity | ToUpper }}:
{{ end }}
{{ if .Alert.Annotations.description }}
{{ .Alert.Annotations.description }}
{{ else if .Alert.Annotations.summary }}
{{ .Alert.Annotations.summary }}
{{ end }}
>>
{{ if .Alert.Annotations.runbook }}
<a href="{{ .Alert.Annotations.runbook }}">Runbook</a> |
{{ end }}
{{ if .Alert.Annotations.dashboard }}
<a href="{{ .Alert.Annotations.dashboard }}">Dashboard</a> |
{{ end }}
<a href="{{ .SilenceURL }}">Silence</a>
</p>'
# template for alerts in status 'resolved', if not specified will use the firing-template
# Differs from the firing template: there is no second '>>' separator and no
# Silence link, so a trailing '|' remains after the last rendered link.
resolved-template: '
<p>
<strong><font color="{{ .ComputedValues.color }}">{{ .Alert.Status | ToUpper }}</font></strong>
{{ if .Alert.Labels.name }}
{{ .Alert.Labels.name }}
{{ else if .Alert.Labels.alertname }}
{{ .Alert.Labels.alertname }}
{{ end }}
>>
{{ if .Alert.Labels.severity }}
{{ .Alert.Labels.severity | ToUpper }}:
{{ end }}
{{ if .Alert.Annotations.description }}
{{ .Alert.Annotations.description }}
{{ else if .Alert.Annotations.summary }}
{{ .Alert.Annotations.summary }}
{{ end }}
{{ if .Alert.Annotations.runbook }}
<a href="{{ .Alert.Annotations.runbook }}">Runbook</a> |
{{ end }}
{{ if .Alert.Annotations.dashboard }}
<a href="{{ .Alert.Annotations.dashboard }}">Dashboard</a> |
{{ end }}
</p>'

View file

@ -0,0 +1,5 @@
# Delete patch: removes the Secret named alertmanager-main from the rendered
# output - presumably so a generated Secret of the same name can take its
# place (verify against the kustomization that applies this patch).
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
$patch: delete

View file

@ -0,0 +1,5 @@
# Delete patch: removes the v1beta1.metrics.k8s.io APIService registration
# from the rendered output.
# NOTE(review): the reason is not recorded here - presumably another component
# already registers this API; confirm before changing.
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  name: v1beta1.metrics.k8s.io
$patch: delete

View file

@ -59,6 +59,15 @@ patches:
target:
kind: Deployment
name: kube-state-metrics
- path: alertmanager-main-secret-patch.yaml
target:
kind: Secret
name: alertmanager-main
- path: node-exporter-prometheus-rule-patches.yaml
target:
kind: PrometheusRule
name: node-exporter-rules
- path: apiregistration-patch.yaml
secretGenerator:
- name: grafana-config
namespace: monitoring

View file

@ -0,0 +1,77 @@
# JSON6902 patch: adds a `dashboard` annotation pointing at the Node Exporter
# Full Grafana dashboard to each rule in the first rule group. Rules are
# addressed by list index, so the alert name noted above each operation must
# be re-checked whenever the upstream rule list changes.
# The values are single-quoted because they contain `{{ ... }}` template
# syntax; the resulting strings are unchanged.

# rule 0: NodeFilesystemSpaceFillingUp
- op: add
  path: /spec/groups/0/rules/0/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 1: NodeFilesystemSpaceFillingUp
- op: add
  path: /spec/groups/0/rules/1/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 2: NodeFilesystemAlmostOutOfSpace
- op: add
  path: /spec/groups/0/rules/2/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 3: NodeFilesystemAlmostOutOfSpace
- op: add
  path: /spec/groups/0/rules/3/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 4: NodeFilesystemFilesFillingUp
- op: add
  path: /spec/groups/0/rules/4/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 5: NodeFilesystemFilesFillingUp
- op: add
  path: /spec/groups/0/rules/5/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 6: NodeFilesystemAlmostOutOfFiles
- op: add
  path: /spec/groups/0/rules/6/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 7: NodeFilesystemAlmostOutOfFiles
- op: add
  path: /spec/groups/0/rules/7/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 8: NodeNetworkReceiveErrs
- op: add
  path: /spec/groups/0/rules/8/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 9: NodeNetworkTransmitErrs
- op: add
  path: /spec/groups/0/rules/9/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 10: NodeHighNumberConntrackEntriesUsed
- op: add
  path: /spec/groups/0/rules/10/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 11: NodeTextFileCollectorScrapeError
- op: add
  path: /spec/groups/0/rules/11/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 12: NodeClockSkewDetected
- op: add
  path: /spec/groups/0/rules/12/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 13: NodeClockNotSynchronising
- op: add
  path: /spec/groups/0/rules/13/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 14: NodeRAIDDegraded
- op: add
  path: /spec/groups/0/rules/14/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 15: NodeRAIDDiskFailure
- op: add
  path: /spec/groups/0/rules/15/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 16: NodeFileDescriptorLimit
- op: add
  path: /spec/groups/0/rules/16/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 17: NodeFileDescriptorLimit
- op: add
  path: /spec/groups/0/rules/17/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 18: NodeCPUHighUsage
- op: add
  path: /spec/groups/0/rules/18/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 19: NodeSystemSaturation
- op: add
  path: /spec/groups/0/rules/19/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 20: NodeMemoryMajorPagesFaults
- op: add
  path: /spec/groups/0/rules/20/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 21: NodeMemoryHighUtilization
- op: add
  path: /spec/groups/0/rules/21/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 22: NodeDiskIOSaturation
- op: add
  path: /spec/groups/0/rules/22/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 23: NodeSystemdServiceFailed
- op: add
  path: /spec/groups/0/rules/23/annotations/dashboard
  value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'
# rule 24: NodeBondingDegraded - disabled because the patch fails to apply.
# NOTE(review): cause unconfirmed; a JSON Patch `add` fails when the parent
# path does not exist, so check whether rule 24 exists and has an
# `annotations` map in the deployed rule version.
# - op: add
#   path: /spec/groups/0/rules/24/annotations/dashboard
#   value: 'https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}'