# Patch overview (commentary only: text before the first "diff --git" header
# is not part of any hunk).
#
# k8s/monitoring
#   - alertmanager-config/alertmanager.yaml: Alertmanager config with three
#     inhibit rules; the "default" receiver posts to the
#     matrix-alertmanager-receiver webhook, Watchdog alerts are routed to the
#     "watchdog" receiver and InfoInhibitor alerts to the "null" receiver.
#   - kustomization.yaml: registers matrix-alertmanager-receiver.yaml, generates
#     the alertmanager-main secret from the config above and a config map from
#     matrix-alertmanager-receiver/config.yaml.
#   - matrix-alertmanager-receiver.yaml: Deployment + Service. An init container
#     runs sed to replace the ACCESS_TOKEN placeholder with the ACCESS_TOKEN
#     environment variable (envFrom secret matrix-alertmanager-receiver) and
#     writes the result to an emptyDir that the main container mounts at /config.
#   - matrix-alertmanager-receiver/config.yaml: receiver settings and templates.
#
# k8s/operators/kube-prometheus
#   - "$patch: delete" patches for Secret alertmanager-main and APIService
#     v1beta1.metrics.k8s.io.
#   - JSON patch adding a "dashboard" annotation to rules 0-23 of the
#     node-exporter-rules PrometheusRule (rule 24 is left commented out).
diff --git a/k8s/monitoring/alertmanager-config/alertmanager.yaml b/k8s/monitoring/alertmanager-config/alertmanager.yaml
new file mode 100644
index 0000000..0744f8d
--- /dev/null
+++ b/k8s/monitoring/alertmanager-config/alertmanager.yaml
@@ -0,0 +1,43 @@
+global:
+  resolve_timeout: 5m
+inhibit_rules:
+- equal:
+  - namespace
+  - alertname
+  source_matchers:
+  - severity = critical
+  target_matchers:
+  - severity =~ warning|info
+- equal:
+  - namespace
+  - alertname
+  source_matchers:
+  - severity = warning
+  target_matchers:
+  - severity = info
+- equal:
+  - namespace
+  source_matchers:
+  - alertname = InfoInhibitor
+  target_matchers:
+  - severity = info
+receivers:
+- name: default
+  webhook_configs:
+  - url: http://matrix-alertmanager-receiver:8080/alerts/alerts
+- name: watchdog
+- name: "null"
+route:
+  group_by:
+  - namespace
+  group_interval: 5m
+  group_wait: 30s
+  receiver: default
+  repeat_interval: 12h
+  routes:
+  - matchers:
+    - alertname = Watchdog
+    receiver: watchdog
+  - matchers:
+    - alertname = InfoInhibitor
+    receiver: "null"
diff --git a/k8s/monitoring/kustomization.yaml b/k8s/monitoring/kustomization.yaml
index b5d615f..fb2f1cf 100644
--- a/k8s/monitoring/kustomization.yaml
+++ b/k8s/monitoring/kustomization.yaml
@@ -8,6 +8,7 @@ resources:
   - grafana-database.yaml
   - thanos.yaml
   - alerts-longhorn.yaml
+  - matrix-alertmanager-receiver.yaml
 images:
   - name: quay.io/thanos/thanos
     newTag: v0.36.0
@@ -17,3 +18,12 @@ secretGenerator:
       disableNameSuffixHash: true
     files:
       - prom-scrape-configs/scrape-configs.yaml
+  - name: alertmanager-main
+    options:
+      disableNameSuffixHash: true
+    files:
+      - alertmanager-config/alertmanager.yaml
+configMapGenerator:
+  - name: matrix-alertmanager-receiver
+    files:
+      - matrix-alertmanager-receiver/config.yaml
diff --git a/k8s/monitoring/matrix-alertmanager-receiver.yaml b/k8s/monitoring/matrix-alertmanager-receiver.yaml
new file mode 100644
index 0000000..1cc7a0f
--- /dev/null
+++ b/k8s/monitoring/matrix-alertmanager-receiver.yaml
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: matrix-alertmanager-receiver
+spec:
+  selector:
+    matchLabels:
+      app: matrix-alertmanager-receiver
+  template:
+    metadata:
+      labels:
+        app: matrix-alertmanager-receiver
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+    spec:
+      initContainers:
+        - name: secret-replacer
+          image: debian:stable
+          command:
+            - bash
+            - -c
+            - "sed \"s#ACCESS_TOKEN#${ACCESS_TOKEN}#\" /config/config.yaml > /tempconfig/config.yaml"
+          volumeMounts:
+            - name: tempconfig
+              mountPath: /tempconfig
+            - name: config
+              mountPath: /config
+          envFrom:
+            - secretRef:
+                name: matrix-alertmanager-receiver
+      containers:
+        - name: matrix-alertmanager-receiver
+          image: docker.io/metio/matrix-alertmanager-receiver:latest
+          args: ["--config-path", "/config/config.yaml"]
+          resources:
+            limits:
+              memory: "128Mi"
+              cpu: "500m"
+          ports:
+            - name: http
+              containerPort: 8080
+          volumeMounts:
+            - name: tempconfig
+              mountPath: /config
+      volumes:
+        - name: config
+          configMap:
+            name: matrix-alertmanager-receiver
+        - name: tempconfig
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: matrix-alertmanager-receiver
+spec:
+  selector:
+    app: matrix-alertmanager-receiver
+  ports:
+    - port: 8080
diff --git a/k8s/monitoring/matrix-alertmanager-receiver/config.yaml b/k8s/monitoring/matrix-alertmanager-receiver/config.yaml
new file mode 100644
index 0000000..c727ee3
--- /dev/null
+++ b/k8s/monitoring/matrix-alertmanager-receiver/config.yaml
@@ -0,0 +1,100 @@
+# configuration of the HTTP server
+http:
+  address: 0.0.0.0  # bind address for this service. Can be left unspecified to bind on all interfaces
+  port: 8080  # port used by this service
+  alerts-path-prefix: /alerts  # URL path for the webhook receiver called by an Alertmanager. Defaults to /alerts
+  metrics-path: /metrics  # URL path to collect metrics. Defaults to /metrics
+  metrics-enabled: true  # Whether to enable metrics or not. Defaults to false
+
+# configuration for the Matrix connection
+matrix:
+  homeserver-url: https://janky.bot  # FQDN of the homeserver
+  user-id: "@alerts:janky.bot"  # ID of the user used by this service
+  access-token: ACCESS_TOKEN  # Access token for the user ID - will get replaced in an init container
+  # define short names for Matrix room ID
+  room-mapping:
+    alerts: "!nBmQwxtIfjWqYGDqLb:janky.solutions"
+
+# configuration of the templating features
+templating:
+  # mapping of ExternalURL values
+  external-url-mapping:
+    # key is the original value taken from the Alertmanager payload
+    # value is the mapped value which will be available as '.ExternalURL' in templates
+    # "http://alertmanager:9093": https://alertmanager.example.com
+  # mapping of GeneratorURL values
+  generator-url-mapping:
+    # key is the original value taken from the Alertmanager payload
+    # value is the mapped value which will be available as '.GeneratorURL' in templates
+    # "http://prometheus:8080": https://prometheus.example.com
+
+  # computation of arbitrary values based on matching alert annotations, labels, or status
+  # values will be evaluated top to bottom, last entry wins
+  computed-values:
+    - values:  # always set 'color' to 'yellow'
+        color: yellow
+    - values:  # set 'color' to 'orange' when alert label 'severity' is 'warning'
+        color: orange
+      when-matching-labels:
+        severity: warning
+    - values:  # set 'color' to 'red' when alert label 'severity' is 'critical'
+        color: red
+      when-matching-labels:
+        severity: critical
+    - values:  # set 'color' to 'green' when alert status is 'resolved'
+        color: green
+      when-matching-status: resolved
+
+  # template for alerts in status 'firing'
+  firing-template: '
+
+      {{ .Alert.Status | ToUpper }}
+      {{ if .Alert.Labels.name }}
+        {{ .Alert.Labels.name }}
+      {{ else if .Alert.Labels.alertname }}
+        {{ .Alert.Labels.alertname }}
+      {{ end }}
+      >>
+      {{ if .Alert.Labels.severity }}
+        {{ .Alert.Labels.severity | ToUpper }}:
+      {{ end }}
+      {{ if .Alert.Annotations.description }}
+        {{ .Alert.Annotations.description }}
+      {{ else if .Alert.Annotations.summary }}
+        {{ .Alert.Annotations.summary }}
+      {{ end }}
+      >>
+      {{ if .Alert.Annotations.runbook }}
+        Runbook |
+      {{ end }}
+      {{ if .Alert.Annotations.dashboard }}
+        Dashboard |
+      {{ end }}
+      Silence
+    '
+
+  # template for alerts in status 'resolved', if not specified will use the firing-template
+  resolved-template: '
+
+      {{ .Alert.Status | ToUpper }}
+      {{ if .Alert.Labels.name }}
+        {{ .Alert.Labels.name }}
+      {{ else if .Alert.Labels.alertname }}
+        {{ .Alert.Labels.alertname }}
+      {{ end }}
+      >>
+      {{ if .Alert.Labels.severity }}
+        {{ .Alert.Labels.severity | ToUpper }}:
+      {{ end }}
+      {{ if .Alert.Annotations.description }}
+        {{ .Alert.Annotations.description }}
+      {{ else if .Alert.Annotations.summary }}
+        {{ .Alert.Annotations.summary }}
+      {{ end }}
+      {{ if .Alert.Annotations.runbook }}
+        Runbook |
+      {{ end }}
+      {{ if .Alert.Annotations.dashboard }}
+        Dashboard |
+      {{ end }}
+    '
diff --git a/k8s/operators/kube-prometheus/alertmanager-main-secret-patch.yaml b/k8s/operators/kube-prometheus/alertmanager-main-secret-patch.yaml
new file mode 100644
index 0000000..26cc7b7
--- /dev/null
+++ b/k8s/operators/kube-prometheus/alertmanager-main-secret-patch.yaml
@@ -0,0 +1,5 @@
+$patch: delete
+apiVersion: v1
+kind: Secret
+metadata:
+  name: alertmanager-main
diff --git a/k8s/operators/kube-prometheus/apiregistration-patch.yaml b/k8s/operators/kube-prometheus/apiregistration-patch.yaml
new file mode 100644
index 0000000..d3aadcd
--- /dev/null
+++ b/k8s/operators/kube-prometheus/apiregistration-patch.yaml
@@ -0,0 +1,5 @@
+$patch: delete
+apiVersion: apiregistration.k8s.io/v1
+kind: APIService
+metadata:
+  name: v1beta1.metrics.k8s.io
diff --git a/k8s/operators/kube-prometheus/kustomization.yaml b/k8s/operators/kube-prometheus/kustomization.yaml
index ec0d77a..878efda 100644
--- a/k8s/operators/kube-prometheus/kustomization.yaml
+++ b/k8s/operators/kube-prometheus/kustomization.yaml
@@ -59,6 +59,15 @@ patches:
     target:
       kind: Deployment
       name: kube-state-metrics
+  - path: alertmanager-main-secret-patch.yaml
+    target:
+      kind: Secret
+      name: alertmanager-main
+  - path: node-exporter-prometheus-rule-patches.yaml
+    target:
+      kind: PrometheusRule
+      name: node-exporter-rules
+  - path: apiregistration-patch.yaml
 secretGenerator:
   - name: grafana-config
     namespace: monitoring
diff --git a/k8s/operators/kube-prometheus/node-exporter-prometheus-rule-patches.yaml b/k8s/operators/kube-prometheus/node-exporter-prometheus-rule-patches.yaml
new file mode 100644
index 0000000..79717da
--- /dev/null
+++ b/k8s/operators/kube-prometheus/node-exporter-prometheus-rule-patches.yaml
@@ -0,0 +1,77 @@
+# Set the dashboard for all node-exporter alerts to the Node Exporter Full dashboard
+- op: add
+  path: /spec/groups/0/rules/0/annotations/dashboard  # NodeFilesystemSpaceFillingUp
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/1/annotations/dashboard  # NodeFilesystemSpaceFillingUp
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/2/annotations/dashboard  # NodeFilesystemAlmostOutOfSpace
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/3/annotations/dashboard  # NodeFilesystemAlmostOutOfSpace
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/4/annotations/dashboard  # NodeFilesystemFilesFillingUp
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/5/annotations/dashboard  # NodeFilesystemFilesFillingUp
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/6/annotations/dashboard  # NodeFilesystemAlmostOutOfFiles
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/7/annotations/dashboard  # NodeFilesystemAlmostOutOfFiles
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/8/annotations/dashboard  # NodeNetworkReceiveErrs
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/9/annotations/dashboard  # NodeNetworkTransmitErrs
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/10/annotations/dashboard  # NodeHighNumberConntrackEntriesUsed
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/11/annotations/dashboard  # NodeTextFileCollectorScrapeError
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/12/annotations/dashboard  # NodeClockSkewDetected
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/13/annotations/dashboard  # NodeClockNotSynchronising
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/14/annotations/dashboard  # NodeRAIDDegraded
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/15/annotations/dashboard  # NodeRAIDDiskFailure
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/16/annotations/dashboard  # NodeFileDescriptorLimit
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/17/annotations/dashboard  # NodeFileDescriptorLimit
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/18/annotations/dashboard  # NodeCPUHighUsage
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/19/annotations/dashboard  # NodeSystemSaturation
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/20/annotations/dashboard  # NodeMemoryMajorPagesFaults
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/21/annotations/dashboard  # NodeMemoryHighUtilization
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/22/annotations/dashboard  # NodeDiskIOSaturation
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+- op: add
+  path: /spec/groups/0/rules/23/annotations/dashboard  # NodeSystemdServiceFailed
+  value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}
+# unclear why this one doesn't want to patch, leaving it out for now
+# - op: add
+#   path: /spec/groups/0/rules/24/annotations/dashboard # NodeBondingDegraded
+#   value: https://grafana.home.finn.io/d/rYdddlPWk/node-exporter-full?var-node={{ $labels.instance }}