From 3ef702fb52eeac25878a9444a4c8dbdb13ec8034 Mon Sep 17 00:00:00 2001
From: Finn
Date: Sun, 1 Sep 2024 16:11:26 -0700
Subject: [PATCH] Add longhorn monitoring + alerting

---
 k8s/monitoring/alerts-longhorn.yaml        | 90 ++++++++++++++++++++++
 k8s/monitoring/kustomization.yaml          |  1 +
 k8s/operators/longhorn/kustomization.yaml  |  1 +
 k8s/operators/longhorn/servicemonitor.yaml | 16 ++++
 4 files changed, 108 insertions(+)
 create mode 100644 k8s/monitoring/alerts-longhorn.yaml
 create mode 100644 k8s/operators/longhorn/servicemonitor.yaml

diff --git a/k8s/monitoring/alerts-longhorn.yaml b/k8s/monitoring/alerts-longhorn.yaml
new file mode 100644
index 0000000..95ad244
--- /dev/null
+++ b/k8s/monitoring/alerts-longhorn.yaml
@@ -0,0 +1,90 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # Longhorn alerts: space usage, volume robustness, node availability, CPU pressure
+metadata:
+  labels:
+    prometheus: k8s
+    role: alert-rules
+  name: longhorn-alerts
+spec:
+  groups:
+    - name: longhorn.rules  # every rule in this group must hold for 5m before it fires
+      rules:
+        - alert: LonghornVolumeActualSpaceUsedWarning
+          annotations:
+            description: The actual space used by Longhorn volume {{$labels.volume}} on {{$labels.node}} is at {{$value}}% capacity for
+              more than 5 minutes.
+            summary: The actual used space of Longhorn volume is over 90% of the capacity.
+          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
+          for: 5m
+          labels:
+            issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornVolumeStatusCritical
+          annotations:
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+              more than 5 minutes.
+            summary: Longhorn volume {{$labels.volume}} is Fault
+          expr: longhorn_volume_robustness == 3
+          for: 5m  # the description above quotes this duration; change both together
+          labels:
+            issue: Longhorn volume {{$labels.volume}} is Fault.
+            severity: critical
+        - alert: LonghornVolumeStatusWarning
+          annotations:
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+              more than 5 minutes.
+            summary: Longhorn volume {{$labels.volume}} is Degraded
+          expr: longhorn_volume_robustness == 2
+          for: 5m
+          labels:
+            issue: Longhorn volume {{$labels.volume}} is Degraded.
+            severity: warning
+        - alert: LonghornNodeStorageWarning
+          annotations:
+            description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
+              more than 5 minutes.
+            summary: The used storage of node is over 70% of the capacity.
+          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            issue: The used storage of node {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornDiskStorageWarning
+          annotations:
+            description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
+              more than 5 minutes.
+            summary: The used storage of disk is over 70% of the capacity.
+          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornNodeDown  # value = averaged node-count metric minus nodes with ready == 1; each side falls back to 0 when it has no series
+          annotations:
+            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+            summary: Longhorn nodes is offline
+          expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+          for: 5m
+          labels:
+            issue: Longhorn nodes are offline.  # no $value in labels: a changing label yields a new alert and restarts the "for" timer
+            severity: critical
+        - alert: LonghornInstanceManagerCPUUsageWarning
+          annotations:
+            description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+              more than 5 minutes.
+            summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
+          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+          for: 5m
+          labels:
+            issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+            severity: warning
+        - alert: LonghornNodeCPUUsageWarning
+          annotations:
+            description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
+              more than 5 minutes.
+            summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+          for: 5m
+          labels:
+            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+            severity: warning
diff --git a/k8s/monitoring/kustomization.yaml b/k8s/monitoring/kustomization.yaml
index 7a7a476..b5d615f 100644
--- a/k8s/monitoring/kustomization.yaml
+++ b/k8s/monitoring/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
   - secrets.yaml
   - grafana-database.yaml
   - thanos.yaml
+  - alerts-longhorn.yaml
 images:
   - name: quay.io/thanos/thanos
     newTag: v0.36.0
diff --git a/k8s/operators/longhorn/kustomization.yaml b/k8s/operators/longhorn/kustomization.yaml
index 8b7c88c..065ff39 100644
--- a/k8s/operators/longhorn/kustomization.yaml
+++ b/k8s/operators/longhorn/kustomization.yaml
@@ -6,6 +6,7 @@ resources:
   - secrets.yaml
   - backup.yaml
   - ingress.yaml
+  - servicemonitor.yaml
 patches:
   - path: config-patches.yaml
     target:
diff --git a/k8s/operators/longhorn/servicemonitor.yaml b/k8s/operators/longhorn/servicemonitor.yaml
new file mode 100644
index 0000000..fc878e5
--- /dev/null
+++ b/k8s/operators/longhorn/servicemonitor.yaml
@@ -0,0 +1,16 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape configuration for Longhorn
+metadata:
+  name: longhorn
+  namespace: longhorn-system
+  labels:
+    name: longhorn
+spec:
+  selector:
+    matchLabels:
+      app: longhorn-manager  # only Services carrying this label are scraped
+  namespaceSelector:
+    matchNames:
+      - longhorn-system
+  endpoints:
+    - port: manager  # refers to a Service port by name