From 5e348700650a418ce0411acc956ef8f384e1c048 Mon Sep 17 00:00:00 2001
From: Finn
Date: Wed, 14 Aug 2024 17:00:56 -0700
Subject: [PATCH] Add Thanos to monitoring.

Just for historical data storage currently; alerting still happens the
old-fashioned way.

- Add thanos-querier, thanos-store and thanos-compact
  (k8s/monitoring/thanos.yaml), plus a headless Service exposing gRPC
  port 10901 on the Prometheus pods.
- Set spec.thanos and a storage volumeClaimTemplate on the Prometheus
  resource; give Alertmanager a storage volumeClaimTemplate as well.
- Allow thanos-querier to reach Prometheus on port 10901 in the
  prometheus-k8s NetworkPolicy.
- Point the Grafana "prometheus" datasource at thanos-querier.
- All Thanos images use tag v0.36.0, matching the images override in
  k8s/monitoring/kustomization.yaml.
---
 k8s/monitoring/kustomization.yaml             |   5 +-
 k8s/monitoring/thanos.yaml                    | 306 ++++++++++++++++++
 .../kube-prometheus/alertmanager-patches.yaml |  11 +
 .../kube-prometheus/grafana/datasources.yaml  |   4 +-
 .../kube-prometheus/kustomization.yaml        |  10 +-
 .../prometheus-network-policy-patch.yaml      |  22 ++
 ...rometheus-operator-deployment-patches.yaml |   3 +
 .../kube-prometheus/prometheus-patch.yaml     |  15 +
 k8s/system-upgrade-controller/plan.yaml       |   4 +-
 9 files changed, 372 insertions(+), 8 deletions(-)
 create mode 100644 k8s/monitoring/thanos.yaml
 create mode 100644 k8s/operators/kube-prometheus/prometheus-network-policy-patch.yaml
 create mode 100644 k8s/operators/kube-prometheus/prometheus-operator-deployment-patches.yaml

diff --git a/k8s/monitoring/kustomization.yaml b/k8s/monitoring/kustomization.yaml
index 18cf9f8..7a7a476 100644
--- a/k8s/monitoring/kustomization.yaml
+++ b/k8s/monitoring/kustomization.yaml
@@ -6,10 +6,13 @@ resources:
   - ingresses.yaml
   - secrets.yaml
   - grafana-database.yaml
+  - thanos.yaml
+images:
+  - name: quay.io/thanos/thanos
+    newTag: v0.36.0
 secretGenerator:
   - name: additional-scrape-configs
     options:
       disableNameSuffixHash: true
     files:
       - prom-scrape-configs/scrape-configs.yaml
-
diff --git a/k8s/monitoring/thanos.yaml b/k8s/monitoring/thanos.yaml
new file mode 100644
index 0000000..d7004d2
--- /dev/null
+++ b/k8s/monitoring/thanos.yaml
@@ -0,0 +1,306 @@
+# This file contains all the components for Thanos that aren't configured by kube-prometheus
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: thanos-querier
+spec:
+  selector:
+    matchLabels:
+      app: thanos-querier
+  template:
+    metadata:
+      labels:
+        app: thanos-querier
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9090"
+    spec:
+      containers:
+        - name: thanos-querier
+          image: quay.io/thanos/thanos:v0.36.0  # keep in sync with kustomization.yaml
+          args:
+            - query
+            - --http-address
+            - 0.0.0.0:9090
+            - --endpoint
+            - dnssrv+_grpc._tcp.prometheus-k8s-headless.monitoring.svc
+            - --endpoint
+            - dnssrv+_grpc._tcp.thanos-store.monitoring.svc
+          resources:
+            limits:
+              memory: "128Mi"
+              cpu: "500m"
+          ports:
+            - containerPort: 9090
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: thanos-querier
+spec:
+  selector:
+    app: thanos-querier
+  ports:
+    - port: 9090
+      targetPort: 9090
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-k8s-headless
+spec:
+  selector:
+    app.kubernetes.io/component: prometheus
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+  clusterIP: None
+  ports:
+    - name: grpc
+      port: 10901
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  labels:
+    app: thanos-store
+  name: thanos-store
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: thanos-store
+  serviceName: thanos-store
+  template:
+    metadata:
+      labels:
+        app: thanos-store
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "10902"
+    spec:
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                    - key: app.kubernetes.io/name
+                      operator: In
+                      values:
+                        - thanos-store
+                    - key: app.kubernetes.io/instance
+                      operator: In
+                      values:
+                        - thanos-store
+                namespaces:
+                  - monitoring
+                topologyKey: kubernetes.io/hostname
+              weight: 100
+      containers:
+        - args:
+            - store
+            - --log.level=info
+            - --log.format=logfmt
+            - --data-dir=/var/thanos/store
+            - --grpc-address=0.0.0.0:10901
+            - --http-address=0.0.0.0:10902
+            - --objstore.config=$(OBJSTORE_CONFIG)
+            - --ignore-deletion-marks-delay=24h
+          env:
+            - name: OBJSTORE_CONFIG
+              valueFrom:
+                secretKeyRef:
+                  key: thanos.yaml
+                  name: thanos-objstore
+            - name: HOST_IP_ADDRESS
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.hostIP
+          image: quay.io/thanos/thanos:v0.36.0  # keep in sync with kustomization.yaml
+          livenessProbe:
+            failureThreshold: 8
+            httpGet:
+              path: /-/healthy
+              port: 10902
+              scheme: HTTP
+            periodSeconds: 30
+            timeoutSeconds: 1
+          name: thanos-store
+          ports:
+            - containerPort: 10901
+              name: grpc
+            - containerPort: 10902
+              name: http
+          readinessProbe:
+            failureThreshold: 20
+            httpGet:
+              path: /-/ready
+              port: 10902
+              scheme: HTTP
+            periodSeconds: 5
+          resources:
+            limits:
+              memory: "128Mi"
+              cpu: "500m"
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            readOnlyRootFilesystem: true
+            runAsGroup: 65532
+            runAsNonRoot: true
+            runAsUser: 65534
+            seccompProfile:
+              type: RuntimeDefault
+          terminationMessagePolicy: FallbackToLogsOnError
+          volumeMounts:
+            - mountPath: /var/thanos/store
+              name: data
+              readOnly: false
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65532
+        runAsNonRoot: true
+        runAsUser: 65534
+        seccompProfile:
+          type: RuntimeDefault
+      terminationGracePeriodSeconds: 120
+      volumes: []
+  volumeClaimTemplates:
+    - metadata:
+        labels:
+          app: thanos-store
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 10Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: thanos-store
+spec:
+  selector:
+    app: thanos-store
+  ports:
+    - name: grpc
+      port: 10901
+    - name: http
+      port: 10902
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  labels:
+    app: thanos-compact
+  name: thanos-compact
+  namespace: monitoring  # same namespace as thanos-store and the thanos-objstore secret
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: thanos-compact
+  serviceName: thanos-compact
+  template:
+    metadata:
+      labels:
+        app: thanos-compact
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "10902"
+    spec:
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                    - key: app.kubernetes.io/name
+                      operator: In
+                      values:
+                        - thanos-compact
+                    - key: app.kubernetes.io/instance
+                      operator: In
+                      values:
+                        - thanos-compact
+                namespaces:
+                  - monitoring
+                topologyKey: kubernetes.io/hostname
+              weight: 100
+      containers:
+        - args:
+            - compact
+            - --wait
+            - --log.level=info
+            - --log.format=logfmt
+            - --objstore.config=$(OBJSTORE_CONFIG)
+            - --data-dir=/var/thanos/compact
+            - --retention.resolution-1h=365d
+          env:
+            - name: OBJSTORE_CONFIG
+              valueFrom:
+                secretKeyRef:
+                  key: thanos.yaml
+                  name: thanos-objstore
+            - name: HOST_IP_ADDRESS
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.hostIP
+          image: quay.io/thanos/thanos:v0.36.0  # keep in sync with kustomization.yaml
+          imagePullPolicy: IfNotPresent
+          livenessProbe:
+            failureThreshold: 4
+            httpGet:
+              path: /-/healthy
+              port: 10902
+              scheme: HTTP
+            periodSeconds: 30
+          name: thanos-compact
+          ports:
+            - containerPort: 10902
+              name: http
+          readinessProbe:
+            failureThreshold: 20
+            httpGet:
+              path: /-/ready
+              port: 10902
+              scheme: HTTP
+            periodSeconds: 5
+          resources:
+            limits:
+              cpu: 500m
+              memory: 500Mi
+            requests:
+              cpu: 200m
+              memory: 100Mi
+          terminationMessagePolicy: FallbackToLogsOnError
+          volumeMounts:
+            - mountPath: /var/thanos/compact
+              name: data
+              readOnly: false
+      nodeSelector:
+        kubernetes.io/os: linux
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65532
+        runAsNonRoot: true
+        runAsUser: 65534
+        seccompProfile:
+          type: RuntimeDefault
+      terminationGracePeriodSeconds: 120
+      volumes: []
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 10Gi
diff --git a/k8s/operators/kube-prometheus/alertmanager-patches.yaml b/k8s/operators/kube-prometheus/alertmanager-patches.yaml
index ef12b50..4c0549c 100644
--- a/k8s/operators/kube-prometheus/alertmanager-patches.yaml
+++ b/k8s/operators/kube-prometheus/alertmanager-patches.yaml
@@ -1,3 +1,14 @@
 - op: add
   path: /spec/externalUrl
   value: https://alertmanager.k8s.home.finn.io
+- op: replace
+  path: /spec/resources/requests/memory
+  value: 40Mi
+- op: add
+  path: /spec/storage
+  value:
+    volumeClaimTemplate:
+      spec:
+        resources:
+          requests:
+            storage: 1Gi
diff --git a/k8s/operators/kube-prometheus/grafana/datasources.yaml b/k8s/operators/kube-prometheus/grafana/datasources.yaml
index b7f5911..6ef7809 100644
--- a/k8s/operators/kube-prometheus/grafana/datasources.yaml
+++ b/k8s/operators/kube-prometheus/grafana/datasources.yaml
@@ -7,8 +7,8 @@
             "name": "prometheus",
             "orgId": 1,
             "type": "prometheus",
-            "url": "http://prometheus-k8s.monitoring.svc:9090",
-            "version": 1
+            "url": "http://thanos-querier.monitoring.svc:9090",
+            "version": 2
         },
         {
             "access": "proxy",
diff --git a/k8s/operators/kube-prometheus/kustomization.yaml b/k8s/operators/kube-prometheus/kustomization.yaml
index 5d965c3..37d5881 100644
--- a/k8s/operators/kube-prometheus/kustomization.yaml
+++ b/k8s/operators/kube-prometheus/kustomization.yaml
@@ -14,11 +14,11 @@ patches:
   - path: network-policies-patch.yaml
     target:
       kind: NetworkPolicy
-      name: prometheus-k8s
-  - path: network-policies-patch.yaml
+      name: alertmanager-main
+  - path: prometheus-network-policy-patch.yaml
     target:
       kind: NetworkPolicy
-      name: alertmanager-main
+      name: prometheus-k8s
   - path: prometheus-patch.yaml
     target:
       kind: Prometheus
@@ -43,6 +43,10 @@ patches:
     target:
       kind: PrometheusRule
       name: node-exporter-rules
+  - path: prometheus-operator-deployment-patches.yaml
+    target:
+      kind: Deployment
+      name: prometheus-operator
 secretGenerator:
   - name: grafana-config
     namespace: monitoring
diff --git a/k8s/operators/kube-prometheus/prometheus-network-policy-patch.yaml b/k8s/operators/kube-prometheus/prometheus-network-policy-patch.yaml
new file mode 100644
index 0000000..72f9ba1
--- /dev/null
+++ b/k8s/operators/kube-prometheus/prometheus-network-policy-patch.yaml
@@ -0,0 +1,22 @@
+- op: add
+  path: /spec/ingress/0/from/-
+  value:
+    podSelector:
+      matchLabels:
+        app.kubernetes.io/name: traefik
+    namespaceSelector:
+      matchLabels:
+        kubernetes.io/metadata.name: kube-system
+- op: add
+  path: /spec/ingress/-
+  value:
+    from:
+      - podSelector:
+          matchLabels:
+            app: thanos-querier
+        namespaceSelector:
+          matchLabels:
+            kubernetes.io/metadata.name: monitoring
+    ports:
+      - port: 10901
+        protocol: TCP
diff --git a/k8s/operators/kube-prometheus/prometheus-operator-deployment-patches.yaml b/k8s/operators/kube-prometheus/prometheus-operator-deployment-patches.yaml
new file mode 100644
index 0000000..4b7ca2e
--- /dev/null
+++ b/k8s/operators/kube-prometheus/prometheus-operator-deployment-patches.yaml
@@ -0,0 +1,3 @@
+- op: add
+  path: /spec/template/spec/containers/0/args/-
+  value: --config-reloader-cpu-limit=500m
diff --git a/k8s/operators/kube-prometheus/prometheus-patch.yaml b/k8s/operators/kube-prometheus/prometheus-patch.yaml
index 6308b18..cb75e38 100644
--- a/k8s/operators/kube-prometheus/prometheus-patch.yaml
+++ b/k8s/operators/kube-prometheus/prometheus-patch.yaml
@@ -10,3 +10,18 @@
 - op: add
   path: /spec/externalUrl
   value: https://prometheus.k8s.home.finn.io
+- op: add
+  path: /spec/thanos
+  value:
+    image: quay.io/thanos/thanos:v0.36.0
+    objectStorageConfig:
+      key: thanos.yaml
+      name: thanos-objstore
+- op: add
+  path: /spec/storage
+  value:
+    volumeClaimTemplate:
+      spec:
+        resources:
+          requests:
+            storage: 20Gi
diff --git a/k8s/system-upgrade-controller/plan.yaml b/k8s/system-upgrade-controller/plan.yaml
index b057bed..b8cee8f 100644
--- a/k8s/system-upgrade-controller/plan.yaml
+++ b/k8s/system-upgrade-controller/plan.yaml
@@ -52,9 +52,9 @@ spec:
       - key: node-role.kubernetes.io/control-plane
         operator: DoesNotExist
   tolerations:
-    - key: "rtlsdr"
+    - key: rtlsdr
       value: "true"
-      effect: "NoSchedule"
+      effect: NoSchedule
   prepare:
     args:
       - prepare