talos metrics improvements
All checks were successful
/ render-helm (push) Successful in 29s
/ diff-and-deploy (push) Successful in 2m14s

This commit is contained in:
Finn 2025-02-03 18:43:43 -08:00
parent be8364ec4a
commit dbd7d9aa2e
14 changed files with 619 additions and 3 deletions

View file

@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
helmCharts:
- name: metrics-server
releaseName: metrics-server
valuesInline:
metrics:
enabled: true
serviceMonitor:
enabled: true
version: 3.12.2
repo: https://kubernetes-sigs.github.io/metrics-server/

View file

@ -19,6 +19,6 @@ for component in openbao external-secrets secrets-store-csi-driver cert-manager-
done
# cisco k8s cluster operators
for component in rook cert-manager-webhook-pdns traefik; do
for component in rook cert-manager-webhook-pdns traefik metrics-server; do
render_helm ../talos/k8s/operators "${component}"
done

View file

@ -48,6 +48,7 @@ spec:
ports:
- port: 9090
targetPort: 9090
name: http
---
apiVersion: v1
kind: Service

View file

@ -12,3 +12,5 @@ machine:
- bind
- rshared
- rw
extraArgs:
rotate-server-certificates: true

View file

@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
# This will raise some warnings, issue filed here: https://github.com/alex1989hu/kubelet-serving-cert-approver/issues/255
- https://github.com/alex1989hu/kubelet-serving-cert-approver//deploy/standalone?ref=v0.9.0

View file

@ -2,5 +2,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- operators
- kubelet-serving-cert-approver
- monitoring
- rook

View file

@ -0,0 +1,301 @@
# based on https://github.com/prometheus-operator/kube-prometheus/blob/v0.14.0/manifests/kubernetesControlPlane-prometheusRule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.54.1
prometheus: k8s
role: alert-rules
name: prometheus-k8s-prometheus-rules
namespace: monitoring
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusSDRefreshFailure
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure
summary: Failed Prometheus SD refresh.
expr: |
increase(prometheus_sd_refresh_failures_total{job="prometheus-k8s",namespace="monitoring"}[10m]) > 0
for: 20m
labels:
severity: warning
- alert: PrometheusKubernetesListWatchFailures
annotations:
description: Kubernetes service discovery of Prometheus {{$labels.namespace}}/{{$labels.pod}} is experiencing {{ printf "%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuskuberneteslistwatchfailures
summary: Requests in Kubernetes SD are failing.
expr: |
increase(prometheus_sd_kubernetes_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
(
sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m])) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: |
increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeSampleLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
expr: |
increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0
for: 5m
labels:
severity: critical
- alert: PrometheusHighQueryLoad
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
expr: |
avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.8
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical

View file

@ -3,6 +3,7 @@ kind: Kustomization
namespace: monitoring
resources:
- kube-state-metrics.yaml
- kubernetesControlPlane-prometheusRule.yaml
- node-exporter.yaml
- prom.yaml
- servicemonitors.yaml

View file

@ -1,3 +1,4 @@
# this file is heavily based on https://github.com/prometheus-operator/kube-prometheus/blob/v0.14.0/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
@ -264,6 +265,19 @@ spec:
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/resource
port: https-metrics
relabelings:
- action: replace
sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:

View file

@ -4,6 +4,7 @@ resources:
- cert-manager
- cert-manager-webhook-pdns
- local-path-provisioner
- metrics-server
- prometheus-operator
- rook
- traefik

View file

@ -0,0 +1,273 @@
# DO NOT EDIT: This file has been automatically generated by the script in helm/render-all.sh, edits may get overwritten
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: system:metrics-server
rules:
- apiGroups:
- ""
resources:
- nodes/metrics
verbs:
- get
- apiGroups:
- ""
resources:
- pods
- nodes
- namespaces
- configmaps
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"
name: system:metrics-server-aggregated-reader
rules:
- apiGroups:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server-auth-reader
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: system:metrics-server
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:metrics-server
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: monitoring
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server
namespace: monitoring
spec:
ports:
- appProtocol: https
name: https
port: 443
protocol: TCP
targetPort: https
selector:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/name: metrics-server
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/name: metrics-server
template:
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/name: metrics-server
spec:
containers:
- args:
- --secure-port=10250
- --cert-dir=/tmp
- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
- --kubelet-use-node-status-port
- --metric-resolution=15s
- --authorization-always-allow-paths=/metrics
image: registry.k8s.io/metrics-server/metrics-server:v0.7.2
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /livez
port: https
scheme: HTTPS
initialDelaySeconds: 0
periodSeconds: 10
name: metrics-server
ports:
- containerPort: 10250
name: https
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /readyz
port: https
scheme: HTTPS
initialDelaySeconds: 20
periodSeconds: 10
resources:
requests:
cpu: 100m
memory: 200Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /tmp
name: tmp
priorityClassName: system-cluster-critical
serviceAccountName: metrics-server
volumes:
- emptyDir: {}
name: tmp
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: metrics-server
namespace: monitoring
port: 443
version: v1beta1
versionPriority: 100
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: metrics-server
app.kubernetes.io/version: 0.7.2
helm.sh/chart: metrics-server-3.12.2
name: metrics-server
namespace: monitoring
spec:
endpoints:
- interval: 1m
path: /metrics
port: https
scheme: https
scrapeTimeout: 10s
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/instance
namespaceSelector:
matchNames:
- default
selector:
matchLabels:
app.kubernetes.io/instance: metrics-server
app.kubernetes.io/name: metrics-server

View file

@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- bundle.yaml

View file

@ -1,6 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: rook-ceph
name: monitoring
labels:
pod-security.kubernetes.io/enforce: privileged

View file

@ -2,5 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
- namespace.yaml
- https://github.com/prometheus-operator/prometheus-operator?ref=v0.79.2