diff --git a/talos/config_patch.yaml b/talos/config_patch.yaml index 426f514..39c6bba 100644 --- a/talos/config_patch.yaml +++ b/talos/config_patch.yaml @@ -3,3 +3,12 @@ cluster: machine: install: disk: /dev/sde + kubelet: + extraMounts: + - destination: /var/local-path-provisioner + type: bind + source: /var/local-path-provisioner + options: + - bind + - rshared + - rw diff --git a/talos/k8s/kustomization.yaml b/talos/k8s/kustomization.yaml index 829470d..86b4977 100644 --- a/talos/k8s/kustomization.yaml +++ b/talos/k8s/kustomization.yaml @@ -2,4 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - operators + - monitoring - rook diff --git a/talos/k8s/monitoring/kube-state-metrics.yaml b/talos/k8s/monitoring/kube-state-metrics.yaml new file mode 100644 index 0000000..e0cf9d4 --- /dev/null +++ b/talos/k8s/monitoring/kube-state-metrics.yaml @@ -0,0 +1,356 @@ +# inspired by the various kubeStateMetrics-* files in https://github.com/prometheus-operator/kube-prometheus/tree/main/manifests + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + - ingressclasses + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + - rolebindings + - roles + verbs: + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: kube-state-metrics + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + spec: + automountServiceAccountToken: true + containers: + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 + name: kube-state-metrics + resources: + limits: + cpu: 100m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - args: + - --secure-listen-address=:8443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/brancz/kube-rbac-proxy:v0.18.2 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 40m + memory: 40Mi + requests: + cpu: 20m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + - args: + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/brancz/kube-rbac-proxy:v0.18.2 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: kube-state-metrics +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics + namespace: monitoring +spec: + egress: + - {} + ingress: + - from: + - podSelector: + matchLabels: + app.kubernetes.io/name: prometheus + ports: + - port: 8443 + protocol: TCP + - port: 9443 + protocol: TCP + podSelector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + policyTypes: + - Egress + - Ingress +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https-main + port: 8443 + targetPort: https-main + - name: https-self + port: 9443 + targetPort: https-self + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-state-metrics +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kube_endpoint_address_not_ready|kube_endpoint_address_available + sourceLabels: + - __name__ + port: https-main + relabelings: + - action: labeldrop + regex: (pod|service|endpoint|namespace) + scheme: https + scrapeTimeout: 30s + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics diff --git a/talos/k8s/monitoring/kustomization.yaml b/talos/k8s/monitoring/kustomization.yaml new file mode 100644 index 0000000..1d8b273 --- /dev/null +++ b/talos/k8s/monitoring/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring +resources: + - kube-state-metrics.yaml + - node-exporter.yaml + - prom.yaml + - servicemonitors.yaml diff --git a/talos/k8s/monitoring/node-exporter.yaml b/talos/k8s/monitoring/node-exporter.yaml new file mode 100644 index 0000000..7057ad9 --- /dev/null +++ b/talos/k8s/monitoring/node-exporter.yaml @@ -0,0 +1,199 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter +spec: + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: node-exporter + creationTimestamp: null + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.8.2 + spec: + automountServiceAccountToken: true + containers: + - args: + - --web.listen-address=0.0.0.0:9100 + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --path.udev.data=/host/root/run/udev/data + - --no-collector.wifi + - --no-collector.hwmon + - --no-collector.btrfs + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run/k3s/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: quay.io/prometheus/node-exporter:v1.8.2 + imagePullPolicy: IfNotPresent + name: node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 + name: http + protocol: TCP + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - SYS_TIME + drop: + - ALL + readOnlyRootFilesystem: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --secure-listen-address=[$(IP)]:9101 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + image: quay.io/brancz/kube-rbac-proxy:v0.18.0 + imagePullPolicy: IfNotPresent + name: kube-rbac-proxy + ports: + - containerPort: 9101 + hostPort: 9101 + name: https + protocol: TCP + resources: + limits: + cpu: 250m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + priorityClassName: system-cluster-critical + restartPolicy: Always + schedulerName: default-scheduler + securityContext: + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + serviceAccount: node-exporter + serviceAccountName: node-exporter + terminationGracePeriodSeconds: 30 + tolerations: + - operator: Exists + volumes: + - hostPath: + path: /sys + type: "" + name: sys + - hostPath: + path: / + type: "" + name: root + updateStrategy: + rollingUpdate: + maxSurge: 0 + maxUnavailable: 10% + type: RollingUpdate +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + name: node-exporter +spec: + clusterIP: None + clusterIPs: + - None + internalTrafficPolicy: Cluster + ipFamilies: + - IPv4 + ipFamilyPolicy: SingleStack + ports: + - name: https + port: 9100 + protocol: TCP + targetPort: https + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + sessionAffinity: None + type: ClusterIP +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/talos/k8s/monitoring/prom.yaml b/talos/k8s/monitoring/prom.yaml new file mode 100644 index 0000000..d6c3b50 --- /dev/null +++ b/talos/k8s/monitoring/prom.yaml @@ -0,0 +1,111 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: talos-prom + namespace: monitoring +spec: + enableFeatures: [] + evaluationInterval: 30s + externalLabels: + cluster: talos + externalUrl: https://prometheus.k8s.home.finn.io + image: quay.io/prometheus/prometheus:v2.54.0 + nodeSelector: + kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: talos-prom + podMonitorNamespaceSelector: {} + podMonitorSelector: {} + portName: web + probeNamespaceSelector: {} + probeSelector: {} + replicas: 2 + resources: + requests: + memory: 400Mi + retention: 24h + ruleNamespaceSelector: {} + ruleSelector: {} + scrapeConfigNamespaceSelector: {} + scrapeConfigSelector: {} + scrapeInterval: 30s + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: prometheus-thanos + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 50Gi + thanos: + blockSize: 2h + image: quay.io/thanos/thanos:v0.37.2 + objectStorageConfig: + key: thanos.yaml + name: thanos-objstore + version: 2.54.0 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-thanos +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-self +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-self +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-self +subjects: +- kind: ServiceAccount + name: prometheus-thanos + namespace: monitoring +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-thanos-sidecar +spec: + type: NodePort + selector: + app.kubernetes.io/name: prometheus + ports: + - port: 10901 + targetPort: 10901 diff --git a/talos/k8s/monitoring/servicemonitors.yaml b/talos/k8s/monitoring/servicemonitors.yaml new file mode 100644 index 0000000..714303d --- /dev/null +++ b/talos/k8s/monitoring/servicemonitors.yaml @@ -0,0 +1,352 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: coredns +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + metricRelabelings: + - action: drop + regex: coredns_cache_misses_total + sourceLabels: + - __name__ + port: metrics + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-dns +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-apiserver +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|server).* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) + sourceLabels: + - __name__ + - le + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes +--- + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-controller-manager +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kube-controller-manager +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-scheduler +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kube-scheduler +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kubelet +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + - action: drop + regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; + sourceLabels: + - __name__ + - pod + - namespace + - action: drop + regex: (container_blkio_device_usage_total);.+ + sourceLabels: + - __name__ + - container + path: /metrics/cadvisor + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: node-exporter +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus-adapter +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus-thanos +spec: + endpoints: + - interval: 30s + port: web + - interval: 30s + port: reloader-web + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus-operator +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator diff --git a/talos/k8s/operators/kustomization.yaml b/talos/k8s/operators/kustomization.yaml index d20e599..dc8c5b7 100644 --- a/talos/k8s/operators/kustomization.yaml +++ b/talos/k8s/operators/kustomization.yaml @@ -3,5 +3,7 @@ kind: Kustomization resources: - cert-manager - cert-manager-webhook-pdns + - local-path-provisioner + - prometheus-operator - rook - traefik diff --git a/talos/k8s/operators/local-path-provisioner/kustomization.yaml b/talos/k8s/operators/local-path-provisioner/kustomization.yaml new file mode 100644 index 0000000..1873946 --- /dev/null +++ b/talos/k8s/operators/local-path-provisioner/kustomization.yaml @@ -0,0 +1,37 @@ +# based on https://www.talos.dev/v1.9/kubernetes-guides/configuration/local-storage/#local-path-provisioner +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - github.com/rancher/local-path-provisioner/deploy?ref=v0.0.31 +patches: + - patch: |- + kind: ConfigMap + apiVersion: v1 + metadata: + name: local-path-config + namespace: local-path-storage + data: + config.json: |- + { + "nodePathMap":[ + { + "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", + "paths":["/var/local-path-provisioner"] + } + ] + } + - patch: |- + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: local-path + annotations: + storageclass.kubernetes.io/is-default-class: "true" + defaultVolumeType: local + - patch: |- + apiVersion: v1 + kind: Namespace + metadata: + name: local-path-storage + labels: + pod-security.kubernetes.io/enforce: privileged diff --git a/talos/k8s/operators/prometheus-operator/kustomization.yaml b/talos/k8s/operators/prometheus-operator/kustomization.yaml new file mode 100644 index 0000000..1b0dd05 --- /dev/null +++ b/talos/k8s/operators/prometheus-operator/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring +resources: + - namespace.yaml + - https://github.com/prometheus-operator/prometheus-operator?ref=v0.79.1 diff --git a/talos/k8s/operators/prometheus-operator/namespace.yaml b/talos/k8s/operators/prometheus-operator/namespace.yaml new file mode 100644 index 0000000..5e70635 --- /dev/null +++ b/talos/k8s/operators/prometheus-operator/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: rook-ceph + labels: + pod-security.kubernetes.io/enforce: privileged diff --git a/talos/k8s/rook/buckets.yaml b/talos/k8s/rook/buckets.yaml index f2341d6..a1c27bd 100644 --- a/talos/k8s/rook/buckets.yaml +++ b/talos/k8s/rook/buckets.yaml @@ -13,3 +13,27 @@ metadata: spec: bucketName: dvn-backup storageClassName: rook-ceph-bucket +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: media +spec: + bucketName: media + storageClassName: rook-ceph-bucket +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: janky.email-backups +spec: + bucketName: janky.email-backups + storageClassName: rook-ceph-bucket +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: loki +spec: + bucketName: loki + storageClassName: rook-ceph-bucket diff --git a/talos/k8s/rook/kustomization.yaml b/talos/k8s/rook/kustomization.yaml index 37d8fa5..f3896f3 100644 --- a/talos/k8s/rook/kustomization.yaml +++ b/talos/k8s/rook/kustomization.yaml @@ -4,3 +4,4 @@ namespace: rook-ceph resources: - buckets.yaml - s3-pool.yaml + - servicemonitor.yaml diff --git a/talos/k8s/rook/servicemonitor.yaml b/talos/k8s/rook/servicemonitor.yaml new file mode 100644 index 0000000..7f8efbf --- /dev/null +++ b/talos/k8s/rook/servicemonitor.yaml @@ -0,0 +1,56 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: rook-ceph-exporter + labels: + team: rook +spec: + namespaceSelector: + matchNames: + - rook-ceph + selector: + matchLabels: + app: rook-ceph-exporter + rook_cluster: rook-ceph + endpoints: + - port: ceph-exporter-http-metrics + path: /metrics + interval: 10s +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: rook-ceph-mgr + labels: + team: rook +spec: + namespaceSelector: + matchNames: + - rook-ceph + selector: + matchLabels: + app: rook-ceph-mgr + rook_cluster: rook-ceph + endpoints: + - port: http-metrics + path: /metrics + interval: 10s + honorLabels: true +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: csi-metrics + labels: + team: rook +spec: + namespaceSelector: + matchNames: + - rook-ceph + selector: + matchLabels: + app: csi-metrics + endpoints: + - port: csi-http-metrics + path: /metrics + interval: 5s