@@ -0,0 +1,44 @@
{{- /*
Generated file. Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- define "rules.names" }}
rules:
  - "alertmanager.rules"
  - "config-reloaders"
  - "etcd"
  - "general.rules"
  - "k8s.rules.container_cpu_usage_seconds_total"
  - "k8s.rules.container_memory_cache"
  - "k8s.rules.container_memory_rss"
  - "k8s.rules.container_memory_swap"
  - "k8s.rules.container_memory_working_set_bytes"
  - "k8s.rules.container_resource"
  - "k8s.rules.pod_owner"
  - "kube-apiserver-availability.rules"
  - "kube-apiserver-burnrate.rules"
  - "kube-apiserver-histogram.rules"
  - "kube-apiserver-slos"
  - "kube-prometheus-general.rules"
  - "kube-prometheus-node-recording.rules"
  - "kube-scheduler.rules"
  - "kube-state-metrics"
  - "kubelet.rules"
  - "kubernetes-apps"
  - "kubernetes-resources"
  - "kubernetes-storage"
  - "kubernetes-system"
  - "kubernetes-system-kube-proxy"
  - "kubernetes-system-apiserver"
  - "kubernetes-system-kubelet"
  - "kubernetes-system-controller-manager"
  - "kubernetes-system-scheduler"
  - "node-exporter.rules"
  - "node-exporter"
  - "node.rules"
  - "node-network"
  - "prometheus-operator"
  - "prometheus"
  - "windows.node.rules"
  - "windows.pod.rules"
{{- end }}
@@ -0,0 +1,16 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigs }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-am-relabel-confg
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- if .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations }}
  annotations:
{{ toYaml .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus-am-relabel-confg
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
  additional-alert-relabel-configs.yaml: {{ toYaml .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigs | b64enc | quote }}
{{- end }}
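A hypothetical values.yaml snippet that would cause this Secret to render (the values path is the chart's own; the drop rule itself is only an illustration):

prometheus:
  prometheusSpec:
    additionalAlertRelabelConfigs:
      - source_labels: [severity]
        regex: info
        action: drop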
@@ -0,0 +1,16 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-am-confg
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- if .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations }}
  annotations:
{{ toYaml .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus-am-confg
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
  additional-alertmanager-configs.yaml: {{ tpl (toYaml .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs) . | b64enc | quote }}
{{- end }}
@@ -0,0 +1,43 @@
{{- if or .Values.additionalPrometheusRules .Values.additionalPrometheusRulesMap }}
apiVersion: v1
kind: List
metadata:
  name: {{ include "kube-prometheus-stack.fullname" $ }}-additional-prometheus-rules
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
items:
{{- if .Values.additionalPrometheusRulesMap }}
{{- range $prometheusRuleName, $prometheusRule := .Values.additionalPrometheusRulesMap }}
  - apiVersion: monitoring.coreos.com/v1
    kind: PrometheusRule
    metadata:
      name: {{ template "kube-prometheus-stack.name" $ }}-{{ $prometheusRuleName }}
      namespace: {{ template "kube-prometheus-stack.namespace" $ }}
      labels:
        app: {{ template "kube-prometheus-stack.name" $ }}
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
{{- if $prometheusRule.additionalLabels }}
{{ toYaml $prometheusRule.additionalLabels | indent 8 }}
{{- end }}
    spec:
      groups:
{{ toYaml $prometheusRule.groups | indent 8 }}
{{- end }}
{{- else }}
{{- range .Values.additionalPrometheusRules }}
  - apiVersion: monitoring.coreos.com/v1
    kind: PrometheusRule
    metadata:
      name: {{ template "kube-prometheus-stack.name" $ }}-{{ .name }}
      namespace: {{ template "kube-prometheus-stack.namespace" $ }}
      labels:
        app: {{ template "kube-prometheus-stack.name" $ }}
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
{{- if .additionalLabels }}
{{ toYaml .additionalLabels | indent 8 }}
{{- end }}
    spec:
      groups:
{{ toYaml .groups | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
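A minimal sketch of the values this template consumes: the map key becomes part of the PrometheusRule name, and additionalLabels/groups map straight onto the fields above (the rule content is illustrative only):

additionalPrometheusRulesMap:
  my-rules:
    additionalLabels:
      team: platform
    groups:
      - name: example.rules
        rules:
          - alert: ExampleAlwaysFiring
            expr: vector(1)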
@@ -0,0 +1,20 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.prometheusSpec.additionalScrapeConfigs }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-scrape-confg
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- if .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations }}
  annotations:
{{ toYaml .Values.prometheus.prometheusSpec.additionalPrometheusSecretsAnnotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus-scrape-confg
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
{{- if eq ( typeOf .Values.prometheus.prometheusSpec.additionalScrapeConfigs ) "string" }}
  additional-scrape-configs.yaml: {{ tpl .Values.prometheus.prometheusSpec.additionalScrapeConfigs $ | b64enc | quote }}
{{- else }}
  additional-scrape-configs.yaml: {{ tpl (toYaml .Values.prometheus.prometheusSpec.additionalScrapeConfigs) $ | b64enc | quote }}
{{- end }}
{{- end }}
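Because of the typeOf branch above, additionalScrapeConfigs may be given either as a YAML list (serialized with toYaml) or as a string (passed through tpl, so it may contain template expressions). A hedged example of the list form, with an illustrative job:

prometheus:
  prometheusSpec:
    additionalScrapeConfigs:
      - job_name: example-app
        static_configs:
          - targets: ["example-app.default.svc:8080"]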
@@ -0,0 +1,27 @@
{{- if and .Values.prometheus.networkPolicy.enabled (eq .Values.prometheus.networkPolicy.flavor "cilium") }}
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
spec:
  endpointSelector:
    {{- if .Values.prometheus.networkPolicy.cilium.endpointSelector }}
    {{- toYaml .Values.prometheus.networkPolicy.cilium.endpointSelector | nindent 4 }}
    {{- else }}
    matchExpressions:
      - {key: app.kubernetes.io/name, operator: In, values: [prometheus]}
      - {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.prometheus.crname" . }}]}
    {{- end }}
  {{- if and .Values.prometheus.networkPolicy.cilium .Values.prometheus.networkPolicy.cilium.egress }}
  egress:
    {{ toYaml .Values.prometheus.networkPolicy.cilium.egress | nindent 4 }}
  {{- end }}
  {{- if and .Values.prometheus.networkPolicy.cilium .Values.prometheus.networkPolicy.cilium.ingress }}
  ingress:
    {{ toYaml .Values.prometheus.networkPolicy.cilium.ingress | nindent 4 }}
  {{- end }}
{{- end }}
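A sketch of values that select this CiliumNetworkPolicy flavor; the egress rule is purely illustrative and assumes Cilium's toEndpoints selector:

prometheus:
  networkPolicy:
    enabled: true
    flavor: cilium
    cilium:
      egress:
        - toEndpoints:
            - matchLabels:
                k8s:io.kubernetes.pod.namespace: kube-system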
@@ -0,0 +1,30 @@
{{- if and .Values.prometheus.enabled .Values.global.rbac.create }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
rules:
# These permissions are not in the kube-prometheus repo;
# they're grabbed from https://github.com/prometheus/prometheus/blob/master/documentation/examples/rbac-setup.yml
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - "networking.k8s.io"
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
  verbs: ["get"]
{{- if .Values.prometheus.additionalRulesForClusterRole }}
{{ toYaml .Values.prometheus.additionalRulesForClusterRole | indent 0 }}
{{- end }}
{{- end }}
@@ -0,0 +1,18 @@
{{- if and .Values.prometheus.enabled .Values.global.rbac.create }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
subjects:
  - kind: ServiceAccount
    name: {{ template "kube-prometheus-stack.prometheus.serviceAccountName" . }}
    namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- end }}
@@ -0,0 +1,12 @@
{{- if and .Values.prometheus.prometheusSpec.thanos .Values.prometheus.prometheusSpec.thanos.secretProviderClass }}
---
apiVersion: secrets-store.csi.x-k8s.io/v1alpha1
kind: SecretProviderClass
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
spec:
{{ toYaml .Values.prometheus.prometheusSpec.thanos.secretProviderClass | indent 2 }}
{{- end }}
@@ -0,0 +1,20 @@
{{- if .Values.prometheus.extraSecret.data -}}
{{- $secretName := printf "prometheus-%s-extra" (include "kube-prometheus-stack.fullname" . ) -}}
apiVersion: v1
kind: Secret
metadata:
  name: {{ default $secretName .Values.prometheus.extraSecret.name }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- if .Values.prometheus.extraSecret.annotations }}
  annotations:
{{ toYaml .Values.prometheus.extraSecret.annotations | indent 4 }}
{{- end }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    app.kubernetes.io/component: prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
{{- range $key, $val := .Values.prometheus.extraSecret.data }}
  {{ $key }}: {{ $val | b64enc | quote }}
{{- end }}
{{- end }}
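A minimal sketch for prometheus.extraSecret; note the template base64-encodes each value itself, so values are given in plain text, and an empty name falls back to the generated prometheus-<fullname>-extra:

prometheus:
  extraSecret:
    name: ""
    annotations: {}
    data:
      extra-credentials: some-plaintext-value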
@@ -0,0 +1,77 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.ingress.enabled -}}
{{- $pathType := .Values.prometheus.ingress.pathType | default "ImplementationSpecific" -}}
{{- $serviceName := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" -}}
{{- $servicePort := .Values.prometheus.ingress.servicePort | default .Values.prometheus.service.port -}}
{{- $routePrefix := list .Values.prometheus.prometheusSpec.routePrefix -}}
{{- $paths := .Values.prometheus.ingress.paths | default $routePrefix -}}
{{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}}
{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}}
apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }}
kind: Ingress
metadata:
{{- if .Values.prometheus.ingress.annotations }}
  annotations:
    {{- tpl (toYaml .Values.prometheus.ingress.annotations) . | nindent 4 }}
{{- end }}
  name: {{ $serviceName }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.ingress.labels }}
{{ toYaml .Values.prometheus.ingress.labels | indent 4 }}
{{- end }}
spec:
  {{- if $apiIsStable }}
  {{- if .Values.prometheus.ingress.ingressClassName }}
  ingressClassName: {{ .Values.prometheus.ingress.ingressClassName }}
  {{- end }}
  {{- end }}
  rules:
  {{- if .Values.prometheus.ingress.hosts }}
  {{- range $host := .Values.prometheus.ingress.hosts }}
    - host: {{ tpl $host $ }}
      http:
        paths:
  {{- range $p := $paths }}
          - path: {{ tpl $p $ }}
            {{- if and $pathType $ingressSupportsPathType }}
            pathType: {{ $pathType }}
            {{- end }}
            backend:
              {{- if $apiIsStable }}
              service:
                name: {{ $serviceName }}
                port:
                  number: {{ $servicePort }}
              {{- else }}
              serviceName: {{ $serviceName }}
              servicePort: {{ $servicePort }}
              {{- end }}
  {{- end -}}
  {{- end -}}
  {{- else }}
    - http:
        paths:
  {{- range $p := $paths }}
          - path: {{ tpl $p $ }}
            {{- if and $pathType $ingressSupportsPathType }}
            pathType: {{ $pathType }}
            {{- end }}
            backend:
              {{- if $apiIsStable }}
              service:
                name: {{ $serviceName }}
                port:
                  number: {{ $servicePort }}
              {{- else }}
              serviceName: {{ $serviceName }}
              servicePort: {{ $servicePort }}
              {{- end }}
  {{- end -}}
  {{- end -}}
  {{- if .Values.prometheus.ingress.tls }}
  tls:
{{ tpl (toYaml .Values.prometheus.ingress.tls | indent 4) . }}
  {{- end -}}
{{- end -}}
@@ -0,0 +1,77 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.thanosIngress.enabled }}
{{- $pathType := .Values.prometheus.thanosIngress.pathType | default "" }}
{{- $serviceName := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "thanos-discovery" }}
{{- $thanosPort := .Values.prometheus.thanosIngress.servicePort -}}
{{- $routePrefix := list .Values.prometheus.prometheusSpec.routePrefix }}
{{- $paths := .Values.prometheus.thanosIngress.paths | default $routePrefix -}}
{{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}}
{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}}
apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }}
kind: Ingress
metadata:
{{- if .Values.prometheus.thanosIngress.annotations }}
  annotations:
    {{- tpl (toYaml .Values.prometheus.thanosIngress.annotations) . | nindent 4 }}
{{- end }}
  name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-gateway
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.thanosIngress.labels }}
{{ toYaml .Values.prometheus.thanosIngress.labels | indent 4 }}
{{- end }}
spec:
  {{- if $apiIsStable }}
  {{- if .Values.prometheus.thanosIngress.ingressClassName }}
  ingressClassName: {{ .Values.prometheus.thanosIngress.ingressClassName }}
  {{- end }}
  {{- end }}
  rules:
  {{- if .Values.prometheus.thanosIngress.hosts }}
  {{- range $host := .Values.prometheus.thanosIngress.hosts }}
    - host: {{ tpl $host $ }}
      http:
        paths:
  {{- range $p := $paths }}
          - path: {{ tpl $p $ }}
            {{- if and $pathType $ingressSupportsPathType }}
            pathType: {{ $pathType }}
            {{- end }}
            backend:
              {{- if $apiIsStable }}
              service:
                name: {{ $serviceName }}
                port:
                  number: {{ $thanosPort }}
              {{- else }}
              serviceName: {{ $serviceName }}
              servicePort: {{ $thanosPort }}
              {{- end }}
  {{- end -}}
  {{- end -}}
  {{- else }}
    - http:
        paths:
  {{- range $p := $paths }}
          - path: {{ tpl $p $ }}
            {{- if and $pathType $ingressSupportsPathType }}
            pathType: {{ $pathType }}
            {{- end }}
            backend:
              {{- if $apiIsStable }}
              service:
                name: {{ $serviceName }}
                port:
                  number: {{ $thanosPort }}
              {{- else }}
              serviceName: {{ $serviceName }}
              servicePort: {{ $thanosPort }}
              {{- end }}
  {{- end -}}
  {{- end -}}
  {{- if .Values.prometheus.thanosIngress.tls }}
  tls:
{{ tpl (toYaml .Values.prometheus.thanosIngress.tls | indent 4) . }}
  {{- end -}}
{{- end -}}
@@ -0,0 +1,67 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.servicePerReplica.enabled .Values.prometheus.ingressPerReplica.enabled }}
{{- $pathType := .Values.prometheus.ingressPerReplica.pathType | default "" }}
{{- $count := .Values.prometheus.prometheusSpec.replicas | int -}}
{{- $servicePort := .Values.prometheus.servicePerReplica.port -}}
{{- $ingressValues := .Values.prometheus.ingressPerReplica -}}
{{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}}
{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}}
apiVersion: v1
kind: List
metadata:
  name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-ingressperreplica
  namespace: {{ template "kube-prometheus-stack.namespace" $ }}
items:
{{ range $i, $e := until $count }}
  - kind: Ingress
    apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" $ }}
    metadata:
      name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }}
      namespace: {{ template "kube-prometheus-stack.namespace" $ }}
      labels:
        app: {{ include "kube-prometheus-stack.name" $ }}-prometheus
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
      {{- if $ingressValues.labels }}
{{ toYaml $ingressValues.labels | indent 8 }}
      {{- end }}
      {{- if $ingressValues.annotations }}
      annotations:
        {{- tpl (toYaml $ingressValues.annotations) $ | nindent 8 }}
      {{- end }}
    spec:
      {{- if $apiIsStable }}
      {{- if $ingressValues.ingressClassName }}
      ingressClassName: {{ $ingressValues.ingressClassName }}
      {{- end }}
      {{- end }}
      rules:
        - host: {{ $ingressValues.hostPrefix }}-{{ $i }}.{{ $ingressValues.hostDomain }}
          http:
            paths:
        {{- range $p := $ingressValues.paths }}
              - path: {{ tpl $p $ }}
                {{- if and $pathType $ingressSupportsPathType }}
                pathType: {{ $pathType }}
                {{- end }}
                backend:
                  {{- if $apiIsStable }}
                  service:
                    name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }}
                    port:
                      number: {{ $servicePort }}
                  {{- else }}
                  serviceName: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }}
                  servicePort: {{ $servicePort }}
                  {{- end }}
        {{- end -}}
      {{- if or $ingressValues.tlsSecretName $ingressValues.tlsSecretPerReplica.enabled }}
      tls:
        - hosts:
            - {{ $ingressValues.hostPrefix }}-{{ $i }}.{{ $ingressValues.hostDomain }}
          {{- if $ingressValues.tlsSecretPerReplica.enabled }}
          secretName: {{ $ingressValues.tlsSecretPerReplica.prefix }}-{{ $i }}
          {{- else }}
          secretName: {{ $ingressValues.tlsSecretName }}
          {{- end }}
      {{- end }}
{{- end -}}
{{- end -}}
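A hedged example of the values driving this List; with replicas: 2 it would render Ingresses for prometheus-0.example.com and prometheus-1.example.com (domain and prefix are placeholders):

prometheus:
  prometheusSpec:
    replicas: 2
  servicePerReplica:
    enabled: true
  ingressPerReplica:
    enabled: true
    hostPrefix: prometheus
    hostDomain: example.com
    paths:
      - /
    tlsSecretPerReplica:
      enabled: true
      prefix: prometheus-tls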
@@ -0,0 +1,34 @@
{{- if and .Values.prometheus.networkPolicy.enabled (eq .Values.prometheus.networkPolicy.flavor "kubernetes") }}
apiVersion: {{ template "kube-prometheus-stack.prometheus.networkPolicy.apiVersion" . }}
kind: NetworkPolicy
metadata:
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
spec:
  {{- if .Values.prometheus.networkPolicy.egress }}
  egress:
    {{- toYaml .Values.prometheus.networkPolicy.egress | nindent 4 }}
  {{- end }}
  {{- if .Values.prometheus.networkPolicy.ingress }}
  ingress:
    {{- toYaml .Values.prometheus.networkPolicy.ingress | nindent 4 }}
  {{- end }}
  policyTypes:
    - Egress
    - Ingress
  podSelector:
    {{- if .Values.prometheus.networkPolicy.podSelector }}
    {{- toYaml .Values.prometheus.networkPolicy.podSelector | nindent 4 }}
    {{- else }}
    matchLabels:
      {{- if .Values.prometheus.agentMode }}
      app.kubernetes.io/name: prometheus-agent
      {{- else }}
      app.kubernetes.io/name: prometheus
      {{- end }}
      operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
    {{- end }}
{{- end }}
@@ -0,0 +1,25 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.podDisruptionBudget.enabled }}
apiVersion: {{ include "kube-prometheus-stack.pdb.apiVersion" . }}
kind: PodDisruptionBudget
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
spec:
  {{- if .Values.prometheus.podDisruptionBudget.minAvailable }}
  minAvailable: {{ .Values.prometheus.podDisruptionBudget.minAvailable }}
  {{- end }}
  {{- if .Values.prometheus.podDisruptionBudget.maxUnavailable }}
  maxUnavailable: {{ .Values.prometheus.podDisruptionBudget.maxUnavailable }}
  {{- end }}
  selector:
    matchLabels:
      {{- if .Values.prometheus.agentMode }}
      app.kubernetes.io/name: prometheus-agent
      {{- else }}
      app.kubernetes.io/name: prometheus
      {{- end }}
      operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
{{- end }}
@@ -0,0 +1,38 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.additionalPodMonitors }}
apiVersion: v1
kind: List
items:
{{- range .Values.prometheus.additionalPodMonitors }}
  - apiVersion: monitoring.coreos.com/v1
    kind: PodMonitor
    metadata:
      name: {{ .name }}
      namespace: {{ template "kube-prometheus-stack.namespace" $ }}
      labels:
        app: {{ template "kube-prometheus-stack.name" $ }}-prometheus
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
{{- if .additionalLabels }}
{{ toYaml .additionalLabels | indent 8 }}
{{- end }}
    spec:
      {{- include "servicemonitor.scrapeLimits" . | nindent 6 }}
      podMetricsEndpoints:
{{ toYaml .podMetricsEndpoints | indent 8 }}
      {{- if .jobLabel }}
      jobLabel: {{ .jobLabel }}
      {{- end }}
      {{- if .namespaceSelector }}
      namespaceSelector:
{{ toYaml .namespaceSelector | indent 8 }}
      {{- end }}
      selector:
{{ toYaml .selector | indent 8 }}
      {{- if .podTargetLabels }}
      podTargetLabels:
{{ toYaml .podTargetLabels | indent 8 }}
      {{- end }}
      {{- if .sampleLimit }}
      sampleLimit: {{ .sampleLimit }}
      {{- end }}
{{- end }}
{{- end }}
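A sketch of one additional PodMonitor entry matching the fields consumed above (names and selectors are illustrative):

prometheus:
  additionalPodMonitors:
    - name: example-pod-monitor
      podMetricsEndpoints:
        - port: metrics
          interval: 30s
      namespaceSelector:
        matchNames:
          - default
      selector:
        matchLabels:
          app: example-app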
@@ -0,0 +1,470 @@
{{- if .Values.prometheus.enabled }}
{{- if .Values.prometheus.agentMode }}
apiVersion: monitoring.coreos.com/v1alpha1
kind: PrometheusAgent
{{- else }}
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
{{- end }}
metadata:
  name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.annotations }}
  annotations:
{{ toYaml .Values.prometheus.annotations | indent 4 }}
{{- end }}
spec:
{{- if and (not .Values.prometheus.agentMode) (or .Values.prometheus.prometheusSpec.alertingEndpoints .Values.alertmanager.enabled) }}
  alerting:
    alertmanagers:
{{- if .Values.prometheus.prometheusSpec.alertingEndpoints }}
{{ toYaml .Values.prometheus.prometheusSpec.alertingEndpoints | indent 6 }}
{{- else if .Values.alertmanager.enabled }}
      - namespace: {{ template "kube-prometheus-stack.namespace" . }}
        name: {{ template "kube-prometheus-stack.fullname" . }}-alertmanager
        port: {{ .Values.alertmanager.alertmanagerSpec.portName }}
        {{- if .Values.alertmanager.alertmanagerSpec.routePrefix }}
        pathPrefix: "{{ .Values.alertmanager.alertmanagerSpec.routePrefix }}"
        {{- end }}
        {{- if .Values.alertmanager.alertmanagerSpec.scheme }}
        scheme: {{ .Values.alertmanager.alertmanagerSpec.scheme }}
        {{- end }}
        {{- if .Values.alertmanager.alertmanagerSpec.tlsConfig }}
        tlsConfig:
{{ toYaml .Values.alertmanager.alertmanagerSpec.tlsConfig | indent 10 }}
        {{- end }}
        apiVersion: {{ .Values.alertmanager.apiVersion }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.apiserverConfig }}
  apiserverConfig:
{{ toYaml .Values.prometheus.prometheusSpec.apiserverConfig | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.image }}
  {{- $registry := .Values.global.imageRegistry | default .Values.prometheus.prometheusSpec.image.registry -}}
  {{- if and .Values.prometheus.prometheusSpec.image.tag .Values.prometheus.prometheusSpec.image.sha }}
  image: "{{ $registry }}/{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
  {{- else if .Values.prometheus.prometheusSpec.image.sha }}
  image: "{{ $registry }}/{{ .Values.prometheus.prometheusSpec.image.repository }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
  {{- else if .Values.prometheus.prometheusSpec.image.tag }}
  image: "{{ $registry }}/{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}"
  {{- else }}
  image: "{{ $registry }}/{{ .Values.prometheus.prometheusSpec.image.repository }}"
  {{- end }}
  version: {{ default .Values.prometheus.prometheusSpec.image.tag .Values.prometheus.prometheusSpec.version }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalArgs }}
  additionalArgs:
{{ toYaml .Values.prometheus.prometheusSpec.additionalArgs | indent 4 }}
{{- end -}}
{{- if .Values.prometheus.prometheusSpec.externalLabels }}
  externalLabels:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.externalLabels | indent 4) . }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.prometheusExternalLabelNameClear }}
  prometheusExternalLabelName: ""
{{- else if .Values.prometheus.prometheusSpec.prometheusExternalLabelName }}
  prometheusExternalLabelName: "{{ .Values.prometheus.prometheusSpec.prometheusExternalLabelName }}"
{{- end }}
{{- if .Values.prometheus.prometheusSpec.replicaExternalLabelNameClear }}
  replicaExternalLabelName: ""
{{- else if .Values.prometheus.prometheusSpec.replicaExternalLabelName }}
  replicaExternalLabelName: "{{ .Values.prometheus.prometheusSpec.replicaExternalLabelName }}"
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enableRemoteWriteReceiver }}
  enableRemoteWriteReceiver: {{ .Values.prometheus.prometheusSpec.enableRemoteWriteReceiver }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.externalUrl }}
  externalUrl: "{{ tpl .Values.prometheus.prometheusSpec.externalUrl . }}"
{{- else if and .Values.prometheus.ingress.enabled .Values.prometheus.ingress.hosts }}
  externalUrl: "http://{{ tpl (index .Values.prometheus.ingress.hosts 0) . }}{{ .Values.prometheus.prometheusSpec.routePrefix }}"
{{- else }}
  externalUrl: http://{{ template "kube-prometheus-stack.fullname" . }}-prometheus.{{ template "kube-prometheus-stack.namespace" . }}:{{ .Values.prometheus.service.port }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.nodeSelector }}
  nodeSelector:
{{ toYaml .Values.prometheus.prometheusSpec.nodeSelector | indent 4 }}
{{- end }}
  paused: {{ .Values.prometheus.prometheusSpec.paused }}
  replicas: {{ .Values.prometheus.prometheusSpec.replicas }}
  shards: {{ .Values.prometheus.prometheusSpec.shards }}
  logLevel: {{ .Values.prometheus.prometheusSpec.logLevel }}
  logFormat: {{ .Values.prometheus.prometheusSpec.logFormat }}
  listenLocal: {{ .Values.prometheus.prometheusSpec.listenLocal }}
{{- if not .Values.prometheus.agentMode }}
  enableAdminAPI: {{ .Values.prometheus.prometheusSpec.enableAdminAPI }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.web }}
  web:
{{ toYaml .Values.prometheus.prometheusSpec.web | indent 4 }}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) .Values.prometheus.prometheusSpec.exemplars }}
  exemplars:
{{ toYaml .Values.prometheus.prometheusSpec.exemplars | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enableFeatures }}
  enableFeatures:
  {{- range $enableFeatures := .Values.prometheus.prometheusSpec.enableFeatures }}
    - {{ tpl $enableFeatures $ }}
  {{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.scrapeInterval }}
  scrapeInterval: {{ .Values.prometheus.prometheusSpec.scrapeInterval }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.scrapeTimeout }}
  scrapeTimeout: {{ .Values.prometheus.prometheusSpec.scrapeTimeout }}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) .Values.prometheus.prometheusSpec.evaluationInterval }}
  evaluationInterval: {{ .Values.prometheus.prometheusSpec.evaluationInterval }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.resources }}
  resources:
{{ toYaml .Values.prometheus.prometheusSpec.resources | indent 4 }}
{{- end }}
{{- if not .Values.prometheus.agentMode }}
  retention: {{ .Values.prometheus.prometheusSpec.retention | quote }}
{{- if .Values.prometheus.prometheusSpec.retentionSize }}
  retentionSize: {{ .Values.prometheus.prometheusSpec.retentionSize | quote }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.tsdb }}
  tsdb:
    {{- if .Values.prometheus.prometheusSpec.tsdb.outOfOrderTimeWindow }}
    outOfOrderTimeWindow: {{ .Values.prometheus.prometheusSpec.tsdb.outOfOrderTimeWindow }}
    {{- end }}
{{- end }}
{{- end }}
{{- if eq .Values.prometheus.prometheusSpec.walCompression false }}
  walCompression: false
{{ else }}
  walCompression: true
{{- end }}
{{- if .Values.prometheus.prometheusSpec.routePrefix }}
  routePrefix: {{ .Values.prometheus.prometheusSpec.routePrefix | quote }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.secrets }}
  secrets:
{{ toYaml .Values.prometheus.prometheusSpec.secrets | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.configMaps }}
  configMaps:
{{ toYaml .Values.prometheus.prometheusSpec.configMaps | indent 4 }}
{{- end }}
  serviceAccountName: {{ template "kube-prometheus-stack.prometheus.serviceAccountName" . }}
{{- if .Values.prometheus.prometheusSpec.serviceMonitorSelector }}
  serviceMonitorSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.serviceMonitorSelector | indent 4) . }}
{{ else if .Values.prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues }}
  serviceMonitorSelector:
    matchLabels:
      release: {{ $.Release.Name | quote }}
{{ else }}
  serviceMonitorSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.serviceMonitorNamespaceSelector }}
  serviceMonitorNamespaceSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.serviceMonitorNamespaceSelector | indent 4) . }}
{{ else }}
  serviceMonitorNamespaceSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.podMonitorSelector }}
  podMonitorSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.podMonitorSelector | indent 4) . }}
{{ else if .Values.prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues }}
  podMonitorSelector:
    matchLabels:
      release: {{ $.Release.Name | quote }}
{{ else }}
  podMonitorSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.podMonitorNamespaceSelector }}
  podMonitorNamespaceSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.podMonitorNamespaceSelector | indent 4) . }}
{{ else }}
  podMonitorNamespaceSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.probeSelector }}
  probeSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.probeSelector | indent 4) . }}
{{ else if .Values.prometheus.prometheusSpec.probeSelectorNilUsesHelmValues }}
  probeSelector:
    matchLabels:
      release: {{ $.Release.Name | quote }}
{{ else }}
  probeSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.probeNamespaceSelector }}
  probeNamespaceSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.probeNamespaceSelector | indent 4) . }}
{{ else }}
  probeNamespaceSelector: {}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) (or .Values.prometheus.prometheusSpec.remoteRead .Values.prometheus.prometheusSpec.additionalRemoteRead) }}
  remoteRead:
{{- if .Values.prometheus.prometheusSpec.remoteRead }}
{{ tpl (toYaml .Values.prometheus.prometheusSpec.remoteRead | indent 4) . }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalRemoteRead }}
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteRead | indent 4 }}
{{- end }}
{{- end }}
{{- if (or .Values.prometheus.prometheusSpec.remoteWrite .Values.prometheus.prometheusSpec.additionalRemoteWrite) }}
  remoteWrite:
{{- if .Values.prometheus.prometheusSpec.remoteWrite }}
{{ tpl (toYaml .Values.prometheus.prometheusSpec.remoteWrite | indent 4) . }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalRemoteWrite }}
{{ toYaml .Values.prometheus.prometheusSpec.additionalRemoteWrite | indent 4 }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.securityContext }}
  securityContext:
{{ toYaml .Values.prometheus.prometheusSpec.securityContext | indent 4 }}
{{- end }}
{{- if not .Values.prometheus.agentMode }}
{{- if .Values.prometheus.prometheusSpec.ruleNamespaceSelector }}
  ruleNamespaceSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.ruleNamespaceSelector | indent 4) . }}
{{ else }}
  ruleNamespaceSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.ruleSelector }}
  ruleSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.ruleSelector | indent 4) . }}
{{- else if .Values.prometheus.prometheusSpec.ruleSelectorNilUsesHelmValues }}
  ruleSelector:
    matchLabels:
      release: {{ $.Release.Name | quote }}
{{ else }}
  ruleSelector: {}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.scrapeConfigSelector }}
  scrapeConfigSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.scrapeConfigSelector | indent 4) . }}
{{ else if .Values.prometheus.prometheusSpec.scrapeConfigSelectorNilUsesHelmValues }}
  scrapeConfigSelector:
    matchLabels:
      release: {{ $.Release.Name | quote }}
{{ else }}
  scrapeConfigSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.scrapeConfigNamespaceSelector }}
  scrapeConfigNamespaceSelector:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.scrapeConfigNamespaceSelector | indent 4) . }}
{{ else }}
  scrapeConfigNamespaceSelector: {}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.storageSpec }}
  storage:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.storageSpec | indent 4) . }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.podMetadata }}
  podMetadata:
{{ tpl (toYaml .Values.prometheus.prometheusSpec.podMetadata | indent 4) . }}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) .Values.prometheus.prometheusSpec.query }}
  query:
{{ toYaml .Values.prometheus.prometheusSpec.query | indent 4 }}
{{- end }}
{{- if or .Values.prometheus.prometheusSpec.podAntiAffinity .Values.prometheus.prometheusSpec.affinity }}
  affinity:
{{- if .Values.prometheus.prometheusSpec.affinity }}
{{ toYaml .Values.prometheus.prometheusSpec.affinity | indent 4 }}
{{- end }}
{{- if eq .Values.prometheus.prometheusSpec.podAntiAffinity "hard" }}
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - topologyKey: {{ .Values.prometheus.prometheusSpec.podAntiAffinityTopologyKey }}
          labelSelector:
            matchExpressions:
              - {key: app.kubernetes.io/name, operator: In, values: [prometheus]}
              - {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.prometheus.crname" . }}]}
{{- else if eq .Values.prometheus.prometheusSpec.podAntiAffinity "soft" }}
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            topologyKey: {{ .Values.prometheus.prometheusSpec.podAntiAffinityTopologyKey }}
            labelSelector:
              matchExpressions:
                - {key: app.kubernetes.io/name, operator: In, values: [prometheus]}
                - {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.prometheus.crname" . }}]}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.tolerations }}
  tolerations:
{{ toYaml .Values.prometheus.prometheusSpec.tolerations | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.topologySpreadConstraints }}
  topologySpreadConstraints:
{{ toYaml .Values.prometheus.prometheusSpec.topologySpreadConstraints | indent 4 }}
{{- end }}
{{- if .Values.global.imagePullSecrets }}
  imagePullSecrets:
{{ include "kube-prometheus-stack.imagePullSecrets" . | trim | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalScrapeConfigs }}
  additionalScrapeConfigs:
    name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-scrape-confg
    key: additional-scrape-configs.yaml
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalScrapeConfigsSecret.enabled }}
  additionalScrapeConfigs:
    name: {{ .Values.prometheus.prometheusSpec.additionalScrapeConfigsSecret.name }}
    key: {{ .Values.prometheus.prometheusSpec.additionalScrapeConfigsSecret.key }}
{{- end }}
{{- if not .Values.prometheus.agentMode }}
{{- if or .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret }}
  additionalAlertManagerConfigs:
{{- if .Values.prometheus.prometheusSpec.additionalAlertManagerConfigs }}
    name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-am-confg
    key: additional-alertmanager-configs.yaml
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret }}
    name: {{ .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret.name }}
    key: {{ .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret.key }}
    {{- if hasKey .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret "optional" }}
    optional: {{ .Values.prometheus.prometheusSpec.additionalAlertManagerConfigsSecret.optional }}
    {{- end }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigs }}
  additionalAlertRelabelConfigs:
    name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-am-relabel-confg
    key: additional-alert-relabel-configs.yaml
{{- end }}
{{- if .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigsSecret }}
  additionalAlertRelabelConfigs:
    name: {{ .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigsSecret.name }}
    key: {{ .Values.prometheus.prometheusSpec.additionalAlertRelabelConfigsSecret.key }}
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.containers }}
  containers:
{{ toYaml .Values.prometheus.prometheusSpec.containers | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.initContainers }}
  initContainers:
{{ toYaml .Values.prometheus.prometheusSpec.initContainers | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.priorityClassName }}
  priorityClassName: {{ .Values.prometheus.prometheusSpec.priorityClassName }}
{{- end }}
{{- if not .Values.prometheus.agentMode }}
{{- if .Values.prometheus.prometheusSpec.thanos }}
  thanos:
{{- with (omit .Values.prometheus.prometheusSpec.thanos "objectStorageConfig") }}
{{ toYaml . | indent 4 }}
{{- end }}
{{- if ((.Values.prometheus.prometheusSpec.thanos.objectStorageConfig).existingSecret) }}
    objectStorageConfig:
      key: "{{ .Values.prometheus.prometheusSpec.thanos.objectStorageConfig.existingSecret.key }}"
      name: "{{ .Values.prometheus.prometheusSpec.thanos.objectStorageConfig.existingSecret.name }}"
{{- else if ((.Values.prometheus.prometheusSpec.thanos.objectStorageConfig).secret) }}
    objectStorageConfig:
      key: object-storage-configs.yaml
      name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
{{- end }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.disableCompaction }}
  disableCompaction: {{ .Values.prometheus.prometheusSpec.disableCompaction }}
{{- end }}
{{- end }}
  portName: {{ .Values.prometheus.prometheusSpec.portName }}
{{- if .Values.prometheus.prometheusSpec.volumes }}
  volumes:
{{ toYaml .Values.prometheus.prometheusSpec.volumes | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.volumeMounts }}
  volumeMounts:
{{ toYaml .Values.prometheus.prometheusSpec.volumeMounts | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.arbitraryFSAccessThroughSMs }}
  arbitraryFSAccessThroughSMs:
{{ toYaml .Values.prometheus.prometheusSpec.arbitraryFSAccessThroughSMs | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.overrideHonorLabels }}
  overrideHonorLabels: {{ .Values.prometheus.prometheusSpec.overrideHonorLabels }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.overrideHonorTimestamps }}
  overrideHonorTimestamps: {{ .Values.prometheus.prometheusSpec.overrideHonorTimestamps }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.ignoreNamespaceSelectors }}
  ignoreNamespaceSelectors: {{ .Values.prometheus.prometheusSpec.ignoreNamespaceSelectors }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedNamespaceLabel }}
  enforcedNamespaceLabel: {{ .Values.prometheus.prometheusSpec.enforcedNamespaceLabel }}
{{- $prometheusDefaultRulesExcludedFromEnforce := (include "rules.names" .) | fromYaml }}
{{- if not .Values.prometheus.agentMode }}
  prometheusRulesExcludedFromEnforce:
{{- range $prometheusDefaultRulesExcludedFromEnforce.rules }}
    - ruleNamespace: "{{ template "kube-prometheus-stack.namespace" $ }}"
      ruleName: "{{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) . | trunc 63 | trimSuffix "-" }}"
{{- end }}
{{- if .Values.prometheus.prometheusSpec.prometheusRulesExcludedFromEnforce }}
{{ toYaml .Values.prometheus.prometheusSpec.prometheusRulesExcludedFromEnforce | indent 4 }}
{{- end }}
{{- end }}
  excludedFromEnforcement:
{{- range $prometheusDefaultRulesExcludedFromEnforce.rules }}
    - group: monitoring.coreos.com
      resource: prometheusrules
      namespace: "{{ template "kube-prometheus-stack.namespace" $ }}"
      name: "{{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) . | trunc 63 | trimSuffix "-" }}"
{{- end }}
{{- if .Values.prometheus.prometheusSpec.excludedFromEnforcement }}
{{ tpl (toYaml .Values.prometheus.prometheusSpec.excludedFromEnforcement | indent 4) . }}
{{- end }}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) .Values.prometheus.prometheusSpec.queryLogFile }}
  queryLogFile: {{ .Values.prometheus.prometheusSpec.queryLogFile }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.sampleLimit }}
  sampleLimit: {{ .Values.prometheus.prometheusSpec.sampleLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedKeepDroppedTargets }}
  enforcedKeepDroppedTargets: {{ .Values.prometheus.prometheusSpec.enforcedKeepDroppedTargets }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedSampleLimit }}
  enforcedSampleLimit: {{ .Values.prometheus.prometheusSpec.enforcedSampleLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedTargetLimit }}
  enforcedTargetLimit: {{ .Values.prometheus.prometheusSpec.enforcedTargetLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelLimit }}
  enforcedLabelLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelNameLengthLimit }}
  enforcedLabelNameLengthLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelNameLengthLimit }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.enforcedLabelValueLengthLimit }}
  enforcedLabelValueLengthLimit: {{ .Values.prometheus.prometheusSpec.enforcedLabelValueLengthLimit }}
{{- end }}
{{- if and (not .Values.prometheus.agentMode) .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
  allowOverlappingBlocks: {{ .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.minReadySeconds }}
  minReadySeconds: {{ .Values.prometheus.prometheusSpec.minReadySeconds }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.maximumStartupDurationSeconds }}
  maximumStartupDurationSeconds: {{ .Values.prometheus.prometheusSpec.maximumStartupDurationSeconds }}
{{- end }}
  hostNetwork: {{ .Values.prometheus.prometheusSpec.hostNetwork }}
{{- if .Values.prometheus.prometheusSpec.hostAliases }}
  hostAliases:
{{ toYaml .Values.prometheus.prometheusSpec.hostAliases | indent 4 }}
{{- end }}
{{- if .Values.prometheus.prometheusSpec.tracingConfig }}
  tracingConfig:
{{ toYaml .Values.prometheus.prometheusSpec.tracingConfig | indent 4 }}
{{- end }}
{{- with .Values.prometheus.prometheusSpec.additionalConfig }}
  {{- tpl (toYaml .) $ | nindent 2 }}
{{- end }}
{{- with .Values.prometheus.prometheusSpec.additionalConfigString }}
  {{- tpl . $ | nindent 2 }}
{{- end }}
{{- end }}
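A few values worth calling out against the spec rendering above, as a hedged example: setting serviceMonitorSelectorNilUsesHelmValues to false makes the chart emit an empty serviceMonitorSelector (select all ServiceMonitors), and podAntiAffinity: "hard" produces the requiredDuringSchedulingIgnoredDuringExecution block; the remoteWrite URL is a placeholder:

prometheus:
  prometheusSpec:
    serviceMonitorSelectorNilUsesHelmValues: false
    podAntiAffinity: hard
    podAntiAffinityTopologyKey: kubernetes.io/hostname
    remoteWrite:
      - url: https://metrics.example.com/api/v1/push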
@@ -0,0 +1,22 @@
{{- if and .Values.prometheus.enabled .Values.global.rbac.create .Values.global.rbac.pspEnabled }}
{{- if .Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy" }}
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-psp
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
rules:
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if semverCompare "> 1.15.0-0" $kubeTargetVersion }}
- apiGroups: ['policy']
{{- else }}
- apiGroups: ['extensions']
{{- end }}
  resources: ['podsecuritypolicies']
  verbs: ['use']
  resourceNames:
  - {{ template "kube-prometheus-stack.fullname" . }}-prometheus
{{- end }}
{{- end }}
@@ -0,0 +1,19 @@
{{- if and .Values.prometheus.enabled .Values.global.rbac.create .Values.global.rbac.pspEnabled }}
{{- if .Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy" }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-psp
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus-psp
subjects:
  - kind: ServiceAccount
    name: {{ template "kube-prometheus-stack.prometheus.serviceAccountName" . }}
    namespace: {{ template "kube-prometheus-stack.namespace" . }}
{{- end }}
{{- end }}
@@ -0,0 +1,58 @@
{{- if and .Values.prometheus.enabled .Values.global.rbac.create .Values.global.rbac.pspEnabled }}
{{- if .Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy" }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{- if .Values.global.rbac.pspAnnotations }}
  annotations:
{{ toYaml .Values.global.rbac.pspAnnotations | indent 4 }}
{{- end }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
spec:
  privileged: false
  # Allow core volume types.
  volumes:
    - 'configMap'
    - 'emptyDir'
    - 'projected'
    - 'secret'
    - 'downwardAPI'
    - 'persistentVolumeClaim'
{{- if .Values.prometheus.podSecurityPolicy.volumes }}
{{ toYaml .Values.prometheus.podSecurityPolicy.volumes | indent 4 }}
{{- end }}
  hostNetwork: false
  hostIPC: false
  hostPID: false
  runAsUser:
    # Permits the container to run with root privileges as well.
    rule: 'RunAsAny'
  seLinux:
    # This policy assumes the nodes are using AppArmor rather than SELinux.
    rule: 'RunAsAny'
  supplementalGroups:
    rule: 'MustRunAs'
    ranges:
      # Allow adding the root group.
      - min: 0
        max: 65535
  fsGroup:
    rule: 'MustRunAs'
    ranges:
      # Allow adding the root group.
      - min: 0
        max: 65535
  readOnlyRootFilesystem: false
{{- if .Values.prometheus.podSecurityPolicy.allowedCapabilities }}
  allowedCapabilities:
{{ toYaml .Values.prometheus.podSecurityPolicy.allowedCapabilities | indent 4 }}
{{- end }}
{{- if .Values.prometheus.podSecurityPolicy.allowedHostPaths }}
  allowedHostPaths:
{{ toYaml .Values.prometheus.podSecurityPolicy.allowedHostPaths | indent 4 }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,303 @@
{{- /*
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: alertmanager.rules
rules:
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedReload | default false) }}
- alert: AlertmanagerFailedReload
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerMembersInconsistent | default false) }}
- alert: AlertmanagerMembersInconsistent
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: {{ dig "AlertmanagerMembersInconsistent" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerMembersInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedToSendAlerts | default false) }}
- alert: AlertmanagerFailedToSendAlerts
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |-
(
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
)
> 0.01
for: {{ dig "AlertmanagerFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |-
min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
)
> 0.01
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |-
min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
)
> 0.01
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerConfigInconsistent | default false) }}
- alert: AlertmanagerConfigInconsistent
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |-
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: {{ dig "AlertmanagerConfigInconsistent" "for" "20m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerConfigInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterDown | default false) }}
- alert: AlertmanagerClusterDown
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |-
(
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
)
/
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: {{ dig "AlertmanagerClusterDown" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerClusterDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterCrashlooping | default false) }}
- alert: AlertmanagerClusterCrashlooping
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.alertmanager | indent 8 }}
{{- end }}
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |-
(
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
)
/
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
>= 0.5
for: {{ dig "AlertmanagerClusterCrashlooping" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "AlertmanagerClusterCrashlooping" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
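Each alert above resolves its `for:` and `severity:` through Helm's `dig`, so a per-alert override only needs a matching key under `.Values.customRules`, and a `defaultRules.disabled.<AlertName>` flag removes the rule entirely. A minimal values.yaml sketch (key names follow the template above; the chosen values are illustrative):

defaultRules:
  disabled:
    AlertmanagerClusterCrashlooping: true  # render no such rule at all
customRules:
  AlertmanagerFailedReload:
    for: 15m           # overrides the 10m default via `dig`
    severity: warning  # overrides the critical default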
@@ -0,0 +1,57 @@
{{- /*
Generated from 'config-reloaders' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheusOperator-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.configReloaders }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "config-reloaders" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: config-reloaders
rules:
{{- if not (.Values.defaultRules.disabled.ConfigReloaderSidecarErrors | default false) }}
- alert: ConfigReloaderSidecarErrors
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.configReloaders }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.configReloaders | indent 8 }}
{{- end }}
description: 'Errors encountered while the {{`{{`}}$labels.pod{{`}}`}} config-reloader sidecar attempts to sync config in {{`{{`}}$labels.namespace{{`}}`}} namespace.

As a result, configuration for service running in {{`{{`}}$labels.pod{{`}}`}} may be stale and cannot be updated anymore.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: {{ dig "ConfigReloaderSidecarErrors" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "ConfigReloaderSidecarErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
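The group above also shows the group-scoped hooks: `additionalRuleGroupLabels.configReloaders` and `additionalRuleGroupAnnotations.configReloaders` apply only to this group, while the `additionalRuleLabels`/`additionalRuleAnnotations` counterparts apply to every generated rule. A values.yaml sketch (the label keys are illustrative):

defaultRules:
  additionalRuleLabels:
    team: platform        # attached to every default rule
  additionalRuleGroupLabels:
    configReloaders:
      escalation: low     # attached only to config-reloaders rules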
@@ -0,0 +1,459 @@
{{- /*
Generated from 'etcd' group from https://github.com/etcd-io/etcd.git
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: etcd
rules:
{{- if not (.Values.defaultRules.disabled.etcdMembersDown | default false) }}
- alert: etcdMembersDown
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
summary: etcd cluster members are down.
expr: |-
max without (endpoint) (
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: {{ dig "etcdMembersDown" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdMembersDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdInsufficientMembers | default false) }}
- alert: etcdInsufficientMembers
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: {{ dig "etcdInsufficientMembers" "for" "3m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdInsufficientMembers" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdNoLeader | default false) }}
- alert: etcdNoLeader
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: {{ dig "etcdNoLeader" "for" "1m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdNoLeader" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfLeaderChanges | default false) }}
- alert: etcdHighNumberOfLeaderChanges
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: {{ dig "etcdHighNumberOfLeaderChanges" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighNumberOfLeaderChanges" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdGRPCRequestsSlow | default false) }}
- alert: etcdGRPCRequestsSlow
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
summary: etcd grpc requests are slow
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: {{ dig "etcdGRPCRequestsSlow" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdGRPCRequestsSlow" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdMemberCommunicationSlow | default false) }}
- alert: etcdMemberCommunicationSlow
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: {{ dig "etcdMemberCommunicationSlow" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdMemberCommunicationSlow" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedProposals | default false) }}
- alert: etcdHighNumberOfFailedProposals
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: {{ dig "etcdHighNumberOfFailedProposals" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighNumberOfFailedProposals" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
- alert: etcdHighFsyncDurations
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighFsyncDurations" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
- alert: etcdHighFsyncDurations
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighFsyncDurations" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighCommitDurations | default false) }}
- alert: etcdHighCommitDurations
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: {{ dig "etcdHighCommitDurations" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdHighCommitDurations" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdDatabaseQuotaLowSpace | default false) }}
- alert: etcdDatabaseQuotaLowSpace
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: {{ dig "etcdDatabaseQuotaLowSpace" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdDatabaseQuotaLowSpace" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdExcessiveDatabaseGrowth | default false) }}
- alert: etcdExcessiveDatabaseGrowth
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: {{ dig "etcdExcessiveDatabaseGrowth" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdExcessiveDatabaseGrowth" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdDatabaseHighFragmentationRatio | default false) }}
- alert: etcdDatabaseHighFragmentationRatio
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.etcd }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.etcd | indent 8 }}
{{- end }}
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary: etcd database size in use is less than 50% of the actual allocated storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: {{ dig "etcdDatabaseHighFragmentationRatio" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "etcdDatabaseHighFragmentationRatio" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.etcd }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
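Note that the warning and critical variants of etcdHighNumberOfFailedGRPCRequests (and of etcdHighFsyncDurations) are guarded by the same `defaultRules.disabled` key, so disabling one removes both. The whole group is additionally gated on `kubeEtcd.enabled` and `defaultRules.rules.etcd`. A values.yaml sketch:

kubeEtcd:
  enabled: true    # the etcd group renders only when this is set
defaultRules:
  rules:
    etcd: true     # master switch for the group above
  disabled:
    etcdHighNumberOfFailedGRPCRequests: true  # drops both the >1% and >5% variants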
@@ -0,0 +1,125 @@
{{- /*
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: general.rules
rules:
{{- if not (.Values.defaultRules.disabled.TargetDown | default false) }}
- alert: TargetDown
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.general }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.general | indent 8 }}
{{- end }}
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
for: {{ dig "TargetDown" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: {{ dig "TargetDown" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.Watchdog | default false) }}
- alert: Watchdog
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.general }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.general | indent 8 }}
{{- end }}
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.

This alert is always firing, therefore it should always be firing in Alertmanager

and always fire against a receiver. There are integrations with various notification

mechanisms that send a notification when this alert is not firing. For example the

"DeadMansSnitch" integration in PagerDuty.

'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: {{ dig "Watchdog" "severity" "none" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.InfoInhibitor | default false) }}
- alert: InfoInhibitor
annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.general }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.general | indent 8 }}
{{- end }}
description: 'This is an alert that is used to inhibit info alerts.

By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with

other alerts.

This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a

severity of ''warning'' or ''critical'' starts firing on the same namespace.

This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".

'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: {{ dig "InfoInhibitor" "severity" "none" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.general }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
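The InfoInhibitor description above expects a null receiver plus an inhibition rule on the Alertmanager side. A minimal Alertmanager config sketch that satisfies it (receiver name and routing are illustrative; the chart ships a similar default):

route:
  routes:
    - receiver: "null"          # swallow the inhibitor alert itself
      matchers:
        - alertname = "InfoInhibitor"
inhibit_rules:
  - source_matchers:
      - alertname = "InfoInhibitor"
    target_matchers:
      - severity = "info"       # mute info alerts while the source fires
    equal:
      - namespace               # only within the same namespace
receivers:
  - name: "null"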
@@ -0,0 +1,43 @@
{{- /*
Generated from 'k8s.rules.container-cpu-usage-seconds-total' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerCpuUsageSecondsTotal }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-cpu-usage-seconds-total" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
rules:
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (
1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
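The recording rule above pre-joins cAdvisor CPU usage with `kube_pod_info` to pick up the `node` label, so downstream consumers query one cheap series instead of re-running the join. A hypothetical ad-hoc query built on it, e.g. the ten busiest pods by CPU:

topk(10, sum by (namespace, pod) (
  node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
))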
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-cache' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryCache }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-cache" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_cache
rules:
- expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
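Throughout these groups, the `range` over `defaultRules.additionalAggregationLabels` splices extra labels into every `by (...)`/`on (...)` clause. For example, with `additionalAggregationLabels: ["region"]` in values.yaml, the join in the rule above would render as (illustrative):

* on (region,cluster, namespace, pod) group_left(node) topk by (region,cluster, namespace, pod) (1,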
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-rss' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryRss }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-rss" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_rss
rules:
- expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
|
||||
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-swap' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemorySwap }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-swap" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_swap
rules:
- expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-working-set-bytes' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryWorkingSetBytes }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-working-set-bytes" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_working_set_bytes
rules:
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryWorkingSetBytes }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryWorkingSetBytes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
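Each of the four container-memory rule files above is gated on its own flag under defaultRules.rules, so individual groups can be switched off without disabling the default rules as a whole. A sketch, assuming a hypothetical override:

  # values.yaml (hypothetical override, for illustration only)
  defaultRules:
    create: true
    rules:
      k8sContainerMemorySwap: false

With that override the container-memory-swap PrometheusRule is simply not rendered, while the cache, rss and working-set-bytes groups remain.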
@@ -0,0 +1,168 @@
{{- /*
Generated from 'k8s.rules.container-resource' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerResource }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-resource" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_resource
rules:
- expr: |-
kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
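The recurring factor in the container-resource rules above, * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~"Pending|Running"} == 1)), restricts requests and limits to pods that are currently pending or running; the max by collapses duplicate phase series so the join stays many-to-one. For illustration only, with no extra aggregation labels and assuming the kube-state-metrics job name resolves to "kube-state-metrics", the first rule renders roughly as:

  kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
  group_left() max by (namespace, pod, cluster) (
    (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
  )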
@@ -0,0 +1,107 @@
{{- /*
Generated from 'k8s.rules.pod-owner' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sPodOwner }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.pod-owner" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.pod_owner
rules:
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) group_left(owner_name) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) (
1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace, owner_name) (
kube_replicaset_owner{job="{{ $kubeStateMetricsJob }}"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace_workload_pod:kube_pod_owner:relabel
{{- end }}
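The first pod-owner rule above walks ReplicaSet ownership up one level: kube_pod_owner maps pod to ReplicaSet, the nested join against kube_replicaset_owner pulls in the ReplicaSet's own owner via group_left(owner_name), topk (1, ...) keeps that join one-to-one, and the outer label_replace copies the owner into a workload label. For illustration only, a pod owned by ReplicaSet web-7d4b9c that is in turn owned by Deployment web would be recorded roughly as:

  namespace_workload_pod:kube_pod_owner:relabel{namespace="prod", workload="web", workload_type="deployment", pod="web-7d4b9c-xk2lp"} 1

(the namespace, ReplicaSet and pod names here are hypothetical).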
@@ -0,0 +1,273 @@
{{- /*
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-availability.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- interval: 3m
name: kube-apiserver-availability.rules
rules:
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: code:apiserver_request_total:increase30d
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: code:apiserver_request_total:increase30d
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{job="apiserver"}[1h]))
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
1 - (
(
# write too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
) +
(
# read too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
) +
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d)
labels:
verb: all
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:availability30d
- expr: |-
1 - (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
# too slow
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:availability30d
- expr: |-
1 - (
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:availability30d
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: code_resource:apiserver_request_total:rate5m
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: code_resource:apiserver_request_total:rate5m
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
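The availability rules above extrapolate 30-day request totals from hourly increases rather than querying a raw 30-day range: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 takes the mean hourly increase over the window and scales it by the 720 hours it contains. As a worked example, if the apiserver averaged 1,000 read requests per hour over the last 30 days, the rule records 1,000 * 24 * 30 = 720,000 as code_verb:apiserver_request_total:increase30d.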
@@ -0,0 +1,440 @@
{{- /*
Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverBurnrate }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: kube-apiserver-burnrate.rules
rules:
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate1d
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate1h
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate2h
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate30m
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate3d
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate5m
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
(
(
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
or
vector(0)
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
)
)
+
# errors
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate6h
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate1d
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate1h
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
)
+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: apiserver_request:burnrate2h
- expr: |-
(
(
# too slow
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
|
||||
-
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
|
||||
-
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
|
||||
-
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
record: apiserver_request:burnrate6h
|
||||
{{- end }}
|
||||
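{{- /*
Editor's note (a sketch, not part of the generated file): each burnrate rule above
records, for one window, the fraction of read or write apiserver requests that were
either too slow (the count minus the in-SLO latency bucket) or returned a 5xx,
divided by all requests in that window. The windows (5m, 30m, 1h, 2h, 6h, 1d, 3d)
pair up as short/long windows in the kube-apiserver-slos alerts further down.
Assuming the chart's documented values schema, extra aggregation labels can be
threaded into every "sum by (...)" clause from values, e.g.

  defaultRules:
    additionalAggregationLabels: ["team", "region"]

which renders as "sum by (team,region,cluster) (...)".
*/ -}}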
@@ -0,0 +1,53 @@
{{- /*
Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverHistogram }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-apiserver-histogram.rules
    rules:
    - expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
      labels:
        quantile: '0.99'
        verb: read
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverHistogram }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverHistogram }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
      labels:
        quantile: '0.99'
        verb: write
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverHistogram }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverHistogram }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
{{- end }}
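{{- /*
Editor's note: the two rules above record the p99 request latency per resource,
split into read (LIST|GET) and write (POST|PUT|PATCH|DELETE) verbs. The trailing
"> 0" comparison appears intended to drop series with no observed traffic, since a
comparison against NaN or zero filters those out of the recorded result.
*/ -}}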
@@ -0,0 +1,159 @@
{{- /*
Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-apiserver-slos
    rules:
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
    - alert: KubeAPIErrorBudgetBurn
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos | indent 8 }}
{{- end }}
        description: The API server is burning too much error budget.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
        and
        sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
      for: {{ dig "KubeAPIErrorBudgetBurn" "for" "2m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        long: 1h
        severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }}
        short: 5m
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
    - alert: KubeAPIErrorBudgetBurn
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos | indent 8 }}
{{- end }}
        description: The API server is burning too much error budget.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
        and
        sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
      for: {{ dig "KubeAPIErrorBudgetBurn" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        long: 6h
        severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }}
        short: 30m
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
    - alert: KubeAPIErrorBudgetBurn
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos | indent 8 }}
{{- end }}
        description: The API server is burning too much error budget.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
        and
        sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
      for: {{ dig "KubeAPIErrorBudgetBurn" "for" "1h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        long: 1d
        severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }}
        short: 2h
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
    - alert: KubeAPIErrorBudgetBurn
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeApiserverSlos | indent 8 }}
{{- end }}
        description: The API server is burning too much error budget.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
        and
        sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
      for: {{ dig "KubeAPIErrorBudgetBurn" "for" "3h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        long: 3d
        severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }}
        short: 6h
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
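{{- /*
Editor's note (illustrative, assuming the standard 30-day SLO window with a 1%
error budget, i.e. the 0.01000 factor): the four alerts above implement
multiwindow, multi-burn-rate alerting. Burning at 14.4x for 1h consumes about 2%
of the monthly budget, 6x over 6h about 5%, 3x over 1d about 10%, and 1x over 3d
about 10%; the faster pairs page (critical), the slower pairs ticket (warning).
Each short window gates the long one so alerts resolve quickly once the burn
stops. The dig lookups allow per-alert overrides from values, e.g.

  customRules:
    KubeAPIErrorBudgetBurn:
      for: 5m
      severity: warning
*/ -}}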
@@ -0,0 +1,49 @@
{{- /*
Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusGeneral }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-prometheus-general.rules
    rules:
    - expr: count without(instance, pod, node) (up == 1)
      record: count:up1
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusGeneral }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusGeneral }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: count without(instance, pod, node) (up == 0)
      record: count:up0
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusGeneral }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusGeneral }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
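{{- /*
Editor's note: count:up1 and count:up0 above simply count healthy and unhealthy
scrape targets per job; dropping instance, pod, and node keeps the recorded series
stable across pod restarts and reschedules.
*/ -}}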
@@ -0,0 +1,93 @@
{{- /*
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
      record: instance:node_cpu:rate:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
      record: cluster:node_cpu:sum_rate5m
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
      record: cluster:node_cpu:ratio
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubePrometheusNodeRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
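{{- /*
Editor's note: instance:node_cpu:ratio above divides the non-idle CPU rate by the
per-instance CPU count, so a fully busy 8-core node reads 1.0 rather than 8.0;
cluster:node_cpu:ratio at the end applies the same normalization across all nodes
by reusing the cluster:node_cpu:sum_rate5m series recorded just before it.
*/ -}}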
@@ -0,0 +1,135 @@
{{- /*
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeSchedulerRecording }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-scheduler.rules
    rules:
    - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.99'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.99'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.99'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.9'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.9'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.9'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.5'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.5'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: '0.5'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerRecording }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
{{- end }}
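{{- /*
Editor's note: the group above records the 0.99, 0.9, and 0.5 quantiles of the
scheduler's end-to-end, algorithm, and binding latency histograms; aggregating
without instance and pod keeps the recorded series stable across scheduler
restarts and leader changes.
*/ -}}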
@@ -0,0 +1,152 @@
{{- /*
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubeStateMetrics-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kube-state-metrics
    rules:
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsListErrors | default false) }}
    - alert: KubeStateMetricsListErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics | indent 8 }}
{{- end }}
        description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors
        summary: kube-state-metrics is experiencing errors in list operations.
      expr: |-
        (sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          /
        sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
        > 0.01
      for: {{ dig "KubeStateMetricsListErrors" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStateMetricsListErrors" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsWatchErrors | default false) }}
    - alert: KubeStateMetricsWatchErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics | indent 8 }}
{{- end }}
        description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors
        summary: kube-state-metrics is experiencing errors in watch operations.
      expr: |-
        (sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          /
        sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
        > 0.01
      for: {{ dig "KubeStateMetricsWatchErrors" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStateMetricsWatchErrors" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardingMismatch | default false) }}
    - alert: KubeStateMetricsShardingMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics | indent 8 }}
{{- end }}
        description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch
        summary: kube-state-metrics sharding is misconfigured.
      expr: stdvar (kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0
      for: {{ dig "KubeStateMetricsShardingMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStateMetricsShardingMismatch" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardsMissing | default false) }}
    - alert: KubeStateMetricsShardsMissing
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeStateMetrics | indent 8 }}
{{- end }}
        description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing
        summary: kube-state-metrics shards are missing.
      expr: |-
        2^max(kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1
          -
        sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="{{ $kubeStateMetricsJob }}"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
        != 0
      for: {{ dig "KubeStateMetricsShardsMissing" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStateMetricsShardsMissing" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
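{{- /*
Editor's note (worked example): KubeStateMetricsShardsMissing treats the shard set
as a bitmask. With total_shards=4, 2^4 - 1 = 15, and a complete set of ordinals
contributes 2^0 + 2^1 + 2^2 + 2^3 = 15, so the difference is 0; if shard 2 is down,
the difference is 2^2 = 4 and the alert fires. The companion ShardingMismatch alert
instead checks that every pod reports the same --total-shards, via stdvar != 0.
*/ -}}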
@@ -0,0 +1,63 @@
{{- /*
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubelet.enabled .Values.defaultRules.rules.kubelet }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubelet.rules
    rules:
    - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.99'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.9'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.5'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubelet }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
{{- end }}
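{{- /*
Editor's note: PLEG is the kubelet's pod lifecycle event generator; the rules
above record its relist latency quantiles, and the join onto kubelet_node_name
with group_left(node) attaches a node label so the result identifies the node by
name rather than only by its scrape instance address.
*/ -}}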
@@ -0,0 +1,568 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
|
||||
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodCrashLooping | default false) }}
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1
|
||||
for: {{ dig "KubePodCrashLooping" "for" "15m" .Values.customRules }}
|
||||
{{- with .Values.defaultRules.keepFiringFor }}
|
||||
keep_firing_for: "{{ . }}"
|
||||
{{- end }}
|
||||
labels:
|
||||
severity: {{ dig "KubePodCrashLooping" "severity" "warning" .Values.customRules }}
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodNotReady | default false) }}
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
|
||||
{{- end }}
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
|
||||
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
|
||||
kube_pod_status_phase{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"}
|
||||
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left(owner_kind) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
|
||||
1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: {{ dig "KubePodNotReady" "for" "15m" .Values.customRules }}
|
||||
{{- with .Values.defaultRules.keepFiringFor }}
|
||||
keep_firing_for: "{{ . }}"
|
||||
{{- end }}
|
||||
labels:
|
||||
severity: {{ dig "KubePodNotReady" "severity" "warning" .Values.customRules }}
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDeploymentGenerationMismatch | default false) }}
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
|
||||
{{- end }}
|
||||
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch
        summary: Deployment generation mismatch due to possible roll-back
      expr: |-
        kube_deployment_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_deployment_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
      for: {{ dig "KubeDeploymentGenerationMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDeploymentGenerationMismatch" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeDeploymentReplicasMismatch | default false) }}
    - alert: KubeDeploymentReplicasMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch
        summary: Deployment has not matched the expected number of replicas.
      expr: |-
        (
          kube_deployment_spec_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
            >
          kube_deployment_status_replicas_available{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
        ) and (
          changes(kube_deployment_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[10m])
            ==
          0
        )
      for: {{ dig "KubeDeploymentReplicasMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDeploymentReplicasMismatch" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeDeploymentRolloutStuck | default false) }}
    - alert: KubeDeploymentRolloutStuck
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck
        summary: Deployment rollout is not progressing.
      expr: |-
        kube_deployment_status_condition{condition="Progressing", status="false",job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
        != 0
      for: {{ dig "KubeDeploymentRolloutStuck" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDeploymentRolloutStuck" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetReplicasMismatch | default false) }}
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch
        summary: StatefulSet has not matched the expected number of replicas.
      expr: |-
        (
          kube_statefulset_status_replicas_ready{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
            !=
          kube_statefulset_status_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
        ) and (
          changes(kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[10m])
            ==
          0
        )
      for: {{ dig "KubeStatefulSetReplicasMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStatefulSetReplicasMismatch" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetGenerationMismatch | default false) }}
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch
        summary: StatefulSet generation mismatch due to possible roll-back
      expr: |-
        kube_statefulset_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_statefulset_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
      for: {{ dig "KubeStatefulSetGenerationMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStatefulSetGenerationMismatch" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetUpdateNotRolledOut | default false) }}
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout
        summary: StatefulSet update has not been rolled out.
      expr: |-
        (
          max without (revision) (
            kube_statefulset_status_current_revision{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              unless
            kube_statefulset_status_update_revision{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          )
            *
          (
            kube_statefulset_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              !=
            kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          )
        ) and (
          changes(kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m])
            ==
          0
        )
      for: {{ dig "KubeStatefulSetUpdateNotRolledOut" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeStatefulSetUpdateNotRolledOut" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetRolloutStuck | default false) }}
    - alert: KubeDaemonSetRolloutStuck
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck
        summary: DaemonSet rollout is stuck.
      expr: |-
        (
          (
            kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              !=
            kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          ) or (
            kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              !=
            0
          ) or (
            kube_daemonset_status_updated_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              !=
            kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          ) or (
            kube_daemonset_status_number_available{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
              !=
            kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          )
        ) and (
          changes(kube_daemonset_status_updated_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m])
            ==
          0
        )
      for: {{ dig "KubeDaemonSetRolloutStuck" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDaemonSetRolloutStuck" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeContainerWaiting | default false) }}
    - alert: KubeContainerWaiting
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container {{`}}`}} has been in a waiting state for longer than 1 hour.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting
        summary: Pod container waiting longer than 1 hour
      expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) > 0
      for: {{ dig "KubeContainerWaiting" "for" "1h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeContainerWaiting" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetNotScheduled | default false) }}
    - alert: KubeDaemonSetNotScheduled
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled
        summary: DaemonSet pods are not scheduled.
      expr: |-
        kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          -
        kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0
      for: {{ dig "KubeDaemonSetNotScheduled" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDaemonSetNotScheduled" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetMisScheduled | default false) }}
    - alert: KubeDaemonSetMisScheduled
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled
        summary: DaemonSet pods are misscheduled.
      expr: kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0
      for: {{ dig "KubeDaemonSetMisScheduled" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeDaemonSetMisScheduled" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeJobNotCompleted | default false) }}
    - alert: KubeJobNotCompleted
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted
        summary: Job did not complete in time
      expr: |-
        time() - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, job_name, cluster) (kube_job_status_start_time{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          and
        kube_job_status_active{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0) > 43200
      labels:
        severity: {{ dig "KubeJobNotCompleted" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeJobFailed | default false) }}
    - alert: KubeJobFailed
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed
        summary: Job failed to complete.
      expr: kube_job_failed{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0
      for: {{ dig "KubeJobFailed" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeJobFailed" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeHpaReplicasMismatch | default false) }}
    - alert: KubeHpaReplicasMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch
        summary: HPA has not matched desired number of replicas.
      expr: |-
        (kube_horizontalpodautoscaler_status_desired_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"})
          and
        (kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          >
        kube_horizontalpodautoscaler_spec_min_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"})
          and
        (kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          <
        kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"})
          and
        changes(kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
      for: {{ dig "KubeHpaReplicasMismatch" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeHpaReplicasMismatch" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeHpaMaxedOut | default false) }}
    - alert: KubeHpaMaxedOut
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesApps | indent 8 }}
{{- end }}
        description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout
        summary: HPA is running at max replicas
      expr: |-
        kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
          ==
        kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}
      for: {{ dig "KubeHpaMaxedOut" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeHpaMaxedOut" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
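{{- /*
Illustrative sketch, not generated content: every alert in this group resolves its `for`
duration and `severity` through Sprig's `dig`, and any alert can be disabled outright, so the
generated file never needs hand-editing. Assuming the chart's default value names, a
values.yaml override might look like:

  defaultRules:
    keepFiringFor: 5m
    disabled:
      KubeHpaMaxedOut: true
  customRules:
    KubePodNotReady:
      for: 30m
      severity: critical

`dig "KubePodNotReady" "for" "15m" .Values.customRules` walks the customRules map and falls
back to "15m" when no override exists; `defaultRules.disabled.<AlertName>: true` skips
rendering that alert block entirely.
*/ -}}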
@@ -0,0 +1,282 @@
{{- /*
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-resources
    rules:
{{- if not (.Values.defaultRules.disabled.KubeCPUOvercommit | default false) }}
    - alert: KubeCPUOvercommit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="{{ $kubeStateMetricsJob }}",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
        and
        (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
      for: {{ dig "KubeCPUOvercommit" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeCPUOvercommit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeMemoryOvercommit | default false) }}
    - alert: KubeMemoryOvercommit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
        and
        (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
      for: {{ dig "KubeMemoryOvercommit" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeMemoryOvercommit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeCPUQuotaOvercommit | default false) }}
    - alert: KubeCPUQuotaOvercommit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        sum(min without(resource) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          /
        sum(kube_node_status_allocatable{resource="cpu", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          > 1.5
      for: {{ dig "KubeCPUQuotaOvercommit" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeCPUQuotaOvercommit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeMemoryQuotaOvercommit | default false) }}
    - alert: KubeMemoryQuotaOvercommit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Namespaces.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        sum(min without(resource) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard", resource=~"(memory|requests.memory)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          /
        sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
          > 1.5
      for: {{ dig "KubeMemoryQuotaOvercommit" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeMemoryQuotaOvercommit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeQuotaAlmostFull | default false) }}
    - alert: KubeQuotaAlmostFull
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull
        summary: Namespace quota is going to be full.
      expr: |-
        kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"}
          / ignoring(instance, job, type)
        (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0)
          > 0.9 < 1
      for: {{ dig "KubeQuotaAlmostFull" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeQuotaAlmostFull" "severity" "info" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeQuotaFullyUsed | default false) }}
    - alert: KubeQuotaFullyUsed
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused
        summary: Namespace quota is fully used.
      expr: |-
        kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"}
          / ignoring(instance, job, type)
        (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0)
          == 1
      for: {{ dig "KubeQuotaFullyUsed" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeQuotaFullyUsed" "severity" "info" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeQuotaExceeded | default false) }}
    - alert: KubeQuotaExceeded
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded
        summary: Namespace quota has exceeded the limits.
      expr: |-
        kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"}
          / ignoring(instance, job, type)
        (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0)
          > 1
      for: {{ dig "KubeQuotaExceeded" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubeQuotaExceeded" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.CPUThrottlingHigh | default false) }}
    - alert: CPUThrottlingHigh
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesResources | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh
        summary: Processes experience elevated CPU throttling.
      expr: |-
        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, container, pod, namespace)
          /
        sum(increase(container_cpu_cfs_periods_total{}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, container, pod, namespace)
          > ( 25 / 100 )
      for: {{ dig "CPUThrottlingHigh" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "CPUThrottlingHigh" "severity" "info" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
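{{- /*
Illustrative sketch, not generated content: `additionalAggregationLabels` is spliced into
every `by (...)` clause in this group. With, for example:

  defaultRules:
    additionalAggregationLabels: ["region"]

the KubeCPUQuotaOvercommit expression would render roughly as (assuming the
kube-state-metrics job renders as "kube-state-metrics"):

  sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (region,cluster)
    /
  sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (region,cluster)
    > 1.5

so quota overcommit is evaluated per region and per cluster rather than per cluster alone.
*/ -}}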
@@ -0,0 +1,217 @@
{{- /*
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-storage
    rules:
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeFillingUp | default false) }}
    - alert: KubePersistentVolumeFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }}
{{- end }}
        description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |-
        (
          kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
        ) < 0.03
        and
        kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: {{ dig "KubePersistentVolumeFillingUp" "for" "1m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeFillingUp | default false) }}
    - alert: KubePersistentVolumeFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }}
{{- end }}
        description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |-
        (
          kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
        ) < 0.15
        and
        kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: {{ dig "KubePersistentVolumeFillingUp" "for" "1h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeInodesFillingUp | default false) }}
    - alert: KubePersistentVolumeInodesFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }}
{{- end }}
        description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup
        summary: PersistentVolumeInodes are filling up.
      expr: |-
        (
          kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
            /
          kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
        ) < 0.03
        and
        kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeInodesFillingUp | default false) }}
    - alert: KubePersistentVolumeInodesFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }}
{{- end }}
        description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup
        summary: PersistentVolumeInodes are filling up.
      expr: |-
        (
          kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
            /
          kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
        ) < 0.15
        and
        kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
        and
        predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubePersistentVolumeErrors | default false) }}
    - alert: KubePersistentVolumeErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }}
{{- end }}
        description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors
        summary: PersistentVolume is having issues with provisioning.
      expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="{{ $kubeStateMetricsJob }}"} > 0
      for: {{ dig "KubePersistentVolumeErrors" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "KubePersistentVolumeErrors" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
|
||||
@@ -0,0 +1,193 @@
|
||||
{{- /*
|
||||
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }}
|
||||
{{- with .Values.defaultRules.keepFiringFor }}
|
||||
keep_firing_for: "{{ . }}"
|
||||
{{- end }}
|
||||
labels:
|
||||
severity: {{ dig "KubeClientCertificateExpiration" "severity" "warning" .Values.customRules }}
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }}
|
||||
{{- with .Values.defaultRules.keepFiringFor }}
|
||||
keep_firing_for: "{{ . }}"
|
||||
{{- end }}
|
||||
labels:
|
||||
severity: {{ dig "KubeClientCertificateExpiration" "severity" "critical" .Values.customRules }}
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
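{{- /*
Note: the two KubeClientCertificateExpiration rules above form a warning/critical
pair over the same histogram. The first fires below 604800 seconds of remaining
validity (7 * 24 * 3600, i.e. seven days), the second below 86400 seconds
(24 hours). Both share the `defaultRules.disabled.KubeClientCertificateExpiration`
key, so disabling the alert removes both thresholds at once.
*/ -}}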
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIErrors | default false) }}
    - alert: KubeAggregatedAPIErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors
        summary: Kubernetes aggregated API has reported errors.
      expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
      labels:
        severity: {{ dig "KubeAggregatedAPIErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIDown | default false) }}
    - alert: KubeAggregatedAPIDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown
        summary: Kubernetes aggregated API is down.
      expr: (1 - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
      for: {{ dig "KubeAggregatedAPIDown" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeAggregatedAPIDown" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if .Values.kubeApiServer.enabled }}
{{- if not (.Values.defaultRules.disabled.KubeAPIDown | default false) }}
    - alert: KubeAPIDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="apiserver"} == 1)
      for: {{ dig "KubeAPIDown" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeAPIDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeAPITerminatedRequests | default false) }}
    - alert: KubeAPITerminatedRequests
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests
        summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
      expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
      for: {{ dig "KubeAPITerminatedRequests" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeAPITerminatedRequests" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
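{{- /*
Note: the KubeAPITerminatedRequests expression above is the ratio
terminated / (served + terminated) over a 10m window, so the alert fires once
more than 20% of incoming apiserver requests were terminated. The terminations
series is added back into the denominator to approximate total incoming traffic.
*/ -}}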
{{- end }}
@@ -0,0 +1,57 @@
{{- /*
Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled .Values.defaultRules.rules.kubeControllerManager }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system-controller-manager
    rules:
{{- if .Values.kubeControllerManager.enabled }}
{{- if not (.Values.defaultRules.disabled.KubeControllerManagerDown | default false) }}
    - alert: KubeControllerManagerDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeControllerManager }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeControllerManager | indent 8 }}
{{- end }}
        description: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-controller-manager"} == 1)
      for: {{ dig "KubeControllerManagerDown" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeControllerManagerDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeControllerManager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeControllerManager }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,57 @@
{{- /*
Generated from 'kubernetes-system-kube-proxy' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeProxy.enabled .Values.defaultRules.rules.kubeProxy }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kube-proxy" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system-kube-proxy
    rules:
{{- if not (.Values.defaultRules.disabled.KubeProxyDown | default false) }}
    - alert: KubeProxyDown
      annotations:
{{- if or .Values.defaultRules.additionalRuleAnnotations .Values.defaultRules.additionalRuleGroupAnnotations.kubeProxy }}
{{- with .Values.defaultRules.additionalRuleAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupAnnotations.kubeProxy }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
        description: KubeProxy has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeproxydown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-proxy"} == 1)
      for: {{ dig "KubeProxyDown" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
severity: {{ dig "KubeProxyDown" "labelsSeverity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeProxy }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeProxy }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,385 @@
{{- /*
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system-kubelet
    rules:
{{- if not (.Values.defaultRules.disabled.KubeNodeNotReady | default false) }}
    - alert: KubeNodeNotReady
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
        summary: Node is not ready.
      expr: kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",condition="Ready",status="true"} == 0
      for: {{ dig "KubeNodeNotReady" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeNodeNotReady" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeNodeUnreachable | default false) }}
    - alert: KubeNodeUnreachable
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
        summary: Node is unreachable.
      expr: (kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
      for: {{ dig "KubeNodeUnreachable" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeNodeUnreachable" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletTooManyPods | default false) }}
    - alert: KubeletTooManyPods
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
        summary: Kubelet is running at capacity.
      expr: |-
        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          (kube_pod_status_phase{job="{{ $kubeStateMetricsJob }}",phase="Running"} == 1) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) (1, kube_pod_info{job="{{ $kubeStateMetricsJob }}"})
        )
        /
        max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          kube_node_status_capacity{job="{{ $kubeStateMetricsJob }}",resource="pods"} != 1
        ) > 0.95
      for: {{ dig "KubeletTooManyPods" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletTooManyPods" "severity" "info" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
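{{- /*
Note: the KubeletTooManyPods expression above counts running pods per node by
joining kube_pod_status_phase to kube_pod_info with group_left(node), using
topk(1, ...) to deduplicate kube_pod_info series, then divides by the node's
reported pod capacity and fires once utilisation exceeds 95%.
*/ -}}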
{{- if not (.Values.defaultRules.disabled.KubeNodeReadinessFlapping | default false) }}
    - alert: KubeNodeReadinessFlapping
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
        summary: Node readiness status is flapping.
      expr: sum(changes(kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2
      for: {{ dig "KubeNodeReadinessFlapping" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeNodeReadinessFlapping" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletPlegDurationHigh | default false) }}
    - alert: KubeletPlegDurationHigh
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
        summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
      expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
      for: {{ dig "KubeletPlegDurationHigh" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletPlegDurationHigh" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh | default false) }}
    - alert: KubeletPodStartUpLatencyHigh
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
        summary: Kubelet Pod startup latency is too high.
      expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le)) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
      for: {{ dig "KubeletPodStartUpLatencyHigh" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletPodStartUpLatencyHigh" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
    - alert: KubeletClientCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 604800
      labels:
        severity: {{ dig "KubeletClientCertificateExpiration" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
    - alert: KubeletClientCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 86400
      labels:
        severity: {{ dig "KubeletClientCertificateExpiration" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
    - alert: KubeletServerCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 604800
      labels:
        severity: {{ dig "KubeletServerCertificateExpiration" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
    - alert: KubeletServerCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 86400
      labels:
        severity: {{ dig "KubeletServerCertificateExpiration" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
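{{- /*
Note: the four Kubelet*CertificateExpiration rules above mirror the apiserver
pair. kubelet_certificate_manager_client_ttl_seconds and
kubelet_certificate_manager_server_ttl_seconds each trigger a warning below
604800 seconds (7 days) and a critical below 86400 seconds (24 hours) of
remaining certificate lifetime.
*/ -}}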
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors | default false) }}
    - alert: KubeletClientCertificateRenewalErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
        summary: Kubelet has failed to renew its client certificate.
      expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
      for: {{ dig "KubeletClientCertificateRenewalErrors" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletClientCertificateRenewalErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors | default false) }}
    - alert: KubeletServerCertificateRenewalErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
        summary: Kubelet has failed to renew its server certificate.
      expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
      for: {{ dig "KubeletServerCertificateRenewalErrors" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletServerCertificateRenewalErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if .Values.prometheusOperator.kubeletService.enabled }}
{{- if not (.Values.defaultRules.disabled.KubeletDown | default false) }}
    - alert: KubeletDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
      for: {{ dig "KubeletDown" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeletDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,57 @@
{{- /*
Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeSchedulerAlerting }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system-scheduler
    rules:
{{- if .Values.kubeScheduler.enabled }}
{{- if not (.Values.defaultRules.disabled.KubeSchedulerDown | default false) }}
    - alert: KubeSchedulerDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubeSchedulerAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubeSchedulerAlerting | indent 8 }}
{{- end }}
        description: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-scheduler"} == 1)
      for: {{ dig "KubeSchedulerDown" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeSchedulerDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,87 @@
{{- /*
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system
    rules:
{{- if not (.Values.defaultRules.disabled.KubeVersionMismatch | default false) }}
    - alert: KubeVersionMismatch
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch
        summary: Different semantic versions of Kubernetes components running.
      expr: count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
      for: {{ dig "KubeVersionMismatch" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeVersionMismatch" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
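{{- /*
Note: in the KubeVersionMismatch expression above, label_replace rewrites
git_version to its leading "(v[0-9]*.[0-9]*)" capture, so builds are compared at
major.minor granularity: a hypothetical v1.28.3 next to v1.28.5 collapses to
v1.28 and does not fire, while v1.27 next to v1.28 does.
*/ -}}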
{{- if not (.Values.defaultRules.disabled.KubeClientErrors | default false) }}
    - alert: KubeClientErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesSystem | indent 8 }}
{{- end }}
        description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors
        summary: Kubernetes API server client is experiencing errors.
      expr: |-
        (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace)
        /
        sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace))
        > 0.01
      for: {{ dig "KubeClientErrors" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "KubeClientErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,188 @@
{{- /*
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterRecording }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: node-exporter.rules
    rules:
    - expr: |-
        count without (cpu, mode) (
          node_cpu_seconds_total{job="node-exporter",mode="idle"}
        )
      record: instance:node_num_cpu:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        1 - avg without (cpu) (
          sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
        )
      record: instance:node_cpu_utilisation:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        (
          node_load1{job="node-exporter"}
        /
          instance:node_num_cpu:sum{job="node-exporter"}
        )
      record: instance:node_load1_per_cpu:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        1 - (
          (
            node_memory_MemAvailable_bytes{job="node-exporter"}
            or
            (
              node_memory_Buffers_bytes{job="node-exporter"}
              +
              node_memory_Cached_bytes{job="node-exporter"}
              +
              node_memory_MemFree_bytes{job="node-exporter"}
              +
              node_memory_Slab_bytes{job="node-exporter"}
            )
          )
        /
          node_memory_MemTotal_bytes{job="node-exporter"}
        )
      record: instance:node_memory_utilisation:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
      record: instance:node_vmstat_pgmajfault:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
      record: instance_device:node_disk_io_time_seconds:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
      record: instance_device:node_disk_io_time_weighted_seconds:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        sum without (device) (
          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
        )
      record: instance:node_network_receive_bytes_excluding_lo:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        sum without (device) (
          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
        )
      record: instance:node_network_transmit_bytes_excluding_lo:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        sum without (device) (
          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
        )
      record: instance:node_network_receive_drop_excluding_lo:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        sum without (device) (
          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
        )
      record: instance:node_network_transmit_drop_excluding_lo:rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterRecording }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
|
||||
@@ -0,0 +1,801 @@
{{- /*
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterAlerting }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: node-exporter
    rules:
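{{- /* Filesystem space alerts come in two tiers: warning fires below 15% free with predicted exhaustion within 24h; critical fires below 10% free with predicted exhaustion within 4h. */ }}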
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 4 hours.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
        and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 5% space left.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
        summary: Filesystem has less than 3% space left.
      expr: |-
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
    - alert: NodeFilesystemFilesFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
    - alert: NodeFilesystemFilesFillingUp
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
        and
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 5% inodes left.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 3% inodes left.
      expr: |-
        (
          node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
        and
          node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
        )
      for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
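{{- /* Interface error alerts: fire when more than 1% of received or transmitted packets error over a 2m window. */ }}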
{{- if not (.Values.defaultRules.disabled.NodeNetworkReceiveErrs | default false) }}
    - alert: NodeNetworkReceiveErrs
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs
        summary: Network interface is reporting many receive errors.
      expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
      for: {{ dig "NodeNetworkReceiveErrs" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeNetworkReceiveErrs" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeNetworkTransmitErrs | default false) }}
    - alert: NodeNetworkTransmitErrs
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs
        summary: Network interface is reporting many transmit errors.
      expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
      for: {{ dig "NodeNetworkTransmitErrs" "for" "1h" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeNetworkTransmitErrs" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeHighNumberConntrackEntriesUsed | default false) }}
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused
        summary: Number of conntrack entries is getting close to the limit.
      expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: {{ dig "NodeHighNumberConntrackEntriesUsed" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeTextFileCollectorScrapeError | default false) }}
    - alert: NodeTextFileCollectorScrapeError
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Node Exporter text file collector on {{`{{`}} $labels.instance {{`}}`}} failed to scrape.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror
        summary: Node Exporter text file collector failed to scrape.
      expr: node_textfile_scrape_error{job="node-exporter"} == 1
      labels:
        severity: {{ dig "NodeTextFileCollectorScrapeError" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeClockSkewDetected | default false) }}
    - alert: NodeClockSkewDetected
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Clock at {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected
        summary: Clock skew detected.
      expr: |-
        (
          node_timex_offset_seconds{job="node-exporter"} > 0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
        )
        or
        (
          node_timex_offset_seconds{job="node-exporter"} < -0.05
        and
          deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
        )
      for: {{ dig "NodeClockSkewDetected" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeClockSkewDetected" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeClockNotSynchronising | default false) }}
    - alert: NodeClockNotSynchronising
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Clock at {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising
        summary: Clock not synchronising.
      expr: |-
        min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
        and
        node_timex_maxerror_seconds{job="node-exporter"} >= 16
      for: {{ dig "NodeClockNotSynchronising" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeClockNotSynchronising" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeRAIDDegraded | default false) }}
    - alert: NodeRAIDDegraded
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: RAID array '{{`{{`}} $labels.device {{`}}`}}' at {{`{{`}} $labels.instance {{`}}`}} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded
        summary: RAID Array is degraded.
      expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
      for: {{ dig "NodeRAIDDegraded" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeRAIDDegraded" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeRAIDDiskFailure | default false) }}
    - alert: NodeRAIDDiskFailure
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: At least one device in RAID array at {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure
        summary: Failed device in RAID array.
      expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
      labels:
        severity: {{ dig "NodeRAIDDiskFailure" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
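{{- /* File descriptor alerts fire in two tiers: warning above 70% of the kernel limit, critical above 90%. */ }}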
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
    - alert: NodeFileDescriptorLimit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: File descriptor usage at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}% of the kernel limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust its file descriptor limit soon.
      expr: |-
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
        )
      for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFileDescriptorLimit" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
    - alert: NodeFileDescriptorLimit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: File descriptor usage at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}% of the kernel limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
        summary: Kernel is predicted to exhaust its file descriptor limit soon.
      expr: |-
        (
          node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
        )
      for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeFileDescriptorLimit" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeCPUHighUsage | default false) }}
    - alert: NodeCPUHighUsage
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: 'CPU usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage
        summary: High CPU usage.
      expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
      for: {{ dig "NodeCPUHighUsage" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeCPUHighUsage" "severity" "info" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeSystemSaturation | default false) }}
    - alert: NodeSystemSaturation
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: 'System load per core at {{`{{`}} $labels.instance {{`}}`}} has been above 2 for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. This might indicate resource saturation on this instance and can cause it to become unresponsive.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodesystemsaturation
        summary: System saturated, load per core is very high.
      expr: |-
        node_load1{job="node-exporter"}
        / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
      for: {{ dig "NodeSystemSaturation" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeSystemSaturation" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeMemoryMajorPagesFaults | default false) }}
    - alert: NodeMemoryMajorPagesFaults
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: 'Major page faults at {{`{{`}} $labels.instance {{`}}`}} have been above 500 per second for the last 15 minutes and are currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. Check that this instance has enough memory available.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults
        summary: Memory major page faults are occurring at a very high rate.
      expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
      for: {{ dig "NodeMemoryMajorPagesFaults" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeMemoryMajorPagesFaults" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeMemoryHighUtilization | default false) }}
    - alert: NodeMemoryHighUtilization
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: 'Memory usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization
        summary: Host is running out of memory.
      expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
      for: {{ dig "NodeMemoryHighUtilization" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeMemoryHighUtilization" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeDiskIOSaturation | default false) }}
    - alert: NodeDiskIOSaturation
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: 'Disk IO queue depth (aqu-sz) on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has been above 10 for the last 30 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. This might indicate disk saturation.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation
        summary: Disk IO queue is high.
      expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
      for: {{ dig "NodeDiskIOSaturation" "for" "30m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeDiskIOSaturation" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeSystemdServiceFailed | default false) }}
    - alert: NodeSystemdServiceFailed
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Systemd service {{`{{`}} $labels.name {{`}}`}} has entered failed state at {{`{{`}} $labels.instance {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed
        summary: Systemd service has entered failed state.
      expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
      for: {{ dig "NodeSystemdServiceFailed" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeSystemdServiceFailed" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.NodeBondingDegraded | default false) }}
    - alert: NodeBondingDegraded
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }}
{{- end }}
        description: Bonding interface {{`{{`}} $labels.master {{`}}`}} on {{`{{`}} $labels.instance {{`}}`}} is in a degraded state due to one or more slave failures.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded
        summary: Bonding interface is degraded.
      expr: (node_bonding_slaves - node_bonding_active) != 0
      for: {{ dig "NodeBondingDegraded" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeBondingDegraded" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,55 @@
{{- /*
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: node-network
    rules:
{{- if not (.Values.defaultRules.disabled.NodeNetworkInterfaceFlapping | default false) }}
    - alert: NodeNetworkInterfaceFlapping
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.network }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.network | indent 8 }}
{{- end }}
        description: Network interface "{{`{{`}} $labels.device {{`}}`}}" is changing its up status frequently on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping
        summary: Network interface is often changing its status.
      expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
      for: {{ dig "NodeNetworkInterfaceFlapping" "for" "2m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
{{- end }}
      labels:
        severity: {{ dig "NodeNetworkInterfaceFlapping" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.network }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.network }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,109 @@
{{- /*
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: node.rules
    rules:
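{{- /* Maps each pod to the node it runs on; the topk (…) (1, …) wrapper deduplicates to a single series per pod. */ }}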
    - expr: |-
        topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
          max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod) (
            label_replace(kube_pod_info{job="{{ $kubeStateMetricsJob }}",node!=""}, "pod", "$1", "pod", "(.*)")
        ))
      record: 'node_namespace_pod:kube_pod_info:'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.node }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          node_cpu_seconds_total{mode="idle",job="node-exporter"}
          * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node)
          topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
        )
      record: node:node_num_cpu:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.node }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        sum(
          node_memory_MemAvailable_bytes{job="node-exporter"} or
          (
            node_memory_Buffers_bytes{job="node-exporter"} +
            node_memory_Cached_bytes{job="node-exporter"} +
            node_memory_MemFree_bytes{job="node-exporter"} +
            node_memory_Slab_bytes{job="node-exporter"}
          )
        ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
      record: :node_memory_MemAvailable_bytes:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.node }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          sum without (mode) (
            rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
          )
        )
      record: node:node_cpu_utilization:ratio_rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.node }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
    - expr: |-
        avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (
          node:node_cpu_utilization:ratio_rate5m
        )
      record: cluster:node_cpu:ratio_rate5m
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
      labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.node }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,253 @@
{{- /*
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheusOperator-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus-operator
    rules:
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorListErrors | default false) }}
    - alert: PrometheusOperatorListErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors
        summary: Errors while performing list operations in controller.
      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
      for: {{ dig "PrometheusOperatorListErrors" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorListErrors" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorWatchErrors | default false) }}
    - alert: PrometheusOperatorWatchErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors
        summary: Errors while performing watch operations in controller.
      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4
      for: {{ dig "PrometheusOperatorWatchErrors" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorWatchErrors" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorSyncFailed | default false) }}
    - alert: PrometheusOperatorSyncFailed
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorsyncfailed
        summary: Last controller reconciliation failed
      expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusOperatorSyncFailed" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorSyncFailed" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorReconcileErrors | default false) }}
    - alert: PrometheusOperatorReconcileErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors
        summary: Errors while reconciling objects.
      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
      for: {{ dig "PrometheusOperatorReconcileErrors" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorReconcileErrors" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorStatusUpdateErrors | default false) }}
    - alert: PrometheusOperatorStatusUpdateErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of status update operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorstatusupdateerrors
        summary: Errors while updating objects status.
      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
      for: {{ dig "PrometheusOperatorStatusUpdateErrors" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorStatusUpdateErrors" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNodeLookupErrors | default false) }}
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornodelookuperrors
        summary: Errors while reconciling Prometheus.
      expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
      for: {{ dig "PrometheusOperatorNodeLookupErrors" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorNodeLookupErrors" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNotReady | default false) }}
    - alert: PrometheusOperatorNotReady
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready
        summary: Prometheus operator not ready
      expr: min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
      for: {{ dig "PrometheusOperatorNotReady" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorNotReady" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorRejectedResources | default false) }}
    - alert: PrometheusOperatorRejectedResources
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheusOperator | indent 8 }}
{{- end }}
        description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorrejectedresources
        summary: Resources rejected by Prometheus operator
      expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusOperatorRejectedResources" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOperatorRejectedResources" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- end }}
@@ -0,0 +1,707 @@
{{- /*
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus
    rules:
{{- if not (.Values.defaultRules.disabled.PrometheusBadConfig | default false) }}
    - alert: PrometheusBadConfig
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusbadconfig
        summary: Failed Prometheus configuration reload.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
      for: {{ dig "PrometheusBadConfig" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusBadConfig" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusSDRefreshFailure | default false) }}
    - alert: PrometheusSDRefreshFailure
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to refresh SD with mechanism {{`{{`}}$labels.mechanism{{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheussdrefreshfailure
        summary: Failed Prometheus SD refresh.
      expr: increase(prometheus_sd_refresh_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[10m]) > 0
      for: {{ dig "PrometheusSDRefreshFailure" "for" "20m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusSDRefreshFailure" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotificationQueueRunningFull | default false) }}
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotificationqueuerunningfull
        summary: Prometheus alert notification queue predicted to run full in less than 30m.
      expr: |-
        # Without min_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
      for: {{ dig "PrometheusNotificationQueueRunningFull" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusNotificationQueueRunningFull" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToSomeAlertmanagers | default false) }}
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstosomealertmanagers
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |-
        (
          rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
        * 100
        > 1
      for: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotConnectedToAlertmanagers | default false) }}
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotconnectedtoalertmanagers
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
      for: {{ dig "PrometheusNotConnectedToAlertmanagers" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusNotConnectedToAlertmanagers" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBReloadsFailing | default false) }}
    - alert: PrometheusTSDBReloadsFailing
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbreloadsfailing
        summary: Prometheus has issues reloading blocks from disk.
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
      for: {{ dig "PrometheusTSDBReloadsFailing" "for" "4h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusTSDBReloadsFailing" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBCompactionsFailing | default false) }}
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbcompactionsfailing
        summary: Prometheus has issues compacting blocks.
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
      for: {{ dig "PrometheusTSDBCompactionsFailing" "for" "4h" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusTSDBCompactionsFailing" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotIngestingSamples | default false) }}
    - alert: PrometheusNotIngestingSamples
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotingestingsamples
        summary: Prometheus is not ingesting samples.
      expr: |-
        (
          sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) <= 0
        and
          (
            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
          or
            sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
          )
        )
      for: {{ dig "PrometheusNotIngestingSamples" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusNotIngestingSamples" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusDuplicateTimestamps | default false) }}
    - alert: PrometheusDuplicateTimestamps
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusduplicatetimestamps
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusDuplicateTimestamps" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusDuplicateTimestamps" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOutOfOrderTimestamps | default false) }}
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusoutofordertimestamps
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusOutOfOrderTimestamps" "for" "10m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusOutOfOrderTimestamps" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteStorageFailures | default false) }}
    - alert: PrometheusRemoteStorageFailures
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotestoragefailures
        summary: Prometheus fails to send samples to remote storage.
      expr: |-
        (
          (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
        /
          (
            (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
          +
            (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
          )
        )
        * 100
        > 1
      for: {{ dig "PrometheusRemoteStorageFailures" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusRemoteStorageFailures" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteBehind | default false) }}
    - alert: PrometheusRemoteWriteBehind
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritebehind
        summary: Prometheus remote write is behind.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        - ignoring(remote_name, url) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
        > 120
      for: {{ dig "PrometheusRemoteWriteBehind" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusRemoteWriteBehind" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteDesiredShards | default false) }}
    - alert: PrometheusRemoteWriteDesiredShards
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritedesiredshards
        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        >
          max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
      for: {{ dig "PrometheusRemoteWriteDesiredShards" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusRemoteWriteDesiredShards" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRuleFailures | default false) }}
    - alert: PrometheusRuleFailures
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusrulefailures
        summary: Prometheus is failing rule evaluations.
      expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusRuleFailures" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusRuleFailures" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusMissingRuleEvaluations | default false) }}
    - alert: PrometheusMissingRuleEvaluations
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusmissingruleevaluations
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusMissingRuleEvaluations" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusMissingRuleEvaluations" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTargetLimitHit | default false) }}
    - alert: PrometheusTargetLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetlimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
      expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusTargetLimitHit" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusTargetLimitHit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusLabelLimitHit | default false) }}
    - alert: PrometheusLabelLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuslabellimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
      expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusLabelLimitHit" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusLabelLimitHit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeBodySizeLimitHit | default false) }}
    - alert: PrometheusScrapeBodySizeLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapebodysizelimithit
        summary: Prometheus has dropped some targets that exceeded body size limit.
      expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusScrapeBodySizeLimitHit" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusScrapeBodySizeLimitHit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeSampleLimitHit | default false) }}
    - alert: PrometheusScrapeSampleLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured sample_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapesamplelimithit
        summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: {{ dig "PrometheusScrapeSampleLimitHit" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusScrapeSampleLimitHit" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTargetSyncFailure | default false) }}
    - alert: PrometheusTargetSyncFailure
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetsyncfailure
        summary: Prometheus has failed to sync targets.
      expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0
      for: {{ dig "PrometheusTargetSyncFailure" "for" "5m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusTargetSyncFailure" "severity" "critical" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusHighQueryLoad | default false) }}
    - alert: PrometheusHighQueryLoad
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheushighqueryload
        summary: Prometheus is reaching its maximum capacity serving concurrent requests.
      expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8
      for: {{ dig "PrometheusHighQueryLoad" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
        severity: {{ dig "PrometheusHighQueryLoad" "severity" "warning" .Values.customRules }}
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToAnyAlertmanager | default false) }}
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
{{- if .Values.defaultRules.additionalRuleGroupAnnotations.prometheus }}
{{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.prometheus | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstoanyalertmanager
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |-
        min without (alertmanager) (
          rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
        /
          rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
        )
        * 100
        > 3
      for: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "for" "15m" .Values.customRules }}
      {{- with .Values.defaultRules.keepFiringFor }}
      keep_firing_for: "{{ . }}"
      {{- end }}
      labels:
||||
severity: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "severity" "critical" .Values.customRules }}
|
||||
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
|
||||
{{- with .Values.defaultRules.additionalRuleLabels }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
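Illustrative values.yaml override for the alert blocks above -- a sketch, not chart source. The key paths are exactly the ones the template reads (defaultRules.disabled, defaultRules.additionalRuleGroupLabels, defaultRules.keepFiringFor, customRules via dig); the concrete values are placeholders:

defaultRules:
  runbookUrl: https://runbooks.prometheus-operator.dev  # rendered into each runbook_url
  keepFiringFor: 5m                # emits keep_firing_for on every alert
  disabled:
    PrometheusHighQueryLoad: true  # drops that alert from the rendered rule file
  additionalRuleGroupLabels:
    prometheus:
      team: observability          # appended under each alert's labels
customRules:
  PrometheusScrapeSampleLimitHit:
    severity: info                 # read via dig "..." "severity" "warning" .Values.customRules
    for: 30m                       # read via dig "..." "for" "15m" .Values.customRules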
@@ -0,0 +1,301 @@
{{- /*
Generated from 'windows.node.rules' group from https://github.com/kubernetes-monitoring/kubernetes-mixin.git
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.windowsMonitoring.enabled .Values.defaultRules.rules.windows }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "windows.node.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: windows.node.rules
    rules:
    - expr: |-
        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (
          windows_system_system_up_time{job="windows-exporter"}
        )
      record: node:windows_node:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, core) (
          windows_cpu_time_total{job="windows-exporter"}
        ))
      record: node:windows_node_num_cpu:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: 1 - avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(windows_cpu_time_total{job="windows-exporter",mode="idle"}[1m]))
      record: :windows_node_cpu_utilisation:avg1m
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        1 - avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          rate(windows_cpu_time_total{job="windows-exporter",mode="idle"}[1m])
        )
      record: node:windows_node_cpu_utilisation:avg1m
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        1 -
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (windows_memory_available_bytes{job="windows-exporter"})
        /
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (windows_os_visible_memory_bytes{job="windows-exporter"})
      record: ':windows_node_memory_utilisation:'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (windows_memory_available_bytes{job="windows-exporter"} + windows_memory_cache_bytes{job="windows-exporter"})
      record: :windows_node_memory_MemFreeCached_bytes:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: (windows_memory_cache_bytes{job="windows-exporter"} + windows_memory_modified_page_list_bytes{job="windows-exporter"} + windows_memory_standby_cache_core_bytes{job="windows-exporter"} + windows_memory_standby_cache_normal_priority_bytes{job="windows-exporter"} + windows_memory_standby_cache_reserve_bytes{job="windows-exporter"})
      record: node:windows_node_memory_totalCached_bytes:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (windows_os_visible_memory_bytes{job="windows-exporter"})
      record: :windows_node_memory_MemTotal_bytes:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          (windows_memory_available_bytes{job="windows-exporter"})
        )
      record: node:windows_node_memory_bytes_available:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          windows_os_visible_memory_bytes{job="windows-exporter"}
        )
      record: node:windows_node_memory_bytes_total:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        (node:windows_node_memory_bytes_total:sum - node:windows_node_memory_bytes_available:sum)
        /
        scalar(sum(node:windows_node_memory_bytes_total:sum))
      record: node:windows_node_memory_utilisation:ratio
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: 1 - (node:windows_node_memory_bytes_available:sum / node:windows_node_memory_bytes_total:sum)
      record: 'node:windows_node_memory_utilisation:'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: irate(windows_memory_swap_page_operations_total{job="windows-exporter"}[5m])
      record: node:windows_node_memory_swap_io_pages:irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (irate(windows_logical_disk_read_seconds_total{job="windows-exporter"}[1m]) +
          irate(windows_logical_disk_write_seconds_total{job="windows-exporter"}[1m])
        )
      record: :windows_node_disk_utilisation:avg_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          (irate(windows_logical_disk_read_seconds_total{job="windows-exporter"}[1m]) +
          irate(windows_logical_disk_write_seconds_total{job="windows-exporter"}[1m]))
        )
      record: node:windows_node_disk_utilisation:avg_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,instance,volume)(
          (windows_logical_disk_size_bytes{job="windows-exporter"}
        - windows_logical_disk_free_bytes{job="windows-exporter"})
        / windows_logical_disk_size_bytes{job="windows-exporter"}
        )
      record: 'node:windows_node_filesystem_usage:'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, volume) (windows_logical_disk_free_bytes{job="windows-exporter"} / windows_logical_disk_size_bytes{job="windows-exporter"})
      record: 'node:windows_node_filesystem_avail:'
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (irate(windows_net_bytes_total{job="windows-exporter"}[1m]))
      record: :windows_node_net_utilisation:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          (irate(windows_net_bytes_total{job="windows-exporter"}[1m]))
        )
      record: node:windows_node_net_utilisation:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (irate(windows_net_packets_received_discarded_total{job="windows-exporter"}[1m])) +
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (irate(windows_net_packets_outbound_discarded_total{job="windows-exporter"}[1m]))
      record: :windows_node_net_saturation:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) (
          (irate(windows_net_packets_received_discarded_total{job="windows-exporter"}[1m]) +
          irate(windows_net_packets_outbound_discarded_total{job="windows-exporter"}[1m]))
        )
      record: node:windows_node_net_saturation:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
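A minimal sketch of the values gates that make the windows.node.rules file above render at all; every key appears in the guard expression or the aggregation clauses (values shown are placeholders):

windowsMonitoring:
  enabled: true                    # required by the top-level guard
defaultRules:
  create: true                     # required by the top-level guard
  rules:
    windows: true                  # required by the top-level guard
  additionalAggregationLabels: []  # each entry is injected into every by (...) clause
  additionalRuleGroupLabels:
    windows: {}                    # merged into each recording rule's labels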
@@ -0,0 +1,158 @@
{{- /*
Generated from 'windows.pod.rules' group from https://github.com/kubernetes-monitoring/kubernetes-mixin.git
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.windowsMonitoring.enabled .Values.defaultRules.rules.windows }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "windows.pod.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: windows.pod.rules
    rules:
    - expr: windows_container_available{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_pod_container_available
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: windows_container_cpu_usage_seconds_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_container_total_runtime
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: windows_container_memory_usage_commit_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_container_memory_usage
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: windows_container_memory_usage_private_working_set_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_container_private_working_set_usage
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: windows_container_network_receive_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_container_network_received_bytes_total
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: windows_container_network_transmit_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster)
      record: windows_container_network_transmitted_bytes_total
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) (
          kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"}
        ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available)
      record: kube_pod_windows_container_resource_memory_request
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available)
      record: kube_pod_windows_container_resource_memory_limit
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) (
          kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"}
        ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available)
      record: kube_pod_windows_container_resource_cpu_cores_request
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available)
      record: kube_pod_windows_container_resource_cpu_cores_limit
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) (
          rate(windows_container_total_runtime{}[5m])
        )
      record: namespace_pod_container:windows_container_cpu_usage_seconds_total:sum_rate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.windows }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}
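Both Windows rule files above are additionally gated on the detected Kubernetes version. When rendering without a live cluster (e.g. with helm template), the version can be pinned via the override the templates read; the version shown is a placeholder:

kubeTargetVersionOverride: "1.28.0"  # substitutes for .Capabilities.KubeVersion.GitVersion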
@@ -0,0 +1,17 @@
{{- if .Values.prometheus.enabled }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    app.kubernetes.io/component: prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
data:
  {{- with .Values.prometheus.prometheusSpec.thanos.objectStorageConfig }}
  {{- if and .secret (not .existingSecret) }}
  object-storage-configs.yaml: {{ toYaml .secret | b64enc | quote }}
  {{- end }}
  {{- end }}
{{- end }}
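Illustrative inline object-storage config for the Secret template above (a sketch; the Secret is only rendered when secret is set and existingSecret is not). Bucket and endpoint are placeholders; the type/config layout follows the Thanos objstore format:

prometheus:
  prometheusSpec:
    thanos:
      objectStorageConfig:
        secret:                                  # base64-encoded into object-storage-configs.yaml
          type: S3
          config:
            bucket: my-thanos-bucket             # placeholder
            endpoint: s3.us-east-1.amazonaws.com # placeholder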
@@ -0,0 +1,80 @@
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if .Values.prometheus.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    self-monitor: {{ .Values.prometheus.serviceMonitor.selfMonitor | quote }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.service.labels }}
{{ toYaml .Values.prometheus.service.labels | indent 4 }}
{{- end }}
{{- if .Values.prometheus.service.annotations }}
  annotations:
{{ toYaml .Values.prometheus.service.annotations | indent 4 }}
{{- end }}
spec:
{{- if .Values.prometheus.service.clusterIP }}
  clusterIP: {{ .Values.prometheus.service.clusterIP }}
{{- end }}
{{- if .Values.prometheus.service.externalIPs }}
  externalIPs:
{{ toYaml .Values.prometheus.service.externalIPs | indent 4 }}
{{- end }}
{{- if .Values.prometheus.service.loadBalancerIP }}
  loadBalancerIP: {{ .Values.prometheus.service.loadBalancerIP }}
{{- end }}
{{- if .Values.prometheus.service.loadBalancerSourceRanges }}
  loadBalancerSourceRanges:
  {{- range $cidr := .Values.prometheus.service.loadBalancerSourceRanges }}
  - {{ $cidr }}
  {{- end }}
{{- end }}
{{- if ne .Values.prometheus.service.type "ClusterIP" }}
  externalTrafficPolicy: {{ .Values.prometheus.service.externalTrafficPolicy }}
{{- end }}
  ports:
  - name: {{ .Values.prometheus.prometheusSpec.portName }}
    {{- if eq .Values.prometheus.service.type "NodePort" }}
    nodePort: {{ .Values.prometheus.service.nodePort }}
    {{- end }}
    port: {{ .Values.prometheus.service.port }}
    targetPort: {{ .Values.prometheus.service.targetPort }}
  - name: reloader-web
    {{- if semverCompare "> 1.20.0-0" $kubeTargetVersion }}
    appProtocol: http
    {{- end }}
    port: 8080
    targetPort: reloader-web
  {{- if .Values.prometheus.thanosIngress.enabled }}
  - name: grpc
    {{- if eq .Values.prometheus.service.type "NodePort" }}
    nodePort: {{ .Values.prometheus.thanosIngress.nodePort }}
    {{- end }}
    port: {{ .Values.prometheus.thanosIngress.servicePort }}
    targetPort: {{ .Values.prometheus.thanosIngress.servicePort }}
  {{- end }}
  {{- if .Values.prometheus.service.additionalPorts }}
{{ toYaml .Values.prometheus.service.additionalPorts | indent 2 }}
  {{- end }}
  publishNotReadyAddresses: {{ .Values.prometheus.service.publishNotReadyAddresses }}
  selector:
    {{- if .Values.prometheus.agentMode }}
    app.kubernetes.io/name: prometheus-agent
    {{- else }}
    app.kubernetes.io/name: prometheus
    {{- end }}
    operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
{{- if .Values.prometheus.service.sessionAffinity }}
  sessionAffinity: {{ .Values.prometheus.service.sessionAffinity }}
{{- end }}
{{- if eq .Values.prometheus.service.sessionAffinity "ClientIP" }}
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: {{ .Values.prometheus.service.sessionAffinityConfig.clientIP.timeoutSeconds }}
{{- end }}
  type: "{{ .Values.prometheus.service.type }}"
{{- end }}
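A sketch of the prometheus.service values the Service template above consumes (all values are placeholders; note that any type other than ClusterIP also makes the template emit externalTrafficPolicy):

prometheus:
  service:
    type: NodePort
    nodePort: 30090
    port: 9090
    targetPort: 9090
    sessionAffinity: ClientIP    # also triggers the sessionAffinityConfig block
    sessionAffinityConfig:
      clientIP:
        timeoutSeconds: 10800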
@@ -0,0 +1,39 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.thanosService.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-discovery
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-thanos-discovery
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.thanosService.labels }}
{{ toYaml .Values.prometheus.thanosService.labels | indent 4 }}
{{- end }}
{{- if .Values.prometheus.thanosService.annotations }}
  annotations:
{{ toYaml .Values.prometheus.thanosService.annotations | indent 4 }}
{{- end }}
spec:
  type: {{ .Values.prometheus.thanosService.type }}
  clusterIP: {{ .Values.prometheus.thanosService.clusterIP }}
{{- if ne .Values.prometheus.thanosService.type "ClusterIP" }}
  externalTrafficPolicy: {{ .Values.prometheus.thanosService.externalTrafficPolicy }}
{{- end }}
  ports:
  - name: {{ .Values.prometheus.thanosService.portName }}
    port: {{ .Values.prometheus.thanosService.port }}
    targetPort: {{ .Values.prometheus.thanosService.targetPort }}
    {{- if eq .Values.prometheus.thanosService.type "NodePort" }}
    nodePort: {{ .Values.prometheus.thanosService.nodePort }}
    {{- end }}
  - name: {{ .Values.prometheus.thanosService.httpPortName }}
    port: {{ .Values.prometheus.thanosService.httpPort }}
    targetPort: {{ .Values.prometheus.thanosService.targetHttpPort }}
    {{- if eq .Values.prometheus.thanosService.type "NodePort" }}
    nodePort: {{ .Values.prometheus.thanosService.httpNodePort }}
    {{- end }}
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
{{- end }}
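Sketch of the thanosService values read by the discovery Service above. The port numbers and names shown are the conventional Thanos sidecar ones (gRPC 10901, HTTP 10902), not necessarily this chart's defaults:

prometheus:
  thanosService:
    enabled: true
    portName: grpc
    port: 10901
    targetPort: grpc
    httpPortName: http
    httpPort: 10902
    targetHttpPort: http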
@@ -0,0 +1,46 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.thanosServiceExternal.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-external
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.thanosServiceExternal.labels }}
{{ toYaml .Values.prometheus.thanosServiceExternal.labels | indent 4 }}
{{- end }}
{{- if .Values.prometheus.thanosServiceExternal.annotations }}
  annotations:
{{ toYaml .Values.prometheus.thanosServiceExternal.annotations | indent 4 }}
{{- end }}
spec:
  type: {{ .Values.prometheus.thanosServiceExternal.type }}
{{- if .Values.prometheus.thanosServiceExternal.loadBalancerIP }}
  loadBalancerIP: {{ .Values.prometheus.thanosServiceExternal.loadBalancerIP }}
{{- end }}
{{- if .Values.prometheus.thanosServiceExternal.loadBalancerSourceRanges }}
  loadBalancerSourceRanges:
  {{- range $cidr := .Values.prometheus.thanosServiceExternal.loadBalancerSourceRanges }}
  - {{ $cidr }}
  {{- end }}
{{- end }}
{{- if ne .Values.prometheus.thanosServiceExternal.type "ClusterIP" }}
  externalTrafficPolicy: {{ .Values.prometheus.thanosServiceExternal.externalTrafficPolicy }}
{{- end }}
  ports:
  - name: {{ .Values.prometheus.thanosServiceExternal.portName }}
    port: {{ .Values.prometheus.thanosServiceExternal.port }}
    targetPort: {{ .Values.prometheus.thanosServiceExternal.targetPort }}
    {{- if eq .Values.prometheus.thanosServiceExternal.type "NodePort" }}
    nodePort: {{ .Values.prometheus.thanosServiceExternal.nodePort }}
    {{- end }}
  - name: {{ .Values.prometheus.thanosServiceExternal.httpPortName }}
    port: {{ .Values.prometheus.thanosServiceExternal.httpPort }}
    targetPort: {{ .Values.prometheus.thanosServiceExternal.targetHttpPort }}
    {{- if eq .Values.prometheus.thanosServiceExternal.type "NodePort" }}
    nodePort: {{ .Values.prometheus.thanosServiceExternal.httpNodePort }}
    {{- end }}
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" . }}
{{- end }}
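Sketch for exposing the sidecar externally via the template above (the CIDR is a placeholder):

prometheus:
  thanosServiceExternal:
    enabled: true
    type: LoadBalancer
    loadBalancerSourceRanges:
      - 203.0.113.0/24   # placeholder CIDR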
@@ -0,0 +1,20 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.serviceAccount.create }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ template "kube-prometheus-stack.prometheus.serviceAccountName" . }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
    app.kubernetes.io/name: {{ template "kube-prometheus-stack.name" . }}-prometheus
    app.kubernetes.io/component: prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.prometheus.serviceAccount.annotations }}
  annotations:
{{ toYaml .Values.prometheus.serviceAccount.annotations | indent 4 }}
{{- end }}
{{- if .Values.global.imagePullSecrets }}
imagePullSecrets:
{{ include "kube-prometheus-stack.imagePullSecrets" . | trim | indent 2 }}
{{- end }}
{{- end }}
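Sketch of serviceAccount values for the template above; the IRSA annotation is one common use of the annotations hook (the ARN is a placeholder):

prometheus:
  serviceAccount:
    create: true
    annotations:
      eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/prometheus  # placeholder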
@@ -0,0 +1,81 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.serviceMonitor.selfMonitor }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-prometheus
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
    {{- with .Values.prometheus.serviceMonitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  {{- include "servicemonitor.scrapeLimits" .Values.prometheus.serviceMonitor | nindent 2 }}
  selector:
    matchLabels:
      app: {{ template "kube-prometheus-stack.name" . }}-prometheus
      release: {{ $.Release.Name | quote }}
      self-monitor: "true"
  namespaceSelector:
    matchNames:
      - {{ printf "%s" (include "kube-prometheus-stack.namespace" .) | quote }}
  endpoints:
  - port: {{ .Values.prometheus.prometheusSpec.portName }}
    {{- if .Values.prometheus.serviceMonitor.interval }}
    interval: {{ .Values.prometheus.serviceMonitor.interval }}
    {{- end }}
    {{- if .Values.prometheus.serviceMonitor.scheme }}
    scheme: {{ .Values.prometheus.serviceMonitor.scheme }}
    {{- end }}
    {{- if .Values.prometheus.serviceMonitor.tlsConfig }}
    tlsConfig: {{- toYaml .Values.prometheus.serviceMonitor.tlsConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.prometheus.serviceMonitor.bearerTokenFile }}
    bearerTokenFile: {{ .Values.prometheus.serviceMonitor.bearerTokenFile }}
    {{- end }}
    path: "{{ trimSuffix "/" .Values.prometheus.prometheusSpec.routePrefix }}/metrics"
    {{- if .Values.prometheus.serviceMonitor.metricRelabelings }}
    metricRelabelings: {{- tpl (toYaml .Values.prometheus.serviceMonitor.metricRelabelings | nindent 6) . }}
    {{- end }}
    {{- if .Values.prometheus.serviceMonitor.relabelings }}
    relabelings: {{- toYaml .Values.prometheus.serviceMonitor.relabelings | nindent 6 }}
    {{- end }}
  - port: reloader-web
    {{- if .Values.prometheus.serviceMonitor.interval }}
    interval: {{ .Values.prometheus.serviceMonitor.interval }}
    {{- end }}
    scheme: http
    path: "/metrics"
    {{- if .Values.prometheus.serviceMonitor.metricRelabelings }}
    metricRelabelings: {{- tpl (toYaml .Values.prometheus.serviceMonitor.metricRelabelings | nindent 6) . }}
    {{- end }}
    {{- if .Values.prometheus.serviceMonitor.relabelings }}
    relabelings: {{- toYaml .Values.prometheus.serviceMonitor.relabelings | nindent 6 }}
    {{- end }}
  {{- range .Values.prometheus.serviceMonitor.additionalEndpoints }}
  - port: {{ .port }}
    {{- if or $.Values.prometheus.serviceMonitor.interval .interval }}
    interval: {{ default $.Values.prometheus.serviceMonitor.interval .interval }}
    {{- end }}
    {{- if or $.Values.prometheus.serviceMonitor.proxyUrl .proxyUrl }}
    proxyUrl: {{ default $.Values.prometheus.serviceMonitor.proxyUrl .proxyUrl }}
    {{- end }}
    {{- if or $.Values.prometheus.serviceMonitor.scheme .scheme }}
    scheme: {{ default $.Values.prometheus.serviceMonitor.scheme .scheme }}
    {{- end }}
    {{- if or $.Values.prometheus.serviceMonitor.bearerTokenFile .bearerTokenFile }}
    bearerTokenFile: {{ default $.Values.prometheus.serviceMonitor.bearerTokenFile .bearerTokenFile }}
    {{- end }}
    {{- if or $.Values.prometheus.serviceMonitor.tlsConfig .tlsConfig }}
    tlsConfig: {{- default $.Values.prometheus.serviceMonitor.tlsConfig .tlsConfig | toYaml | nindent 6 }}
    {{- end }}
    path: {{ .path }}
    {{- if or $.Values.prometheus.serviceMonitor.metricRelabelings .metricRelabelings }}
    metricRelabelings: {{- tpl (default $.Values.prometheus.serviceMonitor.metricRelabelings .metricRelabelings | toYaml | nindent 6) . }}
    {{- end }}
    {{- if or $.Values.prometheus.serviceMonitor.relabelings .relabelings }}
    relabelings: {{- default $.Values.prometheus.serviceMonitor.relabelings .relabelings | toYaml | nindent 6 }}
    {{- end }}
  {{- end }}
{{- end }}
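Sketch of the serviceMonitor values consumed by the self-monitoring template above (the relabeling rule is a placeholder; the relabel-config field names are standard Prometheus ones):

prometheus:
  serviceMonitor:
    selfMonitor: true
    interval: 30s
    metricRelabelings:
      - sourceLabels: [__name__]
        regex: go_gc_.*     # placeholder: drop Go GC metrics from self-scrape
        action: drop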
@@ -0,0 +1,45 @@
{{- if and .Values.prometheus.thanosService.enabled .Values.prometheus.thanosServiceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-sidecar
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}-thanos-sidecar
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
    {{- with .Values.prometheus.thanosServiceMonitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  {{- include "servicemonitor.scrapeLimits" .Values.prometheus.thanosServiceMonitor | nindent 2 }}
  selector:
    matchLabels:
      app: {{ template "kube-prometheus-stack.name" . }}-thanos-discovery
      release: {{ $.Release.Name | quote }}
  namespaceSelector:
    matchNames:
      - {{ printf "%s" (include "kube-prometheus-stack.namespace" .) | quote }}
  endpoints:
  - port: {{ .Values.prometheus.thanosService.httpPortName }}
    {{- if .Values.prometheus.thanosServiceMonitor.interval }}
    interval: {{ .Values.prometheus.thanosServiceMonitor.interval }}
    {{- end }}
    {{- if .Values.prometheus.thanosServiceMonitor.scheme }}
    scheme: {{ .Values.prometheus.thanosServiceMonitor.scheme }}
    {{- end }}
    {{- if .Values.prometheus.thanosServiceMonitor.tlsConfig }}
    tlsConfig: {{ toYaml .Values.prometheus.thanosServiceMonitor.tlsConfig | nindent 6 }}
    {{- end }}
    {{- if .Values.prometheus.thanosServiceMonitor.bearerTokenFile }}
    bearerTokenFile: {{ .Values.prometheus.thanosServiceMonitor.bearerTokenFile }}
    {{- end }}
    path: "/metrics"
    {{- if .Values.prometheus.thanosServiceMonitor.metricRelabelings }}
    metricRelabelings:
{{ tpl (toYaml .Values.prometheus.thanosServiceMonitor.metricRelabelings | indent 6) . }}
    {{- end }}
    {{- if .Values.prometheus.thanosServiceMonitor.relabelings }}
    relabelings:
{{ toYaml .Values.prometheus.thanosServiceMonitor.relabelings | indent 6 }}
    {{- end }}
{{- end }}
@@ -0,0 +1,47 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.additionalServiceMonitors }}
apiVersion: v1
kind: List
items:
{{- range .Values.prometheus.additionalServiceMonitors }}
- apiVersion: monitoring.coreos.com/v1
  kind: ServiceMonitor
  metadata:
    name: {{ .name }}
    namespace: {{ template "kube-prometheus-stack.namespace" $ }}
    labels:
      app: {{ template "kube-prometheus-stack.name" $ }}-prometheus
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
      {{- if .additionalLabels }}
{{ toYaml .additionalLabels | indent 8 }}
      {{- end }}
  spec:
    {{- include "servicemonitor.scrapeLimits" . | nindent 6 }}
    endpoints:
{{ toYaml .endpoints | indent 8 }}
    {{- if .jobLabel }}
    jobLabel: {{ .jobLabel }}
    {{- end }}
    {{- if .namespaceSelector }}
    namespaceSelector:
{{ toYaml .namespaceSelector | indent 8 }}
    {{- end }}
    selector:
{{ toYaml .selector | indent 8 }}
    {{- if .targetLabels }}
    targetLabels:
{{ toYaml .targetLabels | indent 8 }}
    {{- end }}
    {{- if .podTargetLabels }}
    podTargetLabels:
{{ toYaml .podTargetLabels | indent 8 }}
    {{- end }}
    {{- if .metricRelabelings }}
    metricRelabelings:
{{ toYaml .metricRelabelings | indent 8 }}
    {{- end }}
    {{- if .relabelings }}
    relabelings:
{{ toYaml .relabelings | indent 8 }}
    {{- end }}
{{- end }}
{{- end }}
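Sketch of one prometheus.additionalServiceMonitors entry as consumed by the List template above; every field maps to a key the template reads, and all names, labels, and ports are placeholders:

prometheus:
  additionalServiceMonitors:
    - name: my-app              # placeholder monitor name
      jobLabel: app
      selector:
        matchLabels:
          app: my-app           # placeholder target Service label
      namespaceSelector:
        matchNames:
          - default
      endpoints:
        - port: http-metrics    # placeholder port name on the target Service
          interval: 30s
          path: /metrics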
@@ -0,0 +1,54 @@
{{- if and .Values.prometheus.enabled .Values.prometheus.servicePerReplica.enabled }}
{{- $count := .Values.prometheus.prometheusSpec.replicas | int -}}
{{- $serviceValues := .Values.prometheus.servicePerReplica -}}
apiVersion: v1
kind: List
metadata:
  name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-serviceperreplica
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
items:
{{- range $i, $e := until $count }}
- apiVersion: v1
  kind: Service
  metadata:
    name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }}
    namespace: {{ template "kube-prometheus-stack.namespace" $ }}
    labels:
      app: {{ include "kube-prometheus-stack.name" $ }}-prometheus
{{ include "kube-prometheus-stack.labels" $ | indent 8 }}
    {{- if $serviceValues.annotations }}
    annotations:
{{ toYaml $serviceValues.annotations | indent 8 }}
    {{- end }}
  spec:
    {{- if $serviceValues.clusterIP }}
    clusterIP: {{ $serviceValues.clusterIP }}
    {{- end }}
    {{- if $serviceValues.loadBalancerSourceRanges }}
    loadBalancerSourceRanges:
      {{- range $cidr := $serviceValues.loadBalancerSourceRanges }}
      - {{ $cidr }}
      {{- end }}
    {{- end }}
    {{- if ne $serviceValues.type "ClusterIP" }}
    externalTrafficPolicy: {{ $serviceValues.externalTrafficPolicy }}
    {{- end }}
    ports:
    - name: {{ $.Values.prometheus.prometheusSpec.portName }}
      {{- if eq $serviceValues.type "NodePort" }}
      nodePort: {{ $serviceValues.nodePort }}
      {{- end }}
      port: {{ $serviceValues.port }}
      targetPort: {{ $serviceValues.targetPort }}
    selector:
      {{- if $.Values.prometheus.agentMode }}
      app.kubernetes.io/name: prometheus-agent
      statefulset.kubernetes.io/pod-name: prom-agent-{{ include "kube-prometheus-stack.prometheus.crname" $ }}-{{ $i }}
      {{- else }}
      app.kubernetes.io/name: prometheus
      statefulset.kubernetes.io/pod-name: prometheus-{{ include "kube-prometheus-stack.prometheus.crname" $ }}-{{ $i }}
      {{- end }}
      operator.prometheus.io/name: {{ template "kube-prometheus-stack.prometheus.crname" $ }}
    type: "{{ $serviceValues.type }}"
{{- end }}
{{- end }}
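Sketch of the values driving the per-replica Services above; one Service is rendered per replica index, each selecting a single StatefulSet pod by name (ports are placeholders):

prometheus:
  prometheusSpec:
    replicas: 2            # yields ...-prometheus-0 and ...-prometheus-1 Services
  servicePerReplica:
    enabled: true
    type: ClusterIP
    port: 9090
    targetPort: 9090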