apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
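  # The record names below follow the Prometheus level:metric:operations
  # convention; e.g. namespace:container_cpu_usage_seconds_total:sum_rate is
  # the per-namespace sum of 5m CPU usage rates. The spec.groups section uses
  # the same format as a plain Prometheus rule file, so it can be extracted
  # into a file and sanity-checked with `promtool check rules <file>`.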
  - name: k8s.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
      record: namespace:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod_name, container_name) (
          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
        )
      record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
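  # The scheduler latency histograms are exposed in microseconds, hence the
  # division by 1e+06 so the recorded quantiles are in seconds. Each latency
  # is recorded at the 0.99, 0.9 and 0.5 quantiles, distinguished by the
  # "quantile" label on the recorded series.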
  - name: kube-scheduler.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
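  # The same microseconds-to-seconds pattern, applied to API server request
  # latency; aggregating without(instance, pod) keeps per-verb and
  # per-resource detail for the alerts in kubernetes-system below.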
  - name: kube-apiserver.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
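  # node.rules follows the USE method (utilisation/saturation) for nodes.
  # node-exporter runs as a DaemonSet, so its series carry the namespace and
  # pod of the exporter pod; the recurring join
  #   * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:
  # attaches the node name to those series so results can be keyed by node.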
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu_seconds_total{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers_bytes:sum
    - expr: |
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: :node_memory_MemTotal_bytes:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:node_memory_utilisation:ratio
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]))
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
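  # Note: this group still uses pre-v0.16.0 node-exporter metric names
  # (node_cpu, node_filesystem_size, node_network_receive_bytes), whereas
  # node.rules above uses the renamed metrics (node_cpu_seconds_total, ...).
  # Only the set matching the deployed node-exporter version will return data.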
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)
      record: instance:node_filesystem_usage:sum
    - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      record: cluster:node_cpu:ratio
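  # absent(up{job="..."} == 1) yields a value only when no target of the given
  # job is up, so each alert below fires when an entire component disappears
  # from target discovery for 15 minutes.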
  - name: kubernetes-absent
    rules:
    - alert: AlertmanagerDown
      annotations:
        message: Alertmanager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
      expr: |
        absent(up{job="alertmanager-main"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: CoreDNSDown
      annotations:
        message: CoreDNS has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
      expr: |
        absent(up{job="kube-dns"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeAPIDown
      annotations:
        message: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeControllerManagerDown
      annotations:
        message: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
      expr: |
        absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeSchedulerDown
      annotations:
        message: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
      expr: |
        absent(up{job="kube-scheduler"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsDown
      annotations:
        message: KubeStateMetrics has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
      expr: |
        absent(up{job="kube-state-metrics"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeletDown
      annotations:
        message: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
      expr: |
        absent(up{job="kubelet"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: NodeExporterDown
      annotations:
        message: NodeExporter has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
      expr: |
        absent(up{job="node-exporter"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusDown
      annotations:
        message: Prometheus has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
      expr: |
        absent(up{job="prometheus-k8s"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusOperatorDown
      annotations:
        message: PrometheusOperator has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
      expr: |
        absent(up{job="prometheus-operator"} == 1)
      for: 15m
      labels:
        severity: critical
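  # Workload health alerts driven by kube-state-metrics. In KubePodCrashLooping,
  # rate(...[15m]) is a per-second rate, so multiplying by 60 * 5 expresses it
  # as restarts per five minutes.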
  - name: kubernetes-apps
    rules:
    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubePodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubeDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_deployment_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      for: 1h
      labels:
        severity: critical
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
        kube_statefulset_status_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_statefulset_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job="kube-state-metrics"}
          unless
          kube_statefulset_status_update_revision{job="kube-state-metrics"}
        )
        *
        (
          kube_statefulset_replicas{job="kube-state-metrics"}
          !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
        )
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
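  # The overcommit alerts compare total resource requests against cluster
  # capacity. The threshold (count(node:node_num_cpu:sum)-1) / count(...) is
  # the fraction of capacity left after losing one node; e.g. with 4 nodes the
  # alert fires once requests exceed 3/4 of total capacity.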
  - name: kubernetes-resources
    rules:
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
        /
        sum(node:node_num_cpu:sum)
        >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
        /
        sum(node_memory_MemTotal_bytes)
        >
        (count(node:node_num_cpu:sum)-1)
        /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
        /
        sum(node:node_num_cpu:sum)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
        > 90
      for: 15m
      labels:
        severity: warning
    - alert: CPUThrottlingHigh
      annotations:
        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
      expr: |
        100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container_name, pod_name, namespace)
        /
        sum(increase(container_cpu_cfs_periods_total[5m])) by (container_name, pod_name, namespace)
        > 25
      for: 15m
      labels:
        severity: warning
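  # predict_linear(v[6h], 4 * 24 * 3600) fits a linear trend over the last six
  # hours of available bytes and extrapolates it four days ahead; combined
  # with less than 15% currently available, a negative projection triggers
  # KubePersistentVolumeFullInFourDays.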
  - name: kubernetes-storage
    rules:
    - alert: KubePersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job="kubelet"}
        /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
        < 3
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        100 * (
          kubelet_volume_stats_available_bytes{job="kubelet"}
          /
          kubelet_volume_stats_capacity_bytes{job="kubelet"}
        ) < 15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: KubePersistentVolumeErrors
      annotations:
        message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
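  # Control-plane health. In KubeClientErrors, code!~"2..|404" counts every
  # response that is neither 2xx nor 404 as an error, and the alert fires when
  # such responses exceed 1% of a client's request rate.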
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{ $value }} different versions of Kubernetes components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
      for: 1h
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
        /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
      for: 10m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: Kubernetes API certificate is expiring in less than 7 days.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: Kubernetes API certificate is expiring in less than 24 hours.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
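  # count_values("config_hash", ...) produces one series per distinct config
  # hash; dividing by the operator's expected replica count equals 1 only when
  # every Alertmanager replica runs the same configuration.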
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerConfigInconsistent
      annotations:
        message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync.
      expr: |
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
      for: 5m
      labels:
        severity: critical
    - alert: AlertmanagerFailedReload
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: AlertmanagerMembersInconsistent
      annotations:
        message: Alertmanager has not found all other members of the cluster.
      expr: |
        alertmanager_cluster_members{job="alertmanager-main"}
        != on (service) GROUP_LEFT()
        count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
      for: 5m
      labels:
        severity: critical
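  # DeadMansSwitch fires permanently by design (vector(1) is always true); an
  # external system should page when this alert *stops* arriving, proving the
  # whole pipeline from Prometheus through Alertmanager is alive.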
  - name: general.rules
    rules:
    - alert: TargetDown
      annotations:
        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
      for: 10m
      labels:
        severity: warning
    - alert: DeadMansSwitch
      annotations:
        message: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.
      expr: vector(1)
      labels:
        severity: none
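  # Two-stage disk alerts: both require usage above 85%, then project the
  # available-bytes trend (6h window, 24h ahead for the warning; 30m window,
  # 2h ahead for the critical) and fire when it crosses zero.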
  - name: kube-prometheus-node-alerting.rules
    rules:
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
      for: 30m
      labels:
        severity: warning
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
      for: 10m
      labels:
        severity: critical
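  # Prometheus self-monitoring. PrometheusNotificationQueueRunningFull compares
  # a 30-minute linear projection of the notification queue length against the
  # queue's capacity rather than waiting for the queue to actually fill.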
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
        summary: Reloading Prometheus' configuration failed
      expr: |
        prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}
        summary: Prometheus' alert notification queue is running full
      expr: |
        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod }} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: |
        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: |
        tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod }} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: |
        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: |
        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
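  # Operator error rates are per-second over 5m windows, so the 0.1 threshold
  # corresponds to roughly one reconcile/lookup error every ten seconds,
  # sustained for 10 minutes.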
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
|