prometheus-rules.yaml

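# PrometheusRule custom resource carrying the recording and alerting rules shipped
# by kube-prometheus (kubernetes-mixin). Context sketch, assuming the default
# kube-prometheus setup: the Prometheus Operator selects this object through the
# `prometheus: k8s` / `role: alert-rules` labels, and the job selectors used below
# (kubelet, kube-state-metrics, node-exporter, apiserver, ...) match the default
# ServiceMonitor job names.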
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
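  # k8s.rules: recording rules that pre-aggregate cAdvisor (job="kubelet") and
  # kube-state-metrics series per namespace and per workload. Record names follow
  # the level:metric:operations convention, so dashboards can query, for example:
  #   sum(namespace:container_cpu_usage_seconds_total:sum_rate)
  # instead of re-aggregating raw container_cpu_usage_seconds_total samples.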
  - name: k8s.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
      record: namespace:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod_name, container_name) (
          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
        )
      record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
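  # kube-scheduler.rules: 99th/90th/50th percentile scheduling latencies, recorded
  # with a `quantile` label. The source histograms are in microseconds, hence the
  # division by 1e+06 so the recorded series are in seconds.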
  - name: kube-scheduler.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
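  # kube-apiserver.rules: API server request latency quantiles, also converted from
  # microseconds to seconds. These recorded series feed the KubeAPILatencyHigh
  # alerts in the kubernetes-system group below.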
  - name: kube-apiserver.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
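  # node.rules: utilisation and saturation series for CPU, memory, disk and network.
  # Per-node variants join node-exporter metrics onto the node name via the
  # node_namespace_pod:kube_pod_info: recording rule, e.g.:
  #   node:node_cpu_utilisation:avg1m
  #   node:node_memory_utilisation: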
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu_seconds_total{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers_bytes:sum
    - expr: |
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: :node_memory_MemTotal_bytes:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:node_memory_utilisation:ratio
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]))
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
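  # kube-prometheus-node-recording.rules: per-instance and cluster-wide CPU,
  # filesystem and network aggregates used by the kube-prometheus dashboards.
  # Metric names here assume node-exporter >= 0.16 (node_cpu_seconds_total and
  # friends), consistent with the node.rules group above.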
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
      record: instance:node_filesystem_usage:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
        / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
      record: cluster:node_cpu:ratio
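  # kubernetes-absent: one alert per scrape job. absent(up{job="..."} == 1) fires
  # when no target of that job is both present and healthy, i.e. the job has
  # disappeared from target discovery or every one of its targets is down.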
  - name: kubernetes-absent
    rules:
    - alert: AlertmanagerDown
      annotations:
        message: Alertmanager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
      expr: |
        absent(up{job="alertmanager-main"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: CoreDNSDown
      annotations:
        message: CoreDNS has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
      expr: |
        absent(up{job="kube-dns"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeAPIDown
      annotations:
        message: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeControllerManagerDown
      annotations:
        message: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
      expr: |
        absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeSchedulerDown
      annotations:
        message: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
      expr: |
        absent(up{job="kube-scheduler"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsDown
      annotations:
        message: KubeStateMetrics has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
      expr: |
        absent(up{job="kube-state-metrics"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeletDown
      annotations:
        message: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
      expr: |
        absent(up{job="kubelet"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: NodeExporterDown
      annotations:
        message: NodeExporter has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
      expr: |
        absent(up{job="node-exporter"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusDown
      annotations:
        message: Prometheus has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
      expr: |
        absent(up{job="prometheus-k8s"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusOperatorDown
      annotations:
        message: PrometheusOperator has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
      expr: |
        absent(up{job="prometheus-operator"} == 1)
      for: 15m
      labels:
        severity: critical
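  # kubernetes-apps: workload health alerts driven by kube-state-metrics
  # (crash loops, pods stuck non-ready, generation and replica mismatches,
  # stuck rollouts, long-running or failed Jobs and CronJobs).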
  - name: kubernetes-apps
    rules:
    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubePodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubeDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, which indicates that the Deployment has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_deployment_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      for: 1h
      labels:
        severity: critical
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
        kube_statefulset_status_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, which indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_statefulset_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job="kube-state-metrics"}
          unless
          kube_statefulset_status_update_revision{job="kube-state-metrics"}
        )
        *
        (
          kube_statefulset_replicas{job="kube-state-metrics"}
          !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
        )
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
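  # kubernetes-resources: cluster-level overcommit and quota alerts. The CPU and
  # memory overcommit checks compare the sum of pod resource requests (from the
  # k8s.rules recording rules) against capacity minus one node, so the cluster can
  # still tolerate losing a node.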
  - name: kubernetes-resources
    rules:
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
        /
        sum(node:node_num_cpu:sum)
        >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
        /
        sum(node_memory_MemTotal_bytes)
        >
        (count(node:node_num_cpu:sum)-1)
        /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
        /
        sum(node:node_num_cpu:sum)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
        > 90
      for: 15m
      labels:
        severity: warning
    - alert: CPUThrottlingHigh
      annotations:
        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
      expr: |
        100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) by (container_name, pod_name, namespace)
        /
        sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
        > 25
      for: 15m
      labels:
        severity: warning
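  # kubernetes-storage: PersistentVolume capacity and health. The "full in four
  # days" alert combines a low free-space threshold with predict_linear() over a
  # 6h window extrapolated 4 days ahead.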
  - name: kubernetes-storage
    rules:
    - alert: KubePersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job="kubelet"}
        /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
        < 3
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        100 * (
          kubelet_volume_stats_available_bytes{job="kubelet"}
          /
          kubelet_volume_stats_capacity_bytes{job="kubelet"}
        ) < 15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: KubePersistentVolumeErrors
      annotations:
        message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
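  # kubernetes-system: control-plane health checks covering node readiness,
  # component version skew, API client error ratios, kubelet pod counts,
  # API server latency and error rates, and client certificate expiry.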
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{ $value }} different versions of Kubernetes components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
      for: 1h
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
        /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
      for: 10m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: Kubernetes API certificate is expiring in less than 7 days.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: Kubernetes API certificate is expiring in less than 24 hours.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
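  # alertmanager.rules: Alertmanager self-monitoring. Config consistency is checked
  # by counting distinct config hashes per service and comparing against the
  # replica count reported by the prometheus-operator.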
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerConfigInconsistent
      annotations:
        message: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
      expr: |
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
      for: 5m
      labels:
        severity: critical
    - alert: AlertmanagerFailedReload
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: AlertmanagerMembersInconsistent
      annotations:
        message: Alertmanager has not found all other members of the cluster.
      expr: |
        alertmanager_cluster_members{job="alertmanager-main"}
        != on (service) GROUP_LEFT()
        count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
      for: 5m
      labels:
        severity: critical
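  # general.rules: generic target-down alerting plus the DeadMansSwitch, an
  # always-firing alert used to verify that the alerting pipeline keeps delivering
  # notifications (typically wired to an external dead-man's-snitch style receiver,
  # though that receiver is configured outside this file).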
  - name: general.rules
    rules:
    - alert: TargetDown
      annotations:
        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
      for: 10m
      labels:
        severity: warning
    - alert: DeadMansSwitch
      annotations:
        message: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.
      expr: vector(1)
      labels:
        severity: none
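  # kube-prometheus-node-alerting.rules: disk-full predictions per device, built on
  # the node:node_filesystem_usage: and node:node_filesystem_avail: recording rules
  # from the node.rules group.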
  - name: kube-prometheus-node-alerting.rules
    rules:
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
      for: 30m
      labels:
        severity: warning
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
      for: 10m
      labels:
        severity: critical
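  # prometheus.rules: Prometheus self-monitoring (config reloads, notification
  # queue and delivery, Alertmanager discovery, TSDB reloads/compactions/WAL,
  # ingestion, and duplicate-sample rejection).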
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
        summary: Reloading Prometheus' configuration failed
      expr: |
        prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
        summary: Prometheus' alert notification queue is running full
      expr: |
        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: |
        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: |
        tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: |
        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: |
        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
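  # prometheus-operator: reconciliation and node-address-lookup error rates for the
  # operator itself.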
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning