# prometheus-k8s-rules.yaml

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: alert-rules
    prometheus: k8s
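  # The labels above are what the Prometheus Operator's ruleSelector matches when rules
  # are shipped as ConfigMaps (kube-prometheus setups that predate the PrometheusRule
  # CRD). A minimal sketch of the corresponding selector in the Prometheus custom
  # resource, assuming that older ConfigMap-based behavior:
  #
  #   ruleSelector:
  #     matchLabels:
  #       role: alert-rules
  #       prometheus: k8s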
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` is out of sync.
          summary: Configuration out of sync
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
          summary: Alertmanager down or missing
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
          summary: Alertmanager's configuration reload failed
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
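  # Note the escalation pattern used above for gRPC and HTTP failures: the same
  # failure-ratio expression is evaluated twice, once with a lower threshold (> 0.01)
  # and a longer "for" window (10m) at severity warning, and once with a higher
  # threshold (> 0.05) and a shorter window (5m) at severity critical.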
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: DeadMansSwitch
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: This is a DeadMansSwitch meant to ensure that the entire Alerting
            pipeline is functional.
          summary: Alerting DeadMansSwitch
      - record: fd_utilization
        expr: process_open_fds / process_max_fds
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
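  # The FdExhaustionClose alerts extrapolate the recorded fd_utilization series with
  # predict_linear: the warning rule fits the last 1h of data and projects 4h ahead
  # (3600 * 4 seconds), the critical rule fits the last 10m and projects 1h ahead; a
  # predicted value > 1 means open file descriptors would exceed process_max_fds.
  # An ad-hoc query to inspect current utilization directly (same expression as the
  # recording rule above):
  #
  #   process_open_fds / process_max_fds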
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods are scheduled and ready for daemon
            set {{$labels.namespace}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
            times within the last hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }} Kubernetes nodes (more than 20% of all nodes) are
            not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
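  # K8SKubeletTooManyPods warns at 100 running pods because the kubelet's default
  # --max-pods limit is 110, so the alert still leaves some headroom. A hypothetical
  # query to see remaining capacity per node, assuming that default limit:
  #
  #   110 - kubelet_running_pod_count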
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
          summary: No API servers are reachable
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
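  # Two conventions worth noting in kubernetes.rules: the apiserver latency records
  # divide by 1e+06 because apiserver_request_latencies_bucket is reported in
  # microseconds, and the certificate-expiration alerts read histogram buckets directly,
  # where a non-zero count for le="604800" (7 * 24 * 3600 s = 7 days) or le="86400"
  # (1 day) means at least one client certificate seen by the API server expires within
  # that window.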
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
          summary: Prometheus could not scrape a node-exporter
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 24 hours
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 2 hours
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
          summary: Reloading Prometheus' configuration failed
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
          summary: Prometheus' alert notification queue is running full
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
          summary: Prometheus is not connected to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted
      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"
      - alert: PrometheusTargetScrapesDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
          summary: Prometheus has many samples rejected
  noah_pod.rules.yaml: |+
    groups:
    - name: noah_pod.rules
      rules:
      - alert: Pod_all_cpu_usage
        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
        for: 5m
        labels:
          severity: critical
          service: pods
        annotations:
          description: Container {{ $labels.name }} CPU usage is greater than 75% (current value is {{ $value }})
          summary: Dev CPU load alert
      - alert: Pod_all_memory_usage
        expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 1024*10^3*2
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Container {{ $labels.name }} memory usage is greater than 2G (current value is {{ $value }})
          summary: Dev memory load alert
      - alert: Pod_all_network_receive_usage
        expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Container {{ $labels.name }} network receive rate is greater than 50M (current value is {{ $value }})
          summary: Network receive load alert
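  # To sanity-check any of the embedded rule files before (re)deploying this ConfigMap,
  # the value of a data key can be copied into a local file and validated with promtool
  # (shipped with Prometheus 2.x), for example:
  #
  #   kubectl get configmap prometheus-k8s-rules -o yaml   # copy one data value, e.g. node.rules.yaml
  #   promtool check rules node.rules.yaml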