# prometheus-k8s-rules.yaml-0.16.1

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: alert-rules
    prometheus: k8s
data:
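  # Each key under data: is an independent Prometheus 2.x rule file (groups/rules
  # format) that gets mounted into the Prometheus server. A minimal sketch for
  # validating one of them before applying changes, assuming the ConfigMap lives
  # in the usual "monitoring" namespace and a Prometheus 2.x promtool binary is
  # on your PATH:
  #   kubectl -n monitoring get configmap prometheus-k8s-rules \
  #     -o jsonpath='{.data.etcd3\.rules\.yaml}' > etcd3.rules.yaml
  #   promtool check rules etcd3.rules.yaml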
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
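      # Note on AlertmanagerConfigInconsistent below: count_values() groups the
      # Alertmanager replicas of each service by configuration hash and counts how
      # many replicas share each hash; dividing by the operator's expected replica
      # count (joined onto the same "service" label via label_replace) yields
      # exactly 1 only when every replica runs the same configuration.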
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` is out of sync.
          summary: Configuration out of sync
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
          summary: Alertmanager down or missing
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod }}.
          summary: Alertmanager's configuration reload failed
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
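      # Note on InsufficientMembers below: the expression fires once the number of
      # down members is greater than count/2 - 1, i.e. (for the usual odd-sized
      # cluster) exactly when losing one more member would cost etcd its quorum.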
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down, the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
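      # Note on the two FdExhaustionClose rules below: fd_utilization is assumed
      # to be a recording rule (open file descriptors divided by the fd limit, a
      # 0-1 ratio) defined elsewhere in this Prometheus setup. predict_linear()
      # extrapolates its recent trend, so a predicted value above 1 means the
      # descriptor limit would be hit within the given horizon (4h for the
      # warning rule, 1h for the critical one).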
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
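      # The recording rules below convert the scheduler's *_microseconds histograms
      # into seconds (the trailing / 1e+06) and record the 99th, 90th and 50th
      # percentiles as one series each, distinguished by a "quantile" label.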
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods are scheduled and ready for daemon
            set {{$labels.namespace}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
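      # The three restart alerts below all watch increase() of the kube-state-metrics
      # restart counter over 30 minutes; they differ only in which namespaces the
      # regex selects (platform namespaces, application namespaces, and the
      # "hamster" namespace respectively) and in how sensitive the threshold is.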
      - alert: SystemPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
      - alert: AppPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|hamster",pod!~"^appqueue.*"}[30m]) > 0
        for: 3m
        labels:
          severity: warning
          service: pods
          owner: ops
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
      - alert: RobotPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"hamster"}[30m]) > 0
        for: 5m
        labels:
          severity: warning
          service: pods
          owner: robot
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
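      # The pod_name:/namespace:/cluster: recording rules below aggregate cAdvisor
      # container metrics; the container_name!="POD" filter drops the pause
      # container, and the pod_name/container_name label names assume the pre-1.16
      # cAdvisor naming used by this vintage of kube-prometheus.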
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
          summary: No API servers are reachable
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
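      # The cluster-level ratio below reuses the cluster:node_cpu:sum_rate5m series
      # recorded just above it, dividing total non-idle CPU usage by the total
      # number of CPU cores seen across all instances.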
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
          summary: Prometheus could not scrape a node-exporter
      - alert: NodeIOWaitHigher
        expr: avg(irate(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance) * 100 > 30
        for: 5m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Node {{$labels.instance}} has high I/O wait; current iowait is {{ $value }}%'
          summary: Node iowait is above 30%
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 24 hours
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 2 hours
      - alert: tcpEstablishedTooMany
        expr: predict_linear(node_netstat_Tcp_CurrEstab{instance!=""}[1h], 4*3600) > 5000
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is on track to exceed 5000 ESTABLISHED TCP connections within 4 hours; predicted value: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, ESTABLISHED TCP connections are predicted to exceed 5000 within 4 hours
      - alert: tcpTimeWaitTooMany
        expr: predict_linear(node_sockstat_TCP_tw{instance!=""}[1h], 4*3600) > 2000
        for: 10m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is on track to exceed 2000 TIME_WAIT TCP connections within 4 hours; predicted value: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, TIME_WAIT TCP connections are predicted to exceed 2000 within 4 hours
      - alert: OpenFileHandleTooMany
        expr: predict_linear(node_filefd_allocated{instance!=""}[1h], 4*3600) > 20000
        for: 30m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is on track to exceed 20000 open file handles within 4 hours; predicted value: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, open file handles are predicted to exceed 20000 within 4 hours
      - alert: diskUsageOver80Percent
        expr: (1-(node_filesystem_free_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"} / node_filesystem_size_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"})) * 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Partition {{ $labels.mountpoint }} on host {{ $labels.instance }} is over 80% full; current usage: {{ $value }}%. Please investigate.'
          summary: Disk usage is above 80%
      - alert: nodeMemoryUsageOver80Percent
        expr: (1 - (node_memory_MemAvailable_bytes{instance!=""} / (node_memory_MemTotal_bytes{instance!=""})))* 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} memory usage is above 80%; current usage: {{ $value }}%. Please investigate.'
          summary: Node memory usage is above 80%
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
          summary: Reloading Prometheus' configuration failed
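      # Note on the rule below: predict_linear() extrapolates the notification
      # queue length 30 minutes ahead from the last 5 minutes of data, so the
      # alert fires while there is still headroom, before the queue actually
      # reaches prometheus_notifications_queue_capacity.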
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
          summary: Prometheus' alert notification queue is running full
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod }} is not connected
            to any Alertmanagers
          summary: Prometheus is not connected to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted
      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod }} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"
      - alert: PrometheusTargetScrapesDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
          summary: Prometheus has many samples rejected
  zk.rules.yaml: |+
    groups:
    - name: zk.rules
      rules:
      - alert: zkClusterHealth
        expr: zk_up < 1
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
          app: zk
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} reports zk_up = {{$value}}.
            Server IP: {{$labels.server}}'
          summary: ZooKeeper status is unhealthy
  noah_pod.rules.yaml: |+
    groups:
    - name: noah_pod.rules
      rules:
      - alert: Pod_all_cpu_usage
        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[10m]))*100) > 500
        for: 5m
        labels:
          severity: critical
          service: pods
        annotations:
          description: Container {{ $labels.name }} CPU utilization is too high (current value is {{ $value }})
          summary: Dev CPU load alert
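      # Note on Pod_Memory_Grows_too_Fast below: it extrapolates cAdvisor memory
      # usage 4 hours ahead and compares it against the kube-state-metrics limit
      # series. The two exporters label pods differently (pod_name vs. pod), so
      # depending on your relabelling this comparison may need an explicit
      # on()/group_left() join to actually match series; verify against your own
      # metric labels before relying on it.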
      - alert: Pod_Memory_Grows_too_Fast
        expr: predict_linear(container_memory_usage_bytes{pod_name!="", image!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[2h] , 4 * 3600) > kube_pod_container_resource_limits_memory_bytes{pod!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} memory usage is on track to exceed its limit within 4 hours; exceeding the limit will cause the Pod to be restarted.
          summary: Pod memory is growing too fast
      - alert: Pod_CPU_Grows_too_Fast
        expr: irate(container_cpu_usage_seconds_total{image!="",container_name!="POD",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 1
        for: 10m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} CPU usage is growing too fast (current rate {{ $value }})
          summary: Dev CPU is growing too fast
      - alert: Pod_Memory_will_be_full
        expr: container_memory_usage_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} > container_spec_memory_limit_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"monitoring|kubernetes-dashboard|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} * 0.9
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} memory is about to reach its limit.
          summary: Container memory usage is above 90% of its limit.
  kong.rules.yaml: |+
    groups:
    - name: kong.rules
      rules:
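      # The two ErrorCode*Excessive alerts below watch the per-second growth of
      # Kong's kong_http_status counter via irate(); a value above 0.1 means the
      # selected error codes are increasing by roughly more than one response
      # every ten seconds for that service.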
      - alert: ErrorCode504Excessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code=~"500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: Too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second growth rate: {{ $value }}"
          summary: Too many error codes for a domain on Kong
      - alert: ErrorCodeExcessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code!~"20.*|30.*|10.*|404|500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: Too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second growth rate: {{ $value }}"
          summary: Too many error codes for a domain on Kong
      - alert: CannotConnectionDatabase
        expr: kong_datastore_reachable == 0
        for: 5m
        labels:
          severity: critical
          owner: ops
        annotations:
          description: Kong cannot connect to its database
          summary: Kong failed to connect to the database
      #- alert: ErrorCode404Excessive
      #  expr: sum(rate(kong_http_status{service=~"cola.*",code="404"}[10m])) by (service,code) > 10
      #  for: 10m
      #  labels:
      #    severity: critical
      #    owner: robot
      #  annotations:
      #    # description: Too many error codes
      #    description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second growth rate: {{ $value }}"
      #    summary: Too many error codes for a domain on Kong