apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: alert-rules
    prometheus: k8s
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` is out of sync.
          summary: Configuration out of sync
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers is being scraped, or Alertmanagers
            have disappeared from discovery.
          summary: Alertmanager down or missing
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
          summary: Alertmanager's configuration reload failed
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon
            set {{$labels.namespaces}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: SystemPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
      - alert: AppPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|hamster",pod!~"^appqueue.*"}[30m]) > 0
        for: 3m
        labels:
          severity: warning
          service: pods
          owner: ops
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
      - alert: RobotPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"hamster"}[30m]) > 0
        for: 5m
        labels:
          severity: warning
          service: pods
          owner: robot
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last 30 minutes
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
          summary: No API servers are reachable

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
          summary: Prometheus could not scrape a node-exporter
      - alert: NodeIOWaitHigher
        expr: avg(irate(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance) * 100 > 30
        for: 5m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: IOWait on node {{$labels.instance}} is too high; current IOWait is {{ $value }}%
          summary: Node IOWait is above 30%
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 24 hours
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 2 hours
      - alert: tcpEstablishedTooMany
        expr: predict_linear(node_netstat_Tcp_CurrEstab{instance!=""}[1h], 4*3600) > 5000
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is predicted to exceed 5000 TCP connections within 4 hours; current ESTABLISHED count: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, the number of TCP ESTABLISHED connections is predicted to exceed 5000 within 4 hours
      - alert: tcpTimeWaitTooMany
        expr: predict_linear(node_sockstat_TCP_tw{instance!=""}[1h], 4*3600) > 2000
        for: 10m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is predicted to exceed 2000 TIME_WAIT connections within 4 hours; current TIME_WAIT count: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, the number of TCP TIME_WAIT connections is predicted to exceed 2000 within 4 hours
      - alert: OpenFileHandleTooMany
        expr: predict_linear(node_filefd_allocated{instance!=""}[1h], 4*3600) > 20000
        for: 30m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }} is predicted to have more than 20000 open file handles within 4 hours; currently open file handles: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, the number of open file handles is predicted to exceed 20000 within 4 hours
      - alert: diskUsageOver80Percent
        expr: (1-(node_filesystem_free_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"} / node_filesystem_size_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"})) * 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Partition {{ $labels.mountpoint }} on host {{ $labels.instance }} is more than 80% full; current usage: {{ $value }}%. Please investigate.'
          summary: Disk usage is above 80%
      - alert: nodeMemoryUsageOver80Percent
        expr: (1 - (node_memory_MemAvailable_bytes{instance!=""} / (node_memory_MemTotal_bytes{instance!=""})))* 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Memory usage on host {{ $labels.instance }} is above 80%; current usage: {{ $value }}. Please investigate.'
          summary: Node memory usage is above 80%
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
          summary: Reloading Prometheus' configuration failed

      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
          summary: Prometheus' alert notification queue is running full

      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus

      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus

      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
          summary: Prometheus is not connected to any Alertmanagers

      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk

      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks

      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted

      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"

      - alert: PrometheusTargetScrapesDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
          summary: Prometheus has many samples rejected
  zk.rules.yaml: |+
    groups:
    - name: zk.rules
      rules:
      - alert: zkClusterHealth
        expr: zk_up < 1
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
          app: zk
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} reported zk_up = {{$value}}.
            Server IP: {{$labels.server}}'
          summary: ZooKeeper is unhealthy
  noah_pod.rules.yaml: |+
    groups:
    - name: noah_pod.rules
      rules:
      - alert: Pod_all_cpu_usage
        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[10m]))*100) > 500
        for: 5m
        labels:
          severity: critical
          service: pods
        annotations:
          description: Container {{ $labels.name }} CPU utilization is too high (current value is {{ $value }})
          summary: Dev CPU load alert
      - alert: Pod_Memory_Grows_too_Fast
        expr: predict_linear(container_memory_usage_bytes{pod_name!="", image!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[2h] , 4 * 3600) > kube_pod_container_resource_limits_memory_bytes{pod!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Memory usage of container {{ $labels.name }} may exceed its limit within 4 hours; exceeding the limit will cause the Pod to restart.
          summary: Pod memory is growing too fast
      - alert: Pod_CPU_Grows_too_Fast
        expr: irate(container_cpu_usage_seconds_total{image!="",container_name!="POD",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 1
        for: 10m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: CPU usage of container {{ $labels.name }} is growing too fast (current rate {{ $value }})
          summary: Dev CPU usage is growing too fast
      - alert: Pod_Memory_will_be_full
        expr: container_memory_usage_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} > container_spec_memory_limit_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"monitoring|kubernetes-dashboard|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} * 0.9
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Memory of container {{ $labels.name }} is about to reach its limit.
          summary: Container memory usage exceeds 90% of its limit.
  kong.rules.yaml: |+
    groups:
    - name: kong.rules
      rules:
      - alert: ErrorCode504Excessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code=~"500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; rate per second: {{ $value }}"
          summary: Too many error codes for domains on Kong
      - alert: ErrorCodeExcessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code!~"20.*|30.*|10.*|404|500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; rate per second: {{ $value }}"
          summary: Too many error codes for domains on Kong
      - alert: CannotConnectionDatabase
        expr: kong_datastore_reachable == 0
        for: 5m
        labels:
          severity: critical
          owner: ops
        annotations:
          description: Kong cannot connect to its database
          summary: Kong failed to connect to the database
      #- alert: ErrorCode404Excessive
      #  expr: sum(rate(kong_http_status{service=~"cola.*",code="404"}[10m])) by (service,code) > 10
      #  for: 10m
      #  labels:
      #    severity: critical
      #    owner: robot
      #  annotations:
      #    # description: too many error codes
      #    description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; rate per second: {{ $value }}"
      #    summary: Too many error codes for domains on Kong
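# The commented sketch below illustrates how this ConfigMap is intended to be consumed.
# It assumes the ConfigMap-based rule discovery of older prometheus-operator releases,
# where the Prometheus custom resource selects rule ConfigMaps via spec.ruleSelector
# using the same labels declared above (role: alert-rules, prometheus: k8s). The
# resource name "k8s" and namespace "monitoring" are placeholders, not taken from this
# file; newer operator versions select PrometheusRule objects instead of ConfigMaps.
#
# apiVersion: monitoring.coreos.com/v1
# kind: Prometheus
# metadata:
#   name: k8s
#   namespace: monitoring
# spec:
#   ruleSelector:
#     matchLabels:
#       role: alert-rules
#       prometheus: k8s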