@@ -0,0 +1,818 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-k8s-rules
+  labels:
+    role: alert-rules
+    prometheus: k8s
+data:
+  alertmanager.rules.yaml: |+
+    groups:
+    - name: alertmanager.rules
+      rules:
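+      # The expression below is intended to count the distinct config_hash values
+      # reported by the Alertmanager replicas of each service and compare that with
+      # the replica count expected by the prometheus-operator; any result other than
+      # 1 means the replicas are running different configurations.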
+      - alert: AlertmanagerConfigInconsistent
+        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
+          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
+          "alertmanager-$1", "alertmanager", "(.*)") != 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: The configuration of the instances of the Alertmanager cluster
+            `{{$labels.service}}` is out of sync.
+          summary: Configuration out of sync
+      - alert: AlertmanagerDownOrMissing
+        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
+          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
+            disappeared from discovery.
+          summary: Alertmanager down or missing
+      - alert: AlertmanagerFailedReload
+        expr: alertmanager_config_last_reload_successful == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
+            }}/{{ $labels.pod}}.
+          summary: Alertmanager's configuration reload failed
+  etcd3.rules.yaml: |+
+    groups:
+    - name: ./etcd3.rules
+      rules:
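+      # An etcd cluster of N members keeps quorum while at most floor((N-1)/2)
+      # members are down; the expression below fires once enough members are down
+      # that losing one more would break quorum.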
+      - alert: InsufficientMembers
+        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          description: If one more etcd member goes down, the cluster will be unavailable
+          summary: etcd cluster insufficient members
+      - alert: NoLeader
+        expr: etcd_server_has_leader{job="etcd"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          description: etcd member {{ $labels.instance }} has no leader
+          summary: etcd member has no leader
+      - alert: HighNumberOfLeaderChanges
+        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+        labels:
+          severity: warning
+        annotations:
+          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
+            changes within the last hour
+          summary: a high number of leader changes within the etcd cluster are happening
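+      # The next two alerts share one expression (the share of non-OK gRPC
+      # responses) with tiered thresholds: warning above 1% for 10m, critical
+      # above 5% for 5m.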
+      - alert: HighNumberOfFailedGRPCRequests
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+            on etcd instance {{ $labels.instance }}'
+          summary: a high number of gRPC requests are failing
+      - alert: HighNumberOfFailedGRPCRequests
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+            on etcd instance {{ $labels.instance }}'
+          summary: a high number of gRPC requests are failing
+      - alert: GRPCRequestsSlow
+        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
+          > 0.15
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
+            }} are slow
+          summary: slow gRPC requests
+      - alert: HighNumberOfFailedHTTPRequests
+        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+          BY (method) > 0.01
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+            instance {{ $labels.instance }}'
+          summary: a high number of HTTP requests are failing
+      - alert: HighNumberOfFailedHTTPRequests
+        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+          BY (method) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+            instance {{ $labels.instance }}'
+          summary: a high number of HTTP requests are failing
+      - alert: HTTPRequestsSlow
+        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+          > 0.15
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
+            }} are slow
+          summary: slow HTTP requests
+      - alert: EtcdMemberCommunicationSlow
+        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
+          > 0.15
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: etcd instance {{ $labels.instance }} member communication with
+            {{ $labels.To }} is slow
+          summary: etcd member communication is slow
+      - alert: HighNumberOfFailedProposals
+        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+        labels:
+          severity: warning
+        annotations:
+          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
+            failures within the last hour
+          summary: a high number of proposals within the etcd cluster are failing
+      - alert: HighFsyncDurations
+        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
+          > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: etcd instance {{ $labels.instance }} fsync durations are high
+          summary: high fsync durations
+      - alert: HighCommitDurations
+        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
+          > 0.25
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: etcd instance {{ $labels.instance }} commit durations are high
+          summary: high commit durations
+  general.rules.yaml: |+
+    groups:
+    - name: general.rules
+      rules:
+      - alert: TargetDown
+        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
+          summary: Targets are down
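+      # The two FdExhaustionClose alerts below use predict_linear to extrapolate the
+      # fd_utilization ratio; a predicted value above 1 means file descriptors would
+      # be exhausted within the given horizon (4 hours, then 1 hour).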
+      - alert: FdExhaustionClose
+        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust its file/socket descriptors within the next 4 hours'
+          summary: file descriptors soon exhausted
+      - alert: FdExhaustionClose
+        expr: predict_linear(fd_utilization[10m], 3600) > 1
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust its file/socket descriptors within the next hour'
+          summary: file descriptors soon exhausted
+  kube-controller-manager.rules.yaml: |+
+    groups:
+    - name: kube-controller-manager.rules
+      rules:
+      - alert: K8SControllerManagerDown
+        expr: absent(up{job="kube-controller-manager"} == 1)
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: There is no running K8S controller manager. Deployments and replication
+            controllers are not making progress.
+          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
+          summary: Controller manager is down
+  kube-scheduler.rules.yaml: |+
+    groups:
+    - name: kube-scheduler.rules
+      rules:
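+      # The recording rules below pre-compute scheduler latency quantiles; dividing
+      # by 1e+06 converts the *_microseconds histograms into the seconds implied by
+      # the recorded metric names.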
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - alert: K8SSchedulerDown
+        expr: absent(up{job="kube-scheduler"} == 1)
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: There is no running K8S scheduler. New pods are not being assigned
+            to nodes.
+          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
+          summary: Scheduler is down
+  kube-state-metrics.rules.yaml: |+
+    groups:
+    - name: kube-state-metrics.rules
+      rules:
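+      # A lasting gap between the deployment's metadata generation and the observed
+      # generation reported by kube-state-metrics indicates the controller has not
+      # yet acted on the latest spec change.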
+      - alert: DeploymentGenerationMismatch
+        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Observed deployment generation does not match expected one for
+            deployment {{$labels.namespace}}/{{$labels.deployment}}
+          summary: Deployment is outdated
+      - alert: DeploymentReplicasNotUpdated
+        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+          unless (kube_deployment_spec_paused == 1)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
+          summary: Deployment replicas are outdated
+      - alert: DaemonSetRolloutStuck
+        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
+          * 100 < 100
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Only {{$value}}% of desired pods scheduled and ready for daemon
+            set {{$labels.namespace}}/{{$labels.daemonset}}
+          summary: DaemonSet is missing pods
+      - alert: K8SDaemonSetsNotScheduled
+        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+          > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonset pods are not scheduled.
+          summary: Daemonsets are not scheduled correctly
+      - alert: DaemonSetsMissScheduled
+        expr: kube_daemonset_status_number_misscheduled > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonset pods are running where they are not supposed
+            to run.
+          summary: Daemonsets are not scheduled correctly
+      - alert: SystemPodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts_total{namespace=~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
+            times within the last half hour
+          summary: Pod is restarting frequently
+      - alert: AppPodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts_total{namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|hamster",pod!~"^appqueue.*"}[30m]) > 0
+        for: 3m
+        labels:
+          severity: warning
+          service: pods
+          owner: ops
+        annotations:
+          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
+            times within the last half hour
+          summary: Pod is restarting frequently
+      - alert: RobotPodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts_total{namespace=~"hamster"}[30m]) > 0
+        for: 5m
+        labels:
+          severity: warning
+          service: pods
+          owner: robot
+        annotations:
+          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
+            times within the last half hour
+          summary: Pod is restarting frequently
+
+  kubelet.rules.yaml: |+
+    groups:
+    - name: kubelet.rules
+      rules:
+      - alert: K8SNodeNotReady
+        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
+            or has set itself to NotReady, for more than an hour
+          summary: Node status is NotReady
+      - alert: K8SManyNodesNotReady
+        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
+          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
+          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          description: '{{ $value }}% of Kubernetes nodes are not ready'
+      - alert: K8SKubeletDown
+        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus failed to scrape {{ $value }}% of kubelets.
+          summary: Prometheus failed to scrape
+      - alert: K8SKubeletDown
+        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
+          * 100 > 10
+        for: 1h
+        labels:
+          severity: critical
+        annotations:
+          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
+            have disappeared from service discovery.
+          summary: Many Kubelets cannot be scraped
+      - alert: K8SKubeletTooManyPods
+        expr: kubelet_running_pod_count > 100
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
+            to the limit of 110
+          summary: Kubelet is close to pod limit
+  kubernetes.rules.yaml: |+
+    groups:
+    - name: kubernetes.rules
+      rules:
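+      # The container_* metrics below come from cAdvisor via the kubelet;
+      # container_name!="POD" filters out the pause container so the sums cover
+      # only application containers.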
+      - record: pod_name:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (pod_name)
+      - record: pod_name:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: pod_name:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          BY (pod_name)
+      - record: pod_name:container_fs_usage_bytes:sum
+        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: namespace:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+      - record: namespace:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+      - record: namespace:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+          BY (namespace)
+      - record: cluster:memory_usage:ratio
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (cluster) / sum(machine_memory_bytes) BY (cluster)
+      - record: cluster:container_spec_cpu_shares:ratio
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+          / sum(machine_cpu_cores)
+      - record: cluster:container_cpu_usage:ratio
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          / sum(machine_cpu_cores)
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
+        labels:
+          quantile: "0.99"
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
+        labels:
+          quantile: "0.9"
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
+        labels:
+          quantile: "0.5"
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+          summary: API server high latency
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 4
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+          summary: API server high latency
+      - alert: APIServerErrorsHigh
+        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+          * 100 > 2
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+          summary: API server request errors
+      - alert: APIServerErrorsHigh
+        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+          * 100 > 5
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+          summary: API server request errors
+      - alert: K8SApiserverDown
+        expr: absent(up{job="apiserver"} == 1)
+        for: 20m
+        labels:
+          severity: critical
+        annotations:
+          description: No API servers are reachable or all have disappeared from service
+            discovery
+          summary: No API servers are reachable
+
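+      # apiserver_client_certificate_expiration_seconds_bucket is a histogram of
+      # remaining client certificate lifetimes; a non-zero count in the le="604800"
+      # (7 days) or le="86400" (1 day) bucket means at least one client certificate
+      # expires within that window.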
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: warning
+        annotations:
+          description: Kubernetes API Certificate is expiring soon (less than 7 days)
+          summary: Kubernetes API Certificate is expiring soon
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
+
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: critical
+        annotations:
+          description: Kubernetes API Certificate is expiring in less than 1 day
+          summary: Kubernetes API Certificate is expiring
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
+  node.rules.yaml: |+
+    groups:
+    - name: node.rules
+      rules:
+      - record: instance:node_cpu:rate:sum
+        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+          BY (instance)
+      - record: instance:node_filesystem_usage:sum
+        expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
+          BY (instance)
+      - record: instance:node_network_receive_bytes:rate:sum
+        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+      - record: instance:node_network_transmit_bytes:rate:sum
+        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+      - record: instance:node_cpu:ratio
+        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+      - record: cluster:node_cpu:sum_rate5m
+        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
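+      # The ratio below divides the cluster-wide non-idle CPU rate recorded above by
+      # the total number of CPU cores, giving overall cluster CPU utilization as a
+      # 0-1 fraction.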
+      - record: cluster:node_cpu:ratio
+        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
+      - alert: NodeExporterDown
+        expr: absent(up{job="node-exporter"} == 1)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus could not scrape a node-exporter for more than 10m,
+            or node-exporters have disappeared from discovery
+          summary: Prometheus could not scrape a node-exporter
+      - alert: NodeIOWaitHigher
+        expr: avg(irate(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance) * 100 > 30
+        for: 5m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'IOWait on node {{$labels.instance}} is too high; current IOWait: {{ $value }}%'
+          summary: Node IOWait is above 30%
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_files_free[6h], 3600 * 24) < 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 24 hours (mounted at {{$labels.mountpoint}})
+          summary: Node disk is running full within 24 hours
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_files_free[30m], 3600 * 2) < 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 2 hours (mounted at {{$labels.mountpoint}})
+          summary: Node disk is running full within 2 hours
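+      # The next three alerts apply predict_linear to connection and file-handle
+      # counters: one hour of data is extrapolated four hours ahead, so they fire
+      # before the stated limits (5000 ESTABLISHED, 2000 TIME_WAIT, 20000 open file
+      # handles) would actually be reached.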
+      - alert: tcpEstablishedTooMany
+        expr: predict_linear(node_netstat_Tcp_CurrEstab{instance!=""}[1h], 4*3600) > 5000
+        for: 20m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'Host {{ $labels.instance }} is predicted to exceed 5000 TCP connections within 4 hours; current ESTABLISHED count: {{ $value }}. Please investigate.'
+          summary: Based on the last hour of data, ESTABLISHED TCP connections are predicted to exceed 5000 within 4 hours
+      - alert: tcpTimeWaitTooMany
+        expr: predict_linear(node_sockstat_TCP_tw{instance!=""}[1h], 4*3600) > 2000
+        for: 10m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'Host {{ $labels.instance }} is predicted to exceed 2000 TIME_WAIT connections within 4 hours; current TIME_WAIT count: {{ $value }}. Please investigate.'
+          summary: Based on the last hour of data, TIME_WAIT TCP connections are predicted to exceed 2000 within 4 hours
+      - alert: OpenFileHandleTooMany
+        expr: predict_linear(node_filefd_allocated{instance!=""}[1h], 4*3600) > 20000
+        for: 30m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'Host {{ $labels.instance }} is predicted to exceed 20000 open file handles within 4 hours; currently open: {{ $value }}. Please investigate.'
+          summary: Based on the last hour of data, open file handles are predicted to exceed 20000 within 4 hours
+      - alert: diskUsageOver80Percent
+        expr: (1-(node_filesystem_free_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"} / node_filesystem_size_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"})) * 100 > 80
+        for: 20m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'Usage of partition {{ $labels.mountpoint }} on host {{ $labels.instance }} is above 80%; current usage: {{ $value }}%. Please investigate.'
+          summary: Disk usage is above 80%
+      - alert: nodeMemoryUsageOver80Percent
+        expr: (1 - (node_memory_MemAvailable_bytes{instance!=""} / (node_memory_MemTotal_bytes{instance!=""})))* 100 > 80
+        for: 20m
+        labels:
+          severity: warning
+          owner: ops
+        annotations:
+          description: 'Memory usage on host {{ $labels.instance }} is above 80%; current usage: {{ $value }}%. Please investigate.'
+          summary: Node memory usage is above 80%
+
+  prometheus.rules.yaml: |+
+    groups:
+    - name: prometheus.rules
+      rules:
+      - alert: PrometheusConfigReloadFailed
+        expr: prometheus_config_last_reload_successful == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+          summary: Reloading Prometheus' configuration failed
+
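+      # The alert below extrapolates the notification queue length 30 minutes ahead;
+      # a predicted length above the queue capacity means alerts would start being
+      # dropped.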
+      - alert: PrometheusNotificationQueueRunningFull
+        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+            $labels.pod}}
+          summary: Prometheus' alert notification queue is running full
+
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.01
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+          summary: Errors while sending alerts from Prometheus
+
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.03
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+          summary: Errors while sending alerts from Prometheus
+
+      - alert: PrometheusNotConnectedToAlertmanagers
+        expr: prometheus_notifications_alertmanagers_discovered < 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+            to any Alertmanagers
+          summary: Prometheus is not connected to any Alertmanagers
+
+      - alert: PrometheusTSDBReloadsFailing
+        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            reload failures over the last two hours.'
+          summary: Prometheus has issues reloading data blocks from disk
+
+      - alert: PrometheusTSDBCompactionsFailing
+        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            compaction failures over the last two hours.'
+          summary: Prometheus has issues compacting sample blocks
+
+      - alert: PrometheusTSDBWALCorruptions
+        expr: tsdb_wal_corruptions_total > 0
+        for: 4h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
+            log (WAL).'
+          summary: Prometheus write-ahead log is corrupted
+
+      - alert: PrometheusNotIngestingSamples
+        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+          summary: "Prometheus isn't ingesting samples"
+
+      - alert: PrometheusTargetScrapesDuplicate
+        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
+          summary: Prometheus has many samples rejected
+
+  zk.rules.yaml: |+
+    groups:
+    - name: zk.rules
+      rules:
+      - alert: zkClusterHealth
+        expr: zk_up < 1
+        for: 5m
+        labels:
+          severity: critical
+          service: pods
+          owner: ops
+          app: zk
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} reports zk_up = {{$value}}.
+            Server IP: {{$labels.server}}'
+          summary: Zookeeper is unhealthy
+
+  noah_pod.rules.yaml: |+
+    groups:
+    - name: noah_pod.rules
+      rules:
+      - alert: Pod_all_cpu_usage
+        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[10m]))*100) > 500
+        for: 5m
+        labels:
+          severity: critical
+          service: pods
+        annotations:
+          description: CPU usage of container {{ $labels.name }} is too high (current value is {{ $value }})
+          summary: Dev CPU load alert
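+      # The alert below is intended to compare a 4-hour linear projection of each
+      # container's memory usage (based on the last 2 hours) against the memory
+      # limit exported by kube-state-metrics for that container.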
+      - alert: Pod_Memory_Grows_too_Fast
+        expr: predict_linear(container_memory_usage_bytes{pod_name!="", image!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[2h] , 4 * 3600) > kube_pod_container_resource_limits_memory_bytes{pod!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}
+        for: 5m
+        labels:
+          severity: critical
+          service: pods
+          owner: ops
+        annotations:
+          description: Memory usage of container {{ $labels.name }} may exceed its limit
+            within 4 hours; exceeding the limit will cause the Pod to restart.
+          summary: Pod memory is growing too fast
+      - alert: Pod_CPU_Grows_too_Fast
+        expr: irate(container_cpu_usage_seconds_total{image!="",container_name!="POD",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 1
+        for: 10m
+        labels:
+          severity: critical
+          service: pods
+          owner: ops
+        annotations:
+          description: CPU usage of container {{ $labels.name }} is growing too fast (current rate {{ $value }})
+          summary: Dev CPU is growing too fast
+      - alert: Pod_Memory_will_be_full
+        expr: container_memory_usage_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} > container_spec_memory_limit_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"monitoring|kubernetes-dashboard|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} * 0.9
+        for: 5m
+        labels:
+          severity: critical
+          service: pods
+          owner: ops
+        annotations:
+          description: Memory usage of container {{ $labels.name }} is about to reach its limit.
+          summary: Container memory usage is above 90% of its limit.
+
+  kong.rules.yaml: |+
+    groups:
+    - name: kong.rules
+      rules:
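+      # kong_http_status counts responses per status code and service; the alerts
+      # below use irate over 5m so they react to the most recent burst of error
+      # codes rather than the long-term average.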
+      - alert: ErrorCode504Excessive
+        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code=~"500|504|502|503"}[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: critical
+          type: httpErrorCode
+        annotations:
+          # description: too many error codes
+          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
+          summary: Too many error codes for domains on Kong
+      - alert: ErrorCodeExcessive
+        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code!~"20.*|30.*|10.*|404|500|504|502|503"}[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: critical
+          type: httpErrorCode
+        annotations:
+          # description: too many error codes
+          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
+          summary: Too many error codes for domains on Kong
+      - alert: CannotConnectionDatabase
+        expr: kong_datastore_reachable == 0
+        for: 5m
+        labels:
+          severity: critical
+          owner: ops
+        annotations:
+          description: Kong cannot connect to its database
+          summary: Kong failed to connect to the database
+      #- alert: ErrorCode404Excessive
+      #  expr: sum(rate(kong_http_status{service=~"cola.*",code="404"}[10m])) by (service,code) > 10
+      #  for: 10m
+      #  labels:
+      #    severity: critical
+      #    owner: robot
+      #  annotations:
+      #    # description: too many error codes
+      #    description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
+      #    summary: Too many error codes for domains on Kong