apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: alert-rules
    prometheus: k8s
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
          summary: Configuration out of sync
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
          summary: Alertmanager down or missing
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
          summary: Alertmanager's configuration reload failed
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: 100 * sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: 100 * sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
          summary: slow gRPC requests
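      # The two HTTP-request alerts below mirror the gRPC pair above: failed
      # requests as a percentage of received requests per method, with a
      # warning at 1% over 10m and a critical at 5% over 5m. The expressions
      # are scaled by 100 so that {{ $value }} in the descriptions is already
      # a percentage rather than a 0-1 fraction.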
      - alert: HighNumberOfFailedHTTPRequests
        expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
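      # Both FdExhaustionClose variants above use predict_linear(), which
      # fits a linear regression over the sampled window and extrapolates
      # forward: a predicted value above 1 means the descriptor-usage ratio
      # is on course to hit 100% within the horizon (4h from a 1h sample for
      # the warning, 1h from a 10m sample for the critical). fd_utilization
      # is assumed here to be a recording rule along the lines of
      # process_open_fds / process_max_fds.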
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
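    # The recording rules above convert the *_microseconds histograms to
    # seconds (hence the / 1e+06) and key each quantile by a `quantile`
    # label, so a dashboard can select, e.g.:
    #   cluster:scheduler_e2e_scheduling_latency_seconds:quantile{quantile="0.99"}
    # K8SSchedulerDown, like K8SControllerManagerDown, relies on absent():
    # the filtered vector up{job=...} == 1 is empty both when every target
    # is down and when the job has vanished from service discovery entirely,
    # which a plain `up == 0` check would miss.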
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed to run.
          summary: Daemonsets are not scheduled correctly
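      # The three restart alerts below all read
      # kube_pod_container_status_restarts_total, a monotonically increasing
      # counter, so increase(...[30m]) counts restarts over the last half
      # hour. They differ only in the namespace/pod filters and thresholds.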
      - alert: SystemPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last half hour
          summary: Pod is restarting frequently
      - alert: AppPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|hamster",pod!~"^appqueue.*"}[30m]) > 0
        for: 3m
        labels:
          severity: warning
          service: pods
          owner: ops
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last half hour
          summary: Pod is restarting frequently
      - alert: RobotPodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total{namespace=~"hamster"}[30m]) > 0
        for: 5m
        labels:
          severity: warning
          service: pods
          owner: robot
        annotations:
          description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} times within the last half hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }} Kubernetes nodes (more than 20% of the cluster) are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
          summary: Prometheus failed to scrape
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
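      # In the critical K8SKubeletDown above, absent(up{job="kubelet"} == 1)
      # evaluates to 1 when no kubelet target is up at all, so the whole
      # expression jumps to 100 and the alert also covers the case where
      # every kubelet has dropped out of service discovery.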
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
          summary: API server high latency
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
          summary: API server request errors
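      # The latency alerts above read the apiserver_latency_seconds:quantile
      # recording rule defined earlier and exclude long-running verbs
      # (WATCH, WATCHLIST, PROXY, CONNECT), whose request duration reflects
      # how long the client keeps the connection open rather than apiserver
      # performance.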
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service discovery
          summary: No API servers are reachable
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
          summary: Kubernetes API Certificate is expiring soon
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
          summary: Kubernetes API Certificate is expiring
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
          summary: Prometheus could not scrape a node-exporter
      - alert: NodeIOWaitHigher
        expr: avg(irate(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance) * 100 > 30
        for: 5m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: IOWait on node {{$labels.instance}} is too high; current IOWait is {{ $value }}%
          summary: Node IOWait is above 30%
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 24 hours
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_files_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
          summary: Node disk is running full within 2 hours
      - alert: tcpEstablishedTooMany
        expr: predict_linear(node_netstat_Tcp_CurrEstab{instance!=""}[1h], 4*3600) > 5000
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }}: ESTABLISHED TCP connections are projected to exceed 5000 within 4 hours; current ESTABLISHED count: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, ESTABLISHED TCP connections are projected to exceed 5000 within 4 hours
      - alert: tcpTimeWaitTooMany
        expr: predict_linear(node_sockstat_TCP_tw{instance!=""}[1h], 4*3600) > 2000
        for: 10m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }}: TIME_WAIT TCP connections are projected to exceed 2000 within 4 hours; current TIME_WAIT count: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, TIME_WAIT TCP connections are projected to exceed 2000 within 4 hours
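      # tcpEstablishedTooMany and tcpTimeWaitTooMany above, and
      # OpenFileHandleTooMany below, share one pattern: fit the last hour of
      # a gauge with predict_linear(...[1h], 4*3600) and alert when the
      # 4-hour extrapolation crosses the ceiling, giving operators lead time
      # before the limit is actually hit.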
      - alert: OpenFileHandleTooMany
        expr: predict_linear(node_filefd_allocated{instance!=""}[1h], 4*3600) > 20000
        for: 30m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }}: open file handles are projected to exceed 20000 within 4 hours; currently open: {{ $value }}. Please investigate.'
          summary: Based on the last hour of data, open file handles are projected to exceed 20000 within 4 hours
      - alert: diskUsageOver80Percent
        expr: (1-(node_filesystem_free_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"} / node_filesystem_size_bytes{mountpoint=~"/data|/|/.*etcd.*", fstype=~"ext4|xfs"})) * 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }}: usage of partition {{ $labels.mountpoint }} is above 80%; current usage: {{ $value }}%. Please investigate.'
          summary: Disk usage is above 80%
      - alert: nodeMemoryUsageOver80Percent
        expr: (1 - (node_memory_MemAvailable_bytes{instance!=""} / (node_memory_MemTotal_bytes{instance!=""}))) * 100 > 80
        for: 20m
        labels:
          severity: warning
          owner: ops
        annotations:
          description: 'Host {{ $labels.instance }}: memory usage is above 80%; current usage: {{ $value }}%. Please investigate.'
          summary: Node memory usage is above 80%
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
          summary: Reloading Prometheus' configuration failed
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod }}
          summary: Prometheus' alert notification queue is running full
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod }} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod }} to Alertmanager {{$labels.Alertmanager}}
          summary: Errors while sending alerts from Prometheus
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod }} is not connected to any Alertmanagers
          summary: Prometheus is not connected to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk
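      # PrometheusTSDBReloadsFailing above and PrometheusTSDBCompactionsFailing
      # below pair increase(...[2h]) > 0 with `for: 12h`: the alert only fires
      # once failures have been observed continuously for 12 hours, which
      # filters out a single transient reload or compaction error.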
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
          summary: Prometheus write-ahead log is corrupted
      - alert: PrometheusNotIngestingSamples
        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod }} isn't ingesting samples."
          summary: "Prometheus isn't ingesting samples"
      - alert: PrometheusTargetScrapesDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
          summary: Prometheus has many samples rejected
  zk.rules.yaml: |+
    groups:
    - name: zk.rules
      rules:
      - alert: zkClusterHealth
        expr: zk_up < 1
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
          app: zk
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} reports zk_up = {{$value}} (server IP: {{$labels.server}})'
          summary: ZooKeeper is unhealthy
  noah_pod.rules.yaml: |+
    groups:
    - name: noah_pod.rules
      rules:
      - alert: Pod_all_cpu_usage
        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[10m]))*100) > 500
        for: 5m
        labels:
          severity: critical
          service: pods
        annotations:
          description: Container {{ $labels.name }} CPU usage is too high (current value is {{ $value }})
          summary: Dev CPU load alert
      - alert: Pod_Memory_Grows_too_Fast
        expr: predict_linear(container_memory_usage_bytes{pod_name!="",image!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[2h], 4 * 3600) > ON(namespace, pod_name, container_name) GROUP_LEFT() label_replace(label_replace(kube_pod_container_resource_limits_memory_bytes{pod!="",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}, "pod_name", "$1", "pod", "(.*)"), "container_name", "$1", "container", "(.*)")
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} is projected to exceed its memory limit within 4 hours; exceeding the limit will cause the Pod to be restarted.
          summary: Pod memory is growing too fast
      - alert: Pod_CPU_Grows_too_Fast
        expr: irate(container_cpu_usage_seconds_total{image!="",container_name!="POD",namespace!~"monitoring|kube-system|default|kube-public|logging|istio-system"}[30m]) > 1
        for: 10m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} CPU usage is growing too fast (current growth rate {{ $value }})
          summary: Dev CPU growing too fast
      - alert: Pod_Memory_will_be_full
        expr: container_memory_usage_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"kubernetes-dashboard|monitoring|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} > container_spec_memory_limit_bytes{container_name!="POD",container_name!~"curl|wget|busy|fortio",image!="",namespace!~"monitoring|kubernetes-dashboard|kube-system|default|kube-public|logging|istio-system|kong",pod_name!~"appjob.*"} * 0.9
        for: 5m
        labels:
          severity: critical
          service: pods
          owner: ops
        annotations:
          description: Container {{ $labels.name }} memory is about to reach its limit.
          summary: Container memory usage is above 90% of its limit.
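    # Note on Pod_Memory_Grows_too_Fast: the cAdvisor series carry
    # pod_name/container_name labels while the kube-state-metrics limit
    # metric carries pod/container, so the comparison matches on
    # (namespace, pod_name, container_name) after label_replace() renames
    # the right-hand labels; without that mapping, the two vectors would
    # never share an identical label set and the alert could not fire.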
  kong.rules.yaml: |+
    groups:
    - name: kong.rules
      rules:
      - alert: ErrorCode504Excessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code=~"500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: Too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
          summary: Too many error codes for a domain on Kong
      - alert: ErrorCodeExcessive
        expr: irate(kong_http_status{service!~".*appfront.*|.*default.*|.*kubernetes.*",code!~"20.*|30.*|10.*|404|500|504|502|503"}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
          type: httpErrorCode
        annotations:
          # description: Too many error codes
          description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
          summary: Too many error codes for a domain on Kong
      - alert: CannotConnectDatabase
        expr: kong_datastore_reachable == 0
        for: 5m
        labels:
          severity: critical
          owner: ops
        annotations:
          description: Kong cannot connect to the database
          summary: Kong failed to connect to the database
      #- alert: ErrorCode404Excessive
      #  expr: sum(rate(kong_http_status{service=~"cola.*",code="404"}[10m])) by (service,code) > 10
      #  for: 10m
      #  labels:
      #    severity: critical
      #    owner: robot
      #  annotations:
      #    # description: Too many error codes
      #    description: "Service: {{ $labels.service }} is returning too many {{ $labels.code }} responses; per-second rate: {{ $value }}"
      #    summary: Too many error codes for a domain on Kong
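  # A sketch for validating an embedded rule file before applying this
  # ConfigMap (assumes yq v4 and promtool are installed; the key name and
  # manifest file name below are illustrative):
  #   yq e '.data."prometheus.rules.yaml"' prometheus-k8s-rules.yaml \
  #     | promtool check rules /dev/stdin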