---
# Prometheus alerting rules for Elasticsearch nodes, clusters and indices.
#
# Every rule sets a `severity` label (`warning` or `critical`) and provides
# `summary` / `description` annotations templated from the alert's labels.
#
# NOTE(review): alert names are kept byte-for-byte, including the spellings
# `ElasticSearchUnassigedShards` and `ElasticSearchIndexIsThrootled`, because
# routing rules or silences may match on `alertname` - rename only together
# with whatever consumes these alerts.
#
# NOTE(review): the rules built on `increase(...[1m])` need at least two
# samples inside the one-minute range to return a value, so they presumably
# rely on a scrape interval well below 60s - confirm against the scrape
# configuration.
groups:
  - name: elastic-rules.rule
    rules:
      # ---- Node CPU -------------------------------------------------------
      - alert: ElasticSearchCpuCritical
        expr: es_cpu_percentage > 95
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: 'Critical CPU usage on {{$labels.instance}}'
          description: >-
            {{$labels.instance}} reports critical cpu usage. Please verify
            workload, or add another node to the cluster

      # ---- Cluster status -------------------------------------------------
      # NOTE(review): presumably es_status encodes 0 = green, 1 = yellow,
      # 2 = red - confirm against the exporter that publishes this metric.
      - alert: ElasticSearchStatusCritical
        expr: es_status > 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Critical cluster status of {{$labels.cluster}} on {{$labels.instance}}'
          description: >-
            {{$labels.instance}} reports critical status of an ElasticSearch
            cluster {{$labels.cluster}}. Please check additional metrics or logs.

      - alert: ElasticSearchStatusWarning
        expr: es_status == 1
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: 'NonHealthy cluster status of {{$labels.cluster}} on {{$labels.instance}}'
          description: >-
            {{$labels.instance}} reports non-healthy status of ElasticSearch
            cluster {{$labels.cluster}}. Please check additional metrics or logs
            to find a root cause

      # ---- Shards ---------------------------------------------------------
      - alert: ElasticSearchUnassigedShards
        expr: es_unassigned_shards > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Unassigned shards on {{$labels.cluster}}'
          # Duration in the text matches `for: 10m` above.
          description: >-
            There are unassigned shards for more than 10 minutes in
            {{$labels.cluster}} on node {{$labels.instance}}. Please check
            cluster performance

      - alert: ElasticSearchActiveShardsPercentage
        expr: es_shards_active_percentage < 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Non-active shards on {{$labels.cluster}}'
          # $value is the result of the expression, i.e. the *active*
          # percentage, so the text reports it as such.
          description: >-
            Only {{$value}}% of shards are active on {{$labels.cluster}} for
            more than 10 minutes. Results from inactive shards are unavailable
            in returned results.

      # ---- Indexing -------------------------------------------------------
      # No `for:` clause: the alert fires on the first evaluation that sees
      # an increase.
      # increase() replaces delta(): delta() is meant for gauges and does not
      # compensate for counter resets.
      # Assumes es_indexing_failed_count is a cumulative counter - TODO confirm.
      - alert: ElasticSearchTooManyIndexFailures
        expr: increase(es_indexing_failed_count[1m]) > 0
        labels:
          severity: warning
        annotations:
          summary: 'Indexing failures on {{$labels.instance}}'
          description: >-
            There are documents indexing failures on node {{$labels.instance}}.
            Please check logs to get more details.

      # NOTE(review): `es_indexng_isthrottled` looks like a misspelling
      # (compare `es_indexing_failed_count` above). If no such series exists
      # this rule can never fire - verify the metric name before relying on it.
      - alert: ElasticSearchIndexIsThrootled
        expr: es_indexng_isthrottled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Index {{$labels.index}} throttled for more than 10 minutes'
          description: >-
            Index {{$labels.index}} is throttled for more than 10 minutes. Some
            documents can be missing from returned results.

      - alert: ElasticSearchIndexUnassignedShards
        expr: es_index_unassigned_shards > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Unassigned shards on {{$labels.index}} at {{$labels.cluster}}'
          # Duration in the text matches `for: 10m` above.
          description: >-
            Unassigned shards on index {{$labels.index}} from
            {{$labels.cluster}} for more than 10 minutes

      # ---- JVM ------------------------------------------------------------
      # NOTE(review): `es_jvm_memory_heap_used_percen` appears truncated
      # (missing a trailing "t"?) - verify the metric name against the
      # exporter; a non-existent series means this rule never fires.
      - alert: ElasticSearchJvmMemoryPercent
        expr: es_jvm_memory_heap_used_percen > 95
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'High JVM memory consumption on ES node {{$labels.instance}}'
          description: >-
            {{$labels.instance}} reports high memory consumption for more than
            1 minute. Please check logs for more details

      # ---- Cluster settings (es_cluster_settings) -------------------------
      - alert: ElasticSearchClusterAllocationDisabled
        expr: es_cluster_settings{cluster_routing_allocation_enable="none"} > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Cluster allocation disabled on cluster {{$labels.cluster}}'
          description: >-
            {{$labels.instance}} reports that cluster allocation has been
            disabled for {{$labels.cluster}}. Some documents can be missing
            from reported results.

      - alert: ElasticSearchClusterRebalanceDisabled
        expr: es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Cluster rebalance disabled on cluster {{$labels.cluster}}'
          description: >-
            {{$labels.instance}} reports that cluster rebalance has been
            disabled on {{$labels.cluster}}. Some documents can be missing from
            reported results.

      # ---- Cluster settings (es_cluster_persistent_settings) --------------
      - alert: ElasticSearchClusterAllocationDisabledPersistently
        expr: es_cluster_persistent_settings{cluster_routing_allocation_enable="none"} > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Cluster allocation disabled on cluster {{$labels.cluster}}'
          description: >-
            {{$labels.instance}} reports that cluster allocation has been
            disabled persistently for {{$labels.cluster}}. Some documents can
            be missing from reported results and restart will not help.

      - alert: ElasticSearchClusterRebalanceDisabledPersistently
        expr: es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"} > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Cluster rebalance disabled on cluster {{$labels.cluster}}'
          description: >-
            {{$labels.instance}} reports that cluster rebalance has been
            disabled on {{$labels.cluster}}. Some documents can be missing from
            reported results and restart will not help.

      # ---- Read-only blocks -----------------------------------------------
      - alert: ElasticSearchClusterReadOnly
        expr: es_cluster_settings{cluster_blocks_read_only="true"} > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Cluster {{$labels.cluster}} is in RO mode'
          description: >-
            {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
            read-only mode. New documents will be rejected.

      - alert: ElasticSearchClusterReadOnlyPersistently
        expr: es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Cluster {{$labels.cluster}} is read-only'
          description: >-
            {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
            read-only mode

      # ---- Plugin version consistency -------------------------------------
      # The inner sum() collapses series to one per (pluginVersion, cluster);
      # the outer count() then counts distinct pluginVersion values per
      # cluster, so "> 1" means more than one version is reported.
      - alert: ElasticDifferentVersion
        expr: count(sum(es_prometheus_version) BY (pluginVersion, cluster)) BY (cluster) > 1
        for: 5d
        labels:
          severity: warning
        annotations:
          summary: 'Cluster {{$labels.cluster}} is inconsistent'
          description: >-
            Cluster {{$labels.cluster}} reports inconsistent versions of ES
            plugin

      # ---- Ingest ---------------------------------------------------------
      # increase() replaces delta() for the same reason as in
      # ElasticSearchTooManyIndexFailures.
      # Assumes es_ingest_total_failed_count is a cumulative counter - TODO
      # confirm.
      - alert: ElasticIngestionFailed
        expr: increase(es_ingest_total_failed_count[1m]) > 0
        labels:
          severity: warning
        annotations:
          summary: 'Node {{$labels.instance}} ingestion failed'
          description: >-
            Node {{$labels.instance}} reports failed ingestions. Some documents
            were lost

      # ---- Disk space -----------------------------------------------------
      # predict_linear() extrapolates the trend of the last hour 24 * 3600
      # seconds ahead; a negative prediction means free space is projected to
      # be exhausted within that horizon.
      - alert: ElasticNoSpaceWithin24h
        expr: predict_linear(es_fs_free_bytes[1h], 24 * 3600) < 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: >-
            Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be
            out of disk space within 24h
          description: >-
            Elasticsearch reports that space on
            {{ $labels.node }}({{ $labels.instance }}) will run out within 24h.
            Please check disk usage on that host

      # Fires when less than 10% of a path's total bytes are free.
      - alert: ElasticNoAvailableSpace
        expr: es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'No Available space on {{$labels.instance}}'
          description: >-
            Elasticsearch reports that there are only {{ $value }}% left on
            {{ $labels.path }} at {{$labels.instance}}. Please check it

      # ---- Circuit breakers and thread pools ------------------------------
      # increase() replaces delta(); assumes es_breaker_tripped is a
      # cumulative counter - TODO confirm.
      - alert: ElasticCircuitEnabled
        expr: increase(es_breaker_tripped[1m]) > 0
        labels:
          severity: critical
        annotations:
          summary: 'Circuit breaker {{ $labels.circuit_name}} enabled on {{$labels.node}}'
          description: >-
            Elasticsearch circuit breaker {{ $labels.circuit_name }} was
            enabled within last minute on {{$labels.node}}
            ({{$labels.instance}}). Looks like high memory pressure on this
            host and some data were discarded.

      # increase() replaces delta(); assumes es_threadpool_rejected is a
      # cumulative counter - TODO confirm.
      - alert: ElasticThreadpoolRejected
        expr: increase(es_threadpool_rejected[1m]) > 0
        labels:
          severity: warning
        annotations:
          summary: 'Threadpool tasks rejected {{ $labels.threadpool}} at {{$labels.node}}'
          description: >-
            Elasticsearch threadpool {{ $labels.threadpool }} rejected
            {{ $value }} tasks within last minute. Some jobs failed and never
            will be repeated, it could be a high CPU pressure or I/O errors.
            Please check node and its state