---
# Prometheus alerting rules for Elasticsearch. Every rule lives in the single
# rule group "elastic-rules.rule". Each rule sets a `severity` label and a
# `description` / `summary` annotation pair.
#
# NOTE(review): alert names are kept exactly as they were, including the
# spellings "Unassiged" and "Throotled"; anything that matches on alertname
# outside this file would break if they were renamed.
# NOTE(review): the metric names `es_indexng_isthrottled` and
# `es_jvm_memory_heap_used_percen` look truncated or misspelled. They are left
# untouched here - confirm against what the exporter really exposes.
# NOTE(review): four rules apply delta() to metrics whose names suggest
# counters (failed_count, tripped, rejected). Prometheus documents delta() for
# gauges and increase() for counters - confirm the metric types before changing.
groups:
  - name: elastic-rules.rule
    rules:
      # CPU usage metric above 95 for 3 minutes.
      - alert: ElasticSearchCpuCritical
        expr: es_cpu_percentage > 95
        for: 3m
        labels:
          severity: critical
        annotations:
          # The trailing space before the closing quote is part of the value.
          description: '{{$labels.instance}} reports critical cpu usage. Please
            verify workload, or add another node to the cluster '
          summary: Critical CPU usage on {{$labels.instance}}

      # Status metric above 1 for 1 minute.
      - alert: ElasticSearchStatusCritical
        expr: es_status > 1
        for: 1m
        labels:
          severity: critical
        annotations:
          description: >-
            {{$labels.instance}} reports critical status of a ElasticSearch
            cluster {{$labels.cluster}}. Please check additional metrics or
            logs.
          summary: >-
            Critical cluster status of {{$labels.cluster}} on
            {{$labels.instance}}

      # Status metric equal to 1 for 30 minutes.
      - alert: ElasticSearchStatusWarning
        expr: es_status == 1
        for: 30m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports non-healthy status of ElasticSearch
            cluster {{$labels.cluster}}. Please check additional metrics or
            logs to find a root cause
          summary: >-
            NonHealthy cluster status of {{$labels.cluster}} on
            {{$labels.instance}}

      # Unassigned-shard metric above 0 for 10 minutes.
      - alert: ElasticSearchUnassigedShards
        expr: es_unassigned_shards > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          # Duration in the text matches the 10m hold period above.
          description: >-
            There are unassigned shards for more than 10 minutes in
            {{$labels.cluster}} on node {{$labels.instance}}. Please check
            cluster performance
          summary: Unassigned shards on {{$labels.cluster}}

      # Active-shard percentage below 100 for 10 minutes.
      - alert: ElasticSearchActiveShardsPercentage
        expr: es_shards_active_percentage < 100
        for: 10m
        labels:
          severity: warning
        annotations:
          # $value is the result of the expression above, i.e. the ACTIVE
          # percentage, so the text reports it as such.
          description: >-
            Only {{$value}}% of shards are active on {{$labels.cluster}} for
            more than 10 minutes. Results from inactive shards are unavailable
            in returned results.
          summary: Non-active shards on {{$labels.cluster}}

      # Positive delta of the failed-indexing metric over 1 minute.
      # No `for:` hold period is set on this rule.
      - alert: ElasticSearchTooManyIndexFailures
        expr: delta(es_indexing_failed_count[1m]) > 0
        labels:
          severity: warning
        annotations:
          description: >-
            There are documents indexing failures on node
            {{$labels.instance}}. Please check logs to get more details.
          summary: Indexing failures on {{$labels.instance}}

      # Throttling metric above 0 for 10 minutes.
      # NOTE(review): metric name spelling looks suspicious - confirm.
      - alert: ElasticSearchIndexIsThrootled
        expr: es_indexng_isthrottled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            Index {{$labels.index}} is throttled for more than 10minutes. Some
            documents can be missing from returned results.
          summary: Index {{$labels.index}} throttled for more than 10 minutes

      # Per-index unassigned-shard metric above 0 for 10 minutes.
      - alert: ElasticSearchIndexUnassignedShards
        expr: es_index_unassigned_shards > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          # Duration in the text matches the 10m hold period above.
          description: >-
            Unassigned shards on index {{$labels.index}} from
            {{$labels.cluster}} for more than 10 minutes
          summary: Unassigned shards on {{$labels.index}} at {{$labels.cluster}}

      # JVM heap usage metric above 95 for 1 minute.
      # NOTE(review): metric name looks truncated ("percen") - confirm.
      - alert: ElasticSearchJvmMemoryPercent
        expr: es_jvm_memory_heap_used_percen > 95
        for: 1m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports high memory consumption for more than
            1 minute. Please check logs for more details
          summary: High JVM memory consumption on ES node {{$labels.instance}}

      # Cluster settings series with cluster_routing_allocation_enable="none"
      # above 0 for 10 minutes.
      - alert: ElasticSearchClusterAllocationDisabled
        expr: 'es_cluster_settings{cluster_routing_allocation_enable="none"} > 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster allocation has been
            disabled for {{$labels.cluster}}. Some documents can be missing
            from reported results.
          summary: Cluster allocation disabled on cluster {{$labels.cluster}}

      # Cluster settings series with cluster_routing_rebalance_enable="none"
      # above 0 for 10 minutes.
      - alert: ElasticSearchClusterRebalanceDisabled
        expr: 'es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster rebalance has been
            disabled on {{$labels.cluster}}. Some documents can be missing
            from reported results.
          summary: Cluster rebalance disabled on cluster {{$labels.cluster}}

      # Same allocation check, against the persistent-settings metric.
      - alert: ElasticSearchClusterAllocationDisabledPersistently
        expr: >-
          es_cluster_persistent_settings{cluster_routing_allocation_enable="none"}
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster allocation has been
            disabled persistently for {{$labels.cluster}}. Some documents can
            be missing from reported results and restart will not help.
          summary: Cluster allocation disabled on cluster {{$labels.cluster}}

      # Same rebalance check, against the persistent-settings metric.
      - alert: ElasticSearchClusterRebalanceDisabledPersistently
        expr: >-
          es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"}
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster rebalance has been
            disabled on {{$labels.cluster}}. Some documents can be missing
            from reported results and restart will not help.
          summary: Cluster rebalance disabled on cluster {{$labels.cluster}}

      # Cluster settings series with cluster_blocks_read_only="true" above 0
      # for 1 minute.
      - alert: ElasticSearchClusterReadOnly
        expr: 'es_cluster_settings{cluster_blocks_read_only="true"} > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster {{$labels.cluster}} is
            in read-only mode. New documents will be rejected.
          summary: Cluster {{$labels.cluster}} is in RO mode

      # Same read-only check, against the persistent-settings metric.
      - alert: ElasticSearchClusterReadOnlyPersistently
        expr: 'es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          description: >-
            {{$labels.instance}} reports that cluster {{$labels.cluster}} is
            in read-only mode
          summary: Cluster {{$labels.cluster}} is read-only

      # More than one distinct (pluginVersion, cluster) combination within a
      # cluster, sustained for 5 days.
      - alert: ElasticDifferentVersion
        expr: >-
          count(sum(es_prometheus_version) BY (pluginVersion, cluster))
          BY (cluster) > 1
        for: 5d
        labels:
          severity: warning
        annotations:
          description: >-
            Cluster {{$labels.cluster}} reports inconsistent versions of ES
            plugin
          summary: Cluster {{$labels.cluster}} is inconsistent

      # Positive delta of the failed-ingest metric over 1 minute.
      # No `for:` hold period is set on this rule.
      - alert: ElasticIngestionFailed
        expr: delta(es_ingest_total_failed_count[1m]) > 0
        labels:
          severity: warning
        annotations:
          description: >-
            Node {{$labels.instance}} reports failed ingestions. Some
            documents were lost
          summary: Node {{$labels.instance}} ingestion failed

      # Linear prediction from the last hour of free-bytes samples drops below
      # 0 at 24 * 3600 seconds ahead, sustained for 10 minutes.
      - alert: ElasticNoSpaceWithin24h
        expr: predict_linear(es_fs_free_bytes[1h], 24 * 3600) < 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: >-
            Elasticsearch reports that space on
            {{ $labels.node }}({{ $labels.instance }}) will run out within
            24h. Please check disk usage on that host
          summary: >-
            Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be
            out of disk space within 24h

      # Free bytes as a percentage of total bytes per path below 10 for
      # 10 minutes.
      - alert: ElasticNoAvailableSpace
        expr: es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10
        for: 10m
        labels:
          severity: critical
        annotations:
          description: >-
            Elasticsearch reports that there are only {{ $value }}% left on
            {{ $labels.path }} at {{$labels.instance}}. Please check it
          summary: No Available space on {{$labels.instance}}

      # Positive delta of the breaker-tripped metric over 1 minute.
      # No `for:` hold period is set on this rule.
      - alert: ElasticCircuitEnabled
        expr: delta(es_breaker_tripped[1m]) > 0
        labels:
          severity: critical
        annotations:
          description: >-
            Elasticsearch circuit breaker {{ $labels.circuit_name }} was
            enabled within last minute on {{$labels.node}}
            ({{$labels.instance}}). Looks like high memory pressure on this
            host ans some data were discarded.
          summary: >-
            Circuit breaker {{ $labels.circuit_name}} enabled on
            {{$labels.node}}

      # Positive delta of the threadpool-rejected metric over 1 minute.
      # No `for:` hold period is set on this rule.
      - alert: ElasticThreadpoolRejected
        expr: delta(es_threadpool_rejected[1m]) > 0
        labels:
          severity: warning
        annotations:
          description: >-
            Elasticsearch threadpool {{ $labels.threadpool }} rejected
            {{ $value }} tasks within last minute. Some jobs failed and never
            will be repeated, it could be a high CPU pressure or I/O errors.
            Please check node and it's state
          summary: >-
            Threadpool tasks rejected {{ $labels.threadpool}} at
            {{$labels.node}}