6 years ago · d0bef0f2ed
--- a/prometheus-operator/exporter/elasticsearch-exporter/alter.rules
+++ b/prometheus-operator/exporter/elasticsearch-exporter/alter.rules
@@ -1,99 +0,0 @@
 
				-ALERT Elastic_UP
			
 
				-  IF elasticsearch_up{job="elasticsearch"} != 1
			
 
				-  FOR 120s
			
 
				-  LABELS { severity="alert", value = "{{$value}}" }
			
 
				-  ANNOTATIONS {
			
 
				-    summary = "Instance {{ $labels.instance }}: Elasticsearch instance status is not 1",
			
 
				-    description = "This server's Elasticsearch instance status has a value of {{ $value }}.",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elastic_Cluster_Health_RED
			
 
				-  IF elasticsearch_cluster_health_status{color="red"}==1
			
 
				-  FOR 300s
			
 
				-  LABELS { severity="alert", value = "{{$value}}" }
			
 
				-  ANNOTATIONS {
			
 
				-    summary = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}",
			
 
				-    description = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}.",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elastic_Cluster_Health_Yellow
			
 
				-  IF elasticsearch_cluster_health_status{color="yellow"}==1
			
 
				-  FOR 300s
			
 
				-  LABELS { severity="alert", value = "{{$value}}" }
			
 
				-  ANNOTATIONS {
			
 
				-    summary = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}",
			
 
				-    description = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}.",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_JVM_Heap_Too_High
			
 
				- IF elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.8
			
 
				- FOR 15m
			
 
				- LABELS { severity="alert", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }} heap usage is high",
			
 
				-    description = "The heap in {{ $labels.instance }} is over 80% for 15m.",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_health_up
			
 
				- IF elasticsearch_cluster_health_up !=1
			
 
				- FOR 1m
			
 
				- LABELS { severity="alert", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node: {{ $labels.instance }} last scrape of the ElasticSearch cluster health failed",
			
 
				-    description = "ElasticSearch node: {{ $labels.instance }} last scrape of the ElasticSearch cluster health failed",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_Too_Few_Nodes_Running
			
 
				-  IF elasticsearch_cluster_health_number_of_nodes < 3
			
 
				-  FOR 5m
			
 
				-  LABELS { severity="alert", value = "{{$value}}" }
			
 
				-  ANNOTATIONS {
			
 
				-    description="There are only {{$value}} < 3 ElasticSearch nodes running",
			
 
				-    summary="ElasticSearch running on less than 3 nodes"
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_Count_of_JVM_GC_Runs
			
 
				- IF rate(elasticsearch_jvm_gc_collection_seconds_count{}[5m])>5
			
 
				- FOR 60s
			
 
				- LABELS { severity="warning", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }}: Count of JVM GC runs > 5 per sec and has a value of {{ $value }}",
			
 
				-    description = "ElasticSearch node {{ $labels.instance }}: Count of JVM GC runs > 5 per sec and has a value of {{ $value }}",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_GC_Run_Time
			
 
				- IF rate(elasticsearch_jvm_gc_collection_seconds_sum[5m])>0.3
			
 
				- FOR 60s
			
 
				- LABELS { severity="warning", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }}: GC run time in seconds > 0.3 sec and has a value of {{ $value }}",
			
 
				-    description = "ElasticSearch node {{ $labels.instance }}: GC run time in seconds > 0.3 sec and has a value of {{ $value }}",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_json_parse_failures
			
 
				- IF elasticsearch_cluster_health_json_parse_failures>0
			
 
				- FOR 60s
			
 
				- LABELS { severity="warning", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }}: json parse failures > 0 and has a value of {{ $value }}",
			
 
				-    description = "ElasticSearch node {{ $labels.instance }}: json parse failures > 0 and has a value of {{ $value }}",
			
 
				-  }
			
 
				-
			
 
				-
			
 
				-ALERT Elasticsearch_breakers_tripped
			
 
				- IF rate(elasticsearch_breakers_tripped{}[5m])>0
			
 
				- FOR 60s
			
 
				- LABELS { severity="warning", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }}: breakers tripped > 0 and has a value of {{ $value }}",
			
 
				-    description = "ElasticSearch node {{ $labels.instance }}: breakers tripped > 0 and has a value of {{ $value }}",
			
 
				-  }
			
 
				-
			
 
				-ALERT Elasticsearch_health_timed_out
			
 
				- IF elasticsearch_cluster_health_timed_out>0
			
 
				- FOR 60s
			
 
				- LABELS { severity="warning", value = "{{$value}}" }
			
 
				- ANNOTATIONS {
			
 
				-    summary = "ElasticSearch node {{ $labels.instance }}: Number of cluster health checks timed out > 0 and has a value of {{ $value }}",
			
 
				-    description = "ElasticSearch node {{ $labels.instance }}: Number of cluster health checks timed out > 0 and has a value of {{ $value }}",
			
 
				-  }
			
--- a/prometheus-operator/exporter/elasticsearch-exporter/es-alert-rules.yaml
+++ b/prometheus-operator/exporter/elasticsearch-exporter/es-alert-rules.yaml
@@ -0,0 +1,200 @@
 
				+groups:
			
 
				+- name: elastic-rules.rule
			
 
				+  rules:
			
 
				+  - alert: ElasticSearchCpuCritical
			
 
				+    expr: es_cpu_percentage > 95
			
 
				+    for: 3m
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports critical cpu usage. Please verify
			
 
				+        workload, or add another node to the cluster '
			
 
				+      summary: Critical CPU usage on {{$labels.instance}}
			
 
				+  - alert: ElasticSearchStatusCritical
			
 
				+    expr: es_status > 1
			
 
				+    for: 1m
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports critical status of ElasticSearch
			
 
				+        cluster {{$labels.cluster}}. Please check additional metrics or logs.'
			
 
				+      summary: Critical cluster status of {{$labels.cluster}} on {{$labels.instance}}
			
 
				+  - alert: ElasticSearchStatusWarning
			
 
				+    expr: es_status == 1
			
 
				+    for: 30m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports non-healthy status of ElasticSearch
			
 
				+        cluster {{$labels.cluster}}. Please check additional metrics or logs to find
			
 
				+        a root cause'
			
 
				+      summary: NonHealthy cluster status of {{$labels.cluster}} on {{$labels.instance}}
			
 
				+  - alert: ElasticSearchUnassigedShards
			
 
				+    expr: es_unassigned_shards > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: There are unassigned shards for more than 10 minutes in {{$labels.cluster}}
			
 
				+        on node {{$labels.instance}}. Please check cluster performance
			
 
				+      summary: Unassigned shards on {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchActiveShardsPercentage
			
 
				+    expr: es_shards_active_percentage < 100
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Some shards are inactive ({{$value}}% active) on {{$labels.cluster}}
			
 
				+        for more than 10 minutes. Results from those shards are unavailable in returned
			
 
				+        results.
			
 
				+      summary: Non-active shards on {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchTooManyIndexFailures
			
 
				+    expr: delta(es_indexing_failed_count[1m]) > 0
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: There are documents indexing failures on node {{$labels.instance}}.
			
 
				+        Please check logs to get more details.
			
 
				+      summary: Indexing failures on {{$labels.instance}}
			
 
				+  - alert: ElasticSearchIndexIsThrootled
			
 
				+    expr: es_indexng_isthrottled > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Index {{$labels.index}} is throttled for more than 10 minutes. Some
			
 
				+        documents can be missing from returned results.
			
 
				+      summary: Index {{$labels.index}} throttled for more than 10 minutes
			
 
				+  - alert: ElasticSearchIndexUnassignedShards
			
 
				+    expr: es_index_unassigned_shards > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Unassigned shards on index {{$labels.index}} from {{$labels.cluster}}
			
 
				+        for more than 10 minutes
			
 
				+      summary: Unassigned shards on {{$labels.index}} at {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchJvmMemoryPercent
			
 
				+    expr: es_jvm_memory_heap_used_percen > 95
			
 
				+    for: 1m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports high memory consumption for more
			
 
				+        than 1 minute. Please check logs for more details'
			
 
				+      summary: High JVM memory consumption on ES node {{$labels.instance}}
			
 
				+  - alert: ElasticSearchClusterAllocationDisabled
			
 
				+    expr: es_cluster_settings{cluster_routing_allocation_enable="none"} > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster allocation has been
			
 
				+        disabled for {{$labels.cluster}}. Some documents can be missing from reported
			
 
				+        results.'
			
 
				+      summary: Cluster allocation disabled on cluster {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchClusterRebalanceDisabled
			
 
				+    expr: es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster rebalance has been disabled
			
 
				+        on {{$labels.cluster}}. Some documents can be missing from reported results.'
			
 
				+      summary: Cluster rebalance disabled on cluster {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchClusterAllocationDisabledPersistently
			
 
				+    expr: es_cluster_persistent_settings{cluster_routing_allocation_enable="none"}
			
 
				+      > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster allocation has been
			
 
				+        disabled persistently for {{$labels.cluster}}. Some documents can be missing
			
 
				+        from reported results and restart will not help.'
			
 
				+      summary: Cluster allocation disabled on cluster {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchClusterRebalanceDisabledPersistently
			
 
				+    expr: es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"}
			
 
				+      > 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster rebalance has been disabled
			
 
				+        on {{$labels.cluster}}. Some documents can be missing from reported results
			
 
				+        and restart will not help.'
			
 
				+      summary: Cluster rebalance disabled on cluster {{$labels.cluster}}
			
 
				+  - alert: ElasticSearchClusterReadOnly
			
 
				+    expr: es_cluster_settings{cluster_blocks_read_only="true"} > 0
			
 
				+    for: 1m
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster {{$labels.cluster}}
			
 
				+        is in read-only mode. New documents will be rejected.'
			
 
				+      summary: Cluster {{$labels.cluster}} is in RO mode
			
 
				+  - alert: ElasticSearchClusterReadOnlyPersistently
			
 
				+    expr: es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0
			
 
				+    for: 1m
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: '{{$labels.instance}} reports that cluster {{$labels.cluster}}
			
 
				+        is in read-only mode'
			
 
				+      summary: Cluster {{$labels.cluster}} is read-only
			
 
				+  - alert: ElasticDifferentVersion
			
 
				+    expr: count(sum(es_prometheus_version) BY (pluginVersion, cluster)) BY (cluster)
			
 
				+      > 1
			
 
				+    for: 5d
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Cluster {{$labels.cluster}} reports inconsistent versions of ES
			
 
				+        plugin
			
 
				+      summary: Cluster {{$labels.cluster}} is inconsistent
			
 
				+  - alert: ElasticIngestionFailed
			
 
				+    expr: delta(es_ingest_total_failed_count[1m]) > 0
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Node {{$labels.instance}} reports failed ingestions. Some documents
			
 
				+        were lost
			
 
				+      summary: Node {{$labels.instance}} ingestion failed
			
 
				+  - alert: ElasticNoSpaceWithin24h
			
 
				+    expr: predict_linear(es_fs_free_bytes[1h], 24 * 3600) < 0
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Elasticsearch reports that space on {{ $labels.node }}({{ $labels.instance
			
 
				+        }}) will run out within 24h. Please check disk usage on that host
			
 
				+      summary: Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be out
			
 
				+        of disk space within 24h
			
 
				+  - alert: ElasticNoAvailableSpace
			
 
				+    expr: es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10
			
 
				+    for: 10m
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: Elasticsearch reports that there is only {{ $value }}% of space left on
			
 
				+        {{ $labels.path }} at {{$labels.instance}}. Please check it
			
 
				+      summary: No Available space on {{$labels.instance}}
			
 
				+  - alert: ElasticCircuitEnabled
			
 
				+    expr: delta(es_breaker_tripped[1m]) > 0
			
 
				+    labels:
			
 
				+      severity: critical
			
 
				+    annotations:
			
 
				+      description: Elasticsearch circuit breaker {{ $labels.circuit_name }} was enabled
			
 
				+        within last minute on {{$labels.node}} ({{$labels.instance}}). Looks like
			
 
				+        high memory pressure on this host and some data was discarded.
			
 
				+      summary: Circuit breaker {{ $labels.circuit_name}} enabled on {{$labels.node}}
			
 
				+  - alert: ElasticThreadpoolRejected
			
 
				+    expr: delta(es_threadpool_rejected[1m]) > 0
			
 
				+    labels:
			
 
				+      severity: warning
			
 
				+    annotations:
			
 
				+      description: Elasticsearch threadpool {{ $labels.threadpool }} rejected {{ $value
			
 
				+        }} tasks within last minute. Some jobs failed and never will be repeated,
			
 
				+        it could be a high CPU pressure or I/O errors. Please check node and its
			
 
				+        state
			
 
				+      summary: Threadpool tasks rejected {{ $labels.threadpool}} at {{$labels.node}}