Your Name 6 роки тому
батько
коміт
d0bef0f2ed

+ 0 - 99
prometheus-operator/exporter/elasticsearch-exporter/alter.rules

@@ -1,99 +0,0 @@
-ALERT Elastic_UP
-  IF elasticsearch_up{job="elasticsearch"} != 1
-  FOR 120s
-  LABELS { severity="alert", value = "{{$value}}" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }}: Elasticsearch instance status is not 1",
-    description = "This server's Elasticsearch instance status has a value of {{ $value }}.",
-  }
-
-ALERT Elastic_Cluster_Health_RED
-  IF elasticsearch_cluster_health_status{color="red"}==1
-  FOR 300s
-  LABELS { severity="alert", value = "{{$value}}" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}",
-    description = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}.",
-  }
-
-ALERT Elastic_Cluster_Health_Yellow
-  IF elasticsearch_cluster_health_status{color="yellow"}==1
-  FOR 300s
-  LABELS { severity="alert", value = "{{$value}}" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}",
-    description = "Instance {{ $labels.instance }}: not all primary and replica shards are allocated in elasticsearch cluster {{ $labels.cluster }}.",
-  }
-
-ALERT Elasticsearch_JVM_Heap_Too_High
- IF elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} > 0.8
- FOR 15m
- LABELS { severity="alert", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }} heap usage is high",
-    description = "The heap in {{ $labels.instance }} is over 80% for 15m.",
-  }
-
-ALERT Elasticsearch_health_up
- IF elasticsearch_cluster_health_up !=1
- FOR 1m
- LABELS { severity="alert", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node: {{ $labels.instance }} last scrape of the ElasticSearch cluster health failed",
-    description = "ElasticSearch node: {{ $labels.instance }} last scrape of the ElasticSearch cluster health failed",
-  }
-
-ALERT Elasticsearch_Too_Few_Nodes_Running
-  IF elasticsearch_cluster_health_number_of_nodes < 3
-  FOR 5m
-  LABELS { severity="alert", value = "{{$value}}" }
-  ANNOTATIONS {
-    description="There are only {{$value}} < 3 ElasticSearch nodes running",
-    summary="ElasticSearch running on less than 3 nodes"
-  }
-
-ALERT Elasticsearch_Count_of_JVM_GC_Runs
- IF rate(elasticsearch_jvm_gc_collection_seconds_count{}[5m])>5
- FOR 60s
- LABELS { severity="warning", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }}: Count of JVM GC runs > 5 per sec and has a value of {{ $value }}",
-    description = "ElasticSearch node {{ $labels.instance }}: Count of JVM GC runs > 5 per sec and has a value of {{ $value }}",
-  }
-
-ALERT Elasticsearch_GC_Run_Time
- IF rate(elasticsearch_jvm_gc_collection_seconds_sum[5m])>0.3
- FOR 60s
- LABELS { severity="warning", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }}: GC run time in seconds > 0.3 sec and has a value of {{ $value }}",
-    description = "ElasticSearch node {{ $labels.instance }}: GC run time in seconds > 0.3 sec and has a value of {{ $value }}",
-  }
-
-ALERT Elasticsearch_json_parse_failures
- IF elasticsearch_cluster_health_json_parse_failures>0
- FOR 60s
- LABELS { severity="warning", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }}: json parse failures > 0 and has a value of {{ $value }}",
-    description = "ElasticSearch node {{ $labels.instance }}: json parse failures > 0 and has a value of {{ $value }}",
-  }
-
-
-ALERT Elasticsearch_breakers_tripped
- IF rate(elasticsearch_breakers_tripped{}[5m])>0
- FOR 60s
- LABELS { severity="warning", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }}: breakers tripped > 0 and has a value of {{ $value }}",
-    description = "ElasticSearch node {{ $labels.instance }}: breakers tripped > 0 and has a value of {{ $value }}",
-  }
-
-ALERT Elasticsearch_health_timed_out
- IF elasticsearch_cluster_health_timed_out>0
- FOR 60s
- LABELS { severity="warning", value = "{{$value}}" }
- ANNOTATIONS {
-    summary = "ElasticSearch node {{ $labels.instance }}: Number of cluster health checks timed out > 0 and has a value of {{ $value }}",
-    description = "ElasticSearch node {{ $labels.instance }}: Number of cluster health checks timed out > 0 and has a value of {{ $value }}",
-  }

+ 200 - 0
prometheus-operator/exporter/elasticsearch-exporter/es-alert-rules.yaml

@@ -0,0 +1,200 @@
+# Prometheus rule file: a single group, `elastic-rules.rule`, whose alerting
+# rules are evaluated over es_* metrics.
+groups:
+- name: elastic-rules.rule
+  rules:
+  # Critical alert: es_cpu_percentage above 95, sustained for 3 minutes.
+  - alert: ElasticSearchCpuCritical
+    expr: es_cpu_percentage > 95
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Critical CPU usage on {{$labels.instance}}'
+      # The trailing space before the closing quote is part of the original value.
+      description: "{{$labels.instance}} reports critical cpu usage. Please verify workload, or add another node to the cluster "
+  # Critical alert: es_status greater than 1, sustained for 1 minute.
+  - alert: ElasticSearchStatusCritical
+    expr: es_status > 1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Critical cluster status of {{$labels.cluster}} on {{$labels.instance}}'
+      description: >-
+        {{$labels.instance}} reports critical status of a ElasticSearch cluster
+        {{$labels.cluster}}. Please check additional metrics or logs.
+  # Warning alert: es_status equal to 1, sustained for 30 minutes.
+  - alert: ElasticSearchStatusWarning
+    expr: es_status == 1
+    for: 30m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'NonHealthy cluster status of {{$labels.cluster}} on {{$labels.instance}}'
+      description: >-
+        {{$labels.instance}} reports non-healthy status of ElasticSearch cluster
+        {{$labels.cluster}}. Please check additional metrics or logs to find
+        a root cause
+  # Warning alert: es_unassigned_shards above 0, sustained for 10 minutes.
+  # The description states the same 10-minute window as `for`.
+  - alert: ElasticSearchUnassigedShards
+    expr: es_unassigned_shards > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: There are unassigned shards for more than 10 minutes in {{$labels.cluster}}
+        on node {{$labels.instance}}. Please check cluster performance
+      summary: Unassigned shards on {{$labels.cluster}}
+  # Warning alert: es_shards_active_percentage below 100, sustained for 10 minutes.
+  # {{$value}} is the value of the expression, i.e. the *active* percentage, so
+  # the description reports it as such and states the same window as `for`.
+  - alert: ElasticSearchActiveShardsPercentage
+    expr: es_shards_active_percentage < 100
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Only {{$value}}% of shards are active on {{$labels.cluster}}
+        for more than 10 minutes. Results from inactive shards are unavailable
+        in returned results.
+      summary: Non-active shards on {{$labels.cluster}}
+  # Warning alert: fires as soon as es_indexing_failed_count grows within the
+  # last minute (no `for` hold time).
+  # increase() is used instead of delta() so that a counter reset is not read as
+  # a negative change. Assumes the metric is a cumulative count - TODO confirm.
+  - alert: ElasticSearchTooManyIndexFailures
+    expr: increase(es_indexing_failed_count[1m]) > 0
+    labels:
+      severity: warning
+    annotations:
+      description: There are documents indexing failures on node {{$labels.instance}}.
+        Please check logs to get more details.
+      summary: Indexing failures on {{$labels.instance}}
+  # Warning alert: es_indexng_isthrottled above 0, sustained for 10 minutes.
+  # NOTE(review): `es_indexng_isthrottled` looks like a misspelling of
+  # "indexing"; confirm the exporter's real metric name before changing the
+  # expression. The alert name is kept as-is because routing may match on it.
+  - alert: ElasticSearchIndexIsThrootled
+    expr: es_indexng_isthrottled > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Index {{$labels.index}} is throttled for more than 10 minutes. Some
+        documents can be missing from returned results.
+      summary: Index {{$labels.index}} throttled for more than 10 minutes
+  # Warning alert: es_index_unassigned_shards above 0, sustained for 10 minutes.
+  # The description states the same 10-minute window as `for`.
+  - alert: ElasticSearchIndexUnassignedShards
+    expr: es_index_unassigned_shards > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Unassigned shards on index {{$labels.index}} from {{$labels.cluster}}
+        for more than 10 minutes
+      summary: Unassigned shards on {{$labels.index}} at {{$labels.cluster}}
+  # Warning alert: es_jvm_memory_heap_used_percen above 95, sustained for 1 minute.
+  # NOTE(review): the metric name ends in `percen` - possibly a truncated
+  # `percent`; confirm against the exporter's metric list before changing it.
+  - alert: ElasticSearchJvmMemoryPercent
+    expr: es_jvm_memory_heap_used_percen > 95
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'High JVM memory consumption on ES node {{$labels.instance}}'
+      description: >-
+        {{$labels.instance}} reports high memory consumption for more
+        than 1 minute. Please check logs for more details
+  # Warning alert: es_cluster_settings series labelled
+  # cluster_routing_allocation_enable="none" above 0, sustained for 10 minutes.
+  - alert: ElasticSearchClusterAllocationDisabled
+    expr: es_cluster_settings{cluster_routing_allocation_enable="none"} > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Cluster allocation disabled on cluster {{$labels.cluster}}'
+      description: >-
+        {{$labels.instance}} reports that cluster allocation has been
+        disabled for {{$labels.cluster}}. Some documents can be missing from
+        reported results.
+  # Warning alert: es_cluster_settings series labelled
+  # cluster_routing_rebalance_enable="none" above 0, sustained for 10 minutes.
+  - alert: ElasticSearchClusterRebalanceDisabled
+    expr: es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Cluster rebalance disabled on cluster {{$labels.cluster}}'
+      description: >-
+        {{$labels.instance}} reports that cluster rebalance has been disabled
+        on {{$labels.cluster}}. Some documents can be missing from reported
+        results.
+  # Warning alert: es_cluster_persistent_settings series labelled
+  # cluster_routing_allocation_enable="none" above 0, sustained for 10 minutes.
+  - alert: ElasticSearchClusterAllocationDisabledPersistently
+    expr: es_cluster_persistent_settings{cluster_routing_allocation_enable="none"} > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Cluster allocation disabled on cluster {{$labels.cluster}}'
+      description: >-
+        {{$labels.instance}} reports that cluster allocation has been
+        disabled persistently for {{$labels.cluster}}. Some documents can be
+        missing from reported results and restart will not help.
+  # Warning alert: es_cluster_persistent_settings series labelled
+  # cluster_routing_rebalance_enable="none" above 0, sustained for 10 minutes.
+  - alert: ElasticSearchClusterRebalanceDisabledPersistently
+    expr: es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"} > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Cluster rebalance disabled on cluster {{$labels.cluster}}'
+      description: >-
+        {{$labels.instance}} reports that cluster rebalance has been disabled
+        on {{$labels.cluster}}. Some documents can be missing from reported
+        results and restart will not help.
+  # Critical alert: es_cluster_settings series labelled
+  # cluster_blocks_read_only="true" above 0, sustained for 1 minute.
+  - alert: ElasticSearchClusterReadOnly
+    expr: es_cluster_settings{cluster_blocks_read_only="true"} > 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Cluster {{$labels.cluster}} is in RO mode'
+      description: >-
+        {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
+        read-only mode. New documents will be rejected.
+  # Critical alert: es_cluster_persistent_settings series labelled
+  # cluster_blocks_read_only="true" above 0, sustained for 1 minute.
+  - alert: ElasticSearchClusterReadOnlyPersistently
+    expr: es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Cluster {{$labels.cluster}} is read-only'
+      description: >-
+        {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
+        read-only mode
+  # Warning alert: more than one distinct pluginVersion per cluster in
+  # es_prometheus_version, sustained for 5 days.
+  - alert: ElasticDifferentVersion
+    expr: >-
+      count(sum(es_prometheus_version) BY (pluginVersion, cluster)) BY (cluster)
+      > 1
+    for: 5d
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Cluster {{$labels.cluster}} is inconsistent'
+      description: >-
+        Cluster {{$labels.cluster}} reports inconsistent versions of ES
+        plugin
+  # Warning alert: fires as soon as es_ingest_total_failed_count grows within
+  # the last minute (no `for` hold time).
+  # increase() is used instead of delta() so that a counter reset is not read as
+  # a negative change. Assumes the metric is a cumulative count - TODO confirm.
+  - alert: ElasticIngestionFailed
+    expr: increase(es_ingest_total_failed_count[1m]) > 0
+    labels:
+      severity: warning
+    annotations:
+      description: Node {{$labels.instance}} reports failed ingestions. Some documents
+        were lost
+      summary: Node {{$labels.instance}} ingestion failed
+  # Warning alert: a linear extrapolation of the last hour of es_fs_free_bytes
+  # drops below 0 at 24 * 3600 seconds ahead, sustained for 10 minutes.
+  - alert: ElasticNoSpaceWithin24h
+    expr: predict_linear(es_fs_free_bytes[1h], 24 * 3600) < 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Elasticsearch reports that space on {{ $labels.node }}({{ $labels.instance
+        }}) will run out within 24h. Please check disk usage on that host
+      summary: Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be out
+        of disk space within 24h
+  # Critical alert: free bytes as a percentage of total bytes per path below 10,
+  # sustained for 10 minutes.
+  - alert: ElasticNoAvailableSpace
+    expr: 'es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10'
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      summary: 'No Available space on {{$labels.instance}}'
+      # The double space after "there" is reproduced from the original value.
+      description: >-
+        Elasticsearch reports that there  are only {{ $value }}% left on
+        {{ $labels.path }} at {{$labels.instance}}. Please check it
+  # Critical alert: fires as soon as es_breaker_tripped grows within the last
+  # minute (no `for` hold time).
+  # increase() is used instead of delta() so that a counter reset is not read as
+  # a negative change. Assumes the metric is a cumulative count - TODO confirm.
+  - alert: ElasticCircuitEnabled
+    expr: increase(es_breaker_tripped[1m]) > 0
+    labels:
+      severity: critical
+    annotations:
+      description: Elasticsearch circuit breaker {{ $labels.circuit_name }} was enabled
+        within last minute on {{$labels.node}} ({{$labels.instance}}). Looks like
+        high memory pressure on this host and some data were discarded.
+      summary: Circuit breaker {{ $labels.circuit_name}} enabled on {{$labels.node}}
+  # Warning alert: fires as soon as es_threadpool_rejected grows within the last
+  # minute (no `for` hold time).
+  # increase() is used instead of delta() so that a counter reset is not read as
+  # a negative change. Assumes the metric is a cumulative count - TODO confirm.
+  - alert: ElasticThreadpoolRejected
+    expr: increase(es_threadpool_rejected[1m]) > 0
+    labels:
+      severity: warning
+    annotations:
+      description: Elasticsearch threadpool {{ $labels.threadpool }} rejected {{ $value
+        }} tasks within last minute. Some jobs failed and never will be repeated,
+        it could be a high CPU pressure or I/O errors. Please check node and its
+        state
+      summary: Threadpool tasks rejected {{ $labels.threadpool}} at {{$labels.node}}