# es-alert-rules.yaml
  1. groups:
  2. - name: elastic-rules.rule
  3. rules:
  # Critical when es_cpu_percentage stays above 95 for 3 minutes.
  - alert: ElasticSearchCpuCritical
    expr: es_cpu_percentage > 95
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: Critical CPU usage on {{$labels.instance}}
      # NOTE(review): the text ends with a trailing space inside the quotes;
      # preserved here so the rendered annotation is unchanged.
      description: "{{$labels.instance}} reports critical cpu usage. Please
        verify workload, or add another node to the cluster "
  # Critical when es_status has been greater than 1 for one minute.
  - alert: ElasticSearchStatusCritical
    expr: es_status > 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Critical cluster status of {{$labels.cluster}} on {{$labels.instance}}
      description: >-
        {{$labels.instance}} reports critical status of a ElasticSearch
        cluster {{$labels.cluster}}. Please check additional metrics or logs.
  # Warning when es_status has been exactly 1 for 30 minutes.
  - alert: ElasticSearchStatusWarning
    expr: es_status == 1
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: NonHealthy cluster status of {{$labels.cluster}} on {{$labels.instance}}
      description: >-
        {{$labels.instance}} reports non-healthy status of ElasticSearch
        cluster {{$labels.cluster}}. Please check additional metrics or logs
        to find a root cause
  # Warning when es_unassigned_shards stays above zero for 10 minutes.
  # NOTE(review): the alert name is misspelled ("Unassiged"); kept unchanged
  # because routing or silences elsewhere may match on it.
  - alert: ElasticSearchUnassigedShards
    expr: es_unassigned_shards > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Unassigned shards on {{$labels.cluster}}
      # The stated duration matches the 10m `for` clause of this rule.
      description: >-
        There are unassigned shards for more than 10 minutes in
        {{$labels.cluster}} on node {{$labels.instance}}. Please check
        cluster performance
  # Warning when es_shards_active_percentage stays below 100 for 10 minutes.
  - alert: ElasticSearchActiveShardsPercentage
    expr: es_shards_active_percentage < 100
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Non-active shards on {{$labels.cluster}}
      # $value is the expression result, i.e. the ACTIVE percentage, so the
      # text reports it as such; the duration matches the 10m `for` clause.
      description: >-
        Only {{$value}}% of shards are active on {{$labels.cluster}} for more
        than 10 minutes. Results from the inactive shards are unavailable in
        returned results.
  # Warning as soon as es_indexing_failed_count changed by a positive amount
  # over the last minute; there is no `for` hold period.
  # NOTE(review): if this series is a counter, increase() is the function
  # Prometheus recommends instead of delta() - confirm the metric type.
  - alert: ElasticSearchTooManyIndexFailures
    expr: delta(es_indexing_failed_count[1m]) > 0
    labels:
      severity: warning
    annotations:
      summary: Indexing failures on {{$labels.instance}}
      description: >-
        There are documents indexing failures on node {{$labels.instance}}.
        Please check logs to get more details.
  # Warning when the throttling metric stays above zero for 10 minutes.
  # NOTE(review): "es_indexng_isthrottled" looks misspelled ("indexng"); left
  # unchanged because the exporter's real metric name is not visible here -
  # verify, otherwise this alert can never fire.
  # NOTE(review): the alert name is misspelled too ("Throotled"); kept
  # unchanged because routing or silences elsewhere may match on it.
  - alert: ElasticSearchIndexIsThrootled
    expr: es_indexng_isthrottled > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Index {{$labels.index}} throttled for more than 10 minutes
      description: >-
        Index {{$labels.index}} is throttled for more than 10 minutes. Some
        documents can be missing from returned results.
  # Warning when es_index_unassigned_shards stays above zero for 10 minutes.
  - alert: ElasticSearchIndexUnassignedShards
    expr: es_index_unassigned_shards > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Unassigned shards on {{$labels.index}} at {{$labels.cluster}}
      # The stated duration matches the 10m `for` clause of this rule.
      description: >-
        Unassigned shards on index {{$labels.index}} from {{$labels.cluster}}
        for more than 10 minutes
  # Warning when the JVM heap usage metric stays above 95 for one minute.
  # NOTE(review): "es_jvm_memory_heap_used_percen" looks truncated (missing a
  # final "t"); left unchanged because the exporter's real metric name is not
  # visible here - verify, otherwise this alert can never fire.
  - alert: ElasticSearchJvmMemoryPercent
    expr: es_jvm_memory_heap_used_percen > 95
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: High JVM memory consumption on ES node {{$labels.instance}}
      description: >-
        {{$labels.instance}} reports high memory consumption for more than
        1 minute. Please check logs for more details
  # Warning when es_cluster_settings reports
  # cluster_routing_allocation_enable="none" for 10 minutes.
  - alert: ElasticSearchClusterAllocationDisabled
    expr: es_cluster_settings{cluster_routing_allocation_enable="none"} > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Cluster allocation disabled on cluster {{$labels.cluster}}
      description: >-
        {{$labels.instance}} reports that cluster allocation has been
        disabled for {{$labels.cluster}}. Some documents can be missing from
        reported results.
  # Warning when es_cluster_settings reports
  # cluster_routing_rebalance_enable="none" for 10 minutes.
  - alert: ElasticSearchClusterRebalanceDisabled
    expr: es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Cluster rebalance disabled on cluster {{$labels.cluster}}
      description: >-
        {{$labels.instance}} reports that cluster rebalance has been disabled
        on {{$labels.cluster}}. Some documents can be missing from reported
        results.
  # Warning when es_cluster_persistent_settings reports
  # cluster_routing_allocation_enable="none" for 10 minutes.
  - alert: ElasticSearchClusterAllocationDisabledPersistently
    expr: es_cluster_persistent_settings{cluster_routing_allocation_enable="none"} > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Cluster allocation disabled on cluster {{$labels.cluster}}
      description: >-
        {{$labels.instance}} reports that cluster allocation has been
        disabled persistently for {{$labels.cluster}}. Some documents can be
        missing from reported results and restart will not help.
  # Warning when es_cluster_persistent_settings reports
  # cluster_routing_rebalance_enable="none" for 10 minutes.
  - alert: ElasticSearchClusterRebalanceDisabledPersistently
    expr: es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"} > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Cluster rebalance disabled on cluster {{$labels.cluster}}
      description: >-
        {{$labels.instance}} reports that cluster rebalance has been disabled
        on {{$labels.cluster}}. Some documents can be missing from reported
        results and restart will not help.
  # Critical when es_cluster_settings reports cluster_blocks_read_only="true"
  # for one minute.
  - alert: ElasticSearchClusterReadOnly
    expr: es_cluster_settings{cluster_blocks_read_only="true"} > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Cluster {{$labels.cluster}} is in RO mode
      description: >-
        {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
        read-only mode. New documents will be rejected.
  # Critical when es_cluster_persistent_settings reports
  # cluster_blocks_read_only="true" for one minute.
  - alert: ElasticSearchClusterReadOnlyPersistently
    expr: es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Cluster {{$labels.cluster}} is read-only
      description: >-
        {{$labels.instance}} reports that cluster {{$labels.cluster}} is in
        read-only mode
  # Warning when, for 5 days, a cluster exposes more than one distinct
  # pluginVersion label value on es_prometheus_version.
  - alert: ElasticDifferentVersion
    expr: count(sum(es_prometheus_version) BY (pluginVersion, cluster)) BY (cluster) > 1
    for: 5d
    labels:
      severity: warning
    annotations:
      summary: Cluster {{$labels.cluster}} is inconsistent
      description: >-
        Cluster {{$labels.cluster}} reports inconsistent versions of ES
        plugin
  # Warning as soon as es_ingest_total_failed_count changed by a positive
  # amount over the last minute; there is no `for` hold period.
  # NOTE(review): if this series is a counter, increase() is the function
  # Prometheus recommends instead of delta() - confirm the metric type.
  - alert: ElasticIngestionFailed
    expr: delta(es_ingest_total_failed_count[1m]) > 0
    labels:
      severity: warning
    annotations:
      summary: Node {{$labels.instance}} ingestion failed
      description: >-
        Node {{$labels.instance}} reports failed ingestions. Some documents
        were lost
  # Warning when a linear projection of the last hour of es_fs_free_bytes
  # drops below zero 24 hours (24 * 3600 seconds) ahead, held for 10 minutes.
  - alert: ElasticNoSpaceWithin24h
    expr: predict_linear(es_fs_free_bytes[1h], 24 * 3600) < 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: >-
        Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be out
        of disk space within 24h
      # Restores the missing word: space "will run out", not "will run".
      description: >-
        Elasticsearch reports that space on
        {{ $labels.node }}({{ $labels.instance }}) will run out within 24h.
        Please check disk usage on that host
  # Critical when free bytes as a percentage of total bytes on a path stays
  # below 10 for 10 minutes.
  - alert: ElasticNoAvailableSpace
    expr: es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: No Available space on {{$labels.instance}}
      description: >-
        Elasticsearch reports that there are only {{ $value }}% left on
        {{ $labels.path }} at {{$labels.instance}}. Please check it
  # Critical as soon as es_breaker_tripped changed by a positive amount over
  # the last minute; there is no `for` hold period.
  # NOTE(review): if this series is a counter, increase() is the function
  # Prometheus recommends instead of delta() - confirm the metric type.
  - alert: ElasticCircuitEnabled
    expr: delta(es_breaker_tripped[1m]) > 0
    labels:
      severity: critical
    annotations:
      summary: Circuit breaker {{ $labels.circuit_name}} enabled on {{$labels.node}}
      description: >-
        Elasticsearch circuit breaker {{ $labels.circuit_name }} was enabled
        within last minute on {{$labels.node}} ({{$labels.instance}}). Looks
        like high memory pressure on this host and some data were discarded.
  # Warning as soon as es_threadpool_rejected changed by a positive amount
  # over the last minute; there is no `for` hold period.
  # NOTE(review): if this series is a counter, increase() is the function
  # Prometheus recommends instead of delta() - confirm the metric type.
  - alert: ElasticThreadpoolRejected
    expr: delta(es_threadpool_rejected[1m]) > 0
    labels:
      severity: warning
    annotations:
      summary: Threadpool tasks rejected {{ $labels.threadpool}} at {{$labels.node}}
      description: >-
        Elasticsearch threadpool {{ $labels.threadpool }} rejected
        {{ $value }} tasks within last minute. Some jobs failed and never
        will be repeated, it could be a high CPU pressure or I/O errors.
        Please check node and its state