Procházet zdrojové kódy

add README and teardown

dotbalo před 6 roky
rodič
revize
dc3d9e47fa

+ 5 - 0
prometheus-operator/README.md

@@ -0,0 +1,5 @@
+````
+  1. Replace the etcd SSL file paths in the deploy script with the paths used by your cluster.
+  2. Set tlsConfig and the etcd addresses in manifests/prometheus/prometheus-etcd.yaml to match your etcd.
+  3. Install with ./deploy; uninstall with ./teardown.
+````

+ 2 - 0
prometheus-operator/deploy

@@ -15,6 +15,8 @@ kubectl create namespace "$NAMESPACE"
 
 
 kubectl create secret generic  alertmanager-main --from-file=alertmanager.yaml --from-file=mail-template.tmpl -n monitoring
 kubectl create secret generic  alertmanager-main --from-file=alertmanager.yaml --from-file=mail-template.tmpl -n monitoring
 
 
+kubectl -n monitoring create secret generic etcd-certs --from-file=/etc/kubernetes/pki/etcd/peer.crt   --from-file=/etc/kubernetes/pki/etcd/peer.key  --from-file=/etc/kubernetes/pki/etcd/ca.crt
+
 kctl() {
 kctl() {
     kubectl --namespace "$NAMESPACE" "$@"
     kubectl --namespace "$NAMESPACE" "$@"
 }
 }

+ 35 - 0
prometheus-operator/manifests/prometheus/prometheus-k8s-rules.yaml

@@ -643,3 +643,38 @@ data:
         annotations:
         annotations:
           description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
           description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
           summary: Prometheus has many samples rejected
           summary: Prometheus has many samples rejected
+
+  noah_pod.rules.yaml: |+
+    # Container-level alerts on CPU, memory and network receive, aggregated by the "name" label.
+    groups:
+    - name: noah_pod.rules
+      rules:
+      - alert: Pod_all_cpu_usage
+        # 5m rate of CPU seconds summed per name, times 100; fires above 10. NOTE(review): message previously said 75% - confirm the intended threshold.
+        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
+        for: 5m
+        labels:
+          severity: critical
+          service: pods
+        annotations:
+          description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
+          summary: Dev CPU 负载告警
+      - alert: Pod_all_memory_usage
+        # container_memory_usage_bytes is a gauge, so its value is compared directly (irate is for counters);
+        # the threshold is 2 GiB expressed in bytes, matching the description below.
+        expr: avg by(name)(container_memory_usage_bytes{name!=""}) > 1024*1024*1024*2
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
+          summary: Dev Memory 负载告警
+      - alert: Pod_all_network_receive_usage
+        # Per-second receive rate in bytes summed per name; 1024*1024*50 is 50 MiB per second.
+        expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
+          summary: network_receive 负载告警

+ 40 - 0
prometheus-operator/teardown

@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Tear down the monitoring stack: deletes the resources defined under manifests/
+# and the secrets that ./deploy creates with `kubectl create secret`.
+
+# Default to the user's kubeconfig when KUBECONFIG is not set.
+if [ -z "${KUBECONFIG}" ]; then
+    export KUBECONFIG=~/.kube/config
+fi
+
+# CAUTION - NAMESPACE must match its value when deploy script was run.
+# Some resources are always deployed to the monitoring namespace.
+
+if [ -z "${NAMESPACE}" ]; then
+    NAMESPACE=monitoring
+fi
+
+# Run kubectl against the namespace selected above.
+kctl() {
+    kubectl --namespace "$NAMESPACE" "$@"
+}
+
+kctl delete -f manifests/node-exporter
+kctl delete -f manifests/kube-state-metrics
+kctl delete -f manifests/grafana
+# The roles and role-bindings manifests are excluded here and deleted below without a namespace override.
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
+kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
+kctl delete -f manifests/alertmanager
+
+# deploy creates these secrets in the monitoring namespace from local files; remove them
+# so that a later ./deploy does not hit an already-exists error on `kubectl create secret`.
+kubectl --namespace monitoring delete secret alertmanager-main etcd-certs --ignore-not-found
+
+# Hack: wait a bit to let the controller delete the deployed Prometheus server.
+sleep 5
+
+kctl delete -f manifests/prometheus-operator
+