Merge pull request #1661 from phong-nguyen-duy/TMA-1640

gdgate · web-flow · commit 311ced137381 · 2020-05-28T14:31:41.000+07:00
FEATURE: TMA-1640 Add alert for CPU limit hit Reviewed-by: https://github.com/danh-ung
diff --git a/k8s/charts/lcm-bricks/Chart.yaml b/k8s/charts/lcm-bricks/Chart.yaml
@@ -1,4 +1,4 @@
 apiVersion: v1
 name: lcm-bricks
 description: LCM Bricks
-version: 2.0.1
+version: 2.0.2
diff --git a/k8s/charts/lcm-bricks/templates/prometheus/alertingRules.yaml b/k8s/charts/lcm-bricks/templates/prometheus/alertingRules.yaml
@@ -20,16 +20,16 @@ data:
         expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
         labels:
           severity: warning
-          team: lcm # switch to msf in production
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
           summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
       - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
         expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
         labels:
-          severity: critical
-          team: lcm # switch to msf in production
+          severity: warning
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
@@ -40,16 +40,16 @@ data:
         expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
         labels:
           severity: warning
-          team: lcm # switch to msf in production
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 30 minutes. Investigate and/or increase memoryRequest or memoryLimit."
           summary: "{{`{{ $labels.pod }}`}} OOMKill occured"
       - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
         expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
         labels:
-          severity: critical
-          team: lcm # switch to msf in production
+          severity: warning
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
@@ -58,8 +58,8 @@ data:
         expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
         for: 5m
         labels:
-          severity: critical
-          team: lcm # switch to msf in production
+          severity: warning
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "{{`{{ $labels.pod_name }}`}} container is beeing throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
@@ -68,18 +68,28 @@ data:
         expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
         for: 5m
         labels:
-          severity: critical
-          team: lcm # switch to msf in production
+          severity: warning
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
           summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"
       - alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
         expr: count(kube_job_info{namespace="lcm"}) > 100
         labels:
-          severity: critical
-          team: lcm # switch to msf in production
+          severity: warning
+          team: lcm
           cluster_id: {{ .Values.clusterId }}
         annotations:
           description: "There is more than 100 jobs in LCM namespace. They are likely not deleted."
           summary: "There is more than 100 jobs in LCM namespace."
+      # Fires once used limits.cpu reaches the hard quota; "<=" also covers usage above a quota that was lowered afterwards.
+      - alert: "[LCM] Resource quotas hit CPU limit on cluster={{ .Values.clusterId }}"
+        expr: kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="hard"} - ignoring(type) kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="used"} <= 0
+        labels:
+          severity: critical
+          team: lcm
+          cluster_id: {{ .Values.clusterId }}
+        annotations:
+          description: "We are hitting CPU limit in LCM namespace."
+          summary: "We are hitting CPU limit in LCM namespace."