Skip to content

Commit 311ced1

Browse files
author
gdgate
authored
Merge pull request #1661 from phong-nguyen-duy/TMA-1640
FEATURE: TMA-1640 Add alert for CPU limit hit Reviewed-by: https://github.com/danh-ung
2 parents 62ee494 + 80106d6 commit 311ced1

2 files changed

Lines changed: 23 additions & 13 deletions

File tree

k8s/charts/lcm-bricks/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
apiVersion: v1
22
name: lcm-bricks
33
description: LCM Bricks
4-
version: 2.0.1
4+
version: 2.0.2

k8s/charts/lcm-bricks/templates/prometheus/alertingRules.yaml

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -20,16 +20,16 @@ data:
2020
expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
2121
labels:
2222
severity: warning
23-
team: lcm # switch to msf in production
23+
team: lcm
2424
cluster_id: {{ .Values.clusterId }}
2525
annotations:
2626
description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
2727
summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
2828
- alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
2929
expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
3030
labels:
31-
severity: critical
32-
team: lcm # switch to msf in production
31+
severity: warning
32+
team: lcm
3333
cluster_id: {{ .Values.clusterId }}
3434
annotations:
3535
description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
@@ -40,16 +40,16 @@ data:
4040
expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
4141
labels:
4242
severity: warning
43-
team: lcm # switch to msf in production
43+
team: lcm
4444
cluster_id: {{ .Values.clusterId }}
4545
annotations:
4646
description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 30 minutes. Investigate and/or increase memoryRequest or memoryLimit."
4747
summary: "{{`{{ $labels.pod }}`}} OOMKill occured"
4848
- alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
4949
expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
5050
labels:
51-
severity: critical
52-
team: lcm # switch to msf in production
51+
severity: warning
52+
team: lcm
5353
cluster_id: {{ .Values.clusterId }}
5454
annotations:
5555
description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
@@ -58,8 +58,8 @@ data:
5858
expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
5959
for: 5m
6060
labels:
61-
severity: critical
62-
team: lcm # switch to msf in production
61+
severity: warning
62+
team: lcm
6363
cluster_id: {{ .Values.clusterId }}
6464
annotations:
6565
description: "{{`{{ $labels.pod_name }}`}} container is beeing throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
@@ -68,18 +68,28 @@ data:
6868
expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
6969
for: 5m
7070
labels:
71-
severity: critical
72-
team: lcm # switch to msf in production
71+
severity: warning
72+
team: lcm
7373
cluster_id: {{ .Values.clusterId }}
7474
annotations:
7575
description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
7676
summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"
7777
- alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
7878
expr: count(kube_job_info{namespace="lcm"}) > 100
7979
labels:
80-
severity: critical
81-
team: lcm # switch to msf in production
80+
severity: warning
81+
team: lcm
8282
cluster_id: {{ .Values.clusterId }}
8383
annotations:
8484
description: "There is more than 100 jobs in LCM namespace. They are likely not deleted."
8585
summary: "There is more than 100 jobs in LCM namespace."
86+
- alert: "[LCM] Resource quotas hit CPU limit on cluster={{ .Values.clusterId }}"
87+
expr: kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="hard"} - ignoring(type) kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="used"} == 0
88+
labels:
89+
severity: critical
90+
team: lcm
91+
cluster_id: {{ .Values.clusterId }}
92+
annotations:
93+
description: "We are hitting CPU limit in LCM namespace."
94+
summary: "We are hitting CPU limit in LCM namespace."
95+

0 commit comments

Comments
 (0)