Skip to content

Commit 085b8a0

Browse files
Merge pull request #30862 from xueqzhan/double-pod-termination
OCPBUGS-78016: Fix double counting of pod restart events
2 parents 1485a89 + 3b1e7b0 commit 085b8a0

1 file changed

Lines changed: 33 additions & 23 deletions

File tree

pkg/monitortests/node/watchpods/collection.go

Lines changed: 33 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -218,29 +218,39 @@ func startPodMonitoring(ctx context.Context, recorderWriter monitorapi.RecorderW
218218
// on the event stream
219219

220220
case lastTerminated && oldContainerStatus.LastTerminationState.Terminated == nil:
221-
// if we are transitioning to a terminated state
222-
if containerStatus.LastTerminationState.Terminated.ExitCode != 0 {
223-
intervals = append(intervals,
224-
monitorapi.NewInterval(monitorapi.SourcePodMonitor, monitorapi.Error).
225-
Locator(monitorapi.NewLocator().ContainerFromPod(pod, containerName)).
226-
Message(monitorapi.NewMessage().
227-
Reason(monitorapi.ContainerReasonContainerExit).
228-
WithAnnotation(monitorapi.AnnotationContainerExitCode, fmt.Sprintf("%d", containerStatus.LastTerminationState.Terminated.ExitCode)).
229-
Cause(containerStatus.LastTerminationState.Terminated.Reason).
230-
HumanMessage(containerStatus.LastTerminationState.Terminated.Message),
231-
).BuildNow(),
232-
)
233-
} else {
234-
intervals = append(intervals,
235-
monitorapi.NewInterval(monitorapi.SourcePodMonitor, monitorapi.Info).
236-
Locator(monitorapi.NewLocator().ContainerFromPod(pod, containerName)).
237-
Message(monitorapi.NewMessage().
238-
Reason(monitorapi.ContainerReasonContainerExit).
239-
WithAnnotation(monitorapi.AnnotationContainerExitCode, "0").
240-
Cause(containerStatus.LastTerminationState.Terminated.Reason).
241-
HumanMessage(containerStatus.LastTerminationState.Terminated.Message)).
242-
BuildNow(),
243-
)
221+
// if we are transitioning to a terminated state in LastTerminationState
222+
// Check if we already recorded this exit when it was in State.Terminated
223+
// If oldContainerStatus.State.Terminated matches the current LastTerminationState.Terminated,
224+
// then we already recorded this exit and should skip to avoid double-counting
225+
alreadyRecorded := oldContainerStatus.State.Terminated != nil &&
226+
containerStatus.State.Terminated == nil &&
227+
oldContainerStatus.State.Terminated.FinishedAt.Equal(&containerStatus.LastTerminationState.Terminated.FinishedAt)
228+
229+
if !alreadyRecorded {
230+
// We missed the original exit event, record it now as a safety net
231+
if containerStatus.LastTerminationState.Terminated.ExitCode != 0 {
232+
intervals = append(intervals,
233+
monitorapi.NewInterval(monitorapi.SourcePodMonitor, monitorapi.Error).
234+
Locator(monitorapi.NewLocator().ContainerFromPod(pod, containerName)).
235+
Message(monitorapi.NewMessage().
236+
Reason(monitorapi.ContainerReasonContainerExit).
237+
WithAnnotation(monitorapi.AnnotationContainerExitCode, fmt.Sprintf("%d", containerStatus.LastTerminationState.Terminated.ExitCode)).
238+
Cause(containerStatus.LastTerminationState.Terminated.Reason).
239+
HumanMessage(containerStatus.LastTerminationState.Terminated.Message),
240+
).BuildNow(),
241+
)
242+
} else {
243+
intervals = append(intervals,
244+
monitorapi.NewInterval(monitorapi.SourcePodMonitor, monitorapi.Info).
245+
Locator(monitorapi.NewLocator().ContainerFromPod(pod, containerName)).
246+
Message(monitorapi.NewMessage().
247+
Reason(monitorapi.ContainerReasonContainerExit).
248+
WithAnnotation(monitorapi.AnnotationContainerExitCode, "0").
249+
Cause(containerStatus.LastTerminationState.Terminated.Reason).
250+
HumanMessage(containerStatus.LastTerminationState.Terminated.Message)).
251+
BuildNow(),
252+
)
253+
}
244254
}
245255

246256
case currentTerminated && oldContainerStatus.State.Terminated == nil:

0 commit comments

Comments
 (0)