Skip to content

Commit 72d25a9

Browse files
ispeakc0de and Karthik Satchitanand authored
feat(container-kill): Adding the container-kill experiment (#50)
* feat(container-kill): Adding the container-kill experiment Signed-off-by: shubhamchaudhary <shubham.chaudhary@mayadata.io> * Update container-kill-k8s-job.yml Co-authored-by: Karthik Satchitanand <karthik.s@mayadata.io>
1 parent ac24a40 commit 72d25a9

File tree

12 files changed

+958
-19
lines changed

12 files changed

+958
-19
lines changed

build/generate_go_binary

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,7 @@ go build -o build/_output/kubelet-service-kill ./experiments/generic/kubelet-ser
2222
go build -o build/_output/node-memory-hog ./experiments/generic/node-memory-hog
2323
# Building go binaries for node_cpu_hog experiment
2424
go build -o build/_output/node-cpu-hog ./experiments/generic/node-cpu-hog
25+
# Building go binaries for container_kill experiment
26+
go build -o build/_output/container-kill ./experiments/generic/container-kill
2527
# Building go binaries for disk_fill experiment
26-
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
28+
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
package container_kill
2+
3+
import (
4+
"math/rand"
5+
"strconv"
6+
"time"
7+
8+
clients "github.com/litmuschaos/litmus-go/pkg/clients"
9+
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/container-kill/types"
10+
"github.com/litmuschaos/litmus-go/pkg/log"
11+
"github.com/litmuschaos/litmus-go/pkg/math"
12+
"github.com/litmuschaos/litmus-go/pkg/status"
13+
"github.com/litmuschaos/litmus-go/pkg/types"
14+
"github.com/openebs/maya/pkg/util/retry"
15+
"github.com/pkg/errors"
16+
"github.com/sirupsen/logrus"
17+
apiv1 "k8s.io/api/core/v1"
18+
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+
)
20+
21+
//PrepareContainerKill contains the prepration steps before chaos injection
22+
func PrepareContainerKill(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails) error {
23+
24+
//Select application pod and node name for the container-kill
25+
appName, appNodeName, err := GetApplicationPod(experimentsDetails, clients)
26+
if err != nil {
27+
return errors.Errorf("Unable to get the application name and application nodename due to, err: %v", err)
28+
}
29+
30+
//Get the target container name of the application pod
31+
if experimentsDetails.TargetContainer == "" {
32+
experimentsDetails.TargetContainer, err = GetTargetContainer(experimentsDetails, appName, clients)
33+
if err != nil {
34+
return errors.Errorf("Unable to get the target container name due to, err: %v", err)
35+
}
36+
}
37+
38+
log.InfoWithValues("[Info]: Details of application under chaos injection", logrus.Fields{
39+
"PodName": appName,
40+
"NodeName": appNodeName,
41+
"ContainerName": experimentsDetails.TargetContainer,
42+
})
43+
44+
//Getting the iteration count for the container-kill
45+
GetIterations(experimentsDetails)
46+
47+
// generating a unique string which can be appended with the helper pod name & labels for the uniquely identification
48+
experimentsDetails.RunID = GetRunID()
49+
50+
// Getting the serviceAccountName, need permission inside helper pod to create the events
51+
if experimentsDetails.ChaosServiceAccount == "" {
52+
err = GetServiceAccount(experimentsDetails, clients)
53+
if err != nil {
54+
return errors.Errorf("Unable to get the serviceAccountName, err: %v", err)
55+
}
56+
}
57+
58+
//Waiting for the ramp time before chaos injection
59+
if experimentsDetails.RampTime != 0 {
60+
log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
61+
waitForDuration(experimentsDetails.RampTime)
62+
}
63+
64+
//GetRestartCount return the restart count of target container
65+
restartCountBefore, err := GetRestartCount(experimentsDetails, appName, clients)
66+
if err != nil {
67+
return err
68+
}
69+
70+
log.Infof("restartCount of target container before chaos injection: %v", strconv.Itoa(restartCountBefore))
71+
72+
// creating the helper pod to perform container kill chaos
73+
err = CreateHelperPod(experimentsDetails, clients, appName, appNodeName)
74+
if err != nil {
75+
return errors.Errorf("Unable to create the helper pod, err: %v", err)
76+
}
77+
78+
//checking the status of the helper pod, wait till the helper pod comes to running state else fail the experiment
79+
log.Info("[Status]: Checking the status of the helper pod")
80+
err = status.CheckApplicationStatus(experimentsDetails.ChaosNamespace, "name=container-kill-"+experimentsDetails.RunID, clients)
81+
if err != nil {
82+
return errors.Errorf("helper pod is not in running state, err: %v", err)
83+
}
84+
85+
// Recording the chaos start timestamp
86+
ChaosStartTimeStamp := time.Now().Unix()
87+
88+
// Wait till the completion of the helper pod
89+
// set an upper limit for the waiting time
90+
log.Info("[Wait]: waiting till the completion of the helper pod")
91+
err = status.WaitForCompletion(experimentsDetails.ChaosNamespace, "name=container-kill-"+experimentsDetails.RunID, clients, experimentsDetails.ChaosDuration+experimentsDetails.ChaosInterval+60)
92+
if err != nil {
93+
return err
94+
}
95+
96+
//ChaosCurrentTimeStamp contains the current timestamp
97+
ChaosCurrentTimeStamp := time.Now().Unix()
98+
//ChaosDiffTimeStamp contains the difference of current timestamp and start timestamp
99+
//It will helpful to track the total chaos duration
100+
chaosDiffTimeStamp := ChaosCurrentTimeStamp - ChaosStartTimeStamp
101+
102+
if int(chaosDiffTimeStamp) < experimentsDetails.ChaosDuration {
103+
return errors.Errorf("The helper pod failed, check the logs of helper pod for more details")
104+
}
105+
106+
//Deleting the helper pod for container-kill chaos
107+
log.Info("[Cleanup]: Deleting the helper pod")
108+
err = DeleteHelperPod(experimentsDetails, clients)
109+
if err != nil {
110+
return errors.Errorf("Unable to delete the helper pod, err: %v", err)
111+
}
112+
113+
// It will verify that the restart count of container should increase after chaos injection
114+
err = VerifyRestartCount(experimentsDetails, appName, clients, restartCountBefore)
115+
if err != nil {
116+
return errors.Errorf("Target container is not restarted , err: %v", err)
117+
}
118+
119+
//Waiting for the ramp time after chaos injection
120+
if experimentsDetails.RampTime != 0 {
121+
log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
122+
waitForDuration(experimentsDetails.RampTime)
123+
}
124+
return nil
125+
}
126+
127+
//GetIterations derive the iterations value from given parameters
128+
func GetIterations(experimentsDetails *experimentTypes.ExperimentDetails) {
129+
var Iterations int
130+
if experimentsDetails.ChaosInterval != 0 {
131+
Iterations = experimentsDetails.ChaosDuration / experimentsDetails.ChaosInterval
132+
}
133+
experimentsDetails.Iterations = math.Maximum(Iterations, 1)
134+
135+
}
136+
137+
//waitForDuration waits for the given time duration (in seconds)
func waitForDuration(duration int) {
	sleepTime := time.Duration(duration) * time.Second
	time.Sleep(sleepTime)
}
141+
142+
// GetRunID generate a random string
// The 6-character lowercase suffix is appended to the helper pod's name
// and labels so each run can be uniquely identified.
func GetRunID() string {
	const alphabet = "abcdefghijklmnopqrstuvwxyz"
	id := make([]rune, 6)
	for i := 0; i < len(id); i++ {
		id[i] = rune(alphabet[rand.Intn(len(alphabet))])
	}
	return string(id)
}
151+
152+
// GetServiceAccount find the serviceAccountName for the helper pod
153+
func GetServiceAccount(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) error {
154+
pod, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.ChaosNamespace).Get(experimentsDetails.ChaosPodName, v1.GetOptions{})
155+
if err != nil {
156+
return err
157+
}
158+
experimentsDetails.ChaosServiceAccount = pod.Spec.ServiceAccountName
159+
return nil
160+
}
161+
162+
//GetApplicationPod will select a random replica of application pod for chaos
163+
//It will also get the node name of the application pod
164+
func GetApplicationPod(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) (string, string, error) {
165+
podList, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.AppNS).List(v1.ListOptions{LabelSelector: experimentsDetails.AppLabel})
166+
if err != nil || len(podList.Items) == 0 {
167+
return "", "", errors.Wrapf(err, "Fail to get the application pod in %v namespace", experimentsDetails.AppNS)
168+
}
169+
170+
rand.Seed(time.Now().Unix())
171+
randomIndex := rand.Intn(len(podList.Items))
172+
applicationName := podList.Items[randomIndex].Name
173+
nodeName := podList.Items[randomIndex].Spec.NodeName
174+
175+
return applicationName, nodeName, nil
176+
}
177+
178+
//GetRestartCount return the restart count of target container
179+
func GetRestartCount(experimentsDetails *experimentTypes.ExperimentDetails, podName string, clients clients.ClientSets) (int, error) {
180+
pod, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.AppNS).Get(podName, v1.GetOptions{})
181+
if err != nil {
182+
return 0, err
183+
}
184+
restartCount := 0
185+
for _, container := range pod.Status.ContainerStatuses {
186+
if container.Name == experimentsDetails.TargetContainer {
187+
restartCount = int(container.RestartCount)
188+
break
189+
}
190+
}
191+
return restartCount, nil
192+
}
193+
194+
//VerifyRestartCount verify the restart count of target container that it is restarted or not after chaos injection
195+
func VerifyRestartCount(experimentsDetails *experimentTypes.ExperimentDetails, podName string, clients clients.ClientSets, restartCountBefore int) error {
196+
197+
restartCountAfter := 0
198+
err := retry.
199+
Times(90).
200+
Wait(1 * time.Second).
201+
Try(func(attempt uint) error {
202+
pod, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.AppNS).Get(podName, v1.GetOptions{})
203+
if err != nil {
204+
return errors.Errorf("Unable to get the application pod, err: %v", err)
205+
}
206+
for _, container := range pod.Status.ContainerStatuses {
207+
if container.Name == experimentsDetails.TargetContainer {
208+
restartCountAfter = int(container.RestartCount)
209+
break
210+
}
211+
}
212+
if restartCountAfter <= restartCountBefore {
213+
return errors.Errorf("Target container is not restarted")
214+
}
215+
return nil
216+
})
217+
218+
log.Infof("restartCount of target container after chaos injection: %v", strconv.Itoa(restartCountAfter))
219+
220+
return err
221+
222+
}
223+
224+
//GetTargetContainer will fetch the conatiner name from application pod
225+
//This container will be used as target container
226+
func GetTargetContainer(experimentsDetails *experimentTypes.ExperimentDetails, appName string, clients clients.ClientSets) (string, error) {
227+
pod, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.AppNS).Get(appName, v1.GetOptions{})
228+
if err != nil {
229+
return "", errors.Wrapf(err, "Fail to get the application pod status, due to:%v", err)
230+
}
231+
232+
return pod.Spec.Containers[0].Name, nil
233+
}
234+
235+
// CreateHelperPod derive the attributes for helper pod and create the helper pod
236+
func CreateHelperPod(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, podName, nodeName string) error {
237+
238+
privilegedEnable := true
239+
240+
helperPod := &apiv1.Pod{
241+
ObjectMeta: v1.ObjectMeta{
242+
Name: "container-kill-" + experimentsDetails.RunID,
243+
Namespace: experimentsDetails.ChaosNamespace,
244+
Labels: map[string]string{
245+
"app": "container-kill",
246+
"name": "container-kill-" + experimentsDetails.RunID,
247+
"chaosUID": string(experimentsDetails.ChaosUID),
248+
},
249+
},
250+
Spec: apiv1.PodSpec{
251+
ServiceAccountName: experimentsDetails.ChaosServiceAccount,
252+
RestartPolicy: apiv1.RestartPolicyNever,
253+
NodeName: nodeName,
254+
Volumes: []apiv1.Volume{
255+
{
256+
Name: "cri-socket",
257+
VolumeSource: apiv1.VolumeSource{
258+
HostPath: &apiv1.HostPathVolumeSource{
259+
Path: experimentsDetails.ContainerPath,
260+
},
261+
},
262+
},
263+
{
264+
Name: "cri-config",
265+
VolumeSource: apiv1.VolumeSource{
266+
HostPath: &apiv1.HostPathVolumeSource{
267+
Path: "/etc/crictl.yaml",
268+
},
269+
},
270+
},
271+
},
272+
Containers: []apiv1.Container{
273+
{
274+
Name: "container-kill",
275+
Image: experimentsDetails.LIBImage,
276+
ImagePullPolicy: apiv1.PullAlways,
277+
Command: []string{
278+
"bin/bash",
279+
},
280+
Args: []string{
281+
"-c",
282+
"./experiments/container-kill",
283+
},
284+
Env: GetPodEnv(experimentsDetails, podName),
285+
VolumeMounts: []apiv1.VolumeMount{
286+
{
287+
Name: "cri-socket",
288+
MountPath: experimentsDetails.ContainerPath,
289+
},
290+
{
291+
Name: "cri-config",
292+
MountPath: "/etc/crictl.yaml",
293+
},
294+
},
295+
SecurityContext: &apiv1.SecurityContext{
296+
Privileged: &privilegedEnable,
297+
},
298+
},
299+
},
300+
},
301+
}
302+
303+
_, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.ChaosNamespace).Create(helperPod)
304+
return err
305+
306+
}
307+
308+
//DeleteHelperPod deletes the helper pod and wait until it got terminated
309+
func DeleteHelperPod(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) error {
310+
311+
err := clients.KubeClient.CoreV1().Pods(experimentsDetails.ChaosNamespace).Delete("container-kill-"+experimentsDetails.RunID, &v1.DeleteOptions{})
312+
313+
if err != nil {
314+
return err
315+
}
316+
317+
// waiting for the termination of the pod
318+
err = retry.
319+
Times(90).
320+
Wait(1 * time.Second).
321+
Try(func(attempt uint) error {
322+
podSpec, err := clients.KubeClient.CoreV1().Pods(experimentsDetails.ChaosNamespace).List(v1.ListOptions{LabelSelector: "name=container-kill-" + experimentsDetails.RunID})
323+
if err != nil || len(podSpec.Items) != 0 {
324+
return errors.Errorf("Pod is not deleted yet, err: %v", err)
325+
}
326+
return nil
327+
})
328+
329+
return err
330+
}
331+
332+
// GetPodEnv derive all the env required for the helper pod
333+
func GetPodEnv(experimentsDetails *experimentTypes.ExperimentDetails, podName string) []apiv1.EnvVar {
334+
335+
var envVar []apiv1.EnvVar
336+
ENVList := map[string]string{
337+
"APP_NS": experimentsDetails.AppNS,
338+
"APP_POD": podName,
339+
"APP_CONTAINER": experimentsDetails.TargetContainer,
340+
"TOTAL_CHAOS_DURATION": strconv.Itoa(experimentsDetails.ChaosDuration),
341+
"CHAOS_NAMESPACE": experimentsDetails.ChaosNamespace,
342+
"CHAOS_ENGINE": experimentsDetails.EngineName,
343+
"CHAOS_UID": string(experimentsDetails.ChaosUID),
344+
"CHAOS_INTERVAL": strconv.Itoa(experimentsDetails.ChaosInterval),
345+
"ITERATIONS": strconv.Itoa(experimentsDetails.Iterations),
346+
}
347+
for key, value := range ENVList {
348+
var perEnv apiv1.EnvVar
349+
perEnv.Name = key
350+
perEnv.Value = value
351+
envVar = append(envVar, perEnv)
352+
}
353+
// Getting experiment pod name from downward API
354+
experimentPodName := GetValueFromDownwardAPI("v1", "metadata.name")
355+
356+
var downwardEnv apiv1.EnvVar
357+
downwardEnv.Name = "POD_NAME"
358+
downwardEnv.ValueFrom = &experimentPodName
359+
envVar = append(envVar, downwardEnv)
360+
361+
return envVar
362+
}
363+
364+
// GetValueFromDownwardAPI returns the value from downwardApi
365+
func GetValueFromDownwardAPI(apiVersion string, fieldPath string) apiv1.EnvVarSource {
366+
downwardENV := apiv1.EnvVarSource{
367+
FieldRef: &apiv1.ObjectFieldSelector{
368+
APIVersion: apiVersion,
369+
FieldPath: fieldPath,
370+
},
371+
}
372+
return downwardENV
373+
}

0 commit comments

Comments
 (0)