Skip to content

Commit 5c22472

Browse files
authored
feat: Add a rds-instance-stop chaos fault (#710)
* feat: Add a rds-instance-stop chaos fault Signed-off-by: Jongwoo Han <jongwooo.han@gmail.com> --------- Signed-off-by: Jongwoo Han <jongwooo.han@gmail.com>
1 parent e7b3fb6 commit 5c22472

File tree

10 files changed

+819
-1
lines changed

10 files changed

+819
-1
lines changed

bin/experiment/experiment.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
// _ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
1616
// _ "k8s.io/client-go/plugin/pkg/client/auth/openstack"
1717

18+
"go.opentelemetry.io/otel"
19+
1820
awsSSMChaosByID "github.com/litmuschaos/litmus-go/experiments/aws-ssm/aws-ssm-chaos-by-id/experiment"
1921
awsSSMChaosByTag "github.com/litmuschaos/litmus-go/experiments/aws-ssm/aws-ssm-chaos-by-tag/experiment"
2022
azureDiskLoss "github.com/litmuschaos/litmus-go/experiments/azure/azure-disk-loss/experiment"
@@ -60,14 +62,14 @@ import (
6062
ebsLossByTag "github.com/litmuschaos/litmus-go/experiments/kube-aws/ebs-loss-by-tag/experiment"
6163
ec2TerminateByID "github.com/litmuschaos/litmus-go/experiments/kube-aws/ec2-terminate-by-id/experiment"
6264
ec2TerminateByTag "github.com/litmuschaos/litmus-go/experiments/kube-aws/ec2-terminate-by-tag/experiment"
65+
rdsInstanceStop "github.com/litmuschaos/litmus-go/experiments/kube-aws/rds-instance-stop/experiment"
6366
k6Loadgen "github.com/litmuschaos/litmus-go/experiments/load/k6-loadgen/experiment"
6467
springBootFaults "github.com/litmuschaos/litmus-go/experiments/spring-boot/spring-boot-faults/experiment"
6568
vmpoweroff "github.com/litmuschaos/litmus-go/experiments/vmware/vm-poweroff/experiment"
6669
cli "github.com/litmuschaos/litmus-go/pkg/clients"
6770
"github.com/litmuschaos/litmus-go/pkg/log"
6871
"github.com/litmuschaos/litmus-go/pkg/telemetry"
6972
"github.com/sirupsen/logrus"
70-
"go.opentelemetry.io/otel"
7173
)
7274

7375
func init() {
@@ -171,6 +173,8 @@ func main() {
171173
ebsLossByID.EBSLossByID(ctx, clients)
172174
case "ebs-loss-by-tag":
173175
ebsLossByTag.EBSLossByTag(ctx, clients)
176+
case "rds-instance-stop":
177+
rdsInstanceStop.RDSInstanceStop(ctx, clients)
174178
case "node-restart":
175179
nodeRestart.NodeRestart(ctx, clients)
176180
case "pod-dns-error":
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
package lib
2+
3+
import (
4+
"fmt"
5+
"go.opentelemetry.io/otel"
6+
"golang.org/x/net/context"
7+
"os"
8+
"os/signal"
9+
"strings"
10+
"syscall"
11+
"time"
12+
13+
"github.com/litmuschaos/litmus-go/pkg/cerrors"
14+
awslib "github.com/litmuschaos/litmus-go/pkg/cloud/aws/rds"
15+
"github.com/litmuschaos/litmus-go/pkg/events"
16+
experimentTypes "github.com/litmuschaos/litmus-go/pkg/kube-aws/rds-instance-stop/types"
17+
"github.com/litmuschaos/litmus-go/pkg/probe"
18+
"github.com/litmuschaos/litmus-go/pkg/telemetry"
19+
"github.com/palantir/stacktrace"
20+
21+
"github.com/litmuschaos/litmus-go/pkg/clients"
22+
"github.com/litmuschaos/litmus-go/pkg/log"
23+
"github.com/litmuschaos/litmus-go/pkg/types"
24+
"github.com/litmuschaos/litmus-go/pkg/utils/common"
25+
)
26+
27+
var (
	// err is a package-level error holder reused across the injection
	// helpers. NOTE(review): sharing one error variable between functions
	// (and the abort goroutine) is racy in principle; prefer locally
	// scoped errors inside each helper.
	err error

	// inject and abort relay OS signals (SIGINT/SIGTERM): inject
	// short-circuits chaos injection before it starts, abort triggers
	// the revert path in abortWatcher.
	inject, abort chan os.Signal
)
31+
32+
func PrepareRDSInstanceStop(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
33+
ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "PrepareRDSInstanceStop")
34+
defer span.End()
35+
36+
// Inject channel is used to transmit signal notifications.
37+
inject = make(chan os.Signal, 1)
38+
// Catch and relay certain signal(s) to inject channel.
39+
signal.Notify(inject, os.Interrupt, syscall.SIGTERM)
40+
41+
// Abort channel is used to transmit signal notifications.
42+
abort = make(chan os.Signal, 1)
43+
// Catch and relay certain signal(s) to abort channel.
44+
signal.Notify(abort, os.Interrupt, syscall.SIGTERM)
45+
46+
// Waiting for the ramp time before chaos injection
47+
if experimentsDetails.RampTime != 0 {
48+
log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", experimentsDetails.RampTime)
49+
common.WaitForDuration(experimentsDetails.RampTime)
50+
}
51+
52+
// Get the instance identifier or list of instance identifiers
53+
instanceIdentifierList := strings.Split(experimentsDetails.RDSInstanceIdentifier, ",")
54+
if experimentsDetails.RDSInstanceIdentifier == "" || len(instanceIdentifierList) == 0 {
55+
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "no RDS instance identifier found to stop"}
56+
}
57+
58+
instanceIdentifierList = common.FilterBasedOnPercentage(experimentsDetails.InstanceAffectedPerc, instanceIdentifierList)
59+
log.Infof("[Chaos]:Number of Instance targeted: %v", len(instanceIdentifierList))
60+
61+
// Watching for the abort signal and revert the chaos
62+
go abortWatcher(experimentsDetails, instanceIdentifierList, chaosDetails)
63+
64+
switch strings.ToLower(experimentsDetails.Sequence) {
65+
case "serial":
66+
if err = injectChaosInSerialMode(ctx, experimentsDetails, instanceIdentifierList, clients, resultDetails, eventsDetails, chaosDetails); err != nil {
67+
return stacktrace.Propagate(err, "could not run chaos in serial mode")
68+
}
69+
case "parallel":
70+
if err = injectChaosInParallelMode(ctx, experimentsDetails, instanceIdentifierList, clients, resultDetails, eventsDetails, chaosDetails); err != nil {
71+
return stacktrace.Propagate(err, "could not run chaos in parallel mode")
72+
}
73+
default:
74+
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
75+
}
76+
77+
// Waiting for the ramp time after chaos injection
78+
if experimentsDetails.RampTime != 0 {
79+
log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", experimentsDetails.RampTime)
80+
common.WaitForDuration(experimentsDetails.RampTime)
81+
}
82+
return nil
83+
}
84+
85+
// injectChaosInSerialMode will inject the rds instance state in serial mode that is one after other
86+
func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIdentifierList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
87+
88+
select {
89+
case <-inject:
90+
// Stopping the chaos execution, if abort signal received
91+
os.Exit(0)
92+
default:
93+
// ChaosStartTimeStamp contains the start timestamp, when the chaos injection begin
94+
ChaosStartTimeStamp := time.Now()
95+
duration := int(time.Since(ChaosStartTimeStamp).Seconds())
96+
97+
for duration < experimentsDetails.ChaosDuration {
98+
99+
log.Infof("[Info]: Target instance identifier list, %v", instanceIdentifierList)
100+
101+
if experimentsDetails.EngineName != "" {
102+
msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on rds instance"
103+
types.SetEngineEventAttributes(eventsDetails, types.ChaosInject, msg, "Normal", chaosDetails)
104+
events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")
105+
}
106+
107+
for i, identifier := range instanceIdentifierList {
108+
109+
// Stopping the RDS instance
110+
log.Info("[Chaos]: Stopping the desired RDS instance")
111+
if err := awslib.RDSInstanceStop(identifier, experimentsDetails.Region); err != nil {
112+
return stacktrace.Propagate(err, "rds instance failed to stop")
113+
}
114+
115+
common.SetTargets(identifier, "injected", "RDS", chaosDetails)
116+
117+
// Wait for rds instance to completely stop
118+
log.Infof("[Wait]: Wait for RDS instance '%v' to get in stopped state", identifier)
119+
if err := awslib.WaitForRDSInstanceDown(experimentsDetails.Timeout, experimentsDetails.Delay, identifier, experimentsDetails.Region); err != nil {
120+
return stacktrace.Propagate(err, "rds instance failed to stop")
121+
}
122+
123+
// Run the probes during chaos
124+
// the OnChaos probes execution will start in the first iteration and keep running for the entire chaos duration
125+
if len(resultDetails.ProbeDetails) != 0 && i == 0 {
126+
if err = probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
127+
return stacktrace.Propagate(err, "failed to run probes")
128+
}
129+
}
130+
131+
// Wait for chaos interval
132+
log.Infof("[Wait]: Waiting for chaos interval of %vs", experimentsDetails.ChaosInterval)
133+
time.Sleep(time.Duration(experimentsDetails.ChaosInterval) * time.Second)
134+
135+
// Starting the RDS instance
136+
log.Info("[Chaos]: Starting back the RDS instance")
137+
if err = awslib.RDSInstanceStart(identifier, experimentsDetails.Region); err != nil {
138+
return stacktrace.Propagate(err, "rds instance failed to start")
139+
}
140+
141+
// Wait for rds instance to get in available state
142+
log.Infof("[Wait]: Wait for RDS instance '%v' to get in available state", identifier)
143+
if err := awslib.WaitForRDSInstanceUp(experimentsDetails.Timeout, experimentsDetails.Delay, experimentsDetails.Region, identifier); err != nil {
144+
return stacktrace.Propagate(err, "rds instance failed to start")
145+
}
146+
147+
common.SetTargets(identifier, "reverted", "RDS", chaosDetails)
148+
}
149+
duration = int(time.Since(ChaosStartTimeStamp).Seconds())
150+
}
151+
}
152+
return nil
153+
}
154+
155+
// injectChaosInParallelMode will inject the rds instance termination in parallel mode that is all at once
156+
func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIdentifierList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {
157+
158+
select {
159+
case <-inject:
160+
// stopping the chaos execution, if abort signal received
161+
os.Exit(0)
162+
default:
163+
//ChaosStartTimeStamp contains the start timestamp, when the chaos injection begin
164+
ChaosStartTimeStamp := time.Now()
165+
duration := int(time.Since(ChaosStartTimeStamp).Seconds())
166+
167+
for duration < experimentsDetails.ChaosDuration {
168+
169+
log.Infof("[Info]: Target instance identifier list, %v", instanceIdentifierList)
170+
171+
if experimentsDetails.EngineName != "" {
172+
msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on rds instance"
173+
types.SetEngineEventAttributes(eventsDetails, types.ChaosInject, msg, "Normal", chaosDetails)
174+
events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")
175+
}
176+
177+
// PowerOff the instance
178+
for _, identifier := range instanceIdentifierList {
179+
// Stopping the RDS instance
180+
log.Info("[Chaos]: Stopping the desired RDS instance")
181+
if err := awslib.RDSInstanceStop(identifier, experimentsDetails.Region); err != nil {
182+
return stacktrace.Propagate(err, "rds instance failed to stop")
183+
}
184+
common.SetTargets(identifier, "injected", "RDS", chaosDetails)
185+
}
186+
187+
for _, identifier := range instanceIdentifierList {
188+
// Wait for rds instance to completely stop
189+
log.Infof("[Wait]: Wait for RDS instance '%v' to get in stopped state", identifier)
190+
if err := awslib.WaitForRDSInstanceDown(experimentsDetails.Timeout, experimentsDetails.Delay, experimentsDetails.Region, identifier); err != nil {
191+
return stacktrace.Propagate(err, "rds instance failed to stop")
192+
}
193+
common.SetTargets(identifier, "reverted", "RDS", chaosDetails)
194+
}
195+
196+
// Run the probes during chaos
197+
if len(resultDetails.ProbeDetails) != 0 {
198+
if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
199+
return stacktrace.Propagate(err, "failed to run probes")
200+
}
201+
}
202+
203+
// Wait for chaos interval
204+
log.Infof("[Wait]: Waiting for chaos interval of %vs", experimentsDetails.ChaosInterval)
205+
time.Sleep(time.Duration(experimentsDetails.ChaosInterval) * time.Second)
206+
207+
// Starting the RDS instance
208+
for _, identifier := range instanceIdentifierList {
209+
log.Info("[Chaos]: Starting back the RDS instance")
210+
if err = awslib.RDSInstanceStart(identifier, experimentsDetails.Region); err != nil {
211+
return stacktrace.Propagate(err, "rds instance failed to start")
212+
}
213+
}
214+
215+
for _, identifier := range instanceIdentifierList {
216+
// Wait for rds instance to get in available state
217+
log.Infof("[Wait]: Wait for RDS instance '%v' to get in available state", identifier)
218+
if err := awslib.WaitForRDSInstanceUp(experimentsDetails.Timeout, experimentsDetails.Delay, experimentsDetails.Region, identifier); err != nil {
219+
return stacktrace.Propagate(err, "rds instance failed to start")
220+
}
221+
}
222+
223+
for _, identifier := range instanceIdentifierList {
224+
common.SetTargets(identifier, "reverted", "RDS", chaosDetails)
225+
}
226+
duration = int(time.Since(ChaosStartTimeStamp).Seconds())
227+
}
228+
}
229+
return nil
230+
}
231+
232+
// abortWatcher blocks until an abort signal arrives, then reverts the chaos:
// for each target it checks the instance status and, when the instance is not
// in a running state, waits for the in-flight stop to finish before starting
// the instance back, finally marking every target reverted and exiting with
// status 1. Errors during revert are logged and the loop continues, so a
// best-effort revert is attempted on every target.
func abortWatcher(experimentsDetails *experimentTypes.ExperimentDetails, instanceIdentifierList []string, chaosDetails *types.ChaosDetails) {

	<-abort

	log.Info("[Abort]: Chaos Revert Started")
	for _, identifier := range instanceIdentifierList {
		instanceState, err := awslib.GetRDSInstanceStatus(identifier, experimentsDetails.Region)
		if err != nil {
			// NOTE(review): on error the loop continues with instanceState
			// left as its zero value (""), which falls into the revert
			// branch below — confirm this fallthrough is intended.
			log.Errorf("Failed to get instance status when an abort signal is received: %v", err)
		}
		// NOTE(review): "running" looks like an EC2 state; RDS instances
		// report states such as "available"/"stopped" — verify against
		// GetRDSInstanceStatus, otherwise this condition may always be true.
		if instanceState != "running" {

			log.Info("[Abort]: Waiting for the RDS instance to get down")
			if err := awslib.WaitForRDSInstanceDown(experimentsDetails.Timeout, experimentsDetails.Delay, experimentsDetails.Region, identifier); err != nil {
				log.Errorf("Unable to wait till stop of the instance: %v", err)
			}

			log.Info("[Abort]: Starting RDS instance as abort signal received")
			err := awslib.RDSInstanceStart(identifier, experimentsDetails.Region)
			if err != nil {
				log.Errorf("RDS instance failed to start when an abort signal is received: %v", err)
			}
		}
		common.SetTargets(identifier, "reverted", "RDS", chaosDetails)
	}
	log.Info("[Abort]: Chaos Revert Completed")
	os.Exit(1)
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
## Experiment Metadata
2+
3+
<table>
4+
<tr>
5+
<th> Name </th>
6+
<th> Description </th>
7+
<th> Documentation Link </th>
8+
</tr>
9+
<tr>
10+
<td> RDS Instance Stop </td>
11+
<td> This experiment causes the state change of an RDS instance to the stopped state before bringing it back to available, using the instance identifier, after the specified chaos duration. We can also control the number of target instances using the instance affected percentage. </td>
12+
<td> <a href="https://litmuschaos.github.io/litmus/experiments/categories/aws/rds-instance-stop/"> Here </a> </td>
13+
</tr>
14+
</table>

0 commit comments

Comments
 (0)