Exercise: Chaos Testing Framework
Difficulty - Advanced
Learning Objectives
- Understand chaos engineering principles
- Implement fault injection mechanisms
- Build resilience testing tools
- Practice failure scenario simulation
- Measure system recovery
Problem Statement
Create a chaos testing framework that injects various types of failures to test system resilience.
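Callers should be able to register failure modes up front and trigger them at chosen points in the code under test, roughly along these lines (a sketch of the API implemented below; the probability and durations are arbitrary examples):

engine := NewChaosEngine()
engine.AddConfig(ChaosConfig{
	Type:        LatencyChaos,
	Probability: 0.3, // roughly 30% of calls are affected
	MinDuration: 10 * time.Millisecond,
	MaxDuration: 100 * time.Millisecond,
	Enabled:     true,
})
_ = engine.InjectChaos(LatencyChaos) // sleeps 10-100ms with probability 0.3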
Implementation
package chaostest

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net/http"
	"sync"
	"time"
)

// ChaosType represents different types of chaos
type ChaosType string

const (
	LatencyChaos  ChaosType = "latency"
	ErrorChaos    ChaosType = "error"
	PanicChaos    ChaosType = "panic"
	TimeoutChaos  ChaosType = "timeout"
	ResourceChaos ChaosType = "resource_exhaustion"
	NetworkChaos  ChaosType = "network_partition"
)

// ChaosConfig configures chaos behavior
type ChaosConfig struct {
	Type        ChaosType
	Probability float64 // 0.0 to 1.0
	MinDuration time.Duration
	MaxDuration time.Duration
	Enabled     bool
}

// ChaosEngine manages chaos injection
type ChaosEngine struct {
	mu      sync.RWMutex
	configs map[ChaosType]ChaosConfig
	metrics ChaosMetrics
}

type ChaosMetrics struct {
	TotalCalls    int
	ChaosInjected int
	ByType        map[ChaosType]int
}

func NewChaosEngine() *ChaosEngine {
	return &ChaosEngine{
		configs: make(map[ChaosType]ChaosConfig),
		metrics: ChaosMetrics{
			ByType: make(map[ChaosType]int),
		},
	}
}

// AddConfig adds a chaos configuration
func (ce *ChaosEngine) AddConfig(config ChaosConfig) {
	ce.mu.Lock()
	defer ce.mu.Unlock()
	ce.configs[config.Type] = config
}

// InjectChaos injects chaos based on configuration
func (ce *ChaosEngine) InjectChaos(chaosType ChaosType) error {
	ce.mu.Lock()
	ce.metrics.TotalCalls++
	ce.mu.Unlock()

	ce.mu.RLock()
	config, exists := ce.configs[chaosType]
	ce.mu.RUnlock()

	if !exists || !config.Enabled {
		return nil
	}

	// Check probability
	if rand.Float64() > config.Probability {
		return nil
	}

	ce.mu.Lock()
	ce.metrics.ChaosInjected++
	ce.metrics.ByType[chaosType]++
	ce.mu.Unlock()

	// Inject chaos based on type
	switch chaosType {
	case LatencyChaos:
		return ce.injectLatency(config)
	case ErrorChaos:
		return ce.injectError(config)
	case PanicChaos:
		ce.injectPanic(config)
	case TimeoutChaos:
		return ce.injectTimeout(config)
	}

	return nil
}

func (ce *ChaosEngine) injectLatency(config ChaosConfig) error {
	duration := config.MinDuration
	if config.MaxDuration > config.MinDuration {
		diff := config.MaxDuration - config.MinDuration
		duration += time.Duration(rand.Int63n(int64(diff)))
	}

	time.Sleep(duration)
	return nil
}

func (ce *ChaosEngine) injectError(config ChaosConfig) error {
	return errors.New("chaos: simulated error")
}

func (ce *ChaosEngine) injectPanic(config ChaosConfig) {
	panic("chaos: simulated panic")
}

func (ce *ChaosEngine) injectTimeout(config ChaosConfig) error {
	time.Sleep(config.MaxDuration)
	return context.DeadlineExceeded
}

// GetMetrics returns chaos metrics
func (ce *ChaosEngine) GetMetrics() ChaosMetrics {
	ce.mu.RLock()
	defer ce.mu.RUnlock()

	// Return a copy
	metrics := ce.metrics
	metrics.ByType = make(map[ChaosType]int)
	for k, v := range ce.metrics.ByType {
		metrics.ByType[k] = v
	}

	return metrics
}

// HTTPChaosMiddleware injects chaos into HTTP handlers
func (ce *ChaosEngine) HTTPChaosMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Inject latency
		if err := ce.InjectChaos(LatencyChaos); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Inject error
		if err := ce.InjectChaos(ErrorChaos); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		next.ServeHTTP(w, r)
	})
}
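
// Illustrative usage (not part of the exercise spec): wrapping a mux with
// the chaos middleware so every request passes through the configured
// injections. The handler and route here are hypothetical.
func exampleChaosServer(ce *ChaosEngine) http.Handler {
	mux := http.NewServeMux()
	mux.HandleFunc("/ping", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "pong")
	})
	return ce.HTTPChaosMiddleware(mux)
}
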
// ResilientOperation wraps operations with resilience patterns
type ResilientOperation struct {
	MaxRetries     int
	RetryDelay     time.Duration
	Timeout        time.Duration
	CircuitBreaker *CircuitBreaker
}

func (ro *ResilientOperation) Execute(fn func() error) error {
	ctx, cancel := context.WithTimeout(context.Background(), ro.Timeout)
	defer cancel()

	errChan := make(chan error, 1)

	go func() {
		var err error
		for attempt := 0; attempt <= ro.MaxRetries; attempt++ {
			if attempt > 0 {
				time.Sleep(ro.RetryDelay)
			}

			// Check circuit breaker
			if ro.CircuitBreaker != nil {
				err = ro.CircuitBreaker.Call(fn)
			} else {
				err = fn()
			}

			if err == nil {
				errChan <- nil
				return
			}
		}
		errChan <- err
	}()

	select {
	case <-ctx.Done():
		return fmt.Errorf("operation timeout: %w", ctx.Err())
	case err := <-errChan:
		return err
	}
}
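
// Illustrative usage: retrying a flaky call with an overall deadline.
// The retry count and durations are arbitrary examples.
func exampleResilientCall(flaky func() error) error {
	op := &ResilientOperation{
		MaxRetries: 3,
		RetryDelay: 100 * time.Millisecond,
		Timeout:    2 * time.Second,
	}
	return op.Execute(flaky)
}
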
// CircuitBreaker for fault tolerance
type CircuitBreaker struct {
	mu          sync.Mutex
	failures    int
	threshold   int
	timeout     time.Duration
	state       string
	lastAttempt time.Time
}

func NewCircuitBreaker(threshold int, timeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		threshold: threshold,
		timeout:   timeout,
		state:     "closed",
	}
}

func (cb *CircuitBreaker) Call(fn func() error) error {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	if cb.state == "open" {
		if time.Since(cb.lastAttempt) > cb.timeout {
			cb.state = "half-open"
			cb.failures = 0
		} else {
			return fmt.Errorf("circuit breaker is open")
		}
	}

	err := fn()
	cb.lastAttempt = time.Now()

	if err != nil {
		cb.failures++
		if cb.failures >= cb.threshold {
			cb.state = "open"
		}
		return err
	}

	cb.failures = 0
	if cb.state == "half-open" {
		cb.state = "closed"
	}

	return nil
}
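
// Illustrative sketch of the breaker lifecycle: consecutive failures trip
// it open, and a success after the timeout closes it again via the
// half-open state. Threshold and timeout values are arbitrary examples.
func exampleBreakerLifecycle() {
	cb := NewCircuitBreaker(3, 500*time.Millisecond)
	failing := func() error { return errors.New("downstream unavailable") }

	for i := 0; i < 3; i++ {
		_ = cb.Call(failing) // the third failure trips the breaker open
	}
	_ = cb.Call(failing) // rejected immediately: the breaker is open

	time.Sleep(600 * time.Millisecond) // wait past the open timeout
	// The next call runs as a half-open probe; success closes the breaker.
	_ = cb.Call(func() error { return nil })
}
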
// TestScenario defines a chaos test scenario
type TestScenario struct {
	Name        string
	Description string
	Setup       func() error
	Execute     func() error
	Verify      func() error
	Teardown    func() error
}

// ScenarioRunner executes test scenarios
type ScenarioRunner struct {
	scenarios []TestScenario
	results   []ScenarioResult
}

type ScenarioResult struct {
	ScenarioName string
	Success      bool
	Error        error
	Duration     time.Duration
}

func NewScenarioRunner() *ScenarioRunner {
	return &ScenarioRunner{
		scenarios: make([]TestScenario, 0),
		results:   make([]ScenarioResult, 0),
	}
}

func (sr *ScenarioRunner) AddScenario(scenario TestScenario) {
	sr.scenarios = append(sr.scenarios, scenario)
}

func (sr *ScenarioRunner) RunAll() {
	for _, scenario := range sr.scenarios {
		result := sr.runScenario(scenario)
		sr.results = append(sr.results, result)
	}
}

func (sr *ScenarioRunner) runScenario(scenario TestScenario) ScenarioResult {
	start := time.Now()
	result := ScenarioResult{
		ScenarioName: scenario.Name,
		Success:      false,
	}

	// Setup
	if scenario.Setup != nil {
		if err := scenario.Setup(); err != nil {
			result.Error = fmt.Errorf("setup failed: %w", err)
			result.Duration = time.Since(start)
			return result
		}
	}

	// Execute
	if err := scenario.Execute(); err != nil {
		result.Error = fmt.Errorf("execution failed: %w", err)
		result.Duration = time.Since(start)

		// Attempt teardown even on failure
		if scenario.Teardown != nil {
			_ = scenario.Teardown()
		}
		return result
	}

	// Verify
	if scenario.Verify != nil {
		if err := scenario.Verify(); err != nil {
			result.Error = fmt.Errorf("verification failed: %w", err)
			result.Duration = time.Since(start)

			if scenario.Teardown != nil {
				_ = scenario.Teardown()
			}
			return result
		}
	}

	// Teardown
	if scenario.Teardown != nil {
		if err := scenario.Teardown(); err != nil {
			result.Error = fmt.Errorf("teardown failed: %w", err)
			result.Duration = time.Since(start)
			return result
		}
	}

	result.Success = true
	result.Duration = time.Since(start)
	return result
}

func (sr *ScenarioRunner) GetResults() []ScenarioResult {
	return sr.results
}
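Example Usage
A minimal end-to-end sketch of how the pieces fit together, using only the types defined above. The probabilities, durations, and scenario body are illustrative choices, not part of the exercise specification.

func runChaosSuite() {
	ce := NewChaosEngine()
	ce.AddConfig(ChaosConfig{
		Type:        LatencyChaos,
		Probability: 0.2,
		MinDuration: 50 * time.Millisecond,
		MaxDuration: 500 * time.Millisecond,
		Enabled:     true,
	})
	ce.AddConfig(ChaosConfig{Type: ErrorChaos, Probability: 0.1, Enabled: true})

	sr := NewScenarioRunner()
	sr.AddScenario(TestScenario{
		Name:        "latency under load",
		Description: "calls remain within SLO while latency chaos is active",
		Execute:     func() error { return ce.InjectChaos(LatencyChaos) },
		Verify:      func() error { return nil }, // assert SLOs/invariants here
	})
	sr.RunAll()

	for _, r := range sr.GetResults() {
		fmt.Printf("%s: success=%v duration=%v err=%v\n",
			r.ScenarioName, r.Success, r.Duration, r.Error)
	}
	m := ce.GetMetrics()
	fmt.Printf("chaos injected in %d of %d calls\n", m.ChaosInjected, m.TotalCalls)
}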
Key Takeaways
- Controlled Failure: Inject failures deliberately and in a controlled manner
- Probabilistic Testing: Use injection probabilities to approximate realistic failure rates
- Measure Resilience: Track how the system behaves under injected failures
- Circuit Breaker: Use circuit breakers to prevent cascading failures
- Automated Recovery: Exercise recovery paths with repeatable, automated scenarios
Related Topics
- Chaos Engineering - Main chaos tutorial
- Testing Strategies - Testing patterns
- Observability - System monitoring