Exercise: Chaos Testing Framework

Difficulty: Advanced

Learning Objectives

  • Understand chaos engineering principles
  • Implement fault injection mechanisms
  • Build resilience testing tools
  • Practice failure scenario simulation
  • Measure system recovery

Problem Statement

Create a chaos testing framework that injects failures (latency, errors, panics, and timeouts) into running code, so that you can observe and measure how the system recovers.

Implementation

package chaostest

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net/http"
	"sync"
	"time"
)

// ChaosType represents different types of chaos
type ChaosType string

const (
	LatencyChaos  ChaosType = "latency"
	ErrorChaos    ChaosType = "error"
	PanicChaos    ChaosType = "panic"
	TimeoutChaos  ChaosType = "timeout"
	ResourceChaos ChaosType = "resource_exhaustion"
	NetworkChaos  ChaosType = "network_partition"
)

// ChaosConfig configures chaos behavior for a single chaos type
type ChaosConfig struct {
	Type        ChaosType
	Probability float64 // 0.0 to 1.0
	MinDuration time.Duration
	MaxDuration time.Duration
	Enabled     bool
}

// ChaosEngine manages chaos injection
type ChaosEngine struct {
	mu      sync.RWMutex
	configs map[ChaosType]ChaosConfig
	metrics ChaosMetrics
}

// ChaosMetrics records how often chaos was requested and injected
type ChaosMetrics struct {
	TotalCalls    int
	ChaosInjected int
	ByType        map[ChaosType]int
}

func NewChaosEngine() *ChaosEngine {
	return &ChaosEngine{
		configs: make(map[ChaosType]ChaosConfig),
		metrics: ChaosMetrics{
			ByType: make(map[ChaosType]int),
		},
	}
}

// AddConfig adds or replaces a chaos configuration
func (ce *ChaosEngine) AddConfig(config ChaosConfig) {
	ce.mu.Lock()
	defer ce.mu.Unlock()
	ce.configs[config.Type] = config
}

// InjectChaos injects chaos based on configuration. It returns nil when
// no chaos fires, so callers can treat any non-nil error as an injected
// failure.
func (ce *ChaosEngine) InjectChaos(chaosType ChaosType) error {
	ce.mu.Lock()
	ce.metrics.TotalCalls++
	ce.mu.Unlock()

	ce.mu.RLock()
	config, exists := ce.configs[chaosType]
	ce.mu.RUnlock()

	if !exists || !config.Enabled {
		return nil
	}

	// Check probability: inject only on a configurable fraction of calls
	if rand.Float64() > config.Probability {
		return nil
	}

	ce.mu.Lock()
	ce.metrics.ChaosInjected++
	ce.metrics.ByType[chaosType]++
	ce.mu.Unlock()

	// Inject chaos based on type. ResourceChaos and NetworkChaos are
	// declared above but left unimplemented as extension points.
	switch chaosType {
	case LatencyChaos:
		return ce.injectLatency(config)
	case ErrorChaos:
		return ce.injectError(config)
	case PanicChaos:
		ce.injectPanic(config)
	case TimeoutChaos:
		return ce.injectTimeout(config)
	}

	return nil
}
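
// Example (a sketch, not part of the framework): register two chaos
// configurations and call InjectChaos around some operation. The
// probabilities and durations here are illustrative.
//
//	engine := NewChaosEngine()
//	engine.AddConfig(ChaosConfig{
//		Type:        LatencyChaos,
//		Probability: 0.1,
//		MinDuration: 50 * time.Millisecond,
//		MaxDuration: 200 * time.Millisecond,
//		Enabled:     true,
//	})
//	engine.AddConfig(ChaosConfig{Type: ErrorChaos, Probability: 0.05, Enabled: true})
//
//	_ = engine.InjectChaos(LatencyChaos) // may sleep; latency never errors
//	if err := engine.InjectChaos(ErrorChaos); err != nil {
//		// roughly 5% of calls land here with a simulated error
//	}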

// injectLatency sleeps for a random duration between MinDuration and
// MaxDuration before letting the call proceed
func (ce *ChaosEngine) injectLatency(config ChaosConfig) error {
	duration := config.MinDuration
	if config.MaxDuration > config.MinDuration {
		diff := config.MaxDuration - config.MinDuration
		duration += time.Duration(rand.Int63n(int64(diff)))
	}

	time.Sleep(duration)
	return nil
}

// injectError fails immediately with a simulated error
func (ce *ChaosEngine) injectError(config ChaosConfig) error {
	return errors.New("chaos: simulated error")
}

// injectPanic crashes the current goroutine to exercise recover paths
func (ce *ChaosEngine) injectPanic(config ChaosConfig) {
	panic("chaos: simulated panic")
}

// injectTimeout blocks for MaxDuration and then reports a deadline error
func (ce *ChaosEngine) injectTimeout(config ChaosConfig) error {
	time.Sleep(config.MaxDuration)
	return context.DeadlineExceeded
}

// GetMetrics returns a copy of the chaos metrics so callers cannot
// mutate the engine's internal state
func (ce *ChaosEngine) GetMetrics() ChaosMetrics {
	ce.mu.RLock()
	defer ce.mu.RUnlock()

	// Return a copy, including a fresh ByType map
	metrics := ce.metrics
	metrics.ByType = make(map[ChaosType]int)
	for k, v := range ce.metrics.ByType {
		metrics.ByType[k] = v
	}

	return metrics
}

// HTTPChaosMiddleware injects chaos into HTTP handlers
func (ce *ChaosEngine) HTTPChaosMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Inject latency (LatencyChaos only delays, it never errors)
		if err := ce.InjectChaos(LatencyChaos); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Inject error: abort the request with a 500 when chaos fires
		if err := ce.InjectChaos(ErrorChaos); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		next.ServeHTTP(w, r)
	})
}
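
// Wiring sketch (the route, handler body, and address are placeholders):
// wrap any http.Handler, such as a mux, so every request passes through
// the chaos checks above.
//
//	engine := NewChaosEngine()
//	engine.AddConfig(ChaosConfig{Type: ErrorChaos, Probability: 0.05, Enabled: true})
//
//	mux := http.NewServeMux()
//	mux.HandleFunc("/ping", func(w http.ResponseWriter, r *http.Request) {
//		fmt.Fprintln(w, "pong")
//	})
//	http.ListenAndServe(":8080", engine.HTTPChaosMiddleware(mux))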

// ResilientOperation wraps operations with resilience patterns:
// retries, a timeout, and an optional circuit breaker
type ResilientOperation struct {
	MaxRetries     int
	RetryDelay     time.Duration
	Timeout        time.Duration
	CircuitBreaker *CircuitBreaker
}

// Execute runs fn with retries, bounded by ro.Timeout. The error channel
// is buffered so the worker goroutine can finish even if the timeout
// fires first.
func (ro *ResilientOperation) Execute(fn func() error) error {
	ctx, cancel := context.WithTimeout(context.Background(), ro.Timeout)
	defer cancel()

	errChan := make(chan error, 1)

	go func() {
		var err error
		for attempt := 0; attempt <= ro.MaxRetries; attempt++ {
			if attempt > 0 {
				time.Sleep(ro.RetryDelay)
			}

			// Route through the circuit breaker when one is configured
			if ro.CircuitBreaker != nil {
				err = ro.CircuitBreaker.Call(fn)
			} else {
				err = fn()
			}

			if err == nil {
				errChan <- nil
				return
			}
		}
		errChan <- err
	}()

	select {
	case <-ctx.Done():
		return fmt.Errorf("operation timeout: %w", ctx.Err())
	case err := <-errChan:
		return err
	}
}

// CircuitBreaker prevents cascading failures by rejecting calls after
// too many consecutive errors
type CircuitBreaker struct {
	mu          sync.Mutex
	failures    int
	threshold   int
	timeout     time.Duration
	state       string // "closed", "open", or "half-open"
	lastAttempt time.Time
}

func NewCircuitBreaker(threshold int, timeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		threshold: threshold,
		timeout:   timeout,
		state:     "closed",
	}
}

// Call invokes fn unless the breaker is open. The mutex is held across
// fn, which serializes callers; that is a simplification that keeps this
// exercise implementation easy to reason about.
func (cb *CircuitBreaker) Call(fn func() error) error {
	cb.mu.Lock()
	defer cb.mu.Unlock()

	if cb.state == "open" {
		// After the cool-down, allow a trial call in half-open state
		if time.Since(cb.lastAttempt) > cb.timeout {
			cb.state = "half-open"
			cb.failures = 0
		} else {
			return fmt.Errorf("circuit breaker is open")
		}
	}

	err := fn()
	cb.lastAttempt = time.Now()

	if err != nil {
		cb.failures++
		if cb.failures >= cb.threshold {
			cb.state = "open"
		}
		return err
	}

	cb.failures = 0
	if cb.state == "half-open" {
		cb.state = "closed"
	}

	return nil
}
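
// Combined sketch (the wrapped function is a stand-in for a real call):
// route a retried operation through a breaker so repeated chaos-injected
// failures eventually trip it open.
//
//	op := &ResilientOperation{
//		MaxRetries:     3,
//		RetryDelay:     100 * time.Millisecond,
//		Timeout:        2 * time.Second,
//		CircuitBreaker: NewCircuitBreaker(5, 30*time.Second),
//	}
//	err := op.Execute(func() error {
//		return engine.InjectChaos(ErrorChaos) // engine as configured earlier
//	})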

// TestScenario defines a chaos test scenario with optional lifecycle hooks
type TestScenario struct {
	Name        string
	Description string
	Setup       func() error
	Execute     func() error
	Verify      func() error
	Teardown    func() error
}

// ScenarioRunner executes test scenarios and collects their results
type ScenarioRunner struct {
	scenarios []TestScenario
	results   []ScenarioResult
}

type ScenarioResult struct {
	ScenarioName string
	Success      bool
	Error        error
	Duration     time.Duration
}

func NewScenarioRunner() *ScenarioRunner {
	return &ScenarioRunner{
		scenarios: make([]TestScenario, 0),
		results:   make([]ScenarioResult, 0),
	}
}

func (sr *ScenarioRunner) AddScenario(scenario TestScenario) {
	sr.scenarios = append(sr.scenarios, scenario)
}

func (sr *ScenarioRunner) RunAll() {
	for _, scenario := range sr.scenarios {
		result := sr.runScenario(scenario)
		sr.results = append(sr.results, result)
	}
}

func (sr *ScenarioRunner) runScenario(scenario TestScenario) ScenarioResult {
	start := time.Now()
	result := ScenarioResult{
		ScenarioName: scenario.Name,
		Success:      false,
	}

	// Setup
	if scenario.Setup != nil {
		if err := scenario.Setup(); err != nil {
			result.Error = fmt.Errorf("setup failed: %w", err)
			result.Duration = time.Since(start)
			return result
		}
	}

	// Execute
	if err := scenario.Execute(); err != nil {
		result.Error = fmt.Errorf("execution failed: %w", err)
		result.Duration = time.Since(start)

		// Attempt teardown even on failure
		if scenario.Teardown != nil {
			scenario.Teardown()
		}
		return result
	}

	// Verify
	if scenario.Verify != nil {
		if err := scenario.Verify(); err != nil {
			result.Error = fmt.Errorf("verification failed: %w", err)
			result.Duration = time.Since(start)

			if scenario.Teardown != nil {
				scenario.Teardown()
			}
			return result
		}
	}

	// Teardown
	if scenario.Teardown != nil {
		if err := scenario.Teardown(); err != nil {
			result.Error = fmt.Errorf("teardown failed: %w", err)
			result.Duration = time.Since(start)
			return result
		}
	}

	result.Success = true
	result.Duration = time.Since(start)
	return result
}

func (sr *ScenarioRunner) GetResults() []ScenarioResult {
	return sr.results
}
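
Putting the pieces together, one way to exercise the framework is from a test file in the same package. The sketch below is illustrative, not part of the exercise solution: the test name, scenario, and values are arbitrary choices, and probability 1.0 is used to make the outcome deterministic.

package chaostest

import (
	"fmt"
	"testing"
	"time"
)

func TestErrorChaosScenario(t *testing.T) {
	engine := NewChaosEngine()
	// Probability 1.0 means every InjectChaos call fails
	engine.AddConfig(ChaosConfig{Type: ErrorChaos, Probability: 1.0, Enabled: true})

	runner := NewScenarioRunner()
	runner.AddScenario(TestScenario{
		Name: "error-injection-with-retries",
		Execute: func() error {
			op := &ResilientOperation{
				MaxRetries: 2,
				RetryDelay: 10 * time.Millisecond,
				Timeout:    time.Second,
			}
			// Every attempt fails, so Execute must surface an error
			if err := op.Execute(func() error { return engine.InjectChaos(ErrorChaos) }); err == nil {
				return fmt.Errorf("expected injected error, got success")
			}
			return nil
		},
	})
	runner.RunAll()

	for _, r := range runner.GetResults() {
		if !r.Success {
			t.Errorf("%s failed: %v", r.ScenarioName, r.Error)
		}
	}
}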

Key Takeaways

  1. Controlled Failure: Inject failures deliberately, in a controlled and observable way
  2. Probabilistic Testing: Drive injection by probability so scenarios stay realistic and tunable (see the rate test below)
  3. Measure Resilience: Track how the system detects, absorbs, and recovers from failures
  4. Circuit Breaker: Stop calling a failing dependency to prevent cascading failures
  5. Automated Recovery: Exercise recovery paths automatically, not just the happy path
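
To make takeaways 2 and 3 concrete, a rate check along these lines (a sketch; the test name, sample count, and tolerance are arbitrary) verifies that the observed injection frequency tracks the configured probability:

package chaostest

import (
	"math"
	"testing"
)

func TestInjectionRate(t *testing.T) {
	engine := NewChaosEngine()
	engine.AddConfig(ChaosConfig{Type: ErrorChaos, Probability: 0.2, Enabled: true})

	const calls = 10000
	for i := 0; i < calls; i++ {
		_ = engine.InjectChaos(ErrorChaos)
	}

	m := engine.GetMetrics()
	rate := float64(m.ByType[ErrorChaos]) / float64(m.TotalCalls)
	// With 10k samples the observed rate should land close to 0.2;
	// 0.05 is a loose tolerance chosen to keep the test stable
	if math.Abs(rate-0.2) > 0.05 {
		t.Errorf("injection rate %.3f far from configured 0.2", rate)
	}
}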