Production-Ready Error Handling

Production-Ready Error Handling

Exercise Overview

Build a robust error handling system for a microservice that handles payment processing. You'll implement structured errors, retry mechanisms, circuit breakers, and comprehensive error observability.

Learning Objectives

  • Implement structured error types with error codes and metadata
  • Build retry mechanisms with exponential backoff
  • Create circuit breakers for fault tolerance
  • Add error observability and monitoring
  • Handle error context propagation across service boundaries

Initial Code

  1package main
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"log"
  8	"math/rand"
  9	"net/http"
 10	"time"
 11)
 12
 13// TODO: Implement structured error types
 14type PaymentError struct {
 15	// Add error fields
 16}
 17
 18// TODO: Implement error codes
 19type ErrorCode string
 20
 21const (
 22	ErrInvalidAmount    ErrorCode = "INVALID_AMOUNT"
 23	ErrInsufficientFunds ErrorCode = "INSUFFICIENT_FUNDS"
 24	ErrCardDeclined     ErrorCode = "CARD_DECLINED"
 25	ErrServiceUnavailable ErrorCode = "SERVICE_UNAVAILABLE"
 26)
 27
 28// TODO: Implement error interface methods
 29func Error() string {
 30	return "" // Implement
 31}
 32
 33func Code() ErrorCode {
 34	return "" // Implement
 35}
 36
 37// TODO: Implement retry mechanism with exponential backoff
 38type RetryConfig struct {
 39	MaxRetries int
 40	BaseDelay  time.Duration
 41	MaxDelay   time.Duration
 42}
 43
 44func WithRetry(ctx context.Context, config RetryConfig, fn func() error) error {
 45	// Implement retry logic with exponential backoff
 46	return nil
 47}
 48
 49// TODO: Implement circuit breaker
 50type CircuitState int
 51
 52const (
 53	StateClosed CircuitState = iota
 54	StateOpen
 55	StateHalfOpen
 56)
 57
 58type CircuitBreaker struct {
 59	state       CircuitState
 60	failures    int
 61	threshold   int
 62	timeout     time.Duration
 63	lastFailure time.Time
 64}
 65
 66func Call(fn func() error) error {
 67	// Implement circuit breaker logic
 68	return nil
 69}
 70
 71// TODO: Implement error observability
 72type ErrorMetrics struct {
 73	// Add metrics fields
 74}
 75
 76func RecordError(err error) {
 77	// Implement error recording
 78}
 79
 80// Mock payment service that randomly fails
 81type PaymentService struct {
 82	metrics *ErrorMetrics
 83	cb      *CircuitBreaker
 84}
 85
 86func ProcessPayment(ctx context.Context, amount float64, cardNumber string) error {
 87	// TODO: Add input validation
 88	// TODO: Add error context
 89	// TODO: Implement retry logic
 90	// TODO: Add circuit breaker protection
 91	// TODO: Record metrics
 92
 93	// Simulate payment processing
 94	if rand.Float32() < 0.3 { // 30% failure rate
 95		return &PaymentError{
 96			// Create appropriate error
 97		}
 98	}
 99
100	return nil
101}
102
103func main() {
104	paymentService := &PaymentService{
105		metrics: &ErrorMetrics{},
106		cb: &CircuitBreaker{
107			threshold: 5,
108			timeout:   time.Minute,
109		},
110	}
111
112	// TODO: Implement HTTP handler with proper error handling
113	http.HandleFunc("/payment", func(w http.ResponseWriter, r *http.Request) {
114		// Implement payment endpoint with error handling
115	})
116
117	fmt.Println("Payment service starting on :8080")
118	log.Fatal(http.ListenAndServe(":8080", nil))
119}

Tasks

Task 1: Structured Error Types

Implement the PaymentError struct with proper error codes and metadata:

1type PaymentError struct {
2	Code      ErrorCode              `json:"code"`
3	Message   string                 `json:"message"`
4	Details   map[string]interface{} `json:"details,omitempty"`
5	Timestamp time.Time             `json:"timestamp"`
6	RequestID string                 `json:"request_id,omitempty"`
7}

Task 2: Error Interface Methods

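Implement Error() and an error-code accessor as methods on *PaymentError, plus the additional methods below for error handling. Note that Go does not allow a struct to have a field and a method with the same name, so if PaymentError keeps the exported Code field from Task 1, give the accessor a different name (for example ErrorCode()) or read the field directly: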

1func Is(target error) bool
2func Unwrap() error
3func WithDetail(key string, value interface{}) *PaymentError
4func WithRequestID(id string) *PaymentError

Task 3: Retry Mechanism

Implement exponential backoff with jitter:

 1func WithRetry(ctx context.Context, config RetryConfig, fn func() error) error {
 2	var lastErr error
 3
 4	for attempt := 0; attempt <= config.MaxRetries; attempt++ {
 5		if attempt > 0 {
 6			delay := calculateBackoff(attempt, config)
 7			select {
 8			case <-time.After(delay):
 9			case <-ctx.Done():
10				return ctx.Err()
11			}
12		}
13
14		if err := fn(); err == nil {
15			return nil
16		} else {
17			lastErr = err
18			// Check if error is retryable
19			if !isRetryable(err) {
20				return err
21			}
22		}
23	}
24
25	return lastErr
26}

Task 4: Circuit Breaker

Implement the circuit breaker pattern with state transitions:

 1func Call(fn func() error) error {
 2	cb.mutex.Lock()
 3	defer cb.mutex.Unlock()
 4
 5	switch cb.state {
 6	case StateOpen:
 7		if time.Since(cb.lastFailure) > cb.timeout {
 8			cb.state = StateHalfOpen
 9		} else {
10			return errors.New("circuit breaker is open")
11		}
12	case StateHalfOpen:
13		// Allow one call through
14	}
15
16	err := fn()
17	if err != nil {
18		cb.onFailure()
19	} else {
20		cb.onSuccess()
21	}
22
23	return err
24}

Task 5: Error Observability

Implement error metrics and structured logging:

 1type ErrorMetrics struct {
 2	TotalErrors    int64             `json:"total_errors"`
 3	ErrorsByCode   map[ErrorCode]int `json:"errors_by_code"`
 4	ErrorsByType   map[string]int    `json:"errors_by_type"`
 5	RetryAttempts  int64             `json:"retry_attempts"`
 6	CircuitBreakerTrips int64        `json:"circuit_breaker_trips"`
 7}
 8
 9func RecordError(err error) {
10	em.TotalErrors++
11
12	if paymentErr, ok := err.(*PaymentError); ok {
13		em.ErrorsByCode[paymentErr.Code()]++
14	}
15
16	em.ErrorsByType[reflect.TypeOf(err).Name()]++
17
18	// Log structured error
19	logData := map[string]interface{}{
20		"error":      err.Error(),
21		"timestamp":  time.Now(),
22		"total_errors": em.TotalErrors,
23	}
24
25	jsonData, _ := json.Marshal(logData)
26	log.Printf("ERROR: %s", string(jsonData))
27}

Solution Approach

Click to see detailed solution

Complete Implementation:

  1package main
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"errors"
  7	"fmt"
  8	"log"
  9	"math"
 10	"math/rand"
 11	"net/http"
 12	"reflect"
 13	"sync"
 14	"time"
 15)
 16
 17type ErrorCode string
 18
 19const (
 20	ErrInvalidAmount       ErrorCode = "INVALID_AMOUNT"
 21	ErrInsufficientFunds  ErrorCode = "INSUFFICIENT_FUNDS"
 22	ErrCardDeclined       ErrorCode = "CARD_DECLINED"
 23	ErrServiceUnavailable ErrorCode = "SERVICE_UNAVAILABLE"
 24)
 25
 26type PaymentError struct {
 27	Code      ErrorCode              `json:"code"`
 28	Message   string                 `json:"message"`
 29	Details   map[string]interface{} `json:"details,omitempty"`
 30	Timestamp time.Time             `json:"timestamp"`
 31	RequestID string                 `json:"request_id,omitempty"`
 32	wrapped   error                  `json:"-"`
 33}
 34
 35func Error() string {
 36	if e.wrapped != nil {
 37		return fmt.Sprintf("[%s] %s: %v", e.Code, e.Message, e.wrapped)
 38	}
 39	return fmt.Sprintf("[%s] %s", e.Code, e.Message)
 40}
 41
 42func Code() ErrorCode {
 43	return e.Code
 44}
 45
 46func Is(target error) bool {
 47	if t, ok := target.(*PaymentError); ok {
 48		return e.Code == t.Code
 49	}
 50	return false
 51}
 52
 53func Unwrap() error {
 54	return e.wrapped
 55}
 56
 57func WithDetail(key string, value interface{}) *PaymentError {
 58	if e.Details == nil {
 59		e.Details = make(map[string]interface{})
 60	}
 61	e.Details[key] = value
 62	return e
 63}
 64
 65func WithRequestID(id string) *PaymentError {
 66	e.RequestID = id
 67	return e
 68}
 69
 70func NewPaymentError(code ErrorCode, message string, wrapped error) *PaymentError {
 71	return &PaymentError{
 72		Code:      code,
 73		Message:   message,
 74		Timestamp: time.Now(),
 75		wrapped:   wrapped,
 76	}
 77}
 78
 79type RetryConfig struct {
 80	MaxRetries int
 81	BaseDelay  time.Duration
 82	MaxDelay   time.Duration
 83	Multiplier float64
 84	Jitter     float64
 85}
 86
 87func calculateBackoff(attempt int, config RetryConfig) time.Duration {
 88	delay := float64(config.BaseDelay) * math.Pow(config.Multiplier, float64(attempt-1))
 89	if delay > float64(config.MaxDelay) {
 90		delay = float64(config.MaxDelay)
 91	}
 92
 93	// Add jitter
 94	jitter := delay * config.Jitter * rand.Float64()
 95	delay += jitter
 96
 97	return time.Duration(delay)
 98}
 99
100func isRetryable(err error) bool {
101	if paymentErr, ok := err.(*PaymentError); ok {
102		switch paymentErr.Code {
103		case ErrServiceUnavailable, ErrCardDeclined:
104			return true
105		case ErrInvalidAmount, ErrInsufficientFunds:
106			return false
107		}
108	}
109	return true
110}
111
112func WithRetry(ctx context.Context, config RetryConfig, fn func() error) error {
113	var lastErr error
114
115	for attempt := 0; attempt <= config.MaxRetries; attempt++ {
116		if attempt > 0 {
117			delay := calculateBackoff(attempt, config)
118			select {
119			case <-time.After(delay):
120			case <-ctx.Done():
121				return ctx.Err()
122			}
123		}
124
125		if err := fn(); err == nil {
126			return nil
127		} else {
128			lastErr = err
129			if !isRetryable(err) {
130				return err
131			}
132		}
133	}
134
135	return lastErr
136}
137
// CircuitState is the operating mode of a CircuitBreaker.
type CircuitState int

// Circuit breaker states.
const (
	StateClosed   CircuitState = iota // calls pass through; failures are counted
	StateOpen                         // calls are rejected until timeout elapses
	StateHalfOpen                     // one trial call decides whether to close or reopen
)

// ErrCircuitOpen is returned by Call when the breaker rejects a call without
// running it. Compare with errors.Is.
var ErrCircuitOpen = errors.New("circuit breaker is open")

// CircuitBreaker stops calling a dependency once it has failed threshold
// times in a row, and probes it again after timeout has elapsed since the
// last failure. It must not be copied after first use because it contains a
// mutex.
type CircuitBreaker struct {
	mutex        sync.Mutex
	state        CircuitState
	failures     int // consecutive failures; reset by any success
	threshold    int
	timeout      time.Duration
	lastFailure  time.Time
	successCount int  // successful trial calls since the breaker last went half-open
	probing      bool // a half-open trial call is in flight
}

// Call runs fn under circuit breaker protection and returns fn's error
// unchanged, or ErrCircuitOpen when the call is rejected.
//
// The mutex is held only while state is inspected and updated, never while
// fn runs, so one slow call does not block other callers and fn may re-enter
// the breaker. A panic in fn is counted as a failure and then propagates.
func (cb *CircuitBreaker) Call(fn func() error) error {
	probe, err := cb.admit()
	if err != nil {
		return err
	}

	failed := true // stays true if fn panics
	defer func() { cb.record(probe, failed) }()

	err = fn()
	failed = err != nil
	return err
}

// admit decides whether a call may proceed. probe is true when the call is
// the single trial permitted in the half-open state.
func (cb *CircuitBreaker) admit() (probe bool, err error) {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	switch cb.state {
	case StateOpen:
		if time.Since(cb.lastFailure) <= cb.timeout {
			return false, ErrCircuitOpen
		}
		// Cool-down elapsed: this call becomes the trial.
		cb.state = StateHalfOpen
		cb.successCount = 0
		cb.probing = true
		return true, nil
	case StateHalfOpen:
		if cb.probing {
			// Only one trial at a time; everyone else is still rejected.
			return false, ErrCircuitOpen
		}
		cb.probing = true
		return true, nil
	}
	return false, nil
}

// record applies the outcome of a finished call.
func (cb *CircuitBreaker) record(probe, failed bool) {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if probe {
		cb.probing = false
	} else if cb.state != StateClosed {
		// The call was admitted before the breaker tripped and finished
		// late; its outcome must not decide the trial.
		return
	}

	if failed {
		cb.onFailure()
	} else {
		cb.onSuccess()
	}
}

// onFailure counts a failure and opens the breaker when the threshold is
// reached or when the half-open trial failed. The caller must hold cb.mutex.
func (cb *CircuitBreaker) onFailure() {
	cb.failures++
	cb.lastFailure = time.Now()

	if cb.state == StateHalfOpen || cb.failures >= cb.threshold {
		cb.state = StateOpen
	}
}

// onSuccess resets the failure count and closes the breaker after a
// successful half-open trial. The caller must hold cb.mutex.
func (cb *CircuitBreaker) onSuccess() {
	cb.failures = 0
	if cb.state == StateHalfOpen {
		cb.successCount++
		cb.state = StateClosed
	}
}
200
201type ErrorMetrics struct {
202	mutex               sync.Mutex
203	TotalErrors         int64             `json:"total_errors"`
204	ErrorsByCode        map[ErrorCode]int `json:"errors_by_code"`
205	ErrorsByType        map[string]int    `json:"errors_by_type"`
206	RetryAttempts       int64             `json:"retry_attempts"`
207	CircuitBreakerTrips int64             `json:"circuit_breaker_trips"`
208}
209
210func RecordError(err error) {
211	em.mutex.Lock()
212	defer em.mutex.Unlock()
213
214	em.TotalErrors++
215
216	if paymentErr, ok := err.(*PaymentError); ok {
217		if em.ErrorsByCode == nil {
218			em.ErrorsByCode = make(map[ErrorCode]int)
219		}
220		em.ErrorsByCode[paymentErr.Code()]++
221	}
222
223	errorType := reflect.TypeOf(err).Name()
224	if em.ErrorsByType == nil {
225		em.ErrorsByType = make(map[string]int)
226	}
227	em.ErrorsByType[errorType]++
228
229	// Log structured error
230	logData := map[string]interface{}{
231		"error":        err.Error(),
232		"timestamp":    time.Now(),
233		"total_errors": em.TotalErrors,
234		"error_type":   errorType,
235	}
236
237	if paymentErr, ok := err.(*PaymentError); ok {
238		logData["error_code"] = paymentErr.Code()
239		logData["request_id"] = paymentErr.RequestID
240		if paymentErr.Details != nil {
241			logData["details"] = paymentErr.Details
242		}
243	}
244
245	jsonData, _ := json.Marshal(logData)
246	log.Printf("ERROR: %s", string(jsonData))
247}
248
249func RecordRetryAttempt() {
250	em.mutex.Lock()
251	defer em.mutex.Unlock()
252	em.RetryAttempts++
253}
254
255func RecordCircuitBreakerTrip() {
256	em.mutex.Lock()
257	defer em.mutex.Unlock()
258	em.CircuitBreakerTrips++
259}
260
261type PaymentService struct {
262	metrics *ErrorMetrics
263	cb      *CircuitBreaker
264}
265
266func ProcessPayment(ctx context.Context, amount float64, cardNumber string) error {
267	// Input validation
268	if amount <= 0 {
269		err := NewPaymentError(ErrInvalidAmount,
270			fmt.Sprintf("Invalid payment amount: %.2f", amount), nil)
271		ps.metrics.RecordError(err)
272		return err
273	}
274
275	if len(cardNumber) < 13 || len(cardNumber) > 19 {
276		err := NewPaymentError(ErrCardDeclined,
277			"Invalid card number", nil)
278		ps.metrics.RecordError(err)
279		return err
280	}
281
282	// Create error context
283	requestID := fmt.Sprintf("req_%d", time.Now().UnixNano())
284	ctx = context.WithValue(ctx, "request_id", requestID)
285
286	// Process payment with retry
287	retryConfig := RetryConfig{
288		MaxRetries: 3,
289		BaseDelay:  100 * time.Millisecond,
290		MaxDelay:   5 * time.Second,
291		Multiplier: 2.0,
292		Jitter:     0.1,
293	}
294
295	return WithRetry(ctx, retryConfig, func() error {
296		return ps.cb.Call(func() error {
297			return ps.processPaymentInternal(ctx, amount, cardNumber, requestID)
298		})
299	})
300}
301
302func processPaymentInternal(ctx context.Context, amount float64, cardNumber string, requestID string) error {
303	// Simulate payment processing with random failures
304	if rand.Float32() < 0.3 { // 30% failure rate
305		var errorCode ErrorCode
306		var message string
307
308		randVal := rand.Float32()
309		switch {
310		case randVal < 0.2:
311			errorCode = ErrInsufficientFunds
312			message = "Insufficient funds"
313		case randVal < 0.5:
314			errorCode = ErrCardDeclined
315			message = "Card declined by issuer"
316		default:
317			errorCode = ErrServiceUnavailable
318			message = "Payment service temporarily unavailable"
319		}
320
321		err := NewPaymentError(errorCode, message, nil).
322			WithRequestID(requestID).
323			WithDetail("amount", amount).
324			WithDetail("card_last4", cardNumber[len(cardNumber)-4:])
325
326		ps.metrics.RecordError(err)
327		return err
328	}
329
330	log.Printf("Payment processed successfully: amount=%.2f, request_id=%s", amount, requestID)
331	return nil
332}
333
334func main() {
335	paymentService := &PaymentService{
336		metrics: &ErrorMetrics{},
337		cb: &CircuitBreaker{
338			threshold: 5,
339			timeout:   time.Minute,
340		},
341	}
342
343	http.HandleFunc("/payment", func(w http.ResponseWriter, r *http.Request) {
344		ctx := r.Context()
345
346		// Parse request
347		type PaymentRequest struct {
348			Amount     float64 `json:"amount"`
349			CardNumber string  `json:"card_number"`
350		}
351
352		var req PaymentRequest
353		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
354			http.Error(w, "Invalid request body", http.StatusBadRequest)
355			return
356		}
357
358		// Process payment
359		err := paymentService.ProcessPayment(ctx, req.Amount, req.CardNumber)
360		if err != nil {
361			if paymentErr, ok := err.(*PaymentError); ok {
362				w.Header().Set("Content-Type", "application/json")
363				w.WriteHeader(http.StatusBadRequest)
364				json.NewEncoder(w).Encode(paymentErr)
365			} else {
366				http.Error(w, "Internal server error", http.StatusInternalServerError)
367			}
368			return
369		}
370
371		w.Header().Set("Content-Type", "application/json")
372		json.NewEncoder(w).Encode(map[string]string{
373			"status": "success",
374			"message": "Payment processed successfully",
375		})
376	})
377
378	fmt.Println("Payment service starting on :8080")
379	log.Fatal(http.ListenAndServe(":8080", nil))
380}

Testing Your Solution

Test your implementation with these scenarios:

 1# Test successful payment
 2curl -X POST http://localhost:8080/payment \
 3  -H "Content-Type: application/json" \
 4  -d '{"amount": 100.00, "card_number": "4111111111111111"}'
 5
 6# Test invalid amount
 7curl -X POST http://localhost:8080/payment \
 8  -H "Content-Type: application/json" \
 9  -d '{"amount": -50.00, "card_number": "4111111111111111"}'
10
11# Test invalid card
12curl -X POST http://localhost:8080/payment \
13  -H "Content-Type: application/json" \
14  -d '{"amount": 100.00, "card_number": "123"}'

Verify that:

  1. Errors are properly structured with codes and metadata
  2. Retry mechanism works for retryable errors
  3. Circuit breaker opens after `threshold` consecutive failures (a success resets the count) and rejects calls until the timeout elapses
  4. Error metrics are tracked and logged
  5. Request IDs are propagated through error context

Extension Challenges

  1. Add distributed tracing - Implement OpenTelemetry integration
  2. Implement error rate limiting - Throttle errors during outages
  3. Add error aggregation - Group similar errors for analysis
  4. Implement graceful degradation - Fallback behaviors for different error types
  5. Add error recovery patterns - Automatic recovery procedures

Key Takeaways

  • Structured errors provide better debugging and monitoring
  • Retry with backoff improves resilience against transient failures
  • Circuit breakers prevent cascading failures
  • Error observability is crucial for production systems
  • Context propagation helps track errors across service boundaries