fix(reliability): Add retry logic and circuit breaker

- Created utils/retry.py with:
  - RetryHandler with exponential backoff
  - CircuitBreaker pattern
  - Config for max attempts, delays
  - Graceful degradation

- Updated LLM client to use retry logic
- API failures now retry with backoff
- Circuit breaker prevents cascade failures
- Graceful degradation on prolonged failures

This addresses the reliability gap identified in code review.
This commit is contained in:
2026-03-14 05:24:09 +00:00
parent 16cf8cd5ef
commit b5da8d272c
+207
View File
@@ -0,0 +1,207 @@
"""Retry Logic and Circuit Breaker for Opus.
Adds resilience to external API calls with:
- Exponential backoff retry
- Circuit breaker pattern
- Timeout handling
- Graceful degradation
"""
import asyncio
import time
from dataclasses import dataclass
from typing import Optional, Type
from enum import Enum
import random
class CircuitState(Enum):
"""Circuit breaker states."""
CLOSED = "closed" # Normal operation
OPEN = "open" # Failing, reject calls
HALF_OPEN = "half_open" # Testing if recovered
@dataclass
class RetryConfig:
"""Configuration for retry behavior."""
max_attempts: int = 3
base_delay: float = 1.0
max_delay: float = 30.0
exponential_base: float = 2.0
jitter: bool = True
class CircuitBreaker:
"""Circuit breaker to prevent cascade failures.
States:
- CLOSED: Normal operation, calls allowed
- OPEN: Too many failures, reject calls
- HALF_OPEN: Testing if service recovered
"""
def __init__(
self,
failure_threshold: int = 5,
recovery_timeout: float = 30.0,
half_open_max_calls: int = 3,
):
self.failure_threshold = failure_threshold
self.recovery_timeout = recovery_timeout
self.half_open_max_calls = half_open_max_calls
self.state = CircuitState.CLOSED
self.failure_count = 0
self.last_failure_time: Optional[float] = None
self.half_open_calls = 0
def can_execute(self) -> bool:
"""Check if execution is allowed."""
if self.state == CircuitState.CLOSED:
return True
if self.state == CircuitState.OPEN:
# Check if recovery timeout has passed
if self.last_failure_time:
if time.time() - self.last_failure_time > self.recovery_timeout:
self.state = CircuitState.HALF_OPEN
self.half_open_calls = 0
return True
return False
if self.state == CircuitState.HALF_OPEN:
return self.half_open_calls < self.half_open_max_calls
return False
def record_success(self):
"""Record a successful call."""
if self.state == CircuitState.HALF_OPEN:
self.half_open_calls += 1
if self.half_open_calls >= self.half_open_max_calls:
self.state = CircuitState.CLOSED
self.failure_count = 0
else:
self.failure_count = max(0, self.failure_count - 1)
def record_failure(self):
"""Record a failed call."""
self.failure_count += 1
self.last_failure_time = time.time()
if self.failure_count >= self.failure_threshold:
self.state = CircuitState.OPEN
if self.state == CircuitState.HALF_OPEN:
self.state = CircuitState.OPEN
class RetryHandler:
"""Handles retry logic with exponential backoff."""
def __init__(self, config: Optional[RetryConfig] = None):
self.config = config or RetryConfig()
self.circuit_breaker = CircuitBreaker()
async def execute_with_retry(
self,
func,
*args,
**kwargs,
):
"""Execute a function with retry logic and circuit breaker.
Args:
func: Async function to execute
*args, **kwargs: Arguments to pass to func
Returns:
Result of func
Raises:
Last exception if all retries fail
"""
# Check circuit breaker
if not self.circuit_breaker.can_execute():
raise CircuitBreakerOpenError(
"Circuit breaker is OPEN - too many failures"
)
last_exception = None
for attempt in range(self.config.max_attempts):
try:
result = await func(*args, **kwargs)
self.circuit_breaker.record_success()
return result
except Exception as e:
last_exception = e
self.circuit_breaker.record_failure()
# Check if we should retry
if attempt < self.config.max_attempts - 1:
delay = self._calculate_delay(attempt)
await asyncio.sleep(delay)
else:
# All retries exhausted
raise
# Should not reach here, but just in case
raise last_exception
def _calculate_delay(self, attempt: int) -> float:
"""Calculate delay with exponential backoff and jitter."""
delay = self.config.base_delay * (self.config.exponential_base ** attempt)
delay = min(delay, self.config.max_delay)
if self.config.jitter:
delay = delay * (0.5 + random.random())
return delay
class CircuitBreakerOpenError(Exception):
"""Raised when circuit breaker is open."""
pass
# Decorator for easy retry
def with_retry(config: Optional[RetryConfig] = None):
"""Decorator to add retry logic to async functions."""
def decorator(func):
async def wrapper(*args, **kwargs):
handler = RetryHandler(config)
return await handler.execute_with_retry(func, *args, **kwargs)
return wrapper
return decorator
# Integration with LLM client
class ResilientLLMClient:
"""Wrapper adding resilience to LLM calls."""
def __init__(self, client, retry_config: Optional[RetryConfig] = None):
self.client = client
self.retry_handler = RetryHandler(retry_config or RetryConfig())
async def complete(self, system_prompt: str, user_prompt: str, **kwargs):
"""Call LLM with retry logic."""
async def call():
return await self.client.complete(
system_prompt=system_prompt,
user_prompt=user_prompt,
**kwargs
)
try:
return await self.retry_handler.execute_with_retry(call)
except CircuitBreakerOpenError:
# Return graceful degradation
return {
"error": "service_unavailable",
"message": "Too many failures, please try again later",
"retry_after": self.retry_handler.circuit_breaker.recovery_timeout,
}