208 lines
6.3 KiB
Python
208 lines
6.3 KiB
Python
"""Retry Logic and Circuit Breaker for Opus.

Adds resilience to external API calls with:
- Exponential backoff retry
- Circuit breaker pattern
- Timeout handling
- Graceful degradation
"""
|
||
|
|
|
||
|
|
import asyncio
import functools
import random
import time
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Type
|
||
|
|
|
||
|
|
|
||
|
|
class CircuitState(Enum):
    """Lifecycle states of a circuit breaker.

    Members:
        CLOSED: Normal operation; calls are allowed through.
        OPEN: The breaker has tripped because of failures; calls are rejected.
        HALF_OPEN: Trial period used to test whether the service recovered.
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class RetryConfig:
    """Tunable settings for retrying with exponential backoff.

    Attributes:
        max_attempts: Total number of times the call is attempted.
        base_delay: Delay, in seconds, used for the first retry.
        max_delay: Upper bound, in seconds, applied to the exponential
            backoff delay.
        exponential_base: Factor the delay is multiplied by for each
            further attempt.
        jitter: When true, the delay is scaled by a random factor so that
            retries are spread out.
    """

    max_attempts: int = 3
    base_delay: float = 1.0
    max_delay: float = 30.0
    exponential_base: float = 2.0
    jitter: bool = True
|
||
|
|
|
||
|
|
|
||
|
|
class CircuitBreaker:
    """Circuit breaker to prevent cascade failures.

    States:
    - CLOSED: Normal operation, calls allowed.
    - OPEN: Too many failures, calls rejected until ``recovery_timeout``
      seconds have passed since the last recorded failure.
    - HALF_OPEN: Testing if the service recovered. After
      ``half_open_max_calls`` recorded successes the breaker closes again;
      any recorded failure re-opens it.

    No locking is performed by this class.

    Attributes:
        failure_threshold: Failure count at which the breaker opens.
        recovery_timeout: Seconds to stay OPEN before allowing trial calls.
        half_open_max_calls: Successes required in HALF_OPEN to close.
        state: Current ``CircuitState``.
        failure_count: Running failure counter (decremented on success
            outside HALF_OPEN, never below zero).
        last_failure_time: ``time.time()`` of the latest recorded failure,
            or ``None`` if no failure has been recorded.
        half_open_calls: Successes recorded since entering HALF_OPEN.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        half_open_max_calls: int = 3,
    ) -> None:
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.half_open_max_calls = half_open_max_calls

        self.state = CircuitState.CLOSED
        self.failure_count = 0
        self.last_failure_time: Optional[float] = None
        self.half_open_calls = 0

    def can_execute(self) -> bool:
        """Check if execution is allowed.

        Side effect: when the breaker is OPEN and the recovery timeout has
        elapsed, it is moved to HALF_OPEN and the half-open counter is reset.

        Returns:
            True if a call may proceed, False if it must be rejected.
        """
        if self.state == CircuitState.CLOSED:
            return True

        if self.state == CircuitState.OPEN:
            # Compare against None explicitly: a timestamp of 0.0 (e.g. from
            # a fake clock) is a valid failure time and must not be treated
            # as "no failure recorded".
            if self.last_failure_time is None:
                return False
            # NOTE(review): wall-clock time.time() is used here, so system
            # clock adjustments affect the recovery window. time.monotonic()
            # would avoid that - confirm nothing reads last_failure_time as
            # a wall-clock timestamp before switching.
            elapsed = time.time() - self.last_failure_time
            if elapsed > self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
                self.half_open_calls = 0
                return True
            return False

        if self.state == CircuitState.HALF_OPEN:
            # half_open_calls only grows in record_success(), so this limits
            # recorded successes, not calls currently in flight.
            return self.half_open_calls < self.half_open_max_calls

        return False

    def record_success(self) -> None:
        """Record a successful call.

        In HALF_OPEN the success is counted and, once
        ``half_open_max_calls`` successes are reached, the breaker closes
        and the failure counter is reset. In any other state the failure
        counter is decremented by one, never below zero.
        """
        if self.state == CircuitState.HALF_OPEN:
            self.half_open_calls += 1
            if self.half_open_calls >= self.half_open_max_calls:
                self.state = CircuitState.CLOSED
                self.failure_count = 0
        else:
            self.failure_count = max(0, self.failure_count - 1)

    def record_failure(self) -> None:
        """Record a failed call.

        Increments the failure counter and stamps the failure time. The
        breaker opens when the counter reaches ``failure_threshold`` or when
        the failure happens during HALF_OPEN. Because the timestamp is
        refreshed on every failure, a failure recorded while OPEN restarts
        the recovery window.
        """
        self.failure_count += 1
        self.last_failure_time = time.time()

        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

        if self.state == CircuitState.HALF_OPEN:
            self.state = CircuitState.OPEN
|
||
|
|
|
||
|
|
|
||
|
|
class RetryHandler:
    """Handles retry logic with exponential backoff.

    Every handler owns its own ``CircuitBreaker``; successes and failures of
    the executed calls are reported to it.

    Attributes:
        config: The ``RetryConfig`` in effect.
        circuit_breaker: The breaker guarding calls made via this handler.
    """

    def __init__(self, config: Optional["RetryConfig"] = None) -> None:
        """Create a handler.

        Args:
            config: Retry settings; a default ``RetryConfig`` is used when
                omitted.
        """
        self.config = config or RetryConfig()
        self.circuit_breaker = CircuitBreaker()

    async def execute_with_retry(
        self,
        func,
        *args,
        **kwargs,
    ):
        """Execute a function with retry logic and circuit breaker.

        Args:
            func: Async function to execute.
            *args, **kwargs: Arguments to pass to func.

        Returns:
            Result of func.

        Raises:
            CircuitBreakerOpenError: If the breaker rejects the call before
                the first attempt.
            Exception: The last exception raised by ``func`` when all
                attempts are used up, or when the breaker stops allowing
                calls while retrying.
        """
        breaker = self.circuit_breaker
        if not breaker.can_execute():
            raise CircuitBreakerOpenError(
                "Circuit breaker is OPEN - too many failures"
            )

        # Always make at least one attempt: with max_attempts < 1 the loop
        # would otherwise be skipped and there would be nothing to return
        # or raise.
        attempts = max(1, self.config.max_attempts)
        final_attempt = attempts - 1

        for attempt in range(attempts):
            try:
                result = await func(*args, **kwargs)
            except Exception:
                breaker.record_failure()
                # Stop when attempts are exhausted, or when this failure
                # made the breaker reject calls - retrying then would keep
                # hitting a service the breaker is meant to protect. The
                # underlying error is re-raised so callers see the same
                # exception type as for any other final failure.
                if attempt >= final_attempt or not breaker.can_execute():
                    raise
                await asyncio.sleep(self._calculate_delay(attempt))
            else:
                breaker.record_success()
                return result

        # Unreachable: the final iteration either returns or raises.
        raise AssertionError("retry loop exited without a result")

    def _calculate_delay(self, attempt: int) -> float:
        """Calculate delay with exponential backoff and jitter.

        Args:
            attempt: Zero-based index of the attempt that just failed.

        Returns:
            Seconds to wait before the next attempt; never more than
            ``config.max_delay``.
        """
        cfg = self.config
        try:
            delay = cfg.base_delay * (cfg.exponential_base ** attempt)
        except OverflowError:
            # The exponential outgrew what a float can hold; the cap
            # applies anyway.
            delay = cfg.max_delay
        delay = min(delay, cfg.max_delay)

        if cfg.jitter:
            # Random factor in [0.5, 1.5); cap again so jitter cannot push
            # the wait beyond max_delay.
            delay = min(delay * (0.5 + random.random()), cfg.max_delay)

        return delay
|
||
|
|
|
||
|
|
|
||
|
|
class CircuitBreakerOpenError(Exception):
    """Signals that a call was rejected because the circuit breaker is open."""
|
||
|
|
|
||
|
|
|
||
|
|
# Decorator for easy retry
|
||
|
|
def with_retry(config: Optional["RetryConfig"] = None):
    """Decorator to add retry logic to async functions.

    Args:
        config: Retry settings handed to ``RetryHandler``; its defaults
            apply when omitted.

    Returns:
        A decorator that wraps an async function so each call goes through
        ``RetryHandler.execute_with_retry``. The wrapper keeps the wrapped
        function's name, docstring and other metadata.
    """
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            # A new handler is built for every call, so circuit breaker
            # state is not shared between calls of the decorated function.
            handler = RetryHandler(config)
            return await handler.execute_with_retry(func, *args, **kwargs)
        return wrapper
    return decorator
|
||
|
|
|
||
|
|
|
||
|
|
# Integration with LLM client
|
||
|
|
class ResilientLLMClient:
    """LLM client wrapper that retries calls and degrades gracefully.

    Attributes:
        client: The wrapped client; its async ``complete`` method is called.
        retry_handler: ``RetryHandler`` through which every call is routed.
    """

    def __init__(self, client, retry_config: Optional["RetryConfig"] = None):
        """Wrap ``client``.

        Args:
            client: Object exposing an async ``complete`` method.
            retry_config: Retry settings; a default ``RetryConfig`` is used
                when omitted.
        """
        effective_config = retry_config if retry_config else RetryConfig()
        self.client = client
        self.retry_handler = RetryHandler(effective_config)

    async def complete(self, system_prompt: str, user_prompt: str, **kwargs):
        """Call the wrapped client's ``complete`` with retry logic.

        Args:
            system_prompt: Forwarded to the client as ``system_prompt``.
            user_prompt: Forwarded to the client as ``user_prompt``.
            **kwargs: Extra keyword arguments forwarded unchanged.

        Returns:
            Whatever the client returns, or - when the circuit breaker
            rejects the call - a dict with the keys ``error``, ``message``
            and ``retry_after``.
        """
        handler = self.retry_handler

        async def _invoke_client():
            # Zero-argument closure, so the handler needs no call arguments.
            return await self.client.complete(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                **kwargs,
            )

        try:
            response = await handler.execute_with_retry(_invoke_client)
        except CircuitBreakerOpenError:
            # Graceful degradation: report unavailability instead of raising.
            return {
                "error": "service_unavailable",
                "message": "Too many failures, please try again later",
                "retry_after": self.retry_handler.circuit_breaker.recovery_timeout,
            }
        return response
|