# Source file: opus_orchestrator/utils/retry.py
# Origin: commit b5da8d272c94f658ec87b79ebd2c32e9f17acfaa
#   Author:  Mark Randall Havens
#   Date:    Sat, 14 Mar 2026 05:24:09 +0000
#   Subject: fix(reliability): Add retry logic and circuit breaker
"""Retry logic and circuit breaker for Opus.

Adds resilience to external (async) API calls with:

- Exponential-backoff retry with optional jitter (``RetryHandler``)
- A circuit breaker with a recovery timeout (``CircuitBreaker``)
- Graceful degradation for LLM calls (``ResilientLLMClient``)

Note: nothing in this module enforces a per-call timeout; wrap the callee in
``asyncio.wait_for`` if one is needed.
"""

import asyncio
import functools
import time
from dataclasses import dataclass
from typing import Optional, Type
from enum import Enum
import random


class CircuitState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing, reject calls
    HALF_OPEN = "half_open"  # Testing if recovered


@dataclass
class RetryConfig:
    """Configuration for retry behavior."""

    # Total number of attempts (first try included); must be >= 1.
    max_attempts: int = 3
    # Delay in seconds before the first retry.
    base_delay: float = 1.0
    # Cap in seconds applied to the exponential delay *before* jitter.
    max_delay: float = 30.0
    # Multiplier applied per attempt: base_delay * exponential_base ** attempt.
    exponential_base: float = 2.0
    # When True the capped delay is scaled by a random factor in [0.5, 1.5).
    jitter: bool = True


class CircuitBreakerOpenError(Exception):
    """Raised when circuit breaker is open."""


class CircuitBreaker:
    """Circuit breaker to prevent cascade failures.

    States:
    - CLOSED: Normal operation, calls allowed
    - OPEN: Too many failures, reject calls
    - HALF_OPEN: Testing if service recovered

    ``half_open_calls`` counts *successful* probes while HALF_OPEN; probes
    that are still in flight are not counted, so the breaker does not limit
    how many probes run concurrently.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        half_open_max_calls: int = 3,
    ):
        """Create a breaker in the CLOSED state.

        Args:
            failure_threshold: Failure count at which the breaker opens.
            recovery_timeout: Seconds to stay OPEN before allowing probes.
            half_open_max_calls: Successful probes needed to close again.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.half_open_max_calls = half_open_max_calls

        self.state = CircuitState.CLOSED
        self.failure_count = 0
        # time.monotonic() reading of the most recent failure (None = never).
        self.last_failure_time: Optional[float] = None
        self.half_open_calls = 0

    def can_execute(self) -> bool:
        """Check if execution is allowed.

        Side effect: an OPEN breaker whose recovery timeout has elapsed is
        moved to HALF_OPEN (and its probe counter reset) by this call.
        """
        if self.state is CircuitState.CLOSED:
            return True

        if self.state is CircuitState.OPEN:
            if self.last_failure_time is None:
                return False
            # Monotonic clock: immune to wall-clock adjustments (NTP, DST).
            elapsed = time.monotonic() - self.last_failure_time
            if elapsed <= self.recovery_timeout:
                return False
            self.state = CircuitState.HALF_OPEN
            self.half_open_calls = 0
            return True

        if self.state is CircuitState.HALF_OPEN:
            return self.half_open_calls < self.half_open_max_calls

        return False

    def record_success(self):
        """Record a successful call.

        While HALF_OPEN, successes are counted and the breaker closes once
        ``half_open_max_calls`` is reached. Otherwise one prior failure is
        forgiven (the count never drops below zero).
        """
        if self.state is CircuitState.HALF_OPEN:
            self.half_open_calls += 1
            if self.half_open_calls >= self.half_open_max_calls:
                self.state = CircuitState.CLOSED
                self.failure_count = 0
        else:
            self.failure_count = max(0, self.failure_count - 1)

    def record_failure(self):
        """Record a failed call.

        Opens the breaker when the failure threshold is reached, or
        immediately when a HALF_OPEN probe fails.
        """
        self.failure_count += 1
        self.last_failure_time = time.monotonic()

        probe_failed = self.state is CircuitState.HALF_OPEN
        if probe_failed or self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN


class RetryHandler:
    """Handles retry logic with exponential backoff."""

    def __init__(
        self,
        config: Optional[RetryConfig] = None,
        circuit_breaker: Optional[CircuitBreaker] = None,
    ):
        """Create a handler.

        Args:
            config: Retry settings; defaults to ``RetryConfig()``.
            circuit_breaker: Breaker to consult and update; a fresh
                ``CircuitBreaker()`` is created when omitted. Pass a shared
                instance to have several handlers guard the same service.
        """
        self.config = config or RetryConfig()
        self.circuit_breaker = (
            circuit_breaker if circuit_breaker is not None else CircuitBreaker()
        )

    async def execute_with_retry(
        self,
        func,
        *args,
        **kwargs,
    ):
        """Execute a function with retry logic and circuit breaker.

        Args:
            func: Async function to execute
            *args, **kwargs: Arguments to pass to func

        Returns:
            Result of func

        Raises:
            ValueError: If ``config.max_attempts`` is less than 1.
            CircuitBreakerOpenError: If the breaker rejects the call up front.
            Exception: The last exception raised by ``func`` once the
                attempts are exhausted or the breaker opens mid-retry.
        """
        max_attempts = self.config.max_attempts
        if max_attempts < 1:
            raise ValueError(
                f"RetryConfig.max_attempts must be >= 1, got {max_attempts}"
            )

        breaker = self.circuit_breaker
        if not breaker.can_execute():
            raise CircuitBreakerOpenError(
                "Circuit breaker is OPEN - too many failures"
            )

        for attempt in range(max_attempts):
            try:
                result = await func(*args, **kwargs)
            except asyncio.CancelledError:
                # Cancellation is not a service failure; never retry it.
                raise
            except Exception:
                breaker.record_failure()
                out_of_attempts = attempt >= max_attempts - 1
                # Stop as soon as the breaker trips: retrying against an
                # open circuit is exactly the hammering it exists to prevent.
                if out_of_attempts or not breaker.can_execute():
                    raise
            else:
                breaker.record_success()
                return result

            # Back off outside the except block so the failed attempt's
            # exception/traceback is not kept alive during the sleep.
            await asyncio.sleep(self._calculate_delay(attempt))

    def _calculate_delay(self, attempt: int) -> float:
        """Calculate delay with exponential backoff and jitter.

        The exponential delay is capped at ``max_delay`` and jitter is applied
        afterwards, so a jittered delay can reach up to 1.5 * ``max_delay``.
        """
        cfg = self.config
        try:
            delay = cfg.base_delay * (cfg.exponential_base ** attempt)
        except OverflowError:
            # Huge attempt indices overflow float pow; saturate at the cap.
            delay = cfg.max_delay
        delay = min(delay, cfg.max_delay)

        if cfg.jitter:
            delay = delay * (0.5 + random.random())

        return delay


# Decorator for easy retry
def with_retry(config: Optional[RetryConfig] = None):
    """Decorator to add retry logic to async functions.

    One ``RetryHandler`` is created per decorated function and reused for
    every call, so its circuit breaker accumulates failures across calls.
    """
    def decorator(func):
        handler = RetryHandler(config)

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await handler.execute_with_retry(func, *args, **kwargs)

        return wrapper
    return decorator


# Integration with LLM client
class ResilientLLMClient:
    """Wrapper adding resilience to LLM calls."""

    def __init__(self, client, retry_config: Optional[RetryConfig] = None):
        """Wrap ``client``, which must expose an async ``complete`` method
        accepting ``system_prompt`` and ``user_prompt`` keyword arguments.
        """
        self.client = client
        self.retry_handler = RetryHandler(retry_config or RetryConfig())

    async def complete(self, system_prompt: str, user_prompt: str, **kwargs):
        """Call LLM with retry logic.

        Returns:
            Whatever ``client.complete`` returns, or - when the circuit
            breaker rejects the call - a dict with ``error``, ``message`` and
            ``retry_after`` keys describing the degraded state.

        Raises:
            Exception: The last exception from ``client.complete`` when the
                retries fail while the breaker still admitted the call.
        """

        async def call():
            return await self.client.complete(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                **kwargs
            )

        try:
            return await self.retry_handler.execute_with_retry(call)
        except CircuitBreakerOpenError:
            # Return graceful degradation
            return {
                "error": "service_unavailable",
                "message": "Too many failures, please try again later",
                "retry_after": self.retry_handler.circuit_breaker.recovery_timeout,
            }