Error Handling
Strategies for building resilient agents that can recover from failures gracefully.
Core Principles
Resilience
Agents should continue to function even when individual components or model calls fail.
Observability
Failures must be logged, monitored, and alerted on to ensure system health.
Graceful Degradation
When advanced features fail, fallback to simpler, reliable alternatives.
Self Correction
Agents should attempt to fix their own mistakes before giving up.
Automatic Retries
Implement the Circuit Breaker Pattern
/**
 * Circuit breaker guarding an async operation.
 *
 * After `failureThreshold` consecutive failures the circuit opens and every
 * call is rejected immediately. Once `recoveryTimeout` ms have elapsed since
 * the last failure, a single trial call is allowed (half-open); success fully
 * closes the circuit, failure re-opens it.
 */
class CircuitBreaker {
  private failureCount = 0
  private lastFailureAt = 0
  private state: 'closed' | 'open' | 'half-open' = 'closed'

  constructor(
    private failureThreshold: number = 5,
    private recoveryTimeout: number = 60000 // 1 minute
  ) {}

  /**
   * Runs `operation` under circuit-breaker protection.
   * @throws Error when the circuit is open, or rethrows the operation's error.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      const elapsed = Date.now() - this.lastFailureAt
      if (elapsed <= this.recoveryTimeout) {
        throw new Error('Circuit breaker is OPEN - service unavailable')
      }
      // Cooled down: permit one trial call.
      this.state = 'half-open'
    }

    try {
      const value = await operation()
      // Any success fully closes the circuit and clears the failure streak.
      this.failureCount = 0
      this.state = 'closed'
      return value
    } catch (err) {
      this.recordFailure()
      throw err
    }
  }

  /** Counts a failure; opens the circuit once the threshold is reached. */
  private recordFailure() {
    this.failureCount += 1
    this.lastFailureAt = Date.now()
    if (this.failureCount >= this.failureThreshold) {
      this.state = 'open'
    }
  }
}
// Usage in a tool
const apiCircuitBreaker = new CircuitBreaker(3, 30000) // 3 failures, 30s timeout
const resilientApiTool = new Tool({
name: 'call_external_api',
description: 'Make API calls with circuit breaker protection',
schema: z.object({ endpoint: z.string() }),
handler: async ({ endpoint }) => {
try {
return await apiCircuitBreaker.execute(() =>
callExternalAPI(endpoint)
)
} catch (error) {
if (error.message.includes('Circuit breaker is OPEN')) {
return "Service temporarily unavailable. Please try again later."
}
return `API call failed: ${error.message}`
}
}
})
Add Retry Logic with Exponential Backoff
/**
 * Retries an async operation with exponential backoff and jitter.
 */
class RetryHandler {
  /**
   * Runs `operation`, retrying on failure.
   *
   * @param operation async operation to attempt
   * @param options.maxAttempts total attempts (must be >= 1)
   * @param options.baseDelay first backoff delay in ms
   * @param options.maxDelay upper bound for any single delay in ms
   * @param options.backoffFactor multiplier applied per attempt
   * @param options.retryCondition when provided, only errors it accepts are
   *        retried; others propagate immediately
   * @throws the last error once attempts are exhausted, or immediately when
   *         `retryCondition` rejects the error
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    options: {
      maxAttempts: number
      baseDelay: number
      maxDelay: number
      backoffFactor: number
      retryCondition?: (error: any) => boolean
    }
  ): Promise<T> {
    // BUG FIX: with maxAttempts < 1 the loop body never ran and the code
    // reached `throw lastError` with lastError still undefined, so callers
    // received a thrown `undefined`. Fail loudly with a real Error instead.
    if (options.maxAttempts < 1) {
      throw new Error('maxAttempts must be at least 1')
    }

    let lastError: any
    for (let attempt = 1; attempt <= options.maxAttempts; attempt++) {
      try {
        return await operation()
      } catch (error) {
        lastError = error
        // Non-retryable errors propagate immediately.
        if (options.retryCondition && !options.retryCondition(error)) {
          throw error
        }
        // Out of attempts: surface the final failure.
        if (attempt === options.maxAttempts) {
          throw error
        }
        // Exponential backoff, capped at maxDelay.
        const delay = Math.min(
          options.baseDelay * Math.pow(options.backoffFactor, attempt - 1),
          options.maxDelay
        )
        // 50-100% jitter prevents a thundering herd of synchronized retries.
        const jitteredDelay = delay * (0.5 + Math.random() * 0.5)
        await new Promise(resolve => setTimeout(resolve, jitteredDelay))
      }
    }
    throw lastError // unreachable when maxAttempts >= 1; kept as a safety net
  }
}
// Usage in tools
const retryHandler = new RetryHandler()
const databaseTool = new Tool({
name: 'query_database',
description: 'Execute database queries with retry logic',
schema: z.object({ query: z.string() }),
handler: async ({ query }) => {
try {
return await retryHandler.executeWithRetry(
() => executeDatabaseQuery(query),
{
maxAttempts: 3,
baseDelay: 1000, // 1 second
maxDelay: 10000, // 10 seconds
backoffFactor: 2,
retryCondition: (error) =>
error.code === 'CONNECTION_LOST' ||
error.code === 'TIMEOUT'
}
)
} catch (error) {
return `Database query failed after retries: ${error.message}`
}
}
})
Implement Fallback Strategies
/**
 * Runs a primary async operation and, when it fails, walks an ordered list
 * of fallback operations until one succeeds.
 */
class FallbackHandler {
  /**
   * @param primary preferred operation, tried first
   * @param fallbacks alternatives, tried in order after `primary` fails
   * @param options.fallbackCondition when provided, fallbacks only run for
   *        errors it accepts; other primary errors are rethrown as-is
   * @param options.onFallback observer invoked with the previous error and
   *        the index of the fallback about to run
   * @throws the most recent error when every operation fails
   */
  async executeWithFallback<T>(
    primary: () => Promise<T>,
    fallbacks: Array<() => Promise<T>>,
    options: {
      fallbackCondition?: (error: any) => boolean
      onFallback?: (error: any, fallbackIndex: number) => void
    } = {}
  ): Promise<T> {
    let latest: any

    // Try the primary operation first.
    try {
      return await primary()
    } catch (primaryError) {
      const eligible =
        !options.fallbackCondition || options.fallbackCondition(primaryError)
      if (!eligible) throw primaryError
      latest = primaryError
    }

    // Walk the fallbacks in order until one succeeds.
    let index = 0
    for (const candidate of fallbacks) {
      options.onFallback?.(latest, index)
      index += 1
      try {
        return await candidate()
      } catch (fallbackError) {
        latest = fallbackError // remember and move on to the next option
      }
    }

    throw latest
  }
}
// Usage example: Multiple data sources
const dataTool = new Tool({
  name: 'get_user_data',
  description: 'Retrieve user data with fallback sources',
  schema: z.object({ userId: z.string() }),
  handler: async ({ userId }) => {
    const fallbackHandler = new FallbackHandler()
    try {
      const data = await fallbackHandler.executeWithFallback(
        // Primary: Fast cache
        () => getUserFromCache(userId),
        // Fallbacks: Database, then external API
        [
          () => getUserFromDatabase(userId),
          () => getUserFromExternalAPI(userId)
        ],
        {
          // Only fall back for expected failure modes; anything else bubbles up.
          fallbackCondition: (error) =>
            error.message.includes('not found') ||
            error.message.includes('timeout'),
          onFallback: (error, index) => {
            console.log(`Fallback ${index + 1} triggered: ${error.message}`)
          }
        }
      )
      return JSON.stringify(data)
    } catch (error) {
      // Narrow the unknown catch value before reading .message
      // (safe under strict mode's useUnknownInCatchVariables).
      const message = error instanceof Error ? error.message : String(error)
      return `Unable to retrieve user data: ${message}`
    }
  }
})
Agent-Level Error Handling
Graceful Agent Degradation
/**
 * An Agent wrapper that survives model, tool, and rate-limit failures by
 * degrading gracefully instead of surfacing raw errors to the caller.
 */
class ResilientAgent extends Agent {
  // True while the agent is serving simplified responses after a model failure.
  private degradedMode = false
  // Tools the agent may still call; failing tools are removed at runtime.
  private availableTools: Set<string> = new Set()

  constructor(config: AgentConfig) {
    super(config)
    // Initialize with all tools available
    this.availableTools = new Set(config.tools.map(t => t.name))
  }

  /**
   * Runs the agent and converts any thrown error into a safe AgentResponse.
   * A successful run clears degraded mode.
   */
  async runWithResilience(prompt: string): Promise<AgentResponse> {
    try {
      const result = await this.run(prompt)
      // Reset degraded mode on success
      if (this.degradedMode) {
        this.degradedMode = false
        console.log('Agent recovered from degraded mode')
      }
      return result
    } catch (error) {
      return await this.handleAgentError(error, prompt)
    }
  }

  /**
   * Maps a runtime error to a user-safe response. Matching is by substring of
   * the error message (case-sensitive), so upstream errors must use these
   * phrases consistently.
   */
  private async handleAgentError(error: any, originalPrompt: string): Promise<AgentResponse> {
    // Determine error type and response strategy
    if (error.message.includes('rate limit')) {
      return {
        output: "I'm currently experiencing high demand. Please try again in a moment.",
        metadata: { error: 'rate_limited', retryAfter: 60 }
      }
    }

    if (error.message.includes('model unavailable')) {
      this.degradedMode = true
      return {
        output: "My primary reasoning system is temporarily unavailable. I'll provide a simpler response.",
        metadata: { error: 'model_degraded', degraded: true }
      }
    }

    if (error.message.includes('tool failed')) {
      // Try to continue without the failing tool
      const toolName = this.extractToolNameFromError(error)
      this.availableTools.delete(toolName)
      return {
        output: `I encountered an issue with one of my tools. Let me try a different approach.`,
        metadata: { error: 'tool_failure', failedTool: toolName }
      }
    }

    // Generic fallback
    return {
      output: "I apologize, but I'm experiencing technical difficulties. Please try again or contact support if the issue persists.",
      metadata: { error: 'unknown_error', originalError: error.message }
    }
  }

  /**
   * BUG FIX: this helper was called above but never defined anywhere in the
   * class. Best-effort parse of the failing tool's name from messages shaped
   * like "tool failed: <name>". Falls back to 'unknown' so the subsequent
   * availableTools.delete() is a harmless no-op.
   * NOTE(review): the exact message format produced by the runtime is not
   * visible here — confirm against the code that raises 'tool failed' errors.
   */
  private extractToolNameFromError(error: any): string {
    const match = /tool failed:?\s*(\S+)/i.exec(String(error?.message ?? ''))
    return match?.[1] ?? 'unknown'
  }

  // Override tool execution to check availability
  protected async executeTool(toolName: string, args: any): Promise<string> {
    if (!this.availableTools.has(toolName)) {
      throw new Error(`Tool ${toolName} is currently unavailable`)
    }
    try {
      return await super.executeTool(toolName, args)
    } catch (error) {
      // Mark tool as unavailable on failure
      this.availableTools.delete(toolName)
      throw error
    }
  }
}
Error Recovery Strategies
/**
 * Matches runtime errors against a registry of named recovery strategies and
 * executes the first strategy whose condition accepts the error.
 */
class ErrorRecoveryManager {
  private recoveryStrategies: Map<string, RecoveryStrategy> = new Map()

  constructor() {
    this.setupRecoveryStrategies()
  }

  /** Registers the built-in strategies. Map insertion order is the match order. */
  private setupRecoveryStrategies() {
    const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms))

    // Network errors: exponential backoff (capped at 30s), then retry.
    this.recoveryStrategies.set('network_timeout', {
      condition: (error) => error.code === 'ETIMEDOUT' || error.code === 'ENOTFOUND',
      action: async (error, context) => {
        await sleep(Math.min(1000 * Math.pow(2, context.attemptCount), 30000))
        return context.retry()
      }
    })

    // Rate limiting: honor the retry-after header, else back off (cap 60s).
    this.recoveryStrategies.set('rate_limited', {
      condition: (error) => error.status === 429,
      action: async (error, context) => {
        const retryAfter = error.headers?.['retry-after'] || Math.min(1000 * Math.pow(2, context.attemptCount), 60000)
        await sleep(retryAfter)
        return context.retry()
      }
    })

    // Authentication errors: refresh credentials when possible, then retry.
    this.recoveryStrategies.set('auth_failed', {
      condition: (error) => error.status === 401 || error.status === 403,
      action: async (error, context) => {
        if (context.canRefreshAuth) {
          await context.refreshAuth()
          return context.retry()
        }
        throw new Error('Authentication failed and cannot be refreshed')
      }
    })

    // 5xx responses: respect the circuit breaker, wait 5s, then retry.
    this.recoveryStrategies.set('service_unavailable', {
      condition: (error) => error.status >= 500,
      action: async (error, context) => {
        if (context.circuitBreaker.isOpen()) {
          throw new Error('Service is currently unavailable (circuit breaker open)')
        }
        await sleep(5000)
        return context.retry()
      }
    })
  }

  /**
   * Applies the first matching strategy; rethrows the error unchanged when no
   * strategy matches.
   */
  async attemptRecovery(error: any, context: RecoveryContext): Promise<any> {
    for (const [name, strategy] of this.recoveryStrategies.entries()) {
      if (!strategy.condition(error)) continue
      console.log(`Applying recovery strategy: ${name}`)
      return await strategy.action(error, context)
    }
    throw error // nothing matched; let the caller deal with it
  }
}
/**
 * A named recovery behavior: `condition` decides whether the strategy applies
 * to a given error; `action` performs the recovery (backoff, token refresh,
 * retry, ...) and resolves to the retried operation's result.
 */
interface RecoveryStrategy {
  condition: (error: any) => boolean
  action: (error: any, context: RecoveryContext) => Promise<any>
}

/**
 * Context handed to a recovery action by the caller.
 */
interface RecoveryContext {
  // How many attempts have already been made; used to scale backoff delays.
  attemptCount: number
  // Re-runs the failed operation.
  retry: () => Promise<any>
  // Present/true when the caller supports credential refresh
  // (consumed by the 'auth_failed' strategy).
  canRefreshAuth?: boolean
  refreshAuth?: () => Promise<void>
  // Optional circuit breaker consulted by the 'service_unavailable' strategy.
  circuitBreaker?: { isOpen: () => boolean }
}
Testing Error Scenarios
import { Agent, MockProvider } from '@akios/core'
describe('Agent Error Handling', () => {
  test('handles API timeouts gracefully', async () => {
    const mockProvider = new MockProvider([
      // First attempt fails
      { role: 'assistant', content: null, tool_calls: [{ name: 'api_call', arguments: {} }] },
      { role: 'tool', content: 'Error: Connection timeout' },
      // Agent should retry or handle gracefully
      { role: 'assistant', content: 'The service is currently slow. Let me try again.' }
    ])

    const agent = new Agent({
      name: 'test-agent',
      model: mockProvider,
      tools: [/* tools that can timeout */]
    })

    const result = await agent.run('Make an API call')
    // BUG FIX: assertions referenced an undefined `response`; the run result
    // is stored in `result`.
    expect(result.text).toContain('slow') // Should provide helpful message
    expect(result.text).not.toContain('Error:') // Should not expose raw errors
  })

  test('enters degraded mode when model fails', async () => {
    // BUG FIX: must construct a ResilientAgent — the plain Agent has no
    // runWithResilience() method.
    const agent = new ResilientAgent({
      name: 'test-agent',
      model: 'failing-model', // Mock that throws errors
      tools: []
    })

    // Mock the model to fail. BUG FIX: the message must be lowercase
    // 'model unavailable' — handleAgentError's includes() check is
    // case-sensitive, so 'Model unavailable' would hit the generic fallback.
    jest.spyOn(agent, 'run').mockRejectedValue(new Error('model unavailable'))

    const result = await agent.runWithResilience('Hello')
    expect(result.metadata?.degraded).toBe(true)
    // BUG FIX: `response` was undefined; the resilient response is `result`
    // and its user-visible text lives on `output`.
    expect(result.output).toContain('simpler response')
  })

  test('circuit breaker prevents cascade failures', async () => {
    const circuitBreaker = new CircuitBreaker(2, 1000)

    // Simulate multiple failures
    for (let i = 0; i < 3; i++) {
      try {
        await circuitBreaker.execute(() => Promise.reject(new Error('Service down')))
      } catch (error) {
        // Expected
      }
    }

    // Next call should be blocked by circuit breaker
    await expect(
      circuitBreaker.execute(() => Promise.resolve('success'))
    ).rejects.toThrow('Circuit breaker is OPEN')
  })

  test('fallback strategies work correctly', async () => {
    const fallbackHandler = new FallbackHandler()
    let primaryCalled = false
    let fallbackCalled = false

    const result = await fallbackHandler.executeWithFallback(
      () => {
        primaryCalled = true
        throw new Error('Primary failed')
      },
      [
        () => {
          fallbackCalled = true
          return Promise.resolve('Fallback success')
        }
      ]
    )

    expect(primaryCalled).toBe(true)
    expect(fallbackCalled).toBe(true)
    expect(result).toBe('Fallback success')
  })
})
Monitoring & Alerting
/**
 * Tracks error occurrences per error type, fires alert rules as counts grow,
 * and emits structured JSON error logs.
 */
class ErrorMonitor {
  private errorCounts: Map<string, number> = new Map()
  private alerts: AlertRule[] = []

  constructor() {
    this.setupAlertRules()
  }

  /** Installs the built-in alert rules. */
  private setupAlertRules() {
    const highErrorRate: AlertRule = {
      name: 'high_error_rate',
      condition: (errorType, count, timeWindow) =>
        count / timeWindow > 0.1, // More than 10% error rate
      action: (errorType, count) =>
        this.sendAlert(`High error rate for ${errorType}: ${count} errors`)
    }
    const circuitBreakerOpen: AlertRule = {
      name: 'circuit_breaker_open',
      condition: (errorType) => errorType === 'circuit_breaker_open',
      action: () => this.sendAlert('Circuit breaker opened - service degraded')
    }
    const repeatedTimeouts: AlertRule = {
      name: 'repeated_timeouts',
      condition: (errorType, count) =>
        errorType === 'timeout' && count > 5,
      action: () => this.sendAlert('Multiple timeout errors detected')
    }
    this.alerts = [highErrorRate, circuitBreakerOpen, repeatedTimeouts]
  }

  /**
   * Records one occurrence of `errorType`, evaluates every alert rule against
   * the running count, and writes a structured JSON error log.
   */
  recordError(errorType: string, error: any) {
    const total = (this.errorCounts.get(errorType) ?? 0) + 1
    this.errorCounts.set(errorType, total)

    // Evaluate the rules against a 5-minute (300s) window.
    this.alerts
      .filter(rule => rule.condition(errorType, total, 300))
      .forEach(rule => rule.action(errorType, total))

    // Structured log entry for downstream aggregation.
    console.error(JSON.stringify({
      timestamp: new Date().toISOString(),
      errorType,
      message: error.message,
      stack: error.stack,
      count: total,
      metadata: {
        userAgent: 'agent-runtime',
        version: process.env.AGENT_VERSION
      }
    }))
  }

  /** Emits an alert. In production, forward to a monitoring service. */
  private sendAlert(message: string) {
    console.warn(`ALERT: ${message}`)
    // Could also send email, Slack notification, etc.
    // sendToMonitoringService('alert', { message, severity: 'warning' })
  }

  /** Snapshot of per-type error counts as a plain object. */
  getErrorStats(): Record<string, number> {
    return Object.fromEntries(this.errorCounts)
  }
}
/**
 * A monitoring alert rule evaluated on every recorded error.
 */
interface AlertRule {
  // Identifier for the rule (bookkeeping/debugging).
  name: string
  // Decides whether the rule fires for the given error type, running count,
  // and evaluation window (seconds).
  condition: (errorType: string, count: number, timeWindow: number) => boolean
  // Invoked when `condition` returns true; count may be omitted by callers
  // whose rules don't need it.
  action: (errorType: string, count?: number) => void
}
// Usage in agent
const errorMonitor = new ErrorMonitor()

// In error handling code
try {
  await riskyOperation()
} catch (error) {
  // Record first so monitoring sees every failure, then recover/respond.
  errorMonitor.recordError('api_call_failed', error)
  // Handle the error...
}
Error Handling Best Practices
Fail Fast vs. Graceful Degradation: Know when to stop trying vs. when to continue with reduced functionality.
User-Friendly Error Messages: Never expose internal error details to users. Provide actionable feedback instead.
Structured Logging: Log errors with context, but don't log sensitive information like passwords or tokens.
Cascading Failure Prevention: Use circuit breakers and bulkheads to prevent one failure from bringing down the entire system.
Recovery Testing: Regularly test your error handling by simulating failures in staging environments.