Error Handling
Strategies for building resilient agents that can recover from failures gracefully.
Core Principles
Resilience
Agents should continue to function even when individual components or model calls fail.
Observability
Failures must be logged, monitored, and alerted on to ensure system health.
Graceful Degradation
When advanced features fail, fallback to simpler, reliable alternatives.
Self Correction
Agents should attempt to fix their own mistakes before giving up.
Automatic Retries
Implement the Circuit Breaker Pattern
/**
 * Circuit breaker guarding an async operation.
 *
 * After `failureThreshold` consecutive failures the circuit opens and every
 * call is rejected immediately. Once `recoveryTimeout` ms have elapsed since
 * the last failure, a single trial call is allowed (half-open); success fully
 * closes the circuit, failure re-opens it.
 */
class CircuitBreaker {
  private failureCount = 0
  private lastFailureAt = 0
  private state: 'closed' | 'open' | 'half-open' = 'closed'

  constructor(
    private failureThreshold: number = 5,
    private recoveryTimeout: number = 60000 // 1 minute
  ) {}

  /**
   * Runs `operation` under circuit-breaker protection.
   * @throws Error when the circuit is open, or rethrows the operation's error.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      const elapsed = Date.now() - this.lastFailureAt
      if (elapsed <= this.recoveryTimeout) {
        throw new Error('Circuit breaker is OPEN - service unavailable')
      }
      // Cooled down: permit one trial call.
      this.state = 'half-open'
    }

    try {
      const value = await operation()
      // Any success fully closes the circuit and clears the failure streak.
      this.failureCount = 0
      this.state = 'closed'
      return value
    } catch (err) {
      this.recordFailure()
      throw err
    }
  }

  /** Counts a failure; opens the circuit once the threshold is reached. */
  private recordFailure() {
    this.failureCount += 1
    this.lastFailureAt = Date.now()
    if (this.failureCount >= this.failureThreshold) {
      this.state = 'open'
    }
  }
}
// Usage in a tool
const apiCircuitBreaker = new CircuitBreaker(3, 30000) // 3 failures, 30s timeout
const resilientApiTool = new Tool({
name: 'call_external_api',
description: 'Make API calls with circuit breaker protection',
schema: z.object({ endpoint: z.string() }),
handler: async ({ endpoint }) => {
try {
return await apiCircuitBreaker.execute(() =>
callExternalAPI(endpoint)
)
} catch (error) {
if (error.message.includes('Circuit breaker is OPEN')) {
return "Service temporarily unavailable. Please try again later."
}
return `API call failed: ${error.message}`
}
}
})
Add Retry Logic with Exponential Backoff
/**
 * Retries an async operation with exponential backoff and jitter.
 */
class RetryHandler {
  /**
   * Runs `operation`, retrying on failure.
   *
   * @param operation async operation to attempt
   * @param options.maxAttempts total attempts (must be >= 1)
   * @param options.baseDelay first backoff delay in ms
   * @param options.maxDelay upper bound for any single delay in ms
   * @param options.backoffFactor multiplier applied per attempt
   * @param options.retryCondition when provided, only errors it accepts are
   *        retried; others propagate immediately
   * @throws the last error once attempts are exhausted, or immediately when
   *         `retryCondition` rejects the error
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    options: {
      maxAttempts: number
      baseDelay: number
      maxDelay: number
      backoffFactor: number
      retryCondition?: (error: any) => boolean
    }
  ): Promise<T> {
    // BUG FIX: with maxAttempts < 1 the loop body never ran and the code
    // reached `throw lastError` with lastError still undefined, so callers
    // received a thrown `undefined`. Fail loudly with a real Error instead.
    if (options.maxAttempts < 1) {
      throw new Error('maxAttempts must be at least 1')
    }

    let lastError: any
    for (let attempt = 1; attempt <= options.maxAttempts; attempt++) {
      try {
        return await operation()
      } catch (error) {
        lastError = error
        // Non-retryable errors propagate immediately.
        if (options.retryCondition && !options.retryCondition(error)) {
          throw error
        }
        // Out of attempts: surface the final failure.
        if (attempt === options.maxAttempts) {
          throw error
        }
        // Exponential backoff, capped at maxDelay.
        const delay = Math.min(
          options.baseDelay * Math.pow(options.backoffFactor, attempt - 1),
          options.maxDelay
        )
        // 50-100% jitter prevents a thundering herd of synchronized retries.
        const jitteredDelay = delay * (0.5 + Math.random() * 0.5)
        await new Promise(resolve => setTimeout(resolve, jitteredDelay))
      }
    }
    throw lastError // unreachable when maxAttempts >= 1; kept as a safety net
  }
}
// Usage in tools
const retryHandler = new RetryHandler()
const databaseTool = new Tool({
name: 'query_database',
description: 'Execute database queries with retry logic',
schema: z.object({ query: z.string() }),
handler: async ({ query }) => {
try {
return await retryHandler.executeWithRetry(
() => executeDatabaseQuery(query),
{
maxAttempts: 3,
baseDelay: 1000, // 1 second
maxDelay: 10000, // 10 seconds
backoffFactor: 2,
retryCondition: (error) =>
error.code === 'CONNECTION_LOST' ||
error.code === 'TIMEOUT'
}
)
} catch (error) {
return `Database query failed after retries: ${error.message}`
}
}
})
Implement Fallback Strategies
/**
 * Runs a primary async operation and, when it fails, walks an ordered list
 * of fallback operations until one succeeds.
 */
class FallbackHandler {
  /**
   * @param primary preferred operation, tried first
   * @param fallbacks alternatives, tried in order after `primary` fails
   * @param options.fallbackCondition when provided, fallbacks only run for
   *        errors it accepts; other primary errors are rethrown as-is
   * @param options.onFallback observer invoked with the previous error and
   *        the index of the fallback about to run
   * @throws the most recent error when every operation fails
   */
  async executeWithFallback<T>(
    primary: () => Promise<T>,
    fallbacks: Array<() => Promise<T>>,
    options: {
      fallbackCondition?: (error: any) => boolean
      onFallback?: (error: any, fallbackIndex: number) => void
    } = {}
  ): Promise<T> {
    let latest: any

    // Try the primary operation first.
    try {
      return await primary()
    } catch (primaryError) {
      const eligible =
        !options.fallbackCondition || options.fallbackCondition(primaryError)
      if (!eligible) throw primaryError
      latest = primaryError
    }

    // Walk the fallbacks in order until one succeeds.
    let index = 0
    for (const candidate of fallbacks) {
      options.onFallback?.(latest, index)
      index += 1
      try {
        return await candidate()
      } catch (fallbackError) {
        latest = fallbackError // remember and move on to the next option
      }
    }

    throw latest
  }
}
// Usage example: Multiple data sources
const dataTool = new Tool({
  name: 'get_user_data',
  description: 'Retrieve user data with fallback sources',
  schema: z.object({ userId: z.string() }),
  handler: async ({ userId }) => {
    const fallbackHandler = new FallbackHandler()
    try {
      const data = await fallbackHandler.executeWithFallback(
        // Primary: Fast cache
        () => getUserFromCache(userId),
        // Fallbacks: Database, then external API
        [
          () => getUserFromDatabase(userId),
          () => getUserFromExternalAPI(userId)
        ],
        {
          // Only fall back for expected failure modes; anything else bubbles up.
          fallbackCondition: (error) =>
            error.message.includes('not found') ||
            error.message.includes('timeout'),
          onFallback: (error, index) => {
            console.log(`Fallback ${index + 1} triggered: ${error.message}`)
          }
        }
      )
      return JSON.stringify(data)
    } catch (error) {
      // Narrow the unknown catch value before reading .message
      // (safe under strict mode's useUnknownInCatchVariables).
      const message = error instanceof Error ? error.message : String(error)
      return `Unable to retrieve user data: ${message}`
    }
  }
})
Agent-Level Error Handling
Graceful Agent Degradation
/**
 * An Agent wrapper that survives model, tool, and rate-limit failures by
 * degrading gracefully instead of surfacing raw errors to the caller.
 */
class ResilientAgent extends Agent {
  // True while the agent is serving simplified responses after a model failure.
  private degradedMode = false
  // Tools the agent may still call; failing tools are removed at runtime.
  private availableTools: Set<string> = new Set()

  constructor(config: AgentConfig) {
    super(config)
    // Initialize with all tools available
    this.availableTools = new Set(config.tools.map(t => t.name))
  }

  /**
   * Runs the agent and converts any thrown error into a safe AgentResponse.
   * A successful run clears degraded mode.
   */
  async runWithResilience(prompt: string): Promise<AgentResponse> {
    try {
      const result = await this.run(prompt)
      // Reset degraded mode on success
      if (this.degradedMode) {
        this.degradedMode = false
        console.log('Agent recovered from degraded mode')
      }
      return result
    } catch (error) {
      return await this.handleAgentError(error, prompt)
    }
  }

  /**
   * Maps a runtime error to a user-safe response. Matching is by substring of
   * the error message (case-sensitive), so upstream errors must use these
   * phrases consistently.
   */
  private async handleAgentError(error: any, originalPrompt: string): Promise<AgentResponse> {
    // Determine error type and response strategy
    if (error.message.includes('rate limit')) {
      return {
        output: "I'm currently experiencing high demand. Please try again in a moment.",
        metadata: { error: 'rate_limited', retryAfter: 60 }
      }
    }

    if (error.message.includes('model unavailable')) {
      this.degradedMode = true
      return {
        output: "My primary reasoning system is temporarily unavailable. I'll provide a simpler response.",
        metadata: { error: 'model_degraded', degraded: true }
      }
    }

    if (error.message.includes('tool failed')) {
      // Try to continue without the failing tool
      const toolName = this.extractToolNameFromError(error)
      this.availableTools.delete(toolName)
      return {
        output: `I encountered an issue with one of my tools. Let me try a different approach.`,
        metadata: { error: 'tool_failure', failedTool: toolName }
      }
    }

    // Generic fallback
    return {
      output: "I apologize, but I'm experiencing technical difficulties. Please try again or contact support if the issue persists.",
      metadata: { error: 'unknown_error', originalError: error.message }
    }
  }

  /**
   * BUG FIX: this helper was called above but never defined anywhere in the
   * class. Best-effort parse of the failing tool's name from messages shaped
   * like "tool failed: <name>". Falls back to 'unknown' so the subsequent
   * availableTools.delete() is a harmless no-op.
   * NOTE(review): the exact message format produced by the runtime is not
   * visible here — confirm against the code that raises 'tool failed' errors.
   */
  private extractToolNameFromError(error: any): string {
    const match = /tool failed:?\s*(\S+)/i.exec(String(error?.message ?? ''))
    return match?.[1] ?? 'unknown'
  }

  // Override tool execution to check availability
  protected async executeTool(toolName: string, args: any): Promise<string> {
    if (!this.availableTools.has(toolName)) {
      throw new Error(`Tool ${toolName} is currently unavailable`)
    }
    try {
      return await super.executeTool(toolName, args)
    } catch (error) {
      // Mark tool as unavailable on failure
      this.availableTools.delete(toolName)
      throw error
    }
  }
}
Error Recovery Strategies
/**
 * Matches runtime errors against a registry of named recovery strategies and
 * executes the first strategy whose condition accepts the error.
 */
class ErrorRecoveryManager {
  private recoveryStrategies: Map<string, RecoveryStrategy> = new Map()

  constructor() {
    this.setupRecoveryStrategies()
  }

  /** Registers the built-in strategies. Map insertion order is the match order. */
  private setupRecoveryStrategies() {
    const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms))

    // Network errors: exponential backoff (capped at 30s), then retry.
    this.recoveryStrategies.set('network_timeout', {
      condition: (error) => error.code === 'ETIMEDOUT' || error.code === 'ENOTFOUND',
      action: async (error, context) => {
        await sleep(Math.min(1000 * Math.pow(2, context.attemptCount), 30000))
        return context.retry()
      }
    })

    // Rate limiting: honor the retry-after header, else back off (cap 60s).
    this.recoveryStrategies.set('rate_limited', {
      condition: (error) => error.status === 429,
      action: async (error, context) => {
        const retryAfter = error.headers?.['retry-after'] || Math.min(1000 * Math.pow(2, context.attemptCount), 60000)
        await sleep(retryAfter)
        return context.retry()
      }
    })

    // Authentication errors: refresh credentials when possible, then retry.
    this.recoveryStrategies.set('auth_failed', {
      condition: (error) => error.status === 401 || error.status === 403,
      action: async (error, context) => {
        if (context.canRefreshAuth) {
          await context.refreshAuth()
          return context.retry()
        }
        throw new Error('Authentication failed and cannot be refreshed')
      }
    })

    // 5xx responses: respect the circuit breaker, wait 5s, then retry.
    this.recoveryStrategies.set('service_unavailable', {
      condition: (error) => error.status >= 500,
      action: async (error, context) => {
        if (context.circuitBreaker.isOpen()) {
          throw new Error('Service is currently unavailable (circuit breaker open)')
        }
        await sleep(5000)
        return context.retry()
      }
    })
  }

  /**
   * Applies the first matching strategy; rethrows the error unchanged when no
   * strategy matches.
   */
  async attemptRecovery(error: any, context: RecoveryContext): Promise<any> {
    for (const [name, strategy] of this.recoveryStrategies.entries()) {
      if (!strategy.condition(error)) continue
      console.log(`Applying recovery strategy: ${name}`)
      return await strategy.action(error, context)
    }
    throw error // nothing matched; let the caller deal with it
  }
}
/**
 * A named recovery behavior: `condition` decides whether the strategy applies
 * to a given error; `action` performs the recovery (backoff, token refresh,
 * retry, ...) and resolves to the retried operation's result.
 */
interface RecoveryStrategy {
  condition: (error: any) => boolean
  action: (error: any, context: RecoveryContext) => Promise<any>
}

/**
 * Context handed to a recovery action by the caller.
 */
interface RecoveryContext {
  // How many attempts have already been made; used to scale backoff delays.
  attemptCount: number
  // Re-runs the failed operation.
  retry: () => Promise<any>
  // Present/true when the caller supports credential refresh
  // (consumed by the 'auth_failed' strategy).
  canRefreshAuth?: boolean
  refreshAuth?: () => Promise<void>
  // Optional circuit breaker consulted by the 'service_unavailable' strategy.
  circuitBreaker?: { isOpen: () => boolean }
}
Testing Error Scenarios
import { Agent, MockProvider } from '@akios/core'
describe('Agent Error Handling', () => {
  test('handles API timeouts gracefully', async () => {
    const mockProvider = new MockProvider([
      // First attempt fails
      { role: 'assistant', content: null, tool_calls: [{ name: 'api_call', arguments: {} }] },
      { role: 'tool', content: 'Error: Connection timeout' },
      // Agent should retry or handle gracefully
      { role: 'assistant', content: 'The service is currently slow. Let me try again.' }
    ])

    const agent = new Agent({
      name: 'test-agent',
      model: mockProvider,
      tools: [/* tools that can timeout */]
    })

    const result = await agent.run('Make an API call')
    // BUG FIX: assertions referenced an undefined `response`; the run result
    // is stored in `result`.
    expect(result.text).toContain('slow') // Should provide helpful message
    expect(result.text).not.toContain('Error:') // Should not expose raw errors
  })

  test('enters degraded mode when model fails', async () => {
    // BUG FIX: must construct a ResilientAgent — the plain Agent has no
    // runWithResilience() method.
    const agent = new ResilientAgent({
      name: 'test-agent',
      model: 'failing-model', // Mock that throws errors
      tools: []
    })

    // Mock the model to fail. BUG FIX: the message must be lowercase
    // 'model unavailable' — handleAgentError's includes() check is
    // case-sensitive, so 'Model unavailable' would hit the generic fallback.
    jest.spyOn(agent, 'run').mockRejectedValue(new Error('model unavailable'))

    const result = await agent.runWithResilience('Hello')
    expect(result.metadata?.degraded).toBe(true)
    // BUG FIX: `response` was undefined; the resilient response is `result`
    // and its user-visible text lives on `output`.
    expect(result.output).toContain('simpler response')
  })

  test('circuit breaker prevents cascade failures', async () => {
    const circuitBreaker = new CircuitBreaker(2, 1000)

    // Simulate multiple failures
    for (let i = 0; i < 3; i++) {
      try {
        await circuitBreaker.execute(() => Promise.reject(new Error('Service down')))
      } catch (error) {
        // Expected
      }
    }

    // Next call should be blocked by circuit breaker
    await expect(
      circuitBreaker.execute(() => Promise.resolve('success'))
    ).rejects.toThrow('Circuit breaker is OPEN')
  })

  test('fallback strategies work correctly', async () => {
    const fallbackHandler = new FallbackHandler()
    let primaryCalled = false
    let fallbackCalled = false

    const result = await fallbackHandler.executeWithFallback(
      () => {
        primaryCalled = true
        throw new Error('Primary failed')
      },
      [
        () => {
          fallbackCalled = true
          return Promise.resolve('Fallback success')
        }
      ]
    )

    expect(primaryCalled).toBe(true)
    expect(fallbackCalled).toBe(true)
    expect(result).toBe('Fallback success')
  })
})
Monitoring & Alerting
/**
 * Tracks error occurrences per error type, fires alert rules as counts grow,
 * and emits structured JSON error logs.
 */
class ErrorMonitor {
  private errorCounts: Map<string, number> = new Map()
  private alerts: AlertRule[] = []

  constructor() {
    this.setupAlertRules()
  }

  /** Installs the built-in alert rules. */
  private setupAlertRules() {
    const highErrorRate: AlertRule = {
      name: 'high_error_rate',
      condition: (errorType, count, timeWindow) =>
        count / timeWindow > 0.1, // More than 10% error rate
      action: (errorType, count) =>
        this.sendAlert(`High error rate for ${errorType}: ${count} errors`)
    }
    const circuitBreakerOpen: AlertRule = {
      name: 'circuit_breaker_open',
      condition: (errorType) => errorType === 'circuit_breaker_open',
      action: () => this.sendAlert('Circuit breaker opened - service degraded')
    }
    const repeatedTimeouts: AlertRule = {
      name: 'repeated_timeouts',
      condition: (errorType, count) =>
        errorType === 'timeout' && count > 5,
      action: () => this.sendAlert('Multiple timeout errors detected')
    }
    this.alerts = [highErrorRate, circuitBreakerOpen, repeatedTimeouts]
  }

  /**
   * Records one occurrence of `errorType`, evaluates every alert rule against
   * the running count, and writes a structured JSON error log.
   */
  recordError(errorType: string, error: any) {
    const total = (this.errorCounts.get(errorType) ?? 0) + 1
    this.errorCounts.set(errorType, total)

    // Evaluate the rules against a 5-minute (300s) window.
    this.alerts
      .filter(rule => rule.condition(errorType, total, 300))
      .forEach(rule => rule.action(errorType, total))

    // Structured log entry for downstream aggregation.
    console.error(JSON.stringify({
      timestamp: new Date().toISOString(),
      errorType,
      message: error.message,
      stack: error.stack,
      count: total,
      metadata: {
        userAgent: 'agent-runtime',
        version: process.env.AGENT_VERSION
      }
    }))
  }

  /** Emits an alert. In production, forward to a monitoring service. */
  private sendAlert(message: string) {
    console.warn(`ALERT: ${message}`)
    // Could also send email, Slack notification, etc.
    // sendToMonitoringService('alert', { message, severity: 'warning' })
  }

  /** Snapshot of per-type error counts as a plain object. */
  getErrorStats(): Record<string, number> {
    return Object.fromEntries(this.errorCounts)
  }
}
/**
 * A monitoring alert rule evaluated on every recorded error.
 */
interface AlertRule {
  // Identifier for the rule (bookkeeping/debugging).
  name: string
  // Decides whether the rule fires for the given error type, running count,
  // and evaluation window (seconds).
  condition: (errorType: string, count: number, timeWindow: number) => boolean
  // Invoked when `condition` returns true; count may be omitted by callers
  // whose rules don't need it.
  action: (errorType: string, count?: number) => void
}
// Usage in agent
const errorMonitor = new ErrorMonitor()

// In error handling code
try {
  await riskyOperation()
} catch (error) {
  // Record first so monitoring sees every failure, then recover/respond.
  errorMonitor.recordError('api_call_failed', error)
  // Handle the error...
}
Error Handling Best Practices
Fail Fast vs. Graceful Degradation: Know when to stop trying vs. when to continue with reduced functionality.
User-Friendly Error Messages: Never expose internal error details to users. Provide actionable feedback instead.
Structured Logging: Log errors with context, but don't log sensitive information like passwords or tokens.
Cascading Failure Prevention: Use circuit breakers and bulkheads to prevent one failure from bringing down the entire system.
Recovery Testing: Regularly test your error handling by simulating failures in staging environments.