PERFORMANCE OPTIMIZATION
Optimize latency, throughput, and resource usage for production agents. Learn caching, parallelization, and efficient prompting techniques.
PERFORMANCE_FUNDAMENTALS#
Agent performance directly impacts user experience and operational cost. A slow agent loses users; an inefficient one wastes money. Focus on three key metrics: latency (response time), throughput (requests served per second), and cost (API tokens and compute consumed per request).
LATENCY
How fast can your agent respond? Target <2s for good UX, <500ms for real-time.
THROUGHPUT
How many concurrent requests can you handle? Scale horizontally, not vertically.
COST_EFFICIENCY
Minimize API calls and compute time. Cache aggressively and batch operations (a minimal batching sketch follows below).
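To make the batching lever concrete, here is a minimal, framework-agnostic sketch that coalesces many small calls into one upstream request. It is illustrative only: batchFn and the embedMany call in the usage comment are hypothetical stand-ins for whatever batched endpoint your provider actually exposes.
class MicroBatcher<TIn, TOut> {
  private pending: Array<{ input: TIn; resolve: (out: TOut) => void; reject: (err: unknown) => void }> = []
  private timer: ReturnType<typeof setTimeout> | null = null
  constructor(
    private batchFn: (inputs: TIn[]) => Promise<TOut[]>, // one upstream call for many inputs (hypothetical)
    private maxBatchSize = 32, // flush once this many items are queued
    private maxWaitMs = 25     // ...or after this many milliseconds, whichever comes first
  ) {}
  // Callers submit single items; the batcher coalesces them into one upstream request
  enqueue(input: TIn): Promise<TOut> {
    return new Promise<TOut>((resolve, reject) => {
      this.pending.push({ input, resolve, reject })
      if (this.pending.length >= this.maxBatchSize) {
        void this.flush()
      } else if (!this.timer) {
        this.timer = setTimeout(() => void this.flush(), this.maxWaitMs)
      }
    })
  }
  private async flush(): Promise<void> {
    if (this.timer) {
      clearTimeout(this.timer)
      this.timer = null
    }
    const batch = this.pending.splice(0, this.pending.length)
    if (batch.length === 0) return
    try {
      // One upstream request instead of batch.length separate ones
      const outputs = await this.batchFn(batch.map(b => b.input))
      batch.forEach((b, i) => b.resolve(outputs[i]))
    } catch (error) {
      batch.forEach(b => b.reject(error))
    }
  }
}
// Usage (hypothetical embedMany endpoint): many concurrent callers, one request per window
// const embedBatcher = new MicroBatcher((texts: string[]) => embedMany(texts))
// const vector = await embedBatcher.enqueue('refund policy')
The trade-off is a small added wait (maxWaitMs) per request in exchange for far fewer upstream calls.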
IMPLEMENT_MULTI_LEVEL_CACHING
class MultiLevelCache {
private l1Cache = new Map<string, { data: any, expires: number }>() // In-memory
private l2Cache: Redis // Distributed cache (e.g. an ioredis client, wired up outside this snippet)
private l3Cache: Database // Persistent storage (e.g. a Prisma-style model delegate, wired up outside this snippet)
async get<T>(key: string, ttl: number = 300000): Promise<T | null> {
// L1: Fast in-memory cache
const l1Data = this.l1Cache.get(key)
if (l1Data && l1Data.expires > Date.now()) {
return l1Data.data
}
// L2: Distributed cache
try {
const l2Data = await this.l2Cache.get(key)
if (l2Data) {
  const parsed = JSON.parse(l2Data)
  // Populate L1 for faster future access
  this.l1Cache.set(key, { data: parsed, expires: Date.now() + ttl })
  return parsed
}
} catch (error) {
console.warn('L2 cache unavailable:', error)
}
// L3: Database fallback
const l3Data = await this.l3Cache.findUnique({ where: { key } })
if (l3Data) {
// Populate higher caches
const data = l3Data.value
this.l1Cache.set(key, { data, expires: Date.now() + ttl })
await this.l2Cache.setex(key, Math.ceil(ttl / 1000), JSON.stringify(data)) // setex expects whole seconds
return data
}
return null
}
async set<T>(key: string, value: T, ttl: number = 300000): Promise<void> {
const expires = Date.now() + ttl
// Set all levels
this.l1Cache.set(key, { data: value, expires })
await this.l2Cache.setex(key, Math.ceil(ttl / 1000), JSON.stringify(value)) // setex expects whole seconds
await this.l3Cache.upsert({
where: { key },
update: { value, expires },
create: { key, value, expires }
})
}
// Cleanup expired L1 entries
cleanup() {
const now = Date.now()
for (const [key, value] of this.l1Cache) {
if (value.expires < now) {
this.l1Cache.delete(key)
}
}
}
}
// Usage in agent tools
const cache = new MultiLevelCache()
const cachedSearchTool = new Tool({
name: 'search_with_cache',
description: 'Search with intelligent caching',
schema: z.object({
query: z.string(),
useCache: z.boolean().default(true)
}),
execute: async ({ query, useCache }) => {
if (useCache) {
const cached = await cache.get(`search:${query}`)
if (cached) {
return JSON.stringify({ ...cached, cached: true })
}
}
// Perform actual search
const results = await performExpensiveSearch(query)
// Cache for future use
await cache.set(`search:${query}`, results, 600000) // 10 minutes
return JSON.stringify({ ...results, cached: false })
}
})
OPTIMIZE_PROMPTS_FOR_SPEED
class PromptOptimizer {
// Pre-compile common prompt templates
private templates = new Map<string, string>()
constructor() {
this.compileTemplates()
}
private compileTemplates() {
// Template with placeholders for dynamic content
this.templates.set('customer_support', `
You are a customer support agent for {{companyName}}.
Customer: {{customerMessage}}
Guidelines:
- Be helpful and friendly
- Use the following context: {{context}}
- Keep responses under 200 words
- If you need more information, ask specific questions
Response:`)
this.templates.set('data_analysis', `
Analyze this dataset: {{data}}
Requirements:
- Focus on {{focusArea}}
- Provide {{outputFormat}} output
- Highlight {{keyMetrics}}
Analysis:`)
}
optimizePrompt(templateName: string, variables: Record<string, string>): string {
let prompt = this.templates.get(templateName)
if (!prompt) throw new Error(`Unknown template: ${templateName}`)
// Replace variables
for (const [key, value] of Object.entries(variables)) {
const placeholder = '{{' + key + '}}'
prompt = prompt.split(placeholder).join(value)
}
// Remove unused variables (cleanup)
prompt = prompt.replace(/{{[^}]+}}/g, '')
return prompt.trim()
}
// Compress prompt by removing redundant information
compressPrompt(prompt: string): string {
return prompt
// Remove excessive whitespace
.replace(/[\n\r]+/g, '\n')
.replace(/[ \t]+/g, ' ')
// Remove redundant phrases
.replace(/please assist me with/g, 'help with')
.replace(/I would like to/g, 'I want to')
// Hard length cap to leave room for the response
.substring(0, 8000)
}
}
// Usage
const optimizer = new PromptOptimizer()
const agent = new Agent({
name: 'optimized-agent',
model: 'gpt-4',
systemPrompt: optimizer.optimizePrompt('customer_support', {
companyName: 'Acme Corp',
context: 'Product returns within 30 days',
customerMessage: 'I want to return my order'
})
})
IMPLEMENT_PARALLEL_PROCESSING
class ParallelProcessor {
async processInParallel<T, R>(
items: T[],
processor: (item: T) => Promise<R>,
options: {
concurrency: number
onProgress?: (completed: number, total: number) => void
timeout?: number
}
): Promise<R[]> {
const results: R[] = []
const semaphore = new Semaphore(options.concurrency)
const processItem = async (item: T, index: number): Promise<void> => {
await semaphore.acquire()
try {
const timeoutPromise = options.timeout
? Promise.race([
processor(item),
new Promise<never>((_, reject) =>
setTimeout(() => reject(new Error('Timeout')), options.timeout)
)
])
: processor(item)
const result = await timeoutPromise
results[index] = result
options.onProgress?.(results.filter(r => r !== undefined).length, items.length)
} finally {
semaphore.release()
}
}
// Start all operations
const promises = items.map((item, index) => processItem(item, index))
// Wait for all to complete
await Promise.allSettled(promises)
return results
}
}
class Semaphore {
private permits: number
private waitQueue: Array<() => void> = []
constructor(permits: number) {
this.permits = permits
}
async acquire(): Promise<void> {
if (this.permits > 0) {
this.permits--
return
}
return new Promise(resolve => {
this.waitQueue.push(resolve)
})
}
release(): void {
this.permits++
if (this.waitQueue.length > 0) {
const resolve = this.waitQueue.shift()!
this.permits--
resolve()
}
}
}
// Usage in agent tools
const processor = new ParallelProcessor()
const batchAnalysisTool = new Tool({
name: 'analyze_multiple_documents',
description: 'Analyze multiple documents in parallel',
schema: z.object({
documents: z.array(z.string()),
analysisType: z.enum(['summary', 'sentiment', 'keywords'])
}),
execute: async ({ documents, analysisType }) => {
const results = await processor.processInParallel(
documents,
async (doc) => {
// Each document analysis runs in parallel
return await analyzeDocument(doc, analysisType)
},
{
concurrency: 5, // Max 5 concurrent analyses
timeout: 30000, // 30 second timeout per document
onProgress: (completed, total) => {
console.log(`Progress: ${completed}/${total} documents analyzed`)
}
}
)
return JSON.stringify({
totalDocuments: documents.length,
completedAnalyses: results.length,
results
})
}
})
ADVANCED_OPTIMIZATION_TECHNIQUES#
RESPONSE_STREAMING
Stream responses to reduce perceived latency and enable real-time interactions.
class StreamingAgent extends Agent {
async *runStreaming(prompt: string): AsyncGenerator<string, void, unknown> {
const messages = this.buildMessages(prompt)
let buffer = ''
try {
const stream = await this.model.stream(messages)
for await (const chunk of stream) {
buffer += chunk.content || ''
// Yield complete words/tokens for better UX
const words = buffer.split(' ')
if (words.length > 1) {
const completeWords = words.slice(0, -1).join(' ') + ' '
buffer = words[words.length - 1]
yield completeWords
}
}
// Yield remaining buffer
if (buffer.trim()) {
yield buffer
}
} catch (error) {
yield `Error: ${error.message}`
}
}
}
// Usage with real-time UI updates
async function handleStreamingResponse(prompt: string) {
const agent = new StreamingAgent({ /* config */ })
const responseElement = document.getElementById('response')
for await (const chunk of agent.runStreaming(prompt)) {
responseElement.textContent += chunk
// Allow UI to update
await new Promise(resolve => setTimeout(resolve, 0))
}
}
// Progressive tool calling
class ProgressiveToolAgent extends Agent {
async runWithProgressiveTools(prompt: string) {
let currentPrompt = prompt
let iteration = 0
const maxIterations = 5
while (iteration < maxIterations) {
const response = await this.run(currentPrompt)
// Check if response contains tool calls
const toolCalls = this.extractToolCalls(response)
if (toolCalls.length === 0) {
// Final answer reached
return response
}
// Execute tools progressively
const toolResults = await this.executeToolsProgressively(toolCalls)
// Update prompt with results for next iteration
currentPrompt = `${response.output}
Tool Results:
${toolResults.map(r => `${r.tool}: ${r.result}`).join('\n')}
Continue with this information:`
iteration++
}
return { output: "Maximum iterations reached", metadata: { iterations: maxIterations } }
}
private async executeToolsProgressively(toolCalls: any[]): Promise<any[]> {
// Execute non-dependent tools in parallel
const results = await Promise.allSettled(
toolCalls.map(async (call) => {
try {
const result = await this.executeTool(call.name, call.arguments)
return { tool: call.name, result, success: true }
} catch (error) {
return { tool: call.name, result: error.message, success: false }
}
})
)
return results.map((result, index) => ({
tool: toolCalls[index].name,
result: result.status === 'fulfilled' ? result.value : result.reason,
success: result.status === 'fulfilled' && result.value.success
}))
}
}
MEMORY_OPTIMIZATION
Efficiently manage conversation context to prevent memory bloat and token limits.
class OptimizedMemoryManager {
private conversations = new Map<string, ConversationMemory>()
private maxTokens = 8000 // Leave room for response
private compressionThreshold = 0.8 // Compress when 80% full
private fastModel: { generate: (prompt: string) => Promise<{ content: string }> } // Small, fast summarization model (wired up outside this snippet)
async addMessage(conversationId: string, message: Message): Promise<void> {
let memory = this.conversations.get(conversationId)
if (!memory) {
memory = new ConversationMemory()
this.conversations.set(conversationId, memory)
}
memory.addMessage(message)
// Check if compression needed
if (this.shouldCompress(memory)) {
await this.compressMemory(memory)
}
}
private shouldCompress(memory: ConversationMemory): boolean {
const tokenCount = memory.getTokenCount()
return tokenCount > this.maxTokens * this.compressionThreshold
}
private async compressMemory(memory: ConversationMemory): Promise<void> {
const messages = memory.getMessages()
// Keep the most recent messages, up to ~20% of the token budget
const recentBudget = this.maxTokens * 0.2
let splitIndex = messages.length
let recentTokens = 0
while (splitIndex > 0 && recentTokens < recentBudget) {
  splitIndex--
  recentTokens += Math.ceil(messages[splitIndex].content.length / 4)
}
const recentMessages = messages.slice(splitIndex)
// Compress everything older than that
const olderMessages = messages.slice(0, splitIndex)
const compressedSummary = await this.summarizeMessages(olderMessages)
// Replace with compressed version
memory.clear()
memory.addMessage({
role: 'system',
content: `Previous conversation summary: ${compressedSummary}`
})
recentMessages.forEach(msg => memory.addMessage(msg))
}
private async summarizeMessages(messages: Message[]): Promise<string> {
// Use a smaller, faster model for summarization
const summaryPrompt = `Summarize the key points from this conversation concisely:
${messages.map(m => `${m.role}: ${m.content}`).join('\n')}
Summary:`
const response = await this.fastModel.generate(summaryPrompt)
return response.content
}
getOptimizedContext(conversationId: string): Message[] {
const memory = this.conversations.get(conversationId)
return memory ? memory.getMessages() : []
}
}
class ConversationMemory {
private messages: Message[] = []
private tokenCount = 0
addMessage(message: Message): void {
this.messages.push(message)
// Rough token estimation (1 token ≈ 4 characters)
this.tokenCount += Math.ceil(message.content.length / 4)
}
getMessages(): Message[] {
return [...this.messages]
}
getTokenCount(): number {
return this.tokenCount
}
clear(): void {
this.messages = []
this.tokenCount = 0
}
}
// Usage in agent
const memoryManager = new OptimizedMemoryManager()
const efficientAgent = new Agent({
name: 'memory-efficient-agent',
model: 'gpt-4',
memory: {
getContext: (conversationId) => memoryManager.getOptimizedContext(conversationId),
addMessage: (conversationId, message) => memoryManager.addMessage(conversationId, message)
}
})
PERFORMANCE_MONITORING#
class PerformanceMonitor {
private metrics: Map<string, PerformanceMetric[]> = new Map()
recordMetric(operation: string, duration: number, metadata: any = {}) {
const metric: PerformanceMetric = {
timestamp: Date.now(),
operation,
duration,
...metadata
}
if (!this.metrics.has(operation)) {
this.metrics.set(operation, [])
}
this.metrics.get(operation)!.push(metric)
// Keep only recent metrics (last 1000)
const metrics = this.metrics.get(operation)!
if (metrics.length > 1000) {
metrics.shift()
}
}
getMetrics(operation: string): PerformanceStats {
const metrics = this.metrics.get(operation) || []
if (metrics.length === 0) {
return { count: 0, avg: 0, p95: 0, p99: 0 }
}
const durations = metrics.map(m => m.duration).sort((a, b) => a - b)
return {
count: metrics.length,
avg: durations.reduce((a, b) => a + b, 0) / durations.length,
p95: durations[Math.floor(durations.length * 0.95)],
p99: durations[Math.floor(durations.length * 0.99)]
}
}
async monitoredExecute<T>(
operation: string,
fn: () => Promise<T>,
metadata: any = {}
): Promise<T> {
const start = performance.now()
try {
const result = await fn()
const duration = performance.now() - start
this.recordMetric(operation, duration, { ...metadata, success: true })
return result
} catch (error) {
const duration = performance.now() - start
this.recordMetric(operation, duration, {
...metadata,
success: false,
error: error.message
})
throw error
}
}
}
interface PerformanceMetric {
timestamp: number
operation: string
duration: number
success?: boolean
error?: string
[key: string]: any
}
interface PerformanceStats {
count: number
avg: number
p95: number
p99: number
}
// Usage
const monitor = new PerformanceMonitor()
const monitoredTool = new Tool({
name: 'monitored_search',
description: 'Search with performance monitoring',
schema: z.object({ query: z.string() }),
execute: async ({ query }) => {
return await monitor.monitoredExecute(
'search_operation',
async () => {
const results = await performSearch(query)
return JSON.stringify(results)
},
{ queryLength: query.length, hasFilters: false }
)
}
})
// Performance dashboard
function logPerformanceStats() {
const searchStats = monitor.getMetrics('search_operation')
console.log(`Search Performance:
Total calls: ${searchStats.count}
Average latency: ${searchStats.avg.toFixed(2)}ms
P95 latency: ${searchStats.p95.toFixed(2)}ms
P99 latency: ${searchStats.p99.toFixed(2)}ms`)
}
LOAD_TESTING_AND_BENCHMARKING#
class LoadTester {
async runLoadTest(
agent: Agent,
testConfig: {
duration: number // seconds
concurrentUsers: number
requestsPerUser: number
requestGenerator: () => string // Generates test prompts
}
): Promise<LoadTestResults> {
const results: LoadTestResult[] = []
const startTime = Date.now()
// Create concurrent users
const userPromises = Array.from({ length: testConfig.concurrentUsers }, async (_, userId) => {
const userResults: LoadTestResult[] = []
for (let i = 0; i < testConfig.requestsPerUser; i++) {
  // Stop early once the configured test duration has elapsed
  if ((Date.now() - startTime) / 1000 >= testConfig.duration) break
const prompt = testConfig.requestGenerator()
const requestStart = performance.now()
try {
const response = await agent.run(prompt)
const latency = performance.now() - requestStart
userResults.push({
userId,
requestId: i,
latency,
success: true,
tokensUsed: response.metadata?.tokens || 0
})
} catch (error) {
const latency = performance.now() - requestStart
userResults.push({
userId,
requestId: i,
latency,
success: false,
error: error.message
})
}
// Small delay between requests to simulate realistic usage
await new Promise(resolve => setTimeout(resolve, 100))
}
return userResults
})
// Wait for all users to complete
const allResults = await Promise.all(userPromises)
const flatResults = allResults.flat()
// Calculate statistics
const successful = flatResults.filter(r => r.success)
const failed = flatResults.filter(r => !r.success)
const latencies = successful.map(r => r.latency)
const totalDuration = Date.now() - startTime
return {
totalRequests: flatResults.length,
successfulRequests: successful.length,
failedRequests: failed.length,
totalDuration,
requestsPerSecond: flatResults.length / (totalDuration / 1000),
averageLatency: latencies.reduce((a, b) => a + b, 0) / latencies.length,
p95Latency: this.calculatePercentile(latencies, 95),
p99Latency: this.calculatePercentile(latencies, 99),
errorRate: failed.length / flatResults.length,
results: flatResults
}
}
private calculatePercentile(values: number[], percentile: number): number {
  if (values.length === 0) return 0
  const sorted = [...values].sort((a, b) => a - b) // copy before sorting to avoid mutating the input
  const index = Math.ceil((percentile / 100) * sorted.length) - 1
  return sorted[index]
}
}
interface LoadTestResult {
userId: number
requestId: number
latency: number
success: boolean
tokensUsed?: number
error?: string
}
interface LoadTestResults {
totalRequests: number
successfulRequests: number
failedRequests: number
totalDuration: number
requestsPerSecond: number
averageLatency: number
p95Latency: number
p99Latency: number
errorRate: number
results: LoadTestResult[]
}
// Example usage
const loadTester = new LoadTester()
const results = await loadTester.runLoadTest(myAgent, {
duration: 60, // 1 minute
concurrentUsers: 10,
requestsPerUser: 5,
requestGenerator: () => {
const queries = [
'What is the weather like?',
'Tell me a joke',
'How do I reset my password?',
'What are your business hours?',
'Can you help me with my order?'
]
return queries[Math.floor(Math.random() * queries.length)]
}
})
console.log(`Load Test Results:
Requests/sec: ${results.requestsPerSecond.toFixed(2)}
Average latency: ${results.averageLatency.toFixed(2)}ms
P95 latency: ${results.p95Latency.toFixed(2)}ms
Error rate: ${(results.errorRate * 100).toFixed(2)}%`)
// Performance assertions
if (results.averageLatency > 2000) {
console.warn('Average latency too high!')
}
if (results.errorRate > 0.05) {
console.error('Error rate too high!')
}
if (results.requestsPerSecond < 10) {
console.warn('Throughput too low!')
}
PERFORMANCE_OPTIMIZATION_CHECKLIST
✅_Caching_Strategy: Implement multi-level caching for expensive operations
✅_Prompt_Optimization: Use templates, remove redundancy, compress context
✅_Parallel_Processing: Execute independent operations concurrently
✅_Streaming_Responses: Provide immediate feedback, reduce perceived latency
✅_Memory_Management: Compress old context, limit conversation length
✅_Performance_Monitoring: Track latency, throughput, and error rates
✅_Load_Testing: Validate performance under realistic conditions
✅_Resource_Limits: Set timeouts, rate limits, and circuit breakers (see the sketch below)
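The last checklist item is not demonstrated elsewhere on this page, so here is a minimal sketch of a timeout guard and a simple circuit breaker. The names withTimeout and CircuitBreaker, and the thresholds used, are illustrative assumptions rather than part of any particular framework.
function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
  // Reject if the wrapped operation does not settle within ms milliseconds
  return Promise.race([
    promise,
    new Promise<never>((_, reject) =>
      setTimeout(() => reject(new Error(`Operation timed out after ${ms}ms`)), ms)
    )
  ])
}
class CircuitBreaker {
  private failures = 0
  private openedAt = 0
  private state: 'closed' | 'open' | 'half-open' = 'closed'
  constructor(
    private failureThreshold = 5, // consecutive failures before opening
    private cooldownMs = 30000    // how long the circuit stays open
  ) {}
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (Date.now() - this.openedAt < this.cooldownMs) {
        // Fail fast instead of hammering a struggling dependency
        throw new Error('Circuit open: dependency temporarily unavailable')
      }
      this.state = 'half-open' // Allow a single trial request after the cooldown
    }
    try {
      const result = await fn()
      this.failures = 0
      this.state = 'closed'
      return result
    } catch (error) {
      this.failures++
      if (this.state === 'half-open' || this.failures >= this.failureThreshold) {
        this.state = 'open'
        this.openedAt = Date.now()
      }
      throw error
    }
  }
}
// Usage: cap each expensive call at 10s and stop calling a dependency that keeps failing
// const breaker = new CircuitBreaker()
// const result = await breaker.execute(() => withTimeout(performExpensiveSearch(query), 10000))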