Cookbook: Content Moderation Agent
Build an AI content moderator that protects your community while maintaining a positive user experience.
The Problem#
Online communities need protection from harmful content, but manual moderation doesn't scale. An AI moderator can:
- Detect toxic language, spam, and harassment in real-time
- Enforce community guidelines consistently
- Reduce moderator workload by handling routine decisions
- Provide immediate feedback to users
- Learn from human moderator decisions to improve accuracy
Architecture Overview#
Content Analysis
Multi-layered analysis for toxicity, spam, and policy violations using ML models
Decision Engine
Configurable rules engine that makes fair, consistent moderation decisions
Feedback Loop
Learning system that improves accuracy based on human moderator corrections
Step 1: Create content analysis tools
tools/content-analysis.ts
import { Tool } from '@akios/core'
import { z } from 'zod'
// Content analysis tool
// Agent-callable wrapper around the heuristic helpers defined below
// (calculateToxicity, calculateSpam, detectPolicyViolations,
// categorizeContent, determineAction). Returns a JSON report string.
const analyzeContent = new Tool({
  name: 'analyze_content',
  description: 'Analyze text content for toxicity, spam, and policy violations',
  schema: z.object({
    content: z.string(),
    // NOTE(review): `context` is accepted by the schema but unused by the
    // heuristics below — confirm whether it should influence scoring.
    context: z.string().optional(),
    strictness: z.enum(['low', 'medium', 'high']).default('medium')
  }),
  handler: async ({ content, context, strictness }) => {
    // In practice, use services like Perspective API, OpenAI Moderation, etc.
    const analysis = {
      toxicity_score: calculateToxicity(content),
      spam_score: calculateSpam(content),
      policy_violations: detectPolicyViolations(content),
      categories: categorizeContent(content),
      recommended_action: determineAction(content, strictness)
    }
    // Tool handlers return strings; callers JSON.parse this report.
    return JSON.stringify(analysis)
  }
})
// Helper functions
/**
 * Naive toxicity score: 0.2 per known toxic word, capped at 1.0.
 * Fix: tokenize on non-word characters instead of a single space, so a
 * toxic word followed by punctuation or a newline (e.g. "hate,") is
 * still detected.
 */
function calculateToxicity(text: string): number {
  const toxicWords = new Set(['hate', 'stupid', 'idiot', 'worst', 'terrible', 'awful'])
  const words = text.toLowerCase().split(/\W+/).filter(Boolean)
  const toxicCount = words.filter(word => toxicWords.has(word)).length
  return Math.min(toxicCount * 0.2, 1.0)
}
// Heuristic spam score: each tripped indicator adds 0.25 (so at most 1.0).
function calculateSpam(text: string): number {
  let hits = 0
  if (text.length < 10) hits++                                  // too short to be real content
  if (text.includes('http')) hits++                             // contains a link
  if (text.toUpperCase() === text && text.length > 20) hits++   // long all-caps shouting
  if (/([a-zA-Z])\1{3,}/.test(text)) hits++                     // a letter repeated 4+ times
  return hits * 0.25
}
// Scan text for simple policy-violation signals and return their tags
// in a fixed order: personal_information, hate_speech, spam.
function detectPolicyViolations(text: string): string[] {
  const lowered = text.toLowerCase()
  const found: string[] = []
  const phoneLike = /\d{3}-\d{3}-\d{4}/   // US-style phone number pattern
  if (lowered.includes('contact info') || phoneLike.test(text)) {
    found.push('personal_information')
  }
  if (lowered.includes('political') && lowered.includes('extreme')) {
    found.push('hate_speech')
  }
  if (lowered.includes('advertisement') || lowered.includes('buy now')) {
    found.push('spam')
  }
  return found
}
// Tag content with coarse categories based on keyword hits.
// Categories are returned in a fixed order: question, positive_feedback, bug_report.
function categorizeContent(text: string): string[] {
  const lowered = text.toLowerCase()
  const rules: Array<[string, string[]]> = [
    ['question', ['question', 'help', '?']],
    ['positive_feedback', ['thank', 'appreciate']],
    ['bug_report', ['bug', 'error', 'broken']],
  ]
  return rules
    .filter(([, keywords]) => keywords.some(k => lowered.includes(k)))
    .map(([category]) => category)
}
/**
 * Combine the heuristic scores into a moderation action.
 * Returns 'reject', 'flag_for_review', or 'approve'.
 * Fix: the original indexed `thresholds` with an unchecked cast, so any
 * strictness value outside low/medium/high produced `undefined` and a
 * TypeError on `threshold.toxicity`. Unknown values now fall back to medium.
 */
function determineAction(text: string, strictness: string): string {
  const toxicity = calculateToxicity(text)
  const spam = calculateSpam(text)
  const violations = detectPolicyViolations(text)
  const thresholds = {
    low: { toxicity: 0.8, spam: 0.7 },
    medium: { toxicity: 0.6, spam: 0.5 },
    high: { toxicity: 0.4, spam: 0.3 }
  }
  // Fall back to 'medium' for unrecognized strictness values.
  const threshold = thresholds[strictness as keyof typeof thresholds] ?? thresholds.medium
  // Any explicit policy violation rejects regardless of thresholds.
  if (toxicity > threshold.toxicity || violations.length > 0) {
    return 'reject'
  }
  if (spam > threshold.spam) {
    return 'flag_for_review'
  }
  return 'approve'
}
Step 2: Build the moderation agent
content-moderation-agent.ts
import { Agent } from '@akios/core'
// The moderation agent: pairs the analyze_content tool with a system prompt
// that encodes the moderation policy and escalation guidance.
// (The template literal below is the literal prompt sent to the model; its
// lines are intentionally left exactly as written.)
const contentModerationAgent = new Agent({
  name: 'ContentModerationAgent',
  model: 'gpt-4',
  systemPrompt: `You are a content moderation specialist. Your responsibilities:
1. Analyze content for safety and appropriateness
2. Detect spam, toxicity, and policy violations
3. Make fair and consistent moderation decisions
4. Provide clear feedback when content is rejected
5. Consider context and intent when evaluating content
Guidelines:
- Be fair but firm - protect community safety
- Give benefit of doubt when intent is ambiguous
- Explain rejections clearly so users can improve
- Consider cultural context and language differences
- Escalate edge cases to human moderators`,
  tools: [analyzeContent],
  // NOTE(review): ContentSafetyGuardrail and ProfanityFilterGuardrail are not
  // imported in this snippet — confirm they come from '@akios/core'.
  guardrails: [
    new ContentSafetyGuardrail({ strictness: 'high' }),
    new ProfanityFilterGuardrail()
  ]
})
// Moderation workflow
// Runs the analysis tool, then maps its recommended action to a response
// object: approve, reject (with a reason), or escalate to a human.
async function moderateContent(content: string, context?: string) {
  // Step 1: Analyze content
  const raw = await analyzeContent.execute({
    content,
    context,
    strictness: 'medium'
  })
  const analysis = JSON.parse(raw)
  // Step 2: Make decision
  const action = analysis.recommended_action
  if (action === 'approve') {
    return {
      action: 'approve',
      reason: 'Content meets community guidelines'
    }
  }
  if (action === 'reject') {
    return {
      action: 'reject',
      reason: generateRejectionReason(analysis),
      categories: analysis.categories
    }
  }
  if (action === 'flag_for_review') {
    // Escalate to human moderator
    await escalateToHuman(content, analysis)
    return {
      action: 'escalated',
      reason: 'Content flagged for human review'
    }
  }
  // Unknown recommendation: fail safe by escalating.
  return {
    action: 'escalated',
    reason: 'Unable to determine action automatically'
  }
}
/**
 * Build a human-readable rejection message from an analysis report.
 * Falls back to a generic guideline message when no specific reason applies.
 */
function generateRejectionReason(analysis: any): string {
  const reasons: string[] = []
  const violations: string[] = analysis.policy_violations
  if (analysis.toxicity_score > 0.6) {
    reasons.push('Content appears to contain toxic or harmful language')
  }
  if (violations.includes('personal_information')) {
    reasons.push('Content contains personal information that should not be shared')
  }
  if (violations.includes('hate_speech')) {
    reasons.push('Content violates hate speech policy')
  }
  if (analysis.spam_score > 0.5) {
    reasons.push('Content appears to be spam or promotional')
  }
  return reasons.length > 0 ? reasons.join('. ') : 'Content violates community guidelines'
}
async function escalateToHuman(content: string, analysis: any) {
// In practice, send to human moderation queue
console.log('Escalating to human moderator:', {
content: content.substring(0, 200) + '...',
analysis,
timestamp: new Date().toISOString()
})
}`3
Step 3: Integrate with your platform
platform-integration.ts
import express from 'express'
import { moderateContent } from './content-moderation-agent'
const app = express()
app.use(express.json())
// Moderate forum posts
// Outcomes: reject -> 400 (nothing saved), escalate -> 202 (saved as
// pending_review), approve -> 201 (saved as approved), failure -> 500.
app.post('/api/posts', async (req, res) => {
  // NOTE(review): req.body is used unvalidated — confirm schema validation
  // happens upstream (author_id is destructured but unused here).
  const { content, author_id, category } = req.body
  try {
    const moderation = await moderateContent(content, `forum_post_${category}`)
    if (moderation.action === 'reject') {
      return res.status(400).json({
        error: 'Content rejected',
        reason: moderation.reason,
        code: 'MODERATION_REJECTED'
      })
    }
    if (moderation.action === 'escalated') {
      // Save post but flag for human review
      // NOTE(review): savePost is not defined or imported in this snippet — confirm.
      const post = await savePost({
        ...req.body,
        status: 'pending_review',
        moderation_result: moderation
      })
      return res.status(202).json({
        message: 'Post submitted for review',
        post_id: post.id
      })
    }
    // Approve and save post
    const post = await savePost({
      ...req.body,
      status: 'approved',
      moderation_result: moderation
    })
    res.status(201).json({ post_id: post.id })
  } catch (error) {
    // Moderation/storage failure: do not publish the post.
    console.error('Moderation error:', error)
    res.status(500).json({ error: 'Moderation service unavailable' })
  }
})
// Moderate chat messages
// Fix: unlike the /api/posts handler above, this route had no try/catch, so
// a rejected promise from moderateContent/sendToChatRoom became an unhandled
// rejection and the request hung (Express 4 does not catch async errors).
app.post('/api/chat/messages', async (req, res) => {
  const { content, room_id, user_id } = req.body
  try {
    const moderation = await moderateContent(content, `chat_${room_id}`)
    if (moderation.action === 'reject') {
      return res.status(400).json({
        error: 'Message blocked',
        reason: moderation.reason
      })
    }
    // Send message to chat room
    await sendToChatRoom(room_id, {
      ...req.body,
      moderation_status: moderation.action
    })
    res.status(200).json({ status: 'sent' })
  } catch (error) {
    // Mirror the /api/posts error handling so the client always gets a response.
    console.error('Moderation error:', error)
    res.status(500).json({ error: 'Moderation service unavailable' })
  }
})
app.listen(3000, () => {
console.log('Content moderation service running on port 3000')
})
Advanced Features#
Context Aware Moderation
context-aware-moderation.ts
// Moderator that adapts analysis strictness to the venue (content context)
// and the user's prior record. State is held in-memory only, so it is lost
// on restart, and both maps grow without bound — NOTE(review): persist and
// prune these in production.
class ContextAwareModerator extends Agent {
  // userId -> running violation/warning tally
  private userHistory: Map<string, UserModerationHistory> = new Map()
  // contextId -> venue metadata (type, sensitivity, recent activity)
  private contentContext: Map<string, ContentContext> = new Map()

  // Analyze `content` with strictness derived from venue + user history,
  // then record any rejection back into the user's history.
  async moderateWithContext(content: string, userId: string, contextId: string) {
    // Get user history (default: clean record)
    const userHistory = this.userHistory.get(userId) || {
      violations: 0,
      warnings: 0,
      lastViolation: null
    }
    // Get content context (default: general venue, medium sensitivity)
    const context = this.contentContext.get(contextId) || {
      type: 'general',
      sensitivity: 'medium',
      recentActivity: []
    }
    // Adjust strictness based on context. Order matters: the trusted-space
    // check last means it overrides the repeat-offender escalation.
    let strictness: 'low' | 'medium' | 'high' = 'medium'
    if (context.type === 'children' || context.sensitivity === 'high') {
      strictness = 'high'
    }
    if (userHistory.violations > 0) {
      strictness = 'high' // Higher scrutiny for repeat offenders
    }
    if (context.type === 'trusted_users_only') {
      strictness = 'low' // More lenient in trusted spaces
    }
    const analysis = await analyzeContent.execute({
      content,
      context: `${context.type}_${context.sensitivity}`,
      strictness
    })
    const result = JSON.parse(analysis)
    // Update user history. Only rejections are recorded; `warnings` is never
    // incremented in this snippet.
    if (result.recommended_action === 'reject') {
      userHistory.violations++
      userHistory.lastViolation = new Date()
      this.userHistory.set(userId, userHistory)
    }
    return {
      ...result,
      adjusted_strictness: strictness,
      user_history: userHistory
    }
  }
}
// Rolling per-user record used by ContextAwareModerator to escalate strictness.
interface UserModerationHistory {
  violations: number          // count of rejected submissions
  warnings: number            // warnings issued (not updated in this snippet)
  lastViolation: Date | null  // timestamp of the most recent rejection, if any
}
interface ContentContext {
type: 'general' | 'children' | 'professional' | 'trusted_users_only'
sensitivity: 'low' | 'medium' | 'high'
recentActivity: string[]
}
Learning from Human Moderators
learning-moderation.ts
/**
 * Moderator that records every AI decision and accepts human overrides,
 * forwarding corrections to a learning pipeline.
 */
class LearningModerator extends Agent {
  // In-memory decision log. NOTE(review): grows without bound — persist or
  // prune in production.
  private decisionHistory: ModerationDecision[] = []

  /** Moderate content and record the decision for possible human review. */
  async moderateAndLearn(content: string, context?: string) {
    // Make initial decision
    const aiDecision = await this.run(`Moderate this content: "${content}"`)
    // Store for potential learning
    const decisionId = this.storeDecision({
      content,
      ai_decision: aiDecision.output,
      context,
      timestamp: new Date(),
      human_override: null
    })
    return {
      decision: aiDecision.output,
      decision_id: decisionId,
      needs_human_review: this.isAmbiguous(content)
    }
  }

  /**
   * Record a human moderator's override of an AI decision and learn from it.
   * Unknown decision ids are silently ignored.
   */
  async humanOverride(decisionId: string, humanDecision: string, reason: string) {
    const decision = this.decisionHistory.find(d => d.id === decisionId)
    if (!decision) return
    decision.human_override = {
      decision: humanDecision,
      reason,
      timestamp: new Date()
    }
    // Learn from the correction
    await this.learnFromCorrection(decision)
  }

  /** Forward a recorded human correction to the training pipeline. */
  private async learnFromCorrection(decision: ModerationDecision) {
    if (!decision.human_override) return
    const correction = {
      original_content: decision.content,
      ai_decision: decision.ai_decision,
      human_decision: decision.human_override.decision,
      reason: decision.human_override.reason,
      context: decision.context
    }
    // In practice, send to ML model for training
    console.log('Learning from correction:', correction)
  }

  /** Heuristic: needs human review when 2+ ambiguity signals fire. */
  private isAmbiguous(content: string): boolean {
    // Simple heuristic for ambiguous content
    const lowered = content.toLowerCase()
    const ambiguousIndicators = [
      content.includes('?'), // Questions
      content.length < 50, // Very short
      lowered.includes('maybe') || lowered.includes('perhaps'),
      /\?\?\?/.test(content) // Multiple question marks
    ]
    return ambiguousIndicators.filter(Boolean).length >= 2
  }

  /** Assign a unique-ish id, append to history, and return the id. */
  private storeDecision(decision: ModerationDecision): string {
    // slice(2, 11) replaces the deprecated String.prototype.substr(2, 9).
    const id = `decision_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`
    decision.id = id
    this.decisionHistory.push(decision)
    return id
  }
}
interface ModerationDecision {
id?: string
content: string
ai_decision: string
context?: string
timestamp: Date
human_override: {
decision: string
reason: string
timestamp: Date
} | null
}
Testing your Moderation Agent#
Test your agent with various inputs to ensure it correctly identifies policy violations.
typescript
const safeContent = "I really enjoyed learning about the new features!";
const unsafeContent = "I hate everyone and I want to cause harm.";
const safeResult = await moderatorAgent.run(safeContent);
console.log('Safe Content Check:', safeResult);
const unsafeResult = await moderatorAgent.run(unsafeContent);
console.log('Unsafe Content Check:', unsafeResult);
Important Considerations
- False positives: AI can be overly cautious — always allow human appeals.
- Cultural context: what seems toxic in one culture may be normal in another.
- Privacy: never store moderated content longer than necessary.
- Transparency: users should understand why their content was moderated.
- Regular audits: human moderators should regularly review AI decisions.