Skip to content

Emergency Runbook

Step-by-step procedures for recovering from various failure scenarios.

A. Complete System Loss Recovery

Prerequisites

  • Access to Proxmox host (157.180.63.15)
  • Backup archives (PostgreSQL dump, config backup)
  • GitHub access for source code

Step 1: Provision New LXC Container

# On Proxmox host
pct create 103 local:vztmpl/ubuntu-24.04-standard_24.04-1_amd64.tar.gz \
  --hostname aegis \
  --memory 112640 \
  --cores 16 \
  --rootfs local-zfs:200 \
  --net0 name=eth0,bridge=vmbr1,ip=10.10.10.103/24,gw=10.10.10.1 \
  --features nesting=1 \
  --unprivileged 1

pct start 103

Step 2: Install Base Packages

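# Inside container (as root — none of the commands below use sudo)
# NOTE: docker-compose-plugin is published in Docker's own apt repository;
# Ubuntu's archive ships Compose v2 as docker-compose-v2. Confirm the package
# name resolves against the configured repositories before running this step.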
apt update && apt upgrade -y
apt install -y \
  python3.12 python3.12-venv python3-pip \
  docker.io docker-compose-plugin \
  postgresql-client \
  git curl wget htop vim \
  build-essential libpq-dev

# Add user
useradd -m -s /bin/bash agent
usermod -aG docker agent

# Enable Docker
systemctl enable docker
systemctl start docker

Step 3: Restore Configuration

# As agent user
su - agent

# Create directory structure
mkdir -p ~/.secure ~/.claude/hooks ~/memory/{episodic,semantic,procedural,journal}
mkdir -p ~/projects ~/stacks ~/downloads ~/logs

# Restore credentials (from encrypted backup)
# CRITICAL: These must be restored from secure backup
# - ~/.secure/.env
# - ~/.secure/stripe*.json
# - ~/.claude.json
# - ~/.claude/settings.json

Step 4: Clone Source Code

cd ~/projects
git clone git@github.com:aegis-agent/aegis-core.git
cd aegis-core

Step 5: Restore PostgreSQL Database

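# Install PostgreSQL server
# (requires sudo rights: Step 2 adds the agent user only to the docker group,
# so run the sudo commands in this step from an account that has them)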
sudo apt install -y postgresql-16 postgresql-16-pgvector

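# Create database and user
# (use a strong password instead of 'agent' where possible, and keep it in
# sync with POSTGRES_PASSWORD in ~/.secure/.env)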
sudo -u postgres psql << EOF
CREATE USER agent WITH PASSWORD 'agent';
CREATE DATABASE aegis OWNER agent;
GRANT ALL PRIVILEGES ON DATABASE aegis TO agent;
EOF

# Enable pgvector
sudo -u postgres psql -d aegis -c "CREATE EXTENSION IF NOT EXISTS vector;"

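# Restore from backup
# (the dump path below is relative to the current directory; adjust it to
# wherever the backup archive was unpacked)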
psql -h localhost -U agent -d aegis < backup/aegis_dump.sql

Step 6: Create Docker Network

docker network create traefik_proxy

Step 7: Start Services

cd ~/projects/aegis-core

# Build images
docker compose build

# Start services in order
docker compose up -d falkordb
sleep 10  # Wait for FalkorDB

docker compose up -d traefik
sleep 5

docker compose up -d dashboard
sleep 10  # Wait for health check

docker compose up -d scheduler playwright

Step 8: Verify Services

# Check container health
docker ps

# Expected output:
# aegis-dashboard   ... (healthy)
# aegis-scheduler   ... (healthy)
# aegis-playwright  ... (healthy)
# falkordb          ... (healthy)
# traefik           ... Up

# Test endpoints (requires jq, which is not in the Step 2 package list:
# apt install -y jq)
curl -s http://localhost:8080/health | jq .
curl -s https://aegisagent.ai/health | jq .

Step 9: Restore MCP Configuration

# Verify MCP config exists
cat ~/.claude.json | jq '.mcpServers | keys'

# Expected servers:
# discord, telegram, github, google-workspace, docker,
# filesystem, postgres, ollama, playwright, vonage

Step 10: Test Integrations

# Test Discord
# Post to #alerts: "System recovered from backup"

# Test database connection
psql -h localhost -U agent -d aegis -c "SELECT COUNT(*) FROM api_keys;"

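# Test FalkorDB (requires redis-cli on the host, which is not in the Step 2
# package list; on Ubuntu it is provided by the redis-tools package)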
redis-cli -p 6379 ping  # Should return PONG

Step 11: Resume Operations

# Check scheduled jobs
docker logs aegis-scheduler --tail 50

# Run health check
curl https://aegisagent.ai/api/health

# Post recovery notice to Discord

B. Database Recovery Only

Step 1: Stop Services

cd ~/projects/aegis-core
docker compose stop dashboard scheduler

Step 2: Backup Current Database (if accessible)

pg_dump -h localhost -U agent aegis > aegis_pre_restore.sql

Step 3: Restore from Backup

# Drop and recreate database
sudo -u postgres psql << EOF
DROP DATABASE IF EXISTS aegis;
CREATE DATABASE aegis OWNER agent;
GRANT ALL PRIVILEGES ON DATABASE aegis TO agent;
EOF

# Enable extensions
sudo -u postgres psql -d aegis -c "CREATE EXTENSION IF NOT EXISTS vector;"

# Restore
psql -h localhost -U agent -d aegis < backup/aegis_dump.sql

Step 4: Run Migrations

cd ~/projects/aegis-core
python -c "from aegis.migrations import run_migrations; run_migrations()"

Step 5: Restart Services

docker compose up -d dashboard scheduler
docker compose ps  # Verify healthy

C. Container Recovery

Dashboard Not Starting

# Check logs
docker logs aegis-dashboard --tail 100

# Common issues:
# 1. Database connection failed → Check PostgreSQL running
# 2. Missing env vars → Check .env file
# 3. Port conflict → Check nothing else on 8080

# Rebuild and restart
docker compose build dashboard
docker compose up -d dashboard

Scheduler Not Starting

# Check logs
docker logs aegis-scheduler --tail 100

# Common issues:
# 1. Dashboard not healthy → Wait for dashboard
# 2. Playwright not reachable → Start playwright first

# Restart with dependencies
docker compose up -d playwright
sleep 5
docker compose up -d scheduler

FalkorDB Data Loss

# Stop services
docker compose stop dashboard scheduler

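# Remove corrupted volume
# (Docker refuses to remove a volume that a container still references; stop
# and remove the falkordb container first, e.g. docker compose rm -sf falkordb)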
docker volume rm aegis-core_falkordb_data

# Restart (will recreate empty)
docker compose up -d falkordb

# Re-ingest data from transcripts
python scripts/ingest_transcripts.py

D. Configuration Recovery

Lost ~/.secure/.env

# This file contains all API keys and secrets
# Must be restored from secure backup or regenerated

# Required variables (get new values from each service):
cat > ~/.secure/.env << 'EOF'
# Database
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_USER=agent
POSTGRES_PASSWORD=agent
POSTGRES_DB=aegis

# LLM APIs
ZAI_API_KEY=<get-from-z.ai>
ANTHROPIC_API_KEY=<get-from-anthropic>
PERPLEXITY_API_KEY=<get-from-perplexity>

# Communication
DISCORD_BOT_TOKEN=<get-from-discord-developer-portal>
TELEGRAM_BOT_TOKEN=<get-from-botfather>
VONAGE_API_KEY=<get-from-vonage>
VONAGE_API_SECRET=<get-from-vonage>
VONAGE_APPLICATION_ID=<get-from-vonage>

# Payment
STRIPE_SECRET_KEY=<get-from-stripe>
STRIPE_WEBHOOK_SECRET=<get-from-stripe>

# Email
RESEND_API_KEY=<get-from-resend>
EOF

chmod 600 ~/.secure/.env

Lost ~/.claude.json

# MCP server configuration
# Restore from backup or recreate with tokens

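# Template structure (illustrative only — JSON does not allow comments, so
# replace the "// ... other servers" line with the real entries before saving):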
{
  "mcpServers": {
    "discord": {
      "command": "npx",
      "args": ["-y", "@aegis/mcp-discord"],
      "env": {
        "DISCORD_BOT_TOKEN": "<token>"
      }
    },
    // ... other servers
  }
}

E. Verification Checklist

After any recovery, verify:

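# Services running
# (caution: the pattern "healthy" also matches "unhealthy" — read the STATUS
# column rather than relying on the grep match alone)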
docker ps | grep healthy

# Database connectivity
psql -h localhost -U agent -d aegis -c "SELECT 1;"

# API responding
curl -s https://aegisagent.ai/health

# MCP servers available
# (verify in Claude Code that tools work)

# Scheduled jobs running
docker logs aegis-scheduler --tail 20 | grep "Job executed"

# Memory accessible
ls ~/memory/journal/
ls ~/memory/semantic/

# Git status clean
git -C ~/projects/aegis-core status

F. Post-Recovery Actions

  1. Document incident in journal
  2. Post to Discord #alerts with recovery status
  3. Verify scheduled jobs are running
  4. Run /status to confirm health
  5. Check that recent Beads tasks weren't lost
  6. Review backup strategy if data was lost

Last Updated: 2026-01-25
Recovery Time Target: 2 hours