Backup and Recovery Guide¶
This guide provides comprehensive backup strategies, procedures, and recovery plans for Readur deployments to ensure business continuity and data protection.
Backup Strategy Overview¶
Readur requires backing up three critical components:
1. PostgreSQL Database - Document metadata, user data, settings
2. File Storage - Uploaded documents and processed files
3. Configuration - Environment variables and settings
Database Backup¶
Automated Database Backups¶
Using pg_dump¶
Create a backup script:
#!/bin/bash
# backup-database.sh
set -eo pipefail # Exit on error; a failed pg_dump (not just gzip) aborts the backup
# Configuration
BACKUP_DIR="/backups/database"
DB_NAME="readur"
DB_USER="readur"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/readur_db_${TIMESTAMP}.sql.gz"
RETENTION_DAYS=30
# Create backup directory
mkdir -p ${BACKUP_DIR}
# Perform backup with proper authentication
# Note: Use PGPASSWORD environment variable or .pgpass file for authentication
PGPASSWORD="${DB_PASSWORD}" pg_dump -h localhost -U ${DB_USER} -d ${DB_NAME} --no-owner --clean --if-exists | gzip > ${BACKUP_FILE}
# Verify backup (set -eo pipefail above already aborts the script if pg_dump or gzip failed)
echo "Backup successful: ${BACKUP_FILE}"
echo "Size: $(du -h ${BACKUP_FILE} | cut -f1)"
# Test restore capability: gzip -t verifies the integrity of the entire archive
if gzip -t ${BACKUP_FILE}; then
    echo "Backup file verified"
else
    echo "WARNING: Backup file may be corrupted"
    exit 1
fi
# Remove old backups
find ${BACKUP_DIR} -name "readur_db_*.sql.gz" -mtime +${RETENTION_DAYS} -delete
echo "Cleaned up backups older than ${RETENTION_DAYS} days"
Using pg_basebackup for Physical Backups¶
#!/bin/bash
# physical-backup.sh
BACKUP_DIR="/backups/physical"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_PATH="${BACKUP_DIR}/readur_base_${TIMESTAMP}"
# Enable archive mode in PostgreSQL first
# In postgresql.conf:
# wal_level = replica
# archive_mode = on
# archive_command = 'cp %p /archive/%f'
# Perform physical backup
pg_basebackup -D ${BACKUP_PATH} -Ft -z -P -U replication_user
# Create restore point
psql -U readur -d readur -c "SELECT pg_create_restore_point('backup_${TIMESTAMP}');"
Continuous Archiving with WAL¶
# docker-compose.yml addition for WAL archiving
services:
  postgres:
    environment:
      POSTGRES_INITDB_WALDIR: /var/lib/postgresql/wal
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - postgres_wal:/var/lib/postgresql/wal
      - ./postgresql.conf:/etc/postgresql/postgresql.conf
    command: postgres -c config_file=/etc/postgresql/postgresql.conf
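The archive_command in the PostgreSQL configuration below copies WAL segments to /archive inside the container, so that directory should also be backed by a persistent volume; a minimal addition (the postgres_archive volume name is an assumption, not part of the snippet above):
    volumes:
      - postgres_archive:/archive # persistent destination assumed by archive_command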
PostgreSQL configuration:
# postgresql.conf
wal_level = replica
archive_mode = on
archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'
archive_timeout = 300 # Force WAL switch every 5 minutes
Database Backup Scheduling¶
Using Cron¶
# Add to crontab
# Daily backup at 2 AM
# IMPORTANT: Set DB_PASSWORD environment variable in crontab or use .pgpass file
0 2 * * * DB_PASSWORD='your_password' /opt/readur/scripts/backup-database.sh >> /var/log/readur-backup.log 2>&1
# Hourly incremental backup
0 * * * * /opt/readur/scripts/backup-incremental.sh >> /var/log/readur-backup.log 2>&1
# Weekly full backup on Sunday
0 3 * * 0 /opt/readur/scripts/backup-full.sh >> /var/log/readur-backup.log 2>&1
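The backup-incremental.sh and backup-full.sh scripts referenced above are not defined elsewhere in this guide. A minimal sketch of backup-incremental.sh, assuming WAL archiving is enabled as described earlier, forces a WAL segment switch so each hourly run pushes recent changes into the archive; backup-full.sh can simply chain the database and file backup scripts shown in this guide:
#!/bin/bash
# backup-incremental.sh (sketch; assumes archive_mode = on and a working archive_command)
set -e
# Close and archive the current WAL segment so recent changes reach the WAL archive
psql -U readur -d readur -c "SELECT pg_switch_wal();"
# Copy newly archived segments to the backup area (destination path is an assumption)
rsync -a /archive/ /backups/wal/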
Using Kubernetes CronJob¶
apiVersion: batch/v1
kind: CronJob
metadata:
  name: readur-db-backup
spec:
  schedule: "0 2 * * *" # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: backup
              # Note: the stock postgres image does not include the AWS CLI;
              # use an image that provides both pg_dump and aws
              image: postgres:15
              env:
                - name: PGPASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: readur-db-secret
                      key: password
              command:
                - /bin/bash
                - -c
                - |
                  pg_dump -h readur-db -U readur -d readur | \
                    gzip | \
                    aws s3 cp - s3://backups/readur/db/$(date +%Y%m%d_%H%M%S).sql.gz
          restartPolicy: OnFailure
File Storage Backup¶
Local Storage Backup¶
Using rsync¶
#!/bin/bash
# backup-files.sh
set -e # Exit on error
SOURCE_DIR="/var/readur/uploads"
BACKUP_DIR="/backups/files"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_PATH="${BACKUP_DIR}/readur_files_${TIMESTAMP}"
# Create incremental backup using rsync
rsync -avz --link-dest=${BACKUP_DIR}/latest \
${SOURCE_DIR}/ ${BACKUP_PATH}/
# Update latest symlink
rm -f ${BACKUP_DIR}/latest
ln -s ${BACKUP_PATH} ${BACKUP_DIR}/latest
# Calculate backup size
BACKUP_SIZE=$(du -sh ${BACKUP_PATH} | cut -f1)
echo "Backup completed: ${BACKUP_PATH} (${BACKUP_SIZE})"
Using tar with compression¶
#!/bin/bash
# archive-files.sh
set -e # Exit on error
SOURCE_DIR="/var/readur/uploads"
BACKUP_DIR="/backups/archives"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/readur_files_${TIMESTAMP}.tar.gz"
# Create compressed archive
tar -czf ${BACKUP_FILE} \
--exclude='*.tmp' \
--exclude='thumbnails/*' \
-C ${SOURCE_DIR} .
# Verify archive
if tar -tzf ${BACKUP_FILE} > /dev/null; then
    echo "Archive verified: ${BACKUP_FILE}"
else
    echo "Archive verification failed!"
    exit 1
fi
S3 Storage Backup¶
S3 to S3 Replication¶
# AWS S3 bucket replication configuration
{
  "Role": "arn:aws:iam::123456789012:role/replication-role",
  "Rules": [
    {
      "ID": "ReplicateAll",
      "Priority": 1,
      "Status": "Enabled",
      "Filter": {},
      "Destination": {
        "Bucket": "arn:aws:s3:::readur-backup-bucket",
        "ReplicationTime": {
          "Status": "Enabled",
          "Time": {
            "Minutes": 15
          }
        },
        "StorageClass": "GLACIER_IR"
      },
      "DeleteMarkerReplication": {
        "Status": "Enabled"
      }
    }
  ]
}
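This configuration can be applied with the AWS CLI. Versioning must be enabled on both the source and destination buckets before replication works, and the API may additionally require a Metrics block alongside ReplicationTime; a sketch using the bucket names from the examples in this guide:
# Versioning is a prerequisite for S3 replication on both buckets
aws s3api put-bucket-versioning --bucket readur-documents \
    --versioning-configuration Status=Enabled
aws s3api put-bucket-versioning --bucket readur-backup-bucket \
    --versioning-configuration Status=Enabled
# Apply the replication configuration saved as replication.json
aws s3api put-bucket-replication --bucket readur-documents \
    --replication-configuration file://replication.json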
S3 to Local Backup¶
#!/bin/bash
# backup-s3-to-local.sh
S3_BUCKET="readur-documents"
LOCAL_BACKUP="/backups/s3"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_PATH="${LOCAL_BACKUP}/${TIMESTAMP}"
# Sync S3 to local (--storage-class only applies to uploads, so it is omitted here)
aws s3 sync s3://${S3_BUCKET} ${BACKUP_PATH}/ \
    --exclude "*.tmp"
# Create manifest
aws s3api list-objects-v2 \
--bucket ${S3_BUCKET} \
--query 'Contents[].{Key:Key,Size:Size,ETag:ETag}' \
--output json > ${BACKUP_PATH}/manifest.json
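A quick sanity check after the sync compares the number of objects in the manifest with the number of files on disk (assumes jq is installed; counts may differ slightly because *.tmp objects are excluded from the sync):
manifest_count=$(jq 'length' ${BACKUP_PATH}/manifest.json)
local_count=$(find ${BACKUP_PATH} -type f ! -name manifest.json | wc -l)
echo "Manifest objects: ${manifest_count}, local files: ${local_count}"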
Configuration Backup¶
Environment and Secrets Backup¶
#!/bin/bash
# backup-config.sh
CONFIG_DIR="/etc/readur"
BACKUP_DIR="/backups/config"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/readur_config_${TIMESTAMP}.tar.gz.gpg"
# Collect configuration files
mkdir -p ${BACKUP_DIR} /tmp/config_backup
cp ${CONFIG_DIR}/.env /tmp/config_backup/
cp ${CONFIG_DIR}/readur.yml /tmp/config_backup/
cp /etc/nginx/sites-available/readur /tmp/config_backup/nginx.conf
# Export Kubernetes secrets (skipped gracefully on non-Kubernetes deployments)
kubectl get secret readur-secrets -o yaml > /tmp/config_backup/k8s-secrets.yaml || true
# Encrypt and archive
tar -czf - -C /tmp/config_backup . | \
gpg --symmetric --cipher-algo AES256 --output ${BACKUP_FILE}
# Clean up
rm -rf /tmp/config_backup
echo "Encrypted config backup: ${BACKUP_FILE}"
Backup Verification¶
Automated Backup Testing¶
#!/bin/bash
# verify-backups.sh
BACKUP_DIR="/backups"
TEST_DIR="/tmp/backup_test"
LOG_FILE="/var/log/backup_verification.log"
# Function to test database backup
test_db_backup() {
local backup_file=$1
local test_db="readur_test_restore"
echo "Testing database backup: ${backup_file}" >> ${LOG_FILE}
# Create test database
createdb ${test_db}
# Restore backup
gunzip -c ${backup_file} | psql -d ${test_db} -q
# Verify tables exist
table_count=$(psql -d ${test_db} -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='public'")
# Clean up
dropdb ${test_db}
if [ ${table_count} -gt 0 ]; then
echo "Database backup verified: ${table_count} tables found" >> ${LOG_FILE}
return 0
else
echo "Database backup verification failed!" >> ${LOG_FILE}
return 1
fi
}
# Function to test file backup
test_file_backup() {
local backup_file=$1
echo "Testing file backup: ${backup_file}" >> ${LOG_FILE}
# Extract only PDF files as a sample
mkdir -p ${TEST_DIR}
tar -xzf ${backup_file} -C ${TEST_DIR} --wildcards '*.pdf'
# Verify files
file_count=$(find ${TEST_DIR} -name "*.pdf" | wc -l)
# Clean up
rm -rf ${TEST_DIR}
if [ ${file_count} -gt 0 ]; then
echo "File backup verified: ${file_count} files extracted" >> ${LOG_FILE}
return 0
else
echo "File backup verification failed!" >> ${LOG_FILE}
return 1
fi
}
# Run verification
latest_db_backup=$(ls -t ${BACKUP_DIR}/database/*.sql.gz | head -1)
latest_file_backup=$(ls -t ${BACKUP_DIR}/archives/*.tar.gz | head -1)
test_db_backup ${latest_db_backup}
test_file_backup ${latest_file_backup}
Recovery Procedures¶
Database Recovery¶
Point-in-Time Recovery¶
#!/bin/bash
# restore-to-point-in-time.sh
# Point-in-time recovery replays archived WAL on top of a *physical* base backup
# taken with pg_basebackup (see above); a logical pg_dump cannot be rolled forward with WAL.
set -e
RESTORE_TIME="2025-01-15 14:30:00"
BASE_BACKUP="/backups/physical/readur_base_20250115_020000"
WAL_ARCHIVE="/archive"
PGDATA="/var/lib/postgresql/data"
# Stop the application and PostgreSQL
docker-compose stop readur
pg_ctl stop -D ${PGDATA} -m fast || true
# Replace the data directory with the base backup (pg_basebackup -Ft -z output)
rm -rf ${PGDATA}
mkdir -p ${PGDATA} && chmod 700 ${PGDATA}
tar -xzf ${BASE_BACKUP}/base.tar.gz -C ${PGDATA}
tar -xzf ${BASE_BACKUP}/pg_wal.tar.gz -C ${PGDATA}/pg_wal
# Configure recovery: since PostgreSQL 12, recovery settings go in postgresql.auto.conf
# and recovery mode is triggered by a recovery.signal file (recovery.conf is no longer used)
cat >> ${PGDATA}/postgresql.auto.conf <<EOF
restore_command = 'cp ${WAL_ARCHIVE}/%f %p'
recovery_target_time = '${RESTORE_TIME}'
recovery_target_action = 'promote'
EOF
touch ${PGDATA}/recovery.signal
# Start PostgreSQL; it replays WAL up to the target time, then promotes
pg_ctl start -D ${PGDATA}
# Wait for recovery to finish (recovery.signal is removed on promotion)
while [ -f ${PGDATA}/recovery.signal ]; do
    sleep 5
    echo "Recovery in progress..."
done
echo "Recovery completed to ${RESTORE_TIME}"
Full Database Restore¶
#!/bin/bash
# full-restore.sh
BACKUP_FILE=$1
if [ -z "${BACKUP_FILE}" ]; then
echo "Usage: $0 <backup_file>"
exit 1
fi
# Confirm restoration
read -p "This will replace the entire database. Continue? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit 1
fi
# Stop application
systemctl stop readur
# Drop and recreate database
psql -U postgres <<EOF
DROP DATABASE IF EXISTS readur;
CREATE DATABASE readur OWNER readur;
EOF
# Restore backup
gunzip -c ${BACKUP_FILE} | psql -U readur -d readur
# Verify restoration
psql -U readur -d readur -c "SELECT COUNT(*) FROM documents;"
# Restart application
systemctl start readur
echo "Database restored from ${BACKUP_FILE}"
File Storage Recovery¶
Local Storage Recovery¶
#!/bin/bash
# restore-files.sh
BACKUP_PATH=$1
RESTORE_PATH="/var/readur/uploads"
if [ -z "${BACKUP_PATH}" ]; then
echo "Usage: $0 <backup_path>"
exit 1
fi
# Preserve the current files so the restore can be rolled back if needed
mv ${RESTORE_PATH} ${RESTORE_PATH}.old
# Restore files
if [[ ${BACKUP_PATH} == *.tar.gz ]]; then
# Extract archive
mkdir -p ${RESTORE_PATH}
tar -xzf ${BACKUP_PATH} -C ${RESTORE_PATH}
else
# Copy directory
cp -r ${BACKUP_PATH} ${RESTORE_PATH}
fi
# Fix permissions
chown -R readur:readur ${RESTORE_PATH}
chmod -R 755 ${RESTORE_PATH}
# Verify restoration
file_count=$(find ${RESTORE_PATH} -type f | wc -l)
echo "Restored ${file_count} files to ${RESTORE_PATH}"
# Clean up old backup
read -p "Remove old backup? (y/N) " -n 1 -r
if [[ $REPLY =~ ^[Yy]$ ]]; then
rm -rf ${RESTORE_PATH}.old
fi
S3 Recovery¶
#!/bin/bash
# restore-s3.sh
SOURCE_BUCKET="readur-backup"
DEST_BUCKET="readur-documents"
RESTORE_DATE="2025-01-15"
# List available backups
echo "Available backups:"
aws s3 ls s3://${SOURCE_BUCKET}/ --recursive | grep ${RESTORE_DATE}
# Restore specific backup
aws s3 sync s3://${SOURCE_BUCKET}/${RESTORE_DATE}/ s3://${DEST_BUCKET}/ \
--delete \
--exact-timestamps
# Verify restoration
object_count=$(aws s3 ls s3://${DEST_BUCKET}/ --recursive | wc -l)
echo "Restored ${object_count} objects to ${DEST_BUCKET}"
Disaster Recovery Plan¶
RTO and RPO Targets¶
| Component | RPO (Recovery Point Objective) | RTO (Recovery Time Objective) |
|---|---|---|
| Database | 1 hour | 2 hours |
| File Storage | 4 hours | 4 hours |
| Full System | 4 hours | 8 hours |
Recovery Runbook¶
Phase 1: Assessment (15 minutes)¶
#!/bin/bash
# assess-damage.sh
echo "=== Disaster Recovery Assessment ==="
echo "Time: $(date)"
# Check database
pg_isready -h localhost -p 5432 -U readur
DB_STATUS=$?
# Check file storage
df -h /var/readur/uploads
STORAGE_STATUS=$?
# Check application
curl -f http://localhost:8080/health
APP_STATUS=$?
echo "Database status: ${DB_STATUS}"
echo "Storage status: ${STORAGE_STATUS}"
echo "Application status: ${APP_STATUS}"
if [ ${DB_STATUS} -ne 0 ]; then
echo "ACTION: Database recovery required"
fi
if [ ${STORAGE_STATUS} -ne 0 ]; then
echo "ACTION: Storage recovery required"
fi
if [ ${APP_STATUS} -ne 0 ]; then
echo "ACTION: Application recovery required"
fi
Phase 2: Recovery Execution¶
#!/bin/bash
# execute-recovery.sh
RECOVERY_LOG="/var/log/disaster_recovery_$(date +%Y%m%d_%H%M%S).log"
# Function to log with timestamp
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a ${RECOVERY_LOG}
}
log "Starting disaster recovery"
# 1. Restore database
log "Restoring database..."
latest_db_backup=$(ls -t /backups/database/*.sql.gz | head -1)
# full-restore.sh prompts for confirmation; pipe "y" so it can run unattended
echo "y" | ./full-restore.sh ${latest_db_backup} >> ${RECOVERY_LOG} 2>&1
# 2. Restore files
log "Restoring file storage..."
latest_file_backup=$(ls -t /backups/files/*.tar.gz | head -1)
./restore-files.sh ${latest_file_backup} >> ${RECOVERY_LOG} 2>&1
# 3. Restore configuration
log "Restoring configuration..."
latest_config_backup=$(ls -t /backups/config/*.tar.gz.gpg | head -1)
gpg --decrypt ${latest_config_backup} | tar -xzf - -C /etc/readur/
# 4. Start services
log "Starting services..."
systemctl start postgresql
systemctl start readur
systemctl start nginx
# 5. Verify recovery
log "Verifying recovery..."
sleep 30
./health-check.sh >> ${RECOVERY_LOG} 2>&1
log "Disaster recovery completed"
Important Architecture Note¶
Readur is designed as a single-instance application. It does not support high availability configurations with multiple server instances, database replication, or clustering. For reliability:
- Implement regular automated backups
- Use reliable storage (RAID, cloud storage)
- Monitor system health
- Have a tested recovery procedure
Single-Instance Best Practices¶
- Regular Backups: Schedule frequent backups based on your RPO requirements
- Monitoring: Set up alerts for system failures
- Quick Recovery: Keep recovery scripts ready and tested
- Hardware Redundancy: Use RAID for storage, redundant power supplies
- Cloud Deployment: Consider cloud platforms with built-in reliability features
Backup Storage Management¶
Retention Policies¶
#!/bin/bash
# manage-retention.sh
# Retention periods (days)
DAILY_RETENTION=7
WEEKLY_RETENTION=28
MONTHLY_RETENTION=365
BACKUP_DIR="/backups"
# Keep daily backups for 7 days
find ${BACKUP_DIR}/daily -type f -mtime +${DAILY_RETENTION} -delete
# Keep weekly backups for 4 weeks
find ${BACKUP_DIR}/weekly -type f -mtime +${WEEKLY_RETENTION} -delete
# Keep monthly backups for 1 year
find ${BACKUP_DIR}/monthly -type f -mtime +${MONTHLY_RETENTION} -delete
# Archive old monthly backups to Glacier
find ${BACKUP_DIR}/monthly -type f -mtime +30 -name "*.tar.gz" | while read -r file; do
    aws s3 cp ${file} s3://readur-archive/ --storage-class GLACIER
    rm ${file}
done
Storage Monitoring¶
#!/bin/bash
# monitor-backup-storage.sh
BACKUP_DIR="/backups"
THRESHOLD=80 # Alert when usage exceeds 80%
# Check disk usage
usage=$(df ${BACKUP_DIR} | awk 'NR==2 {print int($5)}')
if [ ${usage} -gt ${THRESHOLD} ]; then
echo "WARNING: Backup storage at ${usage}% capacity"
# Send alert
curl -X POST https://alerts.example.com/webhook \
-H "Content-Type: application/json" \
-d "{\"message\": \"Backup storage critical: ${usage}% used\"}"
# Trigger cleanup
./manage-retention.sh
fi
# Report backup statistics
echo "=== Backup Storage Report ==="
echo "Total backups: $(find ${BACKUP_DIR} -type f | wc -l)"
echo "Storage used: $(du -sh ${BACKUP_DIR} | cut -f1)"
echo "Oldest backup: $(ls -t ${BACKUP_DIR}/**/*.gz | tail -1)"
echo "Latest backup: $(ls -t ${BACKUP_DIR}/**/*.gz | head -1)"
Testing and Validation¶
Disaster Recovery Drill¶
#!/bin/bash
# dr-drill.sh
# Schedule: Run quarterly
# Purpose: Validate recovery procedures
TEST_ENV="dr-test"
START_TIME=$(date +%s)
echo "=== Starting Disaster Recovery Drill ==="
date
# 1. Create test environment
docker-compose -f docker-compose.test.yml up -d
# 2. Restore from backup into the test environment
# (the --env flag is an assumption; adapt execute-recovery.sh to target the test stack)
./execute-recovery.sh --env ${TEST_ENV}
# 3. Validate data integrity
psql -h localhost -p 5433 -U readur -d readur_test <<EOF
SELECT COUNT(*) as document_count FROM documents;
SELECT COUNT(*) as user_count FROM users;
SELECT COUNT(DISTINCT file_hash) as unique_files FROM documents;
EOF
# 4. Test application functionality
curl -f http://localhost:8081/health
curl -f http://localhost:8081/api/documents
# 5. Clean up
docker-compose -f docker-compose.test.yml down
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
echo "=== Disaster Recovery Drill Completed ==="
echo "Duration: ${DURATION} seconds"
echo "Target RTO: 8 hours (28800 seconds)"
if [ ${DURATION} -lt 28800 ]; then
echo "✓ RTO target met"
else
echo "✗ RTO target exceeded"
fi
Security Considerations¶
Backup Encryption¶
# Encrypt backups at rest (-pbkdf2 selects a stronger key derivation than the legacy default)
openssl enc -aes-256-cbc -salt -pbkdf2 -in backup.tar.gz -out backup.tar.gz.enc
# Decrypt for restoration (the same -pbkdf2 option must be used)
openssl enc -aes-256-cbc -d -pbkdf2 -in backup.tar.gz.enc -out backup.tar.gz
Access Control¶
# Set proper permissions
chmod 600 /backups/database/*.sql.gz
chmod 600 /backups/config/*.gpg
chown -R backup:backup /backups
# Restrict backup script execution
chmod 750 /opt/readur/scripts/backup-*.sh
Monitoring and Alerting¶
Backup Monitoring¶
# Prometheus alert rules
groups:
  - name: backup_alerts
    rules:
      - alert: BackupFailed
        expr: backup_last_success_timestamp < (time() - 86400)
        annotations:
          summary: "Backup has not succeeded in 24 hours"
      - alert: BackupStorageFull
        expr: backup_storage_usage_percent > 90
        annotations:
          summary: "Backup storage is {{ $value }}% full"
      - alert: BackupVerificationFailed
        expr: backup_verification_success == 0
        annotations:
          summary: "Backup verification failed"
Checklist¶
Daily Tasks¶
- Verify overnight backups completed
- Check backup storage capacity
- Review backup logs for errors
Weekly Tasks¶
- Test restore procedure with sample data
- Verify backup retention policy
- Update backup documentation
Monthly Tasks¶
- Full backup verification
- Review and update RTO/RPO targets
- Test failover procedures
Quarterly Tasks¶
- Complete disaster recovery drill
- Review and update recovery runbooks
- Audit backup security and encryption