Recovery and System Restoration

Recovery and System Restoration

Recovery procedures restore normal operations while ensuring threats have been completely eradicated. Effective recovery balances speed with security, gradually bringing systems back online while monitoring for reinfection. Understanding recovery strategies for different incident types ensures rapid restoration without compromising security.

Implement systematic recovery procedures:

#!/bin/bash
# System Recovery and Restoration Script
#
# Usage: <script> INCIDENT_ID
#   INCIDENT_ID  identifier of the incident being recovered; it becomes part
#                of the recovery log file name.

# Abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Configuration
# Fail with a usage hint instead of bash's bare "unbound variable" error when
# the incident identifier is missing or empty.
INCIDENT_ID="${1:?usage: ${0##*/} INCIDENT_ID}"
# The identifier is embedded in a file name, so a path separator cannot work.
if [[ "$INCIDENT_ID" == */* ]]; then
    printf 'ERROR: INCIDENT_ID must not contain "/": %s\n' "$INCIDENT_ID" >&2
    exit 2
fi
RECOVERY_LOG="/var/log/recovery_${INCIDENT_ID}.log"
BACKUP_LOCATION="/backups"
VALIDATION_TESTS="/opt/recovery/validation_tests"

# Logging function
# Prints a timestamped message on stdout and appends the same line to the
# file named by RECOVERY_LOG.
# Globals:   RECOVERY_LOG (read)
# Arguments: $* - message words, joined by the first character of IFS
log() {
    printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" \
        | tee -a "$RECOVERY_LOG"
}

# Validation function
# Runs the named validation suite against a system and logs the outcome.
# Globals:   VALIDATION_TESTS (read) - directory holding the suites
# Arguments: $1 - system name, handed to the suite as its only argument
#            $2 - file name of the suite inside VALIDATION_TESTS
# Returns:   0 if the suite passes or no executable suite exists,
#            1 if the suite exits non-zero
validate_system() {
    local system="$1"
    local test_suite="$2"
    local suite_path="$VALIDATION_TESTS/$test_suite"

    log "Running validation tests on $system"

    # A missing or non-executable suite is only warned about, not an error.
    if [[ ! -x "$suite_path" ]]; then
        log "WARNING: No validation test suite found for $test_suite"
        return 0
    fi

    # Run test suite
    if ! "$suite_path" "$system"; then
        log "ERROR: Validation failed for $system"
        return 1
    fi

    log "Validation passed for $system"
    return 0
}

# Restore from backup
# Locates the backup for a system, verifies its checksum manifest when one is
# present, then hands off to the restore routine selected by the system name.
# Globals:   BACKUP_LOCATION (read)
# Arguments: $1 - system name; its prefix (web-, db-, app-) picks the routine
#            $2 - backup identifier, a directory name below
#                 $BACKUP_LOCATION/<system>/
# Returns:   1 if the backup directory is missing or fails verification,
#            otherwise the status of the selected restore routine
restore_system() {
    local system="$1"
    local backup_date="$2"
    local backup_path="$BACKUP_LOCATION/$system/$backup_date"

    log "Starting restoration of $system from backup dated $backup_date"

    # Verify backup integrity
    if [[ ! -d "$backup_path" ]]; then
        log "ERROR: Backup not found at $backup_path"
        return 1
    fi

    # Verify backup hash
    if [[ -f "$backup_path/backup.sha256" ]]; then
        # sha256sum resolves relative file names in the manifest against the
        # current directory, so run it from inside the backup. The subshell
        # keeps the caller's working directory untouched.
        if ! (cd -- "$backup_path" && sha256sum -c backup.sha256); then
            log "ERROR: Backup integrity check failed"
            return 1
        fi
    else
        # Make the skipped verification visible instead of passing silently.
        log "WARNING: No checksum manifest at $backup_path/backup.sha256; integrity not verified"
    fi

    # Perform restoration based on system type
    case "$system" in
        web-*)
            restore_web_server "$system" "$backup_path"
            ;;
        db-*)
            restore_database_server "$system" "$backup_path"
            ;;
        app-*)
            restore_application_server "$system" "$backup_path"
            ;;
        *)
            restore_generic_system "$system" "$backup_path"
            ;;
    esac
}

# Web server restoration
# Restores web content and web-server configuration from a backup, applies
# hardening, starts the web server and validates the result.
# Arguments: $1 - server name (used for logging and validation)
#            $2 - backup directory mirroring the filesystem layout
#                 (var/www, etc/nginx, etc/apache2)
# Returns:   1 if a restore step, hardening or service start fails,
#            otherwise the status of validate_system
#
# Every step is checked explicitly: bash suspends `set -e` inside functions
# that are reached through an `if` condition, which is how the restore
# routines are invoked, so an unchecked failure would go unnoticed.
restore_web_server() {
    local server="$1"
    local backup_path="$2"
    local conf_dir

    log "Restoring web server $server"

    # Stop web services. Only one of the two is expected to be installed, so
    # a failure to stop the other is deliberately ignored.
    systemctl stop nginx apache2 2>/dev/null || true

    # Restore web content
    if ! rsync -av --delete "$backup_path/var/www/" /var/www/; then
        log "ERROR: Failed to restore web content for $server"
        return 1
    fi

    # Restore configuration. A server that is absent from the backup is
    # skipped; one that is present must restore cleanly. --delete removes
    # files that are not in the backup, so configuration dropped in during
    # the incident does not survive the restore.
    for conf_dir in nginx apache2; do
        [[ -d "$backup_path/etc/$conf_dir" ]] || continue
        if ! rsync -a --delete "$backup_path/etc/$conf_dir/" "/etc/$conf_dir/"; then
            log "ERROR: Failed to restore /etc/$conf_dir for $server"
            return 1
        fi
    done

    # Update security configurations
    if ! apply_security_hardening "web"; then
        log "ERROR: Security hardening failed for $server"
        return 1
    fi

    # Start services: prefer nginx, fall back to apache2
    if ! systemctl start nginx 2>/dev/null && ! systemctl start apache2; then
        log "ERROR: Could not start nginx or apache2 on $server"
        return 1
    fi

    # Validate
    validate_system "$server" "web_validation"
}

# Database restoration
# Replaces the local MySQL or PostgreSQL data directory with the copy held in
# the backup, brings the service back up and validates it. For MySQL the root
# password is reset as part of the restore.
# Globals:   MYSQL_ROOT_PASSWORD (read, optional) - new MySQL root password.
#              When unset, a random password is generated and written to a
#              file under /root readable only by its owner.
#            INCIDENT_ID (read) - names the generated password file
# Arguments: $1 - server name (used for logging and validation)
#            $2 - backup directory mirroring the filesystem layout
#                 (var/lib/mysql or var/lib/postgresql)
# Returns:   1 if the backup holds neither data directory or any restore
#            step fails, otherwise the status of validate_system
#
# Steps are checked explicitly because bash suspends `set -e` inside
# functions reached through an `if` condition.
# NOTE(review): no visible caller runs apply_security_hardening "database";
# confirm whether database hosts are meant to be hardened here as well.
restore_database_server() {
    local server="$1"
    local backup_path="$2"
    local engine=""
    local root_password=""
    local password_file=""
    local attempt

    log "Restoring database server $server"

    # Decide what to restore before touching anything, so a backup without
    # database files fails without stopping the running services.
    if [[ -d "$backup_path/var/lib/mysql" ]]; then
        engine="mysql"
    elif [[ -d "$backup_path/var/lib/postgresql" ]]; then
        engine="postgresql"
    else
        log "ERROR: No MySQL or PostgreSQL data directory found in $backup_path"
        return 1
    fi

    if [[ "$engine" == "mysql" ]]; then
        # Never use a fixed, published password: take it from the
        # environment or generate a random one.
        root_password="${MYSQL_ROOT_PASSWORD:-}"
        if [[ -z "$root_password" ]]; then
            password_file="/root/.mysql_root_password_${INCIDENT_ID:-recovery}"
            root_password="$(head -c 48 /dev/urandom | base64 | tr -d '/+=\n')" \
                || root_password=""
            if [[ -z "$root_password" ]] \
                || ! (umask 077 && printf '%s\n' "$root_password" > "$password_file"); then
                log "ERROR: Could not generate and store a new MySQL root password"
                return 1
            fi
            log "Generated a new MySQL root password; stored in $password_file"
        fi
        # The password is placed inside a quoted SQL string below; refuse
        # characters that could terminate or escape that string.
        case "$root_password" in
            *"'"* | *"\\"*)
                log "ERROR: MySQL root password must not contain quotes or backslashes"
                return 1
                ;;
        esac
    fi

    # Stop database services; the engine that is not installed is expected
    # to fail here, which is deliberately ignored.
    systemctl stop mysql postgresql 2>/dev/null || true

    if [[ "$engine" == "mysql" ]]; then
        # MySQL restoration. rsync --delete makes the data directory an exact
        # copy of the backup, including dotfiles that a "*" glob would miss.
        if ! rsync -a --delete "$backup_path/var/lib/mysql/" /var/lib/mysql/ \
            || ! chown -R mysql:mysql /var/lib/mysql; then
            log "ERROR: Failed to restore MySQL data for $server"
            return 1
        fi

        # Start MySQL without grant tables to reset credentials. Networking is
        # disabled so the unauthenticated server is reachable only locally.
        mysqld_safe --skip-grant-tables --skip-networking &

        # Wait for the server to answer instead of sleeping a fixed time.
        for (( attempt = 0; attempt < 30; attempt++ )); do
            if mysqladmin ping >/dev/null 2>&1; then
                break
            fi
            sleep 1
        done
        if (( attempt >= 30 )); then
            log "ERROR: MySQL did not become ready for the password reset on $server"
            mysqladmin shutdown >/dev/null 2>&1 || true
            return 1
        fi

        # Reset passwords and privileges. The statement goes through stdin so
        # the password never appears in a process argument list.
        if ! printf "FLUSH PRIVILEGES; ALTER USER 'root'@'localhost' IDENTIFIED BY '%s';\n" \
            "$root_password" | mysql; then
            log "ERROR: Failed to reset the MySQL root password on $server"
            mysqladmin shutdown >/dev/null 2>&1 || true
            return 1
        fi

        # Stop the temporary instance and start normally. FLUSH PRIVILEGES
        # re-enabled authentication, so the new password is supplied (through
        # the environment rather than argv).
        if ! MYSQL_PWD="$root_password" mysqladmin --user=root shutdown \
            || ! systemctl start mysql; then
            log "ERROR: Failed to restart MySQL on $server"
            return 1
        fi
    else
        # PostgreSQL restoration
        if ! rsync -a --delete "$backup_path/var/lib/postgresql/" /var/lib/postgresql/ \
            || ! chown -R postgres:postgres /var/lib/postgresql \
            || ! systemctl start postgresql; then
            log "ERROR: Failed to restore PostgreSQL data for $server"
            return 1
        fi
    fi

    # Validate
    validate_system "$server" "database_validation"
}

# Runs a single ufw command and logs which one failed.
_ufw_or_log() {
    if ! ufw "$@"; then
        log "ERROR: Firewall command failed: ufw $*"
        return 1
    fi
}

# Application hardening
# Updates packages, rebuilds the ufw rule set for the given server type,
# installs kernel hardening parameters and expires the passwords of regular
# user accounts.
# Globals:   MGMT_SUBNET (read, optional) - CIDR allowed to reach SSH and the
#              database ports; defaults to 10.0.0.0/24
# Arguments: $1 - server type: "web" or "database"; any other value gets
#                 only the common rules
# Returns:   0 on success; 1 if a step fails. Update, firewall and sysctl
#            failures stop immediately; password-expiry failures are logged
#            per user and reported after all users were processed.
#
# Steps are checked explicitly because bash suspends `set -e` inside
# functions reached through an `if` condition.
apply_security_hardening() {
    local server_type="$1"
    local mgmt_subnet="${MGMT_SUBNET:-10.0.0.0/24}"
    local sysctl_conf="/etc/sysctl.d/99-security.conf"
    local user
    local rc=0

    log "Applying security hardening for $server_type"

    # Common hardening
    # Update system. The upgrade runs non-interactively so a package prompt
    # cannot stall an unattended recovery.
    if ! apt-get update || ! DEBIAN_FRONTEND=noninteractive apt-get upgrade -y; then
        log "ERROR: System update failed"
        return 1
    fi

    # Configure firewall
    _ufw_or_log --force reset || return 1
    _ufw_or_log default deny incoming || return 1
    _ufw_or_log default allow outgoing || return 1

    case "$server_type" in
        web)
            _ufw_or_log allow 80/tcp || return 1
            _ufw_or_log allow 443/tcp || return 1
            ;;
        database)
            _ufw_or_log allow from "$mgmt_subnet" to any port 3306 || return 1
            _ufw_or_log allow from "$mgmt_subnet" to any port 5432 || return 1
            ;;
    esac

    # SSH stays reachable from the management subnet only; the rule is added
    # before the firewall is enabled.
    _ufw_or_log allow from "$mgmt_subnet" to any port 22 || return 1
    _ufw_or_log --force enable || return 1

    # Update security configurations
    # Kernel parameters
    if ! cat > "$sysctl_conf" << 'EOF'
net.ipv4.tcp_syncookies = 1
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.default.send_redirects = 0
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
net.ipv4.icmp_echo_ignore_broadcasts = 1
net.ipv4.icmp_ignore_bogus_error_responses = 1
kernel.randomize_va_space = 2
fs.suid_dumpable = 0
EOF
    then
        log "ERROR: Could not write $sysctl_conf"
        return 1
    fi

    if ! sysctl -p "$sysctl_conf"; then
        log "ERROR: Could not apply kernel parameters from $sysctl_conf"
        return 1
    fi

    # Reset all passwords
    log "Forcing password reset for all users"
    # UID 65534 is the unprivileged "nobody" account, not a login user.
    while IFS= read -r user; do
        [[ -n "$user" ]] || continue
        if ! chage -d 0 "$user"; then
            log "ERROR: Could not expire password for $user"
            rc=1
        fi
    done < <(awk -F: '$3 >= 1000 && $3 != 65534 {print $1}' /etc/passwd)

    return "$rc"
}

# Gradual service restoration
# Restores the given systems one at a time. After each successful restore the
# system is left to run for a monitoring period and then checked for signs of
# reinfection before the next system is touched.
# Globals:   MONITOR_SECONDS (read, optional) - monitoring period per system
#              in seconds; defaults to 1800 (30 minutes)
# Arguments: $@ - system names, in restoration order
# Outputs:   progress and a final summary through log
# Returns:   status of the final log call; per-system failures are reported
#            in the summary, not through the exit status
gradual_restoration() {
    local restored=()
    local failed=()
    local service
    local monitor_seconds="${MONITOR_SECONDS:-1800}"

    log "Starting gradual service restoration"

    for service in "$@"; do
        log "Restoring service: $service"

        # Restore service
        if ! restore_system "$service" "latest"; then
            failed+=("$service")
            continue
        fi

        log "Monitoring $service for stability"
        sleep "$monitor_seconds"

        # check_reinfection returns 0 when the system is clean and non-zero
        # when an indicator of compromise was found.
        if check_reinfection "$service"; then
            log "Service $service appears stable"
            # Count a system as restored only once it has passed monitoring,
            # so it never shows up in both summary lists.
            restored+=("$service")
        else
            log "WARNING: Possible reinfection detected on $service"
            # Keep going with the remaining systems even if isolation fails,
            # but make the failure visible.
            isolate_system "$service" || log "ERROR: Failed to isolate $service"
            failed+=("$service")
        fi
    done

    # Report results. The :- default keeps empty lists from tripping
    # `set -u` on older bash versions.
    log "Restoration complete"
    log "Successfully restored: ${restored[*]:-none}"
    log "Failed to restore: ${failed[*]:-none}"
}

# Check for reinfection
# Looks for indicators of compromise on the local host: files whose SHA-256
# is listed in the IOC hash file, TCP connections involving known-bad ports,
# and processes with suspicious command lines.
# Globals:   IOC_HASH_FILE (read, optional) - file with one SHA-256 per line
#              (first whitespace-separated field is used); defaults to
#              /opt/recovery/ioc_hashes.txt
# Arguments: $1 - system name, used in log messages only
# Returns:   0 when no indicator is found, 1 when one is found
check_reinfection() {
    local system="$1"
    local ioc_file="${IOC_HASH_FILE:-/opt/recovery/ioc_hashes.txt}"
    local hits

    # Check for IOCs
    # File hashes. The tree is hashed once and compared against the whole
    # list; awk matches the hash field exactly and skips blank list lines.
    # awk reads its input to the end, and the pipeline status is ignored in
    # favour of the captured output, so neither an early reader exit nor
    # unreadable directories under find can hide a match with pipefail on.
    if [[ -s "$ioc_file" ]]; then
        hits="$(
            find /usr /var /home -type f -exec sha256sum {} + 2>/dev/null \
                | awk 'NR == FNR { if (NF) bad[tolower($1)]; next }
                       ($1 in bad)' "$ioc_file" -
        )" || true
        if [[ -n "$hits" ]]; then
            log "ERROR: Known malicious file detected on $system"
            log "Matching files: $hits"
            return 1
        fi
    fi

    # Network connections. The port must end the address field so that, for
    # example, :44441 is not mistaken for :4444.
    hits="$(ss -tn | grep -E ':(4444|5555|6666|31337)([[:space:]]|$)')" || true
    if [[ -n "$hits" ]]; then
        log "ERROR: Suspicious network connection detected on $system"
        log "Matching connections: $hits"
        return 1
    fi

    # Process checks. pgrep never reports itself, unlike `ps | grep`, whose
    # own command line contains the pattern. "nc" must start a path
    # component or word so that e.g. "rsync -l" does not match.
    hits="$(pgrep -af -- '(^|[/[:space:]])nc -l|/dev/tcp/|curl.*\|.*sh')" || true
    if [[ -n "$hits" ]]; then
        log "ERROR: Suspicious process detected on $system"
        log "Matching processes: $hits"
        return 1
    fi

    return 0
}

# Main recovery workflow
# Drives the four recovery phases in order: preparation, gradual restoration,
# post-recovery validation and enhanced monitoring.
# Globals:   INCIDENT_ID (read)
# Returns:   1 if a preparation step fails; otherwise the status of the last
#            step that ran
main() {
    # Kept local so the list does not leak into the script's global scope.
    local -a restoration_order

    log "Starting recovery process for incident $INCIDENT_ID"

    # Phase 1: Preparation
    log "Phase 1: Recovery preparation"

    # Verify backups are available and clean. Nothing may be restored from
    # backups that failed verification, so stop here with a clear message.
    if ! verify_clean_backups; then
        log "ERROR: Backup verification failed; aborting recovery"
        return 1
    fi

    # Prepare recovery environment
    if ! prepare_recovery_environment; then
        log "ERROR: Could not prepare the recovery environment; aborting recovery"
        return 1
    fi

    # Phase 2: Restoration
    log "Phase 2: System restoration"

    # Define restoration order (least critical first)
    restoration_order=(
        "web-dev01"
        "app-test01"
        "web-prod01"
        "app-prod01"
        "db-prod01"
    )

    gradual_restoration "${restoration_order[@]}"

    # Phase 3: Validation
    log "Phase 3: Post-recovery validation"

    # Run comprehensive validation
    run_validation_suite

    # Phase 4: Monitoring
    log "Phase 4: Enhanced monitoring"

    # Deploy enhanced monitoring
    deploy_enhanced_monitoring

    log "Recovery process completed"
}

# Entry point: run the recovery workflow, forwarding the script's
# command-line arguments to main.
main "$@"