Automated Rotation Implementation

Automating the key rotation process eliminates manual errors and ensures consistent execution across all systems. The automation must handle key generation, distribution, validation, and cleanup while maintaining zero downtime for critical services.
Implement comprehensive rotation automation:
#!/bin/bash
# automated-key-rotation.sh
# Zero-downtime SSH key rotation system

# Configuration: every path used by the rotation lives under one root.
ROTATION_DIR=/opt/ssh-rotation
STATE_FILE="${ROTATION_DIR}/state/current-rotation.json"
# Both timestamps are taken once, when the script is loaded.
printf -v BACKUP_DIR '%s/backups/%s' "${ROTATION_DIR}" "$(date +%Y%m%d-%H%M%S)"
printf -v LOG_FILE '%s/logs/rotation-%s.log' "${ROTATION_DIR}" "$(date +%Y%m%d)"

# Logging function
# Prints a timestamped message to stdout and appends the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
# Outputs:   the formatted line on stdout
log() {
    local log_dir="${LOG_FILE%/*}"

    # The first log call can happen before the directory tree exists; without
    # the directory tee cannot open the file and the entry is lost.
    if [[ "$log_dir" != "$LOG_FILE" && ! -d "$log_dir" ]]; then
        mkdir -p -- "$log_dir" 2>/dev/null
    fi

    # printf keeps messages such as "-n" or ones containing backslashes intact.
    printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

# Initialize rotation environment
# Creates the working directory tree and writes the initial state file.
# Globals:   ROTATION_DIR, BACKUP_DIR, STATE_FILE (read)
# Arguments: $1 - rotation id recorded in the state file
# Returns:   0 on success, 1 if the directories or state file cannot be created
init_rotation() {
    local rotation_id="$1"

    # Create directories before the first log call: log appends to a file
    # under logs/, which does not exist on a fresh installation.
    if ! mkdir -p "$ROTATION_DIR"/{state,backups,logs,keys/{old,new},scripts} \
            "$BACKUP_DIR"; then
        echo "ERROR: Cannot create rotation directories under $ROTATION_DIR" >&2
        return 1
    fi

    log "Initializing rotation: $rotation_id"

    # Create state file
    cat > "$STATE_FILE" << EOF || return 1
{
    "rotation_id": "$rotation_id",
    "start_time": "$(date -Iseconds)",
    "status": "initializing",
    "phase": "preparation",
    "keys_rotated": 0,
    "systems_updated": 0,
    "errors": []
}
EOF
}

# Generate new key pair
# Creates a passphrase-less key pair under $ROTATION_DIR/keys/new and records
# its fingerprint in the state file.
# Globals:   ROTATION_DIR (read)
# Arguments: $1 - key name (file name of the private key)
#            $2 - key type, defaults to ed25519
#            $3 - key comment
# Outputs:   the private key path on stdout, and nothing else, so callers can
#            capture it with $(...); log lines are sent to stderr
# Returns:   0 on success, 1 if the key exists already or generation fails
generate_new_key() {
    local key_name="$1"
    local key_type="${2:-ed25519}"
    local key_comment="$3"
    local new_key_path="$ROTATION_DIR/keys/new/${key_name}"
    local keygen_output fingerprint

    log "Generating new $key_type key: $key_name" >&2

    # ssh-keygen would otherwise ask "Overwrite (y/n)?" and wait on stdin.
    if [[ -e "$new_key_path" || -e "${new_key_path}.pub" ]]; then
        log "ERROR: Key file already exists: $new_key_path" >&2
        update_state ".errors += [\"Key file already exists for $key_name\"]"
        return 1
    fi

    # Generate key. All ssh-keygen output is captured so that its banner does
    # not end up in this function's stdout; it is logged only on failure.
    if ! keygen_output=$(ssh-keygen -q -t "$key_type" -f "$new_key_path" -N "" \
            -C "$key_comment" -o -a 100 2>&1 < /dev/null); then
        log "ERROR: Failed to generate key $key_name: $keygen_output" >&2
        update_state ".errors += [\"Failed to generate key $key_name\"]"
        return 1
    fi

    # Set secure permissions
    chmod 600 "$new_key_path"
    chmod 644 "${new_key_path}.pub"

    # Calculate fingerprint (second field of `ssh-keygen -l` output)
    fingerprint=$(ssh-keygen -lf "${new_key_path}.pub" | awk '{print $2}')

    # Update state
    update_state ".keys_generated[\"$key_name\"] = \"$fingerprint\""

    echo "$new_key_path"
}

# Update rotation state
# Applies a jq expression to the state file and replaces the file with the result.
# Globals:   STATE_FILE (read, rewritten)
# Arguments: $1 - jq filter applied to the current state
# Returns:   0 on success; 1 if the temp file, jq, or the rename fails, in
#            which case the state file is left untouched
update_state() {
    local jq_expression="$1"
    local temp_file

    # Create the temp file next to the state file so the final mv is a rename
    # on the same filesystem rather than a copy across filesystems.
    temp_file=$(mktemp "${STATE_FILE}.XXXXXX") || return 1

    if jq "$jq_expression" "$STATE_FILE" > "$temp_file" \
            && mv -- "$temp_file" "$STATE_FILE"; then
        return 0
    fi

    # Do not leave half-written temp files behind when jq rejects the filter.
    rm -f -- "$temp_file"
    return 1
}

# Deploy key to target systems
# Appends the new public key to authorized_keys on every target host, running
# up to 10 deployments at a time through GNU parallel.
# Globals:   ROTATION_DIR (read)
# Arguments: $1 - key name (public key is read from keys/new/<name>.pub)
#            $2 - whitespace-separated list of target hosts
#            $3 - remote user to deploy as
# Returns:   number of hosts that did not report success (capped at 255);
#            1 if there are no hosts or the public key cannot be read
deploy_key_parallel() {
    local key_name="$1"
    local target_systems="$2"
    local deploy_user="$3"

    local deploy_script="$ROTATION_DIR/scripts/deploy_${key_name}.sh"
    local public_key_file="$ROTATION_DIR/keys/new/${key_name}.pub"
    local error_log="$ROTATION_DIR/logs/deployment_errors.log"
    local -a hosts=()
    local public_key line
    local success_count=0
    local fail_count=0

    # Split the host list on any whitespace; read returns non-zero at EOF
    # even though the array has been filled.
    read -r -d '' -a hosts <<< "$target_systems" || true

    if (( ${#hosts[@]} == 0 )); then
        log "ERROR: No target systems given for key $key_name"
        return 1
    fi

    log "Deploying key $key_name to ${#hosts[@]} systems"

    if ! public_key=$(cat "$public_key_file" 2>/dev/null); then
        log "ERROR: Cannot read public key $public_key_file"
        return 1
    fi

    # Create deployment script. The outer delimiter is quoted, so the script
    # is written literally. The inner ENDSSH delimiter is unquoted on purpose:
    # PUBLIC_KEY and KEY_NAME are expanded by the deploy script, while "~" is
    # left for the remote shell.
    cat > "$deploy_script" << 'EOF'
#!/bin/bash
TARGET_HOST="$1"
DEPLOY_USER="$2"
PUBLIC_KEY="$3"
KEY_NAME="$4"

# Add the new key next to the existing ones; the old key stays in place.
if ssh -o ConnectTimeout=10 "$DEPLOY_USER@$TARGET_HOST" << ENDSSH
set -e

# A fresh account may have neither ~/.ssh nor authorized_keys
[ -d ~/.ssh ] || mkdir -m 700 ~/.ssh
touch ~/.ssh/authorized_keys

# Backup current authorized_keys
cp ~/.ssh/authorized_keys ~/.ssh/authorized_keys.backup

# Add new key if not present (fixed-string match, the key is not a regex)
if ! grep -qF -- "$PUBLIC_KEY" ~/.ssh/authorized_keys; then
    echo "$PUBLIC_KEY" >> ~/.ssh/authorized_keys
fi

# Ensure correct permissions
chmod 600 ~/.ssh/authorized_keys

# Create marker file for successful deployment
touch ~/.ssh/.rotation_${KEY_NAME}_deployed
ENDSSH
then
    echo "$TARGET_HOST: key deployed successfully"
else
    echo "$TARGET_HOST: deployment failed"
    exit 1
fi
EOF

    chmod +x "$deploy_script"

    # Deploy in parallel. The loop reads from a process substitution so the
    # counter survives (a pipeline would run the loop in a subshell), and
    # parallel -q keeps the public key, which contains spaces, as one argument.
    while IFS= read -r line; do
        case "$line" in
            *": key deployed successfully")
                (( success_count += 1 ))
                ;;
            *": deployment failed")
                printf '%s\n' "$line" >> "$error_log"
                ;;
        esac
    done < <(printf '%s\n' "${hosts[@]}" \
        | parallel -q -j 10 --timeout 30 \
            "$deploy_script" {} "$deploy_user" "$public_key" "$key_name" 2>&1)

    # A host killed by --timeout prints nothing, so failures are derived from
    # the hosts that did not report success.
    fail_count=$(( ${#hosts[@]} - success_count ))
    (( fail_count >= 0 )) || fail_count=0

    # Update state
    update_state ".systems_updated += $success_count"

    log "Deployment complete: $success_count successful, $fail_count failed"

    # Exit statuses wrap at 256; cap so a large failure count never reads as 0.
    return $(( fail_count > 255 ? 255 : fail_count ))
}

# Validate new key access
# Opens a non-interactive SSH session to each test host using only the newly
# generated private key.
# Globals:   ROTATION_DIR (read)
# Arguments: $1 - key name (private key is keys/new/<name>)
#            $2 - whitespace-separated list of hosts to test
#            $3 - remote user
# Returns:   0 if every host accepted the key; 1 if any host failed or no
#            host was given
validate_new_key() {
    local key_name="$1"
    local test_systems="$2"
    local test_user="$3"

    local new_key_path="$ROTATION_DIR/keys/new/${key_name}"
    local validation_failures=0
    local -a systems=()
    local system

    log "Validating new key access for $key_name"

    read -r -d '' -a systems <<< "$test_systems" || true

    # Zero hosts tested must not count as a successful validation.
    if (( ${#systems[@]} == 0 )); then
        log "✗ No systems given to validate $key_name against"
        update_state ".errors += [\"Validation had no systems to test\"]"
        return 1
    fi

    for system in "${systems[@]}"; do
        # Test SSH connection with new key.
        # IdentitiesOnly=yes stops ssh from offering agent or default keys, so
        # a login that still works through the OLD key cannot pass validation.
        # NOTE(review): StrictHostKeyChecking=no accepts unknown host keys;
        # kept from the original, confirm that this is acceptable here.
        if ssh -n \
               -o BatchMode=yes \
               -o ConnectTimeout=5 \
               -o StrictHostKeyChecking=no \
               -o IdentitiesOnly=yes \
               -i "$new_key_path" \
               "$test_user@$system" \
               "echo 'Key validation successful'" &>/dev/null; then
            log "✓ Validated access to $system"
        else
            log "✗ Failed validation on $system"
            (( validation_failures += 1 ))
        fi
    done

    if (( validation_failures == 0 )); then
        update_state ".phase = \"validated\""
        return 0
    fi

    update_state ".errors += [\"Validation failed on $validation_failures systems\"]"
    return 1
}

# Remove old key from systems
# Deletes the old key's entries from authorized_keys on every target host.
# An entry is removed when its ssh-keygen fingerprint equals the given value,
# or when the line literally contains the given string.
# Globals:   ROTATION_DIR (read)
# Arguments: $1 - fingerprint of the old key, e.g. SHA256:... or MD5:...
#            $2 - whitespace-separated list of target hosts
#            $3 - remote user
# Returns:   0 on success or when there is nothing to do; 1 if removal failed
#            on at least one host
remove_old_key() {
    local old_key_fingerprint="$1"
    local target_systems="$2"
    local target_user="$3"

    local removal_script="$ROTATION_DIR/scripts/remove_old_key.sh"
    local -a hosts=()

    # An empty pattern matches every line: filtering on it would delete all
    # keys, including the new one, and lock everybody out.
    if [[ -z "${old_key_fingerprint//[[:space:]]/}" ]]; then
        log "WARNING: No old key fingerprint given; skipping old key removal"
        return 0
    fi

    read -r -d '' -a hosts <<< "$target_systems" || true
    if (( ${#hosts[@]} == 0 )); then
        log "WARNING: No target systems given; skipping old key removal"
        return 0
    fi

    log "Removing old key from systems"

    # Create removal script. Both here-document delimiters are quoted: the
    # remote part is sent verbatim and receives the fingerprint as $1.
    cat > "$removal_script" << 'EOF'
#!/bin/bash
TARGET_HOST="$1"
TARGET_USER="$2"
OLD_FINGERPRINT="$3"

if [ -z "$OLD_FINGERPRINT" ]; then
    echo "$TARGET_HOST: refusing to run with an empty fingerprint" >&2
    exit 1
fi

ssh -o ConnectTimeout=10 "$TARGET_USER@$TARGET_HOST" \
    "bash -s -- $(printf '%q' "$OLD_FINGERPRINT")" << 'ENDSSH'
set -e
old_fp="$1"
auth=~/.ssh/authorized_keys

# authorized_keys stores public keys, not fingerprints, so each entry's
# fingerprint is computed and compared.
hash_alg=sha256
case "$old_fp" in
    MD5:*) hash_alg=md5 ;;
esac

if [ -f "$auth" ]; then
    filtered="$auth.tmp"
    : > "$filtered"
    chmod 600 "$filtered"
    while IFS= read -r line || [ -n "$line" ]; do
        # Literal match, e.g. when a key blob or comment was passed instead
        case "$line" in
            *"$old_fp"*) continue ;;
        esac
        fp=$(printf '%s\n' "$line" \
            | ssh-keygen -E "$hash_alg" -lf - 2>/dev/null \
            | awk '{print $2}')
        if [ -n "$fp" ] && [ "$fp" = "$old_fp" ]; then
            continue
        fi
        printf '%s\n' "$line" >> "$filtered"
    done < "$auth"
    mv "$filtered" "$auth"
    chmod 600 "$auth"
fi

# Remove deployment marker
rm -f ~/.ssh/.rotation_*_deployed
ENDSSH
EOF

    chmod +x "$removal_script"

    # Execute removal in parallel; -q passes each argument through unchanged.
    if printf '%s\n' "${hosts[@]}" \
        | parallel -q -j 10 --timeout 30 \
            "$removal_script" {} "$target_user" "$old_key_fingerprint"; then
        log "Old key removal complete"
        return 0
    fi

    log "ERROR: Old key removal failed on one or more systems"
    return 1
}

# Rollback function
# Marks the rotation as rolling back, records why, raises an alert and then
# marks it as rolled back. Restoring remote authorized_keys files is still a
# placeholder.
# Globals:   BACKUP_DIR (read)
# Arguments: $1 - rotation id, $2 - human-readable reason
rollback_rotation() {
    local rotation_id="$1"
    local reason="$2"
    local state_change alert_body

    log "ROLLBACK: Initiating rollback for $rotation_id - Reason: $reason"

    for state_change in '.status = "rolling_back"' ".rollback_reason = \"$reason\""; do
        update_state "$state_change"
    done

    # Restore from backups
    if [[ -d "$BACKUP_DIR" ]]; then
        log "Restoring from backup: $BACKUP_DIR"

        # Restore authorized_keys on all systems
        # ... (implementation depends on backup strategy)
    fi

    # Send alerts
    printf -v alert_body 'Rotation %s rolled back: %s' "$rotation_id" "$reason"
    send_alert "Rotation Rollback" "$alert_body"

    update_state '.status = "rolled_back"'
}

# Main rotation workflow
# Runs one complete rotation: initialize, generate, deploy, validate, remove
# the old key after a grace period, finalize and archive.
# Globals:   GRACE_PERIOD (read, seconds, default 300)
# Arguments: $1 - key name
#            $2 - key type
#            $3 - whitespace-separated list of target hosts
#            $4 - remote user
#            $5 - fingerprint of the old key (optional; without it the old
#                 key is left in place)
# Returns:   0 when the rotation completed, 1 when it was aborted
execute_rotation() {
    local key_name="$1"
    local key_type="$2"
    local target_systems="$3"
    local deploy_user="$4"
    local old_key_fingerprint="$5"

    local rotation_id test_systems
    local failed_hosts=0
    local -a hosts=()

    rotation_id="rot-$(date +%s)"

    # Initialize
    if ! init_rotation "$rotation_id"; then
        echo "ERROR: Could not initialize rotation $rotation_id" >&2
        return 1
    fi

    # Generate new key (the printed key path is not needed by this workflow)
    update_state ".phase = \"generating_keys\""
    if ! generate_new_key "$key_name" "$key_type" "rotated-$rotation_id" > /dev/null; then
        rollback_rotation "$rotation_id" "Key generation failed"
        return 1
    fi

    # Deploy new key. A host that did not receive the new key would be locked
    # out once the old key is removed, so any deployment failure aborts.
    update_state ".phase = \"deploying_keys\""
    deploy_key_parallel "$key_name" "$target_systems" "$deploy_user" || failed_hosts=$?
    if (( failed_hosts != 0 )); then
        rollback_rotation "$rotation_id" "Key deployment failed on $failed_hosts systems"
        return 1
    fi

    # Validate new key on a subset (first three hosts)
    update_state ".phase = \"validating_access\""
    read -r -d '' -a hosts <<< "$target_systems" || true
    test_systems="${hosts[*]:0:3}"

    if ! validate_new_key "$key_name" "$test_systems" "$deploy_user"; then
        rollback_rotation "$rotation_id" "Validation failed"
        return 1
    fi

    # Remove old key (after grace period). Without a fingerprint there is
    # nothing that identifies the old key, so removal is skipped entirely.
    if [[ -n "${old_key_fingerprint//[[:space:]]/}" ]]; then
        update_state ".phase = \"removing_old_keys\""

        log "Waiting for grace period before removing old keys..."
        sleep "${GRACE_PERIOD:-300}"  # 5 minute default

        if ! remove_old_key "$old_key_fingerprint" "$target_systems" "$deploy_user"; then
            # The new key is already live; record the problem, do not roll back.
            log "WARNING: Old key could not be removed from every system"
            update_state ".errors += [\"Old key removal incomplete\"]"
        fi
    else
        log "No old key fingerprint supplied; leaving existing keys in place"
    fi

    # Finalize
    update_state ".status = \"completed\""
    update_state ".end_time = \"$(date -Iseconds)\""
    update_state ".keys_rotated += 1"

    log "Rotation completed successfully: $rotation_id"

    # Archive state and logs
    archive_rotation "$rotation_id"

    return 0
}

# Archive rotation data
# Moves the state file, a copy of the log and any files in keys/old into
# archives/<rotation_id>, compresses that directory and removes it.
# Globals:   ROTATION_DIR, STATE_FILE, LOG_FILE (read)
# Arguments: $1 - rotation id (name of the archive)
# Returns:   0 on success; 1 on failure, in which case the uncompressed
#            archive directory is kept
archive_rotation() {
    local rotation_id="$1"
    local archive_dir="$ROTATION_DIR/archives/$rotation_id"
    local -a old_keys=()

    # With an empty id archive_dir would be the archives root itself, and the
    # cleanup below would delete every previous archive.
    if [[ -z "$rotation_id" ]]; then
        log "ERROR: archive_rotation called without a rotation id"
        return 1
    fi

    if ! mkdir -p "$archive_dir"; then
        log "ERROR: Cannot create archive directory $archive_dir"
        return 1
    fi

    # Move rotation artifacts
    mv "$STATE_FILE" "$archive_dir/"
    cp "$LOG_FILE" "$archive_dir/"

    # Move old keys to archive (never delete them immediately).
    # An unmatched glob stays literal, hence the existence check.
    old_keys=("$ROTATION_DIR/keys/old/"*)
    if [[ -e "${old_keys[0]}" ]]; then
        mv -- "${old_keys[@]}" "$archive_dir/"
    fi

    # Compress archive. The directory now holds the only copy of the state
    # file and old keys, so it is deleted only after tar has succeeded.
    if ! tar -czf "$archive_dir.tar.gz" -C "$ROTATION_DIR/archives" "$rotation_id"; then
        rm -f -- "$archive_dir.tar.gz"
        log "ERROR: Failed to compress $archive_dir; keeping it uncompressed"
        return 1
    fi
    rm -rf -- "${archive_dir:?}"

    log "Rotation archived: $archive_dir.tar.gz"
}

# Escape a string for use inside a JSON string literal (backslash, double
# quote, newline, carriage return, tab).
_rotation_json_escape() {
    local s="$1"
    local bs='\' dq='"'

    s=${s//"$bs"/"$bs$bs"}
    s=${s//"$dq"/"$bs$dq"}
    s=${s//$'\n'/"${bs}n"}
    s=${s//$'\r'/"${bs}r"}
    s=${s//$'\t'/"${bs}t"}
    printf '%s' "$s"
}

# Send notifications
# Best-effort alerting by e-mail and Slack; a failing channel never fails the
# caller.
# Globals:   ALERT_EMAIL (read) - recipient address; e-mail is skipped if unset
#            SLACK_WEBHOOK_URL (read) - webhook; Slack is skipped if unset
# Arguments: $1 - subject, $2 - message
# Returns:   always 0
send_alert() {
    local subject="$1"
    local message="$2"
    local payload

    # Email notification. The recipient comes from the environment: the
    # address that used to be hard-coded here was not a usable address.
    if [[ -n "${ALERT_EMAIL:-}" ]]; then
        printf '%s\n' "$message" \
            | mail -s "SSH Rotation: $subject" "$ALERT_EMAIL" || true
    else
        echo "WARNING: ALERT_EMAIL is not set; skipping e-mail alert" >&2
    fi

    # Slack notification. Subject and message are escaped so that quotes or
    # newlines in them still produce valid JSON.
    if [[ -n "${SLACK_WEBHOOK_URL:-}" ]]; then
        printf -v payload '{"text":"SSH Rotation Alert: %s\\n%s"}' \
            "$(_rotation_json_escape "$subject")" \
            "$(_rotation_json_escape "$message")"
        curl -sS -X POST -H 'Content-type: application/json' \
            --data "$payload" \
            "$SLACK_WEBHOOK_URL" > /dev/null 2>&1 || true
    fi

    return 0
}

# Example usage: the first four arguments are mandatory, the old key's
# fingerprint is optional. All arguments are passed on to execute_rotation.
(( $# >= 4 )) || {
    echo "Usage: $0 <key_name> <key_type> <target_systems> <deploy_user> [old_fingerprint]"
    exit 1
}

execute_rotation "$@"