High Availability and Scaling

High Availability and Scaling

Production bastion hosts must provide reliable access even during failures or under high load. High availability keeps access to critical operations uninterrupted, while load balancing distributes connections across multiple bastion instances.

Deploy a highly available bastion architecture:

#!/usr/bin/env python3
# bastion-ha-manager.py
# High availability manager for bastion hosts

import os
import time
import socket
import subprocess
import json
import requests
from datetime import datetime
import consul
import redis
from kubernetes import client, config

class BastionHAManager:
    def __init__(self, config_file):
        """Load the configuration and connect to the backing services.

        Args:
            config_file: Path to a JSON configuration file. This method
                reads the ``consul`` and ``redis`` sections (``host`` and
                ``port`` each) and the optional ``kubernetes.enabled`` flag.

        Raises:
            OSError: If ``config_file`` cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
            KeyError: If a ``consul`` or ``redis`` setting is missing.
        """
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(config_file, 'r', encoding='utf-8') as f:
            self.config = json.load(f)

        # Initialize service discovery
        self.consul = consul.Consul(
            host=self.config['consul']['host'],
            port=self.config['consul']['port']
        )

        # Initialize shared state
        self.redis = redis.Redis(
            host=self.config['redis']['host'],
            port=self.config['redis']['port'],
            decode_responses=True
        )

        # Always define the Kubernetes handles so later code can test them
        # instead of hitting AttributeError when Kubernetes is disabled.
        self.k8s_v1 = None
        self.k8s_apps = None

        # Initialize Kubernetes client if running in K8s. "or {}" tolerates
        # an explicit null "kubernetes" section in the JSON file.
        if (self.config.get('kubernetes') or {}).get('enabled'):
            config.load_incluster_config()
            self.k8s_v1 = client.CoreV1Api()
            self.k8s_apps = client.AppsV1Api()

        self.node_id = socket.gethostname()
        self.is_leader = False
        
    def register_bastion_node(self):
        """Register this bastion node with service discovery.

        Registers the service ``bastion-ssh`` with ID ``bastion-<node_id>``
        on port 22, tagged with the configured environment, with a TCP
        health check against port 22 and ``version``/``capacity``/``load``
        metadata.

        Raises:
            KeyError: If ``environment`` or ``version`` is missing from the
                configuration.
        """
        node_ip = self.get_node_ip()

        # python-consul's register() takes the service name as its first
        # argument and the remaining fields as keywords; passing the whole
        # definition dict positionally would send the dict as the name.
        # NOTE(review): the ``meta`` keyword requires a client version that
        # supports it (update_node_health already relies on it) - confirm.
        self.consul.agent.service.register(
            name='bastion-ssh',
            service_id=f'bastion-{self.node_id}',
            # Registered explicitly because update_load_balancer reads
            # Service.Address when building the backend server list.
            address=node_ip,
            port=22,
            tags=['ssh', 'bastion', self.config['environment']],
            check={
                'TCP': f"{node_ip}:22",
                'Interval': '10s',
                'Timeout': '2s'
            },
            meta={
                'version': self.config['version'],
                'capacity': str(self.get_node_capacity()),
                'load': '0'
            }
        )
        
    def update_node_health(self):
        """Collect this node's health metrics and publish them.

        The metrics are written to the Consul service metadata (``load`` and
        ``sessions``) and, as JSON, to the Redis hash
        ``bastion:nodes:health`` under this node's ID.

        Returns:
            dict: ``cpu_usage``, ``memory_usage``, ``active_sessions`` and
            ``load_score`` for this node.
        """
        metrics = {
            'cpu_usage': self.get_cpu_usage(),
            'memory_usage': self.get_memory_usage(),
            'active_sessions': self.get_active_sessions(),
            'load_score': self.calculate_load_score()
        }

        # Update in Consul. Registering an existing service ID replaces the
        # stored definition, so address, port, tags and the static metadata
        # are sent again; otherwise they would be lost on the first update.
        # NOTE(review): the health check is not re-sent on the assumption
        # that Consul keeps existing checks unless "replace-existing-checks"
        # is requested - confirm against the Consul version in use.
        self.consul.agent.service.register(
            name='bastion-ssh',
            service_id=f'bastion-{self.node_id}',
            address=self.get_node_ip(),
            port=22,
            tags=['ssh', 'bastion', self.config['environment']],
            meta={
                'version': self.config['version'],
                'capacity': str(self.get_node_capacity()),
                'load': str(metrics['load_score']),
                'sessions': str(metrics['active_sessions'])
            }
        )

        # Update in Redis for quick access
        self.redis.hset(
            'bastion:nodes:health',
            self.node_id,
            json.dumps(metrics)
        )

        # NOTE(review): this TTL applies to the whole hash, not to this
        # node's field. Any node that keeps reporting refreshes it, so a
        # failed node's last entry stays until every node stops updating.
        self.redis.expire('bastion:nodes:health', 30)

        return metrics
        
    def elect_leader(self):
        """Participate in leader election.

        Tries to acquire the Consul KV lock ``bastion/leader`` with this
        node's session. Sets ``self.is_leader`` to the outcome and, when the
        lock is held, runs the leader duties.

        The session is created once and reused on later calls. Creating a
        new session on every call would leak sessions and would make the
        current leader fail to re-acquire a lock still held by its own
        previous session.
        """
        # getattr keeps this method usable even if the attribute was never
        # initialised elsewhere.
        session_id = getattr(self, '_leader_session_id', None)

        if session_id is not None:
            # Renew before doing any work so the 15 s TTL does not run out
            # while the leader duties are executing.
            try:
                self.consul.session.renew(session_id)
            except consul.NotFound:
                # The session expired or was invalidated; start a new one.
                session_id = None

        if session_id is None:
            session_id = self.consul.session.create(
                name=f'bastion-leader-{self.node_id}',
                ttl=15,
                behavior='delete'
            )
            self._leader_session_id = session_id

        # Try to acquire (or keep) the leader lock
        acquired = self.consul.kv.put(
            'bastion/leader',
            self.node_id,
            acquire=session_id
        )

        self.is_leader = bool(acquired)
        if self.is_leader:
            self.perform_leader_duties()
        
    def perform_leader_duties(self):
        """Perform duties as leader node"""
        # Monitor cluster health
        self.monitor_cluster_health()
        
        # Rebalance connections if needed
        self.rebalance_connections()
        
        # Update DNS/load balancer
        self.update_load_balancer()
        
        # Clean up stale sessions
        self.cleanup_stale_sessions()
        
    def monitor_cluster_health(self):
        """Monitor health of all bastion nodes"""
        nodes = self.consul.health.service('bastion-ssh')[1]
        
        healthy_nodes = []
        unhealthy_nodes = []
        
        for node in nodes:
            node_id = node['Service']['ID']
            checks = node['Checks']
            
            # Check if all health checks pass
            if all(check['Status'] == 'passing' for check in checks):
                healthy_nodes.append(node_id)
            else:
                unhealthy_nodes.append(node_id)
                
        # Alert on unhealthy nodes
        if unhealthy_nodes:
            self.send_alert(
                'Unhealthy bastion nodes detected',
                {'nodes': unhealthy_nodes}
            )
            
        # Scale if needed
        if len(healthy_nodes) < self.config['min_nodes']:
            self.scale_up()
        elif len(healthy_nodes) > self.config['max_nodes']:
            self.scale_down()
            
    def rebalance_connections(self):
        """Rebalance SSH connections across nodes"""
        nodes_health = {}
        
        # Get all nodes health
        for node_id, health_json in self.redis.hgetall('bastion:nodes:health').items():
            nodes_health[node_id] = json.loads(health_json)
            
        # Find overloaded nodes
        avg_load = sum(h['load_score'] for h in nodes_health.values()) / len(nodes_health)
        
        overloaded = [
            node_id for node_id, health in nodes_health.items()
            if health['load_score'] > avg_load * 1.5
        ]
        
        if overloaded:
            # Update load balancer weights
            self.adjust_load_balancer_weights(nodes_health)
            
            # Optionally migrate sessions
            if self.config.get('enable_session_migration'):
                self.migrate_sessions_from_overloaded(overloaded)
                
    def scale_up(self):
        """Scale up bastion hosts"""
        if self.config['kubernetes']['enabled']:
            # Scale Kubernetes deployment
            deployment = self.k8s_apps.read_namespaced_deployment(
                name='bastion-ssh',
                namespace=self.config['kubernetes']['namespace']
            )
            
            current_replicas = deployment.spec.replicas
            new_replicas = min(
                current_replicas + 1,
                self.config['max_nodes']
            )
            
            deployment.spec.replicas = new_replicas
            
            self.k8s_apps.patch_namespaced_deployment(
                name='bastion-ssh',
                namespace=self.config['kubernetes']['namespace'],
                body=deployment
            )
            
        else:
            # Trigger cloud provider auto-scaling
            self.trigger_cloud_scaling('up')
            
    def update_load_balancer(self):
        """Regenerate the HAProxy configuration from Consul and reload it.

        Every passing ``bastion-ssh`` instance becomes one backend server
        whose weight is ``100 - load``, where ``load`` comes from the
        service metadata. The configuration is written to
        ``/etc/haproxy/haproxy.cfg`` and HAProxy is reloaded via systemctl.

        Raises:
            OSError: If the configuration file cannot be written.
            subprocess.CalledProcessError: If the HAProxy reload fails.
        """
        # Get healthy nodes from Consul
        services = self.consul.health.service('bastion-ssh', passing=True)[1]

        healthy_nodes = []
        for entry in services:
            service = entry['Service']

            # Meta may be absent or null for services registered without it.
            meta = service.get('Meta') or {}
            try:
                load = int(float(meta.get('load', '0')))
            except (TypeError, ValueError):
                load = 0

            healthy_nodes.append({
                'id': service['ID'],
                # An empty service address means "use the node's address".
                'address': service.get('Address') or entry['Node']['Address'],
                'port': service['Port'],
                # Clamp so an out-of-range load cannot produce a negative
                # (invalid) HAProxy weight.
                'weight': max(0, min(100, 100 - load)),
            })

        # Update HAProxy configuration
        haproxy_config = self.generate_haproxy_config(healthy_nodes)

        # Write next to the target and rename into place, so HAProxy never
        # reads a half-written file.
        config_path = '/etc/haproxy/haproxy.cfg'
        temp_path = config_path + '.tmp'
        with open(temp_path, 'w') as f:
            f.write(haproxy_config)
        os.replace(temp_path, config_path)

        # Reload HAProxy; surface a failed reload instead of ignoring it.
        subprocess.run(['systemctl', 'reload', 'haproxy'], check=True)
        
    def generate_haproxy_config(self, nodes):
        """Generate HAProxy configuration"""
        config = """
global
    log /dev/log local0
    log /dev/log local1 notice
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

defaults
    log     global
    mode    tcp
    option  tcplog
    option  dontlognull
    timeout connect 5000
    timeout client  50000
    timeout server  50000

frontend bastion_ssh
    bind *:22
    mode tcp
    default_backend bastion_nodes

backend bastion_nodes
    mode tcp
    balance leastconn
    option tcp-check
    tcp-check connect port 22
"""
        
        for node in nodes:
            config += f"\n    server {node['id']} {node['address']}:{node['port']} "
            config += f"check weight {node['weight']} inter 2000 rise 2 fall 3"
            
        return config
        
    def get_active_sessions(self):
        """Count active SSH sessions"""
        try:
            result = subprocess.run(
                ['who', '-u'],
                capture_output=True,
                text=True
            )
            
            # Count SSH sessions
            sessions = [
                line for line in result.stdout.splitlines()
                if 'pts/' in line
            ]
            
            return len(sessions)
            
        except:
            return 0
            
    def calculate_load_score(self):
        """Calculate node load score (0-100)"""
        cpu = self.get_cpu_usage()
        memory = self.get_memory_usage()
        sessions = self.get_active_sessions()
        max_sessions = self.config['max_sessions_per_node']
        
        # Weighted score
        load_score = (
            cpu * 0.3 +
            memory * 0.3 +
            (sessions / max_sessions * 100) * 0.4
        )
        
        return min(100, int(load_score))

SSH bastion hosts provide critical security infrastructure for protecting access to internal resources. By implementing comprehensive security controls, access management, session recording, and high availability, organizations create robust gateways that enable secure remote access while maintaining visibility and control. Regular updates and monitoring ensure bastion hosts continue providing effective protection as threats and requirements evolve.