High Availability and Scaling
Production bastion hosts must provide reliable access even during failures or high load. Implementing high availability ensures continuous access for critical operations, while load balancing distributes connections across multiple bastion instances.
Deploy a highly available bastion architecture:
#!/usr/bin/env python3
# bastion-ha-manager.py
# High availability manager for bastion hosts
import os
import time
import socket
import subprocess
import json
import requests
from datetime import datetime
import consul
import redis
from kubernetes import client, config
class BastionHAManager:
    def __init__(self, config_file):
        with open(config_file, 'r') as f:
            self.config = json.load(f)
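        # The JSON config file is expected to provide at least the keys used
        # in this class: consul.host/port, redis.host/port, environment,
        # version, min_nodes, max_nodes and max_sessions_per_node, plus the
        # optional kubernetes.enabled, kubernetes.namespace and
        # enable_session_migration settings.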
        # Initialize service discovery
        self.consul = consul.Consul(
            host=self.config['consul']['host'],
            port=self.config['consul']['port']
        )
        # Initialize shared state
        self.redis = redis.Redis(
            host=self.config['redis']['host'],
            port=self.config['redis']['port'],
            decode_responses=True
        )
        # Initialize Kubernetes client if running in K8s
        if self.config.get('kubernetes', {}).get('enabled'):
            config.load_incluster_config()
            self.k8s_v1 = client.CoreV1Api()
            self.k8s_apps = client.AppsV1Api()
        self.node_id = socket.gethostname()
        self.is_leader = False
    def register_bastion_node(self):
        """Register this bastion node with service discovery"""
        node_ip = self.get_node_ip()
        # python-consul's register() takes keyword arguments rather than a raw
        # service-definition dict, so the definition is unpacked explicitly
        self.consul.agent.service.register(
            name='bastion-ssh',
            service_id=f'bastion-{self.node_id}',
            address=node_ip,
            port=22,
            tags=['ssh', 'bastion', self.config['environment']],
            check=consul.Check.tcp(node_ip, 22, '10s', timeout='2s'),
            meta={
                'version': self.config['version'],
                'capacity': str(self.get_node_capacity()),
                'load': '0'
            }
        )
    def update_node_health(self):
        """Update node health metrics"""
        metrics = {
            'cpu_usage': self.get_cpu_usage(),
            'memory_usage': self.get_memory_usage(),
            'active_sessions': self.get_active_sessions(),
            'load_score': self.calculate_load_score()
        }
        # Update in Consul
        self.consul.agent.service.register(
            name='bastion-ssh',
            service_id=f'bastion-{self.node_id}',
            meta={
                'load': str(metrics['load_score']),
                'sessions': str(metrics['active_sessions'])
            }
        )
        # Update in Redis for quick access
        self.redis.hset(
            'bastion:nodes:health',
            self.node_id,
            json.dumps(metrics)
        )
        # Expire the hash so stale health data is dropped if updates stop
        self.redis.expire('bastion:nodes:health', 30)
        return metrics
    def elect_leader(self):
        """Participate in leader election"""
        session_id = self.consul.session.create(
            name=f'bastion-leader-{self.node_id}',
            ttl=15,
            behavior='delete'
        )
        # Try to acquire leader lock
        acquired = self.consul.kv.put(
            'bastion/leader',
            self.node_id,
            acquire=session_id
        )
        if acquired:
            self.is_leader = True
            self.perform_leader_duties()
        else:
            self.is_leader = False
        # Renew session
        self.consul.session.renew(session_id)
    def perform_leader_duties(self):
        """Perform duties as leader node"""
        # Monitor cluster health
        self.monitor_cluster_health()
        # Rebalance connections if needed
        self.rebalance_connections()
        # Update DNS/load balancer
        self.update_load_balancer()
        # Clean up stale sessions
        self.cleanup_stale_sessions()
    def monitor_cluster_health(self):
        """Monitor health of all bastion nodes"""
        nodes = self.consul.health.service('bastion-ssh')[1]
        healthy_nodes = []
        unhealthy_nodes = []
        for node in nodes:
            node_id = node['Service']['ID']
            checks = node['Checks']
            # Check if all health checks pass
            if all(check['Status'] == 'passing' for check in checks):
                healthy_nodes.append(node_id)
            else:
                unhealthy_nodes.append(node_id)
        # Alert on unhealthy nodes
        if unhealthy_nodes:
            self.send_alert(
                'Unhealthy bastion nodes detected',
                {'nodes': unhealthy_nodes}
            )
        # Scale if needed
        if len(healthy_nodes) < self.config['min_nodes']:
            self.scale_up()
        elif len(healthy_nodes) > self.config['max_nodes']:
            self.scale_down()
    def rebalance_connections(self):
        """Rebalance SSH connections across nodes"""
        nodes_health = {}
        # Get all nodes health
        for node_id, health_json in self.redis.hgetall('bastion:nodes:health').items():
            nodes_health[node_id] = json.loads(health_json)
        # Nothing to rebalance if no health data is available yet
        if not nodes_health:
            return
        # Find overloaded nodes
        avg_load = sum(h['load_score'] for h in nodes_health.values()) / len(nodes_health)
        overloaded = [
            node_id for node_id, health in nodes_health.items()
            if health['load_score'] > avg_load * 1.5
        ]
        if overloaded:
            # Update load balancer weights
            self.adjust_load_balancer_weights(nodes_health)
            # Optionally migrate sessions
            if self.config.get('enable_session_migration'):
                self.migrate_sessions_from_overloaded(overloaded)
    def scale_up(self):
        """Scale up bastion hosts"""
        if self.config.get('kubernetes', {}).get('enabled'):
            # Scale Kubernetes deployment
            deployment = self.k8s_apps.read_namespaced_deployment(
                name='bastion-ssh',
                namespace=self.config['kubernetes']['namespace']
            )
            current_replicas = deployment.spec.replicas
            new_replicas = min(
                current_replicas + 1,
                self.config['max_nodes']
            )
            deployment.spec.replicas = new_replicas
            self.k8s_apps.patch_namespaced_deployment(
                name='bastion-ssh',
                namespace=self.config['kubernetes']['namespace'],
                body=deployment
            )
        else:
            # Trigger cloud provider auto-scaling
            self.trigger_cloud_scaling('up')
    def update_load_balancer(self):
        """Update load balancer configuration"""
        healthy_nodes = []
        # Get healthy nodes from Consul
        services = self.consul.health.service('bastion-ssh', passing=True)[1]
        for service in services:
            node_info = {
                'id': service['Service']['ID'],
                'address': service['Service']['Address'],
                'port': service['Service']['Port'],
                'weight': 100 - int(service['Service']['Meta'].get('load', '0'))
            }
            healthy_nodes.append(node_info)
        # Update HAProxy configuration
        haproxy_config = self.generate_haproxy_config(healthy_nodes)
        with open('/etc/haproxy/haproxy.cfg', 'w') as f:
            f.write(haproxy_config)
        # Reload HAProxy
        subprocess.run(['systemctl', 'reload', 'haproxy'])
    def generate_haproxy_config(self, nodes):
        """Generate HAProxy configuration"""
        config = """
global
    log /dev/log local0
    log /dev/log local1 notice
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

defaults
    log global
    mode tcp
    option tcplog
    option dontlognull
    timeout connect 5000
    timeout client 50000
    timeout server 50000

frontend bastion_ssh
    bind *:22
    mode tcp
    default_backend bastion_nodes

backend bastion_nodes
    mode tcp
    balance leastconn
    option tcp-check
    tcp-check connect port 22
"""
        for node in nodes:
            config += f"\n    server {node['id']} {node['address']}:{node['port']} "
            config += f"check weight {node['weight']} inter 2000 rise 2 fall 3"
        return config
    def get_active_sessions(self):
        """Count active SSH sessions"""
        try:
            result = subprocess.run(
                ['who', '-u'],
                capture_output=True,
                text=True
            )
            # Count SSH sessions
            sessions = [
                line for line in result.stdout.splitlines()
                if 'pts/' in line
            ]
            return len(sessions)
        except Exception:
            return 0
    def calculate_load_score(self):
        """Calculate node load score (0-100)"""
        cpu = self.get_cpu_usage()
        memory = self.get_memory_usage()
        sessions = self.get_active_sessions()
        max_sessions = self.config['max_sessions_per_node']
        # Weighted score
        load_score = (
            cpu * 0.3 +
            memory * 0.3 +
            (sessions / max_sessions * 100) * 0.4
        )
        return min(100, int(load_score))
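The class above relies on several helpers that are not shown here. The alerting, scale-down, cloud auto-scaling, weight-adjustment, session-migration and cleanup hooks (send_alert, scale_down, trigger_cloud_scaling, adjust_load_balancer_weights, migrate_sessions_from_overloaded, cleanup_stale_sessions) are deployment-specific and must still be supplied. The metric helpers and an entry point can be sketched, though; the following is a minimal, illustrative version that assumes the psutil package is installed and uses a hypothetical subclass name and configuration path:
import psutil  # assumed extra dependency for CPU/memory metrics

class BastionNode(BastionHAManager):
    """Illustrative subclass supplying the metric helpers used above."""

    def get_node_ip(self):
        # Pick the IP of the interface that would route externally
        # (no packets are actually sent by this connect on a UDP socket)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(('8.8.8.8', 80))
            return s.getsockname()[0]
        finally:
            s.close()

    def get_node_capacity(self):
        # Advertise the configured per-node session limit as capacity
        return self.config['max_sessions_per_node']

    def get_cpu_usage(self):
        # CPU utilisation as a percentage (0-100)
        return psutil.cpu_percent(interval=1)

    def get_memory_usage(self):
        # Memory utilisation as a percentage (0-100)
        return psutil.virtual_memory().percent

if __name__ == '__main__':
    # Hypothetical config path; adjust for the deployment
    manager = BastionNode('/etc/bastion/ha-config.json')
    manager.register_bastion_node()
    while True:
        manager.update_node_health()
        manager.elect_leader()
        time.sleep(10)
Running one copy of the manager on each bastion node keeps the Consul registrations, the shared health data in Redis and the generated HAProxy configuration in sync, and whichever node currently holds the bastion/leader key carries out the cluster-wide duties.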
SSH bastion hosts provide critical security infrastructure for protecting access to internal resources. By implementing comprehensive security controls, access management, session recording, and high availability, organizations create robust gateways that enable secure remote access while maintaining visibility and control. Regular updates and monitoring ensure bastion hosts continue providing effective protection as threats and requirements evolve.