# High Availability Setup
This guide covers high availability (HA) configuration for mission-critical systems managed by Lambda Softworks' automation scripts, with the goal of maximizing uptime and reliability.
## High Availability Basics

### Core Concepts

#### Redundancy

- Hardware redundancy
- Service redundancy
- Network redundancy
- Data redundancy

#### Failover

- Automatic failover
- Manual failover
- Failback procedures
- Health monitoring (sketched below)

#### Load Distribution

- Load balancing
- Traffic routing
- Session persistence
- Geographic distribution
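To make the failover concepts concrete, here is a minimal sketch of the detection loop that drives automatic failover: probe a health endpoint on an interval, count consecutive failures, and trigger failover once a threshold is crossed. The health URL and the failover hook are illustrative assumptions; the `ha-setup.sh` tooling below implements the production version of this logic.

```bash
#!/usr/bin/env bash
# Minimal failure-detection loop (illustrative sketch only).
CHECK_CMD="curl -sf http://192.168.1.100/health"  # assumed health endpoint
THRESHOLD=3   # consecutive failures before failover
INTERVAL=5    # seconds between checks

failures=0
while true; do
    if $CHECK_CMD > /dev/null 2>&1; then
        failures=0
    else
        failures=$((failures + 1))
        echo "health check failed ($failures/$THRESHOLD)"
        if [ "$failures" -ge "$THRESHOLD" ]; then
            echo "threshold reached, triggering failover"
            # placeholder: invoke your failover mechanism here
            failures=0
        fi
    fi
    sleep "$INTERVAL"
done
```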
## HA Configuration

### Basic Setup
```bash
# Initialize HA cluster
./ha-setup.sh --init \
    --nodes "node1,node2,node3" \
    --vip 192.168.1.100 \
    --services "web,db,cache"

# Configure monitoring
./ha-setup.sh --monitor \
    --check-interval 5s \
    --timeout 3s \
    --failure-threshold 3
```
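After initialization, it is worth confirming that the virtual IP actually came up. These are standard Linux commands; which node holds the VIP at any given moment depends on cluster state:

```bash
# The VIP (192.168.1.100) should be bound on exactly one node at a time
ip addr show | grep "192.168.1.100"

# And reachable from any host on the subnet
ping -c 3 192.168.1.100
```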
### Advanced Setup
```bash
# Configure advanced HA features
./ha-setup.sh --advanced \
    --quorum 2 \
    --split-brain-detection \
    --automatic-fencing

# Set up disaster recovery
./ha-setup.sh --dr-setup \
    --primary-dc dc1 \
    --secondary-dc dc2 \
    --replication-type sync
```
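The `--quorum 2` value follows the usual majority rule: a partition must hold more than half the nodes to keep running, which is what prevents a split cluster from accepting writes on both sides. For any node count:

```bash
# Majority quorum for an N-node cluster: floor(N/2) + 1
# For the three-node cluster above: 3/2 + 1 = 2, matching --quorum 2
nodes=3
echo $(( nodes / 2 + 1 ))
```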
## Configuration Files

### Basic HA Configuration
```yaml
# /etc/lambdasoftworks/ha/config.yml
cluster:
  name: "production-ha"
  nodes:
    - name: "node1"
      ip: "192.168.1.101"
      role: "primary"
    - name: "node2"
      ip: "192.168.1.102"
      role: "secondary"
    - name: "node3"
      ip: "192.168.1.103"
      role: "secondary"

virtual_ip:
  address: "192.168.1.100"
  netmask: "255.255.255.0"
  interface: "eth0"

services:
  - name: "nginx"
    check_command: "systemctl status nginx"
    restart_command: "systemctl restart nginx"
  - name: "mysql"
    check_command: "mysqladmin ping"
    restart_command: "systemctl restart mysql"
  - name: "redis"
    check_command: "redis-cli ping"
    restart_command: "systemctl restart redis"

monitoring:
  interval: 5s
  timeout: 3s
  failures_before_failover: 3

fencing:
  enabled: true
  methods:
    - type: "ipmi"
    - type: "ssh"
```
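A YAML indentation mistake in this file can silently change the cluster topology, so it pays to syntax-check it before applying. This sketch assumes Python with PyYAML is available on the node:

```bash
# Syntax-check the HA config before loading it (requires PyYAML)
python3 -c "import yaml, sys; yaml.safe_load(open(sys.argv[1]))" \
    /etc/lambdasoftworks/ha/config.yml \
    && echo "config.yml parses cleanly"
```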
### Advanced HA Configuration
```yaml
# /etc/lambdasoftworks/ha/advanced-config.yml
cluster:
  name: "enterprise-ha"
  datacenter:
    primary:
      name: "dc1"
      location: "us-east"
      nodes:
        - name: "dc1-node1"
          ip: "10.0.1.101"
          role: "primary"
        - name: "dc1-node2"
          ip: "10.0.1.102"
          role: "secondary"
    secondary:
      name: "dc2"
      location: "us-west"
      nodes:
        - name: "dc2-node1"
          ip: "10.0.2.101"
          role: "dr-primary"
        - name: "dc2-node2"
          ip: "10.0.2.102"
          role: "dr-secondary"

networking:
  load_balancer:
    type: "haproxy"
    config:
      maxconn: 10000
      timeout:
        connect: 5s
        client: 30s
        server: 30s
  dns:
    provider: "route53"
    ttl: 60
    failover_policy: "latency-based"
  vpn:
    type: "ipsec"
    encryption: "aes256"
    perfect_forward_secrecy: true

storage:
  replication:
    type: "synchronous"
    verify: true
    compression: true
  backup:
    schedule: "0 2 * * *"
    retention: "7d"
    type: "incremental"
  snapshot:
    interval: "1h"
    retention: "24h"

monitoring:
  metrics:
    collection_interval: 10s
    retention: "30d"
  alerts:
    channels:
      - type: "email"
        recipients: ["admin@example.com"]
      - type: "slack"
        webhook: "https://hooks.slack.com/..."
  dashboards:
    - name: "HA Status"
      refresh: 30s
      panels:
        - title: "Node Status"
          type: "status"
        - title: "Service Health"
          type: "health"
        - title: "Replication Lag"
          type: "gauge"

automation:
  failover:
    automatic: true
    check_timeout: 5s
    stabilization_time: 30s
  maintenance:
    window: "sun 02:00-04:00"
    automatic_updates: true
    pre_update_backup: true
```
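With a latency-based Route 53 policy and a 60-second TTL, clients in different regions should resolve the same name to different datacenters, and failover shows up as a changed answer within roughly one TTL. You can observe this with standard `dig` queries (the record name is illustrative):

```bash
# Check which datacenter the latency-based policy currently returns
# (app.example.com is an illustrative record name)
dig +short app.example.com

# Repeat from hosts in different regions, or after a simulated DC
# failure; the answer should move within the 60s TTL window
```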
## Service Configuration

### Web Server HA
```nginx
# /etc/lambdasoftworks/ha/nginx-ha.conf
upstream backend {
    server 192.168.1.101:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.102:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.103:8080 backup;

    keepalive 32;
    keepalive_requests 100;
    keepalive_timeout 60s;
}

server {
    listen 80;
    server_name example.com;

    location / {
        proxy_pass http://backend;
        proxy_next_upstream error timeout http_500;
        proxy_next_upstream_tries 3;
        proxy_next_upstream_timeout 10s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;

        proxy_connect_timeout 5s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }
}
```
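To see the upstream failover behave, stop the backend on one node and watch requests keep succeeding through the remaining servers. The systemd unit name for the backend is an assumption; substitute whatever serves port 8080 in your deployment:

```bash
# Stop the backend on node1 (unit name is an assumption)
ssh 192.168.1.101 'sudo systemctl stop app-backend'

# Requests should keep returning 200: after max_fails=3 failures within
# fail_timeout=30s, nginx marks node1 down and proxy_next_upstream
# retries the remaining servers
for i in $(seq 1 10); do
    curl -s -o /dev/null -w "%{http_code}\n" http://example.com/
done

# Restore the backend when done
ssh 192.168.1.101 'sudo systemctl start app-backend'
```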
### Database HA
```ini
# /etc/lambdasoftworks/ha/mysql-ha.cnf
[mysqld]
# Replication
server-id      = 1
log_bin        = /var/log/mysql/mysql-bin.log
sync_binlog    = 1
binlog_format  = ROW

# Semi-synchronous replication
plugin-load-add              = semisync_master.so
plugin-load-add              = semisync_slave.so
rpl_semi_sync_master_enabled = 1
rpl_semi_sync_slave_enabled  = 1
rpl_semi_sync_master_timeout = 10000

# GTID
gtid_mode                = ON
enforce_gtid_consistency = ON

# Crash safety
innodb_flush_log_at_trx_commit = 1
innodb_flush_method            = O_DIRECT
innodb_file_per_table          = 1

# Monitoring
performance_schema = ON
```
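Once MySQL restarts with this configuration, you can confirm that semi-synchronous replication and GTID mode are actually in effect. The status variables below match the `semisync_master.so` plugin naming used above (MySQL 5.7-era names; 8.0.26+ also ships a source/replica-named plugin):

```bash
# On the primary: semi-sync should be ON with at least one replica attached
mysql -e "SHOW STATUS LIKE 'Rpl_semi_sync_master_status';"
mysql -e "SHOW STATUS LIKE 'Rpl_semi_sync_master_clients';"

# GTID settings should match the config above
mysql -e "SELECT @@gtid_mode, @@enforce_gtid_consistency;"
```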
## Management Commands

### Cluster Management
```bash
# Check cluster status
./ha-manage.sh --status \
    --cluster production-ha \
    --verbose

# Manual failover
./ha-manage.sh --failover \
    --to-node node2 \
    --services all \
    --force
```
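When a failover is planned rather than forced by an outage, a reasonable drill is to confirm cluster health, fail over, and verify the result, using only the commands documented above and omitting `--force` for a routine exercise:

```bash
# 1. Confirm the cluster is healthy before moving anything
./ha-manage.sh --status --cluster production-ha --verbose

# 2. Fail over all services to node2
./ha-manage.sh --failover --to-node node2 --services all

# 3. Verify that node2 is now serving and no services are degraded
./ha-manage.sh --status --cluster production-ha --verbose
```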
### Service Management
```bash
# Service health check
./ha-manage.sh --check-health \
    --services all \
    --timeout 5s

# Service maintenance
./ha-manage.sh --maintenance \
    --service mysql \
    --node node1 \
    --duration 30m
```
## Monitoring and Alerts

### Monitoring Setup
```bash
# Configure monitoring
./ha-monitor.sh --setup \
    --metrics all \
    --interval 10s \
    --retention 30d

# Configure alerts
./ha-monitor.sh --alerts \
    --rules-file alerts.yml \
    --notification-channels "slack,email"
```
### Example Alert Configuration
```yaml
# /etc/lambdasoftworks/ha/alerts.yml
alerts:
  - name: "Node Down"
    condition: "node_status == 'down'"
    severity: "critical"
    channels: ["slack", "email"]

  - name: "Service Degraded"
    condition: "service_health < 80"
    severity: "warning"
    channels: ["slack"]

  - name: "Replication Lag"
    condition: "replication_lag > 30"
    severity: "warning"
    channels: ["email"]
```
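Before wiring these rules into the cluster, you can verify the Slack channel end to end by posting a test message to the incoming webhook. This uses Slack's standard webhook API; the URL below is a placeholder for your real webhook:

```bash
# Send a test alert through the Slack incoming webhook
# (substitute your real webhook URL)
curl -X POST -H 'Content-type: application/json' \
    --data '{"text": "HA test alert: monitoring pipeline check"}' \
    "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
```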
## Best Practices

### Design Principles

#### Redundancy

- Deploy redundant hardware
- Implement service redundancy
- Use multiple network paths
- Configure data replication

#### Monitoring

- Monitor all components
- Set up proactive alerts
- Track performance metrics
- Log all events

#### Documentation

- Document configurations
- Create runbooks
- Maintain change logs
- Test procedures
### Operational Guidelines

#### Testing

- Regular failover tests
- DR scenario testing
- Configuration validation
- Performance testing

#### Maintenance

- Scheduled maintenance
- Rolling updates
- Backup verification
- Security patches

#### Recovery

- Documented procedures
- Regular drills
- Automated recovery
- Manual procedures
## Troubleshooting

### Common Issues

#### Split Brain
```bash
# Detect split brain
./ha-manage.sh --check-split-brain \
    --cluster production-ha

# Resolve split brain
./ha-manage.sh --resolve-split-brain \
    --preferred-node node1
```
#### Replication Issues
```bash
# Check replication status
./ha-manage.sh --check-replication \
    --service mysql \
    --verbose

# Fix replication
./ha-manage.sh --fix-replication \
    --auto-position \
    --timeout 5m
```
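Before reaching for `--fix-replication`, it helps to look at the replica's own view of the problem. These are standard MySQL commands (`SHOW REPLICA STATUS` is the 8.0.22+ spelling; older servers use `SHOW SLAVE STATUS`):

```bash
# On the replica: check thread state, lag, and the last error
mysql -e "SHOW REPLICA STATUS\G" | grep -E \
    'Replica_IO_Running|Replica_SQL_Running|Seconds_Behind_Source|Last_.*Error'
```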