CI/CD and Infrastructure as Code for HA Database Clusters in Kubernetes

    This extension to our guide incorporates a comprehensive CI/CD pipeline, Ansible automation, and other Infrastructure as Code (IaC) tools to streamline deployment and management of our high-availability database clusters.

    Table of Contents

    1. CI/CD Pipeline Overview
    2. Terraform Infrastructure Provisioning
    3. Ansible Automation
    4. GitOps with ArgoCD
    5. Helm Charts
    6. Monitoring and Alerting Integration
    7. Disaster Recovery Automation

    CI/CD Pipeline Overview

    Pipeline Architecture

    flowchart TD
        subgraph "Developer Workflow"
            code["Database Config Changes"] --> commit["Git Commit/Push"]
        end
    
        subgraph "CI Pipeline"
            commit --> lint["Lint YAML/SQL"]
            lint --> security["Security Scan"]
            security --> test["Unit Tests"]
            test --> build["Build Images"]
        end
    
        subgraph "CD Pipeline"
            build --> dev["Deploy to Dev"]
            dev --> integration["Integration Tests"]
            integration --> approve["Manual Approval"]
            approve --> staging["Deploy to Staging"]
            staging --> e2e["End-to-End Tests"]
            e2e --> prod_approve["Production Approval"]
            prod_approve --> production["Deploy to Production"]
        end
    
        subgraph "Monitoring"
            production --> monitor["Monitor Deployment"]
            monitor --> alert["Alerting"]
        end

    GitHub Actions CI/CD Workflow

    name: Database CI/CD Pipeline
    
    on:
      push:
        branches: [ main, develop ]
        paths:
          - 'k8s/**'
          - 'mysql/**'
          - 'postgres/**'
          - 'terraform/**'
          - 'ansible/**'
          - 'helm/**'
      pull_request:
        branches: [ main, develop ]
    
    jobs:
      lint:
        name: Lint Code
        runs-on: ubuntu-latest
        steps:
          - uses: actions/checkout@v3
    
          - name: Lint YAML
            uses: ibiqlik/action-yamllint@v3
            with:
              file_or_dir: k8s/ helm/
    
          - name: Lint Terraform
            uses: actionshub/terraform-lint@main
            with:
              directory: terraform/
    
          - name: Lint Ansible
            uses: ansible/ansible-lint-action@main
            with:
              targets: ansible/
    
      security-scan:
        name: Security Scan
        needs: lint
        runs-on: ubuntu-latest
        steps:
          - uses: actions/checkout@v3
    
          - name: Run Trivy vulnerability scanner
            uses: aquasecurity/trivy-action@master
            with:
              scan-type: 'fs'
              ignore-unfixed: true
              format: 'sarif'
              output: 'trivy-results.sarif'
              severity: 'CRITICAL,HIGH'
    
          - name: Upload Trivy scan results
            uses: github/codeql-action/upload-sarif@v2
            with:
              sarif_file: 'trivy-results.sarif'
    
          - name: Check Kubernetes manifests with kubesec
            run: |
              for f in $(find k8s -name "*.yaml" -o -name "*.yml"); do
                echo "Scanning $f"
                result=$(curl -sSX POST --data-binary @"$f" https://v2.kubesec.io/scan)
                echo "$result"
                # fail the build if kubesec reports a negative score (critical issues)
                echo "$result" | jq -e '.[0].score >= 0' > /dev/null || exit 1
              done
    
      build:
        name: Build Images
        needs: security-scan
        runs-on: ubuntu-latest
        steps:
          - uses: actions/checkout@v3
    
          - name: Set up Docker Buildx
            uses: docker/setup-buildx-action@v2
    
          - name: Login to DockerHub
            uses: docker/login-action@v2
            with:
              username: ${{ secrets.DOCKERHUB_USERNAME }}
              password: ${{ secrets.DOCKERHUB_TOKEN }}
    
          - name: Build and push MySQL image
            uses: docker/build-push-action@v4
            with:
              context: ./mysql
              push: ${{ github.event_name != 'pull_request' }}
              tags: ${{ secrets.DOCKERHUB_USERNAME }}/mysql-ha:latest
    
          - name: Build and push PostgreSQL image
            uses: docker/build-push-action@v4
            with:
              context: ./postgres
              push: ${{ github.event_name != 'pull_request' }}
              tags: ${{ secrets.DOCKERHUB_USERNAME }}/postgres-ha:latest
    
      deploy-dev:
        name: Deploy to Development
        needs: build
        if: github.ref == 'refs/heads/develop'
        runs-on: ubuntu-latest
        environment: development
        steps:
          - uses: actions/checkout@v3
    
          - name: Set up Terraform
            uses: hashicorp/setup-terraform@v2
    
          - name: Terraform Init
            run: cd terraform/environments/dev && terraform init
    
          - name: Terraform Plan
            run: cd terraform/environments/dev && terraform plan -out=tfplan
    
          - name: Terraform Apply
            run: cd terraform/environments/dev && terraform apply -auto-approve tfplan
    
          - name: Setup kubeconfig
            run: |
              echo "${{ secrets.KUBE_CONFIG_DEV }}" > kubeconfig
              echo "KUBECONFIG=kubeconfig" >> $GITHUB_ENV
    
          - name: Deploy with Helm
            run: |
              helm upgrade --install mysql-ha ./helm/mysql-ha -f ./helm/environments/dev/mysql-values.yaml -n database
              helm upgrade --install postgres-ha ./helm/postgres-ha -f ./helm/environments/dev/postgres-values.yaml -n database
    
          - name: Run Integration Tests
            run: |
              cd tests/integration
              ./run-db-tests.sh
    
      deploy-staging:
        name: Deploy to Staging
        needs: deploy-dev
        if: github.ref == 'refs/heads/develop'
        runs-on: ubuntu-latest
        environment: staging
        steps:
          - uses: actions/checkout@v3
    
          - name: Set up Terraform
            uses: hashicorp/setup-terraform@v2
    
          - name: Terraform Init
            run: cd terraform/environments/staging && terraform init
    
          - name: Terraform Plan
            run: cd terraform/environments/staging && terraform plan -out=tfplan
    
          - name: Terraform Apply
            run: cd terraform/environments/staging && terraform apply -auto-approve tfplan
    
          - name: Setup kubeconfig
            run: |
              echo "${{ secrets.KUBE_CONFIG_STAGING }}" > kubeconfig
              echo "KUBECONFIG=kubeconfig" >> $GITHUB_ENV
    
          - name: Deploy with Helm
            run: |
              helm upgrade --install mysql-ha ./helm/mysql-ha -f ./helm/environments/staging/mysql-values.yaml -n database
              helm upgrade --install postgres-ha ./helm/postgres-ha -f ./helm/environments/staging/postgres-values.yaml -n database
    
          - name: Run E2E Tests
            run: |
              cd tests/e2e
              ./run-db-e2e-tests.sh
    
      deploy-production:
        name: Deploy to Production
        # dev and staging deploy only from develop, so a push to main must not depend
        # on those (skipped) jobs or this job would be skipped as well
        needs: build
        if: github.ref == 'refs/heads/main'
        runs-on: ubuntu-latest
        environment: production
        steps:
          - uses: actions/checkout@v3
    
          - name: Set up ArgoCD CLI
            run: |
              curl -sSL -o argocd https://github.com/argoproj/argo-cd/releases/download/v2.7.0/argocd-linux-amd64
              chmod +x argocd
              sudo mv argocd /usr/local/bin/
    
          - name: Login to ArgoCD
            run: |
              argocd login ${{ secrets.ARGOCD_SERVER }} --username ${{ secrets.ARGOCD_USERNAME }} --password ${{ secrets.ARGOCD_PASSWORD }}
    
          - name: Sync Applications
            run: |
              argocd app sync mysql-ha-prod
              argocd app sync postgres-ha-prod
    
          - name: Verify Deployment
            run: |
              argocd app wait mysql-ha-prod --health
              argocd app wait postgres-ha-prod --health

    Terraform Infrastructure Provisioning

    flowchart TD
        subgraph "Terraform"
            network["Network Module"] --> cluster["Kubernetes Cluster Module"]
            cluster --> storage["Storage Module"]
            cluster --> monitoring["Monitoring Module"]
            cluster --> security["Security Module"]
        end
    
        subgraph "Cloud Provider"
            vpc["VPC/Virtual Network"]
            nodes["Kubernetes Nodes"]
            disks["Persistent Disks"]
            aws_services["Managed Services"]
        end
    
        network --> vpc
        cluster --> nodes
        storage --> disks
        monitoring --> aws_services
        security --> aws_services

    Base Infrastructure Module

    provider "aws" {
      region = var.region
    }
    
    module "vpc" {
      source = "terraform-aws-modules/vpc/aws"
      version = "3.19.0"
    
      name = "${var.environment}-db-vpc"
      cidr = var.vpc_cidr
    
      azs             = var.availability_zones
      private_subnets = var.private_subnets
      public_subnets  = var.public_subnets
    
      enable_nat_gateway = true
      single_nat_gateway = var.environment != "production"
    
      tags = {
        Environment = var.environment
        Terraform   = "true"
      }
    }
    
    module "eks" {
      source = "terraform-aws-modules/eks/aws"
      version = "18.30.3"
    
      cluster_name    = "${var.environment}-db-cluster"
      cluster_version = var.kubernetes_version
    
      vpc_id     = module.vpc.vpc_id
      subnet_ids = module.vpc.private_subnets
    
      cluster_endpoint_private_access = true
      cluster_endpoint_public_access  = true
    
      eks_managed_node_groups = {
        general = {
          desired_size = 2
          min_size     = 2
          max_size     = 5
    
          instance_types = ["t3.large"]
          capacity_type  = "ON_DEMAND"
    
          labels = {
            role = "general"
          }
        }
    
        database = {
          desired_size = 3
          min_size     = 3
          max_size     = 7
    
          instance_types = ["r5.xlarge"]
          capacity_type  = "ON_DEMAND"
    
          labels = {
            role = "database"
          }
    
          taints = [{
            key    = "dedicated"
            value  = "database"
            effect = "NO_SCHEDULE"
          }]
        }
      }
    
      manage_aws_auth_configmap = true
    
      tags = {
        Environment = var.environment
        Terraform   = "true"
      }
    }
    
    # EBS CSI Driver for persistent volumes
    resource "aws_iam_policy" "ebs_csi_policy" {
      name        = "${var.environment}-AmazonEKS_EBS_CSI_Driver_Policy"
      description = "Policy for EKS EBS CSI Driver"
    
      policy = jsonencode({
        Version = "2012-10-17"
        Statement = [
          {
            Effect = "Allow"
            Action = [
              "ec2:CreateSnapshot",
              "ec2:AttachVolume",
              "ec2:DetachVolume",
              "ec2:ModifyVolume",
              "ec2:DescribeAvailabilityZones",
              "ec2:DescribeInstances",
              "ec2:DescribeSnapshots",
              "ec2:DescribeTags",
              "ec2:DescribeVolumes",
              "ec2:DescribeVolumesModifications"
            ]
            Resource = "*"
          },
          {
            Effect = "Allow"
            Action = [
              "ec2:CreateTags"
            ]
            Resource = [
              "arn:aws:ec2:*:*:volume/*",
              "arn:aws:ec2:*:*:snapshot/*"
            ]
            Condition = {
              StringEquals = {
                "ec2:CreateAction" = [
                  "CreateVolume",
                  "CreateSnapshot"
                ]
              }
            }
          },
          {
            Effect = "Allow"
            Action = [
              "ec2:DeleteTags"
            ]
            Resource = [
              "arn:aws:ec2:*:*:volume/*",
              "arn:aws:ec2:*:*:snapshot/*"
            ]
          },
          {
            Effect = "Allow"
            Action = [
              "ec2:CreateVolume"
            ]
            Resource = "*"
            Condition = {
              StringLike = {
                "aws:RequestTag/kubernetes.io/cluster/*" = "owned"
              }
            }
          },
          {
            Effect = "Allow"
            Action = [
              "ec2:DeleteVolume"
            ]
            Resource = "*"
            Condition = {
              StringLike = {
                "ec2:ResourceTag/kubernetes.io/cluster/*" = "owned"
              }
            }
          }
        ]
      })
    }
    
    module "ebs_csi_irsa_role" {
      source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
    
      role_name             = "${var.environment}-ebs-csi-driver"
      attach_ebs_csi_policy = true
    
      oidc_providers = {
        main = {
          provider_arn               = module.eks.oidc_provider_arn
          namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
        }
      }
    
      tags = {
        Environment = var.environment
        Terraform   = "true"
      }
    }
    
    # S3 Bucket for database backups
    resource "aws_s3_bucket" "db_backups" {
      bucket = "${var.environment}-db-backups-${random_string.suffix.result}"
    
      tags = {
        Environment = var.environment
        Terraform   = "true"
      }
    }
    
    resource "aws_s3_bucket_versioning" "db_backups" {
      bucket = aws_s3_bucket.db_backups.id
      versioning_configuration {
        status = "Enabled"
      }
    }
    
    resource "aws_s3_bucket_lifecycle_configuration" "db_backups" {
      bucket = aws_s3_bucket.db_backups.id
    
      rule {
        id     = "backup-lifecycle"
        status = "Enabled"

        # an empty filter applies the rule to every object (required by AWS provider v4)
        filter {}
    
        transition {
          days          = 30
          storage_class = "STANDARD_IA"
        }
    
        transition {
          days          = 90
          storage_class = "GLACIER"
        }
    
        expiration {
          days = 365
        }
      }
    }
    
    resource "random_string" "suffix" {
      length  = 8
      special = false
      upper   = false
    }
    
    output "cluster_endpoint" {
      description = "Endpoint for EKS control plane"
      value       = module.eks.cluster_endpoint
    }
    
    output "cluster_name" {
      description = "Kubernetes Cluster Name"
      value       = module.eks.cluster_name
    }
    
    output "backup_bucket_name" {
      description = "S3 bucket name for database backups"
      value       = aws_s3_bucket.db_backups.bucket
    }
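
    The module above references several input variables defined alongside it; a matching variables.tf would look roughly like the sketch below (the default shown is illustrative, not from the original repository):

    variable "environment" {
      description = "Deployment environment (dev, staging, production)"
      type        = string
    }

    variable "region" {
      description = "AWS region to deploy into"
      type        = string
    }

    variable "kubernetes_version" {
      description = "EKS control plane version"
      type        = string
      default     = "1.25"
    }

    variable "vpc_cidr" {
      description = "CIDR block for the VPC"
      type        = string
    }

    variable "availability_zones" {
      description = "Availability zones to spread subnets across"
      type        = list(string)
    }

    variable "private_subnets" {
      description = "Private subnet CIDRs, one per availability zone"
      type        = list(string)
    }

    variable "public_subnets" {
      description = "Public subnet CIDRs, one per availability zone"
      type        = list(string)
    }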

    Environment-Specific Configuration

    provider "aws" {
      region = "us-west-2"
    }
    
    terraform {
      backend "s3" {
        bucket = "terraform-state-db-ha"
        key    = "production/terraform.tfstate"
        region = "us-west-2"
      }
    
      required_providers {
        aws = {
          source  = "hashicorp/aws"
          version = "~> 4.0"
        }
        kubernetes = {
          source  = "hashicorp/kubernetes"
          version = "~> 2.16"
        }
        helm = {
          source  = "hashicorp/helm"
          version = "~> 2.8"
        }
      }
    }
    
    module "kubernetes_cluster" {
      source = "../../modules/kubernetes-cluster"
    
      environment        = "production"
      region             = "us-west-2"
      kubernetes_version = "1.25"
    
      vpc_cidr           = "10.0.0.0/16"
      availability_zones = ["us-west-2a", "us-west-2b", "us-west-2c"]
      private_subnets    = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
      public_subnets     = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
    }
    
    provider "kubernetes" {
      host                   = module.kubernetes_cluster.cluster_endpoint
      cluster_ca_certificate = base64decode(module.kubernetes_cluster.cluster_certificate_authority_data)
      exec {
        api_version = "client.authentication.k8s.io/v1beta1"
        command     = "aws"
        args        = ["eks", "get-token", "--cluster-name", module.kubernetes_cluster.cluster_name]
      }
    }
    
    provider "helm" {
      kubernetes {
        host                   = module.kubernetes_cluster.cluster_endpoint
        cluster_ca_certificate = base64decode(module.kubernetes_cluster.cluster_certificate_authority_data)
        exec {
          api_version = "client.authentication.k8s.io/v1beta1"
          command     = "aws"
          args        = ["eks", "get-token", "--cluster-name", module.kubernetes_cluster.cluster_name]
        }
      }
    }
    
    # Create Kubernetes Namespaces
    resource "kubernetes_namespace" "database" {
      metadata {
        name = "database"
      }
    }
    
    resource "kubernetes_namespace" "monitoring" {
      metadata {
        name = "monitoring"
      }
    }
    
    resource "kubernetes_namespace" "argocd" {
      metadata {
        name = "argocd"
      }
    }
    
    # Install ArgoCD
    resource "helm_release" "argocd" {
      name       = "argocd"
      repository = "https://argoproj.github.io/argo-helm"
      chart      = "argo-cd"
      version    = "5.16.2"
      namespace  = kubernetes_namespace.argocd.metadata[0].name
    
      values = [
        file("${path.module}/values/argocd-values.yaml")
      ]
    }
    
    # Install Prometheus and Grafana
    resource "helm_release" "prometheus_stack" {
      name       = "prometheus"
      repository = "https://prometheus-community.github.io/helm-charts"
      chart      = "kube-prometheus-stack"
      version    = "42.0.3"
      namespace  = kubernetes_namespace.monitoring.metadata[0].name
    
      values = [
        file("${path.module}/values/prometheus-values.yaml")
      ]
    }
    
    # Output information
    output "backup_bucket_name" {
      description = "S3 bucket name for database backups"
      value       = module.kubernetes_cluster.backup_bucket_name
    }
    
    output "kubeconfig_command" {
      description = "Command to get kubeconfig for the cluster"
      value       = "aws eks update-kubeconfig --region us-west-2 --name ${module.kubernetes_cluster.cluster_name}"
    }

    Ansible Automation

    flowchart LR
        subgraph "Ansible Control Node"
            playbooks["Ansible Playbooks"]
            inventory["Inventory"]
            roles["Roles"]
            tasks["Tasks"]
        end
    
        subgraph "Target Environments"
            dev["Development"]
            staging["Staging"]
            prod["Production"]
        end
    
        playbooks --> roles
        roles --> tasks
        inventory --> playbooks
    
        playbooks --> dev
        playbooks --> staging
        playbooks --> prod
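
    The playbooks in this guide run against localhost and pick an environment through the env variable, so the inventory stays minimal. One possible layout (the inventories/ path and filenames are illustrative, not from the original repository):

    # inventories/production/hosts.yaml
    all:
      hosts:
        localhost:
          ansible_connection: local
      vars:
        env: production
        aws_region: us-west-2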

    Ansible Playbook for Database Management

    ---
    - name: Setup Database Infrastructure
      hosts: localhost
      connection: local
      become: false
      gather_facts: true
    
      vars_files:
        - "../vars/{{ env }}_vars.yaml"
    
      tasks:
        - name: Ensure Terraform is installed
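          # assumes the HashiCorp apt repository is already configured on this host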
          apt:
            name: terraform
            state: present
          become: true
          when: ansible_distribution == 'Ubuntu'
    
        - name: Ensure kubectl is installed
          shell: |
            curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
            chmod +x kubectl
            sudo mv kubectl /usr/local/bin/
          args:
            creates: /usr/local/bin/kubectl
    
        - name: Ensure helm is installed
          shell: |
            curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
          args:
            creates: /usr/local/bin/helm
    
        - name: Apply Terraform
          shell: |
            cd ../terraform/environments/{{ env }}
            terraform init
            terraform apply -auto-approve
          register: terraform_output
    
        - name: Parse Terraform outputs
          set_fact:
            cluster_name: "{{ terraform_output.stdout | regex_search('cluster_name = \"([^\"]*)\"', '\\1') | first }}"
            backup_bucket: "{{ terraform_output.stdout | regex_search('backup_bucket_name = \"([^\"]*)\"', '\\1') | first }}"
    
        - name: Configure kubectl
          shell: |
            aws eks update-kubeconfig --region {{ aws_region }} --name {{ cluster_name }}
    
        - name: Create namespaces
          k8s:
            state: present
            definition:
              apiVersion: v1
              kind: Namespace
              metadata:
                name: "{{ item }}"
          loop:
            - database
            - monitoring
            - backup
    
        - name: Deploy database secrets
          k8s:
            state: present
            definition: "{{ lookup('template', '../templates/secrets/{{ item }}.yaml.j2') }}"
          loop:
            - mysql-secrets
            - postgres-secrets
          no_log: true
    
        - name: Deploy MySQL HA cluster
          k8s:
            state: present
            src: "../templates/mysql/{{ item }}"
          loop:
            - mysql-configmap.yaml
            - mysql-services.yaml
            - mysql-statefulset.yaml
            - orchestrator-configmap.yaml
            - orchestrator-deployment.yaml
    
        - name: Deploy PostgreSQL HA cluster
          k8s:
            state: present
            src: "../templates/postgres/{{ item }}"
          loop:
            - postgres-configmap.yaml
            - etcd-statefulset.yaml
            - patroni-statefulset.yaml
    
        - name: Configure Database Backup
          k8s:
            state: present
            definition: "{{ lookup('template', '../templates/backup/{{ item }}.yaml.j2') }}"
          loop:
            - mysql-backup-cronjob
            - postgres-backup-cronjob
    
        - name: Create monitoring dashboards
          k8s:
            state: present
            src: "../templates/monitoring/{{ item }}"
          loop:
            - mysql-grafana-dashboard.yaml
            - postgres-grafana-dashboard.yaml
            - db-prometheus-rules.yaml
    
        - name: Verify database deployment
          shell: |
            kubectl wait --for=condition=Ready pods -l app=mysql -n database --timeout=300s
            kubectl wait --for=condition=Ready pods -l app=postgres -n database --timeout=300s
          register: db_status
    
        - name: Display deployment status
          debug:
            msg: "Database deployment complete. Status: {{ db_status.stdout }}"
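
    The mysql-backup-cronjob template referenced above is not reproduced in this guide; a minimal sketch of what it might render to (the stock mysql image does not ship the AWS CLI, so a custom image bundling both is assumed):

    apiVersion: batch/v1
    kind: CronJob
    metadata:
      name: mysql-backup
      namespace: database
    spec:
      schedule: "0 1 * * *"
      jobTemplate:
        spec:
          template:
            spec:
              containers:
                - name: backup
                  image: mysql:8.0  # in practice, an image that also bundles the AWS CLI
                  command:
                    - /bin/bash
                    - -c
                    - |
                      # Dump all databases and ship the archive to S3
                      TS=$(date +%F-%H%M)
                      mysqldump -h mysql-0.mysql -u root -p${MYSQL_ROOT_PASSWORD} --all-databases \
                        | gzip > /tmp/backup-${TS}.sql.gz
                      aws s3 cp /tmp/backup-${TS}.sql.gz s3://${BACKUP_BUCKET}/mysql-backups/
                  env:
                    - name: MYSQL_ROOT_PASSWORD
                      valueFrom:
                        secretKeyRef:
                          name: mysql-secrets
                          key: root-password
                    - name: BACKUP_BUCKET
                      value: prod-db-backups-12345678  # from the Terraform output
              restartPolicy: OnFailure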

    Ansible Role for Database Operational Tasks

    ---
    - name: Get environment variables
      include_vars: "{{ env }}_vars.yaml"
    
    - name: Check MySQL StatefulSet
      k8s_info:
        api_version: apps/v1
        kind: StatefulSet
        name: mysql
        namespace: database
      register: mysql_statefulset
    
    - name: Check PostgreSQL StatefulSet
      k8s_info:
        api_version: apps/v1
        kind: StatefulSet
        name: postgres
        namespace: database
      register: postgres_statefulset
    
    - name: Include specified tasks
      include_tasks: "{{ operation }}.yaml"
      when: operation is defined

    Scaling tasks (included when operation=scale)

    ---
    - name: Scale MySQL cluster
      k8s:
        api_version: apps/v1
        kind: StatefulSet
        name: mysql
        namespace: database
        definition:
          spec:
            replicas: "{{ mysql_replicas | default(3) }}"
      when: mysql_statefulset.resources is defined and mysql_statefulset.resources | length > 0
    
    - name: Scale PostgreSQL cluster
      k8s:
        api_version: apps/v1
        kind: StatefulSet
        name: postgres
        namespace: database
        definition:
          spec:
            replicas: "{{ postgres_replicas | default(3) }}"
      when: postgres_statefulset.resources is defined and postgres_statefulset.resources | length > 0
    
    - name: Wait for scaling to complete
      shell: |
        kubectl rollout status statefulset/mysql -n database
        kubectl rollout status statefulset/postgres -n database
      register: scaling_result
    
    - name: Display scaling result
      debug:
        msg: "Database scaling complete: {{ scaling_result.stdout }}"

    On-demand backup tasks (included when operation=backup)

    ---
    - name: Create on-demand MySQL backup
      shell: |
        kubectl create job --from=cronjob/mysql-backup mysql-backup-manual-{{ ansible_date_time.epoch }} -n database
      register: mysql_backup_job
    
    - name: Create on-demand PostgreSQL backup
      shell: |
        kubectl create job --from=cronjob/postgres-backup postgres-backup-manual-{{ ansible_date_time.epoch }} -n database
      register: postgres_backup_job
    
    - name: Wait for backup jobs to complete
      shell: |
        kubectl wait --for=condition=complete job/{{ item }} -n database --timeout=600s
      with_items:
        - "mysql-backup-manual-{{ ansible_date_time.epoch }}"
        - "postgres-backup-manual-{{ ansible_date_time.epoch }}"
      register: backup_status
    
    - name: Show backup job status
      debug:
        msg: "Backup jobs completed: {{ backup_status.results | map(attribute='stdout') | list }}"
    
    - name: Verify backup files in S3
      command: "aws s3 ls s3://{{ backup_bucket }}/{{ item }}-backups/ --recursive"
      with_items:
        - mysql
        - postgres
      register: s3_backup_files
    
    - name: Display backup files
      debug:
        msg: "{{ s3_backup_files.results | map(attribute='stdout') | list }}"
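
    The role's main.yaml dispatches to these task files through the operation variable, so day-two actions reduce to one-liners (playbook name hypothetical):

    # Scale MySQL to five replicas
    ansible-playbook db-operations.yaml -e env=production -e operation=scale -e mysql_replicas=5

    # Trigger on-demand backups of both clusters
    ansible-playbook db-operations.yaml -e env=production -e operation=backup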

    GitOps with ArgoCD

    flowchart TD
        subgraph "GitOps Workflow"
            git["Git Repository"] --> argocd["ArgoCD"]
            argocd --> sync["Auto/Manual Sync"]
            sync --> k8s["Kubernetes Clusters"]
        end
    
        subgraph "Application Deployments"
            dev_apps["Dev Applications"]
            staging_apps["Staging Applications"]
            prod_apps["Production Applications"]
        end
    
        k8s --> dev_apps
        k8s --> staging_apps
        k8s --> prod_apps
    
        dev_apps --> health["Health Monitoring"]
        staging_apps --> health
        prod_apps --> health
    
        health --> rollback["Auto Rollback"]
        rollback --> argocd
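
    When health checks flag a bad sync, a previous revision can also be restored by hand with the ArgoCD CLI (application names as defined below; automated sync must be paused first, since ArgoCD refuses to roll back an auto-synced application):

    argocd app history mysql-ha-prod                  # list deployed revisions
    argocd app set mysql-ha-prod --sync-policy none   # pause auto-sync
    argocd app rollback mysql-ha-prod 5               # roll back to history ID 5 (example)
    argocd app set mysql-ha-prod --sync-policy automated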

    ArgoCD Application Definitions

    apiVersion: argoproj.io/v1alpha1
    kind: Application
    metadata:
      name: mysql-ha-prod
      namespace: argocd
      finalizers:
        - resources-finalizer.argocd.argoproj.io
    spec:
      project: database-infra
      source:
        repoURL: https://github.com/yourusername/db-ha-infrastructure.git
        targetRevision: HEAD
        path: helm/mysql-ha
        helm:
          valueFiles:
            - ../environments/production/mysql-values.yaml
      destination:
        server: https://kubernetes.default.svc
        namespace: database
      syncPolicy:
        automated:
          prune: true
          selfHeal: true
        syncOptions:
          - CreateNamespace=false
          - PruneLast=true
        retry:
          limit: 5
          backoff:
            duration: 5s
            factor: 2
            maxDuration: 3m
    ---
    apiVersion: argoproj.io/v1alpha1
    kind: Application
    metadata:
      name: postgres-ha-prod
      namespace: argocd
      finalizers:
        - resources-finalizer.argocd.argoproj.io
    spec:
      project: database-infra
      source:
        repoURL: https://github.com/yourusername/db-ha-infrastructure.git
        targetRevision: HEAD
        path: helm/postgres-ha
        helm:
          valueFiles:
            - ../environments/production/postgres-values.yaml
      destination:
        server: https://kubernetes.default.svc
        namespace: database
      syncPolicy:
        automated:
          prune: true
          selfHeal: true
        syncOptions:
          - CreateNamespace=false
          - PruneLast=true
        retry:
          limit: 5
          backoff:
            duration: 5s
            factor: 2
            maxDuration: 3m

    ArgoCD Project Configuration

    apiVersion: argoproj.io/v1alpha1
    kind: AppProject
    metadata:
      name: database-infra
      namespace: argocd
    spec:
      description: Database infrastructure project
    
      sourceRepos:
        - 'https://github.com/yourusername/db-ha-infrastructure.git'
    
      destinations:
        - namespace: database
          server: https://kubernetes.default.svc
        - namespace: monitoring
          server: https://kubernetes.default.svc
        - namespace: backup
          server: https://kubernetes.default.svc
    
      clusterResourceWhitelist:
        - group: '*'
          kind: '*'
    
      namespaceResourceBlacklist:
        - group: ''
          kind: ResourceQuota
        - group: ''
          kind: LimitRange
    
      roles:
        - name: project-admin
          description: Project administrators
          policies:
            - p, proj:database-infra:project-admin, applications, *, database-infra/*, allow
          groups:
            - database-admins
    
        - name: project-readonly
          description: Read-only access to the project
          policies:
            - p, proj:database-infra:project-readonly, applications, get, database-infra/*, allow
          groups:
            - database-readonly

    Helm Charts

    flowchart TB
        subgraph "Helm Charts"
            mysql["MySQL HA Chart"]
            postgres["PostgreSQL HA Chart"]
    
            subgraph "MySQL Chart Components"
                mysql_values["Values"]
                mysql_templates["Templates"]
                mysql_helpers["Helpers"]
            end
    
            subgraph "PostgreSQL Chart Components"
                pg_values["Values"]
                pg_templates["Templates"]
                pg_helpers["Helpers"]
            end
    
            mysql --> mysql_values
            mysql --> mysql_templates
            mysql --> mysql_helpers
    
            postgres --> pg_values
            postgres --> pg_templates
            postgres --> pg_helpers
        end
    
        subgraph "Environment Values"
            dev_values["Development"]
            staging_values["Staging"]
            prod_values["Production"]
        end
    
        mysql_values -.-> dev_values
        mysql_values -.-> staging_values
        mysql_values -.-> prod_values
    
        pg_values -.-> dev_values
        pg_values -.-> staging_values
        pg_values -.-> prod_values

    MySQL Helm Chart

    apiVersion: v2
    name: mysql-ha
    description: High Availability MySQL Cluster for Kubernetes
    type: application
    version: 1.0.0
    appVersion: "8.0.0"
    
    dependencies:
      - name: prometheus-mysql-exporter
        version: 1.5.0
        repository: https://prometheus-community.github.io/helm-charts
        condition: monitoring.enabled

    Default values (values.yaml)

    # Default configuration for MySQL HA
    image:
      repository: mysql
      tag: "8.0"
      pullPolicy: IfNotPresent
    
    replicaCount: 3
    
    mysql:
      rootPassword: "changeme"
      replicationPassword: "changeme"
      monitorPassword: "changeme"
      healthcheckPassword: "changeme"
    
    storage:
      size: 10Gi
      storageClass: "standard"
    
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 2
        memory: 4Gi
    
    monitoring:
      enabled: true
    
    backup:
      enabled: true
      schedule: "0 1 * * *"
      retention: 7
      s3Bucket: ""
    
    orchestrator:
      enabled: true
      image:
        repository: orchestrator
        tag: latest
      replicaCount: 1
      resources:
        requests:
          cpu: 200m
          memory: 256Mi
        limits:
          cpu: 500m
          memory: 512Mi
    
    affinity:
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - mysql
            topologyKey: "kubernetes.io/hostname"
    
    nodeSelector: {}
    
    tolerations: []

    Production overrides (helm/environments/production/mysql-values.yaml)

    replicaCount: 5
    
    mysql:
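      # the placeholders below are substituted by the CI pipeline (e.g., with
      # envsubst) before helm upgrade, so real credentials never land in Git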
      rootPassword: "${MYSQL_ROOT_PASSWORD}"
      replicationPassword: "${MYSQL_REPLICATION_PASSWORD}"
      monitorPassword: "${MYSQL_MONITOR_PASSWORD}"
      healthcheckPassword: "${MYSQL_HEALTHCHECK_PASSWORD}"
    
    storage:
      size: 100Gi
      storageClass: "gp3"
    
    resources:
      requests:
        cpu: 2
        memory: 8Gi
      limits:
        cpu: 4
        memory: 16Gi
    
    backup:
      enabled: true
      schedule: "0 */4 * * *"  # Every 4 hours
      retention: 30
      s3Bucket: "prod-db-backups-12345678"
    
    orchestrator:
      replicaCount: 3
      resources:
        requests:
          cpu: 500m
          memory: 1Gi
        limits:
          cpu: 1
          memory: 2Gi
    
    nodeSelector:
      role: database
    
    tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "database"
      effect: "NoSchedule"
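
    Before promoting a values change, the rendered manifests can be validated against the cluster without installing anything (chart and values paths as used elsewhere in this guide):

    # Render the production manifests and run a server-side dry run
    helm template mysql-ha ./helm/mysql-ha \
      -f ./helm/environments/production/mysql-values.yaml \
      --namespace database | kubectl apply --dry-run=server -f -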

    Monitoring and Alerting Integration

    flowchart TB
        subgraph "Monitoring Stack"
            prometheus["Prometheus"]
            grafana["Grafana"]
            alertmanager["Alert Manager"]
        end
    
        subgraph "Database Exporters"
            mysql_exp["MySQL Exporter"]
            pg_exp["PostgreSQL Exporter"]
        end
    
        subgraph "Alert Channels"
            slack["Slack"]
            pagerduty["PagerDuty"]
            email["Email"]
        end
    
        mysql["MySQL Cluster"] --> mysql_exp
        postgres["PostgreSQL Cluster"] --> pg_exp
    
        mysql_exp -->|Metrics| prometheus
        pg_exp -->|Metrics| prometheus
    
        prometheus -->|Data| grafana
        prometheus -->|Alerts| alertmanager
    
        alertmanager -->|Notifications| slack
        alertmanager -->|Incidents| pagerduty
        alertmanager -->|Reports| email

    Prometheus Alert Rules for Databases

    apiVersion: monitoring.coreos.com/v1
    kind: PrometheusRule
    metadata:
      name: database-alerts
      namespace: monitoring
      labels:
        app: prometheus
    spec:
      groups:
      - name: mysql.rules
        rules:
        - alert: MySQLInstanceDown
          expr: mysql_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "MySQL instance {{ $labels.instance }} is down"
            description: "MySQL instance has been down for more than 1 minute."
            runbook_url: "https://wiki.example.com/mysql/instance-down"
    
        - alert: MySQLReplicationLag
          expr: mysql_slave_lag_seconds > 300
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "MySQL replication lag on {{ $labels.instance }}"
            description: "MySQL replication lag is more than 5 minutes on {{ $labels.instance }}."
            runbook_url: "https://wiki.example.com/mysql/replication-lag"
    
        - alert: MySQLHighConnectionUsage
          expr: mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "MySQL high connection usage on {{ $labels.instance }}"
            description: "MySQL connection usage is above 80% on {{ $labels.instance }}."
            runbook_url: "https://wiki.example.com/mysql/high-connections"
    
      - name: postgres.rules
        rules:
        - alert: PostgreSQLInstanceDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance {{ $labels.instance }} is down"
            description: "PostgreSQL instance has been down for more than 1 minute."
            runbook_url: "https://wiki.example.com/postgres/instance-down"
    
        - alert: PostgreSQLReplicationLag
          expr: pg_replication_lag > 300
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL replication lag on {{ $labels.instance }}"
            description: "PostgreSQL replication lag is more than 5 minutes on {{ $labels.instance }}."
            runbook_url: "https://wiki.example.com/postgres/replication-lag"
    
        - alert: PostgreSQLHighConnectionUsage
          expr: pg_stat_activity_count / pg_settings_max_connections * 100 > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL high connection usage on {{ $labels.instance }}"
            description: "PostgreSQL connection usage is above 80% on {{ $labels.instance }}."
            runbook_url: "https://wiki.example.com/postgres/high-connections"
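
    The alert channels shown in the diagram above map onto an Alertmanager routing tree. A minimal sketch of the configuration, which could be supplied through the alertmanager.config key of prometheus-values.yaml (receiver endpoints are placeholders):

    route:
      receiver: email
      routes:
        - match:
            severity: critical
          receiver: pagerduty
        - match:
            severity: warning
          receiver: slack

    receivers:
      - name: pagerduty
        pagerduty_configs:
          - service_key: <pagerduty-integration-key>
      - name: slack
        slack_configs:
          - api_url: https://hooks.slack.com/services/<webhook-path>
            channel: "#db-alerts"
      - name: email
        email_configs:
          - to: dba-team@example.com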

    Disaster Recovery Automation

    flowchart TB
        subgraph "Disaster Recovery"
            backup["Automated Backups"]
            verify["Backup Verification"]
            restore["Restore Testing"]
            failover["Failover Testing"]
        end
    
        subgraph "DR Automation"
            ansible["Ansible Playbooks"]
            scripts["Restore Scripts"]
            automation["CI/CD Pipeline"]
        end
    
        backup --> verify
        verify --> restore
        restore --> failover
    
        ansible --> backup
        ansible --> verify
        ansible --> restore
        ansible --> failover
    
        scripts --> restore
        scripts --> failover
    
        automation --> ansible

    DR Ansible Playbook

    Note: the verification and restore containers below shell out to the AWS CLI, which the stock mysql and postgres images do not include; in practice these tasks would use a custom image that bundles the database client tools with the AWS CLI.
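
    The playbook is driven entirely by extra-vars. Typical invocations (playbook filename hypothetical):

    # Verify that the latest backups are intact
    ansible-playbook dr-playbook.yaml -e env=production -e action=verify

    # Restore a specific backup archive
    ansible-playbook dr-playbook.yaml -e env=production -e action=restore -e date=backup-2024-01-15.sql.gz

    # Exercise failover of both clusters
    ansible-playbook dr-playbook.yaml -e env=production -e action=test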

    ---
    - name: Database Disaster Recovery
      hosts: localhost
      connection: local
      gather_facts: false
    
      vars_files:
        - "../vars/{{ env }}_vars.yaml"
    
      vars:
        dr_action: "{{ action | default('test') }}"
        backup_date: "{{ date | default('latest') }}"
        target_cluster: "{{ cluster | default('primary') }}"
    
      tasks:
        - name: Check for valid DR action
          fail:
            msg: "Valid actions are 'test', 'restore', 'failover', or 'verify'"
          when: dr_action not in ['test', 'restore', 'failover', 'verify']
    
        - name: Configure kubectl for target cluster
          shell: |
            aws eks update-kubeconfig --region {{ aws_region }} --name {{ cluster_name }}
    
        # Backup verification task
        - name: Verify database backups
          block:
            - name: Check MySQL backups in S3
              command: "aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ --recursive"
              register: mysql_backups
    
            - name: Check PostgreSQL backups in S3
              command: "aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ --recursive"
              register: postgres_backups
    
            - name: Create verification pod for MySQL
              k8s:
                state: present
                definition:
                  apiVersion: v1
                  kind: Pod
                  metadata:
                    name: mysql-backup-verify
                    namespace: database
                  spec:
                    containers:
                    - name: mysql-verify
                      image: mysql:8.0
                      command:
                      - "/bin/bash"
                      - "-c"
                      - |
                        # Download latest backup
                        aws s3 cp s3://{{ backup_bucket }}/mysql-backups/$(aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ | sort | tail -n 1 | awk '{print $4}') /tmp/backup.sql.gz
                        # Verify backup integrity
                        gunzip -t /tmp/backup.sql.gz
                        if [ $? -eq 0 ]; then
                          echo "MySQL backup verified successfully!"
                          exit 0
                        else
                          echo "MySQL backup verification failed!"
                          exit 1
                        fi
                      env:
                      - name: AWS_REGION
                        value: "{{ aws_region }}"
                    restartPolicy: Never
    
            - name: Create verification pod for PostgreSQL
              k8s:
                state: present
                definition:
                  apiVersion: v1
                  kind: Pod
                  metadata:
                    name: postgres-backup-verify
                    namespace: database
                  spec:
                    containers:
                    - name: postgres-verify
                      image: postgres:14
                      command:
                      - "/bin/bash"
                      - "-c"
                      - |
                        # Download latest backup
                        aws s3 cp s3://{{ backup_bucket }}/postgres-backups/$(aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ | sort | tail -n 1 | awk '{print $4}') /tmp/backup.pgdump
                        # Verify backup integrity
                        pg_restore -l /tmp/backup.pgdump > /dev/null
                        if [ $? -eq 0 ]; then
                          echo "PostgreSQL backup verified successfully!"
                          exit 0
                        else
                          echo "PostgreSQL backup verification failed!"
                          exit 1
                        fi
                      env:
                      - name: AWS_REGION
                        value: "{{ aws_region }}"
                    restartPolicy: Never
    
            - name: Wait for verification pods to finish
              shell: |
                kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/mysql-backup-verify -n database --timeout=300s
                kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/postgres-backup-verify -n database --timeout=300s
              register: verification_status
          when: dr_action == 'verify'
    
        # Restore database task
        - name: Restore database
          block:
            - name: Set backup file based on date parameter
              set_fact:
                # 'latest' is resolved inside the restore containers, which pick the
                # newest object in the bucket at run time
                mysql_backup_file: "{{ backup_date }}"
                postgres_backup_file: "{{ backup_date }}"
    
            - name: Scale database clusters down to a single replica for restore
              k8s:
                state: present
                definition:
                  apiVersion: apps/v1
                  kind: StatefulSet
                  metadata:
                    name: "{{ item }}"
                    namespace: database
                  spec:
                    # keep pod 0 running so the restore jobs below have a live server to load into
                    replicas: 1
              loop:
                - mysql
                - postgres

            - name: Wait for extra replicas to terminate
              shell: |
                kubectl rollout status statefulset/mysql -n database --timeout=300s
                kubectl rollout status statefulset/postgres -n database --timeout=300s
    
            - name: Create MySQL restore job
              k8s:
                state: present
                definition:
                  apiVersion: batch/v1
                  kind: Job
                  metadata:
                    name: mysql-restore-job
                    namespace: database
                  spec:
                    template:
                      spec:
                        containers:
                        - name: mysql-restore
                          image: mysql:8.0
                          command:
                          - "/bin/bash"
                          - "-c"
                          - |
                            # Resolve 'latest' to the newest object in the bucket
                            BACKUP_FILE="{{ mysql_backup_file }}"
                            if [ "$BACKUP_FILE" = "latest" ]; then
                              BACKUP_FILE=$(aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ | sort | tail -n 1 | awk '{print $4}')
                            fi

                            # Download and unpack the backup
                            aws s3 cp s3://{{ backup_bucket }}/mysql-backups/${BACKUP_FILE} /tmp/backup.sql.gz
                            gunzip /tmp/backup.sql.gz

                            # Restore into the running primary
                            mysql -h mysql-0.mysql -u root -p${MYSQL_ROOT_PASSWORD} < /tmp/backup.sql
                          env:
                          - name: MYSQL_ROOT_PASSWORD
                            valueFrom:
                              secretKeyRef:
                                name: mysql-secrets
                                key: root-password
                          - name: AWS_REGION
                            value: "{{ aws_region }}"
                        restartPolicy: Never
                    backoffLimit: 3
    
            - name: Create PostgreSQL restore job
              k8s:
                state: present
                definition:
                  apiVersion: batch/v1
                  kind: Job
                  metadata:
                    name: postgres-restore-job
                    namespace: database
                  spec:
                    template:
                      spec:
                        containers:
                        - name: postgres-restore
                          image: postgres:14
                          command:
                          - "/bin/bash"
                          - "-c"
                          - |
                            # Resolve 'latest' to the newest object in the bucket
                            BACKUP_FILE="{{ postgres_backup_file }}"
                            if [ "$BACKUP_FILE" = "latest" ]; then
                              BACKUP_FILE=$(aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ | sort | tail -n 1 | awk '{print $4}')
                            fi

                            # Download the backup
                            aws s3 cp s3://{{ backup_bucket }}/postgres-backups/${BACKUP_FILE} /tmp/backup.pgdump

                            # Perform restore
                            PGPASSWORD=${POSTGRES_PASSWORD} pg_restore -h postgres-primary -U postgres -d postgres --clean --if-exists /tmp/backup.pgdump
                          env:
                          - name: POSTGRES_PASSWORD
                            valueFrom:
                              secretKeyRef:
                                name: postgres-secrets
                                key: superuser-password
                          - name: AWS_REGION
                            value: "{{ aws_region }}"
                        restartPolicy: Never
                    backoffLimit: 3
    
            - name: Wait for restore jobs to complete
              shell: |
                kubectl wait --for=condition=complete job/mysql-restore-job -n database --timeout=600s
                kubectl wait --for=condition=complete job/postgres-restore-job -n database --timeout=600s
              register: restore_status
    
            - name: Scale up database clusters after restore
              k8s:
                state: present
                definition:
                  apiVersion: apps/v1
                  kind: StatefulSet
                  metadata:
                    name: "{{ item.name }}"
                    namespace: database
                  spec:
                    replicas: "{{ item.replicas }}"
              loop:
                - { name: mysql, replicas: 3 }
                - { name: postgres, replicas: 3 }
          when: dr_action == 'restore'
    
        # Failover testing task
        - name: Test failover capabilities
          block:
            - name: Test MySQL failover
              shell: |
                # Get current primary
                mysql_primary=$(kubectl get pods -l app=mysql -n database -o jsonpath='{.items[0].metadata.name}')
    
                # Simulate primary failure
                kubectl delete pod $mysql_primary -n database
    
                # Wait for orchestrator to detect failure and promote new primary
                sleep 30
    
                # Verify new primary was promoted
                new_primary=$(kubectl exec deploy/orchestrator -n database -- curl -s http://localhost:3000/api/cluster/alias/mysql-cluster | jq '.[] | select(.IsReadOnly==false) | .Key')
    
                if [ -n "$new_primary" ]; then
                  echo "MySQL failover test succeeded. New primary: $new_primary"
                else
                  echo "MySQL failover test failed. No new primary detected."
                  exit 1
                fi
              register: mysql_failover_result
    
            - name: Test PostgreSQL failover
              shell: |
                # Get current primary
                pg_primary=$(kubectl get pods -l app=postgres,role=master -n database -o jsonpath='{.items[0].metadata.name}')
    
                # Simulate primary failure
                kubectl delete pod $pg_primary -n database
    
                # Wait for Patroni to detect failure and promote new primary
                sleep 30
    
                # Verify new primary was promoted
                new_primary=$(kubectl get pods -l app=postgres,role=master -n database -o jsonpath='{.items[0].metadata.name}')
    
                if [ -n "$new_primary" -a "$new_primary" != "$pg_primary" ]; then
                  echo "PostgreSQL failover test succeeded. New primary: $new_primary"
                else
                  echo "PostgreSQL failover test failed. No new primary detected."
                  exit 1
                fi
              register: postgres_failover_result
          when: dr_action == 'test'
    
        - name: Report DR test results
          debug:
            msg: "{{ item.name }} failover test result: {{ item.result.stdout }}"
          loop:
            - { name: "MySQL", result: "{{ mysql_failover_result }}" }
            - { name: "PostgreSQL", result: "{{ postgres_failover_result }}" }
          when: dr_action == 'test'

    Together, these CI/CD and Infrastructure as Code components form a complete framework for deploying, managing, and maintaining high-availability database clusters in Kubernetes, with disaster recovery and monitoring built in.

