This extension to our guide adds a comprehensive CI/CD pipeline, Ansible automation, and other Infrastructure as Code (IaC) tooling to streamline the deployment and management of our high-availability database clusters.
Table of Contents
- CI/CD Pipeline Overview
- Terraform Infrastructure Provisioning
- Ansible Automation
- GitOps with ArgoCD
- Helm Charts
- Monitoring and Alerting Integration
- Disaster Recovery Automation
CI/CD Pipeline Overview
Pipeline Architecture
flowchart TD
subgraph "Developer Workflow"
code["Database Config Changes"] --> commit["Git Commit/Push"]
end
subgraph "CI Pipeline"
commit --> lint["Lint YAML/SQL"]
lint --> security["Security Scan"]
security --> test["Unit Tests"]
test --> build["Build Images"]
end
subgraph "CD Pipeline"
build --> dev["Deploy to Dev"]
dev --> integration["Integration Tests"]
integration --> approve["Manual Approval"]
approve --> staging["Deploy to Staging"]
staging --> e2e["End-to-End Tests"]
e2e --> prod_approve["Production Approval"]
prod_approve --> production["Deploy to Production"]
end
subgraph "Monitoring"
production --> monitor["Monitor Deployment"]
monitor --> alert["Alerting"]
end
GitHub Actions CI/CD Workflow
name: Database CI/CD Pipeline
on:
push:
branches: [ main, develop ]
paths:
- 'k8s/**'
- 'mysql/**'
- 'postgres/**'
- 'terraform/**'
- 'ansible/**'
- 'helm/**'
pull_request:
branches: [ main, develop ]
jobs:
lint:
name: Lint Code
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Lint YAML
uses: ibiqlik/action-yamllint@v3
with:
file_or_dir: k8s/ helm/
- name: Lint Terraform
uses: actionshub/terraform-lint@main
with:
directory: terraform/
- name: Lint Ansible
uses: ansible/ansible-lint-action@main
with:
targets: ansible/
security-scan:
name: Security Scan
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@master
with:
scan-type: 'fs'
ignore-unfixed: true
format: 'sarif'
output: 'trivy-results.sarif'
severity: 'CRITICAL,HIGH'
- name: Upload Trivy scan results
uses: github/codeql-action/upload-sarif@v2
with:
sarif_file: 'trivy-results.sarif'
- name: Check Kubernetes manifests with kubesec
run: |
for f in $(find k8s -name "*.yaml" -o -name "*.yml"); do
echo "Scanning $f"
curl -sSX POST --data-binary @$f https://v2.kubesec.io/scan
done
build:
name: Build Images
needs: security-scan
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and push MySQL image
uses: docker/build-push-action@v4
with:
context: ./mysql
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ secrets.DOCKERHUB_USERNAME }}/mysql-ha:latest
- name: Build and push PostgreSQL image
uses: docker/build-push-action@v4
with:
context: ./postgres
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ secrets.DOCKERHUB_USERNAME }}/postgres-ha:latest
deploy-dev:
name: Deploy to Development
needs: build
if: github.ref == 'refs/heads/develop'
runs-on: ubuntu-latest
environment: development
steps:
- uses: actions/checkout@v3
- name: Set up Terraform
uses: hashicorp/setup-terraform@v2
- name: Terraform Init
run: cd terraform/environments/dev && terraform init
- name: Terraform Plan
run: cd terraform/environments/dev && terraform plan -out=tfplan
- name: Terraform Apply
run: cd terraform/environments/dev && terraform apply -auto-approve tfplan
- name: Setup kubeconfig
run: |
echo "${{ secrets.KUBE_CONFIG_DEV }}" > kubeconfig
echo "KUBECONFIG=kubeconfig" >> $GITHUB_ENV
- name: Deploy with Helm
run: |
helm upgrade --install mysql-ha ./helm/mysql-ha -f ./helm/environments/dev/mysql-values.yaml -n database
helm upgrade --install postgres-ha ./helm/postgres-ha -f ./helm/environments/dev/postgres-values.yaml -n database
- name: Run Integration Tests
run: |
cd tests/integration
./run-db-tests.sh
deploy-staging:
name: Deploy to Staging
needs: build
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
environment: staging
steps:
- uses: actions/checkout@v3
- name: Set up Terraform
uses: hashicorp/setup-terraform@v2
- name: Terraform Init
run: cd terraform/environments/staging && terraform init
- name: Terraform Plan
run: cd terraform/environments/staging && terraform plan -out=tfplan
- name: Terraform Apply
run: cd terraform/environments/staging && terraform apply -auto-approve tfplan
- name: Setup kubeconfig
run: |
echo "${{ secrets.KUBE_CONFIG_STAGING }}" > kubeconfig
echo "KUBECONFIG=kubeconfig" >> $GITHUB_ENV
- name: Deploy with Helm
run: |
helm upgrade --install mysql-ha ./helm/mysql-ha -f ./helm/environments/staging/mysql-values.yaml -n database
helm upgrade --install postgres-ha ./helm/postgres-ha -f ./helm/environments/staging/postgres-values.yaml -n database
- name: Run E2E Tests
run: |
cd tests/e2e
./run-db-e2e-tests.sh
deploy-production:
name: Deploy to Production
needs: deploy-staging
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
environment: production
steps:
- uses: actions/checkout@v3
- name: Set up ArgoCD CLI
run: |
curl -sSL -o argocd https://github.com/argoproj/argo-cd/releases/download/v2.7.0/argocd-linux-amd64
chmod +x argocd
mv argocd /usr/local/bin/
- name: Login to ArgoCD
run: |
argocd login ${{ secrets.ARGOCD_SERVER }} --username ${{ secrets.ARGOCD_USERNAME }} --password ${{ secrets.ARGOCD_PASSWORD }}
- name: Sync Applications
run: |
argocd app sync mysql-ha-prod
argocd app sync postgres-ha-prod
- name: Verify Deployment
run: |
argocd app wait mysql-ha-prod --health
argocd app wait postgres-ha-prod --health
Terraform Infrastructure Provisioning
flowchart TD
subgraph "Terraform"
network["Network Module"] --> cluster["Kubernetes Cluster Module"]
cluster --> storage["Storage Module"]
cluster --> monitoring["Monitoring Module"]
cluster --> security["Security Module"]
end
subgraph "Cloud Provider"
vpc["VPC/Virtual Network"]
nodes["Kubernetes Nodes"]
disks["Persistent Disks"]
aws_services["Managed Services"]
end
network --> vpc
cluster --> nodes
storage --> disks
monitoring --> aws_services
security --> aws_services
Base Infrastructure Module
provider "aws" {
region = var.region
}
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "3.19.0"
name = "${var.environment}-db-vpc"
cidr = var.vpc_cidr
azs = var.availability_zones
private_subnets = var.private_subnets
public_subnets = var.public_subnets
enable_nat_gateway = true
single_nat_gateway = var.environment != "production"
tags = {
Environment = var.environment
Terraform = "true"
}
}
module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 19.0"  # v19+ provides manage_aws_auth_configmap and the cluster_name output used below
cluster_name = "${var.environment}-db-cluster"
cluster_version = var.kubernetes_version
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
cluster_endpoint_private_access = true
cluster_endpoint_public_access = true
eks_managed_node_groups = {
general = {
desired_size = 2
min_size = 2
max_size = 5
instance_types = ["t3.large"]
capacity_type = "ON_DEMAND"
labels = {
role = "general"
}
}
database = {
desired_size = 3
min_size = 3
max_size = 7
instance_types = ["r5.xlarge"]
capacity_type = "ON_DEMAND"
labels = {
role = "database"
}
taints = [{
key = "dedicated"
value = "database"
effect = "NO_SCHEDULE"
}]
}
}
manage_aws_auth_configmap = true
tags = {
Environment = var.environment
Terraform = "true"
}
}
# EBS CSI Driver for persistent volumes
resource "aws_iam_policy" "ebs_csi_policy" {
name = "${var.environment}-AmazonEKS_EBS_CSI_Driver_Policy"
description = "Policy for EKS EBS CSI Driver"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"ec2:CreateSnapshot",
"ec2:AttachVolume",
"ec2:DetachVolume",
"ec2:ModifyVolume",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeInstances",
"ec2:DescribeSnapshots",
"ec2:DescribeTags",
"ec2:DescribeVolumes",
"ec2:DescribeVolumesModifications"
]
Resource = "*"
},
{
Effect = "Allow"
Action = [
"ec2:CreateTags"
]
Resource = [
"arn:aws:ec2:*:*:volume/*",
"arn:aws:ec2:*:*:snapshot/*"
]
Condition = {
StringEquals = {
"ec2:CreateAction" = [
"CreateVolume",
"CreateSnapshot"
]
}
}
},
{
Effect = "Allow"
Action = [
"ec2:DeleteTags"
]
Resource = [
"arn:aws:ec2:*:*:volume/*",
"arn:aws:ec2:*:*:snapshot/*"
]
},
{
Effect = "Allow"
Action = [
"ec2:CreateVolume"
]
Resource = "*"
Condition = {
StringLike = {
"aws:RequestTag/kubernetes.io/cluster/*" = "owned"
}
}
},
{
Effect = "Allow"
Action = [
"ec2:DeleteVolume"
]
Resource = "*"
Condition = {
StringLike = {
"ec2:ResourceTag/kubernetes.io/cluster/*" = "owned"
}
}
}
]
})
}
module "ebs_csi_irsa_role" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
role_name = "${var.environment}-ebs-csi-driver"
attach_ebs_csi_policy = true
oidc_providers = {
main = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
}
}
tags = {
Environment = var.environment
Terraform = "true"
}
}
# S3 Bucket for database backups
resource "aws_s3_bucket" "db_backups" {
bucket = "${var.environment}-db-backups-${random_string.suffix.result}"
tags = {
Environment = var.environment
Terraform = "true"
}
}
resource "aws_s3_bucket_versioning" "db_backups" {
bucket = aws_s3_bucket.db_backups.id
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_lifecycle_configuration" "db_backups" {
bucket = aws_s3_bucket.db_backups.id
rule {
id = "backup-lifecycle"
status = "Enabled"
transition {
days = 30
storage_class = "STANDARD_IA"
}
transition {
days = 90
storage_class = "GLACIER"
}
expiration {
days = 365
}
}
}
resource "random_string" "suffix" {
length = 8
special = false
upper = false
}
output "cluster_endpoint" {
description = "Endpoint for EKS control plane"
value = module.eks.cluster_endpoint
}
output "cluster_name" {
description = "Kubernetes Cluster Name"
value = module.eks.cluster_name
}
output "backup_bucket_name" {
description = "S3 bucket name for database backups"
value = aws_s3_bucket.db_backups.bucket
}
Environment-Specific Configuration
provider "aws" {
region = "us-west-2"
}
terraform {
backend "s3" {
bucket = "terraform-state-db-ha"
key = "production/terraform.tfstate"
region = "us-west-2"
}
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.16"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.8"
}
}
}
module "kubernetes_cluster" {
source = "../../modules/kubernetes-cluster"
environment = "production"
region = "us-west-2"
kubernetes_version = "1.25"
vpc_cidr = "10.0.0.0/16"
availability_zones = ["us-west-2a", "us-west-2b", "us-west-2c"]
private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}
provider "kubernetes" {
host = module.kubernetes_cluster.cluster_endpoint
cluster_ca_certificate = base64decode(module.kubernetes_cluster.cluster_certificate_authority_data)
exec {
api_version = "client.authentication.k8s.io/v1beta1"
command = "aws"
args = ["eks", "get-token", "--cluster-name", module.kubernetes_cluster.cluster_name]
}
}
provider "helm" {
kubernetes {
host = module.kubernetes_cluster.cluster_endpoint
cluster_ca_certificate = base64decode(module.kubernetes_cluster.cluster_certificate_authority_data)
exec {
api_version = "client.authentication.k8s.io/v1beta1"
command = "aws"
args = ["eks", "get-token", "--cluster-name", module.kubernetes_cluster.cluster_name]
}
}
}
# Create Kubernetes Namespaces
resource "kubernetes_namespace" "database" {
metadata {
name = "database"
}
}
resource "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
resource "kubernetes_namespace" "argocd" {
metadata {
name = "argocd"
}
}
# Install ArgoCD
resource "helm_release" "argocd" {
name = "argocd"
repository = "https://argoproj.github.io/argo-helm"
chart = "argo-cd"
version = "5.16.2"
namespace = kubernetes_namespace.argocd.metadata[0].name
values = [
file("${path.module}/values/argocd-values.yaml")
]
}
# Install Prometheus and Grafana
resource "helm_release" "prometheus_stack" {
name = "prometheus"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
version = "42.0.3"
namespace = kubernetes_namespace.monitoring.metadata[0].name
values = [
file("${path.module}/values/prometheus-values.yaml")
]
}
# Output information
output "backup_bucket_name" {
description = "S3 bucket name for database backups"
value = module.kubernetes_cluster.backup_bucket_name
}
output "kubeconfig_command" {
description = "Command to get kubeconfig for the cluster"
value = "aws eks update-kubeconfig --region us-west-2 --name ${module.kubernetes_cluster.cluster_name}"
}
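The two helm_release resources above read values files that are not reproduced in this guide. As a rough, hedged illustration, a values/prometheus-values.yaml for the kube-prometheus-stack chart might contain something like the following; all numbers and the storage class are placeholders, not recommendations.
# Illustrative excerpt of values/prometheus-values.yaml (assumed content, not the real file)
grafana:
  adminPassword: changeme            # prefer an existing secret in real deployments
prometheus:
  prometheusSpec:
    retention: 15d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: gp3
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi
alertmanager:
  alertmanagerSpec:
    replicas: 2
Ansible Automation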
flowchart LR
subgraph "Ansible Control Node"
playbooks["Ansible Playbooks"]
inventory["Inventory"]
roles["Roles"]
tasks["Tasks"]
end
subgraph "Target Environments"
dev["Development"]
staging["Staging"]
prod["Production"]
end
playbooks --> roles
roles --> tasks
inventory --> playbooks
playbooks --> dev
playbooks --> staging
playbooks --> prod
Ansible Playbook for Database Management
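The playbook below pulls environment-specific settings from a vars_files entry ({{ env }}_vars.yaml). A minimal sketch of such a file, with purely illustrative names and values (for example ansible/vars/production_vars.yaml), might look like this; the same variables are also used by the operational and disaster-recovery playbooks later in this guide.
# Illustrative ansible/vars/production_vars.yaml; every value here is a placeholder.
aws_region: us-west-2
cluster_name: production-db-cluster      # can also be taken from the Terraform output
backup_bucket: prod-db-backups-12345678
mysql_replicas: 5
postgres_replicas: 3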
---
- name: Setup Database Infrastructure
hosts: localhost
connection: local
become: false
gather_facts: true
vars_files:
- "../vars/{{ env }}_vars.yaml"
tasks:
- name: Ensure Terraform is installed
apt:
name: terraform
state: present
become: true
when: ansible_distribution == 'Ubuntu'
- name: Ensure kubectl is installed
shell: |
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
args:
creates: /usr/local/bin/kubectl
- name: Ensure helm is installed
shell: |
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
args:
creates: /usr/local/bin/helm
- name: Apply Terraform
shell: |
cd ../terraform/environments/{{ env }}
terraform init
terraform apply -auto-approve
register: terraform_output
- name: Parse Terraform outputs
set_fact:
cluster_name: "{{ terraform_output.stdout | regex_search('cluster_name = \"([^\"]*)\"', '\\1') | first }}"
backup_bucket: "{{ terraform_output.stdout | regex_search('backup_bucket_name = \"([^\"]*)\"', '\\1') | first }}"
- name: Configure kubectl
shell: |
aws eks update-kubeconfig --region {{ aws_region }} --name {{ cluster_name }}
- name: Create namespaces
k8s:
state: present
definition:
apiVersion: v1
kind: Namespace
metadata:
name: "{{ item }}"
loop:
- database
- monitoring
- backup
- name: Deploy database secrets
k8s:
state: present
definition: "{{ lookup('template', '../templates/secrets/' ~ item ~ '.yaml.j2') }}"
loop:
- mysql-secrets
- postgres-secrets
no_log: true
- name: Deploy MySQL HA cluster
k8s:
state: present
src: "../templates/mysql/{{ item }}"
loop:
- mysql-configmap.yaml
- mysql-services.yaml
- mysql-statefulset.yaml
- orchestrator-configmap.yaml
- orchestrator-deployment.yaml
- name: Deploy PostgreSQL HA cluster
k8s:
state: present
src: "../templates/postgres/{{ item }}"
loop:
- postgres-configmap.yaml
- etcd-statefulset.yaml
- patroni-statefulset.yaml
- name: Configure Database Backup
k8s:
state: present
definition: "{{ lookup('template', '../templates/backup/' ~ item ~ '.yaml.j2') }}"
loop:
- mysql-backup-cronjob
- postgres-backup-cronjob
- name: Create monitoring dashboards
k8s:
state: present
src: "../templates/monitoring/{{ item }}"
loop:
- mysql-grafana-dashboard.yaml
- postgres-grafana-dashboard.yaml
- db-prometheus-rules.yaml
- name: Verify database deployment
shell: |
kubectl wait --for=condition=Ready pods -l app=mysql -n database --timeout=300s
kubectl wait --for=condition=Ready pods -l app=postgres -n database --timeout=300s
register: db_status
- name: Display deployment status
debug:
msg: "Database deployment complete. Status: {{ db_status.stdout }}"
Ansible Role for Database Operational Tasks
---
- name: Get environment variables
include_vars: "{{ env }}_vars.yaml"
- name: Check MySQL StatefulSet
k8s_info:
api_version: apps/v1
kind: StatefulSet
name: mysql
namespace: database
register: mysql_statefulset
- name: Check PostgreSQL StatefulSet
k8s_info:
api_version: apps/v1
kind: StatefulSet
name: postgres
namespace: database
register: postgres_statefulset
- name: Include specified tasks
include_tasks: "{{ operation }}.yaml"
when: operation is defined
---
- name: Scale MySQL cluster
k8s:
api_version: apps/v1
kind: StatefulSet
name: mysql
namespace: database
definition:
spec:
replicas: "{{ mysql_replicas | default(3) }}"
when: mysql_statefulset.resources is defined and mysql_statefulset.resources | length > 0
- name: Scale PostgreSQL cluster
k8s:
api_version: apps/v1
kind: StatefulSet
name: postgres
namespace: database
definition:
spec:
replicas: "{{ postgres_replicas | default(3) }}"
when: postgres_statefulset.resources is defined and postgres_statefulset.resources | length > 0
- name: Wait for scaling to complete
shell: |
kubectl rollout status statefulset/mysql -n database
kubectl rollout status statefulset/postgres -n database
register: scaling_result
- name: Display scaling result
debug:
msg: "Database scaling complete: {{ scaling_result.stdout }}"
---
- name: Create on-demand MySQL backup
shell: |
kubectl create job --from=cronjob/mysql-backup mysql-backup-manual-{{ ansible_date_time.epoch }} -n database
register: mysql_backup_job
- name: Create on-demand PostgreSQL backup
shell: |
kubectl create job --from=cronjob/postgres-backup postgres-backup-manual-{{ ansible_date_time.epoch }} -n database
register: postgres_backup_job
- name: Wait for backup jobs to complete
shell: |
kubectl wait --for=condition=complete job/{{ item }} -n database --timeout=600s
with_items:
- "mysql-backup-manual-{{ ansible_date_time.epoch }}"
- "postgres-backup-manual-{{ ansible_date_time.epoch }}"
register: backup_status
- name: Show backup job status
debug:
msg: "Backup jobs completed: {{ backup_status.results | map(attribute='stdout') | list }}"
- name: Verify backup files in S3
command: "aws s3 ls s3://{{ backup_bucket }}/{{ item }}-backups/ --recursive"
with_items:
- mysql
- postgres
register: s3_backup_files
- name: Display backup files
debug:
msg: "{{ s3_backup_files.results | map(attribute='stdout') | list }}"
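The on-demand jobs above are cloned from the backup CronJobs that the main playbook renders from templates/backup/*.yaml.j2. Those templates are not shown in this guide; the following is a hedged sketch of what the MySQL one might contain, assuming the backup image bundles both the MySQL client tools and the AWS CLI.
# Hypothetical sketch of templates/backup/mysql-backup-cronjob.yaml.j2; image and wiring are assumptions.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mysql-backup
  namespace: database
spec:
  schedule: "0 1 * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: mysql-backup
            image: mysql:8.0          # assumed to include the AWS CLI in practice
            command:
            - /bin/bash
            - -c
            - |
              ts=$(date +%Y%m%d-%H%M%S)
              mysqldump -h mysql-0.mysql -u root -p"${MYSQL_ROOT_PASSWORD}" \
                --all-databases --single-transaction | gzip > /tmp/mysql-${ts}.sql.gz
              aws s3 cp /tmp/mysql-${ts}.sql.gz s3://${BACKUP_BUCKET}/mysql-backups/mysql-${ts}.sql.gz
            env:
            - name: MYSQL_ROOT_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: mysql-secrets
                  key: root-password
            - name: BACKUP_BUCKET
              value: "{{ backup_bucket }}"
          restartPolicy: OnFailure
GitOps with ArgoCD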
flowchart TD
subgraph "GitOps Workflow"
git["Git Repository"] --> argocd["ArgoCD"]
argocd --> sync["Auto/Manual Sync"]
sync --> k8s["Kubernetes Clusters"]
end
subgraph "Application Deployments"
dev_apps["Dev Applications"]
staging_apps["Staging Applications"]
prod_apps["Production Applications"]
end
k8s --> dev_apps
k8s --> staging_apps
k8s --> prod_apps
dev_apps --> health["Health Monitoring"]
staging_apps --> health
prod_apps --> health
health --> rollback["Auto Rollback"]
rollback --> argocd
ArgoCD Application Definitions
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: mysql-ha-prod
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: database-infra
source:
repoURL: https://github.com/yourusername/db-ha-infrastructure.git
targetRevision: HEAD
path: helm/mysql-ha
helm:
valueFiles:
- ../environments/production/mysql-values.yaml
destination:
server: https://kubernetes.default.svc
namespace: database
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- PruneLast=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
---
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: postgres-ha-prod
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: database-infra
source:
repoURL: https://github.com/yourusername/db-ha-infrastructure.git
targetRevision: HEAD
path: helm/postgres-ha
helm:
valueFiles:
- ../environments/production/postgres-values.yaml
destination:
server: https://kubernetes.default.svc
namespace: database
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=false
- PruneLast=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
ArgoCD Project Configuration
apiVersion: argoproj.io/v1alpha1
kind: AppProject
metadata:
name: database-infra
namespace: argocd
spec:
description: Database infrastructure project
sourceRepos:
- 'https://github.com/yourusername/db-ha-infrastructure.git'
destinations:
- namespace: database
server: https://kubernetes.default.svc
- namespace: monitoring
server: https://kubernetes.default.svc
- namespace: backup
server: https://kubernetes.default.svc
clusterResourceWhitelist:
- group: '*'
kind: '*'
namespaceResourceBlacklist:
- group: ''
kind: ResourceQuota
- group: ''
kind: LimitRange
roles:
- name: project-admin
description: Project administrators
policies:
- p, proj:database-infra:project-admin, applications, *, database-infra/*, allow
groups:
- database-admins
- name: project-readonly
description: Read-only access to the project
policies:
- p, proj:database-infra:project-readonly, applications, get, database-infra/*, allow
groups:
- database-readonly
Helm Charts
flowchart TB
subgraph "Helm Charts"
mysql["MySQL HA Chart"]
postgres["PostgreSQL HA Chart"]
subgraph "MySQL Chart Components"
mysql_values["Values"]
mysql_templates["Templates"]
mysql_helpers["Helpers"]
end
subgraph "PostgreSQL Chart Components"
pg_values["Values"]
pg_templates["Templates"]
pg_helpers["Helpers"]
end
mysql --> mysql_values
mysql --> mysql_templates
mysql --> mysql_helpers
postgres --> pg_values
postgres --> pg_templates
postgres --> pg_helpers
end
subgraph "Environment Values"
dev_values["Development"]
staging_values["Staging"]
prod_values["Production"]
end
mysql_values -.-> dev_values
mysql_values -.-> staging_values
mysql_values -.-> prod_values
pg_values -.-> dev_values
pg_values -.-> staging_values
pg_values -.-> prod_values
MySQL Helm Chart
apiVersion: v2
name: mysql-ha
description: High Availability MySQL Cluster for Kubernetes
type: application
version: 1.0.0
appVersion: "8.0.0"
dependencies:
- name: prometheus-mysql-exporter
version: 1.5.0
repository: https://prometheus-community.github.io/helm-charts
condition: monitoring.enabled
# Default configuration for MySQL HA
image:
repository: mysql
tag: "8.0"
pullPolicy: IfNotPresent
replicaCount: 3
mysql:
rootPassword: "changeme"
replicationPassword: "changeme"
monitorPassword: "changeme"
healthcheckPassword: "changeme"
storage:
size: 10Gi
storageClass: "standard"
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2
memory: 4Gi
monitoring:
enabled: true
backup:
enabled: true
schedule: "0 1 * * *"
retention: 7
s3Bucket: ""
orchestrator:
enabled: true
image:
repository: orchestrator
tag: latest
replicaCount: 1
resources:
requests:
cpu: 200m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- mysql
topologyKey: "kubernetes.io/hostname"
nodeSelector: {}
tolerations: []
# Production value overrides (helm/environments/production/mysql-values.yaml)
replicaCount: 5
mysql:
rootPassword: "${MYSQL_ROOT_PASSWORD}"
replicationPassword: "${MYSQL_REPLICATION_PASSWORD}"
monitorPassword: "${MYSQL_MONITOR_PASSWORD}"
healthcheckPassword: "${MYSQL_HEALTHCHECK_PASSWORD}"
storage:
size: 100Gi
storageClass: "gp3"
resources:
requests:
cpu: 2
memory: 8Gi
limits:
cpu: 4
memory: 16Gi
backup:
enabled: true
schedule: "0 */4 * * *" # Every 4 hours
retention: 30
s3Bucket: "prod-db-backups-12345678"
orchestrator:
replicaCount: 3
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1
memory: 2Gi
nodeSelector:
role: database
tolerations:
- key: "dedicated"
operator: "Equal"
value: "database"
effect: "NoSchedule"
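For orientation, the values above are consumed by the chart's templates, which are not reproduced in this guide. The heavily trimmed sketch below shows how a StatefulSet template might wire in the replica count, storage, and scheduling values; the file name and exact structure are assumptions rather than the actual chart contents.
# Hypothetical excerpt of helm/mysql-ha/templates/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: mysql
spec:
  serviceName: mysql
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: mysql
  template:
    metadata:
      labels:
        app: mysql
    spec:
      nodeSelector: {{- toYaml .Values.nodeSelector | nindent 8 }}
      tolerations: {{- toYaml .Values.tolerations | nindent 8 }}
      affinity: {{- toYaml .Values.affinity | nindent 8 }}
      containers:
      - name: mysql
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
        imagePullPolicy: {{ .Values.image.pullPolicy }}
        resources: {{- toYaml .Values.resources | nindent 10 }}
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: {{ .Values.storage.storageClass }}
      resources:
        requests:
          storage: {{ .Values.storage.size }}
Monitoring and Alerting Integration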
flowchart TB
subgraph "Monitoring Stack"
prometheus["Prometheus"]
grafana["Grafana"]
alertmanager["Alert Manager"]
end
subgraph "Database Exporters"
mysql_exp["MySQL Exporter"]
pg_exp["PostgreSQL Exporter"]
end
subgraph "Alert Channels"
slack["Slack"]
pagerduty["PagerDuty"]
email["Email"]
end
mysql["MySQL Cluster"] --> mysql_exp
postgres["PostgreSQL Cluster"] --> pg_exp
mysql_exp -->|Metrics| prometheus
pg_exp -->|Metrics| prometheus
prometheus -->|Data| grafana
prometheus -->|Alerts| alertmanager
alertmanager -->|Notifications| slack
alertmanager -->|Incidents| pagerduty
alertmanager -->|Reports| email
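The exporters in the diagram are typically discovered by the Prometheus Operator through ServiceMonitor resources. A minimal sketch for the MySQL exporter is shown below; the selector labels and port name are assumptions and must match the exporter's Service and the kube-prometheus-stack release label.
# Illustrative ServiceMonitor for the MySQL exporter; selector labels and port name are assumptions.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: mysql-exporter
  namespace: monitoring
  labels:
    release: prometheus        # must match the kube-prometheus-stack selector
spec:
  namespaceSelector:
    matchNames:
    - database
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-mysql-exporter
  endpoints:
  - port: metrics
    interval: 30s
Prometheus Alert Rules for Databases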
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: database-alerts
namespace: monitoring
labels:
app: prometheus
spec:
groups:
- name: mysql.rules
rules:
- alert: MySQLInstanceDown
expr: mysql_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "MySQL instance {{ $labels.instance }} is down"
description: "MySQL instance has been down for more than 1 minute."
runbook_url: "https://wiki.example.com/mysql/instance-down"
- alert: MySQLReplicationLag
expr: mysql_slave_lag_seconds > 300
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL replication lag on {{ $labels.instance }}"
description: "MySQL replication lag is more than 5 minutes on {{ $labels.instance }}."
runbook_url: "https://wiki.example.com/mysql/replication-lag"
- alert: MySQLHighConnectionUsage
expr: mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL high connection usage on {{ $labels.instance }}"
description: "MySQL connection usage is above 80% on {{ $labels.instance }}."
runbook_url: "https://wiki.example.com/mysql/high-connections"
- name: postgres.rules
rules:
- alert: PostgreSQLInstanceDown
expr: pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "PostgreSQL instance {{ $labels.instance }} is down"
description: "PostgreSQL instance has been down for more than 1 minute."
runbook_url: "https://wiki.example.com/postgres/instance-down"
- alert: PostgreSQLReplicationLag
expr: pg_replication_lag > 300
for: 5m
labels:
severity: warning
annotations:
summary: "PostgreSQL replication lag on {{ $labels.instance }}"
description: "PostgreSQL replication lag is more than 5 minutes on {{ $labels.instance }}."
runbook_url: "https://wiki.example.com/postgres/replication-lag"
- alert: PostgreSQLHighConnectionUsage
expr: pg_stat_activity_count / pg_settings_max_connections * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "PostgreSQL high connection usage on {{ $labels.instance }}"
description: "PostgreSQL connection usage is above 80% on {{ $labels.instance }}."
runbook_url: "https://wiki.example.com/postgres/high-connections"
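The rules above only fire alerts; routing them to the Slack, PagerDuty, and email channels from the diagram is Alertmanager's job. A hedged sketch of a matching Alertmanager configuration follows; every URL, key, and address is a placeholder.
# Illustrative Alertmanager configuration; all endpoints and credentials are placeholders.
route:
  receiver: default-email
  group_by: ['alertname', 'instance']
  routes:
  - matchers:
    - severity = "critical"
    receiver: pagerduty
  - matchers:
    - severity = "warning"
    receiver: slack
receivers:
- name: pagerduty
  pagerduty_configs:
  - routing_key: <pagerduty-integration-key>
- name: slack
  slack_configs:
  - api_url: https://hooks.slack.com/services/XXX/YYY/ZZZ
    channel: '#db-alerts'
    send_resolved: true
- name: default-email
  email_configs:
  - to: dba-team@example.com
    from: alertmanager@example.com
    smarthost: smtp.example.com:587
With the kube-prometheus-stack release installed earlier, a block like this would typically be supplied under the chart's alertmanager.config value.
Disaster Recovery Automation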
flowchart TB
subgraph "Disaster Recovery"
backup["Automated Backups"]
verify["Backup Verification"]
restore["Restore Testing"]
failover["Failover Testing"]
end
subgraph "DR Automation"
ansible["Ansible Playbooks"]
scripts["Restore Scripts"]
automation["CI/CD Pipeline"]
end
backup --> verify
verify --> restore
restore --> failover
ansible --> backup
ansible --> verify
ansible --> restore
ansible --> failover
scripts --> restore
scripts --> failover
automation --> ansible
DR Ansible Playbook
---
- name: Database Disaster Recovery
hosts: localhost
connection: local
gather_facts: false
vars_files:
- "../vars/{{ env }}_vars.yaml"
vars:
dr_action: "{{ action | default('test') }}"
backup_date: "{{ date | default('latest') }}"
target_cluster: "{{ cluster | default('primary') }}"
tasks:
- name: Check for valid DR action
fail:
msg: "Valid actions are 'test', 'restore', 'failover', or 'verify'"
when: dr_action not in ['test', 'restore', 'failover', 'verify']
- name: Configure kubectl for target cluster
shell: |
aws eks update-kubeconfig --region {{ aws_region }} --name {{ cluster_name }}
# Backup verification task
- name: Verify database backups
block:
- name: Check MySQL backups in S3
command: "aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ --recursive"
register: mysql_backups
- name: Check PostgreSQL backups in S3
command: "aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ --recursive"
register: postgres_backups
- name: Create verification pod for MySQL
k8s:
state: present
definition:
apiVersion: v1
kind: Pod
metadata:
name: mysql-backup-verify
namespace: database
spec:
containers:
- name: mysql-verify
image: mysql:8.0
command:
- "/bin/bash"
- "-c"
- |
# Download latest backup
aws s3 cp s3://{{ backup_bucket }}/mysql-backups/$(aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ | sort | tail -n 1 | awk '{print $4}') /tmp/backup.sql.gz
# Verify backup integrity
gunzip -t /tmp/backup.sql.gz
if [ $? -eq 0 ]; then
echo "MySQL backup verified successfully!"
exit 0
else
echo "MySQL backup verification failed!"
exit 1
fi
env:
- name: AWS_REGION
value: "{{ aws_region }}"
restartPolicy: Never
- name: Create verification pod for PostgreSQL
k8s:
state: present
definition:
apiVersion: v1
kind: Pod
metadata:
name: postgres-backup-verify
namespace: database
spec:
containers:
- name: postgres-verify
image: postgres:14
command:
- "/bin/bash"
- "-c"
- |
# Download latest backup
aws s3 cp s3://{{ backup_bucket }}/postgres-backups/$(aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ | sort | tail -n 1 | awk '{print $4}') /tmp/backup.pgdump
# Verify backup integrity
pg_restore -l /tmp/backup.pgdump > /dev/null
if [ $? -eq 0 ]; then
echo "PostgreSQL backup verified successfully!"
exit 0
else
echo "PostgreSQL backup verification failed!"
exit 1
fi
env:
- name: AWS_REGION
value: "{{ aws_region }}"
restartPolicy: Never
- name: Wait for verification pods to finish
shell: |
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/mysql-backup-verify -n database --timeout=300s
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/postgres-backup-verify -n database --timeout=300s
register: verification_status
when: dr_action == 'verify'
# Restore database task
- name: Restore database
block:
- name: Set backup file based on date parameter
set_fact:
# When 'latest' is requested, the value is a shell substitution evaluated inside the restore job at runtime.
mysql_backup_file: "{{ backup_date if backup_date != 'latest' else mysql_latest_lookup }}"
postgres_backup_file: "{{ backup_date if backup_date != 'latest' else postgres_latest_lookup }}"
vars:
mysql_latest_lookup: "$(aws s3 ls s3://{{ backup_bucket }}/mysql-backups/ | sort | tail -n 1 | awk '{print $4}')"
postgres_latest_lookup: "$(aws s3 ls s3://{{ backup_bucket }}/postgres-backups/ | sort | tail -n 1 | awk '{print $4}')"
- name: Scale down database clusters for restore
k8s:
state: present
definition:
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: "{{ item }}"
namespace: database
spec:
replicas: 0
loop:
- mysql
- postgres
- name: Wait for scale down
shell: |
kubectl wait --for=delete pod/mysql-0 -n database --timeout=300s
kubectl wait --for=delete pod/postgres-0 -n database --timeout=300s
- name: Create MySQL restore job
k8s:
state: present
definition:
apiVersion: batch/v1
kind: Job
metadata:
name: mysql-restore-job
namespace: database
spec:
template:
spec:
containers:
- name: mysql-restore
image: mysql:8.0
command:
- "/bin/bash"
- "-c"
- |
# Download backup
aws s3 cp s3://{{ backup_bucket }}/mysql-backups/{{ mysql_backup_file }} /tmp/backup.sql.gz
gunzip /tmp/backup.sql.gz
# Perform restore into PVC
mysql -h mysql-0.mysql -u root -p${MYSQL_ROOT_PASSWORD} < /tmp/backup.sql
env:
- name: MYSQL_ROOT_PASSWORD
valueFrom:
secretKeyRef:
name: mysql-secrets
key: root-password
- name: AWS_REGION
value: "{{ aws_region }}"
restartPolicy: Never
backoffLimit: 3
- name: Create PostgreSQL restore job
k8s:
state: present
definition:
apiVersion: batch/v1
kind: Job
metadata:
name: postgres-restore-job
namespace: database
spec:
template:
spec:
containers:
- name: postgres-restore
image: postgres:14
command:
- "/bin/bash"
- "-c"
- |
# Download backup
aws s3 cp s3://{{ backup_bucket }}/postgres-backups/{{ postgres_backup_file }} /tmp/backup.pgdump
# Perform restore
PGPASSWORD=${POSTGRES_PASSWORD} pg_restore -h postgres-primary -U postgres -d postgres --clean --if-exists /tmp/backup.pgdump
env:
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: postgres-secrets
key: superuser-password
- name: AWS_REGION
value: "{{ aws_region }}"
restartPolicy: Never
backoffLimit: 3
- name: Wait for restore jobs to complete
shell: |
kubectl wait --for=condition=complete job/mysql-restore-job -n database --timeout=600s
kubectl wait --for=condition=complete job/postgres-restore-job -n database --timeout=600s
register: restore_status
- name: Scale up database clusters after restore
k8s:
state: present
definition:
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: "{{ item.name }}"
namespace: database
spec:
replicas: "{{ item.replicas }}"
loop:
- { name: mysql, replicas: 3 }
- { name: postgres, replicas: 3 }
when: dr_action == 'restore'
# Failover testing task
- name: Test failover capabilities
block:
- name: Test MySQL failover
shell: |
# Get current primary
mysql_primary=$(kubectl get pods -l app=mysql -n database -o jsonpath='{.items[0].metadata.name}')
# Simulate primary failure
kubectl delete pod $mysql_primary -n database
# Wait for orchestrator to detect failure and promote new primary
sleep 30
# Verify new primary was promoted
new_primary=$(kubectl exec -it orchestrator-0 -n database -- curl -s http://localhost:3000/api/cluster/alias/mysql-cluster | jq '.[] | select(.IsReadOnly==false) | .Key')
if [ -n "$new_primary" ]; then
echo "MySQL failover test succeeded. New primary: $new_primary"
else
echo "MySQL failover test failed. No new primary detected."
exit 1
fi
register: mysql_failover_result
- name: Test PostgreSQL failover
shell: |
# Get current primary
pg_primary=$(kubectl get pods -l app=postgres,role=master -n database -o jsonpath='{.items[0].metadata.name}')
# Simulate primary failure
kubectl delete pod $pg_primary -n database
# Wait for Patroni to detect failure and promote new primary
sleep 30
# Verify new primary was promoted
new_primary=$(kubectl get pods -l app=postgres,role=master -n database -o jsonpath='{.items[0].metadata.name}')
if [ -n "$new_primary" -a "$new_primary" != "$pg_primary" ]; then
echo "PostgreSQL failover test succeeded. New primary: $new_primary"
else
echo "PostgreSQL failover test failed. No new primary detected."
exit 1
fi
register: postgres_failover_result
when: dr_action == 'test'
- name: Report DR test results
debug:
msg: "{{ item.name }} failover test result: {{ item.result.stdout }}"
loop:
- { name: "MySQL", result: "{{ mysql_failover_result }}" }
- { name: "PostgreSQL", result: "{{ postgres_failover_result }}" }
when: dr_action == 'test'
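The DR diagram links the CI/CD pipeline to these playbooks, so the verification and failover tests can be scheduled rather than run by hand. A hedged sketch of such a scheduled GitHub Actions workflow is shown below; the playbook path, secrets, and schedule are assumptions for illustration.
# Illustrative .github/workflows/dr-drill.yaml; playbook path and secrets are assumptions.
name: Disaster Recovery Drill
on:
  schedule:
    - cron: '0 3 * * 0'        # weekly, Sunday 03:00 UTC
  workflow_dispatch:
jobs:
  dr-drill:
    runs-on: ubuntu-latest
    environment: staging
    steps:
      - uses: actions/checkout@v3
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2
      - name: Install Ansible and Kubernetes client libraries
        run: pip install ansible kubernetes
      - name: Verify backups
        run: ansible-playbook ansible/playbooks/disaster-recovery.yaml -e env=staging -e action=verify
      - name: Run failover test
        run: ansible-playbook ansible/playbooks/disaster-recovery.yaml -e env=staging -e action=test
This comprehensive CI/CD and Infrastructure as Code solution provides a complete framework for deploying, managing, and maintaining high-availability database clusters in Kubernetes with proper disaster recovery capabilities and monitoring.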