使用 Terraform 实现 AWS EC2 蓝绿部署

通过 Terraform 基础设施即代码,详解 AWS EC2 跨可用区蓝绿部署架构与自动化切换。

什么是蓝绿部署

蓝绿部署是一种降低发布风险的策略,通过同时运行两套完整环境(蓝色和绿色)来实现无缝切换。在传统模式下,部署新版本往往伴随着服务中断,而蓝绿部署可以在几乎零停机的情况下完成版本升级。

架构设计

整体架构

code
                    ┌─────────────────┐
                    │   Route 53      │
                    │   DNS 记录      │
                    └────────┬────────┘
                             │
                    ┌────────▼────────┐
                    │  ALB / CLB      │
                    │  负载均衡器      │
                    └┬───────────────┬┘
                     │               │
        ┌────────────▼──┐     ┌──────▼──────────┐
        │   Blue 环境   │     │    Green 环境    │
        │   (当前生产)  │     │    (待部署)      │
        │  EC2 + ASG    │     │   EC2 + ASG     │
        └───────────────┘     └─────────────────┘

Terraform 目录结构

bash
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── modules/
│   ├── vpc/
│   │   ├── main.tf
│   │   └── variables.tf
│   ├── ec2/
│   │   ├── main.tf
│   │   └── variables.tf
│   └── alb/
│       ├── main.tf
│       └── variables.tf
└── scripts/
    ├── switch.sh
    └── rollback.sh

Terraform 配置实现

1. 版本和提供商配置

hcl
# versions.tf
# Pins the Terraform CLI and AWS provider versions and configures the provider.
terraform {
  # Terraform 1.5.0 or newer is required.
  required_version = ">= 1.5.0"
  
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      # Pessimistic constraint: 5.x releases are accepted, 6.0 is not.
      version = "~> 5.0"
    }
  }
}

# The provider region is taken from the root variable aws_region.
provider "aws" {
  region = var.aws_region
}

2. VPC 模块

hcl
# modules/vpc/main.tf
# VPC that contains both the blue and the green subnets defined below.
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  # Both DNS hostnames and DNS resolution are enabled for the VPC.
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name = "${var.project}-vpc"
  }
}

# Blue subnet in the first zone. Instances launched here receive a public IP.
# NOTE(review): the zone name is built by appending "a" to the region name —
# confirm that zone exists for the target account and region.
resource "aws_subnet" "blue_a" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.blue_subnet_a_cidr
  availability_zone       = "${var.aws_region}a"
  map_public_ip_on_launch = true

  tags = {
    Name = "${var.project}-subnet-blue-a"
    # Slot marks which deployment colour the subnet belongs to.
    Slot = "blue"
  }
}

# Blue subnet in the second zone. Instances launched here receive a public IP.
# NOTE(review): the zone name is built by appending "b" to the region name —
# confirm that zone exists for the target account and region.
resource "aws_subnet" "blue_b" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.blue_subnet_b_cidr
  availability_zone       = "${var.aws_region}b"
  map_public_ip_on_launch = true

  tags = {
    Name = "${var.project}-subnet-blue-b"
    # Slot marks which deployment colour the subnet belongs to.
    Slot = "blue"
  }
}

# Green subnet in the first zone (same zone as blue_a). Instances launched
# here receive a public IP.
# NOTE(review): the zone name is built by appending "a" to the region name —
# confirm that zone exists for the target account and region.
resource "aws_subnet" "green_a" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.green_subnet_a_cidr
  availability_zone       = "${var.aws_region}a"
  map_public_ip_on_launch = true

  tags = {
    Name = "${var.project}-subnet-green-a"
    # Slot marks which deployment colour the subnet belongs to.
    Slot = "green"
  }
}

# Green subnet in the second zone (same zone as blue_b). Instances launched
# here receive a public IP.
# NOTE(review): the zone name is built by appending "b" to the region name —
# confirm that zone exists for the target account and region.
resource "aws_subnet" "green_b" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.green_subnet_b_cidr
  availability_zone       = "${var.aws_region}b"
  map_public_ip_on_launch = true

  tags = {
    Name = "${var.project}-subnet-green-b"
    # Slot marks which deployment colour the subnet belongs to.
    Slot = "green"
  }
}

3. EC2 蓝绿环境配置

hcl
# modules/ec2/main.tf

# Launch template for the blue slot. On first boot the instance appends the
# deployed version to /var/log/app.log, then installs and starts nginx.
resource "aws_launch_template" "blue" {
  name_prefix   = "${var.project}-blue-"
  image_id      = var.ami_id
  instance_type = var.instance_type

  vpc_security_group_ids = var.security_group_ids

  # A heredoc only ends when its marker stands alone on a line, so the closing
  # parenthesis of base64encode() must go on the line after EOF. "<<-" strips
  # the common leading indentation, so "#!/bin/bash" is the first byte sent.
  user_data = base64encode(<<-EOF
    #!/bin/bash
    echo "Blue Environment - Version ${var.blue_version}" >> /var/log/app.log
    yum install -y nginx
    systemctl start nginx
    EOF
  )

  # Tags applied to every instance launched from this template.
  tag_specifications {
    resource_type = "instance"
    tags = {
      Name    = "${var.project}-blue"
      Slot    = "blue"
      Version = var.blue_version
    }
  }

  # name_prefix yields a unique name per template, so a replacement can be
  # created before the old template is destroyed.
  lifecycle {
    create_before_destroy = true
  }
}

# Launch template for the green slot. On first boot the instance appends the
# deployed version to /var/log/app.log, then installs and starts nginx.
resource "aws_launch_template" "green" {
  name_prefix   = "${var.project}-green-"
  image_id      = var.ami_id
  instance_type = var.instance_type

  vpc_security_group_ids = var.security_group_ids

  # A heredoc only ends when its marker stands alone on a line, so the closing
  # parenthesis of base64encode() must go on the line after EOF. "<<-" strips
  # the common leading indentation, so "#!/bin/bash" is the first byte sent.
  user_data = base64encode(<<-EOF
    #!/bin/bash
    echo "Green Environment - Version ${var.green_version}" >> /var/log/app.log
    yum install -y nginx
    systemctl start nginx
    EOF
  )

  # Tags applied to every instance launched from this template.
  tag_specifications {
    resource_type = "instance"
    tags = {
      Name    = "${var.project}-green"
      Slot    = "green"
      Version = var.green_version
    }
  }

  # name_prefix yields a unique name per template, so a replacement can be
  # created before the old template is destroyed.
  lifecycle {
    create_before_destroy = true
  }
}

# Auto Scaling group for the blue slot. It starts at var.desired_capacity
# instances spread over the blue subnets.
resource "aws_autoscaling_group" "blue" {
  name                      = "${var.project}-asg-blue"
  vpc_zone_identifier       = var.blue_subnet_ids
  desired_capacity          = var.desired_capacity
  min_size                  = var.min_size
  max_size                  = var.max_size
  health_check_type         = "ELB"
  health_check_grace_period = 300

  # Register the group's instances with the load balancer target group(s).
  # The "ELB" health check type and any "wait until targets are in service"
  # step rely on the instances actually being registered.
  target_group_arns = var.blue_target_group_arns

  launch_template {
    id      = aws_launch_template.blue.id
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "${var.project}-blue"
    propagate_at_launch = true
  }

  tag {
    key                 = "Slot"
    value               = "blue"
    propagate_at_launch = true
  }

  # The switch/rollback scripts change the desired capacity through the AWS
  # CLI; ignoring it keeps `terraform apply` from undoing a cut-over.
  lifecycle {
    ignore_changes = [desired_capacity]
  }
}

# Optional input, declared next to its only consumer so this change is
# self-contained. The empty default keeps existing callers working; pass the
# blue target group ARN from the root module to enable registration.
variable "blue_target_group_arns" {
  description = "蓝色 ASG 实例要注册到的目标组 ARN 列表"
  type        = list(string)
  default     = []
}

# Auto Scaling group for the green slot. It is created empty (0 instances)
# and is scaled up at cut-over time.
resource "aws_autoscaling_group" "green" {
  name                      = "${var.project}-asg-green"
  vpc_zone_identifier       = var.green_subnet_ids
  desired_capacity          = 0
  min_size                  = 0
  max_size                  = var.max_size
  health_check_type         = "ELB"
  health_check_grace_period = 300

  # Register the group's instances with the load balancer target group(s).
  # The "ELB" health check type and any "wait until targets are in service"
  # step rely on the instances actually being registered.
  target_group_arns = var.green_target_group_arns

  launch_template {
    id      = aws_launch_template.green.id
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "${var.project}-green"
    propagate_at_launch = true
  }

  tag {
    key                 = "Slot"
    value               = "green"
    propagate_at_launch = true
  }

  # The switch/rollback scripts change the desired capacity through the AWS
  # CLI; ignoring it keeps `terraform apply` from scaling green back to 0
  # after a cut-over.
  lifecycle {
    ignore_changes = [desired_capacity]
  }
}

# Optional input, declared next to its only consumer so this change is
# self-contained. The empty default keeps existing callers working; pass the
# green target group ARN from the root module to enable registration.
variable "green_target_group_arns" {
  description = "绿色 ASG 实例要注册到的目标组 ARN 列表"
  type        = list(string)
  default     = []
}

4. 负载均衡器配置

hcl
# modules/alb/main.tf
# Internet-facing Application Load Balancer placed in the subnets passed in
# as public_subnet_ids.
resource "aws_lb" "main" {
  name               = "${var.project}-alb"
  # internal = false makes the load balancer internet-facing.
  internal           = false
  load_balancer_type = "application"
  security_groups     = var.security_group_ids
  subnets            = var.public_subnet_ids

  # Deletion protection is off, so `terraform destroy` can remove the ALB.
  enable_deletion_protection = false

  tags = {
    Name = "${var.project}-alb"
  }
}

# Target group for the blue slot: HTTP on port 80.
resource "aws_lb_target_group" "blue" {
  name     = "${var.project}-tg-blue"
  port     = 80
  protocol = "HTTP"
  vpc_id   = var.vpc_id

  # Probe /health every 30 s with a 5 s timeout; two consecutive results move
  # a target between healthy and unhealthy.
  # NOTE(review): the user_data shown for modules/ec2 only installs stock
  # nginx — confirm something on the instances answers /health successfully,
  # otherwise targets never become healthy.
  health_check {
    enabled             = true
    healthy_threshold   = 2
    unhealthy_threshold = 2
    timeout             = 5
    interval            = 30
    path               = "/health"
  }

  # Load-balancer cookie stickiness with a 3600-second duration.
  stickiness {
    enabled  = true
    type     = "lb_cookie"
    duration = 3600
  }

  tags = {
    Slot = "blue"
  }
}

# Target group for the green slot: HTTP on port 80. Settings mirror the blue
# target group.
resource "aws_lb_target_group" "green" {
  name     = "${var.project}-tg-green"
  port     = 80
  protocol = "HTTP"
  vpc_id   = var.vpc_id

  # Probe /health every 30 s with a 5 s timeout; two consecutive results move
  # a target between healthy and unhealthy.
  # NOTE(review): the user_data shown for modules/ec2 only installs stock
  # nginx — confirm something on the instances answers /health successfully,
  # otherwise targets never become healthy.
  health_check {
    enabled             = true
    healthy_threshold   = 2
    unhealthy_threshold = 2
    timeout             = 5
    interval            = 30
    path               = "/health"
  }

  # Load-balancer cookie stickiness with a 3600-second duration.
  stickiness {
    enabled  = true
    type     = "lb_cookie"
    duration = 3600
  }

  tags = {
    Slot = "green"
  }
}

# HTTP listener on port 80. It is created forwarding to the blue target
# group; scripts/switch.sh and scripts/rollback.sh later repoint it with
# `aws elbv2 modify-listener`.
resource "aws_lb_listener" "blue" {
  load_balancer_arn = aws_lb.main.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.blue.arn
  }

  # The active target group is changed outside Terraform. Without this, the
  # next `terraform apply` would see drift and route traffic back to blue.
  lifecycle {
    ignore_changes = [default_action]
  }
}

# Registers one instance (var.blue_instance_id) with the blue target group on
# port 80.
# NOTE(review): the blue instances are created by an Auto Scaling group in
# modules/ec2, and the root main.tf shown in this article does not pass
# blue_instance_id — confirm this static attachment is still wanted.
resource "aws_lb_target_group_attachment" "blue" {
  target_group_arn = aws_lb_target_group.blue.arn
  target_id        = var.blue_instance_id
  port             = 80
}

5. 变量定义

hcl
# variables.tf
# Root-module inputs. Every variable except ami_id has a default.

# Prefix used in resource names and Name tags.
variable "project" {
  description = "项目名称"
  type        = string
  default     = "blue-green-app"
}

# Region handed to the AWS provider and to the VPC module.
variable "aws_region" {
  description = "AWS 区域"
  type        = string
  default     = "us-east-1"
}

# No default: the AMI must be supplied by the caller (e.g. via a tfvars file).
variable "ami_id" {
  description = "EC2 AMI ID"
  type        = string
}

variable "instance_type" {
  description = "EC2 实例类型"
  type        = string
  default     = "t3.medium"
}

variable "vpc_cidr" {
  description = "VPC CIDR 块"
  type        = string
  default     = "10.0.0.0/16"
}

# Version labels passed to the ec2 module for the blue and green slots.
variable "blue_version" {
  description = "蓝色环境版本"
  type        = string
  default     = "v1.0.0"
}

variable "green_version" {
  description = "绿色环境版本"
  type        = string
  default     = "v1.1.0"
}

# Auto Scaling sizing passed to the ec2 module.
variable "desired_capacity" {
  description = "ASG 期望容量"
  type        = number
  default     = 2
}

variable "min_size" {
  description = "ASG 最小实例数"
  type        = number
  default     = 2
}

variable "max_size" {
  description = "ASG 最大实例数"
  type        = number
  default     = 4
}

6. 主配置文件

hcl
# main.tf
# Root module: wires the vpc, ec2 and alb modules together.

# Network layer: one VPC with two blue and two green /24 subnets.
module "vpc" {
  source = "./modules/vpc"

  project        = var.project
  aws_region     = var.aws_region
  vpc_cidr       = var.vpc_cidr
  blue_subnet_a_cidr = "10.0.1.0/24"
  blue_subnet_b_cidr = "10.0.2.0/24"
  green_subnet_a_cidr = "10.0.3.0/24"
  green_subnet_b_cidr = "10.0.4.0/24"
}

# Compute layer: launch templates and Auto Scaling groups for both slots.
# NOTE(review): module.vpc.<subnet>.id and module.vpc.vpc_id require matching
# outputs in modules/vpc, and module.security_group is referenced but no
# `module "security_group"` block is visible here — confirm both exist.
module "ec2" {
  source = "./modules/ec2"

  project           = var.project
  ami_id            = var.ami_id
  instance_type     = var.instance_type
  blue_version      = var.blue_version
  green_version     = var.green_version
  blue_subnet_ids   = [module.vpc.blue_subnet_a.id, module.vpc.blue_subnet_b.id]
  green_subnet_ids  = [module.vpc.green_subnet_a.id, module.vpc.green_subnet_b.id]
  security_group_ids = [module.security_group.id]
  desired_capacity  = var.desired_capacity
  min_size          = var.min_size
  max_size          = var.max_size
}

# Load balancer layer. The ALB is placed in the two blue subnets.
# NOTE(review): modules/alb reads var.blue_instance_id, which is not passed
# here — confirm it has a default or is no longer needed.
module "alb" {
  source = "./modules/alb"

  project         = var.project
  vpc_id          = module.vpc.vpc_id
  public_subnet_ids = [module.vpc.blue_subnet_a.id, module.vpc.blue_subnet_b.id]
  security_group_ids = [module.security_group.id]
}

自动化切换脚本

切换脚本

bash
#!/bin/bash
# scripts/switch.sh
#
# Cut production traffic over from the blue environment to the green one:
# scale green up, wait until its targets are healthy, repoint the ALB
# listener, then scale blue down.
#
# Required environment variables:
#   PROJECT                 prefix of the ASG names (<PROJECT>-asg-blue/-green)
#   AWS_REGION              region that hosts the ASGs and the ALB
#   GREEN_TARGET_GROUP_ARN  ARN of the green target group
#   LISTENER_ARN            ARN of the ALB listener to repoint
# Optional:
#   GREEN_DESIRED_CAPACITY  number of green instances to start (default: 2)

set -euo pipefail

# Colored output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly NC='\033[0m'

# Fail fast with a clear message instead of calling AWS with empty arguments.
: "${PROJECT:?必须设置 PROJECT}"
: "${AWS_REGION:?必须设置 AWS_REGION}"
: "${GREEN_TARGET_GROUP_ARN:?必须设置 GREEN_TARGET_GROUP_ARN}"
: "${LISTENER_ARN:?必须设置 LISTENER_ARN}"

readonly GREEN_TG_ARN="${GREEN_TARGET_GROUP_ARN}"
readonly GREEN_DESIRED_CAPACITY="${GREEN_DESIRED_CAPACITY:-2}"

# Any failing step aborts the script (set -e); say so explicitly. Traffic is
# only moved in step 4, so a failure before that leaves blue serving.
trap 'echo -e "${RED}切换失败,已中止${NC}" >&2' ERR

echo -e "${GREEN}开始蓝绿环境切换...${NC}"

# 1. Green health is what gates the cut-over; the actual check is the waiter
#    in step 3, which must succeed before the listener is touched.
echo "步骤 1: 验证绿色环境健康状态..."

# 2. Scale the green environment up.
echo "步骤 2: 扩展绿色环境..."
aws autoscaling set-desired-capacity \
    --auto-scaling-group-name "${PROJECT}-asg-green" \
    --desired-capacity "${GREEN_DESIRED_CAPACITY}" \
    --region "${AWS_REGION}"

# 3. Block until every registered green target reports healthy. The waiter
#    belongs to the elbv2 (ALB/NLB) command set, not the classic `elb` one,
#    and exits non-zero on timeout, which aborts the switch.
echo "步骤 3: 等待实例注册完成..."
aws elbv2 wait target-in-service \
    --target-group-arn "${GREEN_TG_ARN}" \
    --region "${AWS_REGION}"

# 4. Point the ALB listener at the green target group.
echo "步骤 4: 修改 ALB 监听器..."
aws elbv2 modify-listener \
    --listener-arn "${LISTENER_ARN}" \
    --default-actions "Type=forward,TargetGroupArn=${GREEN_TG_ARN}" \
    --region "${AWS_REGION}"

# 5. Scale the blue environment down. set-desired-capacity rejects a value
#    below the group's minimum size, so the minimum is lowered in the same
#    call. Terraform still owns min_size and will restore its configured
#    value on the next apply unless the variable is lowered as well.
echo "步骤 5: 缩减蓝色环境..."
aws autoscaling update-auto-scaling-group \
    --auto-scaling-group-name "${PROJECT}-asg-blue" \
    --min-size 0 \
    --desired-capacity 0 \
    --region "${AWS_REGION}"

echo -e "${GREEN}切换完成!${NC}"

回滚脚本

bash
#!/bin/bash
# scripts/rollback.sh
#
# Move production traffic back from green to blue: scale blue up, wait until
# its targets are healthy, repoint the ALB listener, then scale green down.
#
# Required environment variables:
#   PROJECT       prefix of the ASG names (<PROJECT>-asg-blue/-green)
#   AWS_REGION    region that hosts the ASGs and the ALB
#   LISTENER_ARN  ARN of the ALB listener to repoint
#   BLUE_TG_ARN   ARN of the blue target group; BLUE_TARGET_GROUP_ARN is
#                 accepted as an alias to match the naming in switch.sh
# Optional:
#   BLUE_DESIRED_CAPACITY  number of blue instances to start (default: 2)

set -euo pipefail

# Colored output (defined here because this script runs on its own).
readonly RED='\033[0;31m'
readonly NC='\033[0m'

# Fail fast with a clear message instead of calling AWS with empty arguments.
: "${PROJECT:?必须设置 PROJECT}"
: "${AWS_REGION:?必须设置 AWS_REGION}"
: "${LISTENER_ARN:?必须设置 LISTENER_ARN}"
BLUE_TG_ARN="${BLUE_TG_ARN:-${BLUE_TARGET_GROUP_ARN:-}}"
: "${BLUE_TG_ARN:?必须设置 BLUE_TG_ARN 或 BLUE_TARGET_GROUP_ARN}"
readonly BLUE_TG_ARN
readonly BLUE_DESIRED_CAPACITY="${BLUE_DESIRED_CAPACITY:-2}"

echo -e "${RED}开始回滚操作...${NC}"

# 1. Scale the blue environment up.
echo "步骤 1: 扩展蓝色环境..."
aws autoscaling set-desired-capacity \
    --auto-scaling-group-name "${PROJECT}-asg-blue" \
    --desired-capacity "${BLUE_DESIRED_CAPACITY}" \
    --region "${AWS_REGION}"

# 2. Wait for blue to be ready. A fixed sleep cannot tell whether the
#    instances are actually serving, so block on the target group's health
#    instead; a timeout exits non-zero and aborts before traffic is moved.
echo "步骤 2: 等待蓝色环境就绪..."
aws elbv2 wait target-in-service \
    --target-group-arn "${BLUE_TG_ARN}" \
    --region "${AWS_REGION}"

# 3. Point the ALB listener back at the blue target group.
echo "步骤 3: 切换回蓝色环境..."
aws elbv2 modify-listener \
    --listener-arn "${LISTENER_ARN}" \
    --default-actions "Type=forward,TargetGroupArn=${BLUE_TG_ARN}" \
    --region "${AWS_REGION}"

# 4. Scale the green environment down.
echo "步骤 4: 缩减绿色环境..."
aws autoscaling set-desired-capacity \
    --auto-scaling-group-name "${PROJECT}-asg-green" \
    --desired-capacity 0 \
    --region "${AWS_REGION}"

echo -e "${RED}回滚完成!${NC}"

部署流程

完整部署步骤

bash
# 1. Initialize Terraform
terraform init

# 2. Preview the changes
terraform plan -var-file="prod.tfvars"

# 3. Apply the configuration
terraform apply -var-file="prod.tfvars"

# 4. Cut traffic over to the green environment. This moves production
#    traffic, it is not a read-only verification step.
./scripts/switch.sh

# 5. Monitor the green group after the switch (average CPU, last hour, in
#    5-minute buckets). PROJECT must be set in the current shell.
#    `date -d` is GNU date syntax; BSD/macOS date needs `-v-1H` instead.
aws cloudwatch get-metric-statistics \
    --namespace AWS/EC2 \
    --metric-name CPUUtilization \
    --dimensions "Name=AutoScalingGroupName,Value=${PROJECT:?必须设置 PROJECT}-asg-green" \
    --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%S)" \
    --end-time "$(date -u +%Y-%m-%dT%H:%M:%S)" \
    --period 300 \
    --statistics Average

总结

通过 Terraform 实现 AWS EC2 蓝绿部署,我们可以:

  1. 基础设施即代码 - 所有配置版本化管理,易于回溯和审计
  2. 跨可用区高可用 - 分布在多个 AZ,确保故障隔离
  3. 自动化切换 - 一键完成环境切换,最小化人工错误
  4. 快速回滚 - 切换后若保留旧环境的实例,只需将监听器指回旧目标组即可实现秒级回滚;若像本文的切换脚本那样已把旧环境缩容到 0,回滚时还需等待实例重新启动并通过健康检查
  5. 可扩展性 - 通过 ASG 实现自动伸缩,应对流量高峰

蓝绿部署是生产环境发布的最佳实践之一,配合 Terraform 的声明式配置,可以构建可靠、可重复的基础设施部署流程。