Terraform 基础设施即代码
阅读时长:大约 10 分钟 · 约 3135 字
Terraform 基础设施即代码
简介
Terraform 是由 HashiCorp 开发的一款开源基础设施即代码(Infrastructure as Code, IaC)工具,它使用声明式配置语言 HCL(HashiCorp Configuration Language)来定义和管理云资源。通过 Terraform,运维团队可以用代码的方式版本化、复用和自动化管理从服务器、网络到数据库的整个基础设施生命周期,支持 AWS、Azure、GCP 等数百个云平台和服务提供商,是目前业界最流行的多云基础设施管理工具之一。
特点
Provider 配置
Provider 是 Terraform 与云平台或服务交互的插件。每个 Provider 封装了特定平台的 API,负责资源的创建、读取、更新和删除操作。
# Terraform settings: pin every provider this configuration uses and
# configure the remote state backend.
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    # The helm provider is configured further down with a nested
    # `kubernetes { ... }` block, which is the 2.x provider syntax,
    # so the 2.x series is pinned here.
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.12"
    }
    # The docker provider is published as kreuzwerker/docker. Without an
    # explicit source Terraform would look for hashicorp/docker.
    docker = {
      source  = "kreuzwerker/docker"
      version = "~> 3.0"
    }
  }

  # Remote state storage
  backend "s3" {
    bucket         = "my-terraform-state"
    key            = "infra/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-lock"
  }
}
# Default AWS provider configuration
provider "aws" {
  region = "us-east-1"

  # Default tags declared once at the provider level
  default_tags {
    tags = {
      Environment = "production"
      ManagedBy   = "terraform"
      Team        = "devops"
    }
  }
}

# Multi-region setup: a second AWS provider configuration, addressed by alias
provider "aws" {
  alias  = "west"
  region = "us-west-2"
}

# Using several providers side by side
# Kubernetes provider, reading cluster credentials from the local kubeconfig
provider "kubernetes" {
  config_path = "~/.kube/config"
}

# Helm provider, pointed at the same kubeconfig through its nested block
provider "helm" {
  kubernetes {
    config_path = "~/.kube/config"
  }
}

# Docker provider, talking to a Docker daemon over TCP
provider "docker" {
  host = "tcp://localhost:2376"
}
Resource 资源管理
Resource 是 Terraform 配置的核心组件,用于定义基础设施中的具体资源对象。
# Create the VPC network
resource "aws_vpc" "main" {
  cidr_block = "10.0.0.0/16"

  # DNS settings for the VPC
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "production-vpc"
  }
}
# Create the public subnets: three instances, one per count.index
resource "aws_subnet" "public" {
  count = 3

  vpc_id                  = aws_vpc.main.id
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true

  # Index 0, 1, 2 yields 10.0.0.0/24, 10.0.1.0/24, 10.0.2.0/24
  cidr_block = "10.0.${count.index}.0/24"

  tags = {
    Name = "public-subnet-${count.index}"
  }
}
# Create the security group for web traffic
resource "aws_security_group" "web" {
  name        = "web-sg"
  description = "Allow web traffic"
  vpc_id      = aws_vpc.main.id

  # Inbound: TCP 443 from any IPv4 address
  ingress {
    protocol    = "tcp"
    from_port   = 443
    to_port     = 443
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Outbound: protocol "-1" with port range 0-0, to any IPv4 address
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}

# Create the ECS cluster and service
# ECS cluster with the containerInsights setting enabled
resource "aws_ecs_cluster" "main" {
  name = "production-cluster"

  setting {
    name  = "containerInsights"
    value = "enabled"
  }
}
# Fargate task definition for the application container
resource "aws_ecs_task_definition" "app" {
  family                   = "my-app"
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = "256"
  memory                   = "512"
  execution_role_arn       = aws_iam_role.ecs_execution.arn

  # The container list is written as HCL and serialized to JSON
  container_definitions = jsonencode([
    {
      name      = "app"
      image     = "${aws_ecr_repository.app.repository_url}:latest"
      essential = true

      portMappings = [
        {
          containerPort = 8080
          protocol      = "tcp"
        }
      ]

      environment = [
        { name = "NODE_ENV", value = "production" }
      ]

      # Container logs go through the awslogs driver
      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-group"         = "/ecs/my-app"
          "awslogs-region"        = "us-east-1"
          "awslogs-stream-prefix" = "ecs"
        }
      }
    }
  ])
}
State 状态管理
State 是 Terraform 记录基础设施实际状态的机制,它将配置与真实资源进行映射。
# Use S3 as the remote state backend
terraform {
  backend "s3" {
    bucket = "terraform-state-prod"
    key    = "network/terraform.tfstate"
    region = "us-east-1"

    # Encryption of the state object, with an explicit KMS key
    encrypt    = true
    kms_key_id = "arn:aws:kms:us-east-1:123456789:key/xxx"

    # DynamoDB table used for state locking
    dynamodb_table = "terraform-state-lock"
  }
}

# Common state management commands
# List the resources tracked in the current state
terraform state list
# Show the details of one resource
terraform state show aws_vpc.main
# Move a resource to a new address (used when refactoring)
terraform state mv aws_instance.old aws_instance.new
# Import an existing resource into the state
terraform import aws_instance.web i-1234567890abcdef0
# Remove a resource from the state (the real resource is not deleted)
terraform state rm aws_instance.deprecated
# Refresh the state so it matches the real resources.
# `terraform refresh` is deprecated; the refresh-only apply replaces it.
terraform apply -refresh-only
Module 模块化
模块是 Terraform 代码复用的核心机制,允许将一组相关的资源配置封装为可复用的组件。
# modules/vpc/variables.tf
# Input variables of the VPC module

# Required: no default, so the caller must supply it
variable "environment" {
  type        = string
  description = "Environment name"
}

variable "vpc_cidr" {
  type        = string
  description = "VPC CIDR block"
  default     = "10.0.0.0/16"
}

variable "public_subnets" {
  type        = list(string)
  description = "Public subnet CIDR blocks"
  default     = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}
# modules/vpc/main.tf

# A module only sees data sources declared inside it, so the availability
# zone lookup used by the subnets is declared here.
data "aws_availability_zones" "available" {
  state = "available"
}

# The VPC, sized by var.vpc_cidr
resource "aws_vpc" "this" {
  cidr_block = var.vpc_cidr

  tags = {
    Name        = "${var.environment}-vpc"
    Environment = var.environment
  }
}

# One public subnet per entry in var.public_subnets
resource "aws_subnet" "public" {
  count = length(var.public_subnets)

  vpc_id            = aws_vpc.this.id
  cidr_block        = var.public_subnets[count.index]
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = {
    Name = "${var.environment}-public-${count.index}"
  }
}
# modules/vpc/outputs.tf
# Values the VPC module exposes to its caller

output "vpc_id" {
  value       = aws_vpc.this.id
  description = "VPC ID"
}

# The splat expression collects the id of every subnet instance
output "public_subnet_ids" {
  value       = aws_subnet.public[*].id
  description = "Public subnet IDs"
}

# Using the module
# Production instance of the local VPC module
module "vpc" {
  source = "./modules/vpc"

  environment    = "production"
  vpc_cidr       = "10.0.0.0/16"
  public_subnets = ["10.0.1.0/24", "10.0.2.0/24"]
}

# Staging instance of the same module, with a separate address range
module "vpc_staging" {
  source = "./modules/vpc"

  environment    = "staging"
  vpc_cidr       = "172.16.0.0/16"
  public_subnets = ["172.16.1.0/24", "172.16.2.0/24"]
}
# Use a public module from the Terraform Registry
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "production-eks"
  cluster_version = "1.28"

  # Network inputs come from the outputs of the local VPC module
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.public_subnet_ids

  eks_managed_node_groups = {
    default = {
      min_size       = 2
      max_size       = 10
      desired_size   = 3
      instance_types = ["t3.medium"]
    }
  }
}
数据源与输出
# Data sources: query resources that already exist

# Availability zones whose state is "available"
data "aws_availability_zones" "available" {
  state = "available"
}

# Most recent AMI from the given owner that matches both filters
data "aws_ami" "ubuntu" {
  owners      = ["099720109477"]
  most_recent = true

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}

# Read the state stored under network/terraform.tfstate in the S3 bucket
data "terraform_remote_state" "network" {
  backend = "s3"

  config = {
    bucket = "terraform-state-prod"
    key    = "network/terraform.tfstate"
    region = "us-east-1"
  }
}
# Output values
output "cluster_endpoint" {
  description = "EKS cluster endpoint"
  value       = module.eks.cluster_endpoint
}

output "load_balancer_dns" {
  description = "Load balancer DNS name"
  value       = aws_lb.main.dns_name
}
常用工作流命令
Variables 变量管理
# variables.tf — variable definitions

variable "environment" {
  type        = string
  description = "Environment name (dev/staging/prod)"
  default     = "dev"

  # Reject any value outside the three known environments
  validation {
    condition     = contains(["dev", "staging", "prod"], var.environment)
    error_message = "Environment must be one of: dev, staging, prod."
  }
}

variable "instance_type" {
  type        = string
  description = "EC2 instance type"
  default     = "t3.medium"
}

variable "allowed_cidrs" {
  type        = list(string)
  description = "Allowed CIDR blocks for ingress"
  default     = ["10.0.0.0/8"]
}

variable "tags" {
  type        = map(string)
  description = "Common tags for all resources"
  default     = {}
}
# Using an object type: several related settings grouped in one variable
variable "database_config" {
  description = "Database configuration"

  type = object({
    engine            = string
    version           = string
    instance_class    = string
    allocated_storage = number
    multi_az          = bool
  })

  default = {
    engine            = "mysql"
    version           = "8.0"
    instance_class    = "db.t3.medium"
    allocated_storage = 100
    multi_az          = false
  }
}

# terraform.tfvars — variable assignments
# The environment variable's validation rule accepts only dev, staging or
# prod, so the production environment is written as "prod".
environment   = "prod"
instance_type = "t3.large"
allowed_cidrs = ["10.0.0.0/8", "172.16.0.0/12"]

tags = {
  Project    = "myapp"
  Owner      = "devops"
  CostCenter = "engineering"
}
# A tfvars file specific to the production environment
# production.tfvars
environment   = "prod"
instance_type = "t3.xlarge"

database_config = {
  engine            = "mysql"
  version           = "8.0"
  instance_class    = "db.r5.large"
  allocated_storage = 500
  multi_az          = true
}

# Using a variable file
terraform apply -var-file="production.tfvars"
# Pass variables on the command line
terraform apply -var="environment=staging" -var="instance_type=t3.small"
# Read a variable from the process environment (TF_VAR_ prefix).
# The value must satisfy the variable's validation rule (dev/staging/prod).
export TF_VAR_environment="prod"
terraform apply
Locals 局部值
# locals.tf — computed values and naming conventions
locals {
  name_prefix = "${var.project}-${var.environment}"

  # Caller-supplied tags merged with the standard ones. On a key clash the
  # values in the second map win.
  common_tags = merge(var.tags, {
    Environment        = var.environment
    ManagedBy          = "terraform"
    TerraformWorkspace = terraform.workspace
  })

  # Conditional expression
  instance_type = var.environment == "prod" ? "t3.xlarge" : "t3.medium"

  # The comparison already yields a bool, so no `? true : false` is needed
  db_multi_az = var.environment == "prod"

  # Derive three subnets from the VPC CIDR (8 extra prefix bits each)
  subnet_cidrs = [for i in range(3) : cidrsubnet(var.vpc_cidr, 8, i)]

  # Filter and transform: collect the id of every subnet instance
  public_subnet_ids  = [for s in aws_subnet.public : s.id]
  private_subnet_ids = [for s in aws_subnet.private : s.id]

  # Availability zone names
  az_names = data.aws_availability_zones.available.names
}
# Using the locals
resource "aws_instance" "app" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = local.instance_type
  subnet_id     = local.public_subnet_ids[0]

  # Common tags plus a resource-specific Name
  tags = merge(local.common_tags, {
    Name = "${local.name_prefix}-app"
  })
}
条件表达式与循环
# count — conditional creation: three instances in prod, one otherwise
resource "aws_eip" "nat" {
  count = var.environment == "prod" ? 3 : 1

  domain = "vpc"

  tags = {
    Name = "${local.name_prefix}-nat-eip-${count.index}"
  }
}
# for_each — create one rule per entry of the var.ingress_rules map
resource "aws_security_group_rule" "ingress" {
  for_each = var.ingress_rules

  type              = "ingress"
  security_group_id = aws_security_group.main.id

  # each.value is the object stored under the current map key
  protocol    = each.value.protocol
  from_port   = each.value.from_port
  to_port     = each.value.to_port
  cidr_blocks = each.value.cidr_blocks
}
# Rule definitions, kept in variables.tf: a map from rule name to settings
variable "ingress_rules" {
  type = map(object({
    from_port   = number
    to_port     = number
    protocol    = string
    cidr_blocks = list(string)
  }))

  default = {
    http  = { from_port = 80, to_port = 80, protocol = "tcp", cidr_blocks = ["0.0.0.0/0"] }
    https = { from_port = 443, to_port = 443, protocol = "tcp", cidr_blocks = ["0.0.0.0/0"] }
    ssh   = { from_port = 22, to_port = 22, protocol = "tcp", cidr_blocks = ["10.0.0.0/8"] }
  }
}
# dynamic block — generate nested blocks from a collection
resource "aws_security_group" "dynamic_example" {
  name   = "dynamic-sg"
  vpc_id = aws_vpc.main.id

  # One ingress block per entry of var.ingress_rules. Inside `content` the
  # iterator is named after the block, so the entry is `ingress.value`.
  dynamic "ingress" {
    for_each = var.ingress_rules

    content {
      from_port   = ingress.value.from_port
      to_port     = ingress.value.to_port
      protocol    = ingress.value.protocol
      cidr_blocks = ingress.value.cidr_blocks
    }
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}
Terraform Cloud 与 CI/CD 集成
# GitHub Actions workflow integrating Terraform.
# It runs for changes under infra/: pull requests get a plan, and pushes to
# main get an apply.
name: Terraform CI/CD

on:
  push:
    branches: [main]
    paths: ['infra/**']
  pull_request:
    paths: ['infra/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step executes inside the infra directory
        working-directory: infra
    steps:
      - uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.7.0"

      - name: Terraform Init
        run: terraform init -backend-config="token=${{ secrets.TF_API_TOKEN }}"
        env:
          # Extra arguments for `terraform init`: never prompt for input
          TF_CLI_ARGS_init: "-input=false"

      - name: Terraform Format
        run: terraform fmt -check -recursive

      - name: Terraform Validate
        run: terraform validate

      # Plan only on pull requests
      - name: Terraform Plan
        if: github.event_name == 'pull_request'
        run: terraform plan -out=tfplan -var-file="production.tfvars"
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      # Apply only for pushes to main
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: terraform apply -auto-approve -var-file="production.tfvars"
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
Workspace 多环境管理
# Workspace management: create two workspaces, list them, switch to one
terraform workspace new staging
terraform workspace new production
terraform workspace list
terraform workspace select production
# Each workspace can use different variable values.
# NOTE: terraform.workspace is referenced from .tf configuration (for example
# in a locals block). A .tfvars file holds literal values only and cannot
# reference it.
# Workspace-based environment configuration
locals {
  # Sizing per workspace name. "default" is the key used when no other
  # workspace has been selected.
  env_config = {
    default = {
      instance_type = "t3.micro"
      min_size      = 1
      max_size      = 3
    }

    staging = {
      instance_type = "t3.small"
      min_size      = 2
      max_size      = 5
    }

    production = {
      instance_type = "t3.xlarge"
      min_size      = 3
      max_size      = 20
    }
  }

  # Settings of the currently selected workspace. The workspace name must be
  # one of the keys above.
  config = local.env_config[terraform.workspace]
}
# Usage: size the auto scaling group from the workspace configuration
resource "aws_autoscaling_group" "app" {
  min_size         = local.config.min_size
  max_size         = local.config.max_size
  desired_capacity = local.config.min_size
}
Terraform Import 导入已有资源
# Import existing AWS resources into the Terraform state
# Step 1: define the matching resource block in the configuration
# Step 2: run the import command
# Import a VPC
terraform import aws_vpc.main vpc-0abc123def456
# Import a security group
terraform import aws_security_group.web sg-0abc123def456
# Import an S3 bucket
terraform import aws_s3_bucket.logs my-log-bucket
# Import an RDS instance
terraform import aws_db_instance.main my-db-instance
# Bulk import script: the third column of `aws s3 ls` is used as bucket name
# and as the instance key of aws_s3_bucket.logs
for bucket in $(aws s3 ls | awk '{print $3}'); do
  terraform import "aws_s3_bucket.logs[\"$bucket\"]" "$bucket"
done
# Generate configuration (Terraform 1.5+)
# NOTE(review): this flag works together with `import` blocks declared in
# the configuration — confirm against the CLI documentation.
terraform plan -generate-config-out=generated.tf
状态漂移检测与修复
# Detect differences between the configuration and the real resources
terraform plan -detailed-exitcode
# Exit codes:
#   0 = no differences
#   1 = error
#   2 = differences present (drift)

# Repair drift — bring the resources back in line with the configuration
# Refresh the state only, without making changes
terraform apply -refresh-only
# Review the differences
terraform plan
# Apply the changes
terraform apply
# Ignore changes to selected attributes (keeps them from showing up as
# drift) by adding a lifecycle block to the resource
resource "aws_instance" "app" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = "t3.medium"

  # Lifecycle management policy
  lifecycle {
    # Ignore changes to these attributes (e.g. tags edited by external tools)
    ignore_changes = [
      tags,
      user_data,
    ]

    # Guard against accidental deletion
    prevent_destroy = true

    # On replacement, create the new resource before destroying the old one
    # (zero downtime)
    create_before_destroy = true
  }
}
常见问题与排错
# Problem 1: state locking
# Error: Error acquiring the state lock
# Cause: another Terraform process is running, or the last run exited abnormally
# Fix: confirm no other process is running, then force the unlock
terraform force-unlock <lock-id>
# Problem 2: provider version conflict
# Error: Inconsistent dependency lock file
# Fix: re-initialize
terraform init -upgrade
# Problem 3: resource dependency issues
# Declare the dependency explicitly with depends_on
# depends_on = [aws_iam_role_policy_attachment.ecs_execution]
# Problem 4: corrupted state file
# Download the state object from the S3 backend into a local backup file
aws s3 cp s3://terraform-state-prod/network/terraform.tfstate ./terraform.tfstate.backup
# Problem 5: plan is too slow on a large infrastructure
# Narrow the scope with -target
terraform plan -target=aws_instance.app -target=aws_security_group.web
# Debug output: Terraform writes its logs to stderr, so stderr (2>) is what
# must be redirected to capture them
TF_LOG=DEBUG terraform apply 2>tf-debug.log
TF_LOG=JSON terraform plan 2>tf-plan-debug.json
常用工作流命令
# Initialize the working directory
terraform init -backend-config=backend.hcl
# Format the configuration files
terraform fmt -recursive
# Validate the configuration syntax
terraform validate
# Plan the changes (preview)
terraform plan -out=tfplan
terraform plan -var-file="production.tfvars"
# Apply the changes
terraform apply tfplan
terraform apply -auto-approve -var="environment=staging"
# Destroy resources
terraform destroy -target=aws_instance.web
# Show the outputs
terraform output -json
优点
缺点
总结
Terraform 作为基础设施即代码领域的事实标准,通过 Provider、Resource、State 和 Module 四大核心概念构建了一套完整的基础设施管理方案。其声明式的配置方式让运维团队能够以代码的形式管理云资源,结合版本控制系统实现基础设施的审计追踪和协作管理。虽然 State 管理和 HCL 语言存在一定的学习成本,但 Terraform 强大的多云支持能力和丰富的模块生态使其成为现代 DevOps 实践中不可或缺的关键工具。
关键知识点
- DevOps 主题的核心是让交付更快、更稳、更可审计。
- 自动化不是把命令脚本化,而是把失败、回滚、权限和观测一起设计进去。
- 生产链路必须明确制品、环境、凭据、配置和责任边界。
项目落地视角
- 把流水线拆成构建、测试、制品、部署、验证和回滚几个阶段。
- 为关键步骤补齐日志、指标、通知和人工兜底点。
- 定期演练扩容、回滚、故障注入和灾备切换。
常见误区
- 只关注部署成功,不关注失败恢复和审计追踪。
- 把环境差异藏在临时脚本或人工操作里。
- 上线频率高了以后,没有标准化制品和配置管理。
进阶路线
- 继续补齐 GitOps、可观测性、平台工程和成本治理。
- 把主题和应用架构、安全、权限、备份恢复联动起来理解。
- 形成团队级平台能力,而不是每个项目重复造轮子。
适用场景
- 当你准备把《Terraform 基础设施即代码》真正落到项目里时,最适合先在一个独立模块或最小样例里验证关键路径。
- 适合构建自动化交付、基础设施治理、监控告警和生产发布体系。
- 当团队规模扩大、发布频率提升或环境变多时,这类主题会显著影响交付效率。
落地建议
- 所有自动化流程尽量做到幂等、可审计、可回滚。
- 把制品、变量、凭据和执行权限分层管理。
- 定期演练扩容、回滚、密钥轮换和灾备恢复。
排错清单
- 先定位失败发生在代码、构建、制品、环境还是权限层。
- 检查流水线变量、凭据、镜像标签和目标环境配置是否一致。
- 如果问题偶发,重点看并发发布、资源争抢和外部依赖抖动。
复盘问题
- 如果把《Terraform 基础设施即代码》放进你的当前项目,最先要验证的输入、输出和失败路径分别是什么?
- 《Terraform 基础设施即代码》最容易在什么规模、什么边界条件下暴露问题?你会用什么指标或日志去确认?
- 相比默认实现或替代方案,采用《Terraform 基础设施即代码》最大的收益和代价分别是什么?
