健康检查与就绪探针
阅读时间约 10 分钟,全文约 2911 字
健康检查与就绪探针
简介
ASP.NET Core 内置健康检查(Health Checks)框架支持对应用依赖项(数据库、缓存、消息队列等)进行可用性检测。结合 Kubernetes 的存活探针(Liveness)和就绪探针(Readiness),实现自动故障恢复和流量控制。
特点
基础配置
健康检查注册
// Read every connection setting up front so the registrations below stay short.
var healthConfig = builder.Configuration;
var redisConnection = healthConfig.GetConnectionString("Redis")!;
var rabbitMqConnection = healthConfig.GetConnectionString("RabbitMQ")!;
var postgresConnection = healthConfig.GetConnectionString("Postgres")!;
var elasticsearchUrl = healthConfig["Elasticsearch:Url"]!;
var externalApiHealthUri = new Uri("https://api.external.com/health");

// Register the health check services; every dependency check carries the "ready" tag.
var dependencyChecks = builder.Services.AddHealthChecks();

// Database (EF Core DbContext)
dependencyChecks.AddDbContextCheck<AppDbContext>("database", tags: new[] { "ready" });

// Redis
dependencyChecks.AddRedis(redisConnection, "redis", tags: new[] { "ready" });

// RabbitMQ
dependencyChecks.AddRabbitMQ(rabbitMqConnection, "rabbitmq", tags: new[] { "ready" });

// URL check (external dependency)
dependencyChecks.AddUrlGroup(externalApiHealthUri, "external-api", tags: new[] { "ready" });

// PostgreSQL (Npgsql)
dependencyChecks.AddNpgSql(postgresConnection, "postgresql", tags: new[] { "ready" });

// Elasticsearch
dependencyChecks.AddElasticsearch(elasticsearchUrl, "elasticsearch", tags: new[] { "ready" });
// Aggregate endpoint: no predicate is set, and the report is written in the Health Checks UI format.
var overallHealthOptions = new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
};
app.MapHealthChecks("/health", overallHealthOptions);

// Grouped endpoint: only the checks tagged "ready" are selected.
var readinessGroupOptions = new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse,
    Predicate = registration => registration.Tags.Contains("ready")
};
app.MapHealthChecks("/health/ready", readinessGroupOptions);

// Liveness endpoint: the predicate rejects every check, so it only shows that the app is running.
var livenessGroupOptions = new HealthCheckOptions
{
    Predicate = _ => false
};
app.MapHealthChecks("/health/live", livenessGroupOptions);
自定义健康检查
/// <summary>
/// <see cref="IHealthCheck"/> implementation that reports free space and usage of one drive.
/// </summary>
/// <remarks>
/// Result mapping: free space below the configured minimum is Unhealthy, usage above
/// 90% is Degraded, anything else is Healthy. Any exception while reading the drive
/// is reported as Unhealthy.
/// NOTE(review): the default drive path looks Windows-specific; confirm that an explicit
/// mount point is passed when this runs in a Linux container.
/// </remarks>
public class DiskSpaceHealthCheck : IHealthCheck
{
    private const double BytesPerGigabyte = 1024d * 1024d * 1024d;
    private const double DegradedUsedPercent = 90;

    private readonly long _minimumFreeBytes;
    private readonly string _drivePath;

    /// <summary>Creates the check.</summary>
    /// <param name="drivePath">Drive (or mount point) to inspect.</param>
    /// <param name="minimumFreeGB">Minimum free space, in GiB, below which the check is Unhealthy.</param>
    public DiskSpaceHealthCheck(string drivePath = "C:\\", long minimumFreeGB = 1)
    {
        _drivePath = drivePath;
        _minimumFreeBytes = minimumFreeGB * 1024 * 1024 * 1024;
    }

    /// <summary>Reads the drive statistics and maps them to a health status.</summary>
    /// <param name="context">Health check context (not used by this check).</param>
    /// <param name="cancellationToken">Cancellation token (the drive query itself is synchronous).</param>
    /// <returns>A completed task holding the result, with drive/freeGB/totalGB/usedPercent as data.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var drive = new DriveInfo(_drivePath);
            var freeBytes = drive.AvailableFreeSpace;
            var totalBytes = drive.TotalSize;

            // A drive that reports a total size of 0 would make the ratio NaN, and NaN
            // cannot be written as JSON by the response writer; report 0% used instead.
            var usedPercent = totalBytes > 0
                ? (double)(totalBytes - freeBytes) / totalBytes * 100
                : 0d;

            var freeGb = Math.Round(freeBytes / BytesPerGigabyte, 2);
            var totalGb = Math.Round(totalBytes / BytesPerGigabyte, 2);
            var usedPercentRounded = Math.Round(usedPercent, 2);

            var data = new Dictionary<string, object>
            {
                ["drive"] = _drivePath,
                ["freeGB"] = freeGb,
                ["totalGB"] = totalGb,
                ["usedPercent"] = usedPercentRounded
            };

            if (freeBytes < _minimumFreeBytes)
            {
                return Task.FromResult(HealthCheckResult.Unhealthy(
                    $"磁盘空间不足: 剩余 {freeGb}GB", data: data));
            }

            if (usedPercent > DegradedUsedPercent)
            {
                return Task.FromResult(HealthCheckResult.Degraded(
                    $"磁盘使用率过高: {usedPercentRounded}%", data: data));
            }

            return Task.FromResult(HealthCheckResult.Healthy(
                $"磁盘正常: 剩余 {freeGb}GB", data));
        }
        catch (Exception ex)
        {
            // The drive may be missing or not ready; surface that as an unhealthy result.
            return Task.FromResult(HealthCheckResult.Unhealthy(
                "磁盘检查失败", ex, null));
        }
    }
}
// Register the custom check by type; it is created with its constructor defaults.
builder.Services.AddHealthChecks()
    .AddCheck<DiskSpaceHealthCheck>("disk-space",
        tags: new[] { "ready", "infra" },
        timeout: TimeSpan.FromSeconds(5));

// Alternative: a check built with explicit constructor arguments.
// AddCheck has no overload that takes an IServiceProvider factory, so the factory is
// supplied through a HealthCheckRegistration instead.
// Use ONE of these two registrations: both are named "disk-space", and health check
// names must be unique within an application.
builder.Services.AddHealthChecks()
    .Add(new HealthCheckRegistration(
        "disk-space",
        sp => new DiskSpaceHealthCheck("D:\\", minimumFreeGB: 10),
        failureStatus: null,
        tags: new[] { "ready" }));
外部服务检查
/// <summary>
/// Health check for an HTTP dependency: issues GET {baseUrl}/health through a named
/// <see cref="HttpClient"/> and maps the outcome to a health status.
/// </summary>
public class ExternalApiHealthCheck : IHealthCheck
{
    // Upper bound for a single probe request.
    private static readonly TimeSpan ProbeTimeout = TimeSpan.FromSeconds(5);

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly string _baseUrl;
    private readonly string _name;

    /// <summary>Creates the check.</summary>
    /// <param name="httpClientFactory">Factory used to create the named client.</param>
    /// <param name="baseUrl">Base URL of the dependency; "/health" is appended to it.</param>
    /// <param name="name">Named-client name; also used in the result descriptions.</param>
    public ExternalApiHealthCheck(
        IHttpClientFactory httpClientFactory,
        string baseUrl,
        string name)
    {
        _httpClientFactory = httpClientFactory;
        _baseUrl = baseUrl;
        _name = name;
    }

    /// <summary>Probes the dependency once.</summary>
    /// <param name="context">Health check context (not used by this check).</param>
    /// <param name="cancellationToken">Caller cancellation; it is propagated rather than reported as a timeout.</param>
    /// <returns>
    /// Healthy for a success status code; Unhealthy for any other status code, for a
    /// probe timeout, and for connection failures.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        var client = _httpClientFactory.CreateClient(_name);
        var stopwatch = Stopwatch.StartNew();
        try
        {
            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            cts.CancelAfter(ProbeTimeout);

            // Trim a trailing slash so a base URL such as "https://host/" does not yield "//health".
            var healthUrl = $"{_baseUrl?.TrimEnd('/')}/health";

            // Only the status line matters, so do not buffer the body; dispose the
            // response so the underlying connection is released promptly.
            using var response = await client.GetAsync(
                healthUrl, HttpCompletionOption.ResponseHeadersRead, cts.Token);
            stopwatch.Stop();

            var data = new Dictionary<string, object>
            {
                ["url"] = _baseUrl,
                ["latencyMs"] = stopwatch.ElapsedMilliseconds,
                ["statusCode"] = (int)response.StatusCode
            };

            if (response.IsSuccessStatusCode)
            {
                return HealthCheckResult.Healthy(
                    $"{_name} 正常 ({stopwatch.ElapsedMilliseconds}ms)", data);
            }

            return HealthCheckResult.Unhealthy(
                $"{_name} 返回 {(int)response.StatusCode}", data: data);
        }
        catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
        {
            // The caller did not cancel, so the cancellation came from a timeout.
            return HealthCheckResult.Unhealthy(
                $"{_name} 超时 ({stopwatch.ElapsedMilliseconds}ms)");
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Caller-requested cancellation is left to propagate; everything else is a failure.
            return HealthCheckResult.Unhealthy(
                $"{_name} 连接失败: {ex.Message}", ex);
        }
    }
}
/// <summary>
/// Health check that inspects the backlog and consumer count of one message queue.
/// </summary>
public class QueueDepthHealthCheck : IHealthCheck
{
    private readonly IConnection _connection;
    private readonly string _queueName;
    private readonly long _maxDepth;

    /// <summary>Creates the check.</summary>
    /// <param name="connection">Broker connection used to open a channel per check.</param>
    /// <param name="queueName">Queue to inspect.</param>
    /// <param name="maxDepth">Message count above which the queue is Unhealthy.</param>
    public QueueDepthHealthCheck(IConnection connection, string queueName, long maxDepth = 10000)
    {
        _connection = connection;
        _queueName = queueName;
        _maxDepth = maxDepth;
    }

    /// <summary>
    /// Passively declares the queue and grades it: over the limit is Unhealthy, over 80%
    /// of the limit is Degraded, no consumers is Degraded, otherwise Healthy.
    /// Any exception is reported as Unhealthy.
    /// </summary>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        HealthCheckResult outcome;
        try
        {
            using var channel = _connection.CreateModel();
            var queueInfo = channel.QueueDeclarePassive(_queueName);
            var pending = queueInfo.MessageCount;
            var consumers = queueInfo.ConsumerCount;

            var details = new Dictionary<string, object>
            {
                ["queueName"] = _queueName,
                ["messageCount"] = pending,
                ["consumerCount"] = consumers,
                ["maxDepth"] = _maxDepth
            };

            // The order matters: backlog thresholds take precedence over the consumer check.
            if (pending > _maxDepth)
            {
                outcome = HealthCheckResult.Unhealthy(
                    $"队列 {_queueName} 积压 {pending} 条消息(超过 {_maxDepth})", data: details);
            }
            else if (pending > _maxDepth * 0.8)
            {
                outcome = HealthCheckResult.Degraded(
                    $"队列 {_queueName} 积压 {pending} 条消息(接近阈值)", data: details);
            }
            else if (consumers == 0)
            {
                outcome = HealthCheckResult.Degraded(
                    $"队列 {_queueName} 无消费者", data: details);
            }
            else
            {
                outcome = HealthCheckResult.Healthy(
                    $"队列 {_queueName} 正常 ({pending} 条消息, {consumers} 个消费者)", details);
            }
        }
        catch (Exception ex)
        {
            outcome = HealthCheckResult.Unhealthy(
                $"队列 {_queueName} 检查失败", ex);
        }

        return Task.FromResult(outcome);
    }
}
探针模式
Kubernetes 三探针
// The three Kubernetes probes:
// 1. Startup probe   - has the application started (probed repeatedly during startup)
// 2. Liveness probe  - is the application alive (a failure restarts the container)
// 3. Readiness probe - is the application ready (a failure removes it from traffic)
// ASP.NET Core health endpoint layout:
// /health/startup - startup probe (checks critical dependencies)
// /health/live    - liveness probe (only checks that the application responds)
// /health/ready   - readiness probe (checks that all dependencies are healthy)
builder.Services.AddHealthChecks()
    // Critical dependency (startup check)
    .AddCheck<DatabaseHealthCheck>("database",
        tags: new[] { "startup", "ready" })
    // Readiness checks
    .AddCheck<RedisHealthCheck>("redis",
        tags: new[] { "ready" })
    .AddCheck<RabbitMqHealthCheck>("rabbitmq",
        tags: new[] { "ready" })
    // ExternalApiHealthCheck takes two string constructor arguments (baseUrl, name) that the
    // container cannot supply, so it is type-activated with explicit arguments instead of AddCheck<T>.
    .AddTypeActivatedCheck<ExternalApiHealthCheck>("external-api",
        failureStatus: null,
        tags: new[] { "ready" },
        args: new object[] { "https://api.external.com", "external-api" });
// Startup probe endpoint: runs only the checks tagged "startup".
var startupProbeOptions = new HealthCheckOptions
{
    ResponseWriter = WriteDetailedResponse,
    Predicate = registration => registration.Tags.Contains("startup")
};
app.MapHealthChecks("/health/startup", startupProbeOptions);

// Liveness probe endpoint (lightweight): the predicate rejects every check,
// so a response only confirms that the application is running.
var livenessProbeOptions = new HealthCheckOptions
{
    Predicate = _ => false
};
app.MapHealthChecks("/health/live", livenessProbeOptions);

// Readiness probe endpoint: runs only the checks tagged "ready".
var readinessProbeOptions = new HealthCheckOptions
{
    ResponseWriter = WriteDetailedResponse,
    Predicate = registration => registration.Tags.Contains("ready")
};
app.MapHealthChecks("/health/ready", readinessProbeOptions);
// Detailed response format: overall status and total duration, plus one entry per check
// (name, status, description, duration, tags, data), written as JSON.
static Task WriteDetailedResponse(HttpContext context, HealthReport report)
{
    var checkSummaries =
        from entry in report.Entries
        select new
        {
            name = entry.Key,
            status = entry.Value.Status.ToString(),
            description = entry.Value.Description,
            duration = entry.Value.Duration.TotalMilliseconds,
            tags = entry.Value.Tags,
            data = entry.Value.Data
        };

    var payload = new
    {
        status = report.Status.ToString(),
        totalDuration = report.TotalDuration.TotalMilliseconds,
        checks = checkSummaries
    };

    context.Response.ContentType = "application/json";
    return context.Response.WriteAsJsonAsync(payload);
}
Kubernetes 部署配置
# Kubernetes deployment manifest (deployment.yaml)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: my-api
  template:
    metadata:
      labels:
        app: my-api
    spec:
      # Pod-level setting: total time allowed for graceful shutdown (includes the preStop sleep)
      terminationGracePeriodSeconds: 60
      containers:
        - name: api
          image: my-api:latest
          ports:
            - containerPort: 8080
          # Startup probe
          startupProbe:
            httpGet:
              path: /health/startup
              port: 8080
            failureThreshold: 30   # restart the container after 30 failures
            periodSeconds: 10      # probe every 10 seconds
            # waits at most 300 seconds (30 x 10)
          # Liveness probe
          livenessProbe:
            httpGet:
              path: /health/live
              port: 8080
            initialDelaySeconds: 0 # only starts after the startupProbe has succeeded
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 3    # restart after 3 consecutive failures
          # Readiness probe
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 8080
            initialDelaySeconds: 0
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3    # remove from traffic after 3 consecutive failures
          # Resource limits
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          # Graceful shutdown
          lifecycle:
            preStop:
              exec:
                command: ["sh", "-c", "sleep 10"] # wait for the Service to remove the endpoint
---
apiVersion: v1
kind: Service
metadata:
  name: my-api
spec:
  selector:
    app: my-api
  ports:
    - port: 80
      targetPort: 8080
Health Check UI
可视化面板
// dotnet add package AspNetCore.HealthChecks.UI
// dotnet add package AspNetCore.HealthChecks.UI.InMemory.Storage
// Dashboard services: watch the "my-api" endpoint and keep the collected data in memory.
builder.Services
    .AddHealthChecksUI(setup =>
    {
        setup.SetHeaderText("健康检查面板");
        setup.SetEvaluationTimeInSeconds(30); // evaluate every 30 seconds
        setup.SetMinimumSecondsBetweenFailureNotifications(60);
        setup.AddHealthCheckEndpoint("my-api", "/health");
    })
    .AddInMemoryStorage();

// Endpoint that feeds the dashboard (report written in the UI format).
var uiFeedOptions = new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
};
app.MapHealthChecks("/health", uiFeedOptions);

// Dashboard page and its API.
app.MapHealthChecksUI(ui =>
{
    ui.UseRelativeApiPath = false;
    ui.ApiPath = "/health-ui-api";
    ui.UIPath = "/health-ui";
});
// Browse to /health-ui to open the dashboard.
优点
缺点
总结
ASP.NET Core 健康检查框架通过 AddHealthChecks() 注册检查项;数据库、Redis、RabbitMQ、URL 等现成检查并非框架内置,而是由 EF Core 集成包和 AspNetCore.HealthChecks.* 等扩展包提供。自定义检查实现 IHealthCheck 接口,返回 Healthy/Degraded/Unhealthy 三种状态。Kubernetes 三探针模式:Startup(启动检查)、Liveness(存活检查)、Readiness(就绪检查),通过标签(Tags)分组映射到不同端点。建议存活探针轻量化(不检查外部依赖),就绪探针检查所有依赖,避免级联重启。
关键知识点
- 先分清这个主题位于请求链路、后台任务链路还是基础设施链路。
- 服务端主题通常不只关心功能正确,还关心稳定性、性能和可观测性。
- 任何框架能力都要结合配置、生命周期、异常传播和外部依赖一起看。
项目落地视角
- 画清请求进入、业务执行、外部调用、日志记录和错误返回的完整路径。
- 为关键链路补齐超时、重试、熔断、追踪和结构化日志。
- 把配置与敏感信息分离,并明确不同环境的差异来源。
常见误区
- 只会堆中间件或组件,不知道它们在链路中的执行顺序。
- 忽略生命周期和线程池、连接池等运行时资源约束。
- 没有监控和测试就对性能或可靠性下结论。
进阶路线
- 继续向运行时行为、可观测性、发布治理和微服务协同深入。
- 把主题和数据库、缓存、消息队列、认证授权联动起来理解。
- 沉淀团队级模板,包括统一异常处理、配置约定和基础设施封装。
适用场景
- 当你准备把《健康检查与就绪探针》真正落到项目里时,最适合先在一个独立模块或最小样例里验证关键路径。
- 适合 API 服务、后台任务、实时通信、认证授权和微服务协作场景。
- 当需求开始涉及稳定性、性能、可观测性和发布流程时,这类主题会成为基础设施能力。
落地建议
- 先定义请求链路与失败路径,再决定中间件、过滤器、服务边界和依赖方式。
- 为关键链路补日志、指标、追踪、超时与重试策略。
- 环境配置与敏感信息分离,避免把生产参数写死在代码或镜像里。
排错清单
- 先确认问题发生在路由、模型绑定、中间件、业务层还是基础设施层。
- 检查 DI 生命周期、配置来源、序列化规则和认证上下文。
- 查看线程池、连接池、缓存命中率和外部依赖超时。
复盘问题
- 如果把《健康检查与就绪探针》放进你的当前项目,最先要验证的输入、输出和失败路径分别是什么?
- 《健康检查与就绪探针》最容易在什么规模、什么边界条件下暴露问题?你会用什么指标或日志去确认?
- 相比默认实现或替代方案,采用《健康检查与就绪探针》最大的收益和代价分别是什么?
健康检查进阶配置
健康检查超时与降级
// Per-check timeouts keep one slow check from blocking the whole probe.
builder.Services.AddHealthChecks()
    .AddCheck<DatabaseHealthCheck>("database",
        tags: new[] { "startup", "ready" },
        timeout: TimeSpan.FromSeconds(3)) // timeout for this single check
    .AddCheck<RedisHealthCheck>("redis",
        tags: new[] { "ready" },
        timeout: TimeSpan.FromSeconds(2))
    // ExternalApiHealthCheck takes two string constructor arguments (baseUrl, name) that the
    // container cannot supply, so it is type-activated with explicit arguments instead of AddCheck<T>.
    .AddTypeActivatedCheck<ExternalApiHealthCheck>("external-api",
        failureStatus: null,
        tags: new[] { "ready" },
        timeout: TimeSpan.FromSeconds(5),
        args: new object[] { "https://api.external.com", "external-api" });
// Global publisher settings. HealthCheckPublisherOptions is an options object, not an
// IHealthCheckPublisher implementation, so it is configured through the options system
// rather than registered as a service. The check filter property is named Predicate.
builder.Services.Configure<HealthCheckPublisherOptions>(options =>
{
    options.Delay = TimeSpan.FromSeconds(10);   // wait before the first publish run
    options.Period = TimeSpan.FromSeconds(30);  // interval between publish runs
    options.Predicate = check => check.Tags.Contains("ready");
    options.Timeout = TimeSpan.FromSeconds(10); // limit for one publish run
});
健康检查结果缓存
/// <summary>
/// Decorator that caches the result of an inner health check for a fixed duration,
/// so an expensive check is not executed on every probe.
/// </summary>
/// <remarks>
/// The cache lives in this instance, so the same instance must be reused between
/// probes for caching to have any effect.
/// </remarks>
public class CachedHealthCheck : IHealthCheck, IDisposable
{
    private readonly IHealthCheck _inner;
    private readonly TimeSpan _cacheDuration;

    // Async-compatible gate: at most one caller refreshes the cache at a time.
    private readonly SemaphoreSlim _refreshGate = new(1, 1);
    private readonly object _lock = new();

    // Nullable so that "no result cached yet" can be represented.
    private HealthCheckResult? _cachedResult;
    private DateTime _lastChecked;

    /// <summary>Creates the caching decorator.</summary>
    /// <param name="inner">The check whose results are cached.</param>
    /// <param name="cacheDuration">How long a result is served before the inner check runs again.</param>
    public CachedHealthCheck(IHealthCheck inner, TimeSpan cacheDuration)
    {
        _inner = inner;
        _cacheDuration = cacheDuration;
    }

    /// <summary>
    /// Returns the cached result while it is still fresh; otherwise awaits the inner
    /// check, stores its result and returns it.
    /// </summary>
    /// <param name="context">Context forwarded to the inner check.</param>
    /// <param name="cancellationToken">Cancels waiting for the gate and the inner check.</param>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        if (TryGetFreshResult(out var cached))
        {
            return cached;
        }

        await _refreshGate.WaitAsync(cancellationToken);
        try
        {
            // Another caller may have refreshed the cache while this one was waiting.
            if (TryGetFreshResult(out cached))
            {
                return cached;
            }

            // Awaited rather than blocked on, so no thread is held while the check runs.
            var result = await _inner.CheckHealthAsync(context, cancellationToken);

            lock (_lock)
            {
                _cachedResult = result;
                _lastChecked = DateTime.UtcNow;
            }

            return result;
        }
        finally
        {
            _refreshGate.Release();
        }
    }

    /// <summary>Disposes the inner check when it is disposable, and the refresh gate.</summary>
    public void Dispose()
    {
        (_inner as IDisposable)?.Dispose();
        _refreshGate.Dispose();
    }

    // Returns true and the cached result when one exists and is younger than the cache duration.
    private bool TryGetFreshResult(out HealthCheckResult result)
    {
        lock (_lock)
        {
            if (_cachedResult.HasValue && DateTime.UtcNow - _lastChecked < _cacheDuration)
            {
                result = _cachedResult.Value;
                return true;
            }
        }

        result = default;
        return false;
    }
}
// CachedHealthCheck keeps its cache in the instance, and its constructor arguments
// (inner check, cache duration) cannot be resolved from DI on their own. Register a
// single shared instance so both registration styles below reuse the same cache.
builder.Services.AddSingleton(sp => new CachedHealthCheck(
    new RedisHealthCheck(sp.GetRequiredService<IConnectionMultiplexer>()),
    TimeSpan.FromSeconds(15)));

// Use the cached check: AddCheck<T> picks up the singleton registered above.
builder.Services.AddHealthChecks()
    .AddCheck<CachedHealthCheck>("cached-redis",
        tags: new[] { "ready" });

// Or register through a factory. AddCheck has no IServiceProvider-factory overload, so a
// HealthCheckRegistration is used; the factory returns the shared singleton because a
// fresh CachedHealthCheck per run would never serve a cached result.
builder.Services.AddHealthChecks()
    .Add(new HealthCheckRegistration(
        "redis",
        sp => sp.GetRequiredService<CachedHealthCheck>(),
        failureStatus: null,
        tags: new[] { "ready" }));
Prometheus 指标集成
// dotnet add package AspNetCore.HealthChecks.Prometheus.Metrics
// Regular health endpoint (report written in the Health Checks UI format).
var uiFormatOptions = new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
};
app.MapHealthChecks("/health", uiFormatOptions);

// Metrics endpoint: the same report, written in Prometheus text format.
var prometheusFormatOptions = new HealthCheckOptions
{
    ResponseWriter = HealthChecksPrometheusWriter.WritePrometheusResultText
};
app.MapHealthChecks("/metrics/health", prometheusFormatOptions);
// Sample output:
// # HELP healthcheck_status Health check status (0=Unhealthy, 1=Degraded, 2=Healthy)
// # TYPE healthcheck_status gauge
// healthcheck_status{name="database"} 2
// healthcheck_status{name="redis"} 2
// healthcheck_status{name="rabbitmq"} 1
// healthcheck_duration_seconds{name="database"} 0.023
定时健康检查发布
/// <summary>
/// Background service that runs the "ready" health checks on a fixed interval and logs
/// the outcome, independent of any incoming probe request.
/// </summary>
public class HealthCheckBackgroundService : BackgroundService
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<HealthCheckBackgroundService> _logger;
    private readonly TimeSpan _checkInterval = TimeSpan.FromSeconds(30);

    /// <summary>Creates the service.</summary>
    /// <param name="scopeFactory">Used to create a DI scope for each run.</param>
    /// <param name="logger">Receives the per-run outcome.</param>
    public HealthCheckBackgroundService(
        IServiceScopeFactory scopeFactory,
        ILogger<HealthCheckBackgroundService> logger)
    {
        _scopeFactory = scopeFactory;
        _logger = logger;
    }

    /// <summary>
    /// Loops until the host stops: run the checks, log the outcome, wait one interval.
    /// A failure in one run is logged and does not end the loop.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await RunChecksOnceAsync(stoppingToken);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Host shutdown interrupted the run; this is not a health check failure.
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "健康检查执行异常");
            }

            try
            {
                await Task.Delay(_checkInterval, stoppingToken);
            }
            catch (OperationCanceledException)
            {
                // Shutdown was requested while waiting for the next run.
                break;
            }
        }
    }

    // Executes the checks tagged "ready" once and logs by overall status.
    private async Task RunChecksOnceAsync(CancellationToken stoppingToken)
    {
        using var scope = _scopeFactory.CreateScope();
        var healthCheckService = scope.ServiceProvider
            .GetRequiredService<HealthCheckService>();

        var report = await healthCheckService.CheckHealthAsync(
            check => check.Tags.Contains("ready"), stoppingToken);

        if (report.Status == HealthStatus.Unhealthy)
        {
            _logger.LogError("健康检查不健康: {UnhealthyChecks}",
                DescribeEntries(report, HealthStatus.Unhealthy));
        }
        else if (report.Status == HealthStatus.Degraded)
        {
            _logger.LogWarning("健康检查降级: {DegradedChecks}",
                DescribeEntries(report, HealthStatus.Degraded));
        }
        else
        {
            _logger.LogDebug("健康检查正常,耗时 {Duration}ms",
                report.TotalDuration.TotalMilliseconds);
        }
    }

    // Formats the entries with the given status as a "name: description" list.
    private static string DescribeEntries(HealthReport report, HealthStatus status) =>
        string.Join(", ", report.Entries
            .Where(e => e.Value.Status == status)
            .Select(e => $"{e.Key}: {e.Value.Description}"));
}
// Register the background service with the host.
builder.Services.AddHostedService<HealthCheckBackgroundService>();
Docker 健康检查集成
# Add a health check to the Dockerfile
FROM mcr.microsoft.com/dotnet/aspnet:8.0
WORKDIR /app
# NOTE(review): the aspnet runtime image is not expected to include curl; install it so
# the HEALTHCHECK command below can run (verify against the image tag actually used).
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
COPY publish/ .
# Docker's built-in health check (used in place of Kubernetes probes when running under plain Docker)
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8080/health/live || exit 1
ENTRYPOINT ["dotnet", "MyApp.dll"]
# docker-compose.yml health check
services:
  api:
    image: my-api:latest
    healthcheck:
      # The test command runs inside the container, so curl must exist in the image.
      test: ["CMD", "curl", "-f", "http://localhost:8080/health/ready"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
  # A service that depends on the API being healthy
  worker:
    image: my-worker:latest
    depends_on:
      api:
        condition: service_healthy