Python 数据验证
阅读时间:大约 10 分钟,约 3055 字
Python 数据验证
简介
数据验证是在数据进入系统前检查其合法性、完整性和一致性的工程实践。Python 中 Pydantic 是最流行的数据验证库,通过类型注解自动实现验证逻辑。配合 JSON Schema、marshmallow 等工具,可以构建完整的输入验证和数据清洗体系。
特点
实现
Pydantic 模型验证
from pydantic import BaseModel, Field, EmailStr, field_validator, model_validator
from typing import Optional
from datetime import datetime, date
from enum import Enum
class Gender(str, Enum):
    """Gender values accepted by the user-creation request.

    The ``str`` mixin makes every member compare equal to its plain
    string value.
    """

    MALE = "male"
    FEMALE = "female"
    OTHER = "other"
class Address(BaseModel):
    """Postal address made of four required, length-bounded strings."""

    # Province: 2-20 characters.
    province: str = Field(..., min_length=2, max_length=20, description="省份")
    # City: 2-30 characters.
    city: str = Field(..., min_length=2, max_length=30, description="城市")
    # District / county: 2-30 characters.
    district: str = Field(..., min_length=2, max_length=30, description="区县")
    # Free-form street address: at most 200 characters, no minimum.
    detail: str = Field(..., max_length=200, description="详细地址")
class CreateUserRequest(BaseModel):
    """Request payload for creating a user.

    Field constraints cover type, length and range; the two validators
    below normalize ``name`` and ``tags`` after those constraints pass.
    """

    model_config = {"from_attributes": True}

    name: str = Field(..., min_length=2, max_length=50, description="姓名")
    email: EmailStr = Field(..., description="邮箱")
    age: int = Field(..., ge=0, le=150, description="年龄")
    gender: Gender = Field(..., description="性别")
    phone: str = Field(..., pattern=r"^1[3-9]\d{9}$", description="手机号")
    address: Optional[Address] = None
    tags: list[str] = Field(default_factory=list, max_length=10)

    @field_validator("name")
    @classmethod
    def normalize_name(cls, v: str) -> str:
        """Strip surrounding whitespace and re-check the minimum length.

        Raises:
            ValueError: if fewer than 2 characters remain after stripping.
        """
        stripped = v.strip()
        # This validator runs after the Field length constraint, so a value
        # such as " X " passes min_length=2 and would shrink to one character
        # here unless the length is checked again.
        if len(stripped) < 2:
            raise ValueError("姓名去除首尾空格后至少需要 2 个字符")
        return stripped

    @field_validator("tags")
    @classmethod
    def deduplicate_tags(cls, v: list[str]) -> list[str]:
        """Lower-case and strip each tag, dropping duplicates in input order."""
        # dict.fromkeys keeps first-seen order; a set would make the output
        # order arbitrary between runs.
        return list(dict.fromkeys(tag.strip().lower() for tag in v))
# Usage: build a valid request and print the normalized model.
from pydantic import ValidationError

try:
    user = CreateUserRequest(
        name=" 张三 ",
        email="zhang@example.com",
        age=30,
        gender="male",
        phone="13800138000",
        address=Address(province="北京", city="北京", district="海淀区", detail="中关村1号"),
        tags=["Python", "GO", "python"],
    )
except ValidationError as e:
    # Only validation failures are reported here; any other exception is a
    # programming error and should surface instead of being printed away.
    print(f"验证失败: {e}")
else:
    print(user.model_dump())
# 错误示例
from pydantic import ValidationError
try:
CreateUserRequest(
name="X", # 太短
email="invalid-email",
age=-1,
gender="unknown",
phone="123",
)
except ValidationError as e:
for error in e.errors():
print(f" 字段: {error['loc']}, 错误: {error['msg']}")

嵌套模型与自定义校验器
from pydantic import BaseModel, Field, field_validator, model_validator
from typing import Optional
from datetime import date, datetime
class OrderItem(BaseModel):
    """One order line: a product, its quantity and its unit price."""

    # "P-" followed by 4 to 6 digits.
    product_id: str = Field(..., pattern=r"^P-\d{4,6}$")
    product_name: str = Field(..., min_length=1, max_length=200)
    # Between 1 and 999 units per line.
    quantity: int = Field(..., ge=1, le=999)
    # Must be strictly positive.
    unit_price: float = Field(..., gt=0)

    @property
    def subtotal(self) -> float:
        """Return quantity times unit price, rounded to 2 decimal places."""
        line_total = self.quantity * self.unit_price
        return round(line_total, 2)
class CreateOrderRequest(BaseModel):
    """Request payload for creating an order.

    Besides per-field constraints, the model rejects a delivery date in the
    past and an order total outside the range 1 to 100000.
    """

    user_id: str = Field(..., pattern=r"^U-\d{4,8}$")
    items: list[OrderItem] = Field(..., min_length=1, max_length=100)
    shipping_address: str = Field(..., min_length=5, max_length=500)
    coupon_code: Optional[str] = Field(None, pattern=r"^[A-Z0-9]{6,12}$")
    note: Optional[str] = Field(None, max_length=500)
    expected_delivery_date: Optional[date] = None

    @field_validator("expected_delivery_date")
    @classmethod
    def validate_delivery_date(cls, v: Optional[date]) -> Optional[date]:
        """Reject a delivery date earlier than today; ``None`` is allowed.

        Raises:
            ValueError: if the date lies before ``date.today()``.
        """
        if v is not None and v < date.today():
            raise ValueError("期望送达日期不能早于今天")
        return v

    @model_validator(mode="after")
    def validate_order(self):
        """Check that the order total lies within the allowed range.

        Raises:
            ValueError: if the total exceeds 100000 or is below 1.
        """
        # Reuse the property so the validated amount and the reported amount
        # can never drift apart.
        total = self.total_amount
        if total > 100000:
            raise ValueError(f"订单金额 {total} 超过单笔上限 100000")
        if total < 1:
            raise ValueError("订单金额不能小于 1 元")
        return self

    @property
    def total_amount(self) -> float:
        """Return the sum of all line subtotals, rounded to 2 decimal places."""
        # Adding already-rounded floats can reintroduce binary rounding noise,
        # so the sum is rounded once more.
        return round(sum(item.subtotal for item in self.items), 2)
# Usage: validate an order made of two lines plus a coupon code.
order = CreateOrderRequest.model_validate(
    {
        "user_id": "U-001234",
        "items": [
            OrderItem(product_id="P-0001", product_name="Python 书籍", quantity=2, unit_price=89.9),
            OrderItem(product_id="P-0002", product_name="机械键盘", quantity=1, unit_price=599.0),
        ],
        "shipping_address": "北京市海淀区中关村大街1号",
        "coupon_code": "SAVE20",
    }
)
print(f"订单金额: {order.total_amount}")

响应模型与数据过滤
from pydantic import BaseModel, Field, computed_field
from typing import Optional
from datetime import datetime
class UserResponse(BaseModel):
    """User fields returned to API clients, plus a derived e-mail domain."""

    id: int
    name: str
    email: str
    role: str
    is_active: bool
    created_at: datetime

    @computed_field
    @property
    def email_domain(self) -> str:
        """Return the text after the last "@", or "" when there is no "@"."""
        _, at_sign, domain = self.email.rpartition("@")
        if not at_sign:
            return ""
        return domain
from typing import Generic, TypeVar

T = TypeVar("T")


class PaginatedResponse(BaseModel, Generic[T]):
    """Paginated response envelope, generic over the item type.

    The class must be generic so that it can be subscripted, for example
    ``PaginatedResponse[UserResponse]`` as a FastAPI ``response_model``.
    Subscripting a plain ``BaseModel`` subclass raises ``TypeError``.
    The unparametrized form ``PaginatedResponse(items=[...], ...)`` still works.
    """

    items: list[T]
    total: int = Field(..., ge=0)
    page: int = Field(..., ge=1)
    page_size: int = Field(..., ge=1, le=100)

    @computed_field
    @property
    def pages(self) -> int:
        """Return the page count: ``total / page_size`` rounded up."""
        return (self.total + self.page_size - 1) // self.page_size

    @computed_field
    @property
    def has_next(self) -> bool:
        """Return True when the current page is not the last one."""
        return self.page < self.pages
# FastAPI 中使用
from fastapi import FastAPI, Query
from typing import Annotated
app = FastAPI()
@app.get("/api/users", response_model=PaginatedResponse[UserResponse])
async def list_users(
page: Annotated[int, Query(ge=1)] = 1,
page_size: Annotated[int, Query(ge=1, le=100)] = 20,
role: Optional[str] = None,
):
users = get_users_from_db(page, page_size, role)
total = count_users(role)
return PaginatedResponse(
items=users,
total=total,
page=page,
page_size=page_size,
)

数据清洗与转换 Pipeline
from pydantic import BaseModel, Field, field_validator
import re
from typing import Optional
class RawCustomerInput(BaseModel):
    """Raw customer record that is cleaned while it is validated.

    Input keys are the aliases ``name``, ``phone``, ``email`` and ``idCard``;
    ``populate_by_name`` also allows the ``raw_*`` field names.
    """

    model_config = {"populate_by_name": True}

    raw_name: str = Field(alias="name")
    raw_phone: str = Field(alias="phone")
    raw_email: str = Field(alias="email")
    raw_id_card: Optional[str] = Field(None, alias="idCard")

    @field_validator("raw_name")
    @classmethod
    def clean_name(cls, v: str) -> str:
        """Keep only CJK characters, ASCII letters, whitespace and "·".

        Raises:
            ValueError: if fewer than 2 characters remain.
        """
        # Inner whitespace is kept; only leading/trailing whitespace is removed.
        cleaned = re.sub(r"[^\u4e00-\u9fa5a-zA-Z\s·]", "", v).strip()
        if len(cleaned) < 2:
            raise ValueError(f"姓名清洗后过短: '{cleaned}'")
        return cleaned

    @field_validator("raw_phone")
    @classmethod
    def clean_phone(cls, v: str) -> str:
        """Drop every non-digit and require an 11-digit mobile number.

        Raises:
            ValueError: if the digits do not match ``1[3-9]`` plus 9 digits.
        """
        digits = re.sub(r"\D", "", v)
        # Same rule as CreateUserRequest.phone, so a number accepted by the
        # cleaning step is never rejected by the API model afterwards.
        if re.fullmatch(r"1[3-9]\d{9}", digits):
            return digits
        raise ValueError(f"无效的手机号: '{v}'")

    @field_validator("raw_email")
    @classmethod
    def clean_email(cls, v: str) -> str:
        """Strip and lower-case the address, then check its basic shape.

        Raises:
            ValueError: if the cleaned value does not look like an e-mail.
        """
        cleaned = v.strip().lower()
        if not re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", cleaned):
            raise ValueError(f"无效的邮箱: '{v}'")
        return cleaned

    @field_validator("raw_id_card")
    @classmethod
    def clean_id_card(cls, v: Optional[str]) -> Optional[str]:
        """Normalize an optional ID-card number; empty input becomes None.

        Raises:
            ValueError: unless the value is 17 digits plus a digit or "X".
        """
        if not v:
            return None
        # Upper-casing lets a trailing lower-case "x" pass the pattern.
        cleaned = v.strip().upper()
        if not re.match(r"^\d{17}[\dX]$", cleaned):
            raise ValueError(f"无效的身份证号: '{v}'")
        return cleaned
# Usage: messy input keyed by the field aliases.
raw_data = {
    "name": " 张 三 !!! ",
    "phone": "1-3-8-0-0-1-3-8-0-0-0",
    "email": "ZHANG@Example.COM ",
    "idCard": "110101199001011234",
}
customer = RawCustomerInput.model_validate(raw_data)
print(f"清洗结果: name={customer.raw_name}, phone={customer.raw_phone}")
# 清洗结果: name=张 三, phone=13800138000

Pydantic 自定义类型与 Generic 模型
from pydantic import BaseModel, Field, GetCoreSchemaHandler
from pydantic_core import core_schema
from typing import TypeVar, Generic, Type, List
from datetime import datetime
from enum import Enum
# 自定义类型:确保字符串去除前后空格
class TrimmedStr(str):
    """``str`` subclass whose validation strips surrounding whitespace."""

    @classmethod
    def __get_pydantic_core_schema__(cls, source_type, handler: GetCoreSchemaHandler):
        """Return a plain-validator core schema that delegates to ``_validate``."""
        validator = cls._validate
        return core_schema.with_info_plain_validator_function(validator)

    @classmethod
    def _validate(cls, value, _info):
        """Build a ``TrimmedStr`` from ``value``.

        ``str`` input is stripped first; any other value is passed to the
        constructor unchanged.
        """
        # NOTE(review): non-str input is converted by str() rather than
        # rejected (None would become "None") - confirm this is intended.
        text = value.strip() if isinstance(value, str) else value
        return cls(text)
class Status(str, Enum):
    """Lifecycle states; members compare equal to their string values."""

    ACTIVE = "active"
    INACTIVE = "inactive"
    PENDING = "pending"
# Generic 分页响应模型
T = TypeVar("T")
# computed_field is not part of this snippet's pydantic import line, so it is
# imported here; without it the decorator below raises NameError.
from pydantic import computed_field


class PaginatedResponse(BaseModel, Generic[T]):
    """Generic paginated response; ``T`` is the type of each item."""

    items: List[T]
    total: int = Field(..., ge=0)
    page: int = Field(..., ge=1)
    page_size: int = Field(..., ge=1, le=100)

    @computed_field
    @property
    def total_pages(self) -> int:
        """Return the page count: ``total / page_size`` rounded up."""
        return (self.total + self.page_size - 1) // self.page_size
# 使用 Generic 模型
class UserBrief(BaseModel):
    """Minimal user record used as the item type of a paginated list."""

    id: int
    name: str
    email: str
# Alias for the paginated response parametrized with UserBrief items.
UserListResponse = PaginatedResponse[UserBrief]

# Usage: first page of 100 users, 20 per page.
response = UserListResponse(
    total=100,
    page=1,
    page_size=20,
    items=[
        UserBrief(id=1, name="张三", email="zhang@example.com"),
        UserBrief(id=2, name="李四", email="li@example.com"),
    ],
)
print(response.model_dump_json(indent=2))

Pydantic 与 JSON Schema 互操作
from pydantic import BaseModel, Field
from typing import Optional
import json
from pydantic import model_validator


class SearchQuery(BaseModel):
    """Search parameters: keyword, optional filters, sorting and paging."""

    keyword: str = Field(..., min_length=1, max_length=200, description="搜索关键词")
    category: Optional[str] = Field(None, description="分类筛选")
    price_min: Optional[float] = Field(None, ge=0, description="最低价格")
    # ge=0 mirrors price_min; a negative upper bound can never match anything.
    price_max: Optional[float] = Field(None, ge=0, description="最高价格")
    sort_by: str = Field("relevance", pattern=r"^(relevance|price_asc|price_desc|newest)$")
    page: int = Field(1, ge=1, le=100)
    page_size: int = Field(20, ge=1, le=100)

    @model_validator(mode="after")
    def check_price_range(self):
        """Reject a range whose lower bound exceeds its upper bound.

        Raises:
            ValueError: if both bounds are set and ``price_min > price_max``.
        """
        if (
            self.price_min is not None
            and self.price_max is not None
            and self.price_min > self.price_max
        ):
            raise ValueError("最低价格不能高于最高价格")
        return self
# Export the model's JSON Schema, e.g. to drive form validation in a frontend.
schema = SearchQuery.model_json_schema()
print(json.dumps(schema, ensure_ascii=False, indent=2))

# Validate raw input against the model; omitted fields take their defaults.
raw_data = {
    "keyword": "Python 书籍",
    "price_min": 10,
    "price_max": 200,
    "sort_by": "price_asc",
}
query = SearchQuery(**raw_data)
print(f"搜索: {query.keyword}, 价格: {query.price_min}-{query.price_max}")
# 模型配置选项
class StrictModel(BaseModel):
    """Model configured with ``extra="forbid"`` so unknown fields are rejected."""

    model_config = {"extra": "forbid"}

    name: str
    age: int
# data = {"name": "张三", "age": 30, "extra_field": "xxx"}
# StrictModel.model_validate(data)  # 报错:Extra inputs are not permitted

数据验证装饰器
from pydantic import BaseModel, ValidationError, Field
from functools import wraps
from typing import Any, Callable, Type
def validate_input(model_class: "Type[BaseModel]"):
    """Build a decorator that validates the last positional dict argument.

    When the wrapped function is called with a ``dict`` as its last
    positional argument, the dict is replaced by
    ``model_class.model_validate(dict)`` before the call. Calls without such
    an argument pass through untouched.

    Args:
        model_class: Pydantic model used to validate the payload.

    Returns:
        A decorator. The decorated function returns ``({"errors": [...]}, 400)``
        when the payload fails validation, otherwise the wrapped function's
        own result.
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Convention: the payload is the last positional argument.
            if not (args and isinstance(args[-1], dict)):
                return func(*args, **kwargs)
            try:
                validated = model_class.model_validate(args[-1])
            except ValidationError as e:
                return {"errors": e.errors()}, 400
            # The call sits outside the try block so that a ValidationError
            # raised inside func is not misreported as bad input, and keyword
            # arguments are forwarded instead of being dropped.
            return func(*args[:-1], validated, **kwargs)

        return wrapper

    return decorator
class CreateProductInput(BaseModel):
    """Payload for creating a product."""

    name: str = Field(..., min_length=1, max_length=200)
    # Strictly positive, capped at 999999.
    price: float = Field(..., gt=0, le=999999)
    # Optional; defaults to 0 and may not be negative.
    stock: int = Field(default=0, ge=0)
    category: str = Field(..., min_length=1)
# 使用装饰器
@validate_input(CreateProductInput)
def create_product(validated_data: CreateProductInput) -> dict:
    """Return a product dict with a fixed id of 1 and the validated fields."""
    product = {"id": 1}
    # Copy the fields in the order they appear in the response.
    for attr in ("name", "price", "stock", "category"):
        product[attr] = getattr(validated_data, attr)
    return product
# Usage: the decorator validates this dict before create_product runs.
result = create_product(
    dict(name="Python 高级编程", price=89.9, stock=100, category="书籍")
)
print(result)

marshmallow 验证方案对比
# pip install marshmallow
from marshmallow import Schema, fields, validate, post_load, ValidationError
class UserSchema(Schema):
    """marshmallow schema that validates user input and builds a UserInput."""

    name = fields.Str(required=True, validate=validate.Length(min=2, max=50))
    email = fields.Email(required=True)
    age = fields.Int(required=True, validate=validate.Range(min=0, max=150))
    # load_default replaces the `missing` argument, which was deprecated in
    # marshmallow 3.13 and removed in 4.0.
    role = fields.Str(load_default="viewer", validate=validate.OneOf(["admin", "editor", "viewer"]))

    @post_load
    def make_user(self, data, **kwargs):
        """Turn the validated dict into a UserInput instance."""
        # UserInput is defined later in the module; the name is only looked up
        # when load() runs.
        return UserInput(**data)
class UserInput:
    """Plain container for a validated user; ``role`` defaults to "viewer"."""

    def __init__(self, name, email, age, role="viewer"):
        self.name, self.email = name, email
        self.age, self.role = age, role
# Usage: load (validate) a payload that omits the optional role.
schema = UserSchema()
payload = {
    "name": "张三",
    "email": "zhang@example.com",
    "age": 30,
}
try:
    user = schema.load(payload)
    print(f"验证通过: {user.name}, {user.email}")
except ValidationError as e:
    print(f"验证失败: {e.messages}")

# Serialization: dump a UserInput object back to plain data.
user = UserInput("李四", "li@example.com", 25, "editor")
json_data = schema.dump(user)
print(f"序列化: {json_data}")

数据验证最佳实践总结
# 1. 分层验证策略
# API 层 → Pydantic 模型验证(类型、格式、范围)
# Service 层 → 业务规则验证(库存是否充足、用户是否有权限)
# 数据库层 → 约束验证(唯一性、外键、非空)
# 2. 错误信息规范化
from pydantic import BaseModel, Field, ValidationError
class StandardErrorResponse(BaseModel):
    """Uniform error envelope returned to API clients."""

    # Error code string, e.g. "VALIDATION_ERROR".
    code: str
    # Summary message for the whole response.
    message: str
    # One dict per individual error.
    details: list[dict]
def format_validation_error(e: ValidationError) -> dict:
    """Convert a ValidationError into the standard error response dict.

    Each error becomes a ``{"field", "message", "type"}`` entry, where
    ``field`` is the error location joined with dots.
    """
    details = [
        {
            "field": ".".join(map(str, err["loc"])),
            "message": err["msg"],
            "type": err["type"],
        }
        for err in e.errors()
    ]
    response = StandardErrorResponse(
        code="VALIDATION_ERROR",
        message="输入数据验证失败",
        details=details,
    )
    return response.model_dump()
# 3. 测试验证覆盖
def test_validation():
    """Check that CreateOrderRequest accepts valid data and rejects invalid data."""
    # Valid data: the fixture must satisfy the model's own constraints
    # (user_id "U-" + 4-8 digits, product_id "P-" + 4-6 digits, and a
    # shipping address of at least 5 characters).
    order = CreateOrderRequest(
        user_id="U-0001",
        items=[OrderItem(product_id="P-0001", product_name="测试", quantity=1, unit_price=10)],
        shipping_address="北京市海淀区",
    )
    assert order.total_amount == 10

    # Invalid data: bad user_id, empty items and a too-short address.
    try:
        CreateOrderRequest(user_id="INVALID", items=[], shipping_address="a")
    except ValidationError as e:
        assert len(e.errors()) >= 2  # at least two validation errors
    else:
        # Without this branch the test would pass silently if the model
        # stopped rejecting bad input.
        raise AssertionError("invalid order data was accepted")
# 4. 性能考虑
# - Pydantic v2 比 v1 快 5-50 倍(Rust 核心)
# - 避免在热路径中创建大量 Pydantic 模型实例
# - 对于超大数据集,考虑分批验证

优点
缺点
总结
数据验证是构建健壮 API 和数据处理管道的基础能力。Pydantic 通过类型注解自动生成验证规则,配合 FastAPI 可以实现"模型即文档、模型即校验"的开发体验。核心是掌握 Field 约束、field_validator 和 model_validator 三种验证手段,覆盖从简单字段校验到复杂跨字段校验的全部场景。
关键知识点
- Pydantic BaseModel 自动验证所有字段的类型和约束
- Field(...)(参数为省略号字面量 `...`)表示必填字段;Field(default=某个默认值) 或 Field(default_factory=工厂函数) 表示可选字段
- field_validator 用于单字段的自定义验证逻辑
- model_validator 用于跨字段联合验证(如结束日期必须大于开始日期)
项目落地视角
- API 层统一使用 Pydantic 模型验证输入,业务层不需要重复做类型、格式和范围校验(业务规则校验仍然放在 Service 层)
- 请求模型和响应模型分开定义,响应模型可以做数据脱敏
- 验证错误统一格式化返回给前端,标注具体字段和错误原因
- 数据导入场景使用专门的清洗模型,将原始数据标准化后再入库
常见误区
- 只在 API 层验证,数据处理管道和脚本中没有验证
- 把数据库模型直接用作 API 输入模型,暴露内部结构
- 忽略 Pydantic v1 和 v2 的 API 差异
- 把过度复杂的验证逻辑都放在模型中(复杂的业务规则应该抽到 Service 层)
进阶路线
- 学习 Pydantic 的自定义类型和 Generic 模型
- 研究数据合同(Data Contract)在微服务间的应用
- 了解 msgspec 和 attrs 等替代验证库
- 探索 JSON Schema 标准在数据验证中的应用
适用场景
- Web API 的请求参数验证和响应格式化
- 数据导入和 ETL 管道的输入数据清洗
- 配置文件解析和环境变量绑定
落地建议
- API 请求/响应模型与数据库模型分离,各自独立演进
- 验证错误统一格式:错误码(code)、提示信息(message)和字段级错误详情(details)
- 为每个 API 端点编写验证相关的测试用例
排错清单
- 检查 Field 的 alias 和 populate_by_name 配置是否一致
- 确认 Optional 字段是否正确处理了 None 值
- 排查自定义 validator 的执行顺序是否符合预期
复盘问题
- 你的 API 是否有统一的输入验证策略?验证覆盖率为多少?
- 数据导入管道是否有完整的数据清洗和验证步骤?
- 验证错误信息是否对前端开发者友好?是否能快速定位问题?
