Stable Diffusion 图像生成

SunnyFan · 大约 11 分钟 · 约 3281 字

Stable Diffusion 图像生成

简介

Stable Diffusion 是基于扩散模型的文本生成图像（Text-to-Image）技术。理解 UNet 去噪网络、CLIP 文本编码、Latent Diffusion 和 ControlNet 控制的原理，有助于掌握 AI 图像生成的核心技术。

特点

1.扩散模型 — 前向加噪与反向去噪
2.Latent Diffusion — 潜空间扩散
3.CLIP 编码 — 文本到图像的语义桥接
4.ControlNet — 条件控制生成
5.LoRA 训练 — 风格微调

扩散模型原理

前向与反向过程

import torch
import torch.nn as nn
import numpy as np

# 扩散模型（Diffusion Model）原理：
# 前向过程：逐步向图像添加高斯噪声
# x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * epsilon
#
# 反向过程：训练神经网络预测噪声，逐步去噪
# x_{t-1} = (x_t - beta_t / sqrt(1 - alpha_bar_t) * model(x_t, t)) / sqrt(alpha_t) + sigma_t * z

class DiffusionScheduler:
    """Linear-beta noise schedule with DDPM forward and reverse steps.

    Precomputes ``betas``, ``alphas = 1 - betas``, the running product
    ``alphas_cumprod`` (alpha-bar) and the two square-root tables used by the
    closed-form forward process.
    """

    def __init__(self, num_timesteps=1000, beta_start=0.00085, beta_end=0.012):
        """Build the schedule tables.

        Args:
            num_timesteps: Number of diffusion steps.
            beta_start: Beta value of the first step.
            beta_end: Beta value of the last step.
        """
        self.num_timesteps = num_timesteps

        # Linear beta schedule.
        self.betas = torch.linspace(beta_start, beta_end, num_timesteps)

        # Quantities reused by every forward / reverse step.
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)

    def add_noise(self, x_0, noise, timesteps):
        """Forward process: sample ``x_t`` from ``q(x_t | x_0)``.

        Computes ``sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise``.

        Args:
            x_0: Clean batch whose first dimension is the batch dimension.
            noise: Noise tensor with the same shape as ``x_0``.
            timesteps: Integer timestep indices, one per batch element.

        Returns:
            The noised batch, same shape as ``x_0``.
        """
        # The tables are created on the CPU; gather on x_0's device so that
        # CUDA timesteps do not trigger a device-mismatch error when indexing.
        signal_scale = self.sqrt_alphas_cumprod.to(x_0.device)[timesteps]
        noise_scale = self.sqrt_one_minus_alphas_cumprod.to(x_0.device)[timesteps]

        # Reshape to (B, 1, ..., 1) so the per-sample scale broadcasts over
        # every non-batch dimension of x_0 (not only over 4-D image batches).
        broadcast_shape = (-1,) + (1,) * (x_0.dim() - 1)
        signal_scale = signal_scale.view(broadcast_shape)
        noise_scale = noise_scale.view(broadcast_shape)

        return signal_scale * x_0 + noise_scale * noise

    def step(self, model_output, timestep, sample):
        """Reverse process: mean of one DDPM denoising step.

        Computes
        ``(x_t - beta_t / sqrt(1 - alpha_bar_t) * eps) / sqrt(alpha_t)``.
        No noise is added here; the returned value is the deterministic mean.

        Args:
            model_output: Noise predicted by the model for ``sample``.
            timestep: Index ``t`` into the schedule tables.
            sample: Current noisy sample ``x_t``.

        Returns:
            The mean of ``x_{t-1}``, same shape as ``sample``.
        """
        alpha_t = self.alphas[timestep]
        alpha_bar_t = self.alphas_cumprod[timestep]
        beta_t = self.betas[timestep]

        # Weight of the predicted noise in the posterior mean.
        noise_coeff = beta_t / torch.sqrt(1.0 - alpha_bar_t)
        return (sample - noise_coeff * model_output) / torch.sqrt(alpha_t)


# DDPM 训练循环
def train_diffusion_model(model, dataloader, scheduler, optimizer, epochs=100):
    """Train a noise-prediction model with the DDPM MSE objective.

    For every batch a random timestep is drawn per sample, the batch is noised
    with ``scheduler.add_noise`` and the model is trained to predict that noise.

    Args:
        model: Module called as ``model(noisy_images, timesteps)``.
        dataloader: Iterable yielding image tensors; it does not need ``len``.
        scheduler: Object exposing ``num_timesteps`` and ``add_noise``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Returns:
        A list with the mean loss of each epoch (``nan`` for an epoch that
        produced no batches).
    """
    # Use the device the model already lives on so inputs and weights always
    # match; fall back to auto-detection only for parameter-free models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        for batch in dataloader:
            images = batch.to(device)
            batch_size = images.shape[0]

            # One random timestep per sample.
            timesteps = torch.randint(0, scheduler.num_timesteps, (batch_size,),
                                      device=device)

            # Forward process: noise the clean images.
            noise = torch.randn_like(images)
            noisy_images = scheduler.add_noise(images, noise, timesteps)

            # The model predicts the noise that was added.
            predicted_noise = model(noisy_images, timesteps)
            loss = nn.functional.mse_loss(predicted_noise, noise)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches instead of calling len(dataloader): that call fails
        # for iterable-style loaders and divides by zero for empty ones.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {avg_loss:.6f}")

    return epoch_losses


# 采样（推理）
@torch.no_grad()
def sample_diffusion(model, scheduler, shape, device="cuda"):
    """Generate samples by running the reverse process from pure noise.

    Args:
        model: Module called as ``model(x, timesteps)`` to predict noise.
        scheduler: Object exposing ``num_timesteps``, ``betas`` and ``step``.
        shape: Shape of the tensor to generate; ``shape[0]`` is the batch size.
        device: Device on which the sample is created.

    Returns:
        The tensor obtained after the last denoising step.
    """
    model.eval()

    batch = shape[0]
    latents = torch.randn(shape, device=device)  # start from pure noise

    # Walk the schedule backwards: T-1, T-2, ..., 0.
    for step_index in range(scheduler.num_timesteps - 1, -1, -1):
        t_batch = torch.full((batch,), step_index, device=device, dtype=torch.long)

        eps_hat = model(latents, t_batch)
        latents = scheduler.step(eps_hat, step_index, latents)

        if step_index == 0:
            continue  # the final step stays noise-free

        # Re-inject noise scaled by sqrt(beta_t) on every other step.
        fresh_noise = torch.randn_like(latents)
        latents = latents + torch.sqrt(scheduler.betas[step_index]) * fresh_noise

    return latents

Latent Diffusion

VAE 潜空间

# Latent Diffusion Model (LDM)
# 核心思想：在低维潜空间中进行扩散，而非像素空间
# 优势：大幅降低计算量（512x512 → 64x64 潜空间）

# 1. VAE 编码器
class VAEEncoder(nn.Module):
    """Convolutional encoder that maps an image to a sampled latent.

    Four stride-2 convolutions each halve the spatial size; a final 1x1
    convolution emits ``2 * latent_dim`` channels that are split into the
    mean and the log-variance of the latent distribution.
    """

    def __init__(self, in_channels=3, latent_dim=4):
        super().__init__()
        widths = (in_channels, 64, 128, 256, 512)

        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1))
            stages.append(nn.ReLU())
        # 1x1 head: mean and log-variance stacked along the channel axis.
        stages.append(nn.Conv2d(widths[-1], 2 * latent_dim, kernel_size=1))

        self.encoder = nn.Sequential(*stages)

    def forward(self, x):
        """Encode ``x`` and return ``(z, mean, log_var)``."""
        stats = self.encoder(x)
        mean, log_var = torch.chunk(stats, 2, dim=1)

        # Reparameterization trick: z = mean + std * eps with eps ~ N(0, I).
        std = (0.5 * log_var).exp()
        z = mean + torch.randn_like(std) * std

        return z, mean, log_var


class VAEDecoder(nn.Module):
    """Convolutional decoder that maps a latent back to image space.

    A stem convolution lifts the latent to 512 channels, four upsample +
    convolution stages each double the spatial size, and a final ``Tanh``
    bounds the output to ``[-1, 1]``.
    """

    def __init__(self, latent_dim=4, out_channels=3):
        super().__init__()
        widths = (512, 256, 128, 64)

        layers = [nn.Conv2d(latent_dim, widths[0], kernel_size=3, padding=1), nn.ReLU()]
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Upsample(scale_factor=2))
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            layers.append(nn.ReLU())
        # Output stage: last upsample, projection to image channels, Tanh.
        layers.append(nn.Upsample(scale_factor=2))
        layers.append(nn.Conv2d(widths[-1], out_channels, kernel_size=3, padding=1))
        layers.append(nn.Tanh())

        self.decoder = nn.Sequential(*layers)

    def forward(self, z):
        """Decode the latent ``z`` into an image tensor."""
        reconstruction = self.decoder(z)
        return reconstruction

CLIP 文本编码

文本-图像语义对齐

# CLIP（Contrastive Language-Image Pre-training）
# 将文本和图像映射到共享的语义空间

# 使用 Hugging Face 的 CLIP
from transformers import CLIPTextModel, CLIPTokenizer

class TextEncoder:
    """CLIP text encoder that produces embeddings for classifier-free guidance."""

    def __init__(self, model_name="openai/clip-vit-large-patch14"):
        """Load the tokenizer and text model identified by ``model_name``."""
        self.tokenizer = CLIPTokenizer.from_pretrained(model_name)
        self.text_encoder = CLIPTextModel.from_pretrained(model_name)

    def _embed(self, texts, device):
        """Tokenize ``texts`` to the model's maximum length and encode them."""
        tokens = self.tokenizer(
            texts,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = tokens.input_ids.to(device)

        with torch.no_grad():
            # First output of the text model; presumably the last hidden
            # state for CLIPTextModel -- confirm against the transformers docs.
            return self.text_encoder(input_ids)[0]

    def encode(self, prompts, device="cuda"):
        """Encode prompts together with matching unconditional embeddings.

        Args:
            prompts: A single prompt string or a sequence of prompt strings.
            device: Device on which the encoding runs.

        Returns:
            A tensor whose first half (along dim 0) holds the embeddings of
            empty prompts and whose second half holds the prompt embeddings.
        """
        # A bare string would otherwise be measured in characters by len(),
        # producing one unconditional embedding per character.
        if isinstance(prompts, str):
            prompts = [prompts]
        else:
            prompts = list(prompts)

        # The token ids are moved to `device`, so the encoder must be there
        # as well; moving a module that is already in place is a no-op.
        self.text_encoder.to(device)

        text_embeddings = self._embed(prompts, device)

        # Unconditional (empty-prompt) embeddings for classifier-free guidance.
        uncond_embeddings = self._embed([""] * len(prompts), device)

        return torch.cat([uncond_embeddings, text_embeddings])


# Classifier-Free Guidance
def classifier_free_guidance(noise_pred_uncond, noise_pred_cond, guidance_scale=7.5):
    """Combine unconditional and conditional noise predictions (CFG).

    Returns ``uncond + guidance_scale * (cond - uncond)``: a scale of 0 yields
    the unconditional prediction, 1 yields the conditional one, and larger
    values push the result further along the conditional direction.
    """
    guidance_direction = noise_pred_cond - noise_pred_uncond
    return noise_pred_uncond + guidance_scale * guidance_direction

Stable Diffusion Pipeline

完整推理流程

# 使用 diffusers 库
# pip install diffusers transformers accelerate

from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline
import torch

def text_to_image(
    prompt,
    model_name="runwayml/stable-diffusion-v1-5",
    negative_prompt="low quality, blurry, distorted",
    num_images=1,
    num_inference_steps=30,
    guidance_scale=7.5,
    seed=42,
    width=512,
    height=512
):
    """Generate images from a text prompt with a Stable Diffusion pipeline.

    Args:
        prompt: Text describing the desired image.
        model_name: Model id or path passed to ``from_pretrained``.
        negative_prompt: Text describing what the image should not contain.
        num_images: Number of images generated per prompt.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Classifier-free guidance scale.
        seed: Seed of the CUDA random generator.
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        The ``images`` attribute of the pipeline output.
    """
    import warnings

    # Load the pipeline in half precision on the GPU.
    pipe = StableDiffusionPipeline.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        use_safetensors=True
    ).to("cuda")

    # Memory optimizations.
    pipe.enable_attention_slicing()     # lower peak memory in attention
    pipe.enable_vae_slicing()           # decode batches one slice at a time

    # xFormers memory-efficient attention is optional: fall back to the
    # default attention instead of aborting when the package is missing.
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except (ImportError, ValueError) as exc:
        warnings.warn(
            f"xFormers attention unavailable, using default attention: {exc}"
        )

    # Seeded generator for repeatable sampling.
    generator = torch.Generator("cuda").manual_seed(seed)

    images = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_images_per_prompt=num_images,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=generator,
        width=width,
        height=height
    ).images

    return images


# 使用示例
# Render one image with the default model and write it to output.png.
images = text_to_image(
    "a beautiful sunset over a mountain lake, digital art, highly detailed",
    negative_prompt="low quality, ugly, blurry",
    num_inference_steps=30,
    guidance_scale=7.5,
)
images[0].save("output.png")

ControlNet

条件控制生成

# ControlNet — 添加空间条件控制
# 支持：Canny 边缘、深度图、姿态、法线图、分割图等

from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.utils import load_image

# 1. Canny 边缘控制
def canny_controlled_generation(prompt, image_path):
    """Generate an image whose layout follows the Canny edges of a reference.

    Args:
        prompt: Text describing the desired image.
        image_path: Path or URL of the reference image, read via ``load_image``.

    Returns:
        The first image produced by the ControlNet pipeline.
    """
    import cv2
    # `Image` was used below without ever being imported.
    from PIL import Image

    # Load the reference picture and extract its Canny edge map.
    source = np.array(load_image(image_path))
    edges = cv2.Canny(source, 100, 200)

    # Replicate the single-channel edge map to three channels, as done in
    # the diffusers ControlNet example, so the control image is RGB.
    edges = np.stack([edges] * 3, axis=-1)
    canny_image = Image.fromarray(edges)

    # Load the Canny ControlNet.
    controlnet = ControlNetModel.from_pretrained(
        "lllyasviel/sd-controlnet-canny",
        torch_dtype=torch.float16
    )

    # Load the base pipeline with the ControlNet attached.
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    ).to("cuda")

    output = pipe(
        prompt=prompt,
        image=canny_image,
        num_inference_steps=30,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0  # strength of the edge conditioning
    ).images[0]

    return output

# 2. 深度图控制
def depth_controlled_generation(prompt, image_path):
    """Generate an image whose layout follows the estimated depth of a reference.

    Args:
        prompt: Text describing the desired image.
        image_path: Path or URL of the reference image, read via ``load_image``.

    Returns:
        The first image produced by the ControlNet pipeline.
    """
    # `Image` was used below without ever being imported.
    from PIL import Image
    from transformers import DPTFeatureExtractor, DPTForDepthEstimation

    # Estimate a depth map for the reference image.
    image = load_image(image_path)
    feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
    depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        depth_map = depth_model(**inputs).predicted_depth

    # Drop the batch dimension: Image.fromarray needs a 2-D (H, W) or
    # 3-D (H, W, C) array, not a (1, H, W) one.
    depth_map = depth_map.squeeze()

    # Normalize to [0, 1]; the clamp avoids a division by zero when the
    # predicted depth is constant.
    depth_range = (depth_map.max() - depth_map.min()).clamp(min=1e-8)
    depth_map = (depth_map - depth_map.min()) / depth_range

    # Convert to an 8-bit, three-channel control image.
    depth_array = (depth_map * 255).numpy().astype(np.uint8)
    depth_image = Image.fromarray(np.stack([depth_array] * 3, axis=-1))

    controlnet = ControlNetModel.from_pretrained(
        "lllyasviel/sd-controlnet-depth",
        torch_dtype=torch.float16
    )
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    ).to("cuda")

    return pipe(prompt=prompt, image=depth_image).images[0]

优点

1.高质量 — 生成逼真的图像
2.可控性 — ControlNet 精确控制布局
3.多样性 — 提示词驱动多样化生成
4.开源生态 — 丰富的模型和工具

缺点

1.GPU 需求 — 推理需要大量显存
2.生成速度 — 多步去噪耗时
3.不可控因素 — 未固定随机种子时，相同提示词结果不同
4.版权争议 — 训练数据版权问题

Stable Diffusion 基于 Latent Diffusion 架构，在 VAE 潜空间（64x64）中进行扩散去噪，大幅降低计算量。CLIP 文本编码器将提示词映射为语义向量，Classifier-Free Guidance 控制生成与文本的对齐程度。ControlNet 通过额外的条件分支实现精确的空间控制（边缘、深度、姿态）。推理使用 diffusers 库，支持 FP16、Attention Slicing 和 xFormers 优化显存。SDXL 分辨率提升到 1024x1024。

关键知识点

先分清模型能力边界、数据边界和工程边界。
任何 AI 主题都不只看效果，还要看延迟、成本、可解释性和安全性。
评估方式和失败样例往往比“换哪个模型”更重要。
先把模型结构、训练目标、评价指标和适用场景分开理解。

项目落地视角

给数据来源、Prompt 模板、Embedding 版本、评估集和实验结果做版本管理。
上线前准备兜底策略，例如拒答、回退、人工审核或缓存降级。
观察错误类型时，区分数据问题、召回问题、提示词问题和模型问题。
用固定数据集和固定指标比较方案，不要只看主观效果。

常见误区

只关注 Demo 效果，不考虑线上稳定性和可复现性。
没有评估集就频繁调参，最后无法解释为什么变好或变差。
忽略权限、审计、隐私和模型输出的安全边界。
只讨论模型结构，不讨论数据质量和标签质量。

进阶路线

继续补齐训练、推理、评估、MLOps 和治理链路。
把主题放回真实业务流程，思考谁提供数据、谁消费结果、谁负责兜底。
把 PoC 逐步升级到可观测、可回滚、可演进的生产方案。
继续补齐模型压缩、量化、蒸馏、在线评估和数据闭环。

适用场景

当你准备把《Stable Diffusion 图像生成》真正落到项目里时，最适合先在一个独立模块或最小样例里验证关键路径。
适合企业知识问答、内容生成、分类抽取和智能助手等场景。
当需求同时关注效果、时延、成本和安全边界时，这类主题最有价值。

落地建议

先定义评估集、成功标准和失败样例，再开始调模型或调提示。
把数据来源、分块方式、Embedding 版本和 Prompt 模板纳入版本管理。
上线前准备兜底策略，例如拒答、回退、人工审核或检索降级。

排错清单

先判断问题出在数据、检索、Prompt、模型还是后处理。
检查上下文是否过长、分块是否过碎或召回是否偏题。
对错误回答做分类，区分幻觉、事实过时、指令误解和格式错误。

复盘问题

如果把《Stable Diffusion 图像生成》放进你的当前项目，最先要验证的输入、输出和失败路径分别是什么？
《Stable Diffusion 图像生成》最容易在什么规模、什么边界条件下暴露问题？你会用什么指标或日志去确认？
相比默认实现或替代方案，采用《Stable Diffusion 图像生成》最大的收益和代价分别是什么？

LoRA 微调

LoRA（Low-Rank Adaptation）是一种参数高效的微调方法，通过在原始权重矩阵旁添加低秩分解矩阵来实现风格定制。

# LoRA 原理：
# 原始权重 W (d x d)
# LoRA 添加：delta_W = A @ B，其中 A (d x r), B (r x d)，r << d
# 前向传播：y = (W + A @ B) @ x
# 只训练 A 和 B，参数量从 d^2 降到 2*d*r

# 使用 diffusers 训练 LoRA
# pip install diffusers[training] accelerate transformers

def lora_training_example():
    """Load the SD 1.5 base pipeline and print a LoRA fine-tuning checklist."""
    from diffusers import StableDiffusionPipeline

    # Base model that a trained LoRA would be applied to.
    base_model_id = "runwayml/stable-diffusion-v1-5"
    pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16)
    pipe = pipe.to("cuda")

    # Reference command line for LoRA training (kept as text, never executed).
    lora_config = """
    # accelerate launch train_text_to_image_lora.py \\
    #   --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \\
    #   --dataset_name="my_dataset" \\
    #   --resolution=512 --center_crop --random_flip \\
    #   --train_batch_size=1 \\
    #   --gradient_accumulation_steps=4 \\
    #   --gradient_checkpointing \\
    #   --max_train_steps=500 \\
    #   --learning_rate=1e-04 \\
    #   --max_grad_norm=1 \\
    #   --lr_scheduler="cosine" --lr_warmup_steps=0 \\
    #   --output_dir="./my-lora-model" \\
    #   --rank=4 \\
    #   --validation_prompt="a photo of sks dog in a bucket"
    """

    # Loading trained LoRA weights:
    # pipe.load_lora_weights("./my-lora-model", weight_name="pytorch_lora_weights.safetensors")

    # Generating with the LoRA applied:
    # image = pipe("a photo of sks dog on the beach", num_inference_steps=30).images[0]

    checklist = (
        "LoRA 微调要点：",
        "  1. 数据集：需要 20-200 张目标风格/对象的图片",
        "  2. 分辨率：与基础模型匹配（SD 1.5 = 512, SDXL = 1024）",
        "  3. Rank（r）：4-16 即可，太大会过拟合",
        "  4. 学习率：1e-4 ~ 5e-4",
        "  5. 训练步数：300-1000 步通常足够",
        "  6. 输出大小：通常只有几十 MB（远小于完整模型）",
    )
    for line in checklist:
        print(line)

lora_training_example()

SDXL 与模型演进

def sdxl_example():
    """Print a short overview of Stable Diffusion model variants.

    The SDXL usage itself is kept as commented-out reference code; it relies
    on the file-level ``StableDiffusionXLPipeline`` and ``torch`` imports, so
    this function needs no imports of its own and runs without diffusers.
    """

    # Improvements of SDXL over SD 1.5:
    # 1. Higher resolution: 1024x1024 (SD 1.5 is 512x512)
    # 2. Two text encoders: CLIP ViT-L + CLIP ViT-G
    # 3. Larger UNet: more parameters, finer denoising
    # 4. Refiner model: optional second refinement stage

    # Basic usage
    # pipe = StableDiffusionXLPipeline.from_pretrained(
    #     "stabilityai/stable-diffusion-xl-base-1.0",
    #     torch_dtype=torch.float16,
    #     variant="fp16"
    # ).to("cuda")

    # Generating a high-quality image
    # image = pipe(
    #     prompt="a majestic lion in a mystical forest, digital art, 8k",
    #     negative_prompt="low quality, blurry, distorted, watermark",
    #     num_inference_steps=40,
    #     guidance_scale=7.5,
    #     width=1024,
    #     height=1024
    # ).images[0]

    # Other widely used models and their one-line descriptions.
    models = {
        "SD 1.5": "基础模型，生态最丰富，LoRA/ControlNet 最多",
        "SDXL": "更高质量，1024x1024，双文本编码器",
        "SD 3.0": "最新版本，Flow Matching 架构",
        "SDXL Turbo": "实时生成，1-4 步即可出图",
        "LCM (Latent Consistency)": "加速推理，4 步出图",
    }

    for model, desc in models.items():
        print(f"  {model}: {desc}")

sdxl_example()

图像生成优化技巧

def optimization_techniques():
    """Print a categorized summary of Stable Diffusion inference tips."""

    # Prompt-engineering cheat sheet (kept for reference; it is not printed).
    prompt_structure = """
    高质量提示词结构：
    [主体描述], [风格], [光照], [构图], [质量修饰词]

    示例：
    a beautiful woman in a garden, oil painting style,
    soft golden hour lighting, close-up portrait,
    highly detailed, 8k, masterpiece, best quality

    质量修饰词：masterpiece, best quality, highly detailed, 8k, sharp focus
    风格修饰词：oil painting, watercolor, digital art, photograph, anime
    光照修饰词：golden hour, studio lighting, cinematic lighting, volumetric light
    """

    # Category title -> {tip name: description}, in print order:
    # memory, speed, quality.
    sections = {
        "显存优化": {
            "enable_xformers": "使用 Flash Attention，显存减少 30-50%",
            "enable_vae_slicing": "大图分块解码，降低峰值显存",
            "enable_sequential_cpu_offload": "模型层逐个加载到 GPU",
            "enable_tiled_vae": "VAE 分块处理，支持 4K 图像",
            "float16": "使用半精度，显存减半，质量几乎无损",
        },
        "速度优化": {
            "LCM Scheduler": "4-8 步即可生成高质量图像",
            "DPM++ SDE Karras": "20 步达到传统 50 步的质量",
            "Euler A": "最快的调度器之一，适合快速预览",
            "Compiled Pipeline": "torch.compile() 加速（PyTorch 2.0+）",
        },
        "质量优化": {
            "guidance_scale 7-12": "CFG 越高越贴合提示词，但过高会过饱和",
            "负向提示词": "排除不想要的元素，如 'low quality, blurry'",
            "Hires.fix": "先生成低分辨率，再超分辨率放大",
            "ADetailer": "自动检测并修复面部/手部细节",
        },
    }

    print("SD 推理优化总结:")
    for category, tips in sections.items():
        print(f"\n{category}:")
        for tip, desc in tips.items():
            print(f"  {tip}: {desc}")

optimization_techniques()