AI 应用成本优化
阅读时长:大约 19 分钟 · 约 5713 字
AI 应用成本优化
简介
AI 应用成本优化是指在保证模型效果和用户体验的前提下,通过技术手段和管理策略降低 AI 系统的整体运行成本。随着大语言模型(LLM)的广泛应用,API 调用费用、推理计算成本、存储开销等问题日益突出。一个没有经过优化的 AI 应用,其成本可能在几周内失控增长。
成本优化的核心思路可以归纳为三个维度:减少请求量(缓存、压缩)、降低单次成本(模型选择、量化)、提升资源利用率(批处理、调度)。每个维度都有多种具体的技术手段,需要根据业务场景组合使用。
从经济学角度看,AI 成本优化不是一次性工程,而是持续运营的过程。需要建立成本监控体系,定期评估 ROI,根据业务增长动态调整优化策略。很多团队在 POC 阶段不计成本,上线后才发现预算难以支撑,这种情况可以通过提前规划来避免。
特点
Token 计费分析
主流模型定价对比
# Model pricing analysis tool.
# Maps model name -> per-1M-token USD rates plus context-window size.
# Local open-source models carry a GPU hourly rate instead of token rates.
MODEL_PRICING = {
    # OpenAI family (prices in USD per 1M tokens)
    "gpt-4o": {"input": 2.50, "output": 10.00, "context": 128000},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60, "context": 128000},
    "gpt-4-turbo": {"input": 10.00, "output": 30.00, "context": 128000},
    "gpt-3.5-turbo": {"input": 0.50, "output": 1.50, "context": 16385},
    # Claude family
    "claude-opus-4": {"input": 15.00, "output": 75.00, "context": 200000},
    "claude-sonnet-4": {"input": 3.00, "output": 15.00, "context": 200000},
    "claude-haiku-3.5": {"input": 0.80, "output": 4.00, "context": 200000},
    # Local open-source models (GPU cost only; zero per-token rates)
    "llama-3.1-70b": {"input": 0.00, "output": 0.00, "gpu_hour": 1.50},
    "qwen2.5-72b": {"input": 0.00, "output": 0.00, "gpu_hour": 1.50},
}
def estimate_monthly_cost(
model: str,
daily_requests: int,
avg_input_tokens: int = 500,
avg_output_tokens: int = 300,
) -> dict:
"""估算月度 API 调用成本"""
pricing = MODEL_PRICING[model]
days = 30
total_input = daily_requests * avg_input_tokens * days
total_output = daily_requests * avg_output_tokens * days
input_cost = (total_input / 1_000_000) * pricing["input"]
output_cost = (total_output / 1_000_000) * pricing["output"]
total_cost = input_cost + output_cost
return {
"model": model,
"monthly_requests": daily_requests * days,
"input_tokens_m": total_input / 1_000_000,
"output_tokens_m": total_output / 1_000_000,
"input_cost": f"${input_cost:.2f}",
"output_cost": f"${output_cost:.2f}",
"total_cost": f"${total_cost:.2f}",
}
# Compare monthly spend across several candidate models.
for candidate in ("gpt-4o", "gpt-4o-mini", "claude-haiku-3.5"):
    estimate = estimate_monthly_cost(candidate, daily_requests=10000)
    print(f"{candidate}: {estimate['total_cost']}/月")

# --- 成本构成分析 (Cost breakdown analysis) ---
class CostAnalyzer:
"""AI 应用成本分析器"""
def __init__(self):
self.costs = {
"api_calls": 0,
"infrastructure": 0,
"storage": 0,
"development": 0,
}
def analyze_request_cost(self, request_log: dict) -> dict:
"""分析单次请求的成本明细"""
model = request_log["model"]
input_tokens = request_log.get("input_tokens", 0)
output_tokens = request_log.get("output_tokens", 0)
pricing = MODEL_PRICING.get(model, {})
input_rate = pricing.get("input", 0)
output_rate = pricing.get("output", 0)
request_cost = (
input_tokens * input_rate / 1_000_000
+ output_tokens * output_rate / 1_000_000
)
return {
"request_id": request_log.get("id"),
"model": model,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"cost_usd": request_cost,
"cost_per_1k_tokens": request_cost / max(input_tokens + output_tokens, 1) * 1000,
}
def generate_cost_report(self, logs: list) -> dict:
"""生成成本分析报告"""
total_cost = 0
model_breakdown = {}
hourly_distribution = {}
for log in logs:
analysis = self.analyze_request_cost(log)
total_cost += analysis["cost_usd"]
model = analysis["model"]
if model not in model_breakdown:
model_breakdown[model] = {"count": 0, "cost": 0, "tokens": 0}
model_breakdown[model]["count"] += 1
model_breakdown[model]["cost"] += analysis["cost_usd"]
model_breakdown[model]["tokens"] += analysis["input_tokens"] + analysis["output_tokens"]
return {
"total_cost": f"${total_cost:.4f}",
"total_requests": len(logs),
"avg_cost_per_request": f"${total_cost / max(len(logs), 1):.6f}",
"model_breakdown": model_breakdown,
}缓存策略
精确匹配缓存
import hashlib
import json
import time
from typing import Optional, Any
class ExactMatchCache:
    """Exact-match cache: returns the stored response for identical inputs.

    Keys are derived from prompt, model and any extra sampling kwargs, so
    two calls only share an entry when all of them match exactly.
    """

    def __init__(self, ttl_seconds: int = 3600, max_size: int = 10000):
        self.cache = {}
        self.ttl = ttl_seconds
        self.max_size = max_size
        self.stats = {"hits": 0, "misses": 0}

    def _make_key(self, prompt: str, model: str, **kwargs) -> str:
        """Build a deterministic SHA-256 key from the request parameters."""
        cache_data = {
            "prompt": prompt,
            "model": model,
            **{k: v for k, v in sorted(kwargs.items())},
        }
        return hashlib.sha256(
            json.dumps(cache_data, sort_keys=True).encode()
        ).hexdigest()

    def get(self, prompt: str, model: str, **kwargs) -> Optional[str]:
        """Return the cached response, or None on a miss or expired entry."""
        key = self._make_key(prompt, model, **kwargs)
        entry = self.cache.get(key)
        if entry is not None:
            if time.time() - entry["timestamp"] < self.ttl:
                self.stats["hits"] += 1
                return entry["response"]
            # Fix: drop expired entries immediately so stale data does not
            # keep occupying max_size slots until eviction.
            del self.cache[key]
        self.stats["misses"] += 1
        return None

    def set(self, prompt: str, model: str, response: str, **kwargs):
        """Store a response, evicting the oldest entry when full."""
        if len(self.cache) >= self.max_size:
            # Oldest-insertion (FIFO-style) eviction. Note this is not true
            # LRU (as the original comment claimed): get() does not refresh
            # timestamps, so recency of use is not tracked.
            oldest_key = min(self.cache, key=lambda k: self.cache[k]["timestamp"])
            del self.cache[oldest_key]
        key = self._make_key(prompt, model, **kwargs)
        self.cache[key] = {
            "response": response,
            "timestamp": time.time(),
        }

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache (0.0 when unused)."""
        total = self.stats["hits"] + self.stats["misses"]
        return self.stats["hits"] / total if total > 0 else 0.0
# Usage example: wire the cache in front of the LLM call.
cache = ExactMatchCache(ttl_seconds=1800)

def call_llm_with_cache(prompt: str, model: str = "gpt-4o-mini") -> str:
    """LLM call guarded by the exact-match cache."""
    hit = cache.get(prompt, model)
    if hit:
        print(f"缓存命中! 命中率: {cache.hit_rate:.1%}")
        return hit
    # The real LLM call is omitted here.
    answer = f"AI response for: {prompt[:50]}..."
    cache.set(prompt, model, answer)
    return answer

# --- 语义缓存 (Semantic caching) ---
import numpy as np
from dataclasses import dataclass
@dataclass
class CacheEntry:
    """One semantic-cache record: the original query text, its cached
    response, the query embedding used for similarity search, the
    insertion timestamp (for TTL checks), and a hit counter."""
    query: str
    response: str
    embedding: np.ndarray
    timestamp: float
    hit_count: int = 0
class SemanticCache:
    """Semantic cache: serves cached results for semantically similar queries.

    Queries are embedded as vectors and compared by cosine similarity, so
    differently-worded questions about the same topic can share one answer.
    """

    def __init__(
        self,
        similarity_threshold: float = 0.92,
        ttl_seconds: int = 3600,
    ):
        # List of CacheEntry records, scanned linearly on lookup.
        self.entries = []
        self.threshold = similarity_threshold
        self.ttl = ttl_seconds
        self.stats = {"hits": 0, "misses": 0}

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity of two vectors.

        Fix: returns 0.0 for zero-norm input instead of dividing by zero
        (the original raised/returned NaN on an all-zeros embedding).
        """
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def _get_embedding(self, text: str) -> np.ndarray:
        """Embed text (plug in a real embedding API in production)."""
        # Example: OpenAI embeddings
        # response = openai.embeddings.create(
        #     model="text-embedding-3-small", input=text
        # )
        # return np.array(response.data[0].embedding)
        return np.random.rand(1536)  # simulated vector

    def get(self, query: str) -> Optional[str]:
        """Return the best cached response above the similarity threshold."""
        query_embedding = self._get_embedding(query)
        current_time = time.time()
        best_match = None
        best_score = 0.0
        for entry in self.entries:
            # Skip expired entries (cleanup_expired() removes them for real).
            if current_time - entry.timestamp > self.ttl:
                continue
            score = self._cosine_similarity(query_embedding, entry.embedding)
            if score > self.threshold and score > best_score:
                best_score = score
                best_match = entry
        if best_match:
            best_match.hit_count += 1
            self.stats["hits"] += 1
            return best_match.response
        self.stats["misses"] += 1
        return None

    def set(self, query: str, response: str):
        """Embed and store a query/response pair. Note the cache is
        unbounded; callers should invoke cleanup_expired() periodically."""
        embedding = self._get_embedding(query)
        entry = CacheEntry(
            query=query,
            response=response,
            embedding=embedding,
            timestamp=time.time(),
        )
        self.entries.append(entry)

    def cleanup_expired(self):
        """Drop entries older than the TTL."""
        current_time = time.time()
        self.entries = [
            e for e in self.entries
            if current_time - e.timestamp <= self.ttl
        ]
# Usage example
semantic_cache = SemanticCache(similarity_threshold=0.90)

def call_with_semantic_cache(user_query: str) -> str:
    """Answer a query, consulting the semantic cache first."""
    cached = semantic_cache.get(user_query)
    if cached:
        total = max(sum(semantic_cache.stats.values()), 1)
        print(f"语义缓存命中! 命中率: {semantic_cache.stats['hits'] / total:.1%}")
        return cached
    # Call the LLM (omitted).
    response = f"AI 回答: {user_query}"
    semantic_cache.set(user_query, response)
    return response

# --- 模型选择指南 (Model selection guide) ---
智能路由策略
from enum import Enum
from dataclasses import dataclass
class TaskComplexity(Enum):
    """Coarse task-difficulty buckets used by the model router."""
    SIMPLE = "simple"    # simple Q&A, format conversion
    MEDIUM = "medium"    # summarization, translation, light reasoning
    COMPLEX = "complex"  # code generation, multi-step reasoning
    EXPERT = "expert"    # mathematical proofs, deep analysis
@dataclass
class ModelConfig:
    """Static metadata for one routable model: API name, USD cost per 1M
    input/output tokens, context window, subjective quality score (1-10),
    typical latency in milliseconds, and tool-calling support."""
    name: str
    cost_per_1m_input: float
    cost_per_1m_output: float
    max_context: int
    quality_score: float  # 1-10
    latency_ms: int
    supports_tools: bool
class ModelRouter:
    """Model router: picks the cheapest adequate model per request."""

    MODELS = {
        "fast": ModelConfig(
            "gpt-4o-mini", 0.15, 0.60, 128000, 7.0, 300, True
        ),
        "balanced": ModelConfig(
            "gpt-4o", 2.50, 10.00, 128000, 8.5, 800, True
        ),
        "powerful": ModelConfig(
            "claude-sonnet-4", 3.00, 15.00, 200000, 9.0, 1000, True
        ),
    }

    # Tier tables per optimization goal; "speed" doubles as the fallback
    # for any unrecognized priority value.
    _ROUTING_TABLES = {
        "cost": {
            TaskComplexity.SIMPLE: "fast",
            TaskComplexity.MEDIUM: "fast",
            TaskComplexity.COMPLEX: "balanced",
            TaskComplexity.EXPERT: "powerful",
        },
        "quality": {
            TaskComplexity.SIMPLE: "balanced",
            TaskComplexity.MEDIUM: "balanced",
            TaskComplexity.COMPLEX: "powerful",
            TaskComplexity.EXPERT: "powerful",
        },
        "speed": {
            TaskComplexity.SIMPLE: "fast",
            TaskComplexity.MEDIUM: "fast",
            TaskComplexity.COMPLEX: "fast",
            TaskComplexity.EXPERT: "balanced",
        },
    }

    def classify_complexity(self, prompt: str) -> TaskComplexity:
        """Heuristically bucket a prompt by keywords and length."""
        lowered = prompt.lower()
        # Markers of trivial requests.
        if any(kw in lowered for kw in ("翻译", "格式化", "总结这段", "是什么")):
            return TaskComplexity.SIMPLE
        # Markers of harder requests; very long prompts escalate further.
        if any(kw in lowered for kw in ("分析", "设计", "优化", "实现", "debug")):
            return TaskComplexity.EXPERT if len(prompt) > 2000 else TaskComplexity.COMPLEX
        # Everything else is medium difficulty.
        return TaskComplexity.MEDIUM

    def route(self, prompt: str, priority: str = "cost") -> str:
        """Route the prompt to the best model for the chosen goal.

        Args:
            prompt: The user input.
            priority: Optimization goal — "cost", "quality" or "speed"
                (anything else falls back to the speed table).
        """
        complexity = self.classify_complexity(prompt)
        table = self._ROUTING_TABLES.get(priority, self._ROUTING_TABLES["speed"])
        selected = self.MODELS[table[complexity]]
        print(f"任务复杂度: {complexity.value} -> 选择模型: {selected.name}")
        return selected.name
# Usage example: route two prompts under different priorities.
router = ModelRouter()
for demo_prompt, demo_priority in (
    ("请翻译以下英文为中文:Hello World", "cost"),
    ("请设计一个高并发的消息队列系统", "quality"),
):
    model = router.route(demo_prompt, priority=demo_priority)

# --- Batch API 批量处理 (Batch API processing) ---
import json
import time
from pathlib import Path
class BatchProcessor:
    """OpenAI Batch API wrapper — submit requests in bulk for ~50% lower cost.

    Typical flow: add_request() N times -> create_batch_file() ->
    submit_batch() -> poll_batch() -> get_results().
    """
    def __init__(self, client):
        # client: an OpenAI-compatible SDK client exposing files/batches APIs.
        self.client = client
        # Accumulated request payloads; one JSONL line each on export.
        self.batch_requests = []

    def add_request(
        self,
        custom_id: str,
        model: str,
        messages: list,
        temperature: float = 0.0,
        max_tokens: int = 1000,
    ):
        """Queue one chat-completion request for the batch.

        custom_id should be unique within the batch; it is echoed back in
        the results so callers can match responses to requests.
        """
        request = {
            "custom_id": custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
            }
        }
        self.batch_requests.append(request)

    def create_batch_file(self, filepath: str = "batch_input.jsonl") -> str:
        """Write the queued requests to a JSONL file; returns the path."""
        with open(filepath, "w", encoding="utf-8") as f:
            for req in self.batch_requests:
                f.write(json.dumps(req, ensure_ascii=False) + "\n")
        return filepath

    def submit_batch(self, input_file_path: str) -> dict:
        """Upload the JSONL input file and create the batch job."""
        # Upload the input file with the "batch" purpose.
        with open(input_file_path, "rb") as f:
            uploaded_file = self.client.files.create(
                file=f, purpose="batch"
            )
        # Create the batch job against the chat-completions endpoint with
        # the standard 24h completion window.
        batch = self.client.batches.create(
            input_file_id=uploaded_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "批量处理任务"}
        )
        print(f"批量任务已提交: {batch.id}")
        return batch

    def poll_batch(self, batch_id: str, interval: int = 60) -> dict:
        """Block until the batch finishes, polling every `interval` seconds.

        Raises:
            Exception: If the batch ends in a failed/expired/cancelled state.
        """
        while True:
            batch = self.client.batches.retrieve(batch_id)
            status = batch.status
            if status == "completed":
                print(f"批量任务完成! 处理 {batch.request_counts.completed} 个请求")
                return batch
            elif status in ("failed", "expired", "cancelled"):
                raise Exception(f"批量任务失败: {status}")
            completed = batch.request_counts.completed
            total = batch.request_counts.total
            print(f"进度: {completed}/{total}, 等待 {interval}s...")
            time.sleep(interval)

    def get_results(self, output_file_id: str) -> list:
        """Download and parse the batch output file (one JSON object per line)."""
        content = self.client.files.content(output_file_id)
        results = []
        for line in content.text.strip().split("\n"):
            results.append(json.loads(line))
        return results
# Usage example: batch translation
def batch_translate_example():
    """Batch-translation demo: queue requests for the Batch API discount."""
    processor = BatchProcessor(None)  # pass a real client in production
    sentences = (
        "The quick brown fox jumps over the lazy dog.",
        "To be or not to be, that is the question.",
        "All that glitters is not gold.",
    )
    for index, sentence in enumerate(sentences):
        processor.add_request(
            custom_id=f"translate-{index}",
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "你是翻译专家,将英文翻译为中文。"},
                {"role": "user", "content": f"翻译: {sentence}"},
            ],
            max_tokens=200,
        )
    print(f"已添加 {len(processor.batch_requests)} 个批量请求")
    print("Batch API 相比实时调用可节省约 50% 成本")

# --- Prompt 压缩技术 (Prompt compression techniques) ---
class PromptCompressor:
    """Prompt compression: shrink token counts to cut API cost."""

    @staticmethod
    def compress_system_prompt(prompt: str) -> str:
        """Compress a system prompt without changing its meaning.

        Strategies:
        1. Strip redundant whitespace and blank lines.
        2. Replace common verbose phrasings with terse labels.
        """
        # Drop blank lines and per-line padding.
        lines = [line.strip() for line in prompt.split("\n") if line.strip()]
        compressed = "\n".join(lines)
        # Verbose-to-terse rewrites (meaning-preserving).
        replacements = {
            "Please ": "",
            "please ": "",
            "You are a ": "Role: ",
            "You are an ": "Role: ",
            "Your task is to ": "Task: ",
            "You should ": "Rule: ",
            "Make sure to ": "Must: ",
            "It is important to ": "Note: ",
        }
        for old, new in replacements.items():
            compressed = compressed.replace(old, new)
        return compressed

    @staticmethod
    def compress_conversation(messages: list, max_history: int = 5) -> list:
        """Trim conversation history to the last `max_history` exchanges.

        System messages are always preserved; a placeholder system message
        records how many older messages were dropped.
        """
        if len(messages) <= max_history * 2 + 1:
            return messages
        system_msgs = [m for m in messages if m["role"] == "system"]
        non_system = [m for m in messages if m["role"] != "system"]
        recent = non_system[-(max_history * 2):]
        # Placeholder for the dropped span (optionally replace it with an
        # LLM-generated summary).
        summary_msg = {
            "role": "system",
            "content": f"[之前 {len(non_system) - len(recent)} 条消息已省略]"
        }
        return system_msgs + [summary_msg] + recent

    @staticmethod
    def estimate_tokens(text: str) -> int:
        """Rough token estimate: ~1.5 Chinese chars per token and ~0.75
        English words per token.

        Fix: the original computed `len(text.split()) - chinese_chars`,
        which undercounts and can go negative for mixed or pure-Chinese
        text (e.g. "你好" estimated as 0 tokens). Chinese characters and
        non-Chinese words are now counted independently.
        """
        chinese_chars = sum(1 for c in text if "\u4e00" <= c <= "\u9fff")
        # Blank out CJK chars, then count the remaining whitespace-separated words.
        non_chinese = "".join(" " if "\u4e00" <= c <= "\u9fff" else c for c in text)
        english_words = len(non_chinese.split())
        return int(chinese_chars / 1.5 + english_words / 0.75)

    def optimize_prompt(self, messages: list) -> dict:
        """Compress system prompts and history; report the token delta."""
        original_count = sum(
            self.estimate_tokens(m["content"]) for m in messages
        )
        # Compress each system prompt in place.
        optimized = []
        for msg in messages:
            if msg["role"] == "system":
                compressed = self.compress_system_prompt(msg["content"])
                optimized.append({**msg, "content": compressed})
            else:
                optimized.append(msg)
        # Then trim the conversation history.
        optimized = self.compress_conversation(optimized)
        new_count = sum(
            self.estimate_tokens(m["content"]) for m in optimized
        )
        return {
            "original_tokens": original_count,
            "optimized_tokens": new_count,
            "reduction": f"{(1 - new_count / max(original_count, 1)):.1%}",
            "messages": optimized,
        }
# Usage example: measure the reduction on a short conversation.
compressor = PromptCompressor()
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant. Please help the user with their questions. You should be accurate and concise.",
    },
    {"role": "user", "content": "你好"},
    {"role": "assistant", "content": "你好!有什么可以帮助你的吗?"},
    {"role": "user", "content": "请介绍一下 Python"},
]
result = compressor.optimize_prompt(messages)
print(f"Token 减少: {result['reduction']}")

# --- 模型蒸馏与量化 (Model distillation & quantization) ---
模型蒸馏
class KnowledgeDistillation:
    """Knowledge distillation: train a small model on a large model's outputs.

    The student learns the teacher's output distribution (soft labels)
    rather than hard labels alone, transferring richer knowledge.
    """

    @staticmethod
    def generate_training_data(
        teacher_model: str,
        prompts: list,
        num_samples: int = 1000,
    ) -> list:
        """Generate distillation training pairs with the teacher model."""
        samples = []
        for prompt in prompts[:num_samples]:
            # A real pipeline would call the teacher here:
            # response = call_llm(teacher_model, prompt)
            samples.append({
                "prompt": prompt,
                "response": f"Teacher response for: {prompt[:30]}",
                "model": teacher_model,
            })
        return samples

    @staticmethod
    def distillation_loss(student_logits, teacher_logits, temperature=2.0):
        """KL divergence between temperature-softened distributions.

        Higher temperature flattens the soft labels, transferring more of
        the teacher's relative preferences. Note: this returns only the KD
        term; a full objective usually blends it with the hard-label
        cross-entropy (alpha * KD + (1-alpha) * CE).
        """
        import torch
        import torch.nn.functional as F
        teacher_soft = F.softmax(teacher_logits / temperature, dim=-1)
        student_log_soft = F.log_softmax(student_logits / temperature, dim=-1)
        loss = F.kl_div(student_log_soft, teacher_soft, reduction="batchmean")
        # Standard T^2 scaling keeps gradient magnitudes comparable.
        return loss * (temperature ** 2)

    def cost_comparison(self) -> dict:
        """Illustrative before/after cost comparison for distillation."""
        return {
            "teacher_model": {
                "name": "gpt-4o",
                "cost_per_1m_tokens": 6.25,  # weighted average of input+output
                "latency_ms": 1500,
            },
            "student_model": {
                "name": "fine-tuned-gpt-4o-mini",
                "cost_per_1m_tokens": 0.375,
                "latency_ms": 400,
                "training_cost": "~$5-50(取决于数据量)",
            },
            "savings": {
                "cost_reduction": "94%",
                "latency_improvement": "73%",
                "quality_retention": "85-95%(取决于任务)",
            },
        }

# --- 模型量化 (Model quantization) ---
class ModelQuantization:
    """Model quantization: trade a little precision for compute/storage savings."""

    @staticmethod
    def gguf_quantization_guide():
        """Selection guide for common GGUF quantization levels."""
        return {
            "Q8_0": {
                "bits": 8,
                "size_factor": 1.0,
                "quality_loss": "~0.1%",
                "recommended_for": "质量敏感场景",
                "example_size": "70B 模型约 70GB",
            },
            "Q5_K_M": {
                "bits": 5,
                "size_factor": 0.65,
                "quality_loss": "~0.5-1%",
                "recommended_for": "平衡场景(推荐)",
                "example_size": "70B 模型约 45GB",
            },
            "Q4_K_M": {
                "bits": 4,
                "size_factor": 0.55,
                "quality_loss": "~1-2%",
                "recommended_for": "资源受限场景",
                "example_size": "70B 模型约 40GB",
            },
            "Q2_K": {
                "bits": 2,
                "size_factor": 0.35,
                "quality_loss": "~3-5%",
                "recommended_for": "极致压缩场景",
                "example_size": "70B 模型约 25GB",
            },
        }

    @staticmethod
    def llama_cpp_deployment():
        """Shell commands for serving a quantized model with llama.cpp."""
        return """
# 1. 下载模型
huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF \
llama-2-7b-chat.Q4_K_M.gguf \
--local-dir ./models
# 2. 使用 llama.cpp server 启动
./llama-server \
-m ./models/llama-2-7b-chat.Q4_K_M.gguf \
-c 4096 \
-ngl 32 \
--host 0.0.0.0 \
--port 8080
# 3. 使用 OpenAI 兼容 API 调用
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [{"role": "user", "content": "你好"}],
"temperature": 0.7
}'
"""

# --- 本地模型部署 (Local model deployment) ---
class LocalModelDeployment:
    """Local model deployment options — eliminates per-call API cost entirely."""

    @staticmethod
    def gpu_cost_analysis() -> dict:
        """Rough GPU rental cost figures and API break-even points."""
        return {
            "NVIDIA A10G (24GB)": {
                "hourly_cost": "$1.50-2.50",
                "suitable_models": "7B-13B Q4量化",
                "monthly_24x7": "$1,080-1,800",
                "break_even_vs_api": "~500K requests/month",
            },
            "NVIDIA A100 (80GB)": {
                "hourly_cost": "$3.50-5.00",
                "suitable_models": "70B Q4量化",
                "monthly_24x7": "$2,520-3,600",
                "break_even_vs_api": "~200K requests/month",
            },
            "NVIDIA H100 (80GB)": {
                "hourly_cost": "$5.00-8.00",
                "suitable_models": "70B-120B Q4量化",
                "monthly_24x7": "$3,600-5,760",
                "break_even_vs_api": "~100K requests/month",
            },
        }

    @staticmethod
    def vllm_deployment_config():
        """Sample launch script for a vLLM OpenAI-compatible server."""
        return """
# vLLM 部署脚本
import subprocess
# 启动 vLLM OpenAI 兼容服务
subprocess.run([
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "Qwen/Qwen2.5-72B-Instruct-AWQ",
    "--tensor-parallel-size", "2",  # 2卡并行
    "--gpu-memory-utilization", "0.90",  # GPU 显存利用率
    "--max-model-len", "8192",  # 最大上下文长度
    "--port", "8000",
    "--host", "0.0.0.0",
])
"""

    @staticmethod
    def cost_comparison_table() -> str:
        """Markdown table comparing API spend with self-hosting."""
        return """
| 场景 | API 调用 (GPT-4o) | 本地部署 (Qwen2.5-72B Q4) |
|------|------------------|--------------------------|
| 10K req/天 | ~$900/月 | ~$1,800 GPU/月 |
| 100K req/天 | ~$9,000/月 | ~$1,800 GPU/月 |
| 1M req/天 | ~$90,000/月 | ~$3,600 GPU/月(2节点) |
| 延迟 | 800-1500ms | 200-500ms |
| 数据隐私 | 数据发送到第三方 | 数据完全自控 |
"""

# --- 成本监控体系 (Cost monitoring) ---
import time
from collections import defaultdict
from dataclasses import dataclass, field
@dataclass
class UsageRecord:
    """One LLM call's usage: when it happened, which model, token counts,
    USD cost, whether it was served from cache, and the endpoint kind."""
    timestamp: float
    model: str
    input_tokens: int
    output_tokens: int
    cost_usd: float
    cached: bool = False
    endpoint: str = "chat"


class CostMonitor:
    """AI cost monitor — tracks spend in real time against a daily budget."""

    def __init__(self, daily_budget: float = 100.0):
        self.daily_budget = daily_budget
        self.records: list[UsageRecord] = []
        self.alerts: list[str] = []

    def record_usage(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        cost_usd: float,
        cached: bool = False,
    ):
        """Record one call and raise budget alerts on threshold crossings.

        Fix: both the 80% warning and the over-budget alert now fire only
        on the request that crosses the threshold. The original appended
        the over-budget alert on *every* request once over budget, so the
        alerts list filled with duplicates.
        """
        record = UsageRecord(
            timestamp=time.time(),
            model=model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost_usd=cost_usd,
            cached=cached,
        )
        self.records.append(record)
        # Budget checks against today's running total.
        today_cost = self._get_today_cost()
        before = today_cost - cost_usd  # spend before this request
        if today_cost > self.daily_budget * 0.8 and before <= self.daily_budget * 0.8:
            self.alerts.append(f"警告: 今日消费已达预算的80% (${today_cost:.2f}/${self.daily_budget})")
        if today_cost > self.daily_budget and before <= self.daily_budget:
            self.alerts.append(f"严重: 今日消费已超出预算! (${today_cost:.2f}/${self.daily_budget})")

    def _get_today_cost(self) -> float:
        """Total cost since midnight UTC (epoch-day boundary)."""
        today_start = time.time() - (time.time() % 86400)
        return sum(
            r.cost_usd for r in self.records
            if r.timestamp >= today_start
        )

    def get_dashboard(self) -> dict:
        """Aggregate dashboard: today's spend, per-model costs, cache rate."""
        today_cost = self._get_today_cost()
        # Group cost/request/token totals by model.
        by_model = defaultdict(lambda: {"cost": 0, "requests": 0, "tokens": 0})
        for r in self.records:
            by_model[r.model]["cost"] += r.cost_usd
            by_model[r.model]["requests"] += 1
            by_model[r.model]["tokens"] += r.input_tokens + r.output_tokens
        # Cache hit rate across all recorded requests.
        total = len(self.records)
        cached = sum(1 for r in self.records if r.cached)
        cache_rate = cached / total if total > 0 else 0
        # Savings estimate: scales cached-call cost by 1/(1-0.3) —
        # presumably assuming caching saved ~30% of the original cost;
        # confirm the intended model before relying on this figure.
        estimated_original_cost = sum(
            r.cost_usd / (1 - 0.3) for r in self.records if r.cached
        ) if cached > 0 else 0
        return {
            "today_cost": f"${today_cost:.4f}",
            "budget_usage": f"{today_cost / self.daily_budget:.1%}",
            "total_requests": total,
            "cache_hit_rate": f"{cache_rate:.1%}",
            "estimated_savings": f"${estimated_original_cost:.2f}",
            "cost_by_model": dict(by_model),
            "alerts": self.alerts[-5:],  # five most recent alerts
        }

    def should_throttle(self) -> bool:
        """True once today's spend exceeds the daily budget."""
        return self._get_today_cost() > self.daily_budget
# Usage example: integrate the monitor into the LLM client.
monitor = CostMonitor(daily_budget=50.0)

def call_llm_monitored(prompt: str, model: str = "gpt-4o-mini") -> str:
    """LLM call wrapped with budget monitoring and degradation."""
    if monitor.should_throttle():
        # Degrade to a cheaper model (or serve from cache) once over budget.
        model = "gpt-4o-mini"
        print("预算接近上限,降级到低成本模型")
    # Actual LLM call omitted; token counts are rough estimates.
    prompt_tokens = len(prompt) // 3
    completion_tokens = 200
    cost = prompt_tokens * 0.00015 / 1000 + completion_tokens * 0.0006 / 1000
    monitor.record_usage(model, prompt_tokens, completion_tokens, cost)
    return "AI response"

# --- 流式输出优化 (Streaming output optimization) ---
import asyncio
import time
class StreamingOptimizer:
    """Streaming-output helpers aimed at reducing time-to-first-byte (TTFB)."""

    @staticmethod
    async def stream_with_early_token(client, model: str, messages: list):
        """Stream a completion, reporting TTFB and final token usage."""
        started = time.time()
        ttfb = None
        collected = ""
        stream = await client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                if ttfb is None:
                    ttfb = time.time() - started
                    print(f"TTFB: {ttfb*1000:.0f}ms")
                piece = chunk.choices[0].delta.content
                collected += piece
                yield piece
            # Usage statistics arrive on the final chunk when requested
            # via stream_options (assumed placement — original indentation
            # was lost in extraction; confirm against the source).
            if hasattr(chunk, 'usage') and chunk.usage:
                print(f"Token 使用: input={chunk.usage.prompt_tokens}, "
                      f"output={chunk.usage.completion_tokens}")

    @staticmethod
    def calculate_streaming_benefits():
        """Cost and UX benefits of streaming output."""
        return {
            "ttfb_improvement": "从 2-5s 降低到 200-500ms",
            "user_experience": "用户感知响应更快,减少等待焦虑",
            "cancellation_savings": "用户可提前取消不满意的输出,节省 30-50% token",
            "timeout_handling": "流式保持连接活跃,减少超时重试",
        }

    @staticmethod
    async def stream_with_budget_control(
        client, model: str, messages: list, max_output_tokens: int = 1000
    ):
        """Stream with a client-side cap on emitted chunks.

        NOTE(review): this counts chunks, not true tokens — a chunk may
        carry several tokens — so it is a coarse safety net on top of the
        server-side max_tokens.
        """
        emitted = 0
        stream = await client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            max_tokens=max_output_tokens,
        )
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                emitted += 1
                if emitted > max_output_tokens:
                    yield "\n\n[已达到输出上限]"
                    break
                yield chunk.choices[0].delta.content

# --- ROI 计算框架 (ROI calculation framework) ---
class ROICalculator:
    """ROI calculator for AI projects."""

    @staticmethod
    def calculate(
        monthly_api_cost: float,
        monthly_gpu_cost: float,
        monthly_engineering_cost: float,
        monthly_revenue_or_savings: float,
        months: int = 12,
    ) -> dict:
        """Project monthly ROI and locate the payback month in the horizon."""
        total_cost = monthly_api_cost + monthly_gpu_cost + monthly_engineering_cost
        monthly_roi = (monthly_revenue_or_savings - total_cost) / total_cost
        # Walk month by month until cumulative value covers cumulative cost.
        payback_month = None
        spent = earned = 0
        for month in range(1, months + 1):
            spent += total_cost
            earned += monthly_revenue_or_savings
            if payback_month is None and earned >= spent:
                payback_month = month
        return {
            "monthly_cost": f"${total_cost:,.0f}",
            "monthly_value": f"${monthly_revenue_or_savings:,.0f}",
            "monthly_roi": f"{monthly_roi:.1%}",
            "annual_net_value": f"${(monthly_revenue_or_savings - total_cost) * 12:,.0f}",
            "payback_month": f"第 {payback_month} 个月" if payback_month else "12个月内无法回本",
        }

    @staticmethod
    def optimization_roi(
        current_monthly_cost: float,
        optimized_monthly_cost: float,
        implementation_effort_hours: float,
        hourly_rate: float = 75.0,
    ) -> dict:
        """ROI of a cost-optimization effort given its implementation cost."""
        monthly_savings = current_monthly_cost - optimized_monthly_cost
        implementation_cost = implementation_effort_hours * hourly_rate
        # max(..., 0.01) guards against zero/negative savings.
        payback_months = implementation_cost / max(monthly_savings, 0.01)
        return {
            "implementation_cost": f"${implementation_cost:,.0f}",
            "monthly_savings": f"${monthly_savings:,.0f}",
            "annual_savings": f"${monthly_savings * 12:,.0f}",
            "payback_period": f"{payback_months:.1f} 个月",
            "three_year_roi": f"{((monthly_savings * 36 - implementation_cost) / implementation_cost):.0%}",
        }

# --- 优点 (Advantages) ---
缺点
性能注意事项
- 缓存开销:语义缓存的向量计算本身消耗资源,当缓存条目过多时检索变慢,建议限制缓存大小在 10000 条以内
- 批处理延迟:Batch API 通常 24 小时内返回结果,不适合实时场景
- 量化精度损失:Q4 以下量化可能导致特定任务质量显著下降,上线前务必做 A/B 测试
- 流式输出:虽然降低 TTFB,但总生成时间不变,且需要 SSE/WebSocket 支持
- 本地部署冷启动:模型加载到 GPU 需要 10-60 秒,需要预热机制
- 成本监控粒度:过细的监控(每次请求都计算)会增加延迟,建议异步批量记录
总结
AI 成本优化是一个系统工程,需要从 Token 分析、缓存策略、模型选择、批处理、压缩技术、本地部署、监控告警等多个维度综合考虑。核心原则是先度量、再优化、持续监控。没有一种方案适用于所有场景,需要根据请求量、质量要求、延迟要求、数据隐私等因素综合决策。
关键知识点
- Token 计费模型 — input/output 分别计费,输出 token 通常是输入的 3-5 倍价格
- 语义缓存 vs 精确缓存 — 语义缓存命中率更高但有误判风险,精确缓存精度高(零误判)但只覆盖完全相同的请求
- 模型路由 — 不是所有请求都需要最强模型,简单任务用小模型可节省 90% 成本
- Batch API — 非实时场景首选,成本降低 50%
- 量化级别 — Q4_K_M 是质量和成本的最佳平衡点
- 本地部署 ROI — 日请求量超过 10 万次时,本地部署通常更经济
- TTFB 优化 — 流式输出将首字节时间从秒级降到毫秒级
- 成本监控 — 必须建立实时监控,否则成本可能在几天内失控
常见误区
- 过度优化:在 POC 阶段过早优化成本,导致功能受限。应先验证价值,再优化成本
- 忽视输出 token:很多人只关注输入 token,但输出 token 价格通常是输入的 3-5 倍
- 缓存万能论:缓存不是银弹,对于生成创意内容、代码等场景,缓存命中率极低
- 盲目本地部署:不考虑 GPU 运维成本和人力成本,结果总成本反而更高
- 量化无脑选最低:Q2 量化在某些任务上质量断崖式下降,必须针对业务测试
- 只看 API 成本:忽略了开发、运维、基础设施等隐性成本
进阶路线
- 入门:掌握 Token 计算、基本缓存、模型选择
- 进阶:语义缓存实现、Prompt 压缩、模型路由策略
- 高级:模型蒸馏流水线、量化部署、混合路由(API+本地)
- 专家:完整成本优化平台、自动降级策略、预测性预算管理
适用场景
- 高频调用的聊天机器人、客服系统
- 内容生成平台(批量处理文章、翻译)
- 企业内部 AI 工具(知识库问答、代码助手)
- AI API 聚合平台(多模型路由)
- 对成本敏感的创业项目
落地建议
- 第一步:部署成本监控,了解当前花费分布
- 第二步:实现精确缓存,对高频查询去重
- 第三步:实现模型路由,简单任务用小模型
- 第四步:对非实时场景使用 Batch API
- 第五步:评估本地部署 ROI,高流量场景考虑自建
- 持续:每月审查成本报告,动态调整策略
排错清单
复盘问题
- 上个月的 AI 成本是多少?哪些模型/接口消耗最多?
- 缓存命中率是多少?是否有优化空间?
- 是否存在大量重复或无效请求?根因是什么?
- 当前模型选择策略是否合理?有无过度使用大模型?
- 本地部署的 ROI 是否达到预期?
- 下月的成本预算是否需要调整?
