NLP 文本分类
简介
文本分类是 NLP 最基础也最实用的任务之一,将输入文本映射到预定义的类别标签。从情感分析、垃圾邮件过滤到意图识别和内容审核,文本分类在企业场景中应用广泛。掌握从数据准备、模型选择到评估优化的完整流程,是落地 NLP 项目的核心能力。
文本分类的方法从早期的规则方法和统计方法(朴素贝叶斯、SVM),发展到深度学习时代的 TextCNN、FastText,再到预训练模型时代的 BERT 微调和 Prompt-based 分类。每一次演进都带来了效果的大幅提升,但同时也增加了计算成本,在实际项目中需要在效果、成本和延迟之间做出权衡。
文本分类的核心挑战不仅仅是模型选择,还包括:标签体系的定义是否合理、标注数据的质量和一致性、类别不平衡问题、以及线上部署时的延迟要求。一个被低估的事实是:在很多实际项目中,数据质量对最终效果的影响比模型选择更大。
特点
分类任务的类型
- 二分类:正面/负面、垃圾/非垃圾、通过/拒绝
- 多分类:情感(正面/中性/负面)、新闻类别(体育/科技/娱乐/...)
- 多标签分类:一篇文档可以属于多个类别(如一篇论文可以同时属于 AI 和 NLP)
- 层次分类:一级类别下还有二级类别(如科技 -> AI -> 计算机视觉)
- 层次多标签:层次分类 + 多标签的组合
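不同任务类型的主要差别体现在标签表示和损失函数上,下面用一个最小示意做对比(变量名仅为示例):

```python
import torch

# 多分类:每个样本一个整数标签,配 softmax + CrossEntropyLoss
y_multiclass = torch.tensor([2, 0, 1])

# 多标签:每个样本一个 0/1 向量,配 sigmoid + BCEWithLogitsLoss
y_multilabel = torch.tensor([[1, 0, 1],
                             [0, 1, 0]], dtype=torch.float)

# 层次分类:标签是 (一级类别, 二级类别) 的组合,通常逐级建模
y_hierarchical = [("科技", "人工智能"), ("体育", "足球")]
```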
实现
# 示例1:基于 TextCNN 的文本分类模型
import torch
import torch.nn as nn
import torch.nn.functional as F
class TextCNN(nn.Module):
def __init__(self, vocab_size, embed_dim=128, num_classes=2,
filter_sizes=[2, 3, 4], num_filters=128, dropout=0.5):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.convs = nn.ModuleList([
nn.Conv1d(embed_dim, num_filters, fs) for fs in filter_sizes
])
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
def forward(self, x):
# x: (batch, seq_len)
embeds = self.embedding(x) # (B, L, E)
embeds = embeds.permute(0, 2, 1) # (B, E, L) 适配 Conv1d
conv_outputs = []
for conv in self.convs:
c = F.relu(conv(embeds)) # (B, num_filters, L-fs+1)
c = F.max_pool1d(c, c.size(2)).squeeze(2) # (B, num_filters)
conv_outputs.append(c)
out = torch.cat(conv_outputs, dim=1) # (B, num_filters * len(filter_sizes))
out = self.dropout(out)
return self.fc(out)
model = TextCNN(vocab_size=10000, embed_dim=128, num_classes=4)
dummy = torch.randint(0, 10000, (8, 50))
print(f"TextCNN 输出: {model(dummy).shape}")
print(f"参数量: {sum(p.numel() for p in model.parameters()):,}")FastText 文本分类
import torch
import torch.nn as nn
class FastTextClassifier(nn.Module):
"""FastText 风格的文本分类器
FastText 的核心思想:将文本表示为词向量的平均值,然后通过全连接层分类。
优点:简单、快速、效果在短文本上不错。
"""
def __init__(self, vocab_size, embed_dim=100, num_classes=4):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.fc = nn.Linear(embed_dim, num_classes)
self.dropout = nn.Dropout(0.3)
def forward(self, x):
# x: (B, seq_len)
embeds = self.embedding(x) # (B, seq_len, embed_dim)
# 平均池化(忽略 padding)
mask = (x != 0).unsqueeze(-1).float() # (B, seq_len, 1)
pooled = (embeds * mask).sum(dim=1) / mask.sum(dim=1) # (B, embed_dim)
pooled = self.dropout(pooled)
return self.fc(pooled)
model = FastTextClassifier(vocab_size=10000, embed_dim=100, num_classes=4)
print(f"FastText 参数量: {sum(p.numel() for p in model.parameters()):,}")# 示例2:基于 BERT 的文本分类微调
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained(
'bert-base-chinese',
num_labels=3,
)
# 准备数据
texts = ["这个产品非常好用", "质量太差了,不推荐", "还行吧,一般般"]
labels = [2, 0, 1] # 假设 0=负面, 1=中性, 2=正面
encodings = tokenizer(
texts, truncation=True, padding=True,
max_length=128, return_tensors='pt'
)
class ClassificationDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
item = {k: v[idx] for k, v in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
dataset = ClassificationDataset(encodings, labels)
# 训练参数配置
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=3,
per_device_train_batch_size=16,
learning_rate=2e-5,
weight_decay=0.01,
warmup_ratio=0.1,
evaluation_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True,
)
print("BERT 微调配置完成,实际训练需要更多数据")
print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")多标签分类
import torch
import torch.nn as nn
class MultiLabelClassifier(nn.Module):
"""多标签分类:一篇文本可以属于多个类别
与多分类的区别:
- 多分类: softmax + CrossEntropyLoss,每个样本属于一个类别
- 多标签: sigmoid + BCEWithLogitsLoss,每个类别独立判断
"""
def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.fc1 = nn.Linear(embed_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, num_labels)
def forward(self, x):
embeds = self.embedding(x).mean(dim=1)
out = torch.relu(self.fc1(embeds))
return self.fc2(out) # 不加 sigmoid,BCEWithLogitsLoss 内置
# 多标签分类的损失函数和评估
def multilabel_example():
"""多标签分类示例"""
num_labels = 5
criterion = nn.BCEWithLogitsLoss()
# 模拟输出和标签
logits = torch.randn(4, num_labels) # (batch, num_labels)
# 多标签:标签是 0/1 矩阵
targets = torch.tensor([
[1, 0, 1, 0, 0],
[0, 1, 1, 0, 1],
[1, 1, 0, 0, 0],
[0, 0, 0, 1, 1],
], dtype=torch.float)
loss = criterion(logits, targets)
# 预测:对每个类别独立 sigmoid,阈值 0.5
probs = torch.sigmoid(logits)
predictions = (probs > 0.5).int()
print(f"多标签损失: {loss.item():.4f}")
print(f"预测: {predictions}")
print(f"真实: {targets.int()}")
multilabel_example()
# 示例3:完整的评估和错误分析
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
def evaluate_classifier(y_true, y_pred, label_names=None):
"""全面的分类评估"""
report = classification_report(y_true, y_pred, target_names=label_names)
print("=== 分类报告 ===")
print(report)
cm = confusion_matrix(y_true, y_pred)
print("=== 混淆矩阵 ===")
print(cm)
# 找出最容易混淆的类别对
np.fill_diagonal(cm, 0)
most_confused = np.unravel_index(np.argmax(cm), cm.shape)
print(f"\n最容易混淆的类别对: {most_confused}")
return report
def error_analysis(texts, y_true, y_pred, probs, label_names, top_k=10):
"""分析高置信度错误预测"""
errors = []
for i, (text, true, pred, prob) in enumerate(zip(texts, y_true, y_pred, probs)):
if true != pred:
confidence = prob[pred]
errors.append((i, text, true, pred, confidence))
# 按置信度排序,高置信度错误最值得关注
errors.sort(key=lambda x: x[4], reverse=True)
print(f"\n=== Top {top_k} 高置信度错误 ===")
for idx, text, true, pred, conf in errors[:top_k]:
print(f" [{idx}] '{text[:30]}...' 真实={label_names[true]}, "
f"预测={label_names[pred]}, 置信度={conf:.3f}")
# 模拟评估
y_true = [0, 1, 2, 0, 1, 2, 0, 1, 2, 1]
y_pred = [0, 1, 1, 0, 2, 2, 0, 1, 1, 1]
evaluate_classifier(y_true, y_pred, label_names=['负面', '中性', '正面'])
# 示例4:数据不平衡处理策略
from torch.utils.data import WeightedRandomSampler
import torch
import numpy as np
def handle_imbalance(labels, strategy='oversample'):
"""处理类别不平衡"""
label_counts = np.bincount(labels)
total = len(labels)
num_classes = len(label_counts)
print(f"类别分布: {dict(enumerate(label_counts))}")
if strategy == 'oversample':
# 过采样:少数类样本权重更高
class_weights = total / (num_classes * label_counts)
sample_weights = [class_weights[l] for l in labels]
sampler = WeightedRandomSampler(sample_weights, num_samples=total, replacement=True)
print(f"各类权重: {class_weights}")
return sampler
elif strategy == 'class_weight':
# 损失函数加权
class_weights = torch.tensor(total / (num_classes * label_counts), dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
print(f"损失函数权重: {class_weights}")
return criterion
labels = [0]*100 + [1]*500 + [2]*50 # 严重不平衡
sampler = handle_imbalance(labels, strategy='oversample')
criterion = handle_imbalance(labels, strategy='class_weight')
数据增强策略
def text_augmentation_strategies():
"""文本数据增强方法
1. 同义词替换:随机替换词语为同义词
2. 回译增强:翻译为外语再翻译回中文
3. 随机删除:随机删除部分词语
4. 随机交换:随机交换相邻词语的位置
5. EDA (Easy Data Augmentation):上述方法的组合
注意事项:
- 增强后的文本需要保持语义不变
- 增强强度不宜过大,否则可能改变语义
- NLP 的数据增强效果不如 CV 中的数据增强显著
- 对预训练模型微调,数据增强的收益较小
"""
import random
def random_deletion(text, p=0.1):
"""随机删除词语"""
words = text.split()
if len(words) == 1:
return text
return ' '.join(w for w in words if random.random() > p)
def random_swap(text, n=1):
"""随机交换相邻词语"""
words = text.split()
for _ in range(n):
if len(words) < 2:
break
idx = random.randint(0, len(words) - 2)
words[idx], words[idx+1] = words[idx+1], words[idx]
return ' '.join(words)
text = "这个产品质量非常好值得购买"
print(f"原文: {text}")
print(f"随机删除: {random_deletion(text, p=0.2)}")
print(f"随机交换: {random_swap(text, n=2)}")
text_augmentation_strategies()
模型选择指南
def model_selection_guide():
"""文本分类模型选择指南
数据量 -> 推荐模型:
- < 1000 条: 朴素贝叶斯 / SVM + TF-IDF
- 1000 - 10000 条: TextCNN / FastText
- 10000 - 100000 条: BERT 微调
- > 100000 条: 大模型微调 / RoBERTa / ERNIE
延迟要求:
- < 10ms: FastText / TF-IDF + LR
- < 50ms: TextCNN / DistilBERT
- < 200ms: BERT / RoBERTa
- 不敏感: 大模型 API
精度要求:
- 85%+: 传统 ML 即可
- 90%+: TextCNN / BERT 微调
- 95%+: 大模型 + 精细调优
"""
print("模型选择决策树:")
print(" 数据量 < 1K + 需要快速上线: FastText / SVM")
print(" 数据量 1K-10K + 追求精度: TextCNN / BERT")
print(" 数据量 > 10K + 领域专业: 领域预训练模型微调")
print(" 延迟敏感 (< 50ms): TextCNN / DistilBERT")
print(" 多标签分类: BERT + BCE Loss")
model_selection_guide()
传统机器学习基线
# 传统 ML 方法:朴素贝叶斯 + SVM 基线
# 在数据量少时,传统方法往往是最实际的起点
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
class TraditionalClassifier:
"""传统文本分类器集合"""
@staticmethod
def build_nb_pipeline():
"""朴素贝叶斯 + TF-IDF"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=50000,
ngram_range=(1, 2), # 使用 unigram + bigram
min_df=2, # 最小文档频率
max_df=0.95, # 最大文档频率(过滤停用词级别的词)
sublinear_tf=True # 使用 1 + log(tf) 替代原始 tf
)),
('clf', MultinomialNB(alpha=0.1)) # alpha 拉普拉斯平滑
])
@staticmethod
def build_svm_pipeline():
"""SVM + TF-IDF(文本分类经典方案)"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=100000,
ngram_range=(1, 3),
sublinear_tf=True
)),
('clf', LinearSVC(
C=1.0,
max_iter=2000,
class_weight='balanced' # 自动平衡类别权重
))
])
@staticmethod
def build_lr_pipeline():
"""逻辑回归 + TF-IDF(可输出概率)"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=100000,
ngram_range=(1, 2),
sublinear_tf=True
)),
('clf', LogisticRegression(
C=1.0,
max_iter=1000,
class_weight='balanced',
solver='lbfgs',
multi_class='multinomial'
))
])
# 交叉验证选择最优模型
def select_best_model(texts, labels):
"""通过交叉验证选择最优传统模型"""
pipelines = {
'NaiveBayes': TraditionalClassifier.build_nb_pipeline(),
'SVM': TraditionalClassifier.build_svm_pipeline(),
'LogisticRegression': TraditionalClassifier.build_lr_pipeline()
}
results = {}
for name, pipe in pipelines.items():
scores = cross_val_score(pipe, texts, labels, cv=5, scoring='f1_macro')
results[name] = {
'mean_f1': scores.mean(),
'std_f1': scores.std()
}
print(f"{name}: F1-macro = {scores.mean():.4f} (+/- {scores.std():.4f})")
best = max(results, key=lambda k: results[k]['mean_f1'])
print(f"\n最优模型: {best}")
return best, pipelines[best]
TextRNN 和 BiLSTM 分类器
import torch
import torch.nn as nn
class TextRNN(nn.Module):
"""BiLSTM 文本分类器
相比 TextCNN:
- 更适合捕捉长距离依赖
- 训练速度更慢
- 在短文本上不一定优于 TextCNN
"""
def __init__(self, vocab_size, embed_dim=128, hidden_dim=128,
num_classes=4, num_layers=2, dropout=0.5, bidirectional=True):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.lstm = nn.LSTM(
embed_dim, hidden_dim,
num_layers=num_layers,
bidirectional=bidirectional,
dropout=dropout if num_layers > 1 else 0,
batch_first=True
)
self.direction_factor = 2 if bidirectional else 1
self.fc = nn.Linear(hidden_dim * self.direction_factor, num_classes)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
# x: (B, seq_len)
embeds = self.embedding(x) # (B, L, E)
output, (hidden, cell) = self.lstm(embeds)
if self.lstm.bidirectional:
# 拼接最后一层的前向和后向隐状态
hidden = torch.cat([hidden[-2], hidden[-1]], dim=1) # (B, 2*H)
else:
hidden = hidden[-1] # (B, H)
hidden = self.dropout(hidden)
return self.fc(hidden)
model = TextRNN(vocab_size=10000, embed_dim=128, hidden_dim=128, num_classes=4)
dummy = torch.randint(0, 10000, (8, 50))
print(f"BiLSTM 输出: {model(dummy).shape}")
print(f"参数量: {sum(p.numel() for p in model.parameters()):,}")BERT 微调完整训练流程
# 完整的 BERT 微调训练脚本
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
class TextClassificationDataset(Dataset):
"""文本分类数据集"""
def __init__(self, texts, labels, tokenizer, max_length=128):
self.texts = texts
self.labels = labels
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = str(self.texts[idx])
label = self.labels[idx]
encoding = self.tokenizer(
text,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
'labels': torch.tensor(label, dtype=torch.long)
}
def train_bert(texts, labels, num_classes, model_name='bert-base-chinese',
epochs=3, batch_size=16, learning_rate=2e-5):
"""完整的 BERT 微调训练流程"""
# 划分数据集
train_texts, val_texts, train_labels, val_labels = train_test_split(
texts, labels, test_size=0.2, stratify=labels, random_state=42
)
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
model_name, num_labels=num_classes
)
# 创建数据集和数据加载器
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
# 优化器和学习率调度器
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 训练循环
for epoch in range(epochs):
model.train()
total_loss = 0
for batch in train_loader:
optimizer.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels_tensor = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels_tensor)
loss = outputs.loss
loss.backward()
# 梯度裁剪
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
total_loss += loss.item()
avg_loss = total_loss / len(train_loader)
# 验证
model.eval()
correct = 0
total = 0
with torch.no_grad():
for batch in val_loader:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels_tensor = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
preds = outputs.logits.argmax(dim=-1)
correct += (preds == labels_tensor).sum().item()
total += labels_tensor.size(0)
val_acc = correct / total
print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Val Acc: {val_acc:.4f}")
return model
层次分类实现
class HierarchicalClassifier:
"""层次分类:先预测一级类别,再预测二级类别
适用场景:
- 电商商品分类(数码 -> 手机 -> 智能手机)
- 新闻分类(科技 -> 人工智能 -> NLP)
- 工单分类(技术 -> 网络 -> DNS)
优势:层次间共享信息,减少搜索空间
劣势:错误传播(一级分类错误导致二级必然错误)
"""
def __init__(self, hierarchy):
"""
hierarchy 结构示例:
{
"科技": ["人工智能", "互联网", "硬件"],
"体育": ["足球", "篮球", "网球"],
"娱乐": ["电影", "音乐", "综艺"]
}
"""
self.hierarchy = hierarchy
self.level1_labels = list(hierarchy.keys())
self.level2_models = {} # 每个一级类别一个二级分类器
def predict(self, text, level1_prob, level2_probs):
"""层次预测"""
# 第一步:预测一级类别
level1_idx = level1_prob.argmax()
level1 = self.level1_labels[level1_idx]
# 第二步:在对应一级类别下预测二级类别
level2_labels = self.hierarchy[level1]
level2_idx = level2_probs[level1].argmax()
level2 = level2_labels[level2_idx]
return level1, level2
def predict_with_confidence(self, text, level1_prob, level2_probs, threshold=0.6):
"""带置信度阈值的层次预测"""
level1_conf = level1_prob.max()
if level1_conf < threshold:
return None, None, level1_conf
level1_idx = level1_prob.argmax()
level1 = self.level1_labels[level1_idx]
level2_labels = self.hierarchy[level1]
level2_conf = level2_probs[level1].max()
if level2_conf < threshold:
return level1, None, level2_conf
level2_idx = level2_probs[level1].argmax()
level2 = level2_labels[level2_idx]
return level1, level2, min(level1_conf, level2_conf)
# 使用示例
hierarchy = {
"科技": ["人工智能", "互联网", "硬件"],
"体育": ["足球", "篮球", "网球"],
"娱乐": ["电影", "音乐", "综艺"]
}
clf = HierarchicalClassifier(hierarchy)
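# 下面用随手构造的概率数组演示 predict 期望的输入格式(仅为示意,并非真实模型输出)
import numpy as np
level1_prob = np.array([0.7, 0.2, 0.1])  # 与 level1_labels(科技/体育/娱乐)一一对应
level2_probs = {
    "科技": np.array([0.6, 0.3, 0.1]),
    "体育": np.array([0.2, 0.5, 0.3]),
    "娱乐": np.array([0.3, 0.3, 0.4]),
}
l1, l2 = clf.predict("某公司发布新一代 AI 芯片", level1_prob, level2_probs)
print(f"层次预测结果: {l1} -> {l2}")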
print("层次分类器已创建,支持 3 个一级类别")文本预处理流水线
import re
from collections import Counter
class TextPreprocessor:
"""中文文本预处理流水线"""
def __init__(self, max_length=512, min_length=2):
self.max_length = max_length
self.min_length = min_length
def clean_text(self, text):
"""基础文本清洗"""
# 去除 HTML 标签
text = re.sub(r'<[^>]+>', '', text)
# 去除 URL
text = re.sub(r'http[s]?://\S+', '[URL]', text)
# 去除邮箱
text = re.sub(r'\S+@\S+', '[EMAIL]', text)
# 去除多余空白
text = re.sub(r'\s+', ' ', text).strip()
# 去除特殊字符(保留中文、英文、数字和基本标点)
text = re.sub(r'[^\u4e00-\u9fff\w\s,。!?、;:“”‘’()]', '', text)
return text
def filter_by_length(self, text):
"""按长度过滤"""
if len(text) < self.min_length or len(text) > self.max_length:
return None
return text
def remove_duplicates(self, texts, labels):
"""去除重复样本"""
seen = set()
unique_texts, unique_labels = [], []
for text, label in zip(texts, labels):
if text not in seen:
seen.add(text)
unique_texts.append(text)
unique_labels.append(label)
removed = len(texts) - len(unique_texts)
print(f"去重: {len(texts)} -> {len(unique_texts)} (去除 {removed} 条)")
return unique_texts, unique_labels
def analyze_distribution(self, labels):
"""分析标签分布"""
counter = Counter(labels)
total = len(labels)
print("=== 标签分布 ===")
for label, count in sorted(counter.items()):
pct = count / total * 100
bar = '#' * int(pct / 2)
print(f" 类别 {label}: {count:5d} ({pct:5.1f}%) {bar}")
# 检查不平衡程度
max_count = max(counter.values())
min_count = min(counter.values())
ratio = max_count / max(min_count, 1)
if ratio > 10:
print(f" 警告: 类别不平衡比例 {ratio:.1f}:1,建议处理")
return counter
# 使用示例
preprocessor = TextPreprocessor(max_length=512, min_length=2)
sample = "<p>这个产品非常好用!</p> 联系 email@test.com"
print(f"清洗结果: {preprocessor.clean_text(sample)}")预处理流水线要点:
- 清洗规则要根据业务场景定制(保留什么、过滤什么)
- 长度过滤:太短的文本信息不足,太长的文本增加计算开销
- 去重:完全相同的文本重复标注浪费资源,还可能导致数据泄露
- 分布分析:必须在划分训练/测试集之前做,了解整体数据情况
- BERT 类模型通常只需要做基础清洗,分词交给 tokenizer
总结
文本分类是 NLP 落地最直接的方式,掌握数据标注、模型选择、评估分析和长尾处理是构建高质量分类系统的关键。不要小看分类任务——很多看似简单的场景,在数据噪声、类别重叠和长尾分布上都有深层挑战。
关键知识点
- BERT 微调的分类头通常只在 [CLS] token 的表示上接一个线性层
- 多标签分类使用 sigmoid + BCE loss,多分类使用 softmax + CrossEntropy loss
- 数据划分要保证训练/验证/测试集无数据泄露,且标签分布一致
- F1-macro 关注每个类的平等表现,F1-weighted 关注总体表现,选择取决于业务目标
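下面用一个手工构造的不平衡小例子直观对比 F1-macro 与 F1-weighted:

```python
from sklearn.metrics import f1_score

# 类别 0 占 9 成,类别 1 只有 1 条且被预测错误
y_true = [0] * 9 + [1]
y_pred = [0] * 10

print(f1_score(y_true, y_pred, average='macro'))     # 约 0.47,少数类的失败被平等计入
print(f1_score(y_true, y_pred, average='weighted'))  # 约 0.85,被多数类拉高
```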
项目落地视角
- 先用小规模标注数据跑通完整管线,再扩大标注规模
- 建立 golden set(固定评估集),每次迭代都评估其上的表现变化(见下方示例)
- 上线后持续监控线上数据的分布变化,防止数据漂移导致效果下降
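关于 golden set 的做法,下面是一个最小的回归检查示意(文件路径、列名和上一版分数均为假设,需按项目实际情况替换):

```python
import pandas as pd
from sklearn.metrics import f1_score

def check_golden_set(model, path="golden_set.csv", last_f1=0.90):
    """在固定评估集上对比本次迭代与上一版的 F1-macro(路径与阈值均为示例)"""
    df = pd.read_csv(path)                      # 假设包含 text / label 两列
    preds = model.predict(df["text"].tolist())  # 假设模型暴露 predict 接口
    f1 = f1_score(df["label"], preds, average="macro")
    print(f"golden set F1-macro: {f1:.4f} (上一版: {last_f1:.4f})")
    return f1 >= last_f1  # 低于上一版则阻止发布,转人工排查
```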
常见误区
- 只看整体准确率不看各类别 F1——不平衡数据下准确率有误导性
- 标注数据不够时就上大模型——先用传统 ML 基线评估标注质量和任务难度
- 忽略标签噪声——标注不一致的样本会显著拉低模型上限
进阶路线
- 学习主动学习(Active Learning)策略,高效利用标注预算
- 掌握 Few-shot 和 Zero-shot 分类方法,降低对标注数据的依赖(zero-shot 的最小示例见本节末尾)
- 探索 Prompt-based 分类,将分类任务转化为填空任务
- 了解领域自适应(Domain Adaptation)技术处理跨领域迁移
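作为参考,下面是一个基于 NLI 的 zero-shot 分类最小示意(模型名仅为举例,任一支持中文的 NLI 模型均可,需自行确认可用性):

```python
from transformers import pipeline

# zero-shot-classification pipeline 把分类转化为文本蕴含判断
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",  # 假设:一个多语言 NLI 模型
)
result = classifier("这个产品非常好用", candidate_labels=["正面", "中性", "负面"])
print(result["labels"][0], result["scores"][0])
```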
适用场景
- 情感分析、意图识别、垃圾内容过滤
- 新闻分类、工单路由、商品类别预测
- 内容审核、合规检测、风险评估
落地建议
- 标注前先定义清晰的标注规范,包含边界样例和冲突处理规则
- 优先使用 BERT 微调作为基线,效果不足时再考虑更大模型
- 部署时配合置信度阈值,低置信度结果路由到人工审核
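低置信度路由的逻辑可以非常简单,下面是一个示意(阈值 0.8 为假设值,需结合业务在验证集上校准):

```python
import torch

def route(logits, threshold=0.8):
    """按最大 softmax 概率路由:高于阈值自动通过,否则标记转人工审核"""
    probs = torch.softmax(logits, dim=-1)
    conf, pred = probs.max(dim=-1)
    results = []
    for p, c in zip(pred.tolist(), conf.tolist()):
        results.append({"label": p, "confidence": c, "need_review": c < threshold})
    return results
```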
排错清单
- 准确率不高于随机基线:检查标签是否正确、模型是否在学习
- 训练集效果差:增大模型容量或检查数据标注质量
- 验证集效果差但训练集好:增加正则化、数据增强或减少模型容量
复盘问题
- 各类别的 F1 分数是否均衡?哪些类别效果最差?
- 当前标注数据量和质量是否足够?是否需要追加标注?
- 线上效果和离线评估的差异有多大?可能的原因是什么?
