NLP 文本分类
简介
文本分类是 NLP 最基础也最实用的任务之一,将输入文本映射到预定义的类别标签。从情感分析、垃圾邮件过滤到意图识别和内容审核,文本分类在企业场景中应用广泛。掌握从数据准备、模型选择到评估优化的完整流程,是落地 NLP 项目的核心能力。
文本分类的方法从早期的规则方法和统计方法(朴素贝叶斯、SVM),发展到深度学习时代的 TextCNN、FastText,再到预训练模型时代的 BERT 微调和 Prompt-based 分类。每一次演进都带来了效果的大幅提升,但同时也增加了计算成本,在实际项目中需要在效果、成本和延迟之间做出权衡。
文本分类的核心挑战不仅仅是模型选择,还包括:标签体系的定义是否合理、标注数据的质量和一致性、类别不平衡问题、以及线上部署时的延迟要求。一个被低估的事实是:在很多实际项目中,数据质量对最终效果的影响比模型选择更大。
特点
分类任务的类型
- 二分类:正面/负面、垃圾/非垃圾、通过/拒绝
- 多分类:情感(正面/中性/负面)、新闻类别(体育/科技/娱乐/...)
- 多标签分类:一篇文档可以属于多个类别(如一篇论文可以同时属于 AI 和 NLP)
- 层次分类:一级类别下还有二级类别(如科技 -> AI -> 计算机视觉)
- 层次多标签:层次分类 + 多标签的组合
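不同任务类型的主要差别体现在标签表示和损失函数上,下面用一个最小示意做对比(变量名仅为示例):

```python
import torch

# 多分类:每个样本一个整数标签,配 softmax + CrossEntropyLoss
y_multiclass = torch.tensor([2, 0, 1])

# 多标签:每个样本一个 0/1 向量,配 sigmoid + BCEWithLogitsLoss
y_multilabel = torch.tensor([[1, 0, 1],
                             [0, 1, 0]], dtype=torch.float)

# 层次分类:标签是 (一级类别, 二级类别) 的组合,通常逐级建模
y_hierarchical = [("科技", "人工智能"), ("体育", "足球")]
```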
实现
# 示例1:基于 TextCNN 的文本分类模型
import torch
import torch.nn as nn
import torch.nn.functional as F
class TextCNN(nn.Module):
def __init__(self, vocab_size, embed_dim=128, num_classes=2,
filter_sizes=[2, 3, 4], num_filters=128, dropout=0.5):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.convs = nn.ModuleList([
nn.Conv1d(embed_dim, num_filters, fs) for fs in filter_sizes
])
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
def forward(self, x):
# x: (batch, seq_len)
embeds = self.embedding(x) # (B, L, E)
embeds = embeds.permute(0, 2, 1) # (B, E, L) 适配 Conv1d
conv_outputs = []
for conv in self.convs:
c = F.relu(conv(embeds)) # (B, num_filters, L-fs+1)
c = F.max_pool1d(c, c.size(2)).squeeze(2) # (B, num_filters)
conv_outputs.append(c)
out = torch.cat(conv_outputs, dim=1) # (B, num_filters * len(filter_sizes))
out = self.dropout(out)
return self.fc(out)
model = TextCNN(vocab_size=10000, embed_dim=128, num_classes=4)
dummy = torch.randint(0, 10000, (8, 50))
print(f"TextCNN 输出: {model(dummy).shape}")
print(f"参数量: {sum(p.numel() for p in model.parameters()):,}")FastText 文本分类
import torch
import torch.nn as nn
class FastTextClassifier(nn.Module):
"""FastText 风格的文本分类器
FastText 的核心思想:将文本表示为词向量的平均值,然后通过全连接层分类。
优点:简单、快速、效果在短文本上不错。
"""
def __init__(self, vocab_size, embed_dim=100, num_classes=4):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.fc = nn.Linear(embed_dim, num_classes)
self.dropout = nn.Dropout(0.3)
def forward(self, x):
# x: (B, seq_len)
embeds = self.embedding(x) # (B, seq_len, embed_dim)
# 平均池化(忽略 padding)
mask = (x != 0).unsqueeze(-1).float() # (B, seq_len, 1)
pooled = (embeds * mask).sum(dim=1) / mask.sum(dim=1) # (B, embed_dim)
pooled = self.dropout(pooled)
return self.fc(pooled)
model = FastTextClassifier(vocab_size=10000, embed_dim=100, num_classes=4)
print(f"FastText 参数量: {sum(p.numel() for p in model.parameters()):,}")# 示例2:基于 BERT 的文本分类微调
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained(
'bert-base-chinese',
num_labels=3,
)
# 准备数据
texts = ["这个产品非常好用", "质量太差了,不推荐", "还行吧,一般般"]
labels = [2, 0, 1] # 假设 0=负面, 1=中性, 2=正面
encodings = tokenizer(
texts, truncation=True, padding=True,
max_length=128, return_tensors='pt'
)
class ClassificationDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
item = {k: v[idx] for k, v in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
dataset = ClassificationDataset(encodings, labels)
# 训练参数配置
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=3,
per_device_train_batch_size=16,
learning_rate=2e-5,
weight_decay=0.01,
warmup_ratio=0.1,
evaluation_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True,
)
print("BERT 微调配置完成,实际训练需要更多数据")
print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")多标签分类
import torch
import torch.nn as nn
class MultiLabelClassifier(nn.Module):
"""多标签分类:一篇文本可以属于多个类别
与多分类的区别:
- 多分类: softmax + CrossEntropyLoss,每个样本属于一个类别
- 多标签: sigmoid + BCEWithLogitsLoss,每个类别独立判断
"""
def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
self.fc1 = nn.Linear(embed_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, num_labels)
def forward(self, x):
embeds = self.embedding(x).mean(dim=1)
out = torch.relu(self.fc1(embeds))
return self.fc2(out) # 不加 sigmoid,BCEWithLogitsLoss 内置
# 多标签分类的损失函数和评估
def multilabel_example():
"""多标签分类示例"""
num_labels = 5
criterion = nn.BCEWithLogitsLoss()
# 模拟输出和标签
logits = torch.randn(4, num_labels) # (batch, num_labels)
# 多标签:标签是 0/1 矩阵
targets = torch.tensor([
[1, 0, 1, 0, 0],
[0, 1, 1, 0, 1],
[1, 1, 0, 0, 0],
[0, 0, 0, 1, 1],
], dtype=torch.float)
loss = criterion(logits, targets)
# 预测:对每个类别独立 sigmoid,阈值 0.5
probs = torch.sigmoid(logits)
predictions = (probs > 0.5).int()
print(f"多标签损失: {loss.item():.4f}")
print(f"预测: {predictions}")
print(f"真实: {targets.int()}")
multilabel_example()
# 示例3:完整的评估和错误分析
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
def evaluate_classifier(y_true, y_pred, label_names=None):
"""全面的分类评估"""
report = classification_report(y_true, y_pred, target_names=label_names)
print("=== 分类报告 ===")
print(report)
cm = confusion_matrix(y_true, y_pred)
print("=== 混淆矩阵 ===")
print(cm)
# 找出最容易混淆的类别对
np.fill_diagonal(cm, 0)
most_confused = np.unravel_index(np.argmax(cm), cm.shape)
print(f"\n最容易混淆的类别对: {most_confused}")
return report
def error_analysis(texts, y_true, y_pred, probs, label_names, top_k=10):
"""分析高置信度错误预测"""
errors = []
for i, (text, true, pred, prob) in enumerate(zip(texts, y_true, y_pred, probs)):
if true != pred:
confidence = prob[pred]
errors.append((i, text, true, pred, confidence))
# 按置信度排序,高置信度错误最值得关注
errors.sort(key=lambda x: x[4], reverse=True)
print(f"\n=== Top {top_k} 高置信度错误 ===")
for idx, text, true, pred, conf in errors[:top_k]:
print(f" [{idx}] '{text[:30]}...' 真实={label_names[true]}, "
f"预测={label_names[pred]}, 置信度={conf:.3f}")
# 模拟评估
y_true = [0, 1, 2, 0, 1, 2, 0, 1, 2, 1]
y_pred = [0, 1, 1, 0, 2, 2, 0, 1, 1, 1]
evaluate_classifier(y_true, y_pred, label_names=['负面', '中性', '正面'])
# 示例4:数据不平衡处理策略
from torch.utils.data import WeightedRandomSampler
import torch
import numpy as np
def handle_imbalance(labels, strategy='oversample'):
"""处理类别不平衡"""
label_counts = np.bincount(labels)
total = len(labels)
num_classes = len(label_counts)
print(f"类别分布: {dict(enumerate(label_counts))}")
if strategy == 'oversample':
# 过采样:少数类样本权重更高
class_weights = total / (num_classes * label_counts)
sample_weights = [class_weights[l] for l in labels]
sampler = WeightedRandomSampler(sample_weights, num_samples=total, replacement=True)
print(f"各类权重: {class_weights}")
return sampler
elif strategy == 'class_weight':
# 损失函数加权
class_weights = torch.tensor(total / (num_classes * label_counts), dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
print(f"损失函数权重: {class_weights}")
return criterion
labels = [0]*100 + [1]*500 + [2]*50 # 严重不平衡
sampler = handle_imbalance(labels, strategy='oversample')
criterion = handle_imbalance(labels, strategy='class_weight')
数据增强策略
def text_augmentation_strategies():
"""文本数据增强方法
1. 同义词替换:随机替换词语为同义词
2. 回译增强:翻译为外语再翻译回中文
3. 随机删除:随机删除部分词语
4. 随机交换:随机交换相邻词语的位置
5. EDA (Easy Data Augmentation):上述方法的组合
注意事项:
- 增强后的文本需要保持语义不变
- 增强强度不宜过大,否则可能改变语义
- NLP 的数据增强效果不如 CV 中的数据增强显著
- 对预训练模型微调,数据增强的收益较小
"""
import random
def random_deletion(text, p=0.1):
"""随机删除词语"""
words = text.split()
if len(words) == 1:
return text
return ' '.join(w for w in words if random.random() > p)
def random_swap(text, n=1):
"""随机交换相邻词语"""
words = text.split()
for _ in range(n):
if len(words) < 2:
break
idx = random.randint(0, len(words) - 2)
words[idx], words[idx+1] = words[idx+1], words[idx]
return ' '.join(words)
text = "这个产品质量非常好值得购买"
print(f"原文: {text}")
print(f"随机删除: {random_deletion(text, p=0.2)}")
print(f"随机交换: {random_swap(text, n=2)}")
text_augmentation_strategies()
模型选择指南
def model_selection_guide():
"""文本分类模型选择指南
数据量 -> 推荐模型:
- < 1000 条: 朴素贝叶斯 / SVM + TF-IDF
- 1000 - 10000 条: TextCNN / FastText
- 10000 - 100000 条: BERT 微调
- > 100000 条: 大模型微调 / RoBERTa / ERNIE
延迟要求:
- < 10ms: FastText / TF-IDF + LR
- < 50ms: TextCNN / DistilBERT
- < 200ms: BERT / RoBERTa
- 不敏感: 大模型 API
精度要求:
- 85%+: 传统 ML 即可
- 90%+: TextCNN / BERT 微调
- 95%+: 大模型 + 精细调优
"""
print("模型选择决策树:")
print(" 数据量 < 1K + 需要快速上线: FastText / SVM")
print(" 数据量 1K-10K + 追求精度: TextCNN / BERT")
print(" 数据量 > 10K + 领域专业: 领域预训练模型微调")
print(" 延迟敏感 (< 50ms): TextCNN / DistilBERT")
print(" 多标签分类: BERT + BCE Loss")
model_selection_guide()
传统机器学习基线
# 传统 ML 方法:朴素贝叶斯 + SVM 基线
# 在数据量少时,传统方法往往是最实际的起点
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
class TraditionalClassifier:
"""传统文本分类器集合"""
@staticmethod
def build_nb_pipeline():
"""朴素贝叶斯 + TF-IDF"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=50000,
ngram_range=(1, 2), # 使用 unigram + bigram
min_df=2, # 最小文档频率
max_df=0.95, # 最大文档频率(过滤停用词级别的词)
sublinear_tf=True # 使用 1 + log(tf) 替代原始 tf
)),
('clf', MultinomialNB(alpha=0.1)) # alpha 拉普拉斯平滑
])
@staticmethod
def build_svm_pipeline():
"""SVM + TF-IDF(文本分类经典方案)"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=100000,
ngram_range=(1, 3),
sublinear_tf=True
)),
('clf', LinearSVC(
C=1.0,
max_iter=2000,
class_weight='balanced' # 自动平衡类别权重
))
])
@staticmethod
def build_lr_pipeline():
"""逻辑回归 + TF-IDF(可输出概率)"""
return Pipeline([
('tfidf', TfidfVectorizer(
max_features=100000,
ngram_range=(1, 2),
sublinear_tf=True
)),
('clf', LogisticRegression(
C=1.0,
max_iter=1000,
class_weight='balanced',
solver='lbfgs',
multi_class='multinomial'
))
])
# 交叉验证选择最优模型
def select_best_model(texts, labels):
"""通过交叉验证选择最优传统模型"""
pipelines = {
'NaiveBayes': TraditionalClassifier.build_nb_pipeline(),
'SVM': TraditionalClassifier.build_svm_pipeline(),
'LogisticRegression': TraditionalClassifier.build_lr_pipeline()
}
results = {}
for name, pipe in pipelines.items():
scores = cross_val_score(pipe, texts, labels, cv=5, scoring='f1_macro')
results[name] = {
'mean_f1': scores.mean(),
'std_f1': scores.std()
}
print(f"{name}: F1-macro = {scores.mean():.4f} (+/- {scores.std():.4f})")
best = max(results, key=lambda k: results[k]['mean_f1'])
print(f"\n最优模型: {best}")
return best, pipelines[best]
TextRNN 和 BiLSTM 分类器
import torch
import torch.nn as nn
class TextRNN(nn.Module):
"""BiLSTM 文本分类器
相比 TextCNN:
- 更适合捕捉长距离依赖
- 训练速度更慢
- 在短文本上不一定优于 TextCNN
"""
def __init__(self, vocab_size, embed_dim=128, hidden_dim=128,
num_classes=4, num_layers=2, dropout=0.5, bidirectional=True):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.lstm = nn.LSTM(
embed_dim, hidden_dim,
num_layers=num_layers,
bidirectional=bidirectional,
dropout=dropout if num_layers > 1 else 0,
batch_first=True
)
self.direction_factor = 2 if bidirectional else 1
self.fc = nn.Linear(hidden_dim * self.direction_factor, num_classes)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
# x: (B, seq_len)
embeds = self.embedding(x) # (B, L, E)
output, (hidden, cell) = self.lstm(embeds)
if self.lstm.bidirectional:
# 拼接最后一层的前向和后向隐状态
hidden = torch.cat([hidden[-2], hidden[-1]], dim=1) # (B, 2*H)
else:
hidden = hidden[-1] # (B, H)
hidden = self.dropout(hidden)
return self.fc(hidden)
model = TextRNN(vocab_size=10000, embed_dim=128, hidden_dim=128, num_classes=4)
dummy = torch.randint(0, 10000, (8, 50))
print(f"BiLSTM 输出: {model(dummy).shape}")
print(f"参数量: {sum(p.numel() for p in model.parameters()):,}")BERT 微调完整训练流程
# 完整的 BERT 微调训练脚本
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
class TextClassificationDataset(Dataset):
"""文本分类数据集"""
def __init__(self, texts, labels, tokenizer, max_length=128):
self.texts = texts
self.labels = labels
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = str(self.texts[idx])
label = self.labels[idx]
encoding = self.tokenizer(
text,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
'labels': torch.tensor(label, dtype=torch.long)
}
def train_bert(texts, labels, num_classes, model_name='bert-base-chinese',
epochs=3, batch_size=16, learning_rate=2e-5):
"""完整的 BERT 微调训练流程"""
# 划分数据集
train_texts, val_texts, train_labels, val_labels = train_test_split(
texts, labels, test_size=0.2, stratify=labels, random_state=42
)
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
model_name, num_labels=num_classes
)
# 创建数据集和数据加载器
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
# 优化器和学习率调度器
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 训练循环
for epoch in range(epochs):
model.train()
total_loss = 0
for batch in train_loader:
optimizer.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels_tensor = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels_tensor)
loss = outputs.loss
loss.backward()
# 梯度裁剪
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
total_loss += loss.item()
avg_loss = total_loss / len(train_loader)
# 验证
model.eval()
correct = 0
total = 0
with torch.no_grad():
for batch in val_loader:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels_tensor = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
preds = outputs.logits.argmax(dim=-1)
correct += (preds == labels_tensor).sum().item()
total += labels_tensor.size(0)
val_acc = correct / total
print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Val Acc: {val_acc:.4f}")
return model
层次分类实现
class HierarchicalClassifier:
"""层次分类:先预测一级类别,再预测二级类别
适用场景:
- 电商商品分类(数码 -> 手机 -> 智能手机)
- 新闻分类(科技 -> 人工智能 -> NLP)
- 工单分类(技术 -> 网络 -> DNS)
优势:层次间共享信息,减少搜索空间
劣势:错误传播(一级分类错误导致二级必然错误)
"""
def __init__(self, hierarchy):
"""
hierarchy 结构示例:
{
"科技": ["人工智能", "互联网", "硬件"],
"体育": ["足球", "篮球", "网球"],
"娱乐": ["电影", "音乐", "综艺"]
}
"""
self.hierarchy = hierarchy
self.level1_labels = list(hierarchy.keys())
self.level2_models = {} # 每个一级类别一个二级分类器
def predict(self, text, level1_prob, level2_probs):
"""层次预测"""
# 第一步:预测一级类别
level1_idx = level1_prob.argmax()
level1 = self.level1_labels[level1_idx]
# 第二步:在对应一级类别下预测二级类别
level2_labels = self.hierarchy[level1]
level2_idx = level2_probs[level1].argmax()
level2 = level2_labels[level2_idx]
return level1, level2
def predict_with_confidence(self, text, level1_prob, level2_probs, threshold=0.6):
"""带置信度阈值的层次预测"""
level1_conf = level1_prob.max()
if level1_conf < threshold:
return None, None, level1_conf
level1_idx = level1_prob.argmax()
level1 = self.level1_labels[level1_idx]
level2_labels = self.hierarchy[level1]
level2_conf = level2_probs[level1].max()
if level2_conf < threshold:
return level1, None, level2_conf
level2_idx = level2_probs[level1].argmax()
level2 = level2_labels[level2_idx]
return level1, level2, min(level1_conf, level2_conf)
# 使用示例
hierarchy = {
"科技": ["人工智能", "互联网", "硬件"],
"体育": ["足球", "篮球", "网球"],
"娱乐": ["电影", "音乐", "综艺"]
}
clf = HierarchicalClassifier(hierarchy)
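# 下面用随手构造的概率数组演示 predict 期望的输入格式(仅为示意,并非真实模型输出)
import numpy as np
level1_prob = np.array([0.7, 0.2, 0.1])  # 与 level1_labels(科技/体育/娱乐)一一对应
level2_probs = {
    "科技": np.array([0.6, 0.3, 0.1]),
    "体育": np.array([0.2, 0.5, 0.3]),
    "娱乐": np.array([0.3, 0.3, 0.4]),
}
l1, l2 = clf.predict("某公司发布新一代 AI 芯片", level1_prob, level2_probs)
print(f"层次预测结果: {l1} -> {l2}")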
print("层次分类器已创建,支持 3 个一级类别")文本预处理流水线
import re
from collections import Counter
class TextPreprocessor:
"""中文文本预处理流水线"""
def __init__(self, max_length=512, min_length=2):
self.max_length = max_length
self.min_length = min_length
def clean_text(self, text):
"""基础文本清洗"""
# 去除 HTML 标签
text = re.sub(r'<[^>]+>', '', text)
# 去除 URL
text = re.sub(r'http[s]?://\S+', '[URL]', text)
# 去除邮箱
text = re.sub(r'\S+@\S+', '[EMAIL]', text)
# 去除多余空白
text = re.sub(r'\s+', ' ', text).strip()
# 去除特殊字符(保留中文、英文、数字和基本标点)
text = re.sub(r'[^\u4e00-\u9fff\w\s,。!?、;:“”‘’()]', '', text)
return text
def filter_by_length(self, text):
"""按长度过滤"""
if len(text) < self.min_length or len(text) > self.max_length:
return None
return text
def remove_duplicates(self, texts, labels):
"""去除重复样本"""
seen = set()
unique_texts, unique_labels = [], []
for text, label in zip(texts, labels):
if text not in seen:
seen.add(text)
unique_texts.append(text)
unique_labels.append(label)
removed = len(texts) - len(unique_texts)
print(f"去重: {len(texts)} -> {len(unique_texts)} (去除 {removed} 条)")
return unique_texts, unique_labels
def analyze_distribution(self, labels):
"""分析标签分布"""
counter = Counter(labels)
total = len(labels)
print("=== 标签分布 ===")
for label, count in sorted(counter.items()):
pct = count / total * 100
bar = '#' * int(pct / 2)
print(f" 类别 {label}: {count:5d} ({pct:5.1f}%) {bar}")
# 检查不平衡程度
max_count = max(counter.values())
min_count = min(counter.values())
ratio = max_count / max(min_count, 1)
if ratio > 10:
print(f" 警告: 类别不平衡比例 {ratio:.1f}:1,建议处理")
return counter
# 使用示例
preprocessor = TextPreprocessor(max_length=512, min_length=2)
sample = "<p>这个产品非常好用!</p> 联系 email@test.com"
print(f"清洗结果: {preprocessor.clean_text(sample)}")预处理流水线要点:
- 清洗规则要根据业务场景定制(保留什么、过滤什么)
- 长度过滤:太短的文本信息不足,太长的文本增加计算开销
- 去重:完全相同的文本重复标注浪费资源,还可能导致数据泄露
- 分布分析:必须在划分训练/测试集之前做,了解整体数据情况
- BERT 类模型通常只需要做基础清洗,分词交给 tokenizer
总结
文本分类是 NLP 落地最直接的方式,掌握数据标注、模型选择、评估分析和长尾处理是构建高质量分类系统的关键。不要小看分类任务——很多看似简单的场景,在数据噪声、类别重叠和长尾分布上都有深层挑战。
关键知识点
- BERT 微调的分类头通常只在 [CLS] token 的表示上接一个线性层
- 多标签分类使用 sigmoid + BCE loss,多分类使用 softmax + CrossEntropy loss
- 数据划分要保证训练/验证/测试集无数据泄露,且标签分布一致
- F1-macro 关注每个类的平等表现,F1-weighted 关注总体表现,选择取决于业务目标
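下面用一个手工构造的不平衡小例子直观对比 F1-macro 与 F1-weighted:

```python
from sklearn.metrics import f1_score

# 类别 0 占 9 成,类别 1 只有 1 条且被预测错误
y_true = [0] * 9 + [1]
y_pred = [0] * 10

print(f1_score(y_true, y_pred, average='macro'))     # 约 0.47,少数类的失败被平等计入
print(f1_score(y_true, y_pred, average='weighted'))  # 约 0.85,被多数类拉高
```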
项目落地视角
- 先用小规模标注数据跑通完整管线,再扩大标注规模
- 建立 golden set(固定评估集),每次迭代都评估其上的表现变化(见下方示例)
- 上线后持续监控线上数据的分布变化,防止数据漂移导致效果下降
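关于 golden set 的做法,下面是一个最小的回归检查示意(文件路径、列名和上一版分数均为假设,需按项目实际情况替换):

```python
import pandas as pd
from sklearn.metrics import f1_score

def check_golden_set(model, path="golden_set.csv", last_f1=0.90):
    """在固定评估集上对比本次迭代与上一版的 F1-macro(路径与阈值均为示例)"""
    df = pd.read_csv(path)                      # 假设包含 text / label 两列
    preds = model.predict(df["text"].tolist())  # 假设模型暴露 predict 接口
    f1 = f1_score(df["label"], preds, average="macro")
    print(f"golden set F1-macro: {f1:.4f} (上一版: {last_f1:.4f})")
    return f1 >= last_f1  # 低于上一版则阻止发布,转人工排查
```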
常见误区
- 只看整体准确率不看各类别 F1——不平衡数据下准确率有误导性
- 标注数据不够时就上大模型——先用传统 ML 基线评估标注质量和任务难度
- 忽略标签噪声——标注不一致的样本会显著拉低模型上限
进阶路线
- 学习主动学习(Active Learning)策略,高效利用标注预算
- 掌握 Few-shot 和 Zero-shot 分类方法,降低对标注数据的依赖(zero-shot 的最小示例见本节末尾)
- 探索 Prompt-based 分类,将分类任务转化为填空任务
- 了解领域自适应(Domain Adaptation)技术处理跨领域迁移
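作为参考,下面是一个基于 NLI 的 zero-shot 分类最小示意(模型名仅为举例,任一支持中文的 NLI 模型均可,需自行确认可用性):

```python
from transformers import pipeline

# zero-shot-classification pipeline 把分类转化为文本蕴含判断
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",  # 假设:一个多语言 NLI 模型
)
result = classifier("这个产品非常好用", candidate_labels=["正面", "中性", "负面"])
print(result["labels"][0], result["scores"][0])
```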
适用场景
- 情感分析、意图识别、垃圾内容过滤
- 新闻分类、工单路由、商品类别预测
- 内容审核、合规检测、风险评估
落地建议
- 标注前先定义清晰的标注规范,包含边界样例和冲突处理规则
- 优先使用 BERT 微调作为基线,效果不足时再考虑更大模型
- 部署时配合置信度阈值,低置信度结果路由到人工审核
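低置信度路由的逻辑可以非常简单,下面是一个示意(阈值 0.8 为假设值,需结合业务在验证集上校准):

```python
import torch

def route(logits, threshold=0.8):
    """按最大 softmax 概率路由:高于阈值自动通过,否则标记转人工审核"""
    probs = torch.softmax(logits, dim=-1)
    conf, pred = probs.max(dim=-1)
    results = []
    for p, c in zip(pred.tolist(), conf.tolist()):
        results.append({"label": p, "confidence": c, "need_review": c < threshold})
    return results
```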
排错清单
- 准确率不高于随机基线:检查标签是否正确、模型是否在学习
- 训练集效果差:增大模型容量或检查数据标注质量
- 验证集效果差但训练集好:增加正则化、数据增强或减少模型容量
复盘问题
- 各类别的 F1 分数是否均衡?哪些类别效果最差?
- 当前标注数据量和质量是否足够?是否需要追加标注?
- 线上效果和离线评估的差异有多大?可能的原因是什么?
