Python scikit-learn in Practice
Introduction
scikit-learn (sklearn) is Python's most established machine learning library, covering classification, regression, clustering, dimensionality reduction, feature engineering, and model evaluation. With its uniform API (fit/predict/transform) and excellent documentation, it is the first choice both for learning machine learning and for production use.
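That uniform interface can be shown in a minimal sketch, using the bundled iris dataset (the hyperparameters here are illustrative): transformers expose fit/transform, estimators expose fit/predict.
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
# Transformer: fit learns the statistics, transform applies them
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Estimator: fit trains, predict infers
clf = LogisticRegression(max_iter=200)
clf.fit(X_scaled, y)
print(clf.predict(X_scaled[:5]))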
Implementation
Classification: Customer Churn Prediction
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
# Prepare data (simulated customer records)
np.random.seed(42)
n_samples = 2000
data = pd.DataFrame({
    "age": np.random.randint(18, 70, n_samples),
    "tenure_months": np.random.randint(1, 72, n_samples),
    "monthly_charges": np.round(np.random.uniform(20, 120, n_samples), 2),
    "contract_type": np.random.choice(["monthly", "annual", "two-year"], n_samples, p=[0.5, 0.3, 0.2]),
    "has_internet": np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
    "support_calls": np.random.poisson(1.5, n_samples),
})
# Derived column (computed after construction; a lambda inside the DataFrame literal would not work)
data["total_charges"] = data["tenure_months"] * data["monthly_charges"]
# Label: churn if many support calls, or a monthly contract with high charges
data["churned"] = (
    (data["support_calls"] > 3)
    | ((data["contract_type"] == "monthly") & (data["monthly_charges"] > 80))
).astype(int)
# Feature engineering: encode the contract type (OrdinalEncoder is the feature-oriented alternative to LabelEncoder)
le = LabelEncoder()
data["contract_encoded"] = le.fit_transform(data["contract_type"])
features = ["age", "tenure_months", "monthly_charges", "total_charges",
"contract_encoded", "has_internet", "support_calls"]
X = data[features]
y = data["churned"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Compare models
models = {
"Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
"Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}
for name, model in models.items():
    pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="f1")
    print(f"{name}: F1={scores.mean():.3f} (+/- {scores.std():.3f})")
# Detailed evaluation of the best model
best_pipe = Pipeline([
("scaler", StandardScaler()),
("model", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])
best_pipe.fit(X_train, y_train)
y_pred = best_pipe.predict(X_test)
y_prob = best_pipe.predict_proba(X_test)[:, 1]
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=["未流失", "已流失"]))
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.3f}")回归:房价预测与超参调优
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
# Load the dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target
feature_names = housing.feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Pipeline + grid search
pipe = Pipeline([
("scaler", StandardScaler()),
("model", GradientBoostingRegressor(random_state=42)),
])
param_grid = {
"model__n_estimators": [100, 200],
"model__max_depth": [3, 5, 7],
"model__learning_rate": [0.05, 0.1],
}
grid_search = GridSearchCV(pipe, param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳 CV 分数 (RMSE): {np.sqrt(-grid_search.best_score_):.4f}")
# Test-set evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"\n测试集评估:")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f" MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f" R2: {r2_score(y_test, y_pred):.4f}")
# Feature importances
importances = best_model.named_steps["model"].feature_importances_
for name, imp in sorted(zip(feature_names, importances), key=lambda x: -x[1]):
    print(f"  {name}: {imp:.4f}")
Clustering and Dimensionality Reduction: Customer Segmentation
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Simulate customer spending data
np.random.seed(42)
n_customers = 1000
customer_data = np.column_stack([
    np.random.lognormal(4, 1, n_customers),   # annual spend
    np.random.randint(1, 50, n_customers),    # purchase frequency
    np.random.uniform(50, 500, n_customers),  # average order value
    np.random.uniform(1, 365, n_customers),   # days since last purchase
])
# Standardize
scaler = StandardScaler()
data_scaled = scaler.fit_transform(customer_data)
# Choose K (elbow method + silhouette score)
inertias = []
silhouettes = []
K_range = range(2, 10)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(data_scaled)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(data_scaled, labels))
best_k = K_range[np.argmax(silhouettes)]
print(f"最佳 K 值: {best_k}, 轮廓系数: {max(silhouettes):.3f}")
# K-Means clustering
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
labels = kmeans.fit_predict(data_scaled)
# PCA projection to 2-D for visualization
pca = PCA(n_components=2)
data_2d = pca.fit_transform(data_scaled)
print(f"PCA 解释方差比: {pca.explained_variance_ratio_}")
print(f"累计解释方差: {sum(pca.explained_variance_ratio_):.3f}")
# Profile each cluster
for cluster_id in range(best_k):
    mask = labels == cluster_id
    cluster_data = customer_data[mask]
    print(f"\nCluster {cluster_id} ({mask.sum()} customers):")
    print(f"  avg annual spend: {cluster_data[:, 0].mean():.0f}")
    print(f"  avg purchase frequency: {cluster_data[:, 1].mean():.1f}")
    print(f"  avg order value: {cluster_data[:, 2].mean():.0f}")
    print(f"  avg days since last purchase: {cluster_data[:, 3].mean():.0f}")
# DBSCAN density-based clustering for comparison
dbscan = DBSCAN(eps=0.8, min_samples=10)
db_labels = dbscan.fit_predict(data_scaled)
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print(f"\nDBSCAN 发现 {n_clusters} 个簇,噪声点 {(db_labels == -1).sum()} 个")文本分类 Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import numpy as np
# Simulated news headlines
texts = [
    "Python 3.12 released with notable performance gains",
    "New electric vehicle exceeds 800 km of range",
    "Deep learning models achieve a breakthrough in image recognition",
    "Stocks rally today with tech shares leading",
    "Machine learning algorithms improve recommendation quality",
    "International oil prices keep climbing",
    "Rapid progress in natural language processing",
    "EV sales hit a record high",
    "PyTorch 2.0 speeds up model training",
    "Central bank cuts rates to stimulate growth",
]
categories = ["tech", "auto", "tech", "finance", "tech", "finance", "tech", "auto", "tech", "finance"]
# Duplicate the samples to enlarge the toy dataset
texts = texts * 50
categories = categories * 50
# TF-IDF + linear SVM text-classification Pipeline
text_clf = Pipeline([
("tfidf", TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2),
sublinear_tf=True,
)),
("clf", LinearSVC(random_state=42, max_iter=1000)),
])
# Cross-validation
scores = cross_val_score(text_clf, texts, categories, cv=5, scoring="accuracy")
print(f"文本分类准确率: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Train and predict on new texts
text_clf.fit(texts, categories)
new_texts = ["大模型推理成本下降", "锂电池技术突破", "金融市场波动加剧"]
predictions = text_clf.predict(new_texts)
for text, pred in zip(new_texts, predictions):
    print(f"  '{text}' -> {pred}")
Feature Engineering and Preprocessing Pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler,
OneHotEncoder, OrdinalEncoder, LabelEncoder,
FunctionTransformer, KBinsDiscretizer,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Simulate a realistic dataset
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    "age": np.random.randint(18, 70, n),
    "income": np.random.lognormal(10.5, 0.8, n),
    "city": np.random.choice(["Beijing", "Shanghai", "Guangzhou", "Shenzhen"], n),
    "education": np.random.choice(["HighSchool", "Bachelor", "Master", "PhD"], n),
    "credit_score": np.random.normal(650, 80, n),
    "is_default": (np.random.random(n) < 0.15).astype(int),
})
# Inject missing values
mask = np.random.random(n) < 0.05
df.loc[mask, "income"] = np.nan
mask2 = np.random.random(n) < 0.03
df.loc[mask2, "credit_score"] = np.nan
# Define feature groups
numeric_features = ["age", "income", "credit_score"]
categorical_features = ["city", "education"]
# Numeric feature pipeline
numeric_transformer = Pipeline([
("imputer", SimpleImputer(strategy="median")),
("scaler", RobustScaler()), # 对异常值鲁棒
])
# Categorical feature pipeline
categorical_transformer = Pipeline([
("imputer", SimpleImputer(strategy="most_frequent")),
("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])
# Combine the preprocessors
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
],
remainder="drop",
)
# Full pipeline
full_pipeline = Pipeline([
("preprocessor", preprocessor),
("classifier", RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")),
])
# Train and evaluate
X = df.drop("is_default", axis=1)
y = df["is_default"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
print("分类报告:")
print(classification_report(y_test, y_pred, target_names=["正常", "违约"]))
# Feature importances
importances = full_pipeline.named_steps["classifier"].feature_importances_
feature_names = numeric_features + categorical_features
for name, imp in sorted(zip(feature_names, importances), key=lambda x: -x[1]):
    print(f"  {name}: {imp:.4f}")
Model Persistence and Deployment
import joblib
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import json
import os
# Train a model
pipeline = Pipeline([
("scaler", StandardScaler()),
("model", RandomForestClassifier(n_estimators=100, random_state=42)),
])
X_train = np.random.randn(800, 10)
y_train = (X_train[:, 0] + X_train[:, 1] > 0).astype(int)
pipeline.fit(X_train, y_train)
# Save the model
joblib.dump(pipeline, "model_v1.joblib")
print(f"模型大小: {joblib.__version__}")
# Load the model and predict
loaded_model = joblib.load("model_v1.joblib")
X_new = np.random.randn(5, 10)
predictions = loaded_model.predict(X_new)
probabilities = loaded_model.predict_proba(X_new)
print(f"预测结果: {predictions}")
print(f"预测概率: {probabilities[:, 1]}")
# Model version metadata
model_info = {
"model_file": "model_v1.joblib",
"version": "1.0.0",
"trained_at": "2024-01-15",
"features": [f"feature_{i}" for i in range(10)],
"metrics": {"accuracy": 0.95, "f1": 0.93},
}
with open("model_metadata.json", "w") as f:
    json.dump(model_info, f, indent=2, ensure_ascii=False)
Custom Transformers and Evaluation Metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (
precision_recall_curve, roc_curve, auc,
confusion_matrix, precision_score, recall_score, f1_score,
)
import numpy as np
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom feature-engineering transformer."""
    def __init__(self, interaction_pairs=None):
        self.interaction_pairs = interaction_pairs or []
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = np.array(X, dtype=float)
        # Log transform (on absolute values, so it stays defined for negatives)
        X = np.hstack([X, np.log1p(np.abs(X))])
        # Pairwise interaction features (indices refer to the original columns)
        interactions = []
        for i, j in self.interaction_pairs:
            interactions.append((X[:, i] * X[:, j]).reshape(-1, 1))
        if interactions:
            X = np.hstack([X] + interactions)
        return X
# Use the custom transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
custom_pipeline = Pipeline([
("features", FeatureEngineer(interaction_pairs=[(0, 1), (1, 2)])),
("scaler", StandardScaler()),
("model", LogisticRegression(max_iter=1000)),
])
# Model evaluation helper
def detailed_evaluation(y_true, y_pred, y_prob=None):
    """Detailed classification evaluation."""
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 score: {f1_score(y_true, y_pred):.4f}")
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion matrix:")
    print(f"  TN={cm[0,0]:4d} FP={cm[0,1]:4d}")
    print(f"  FN={cm[1,0]:4d} TP={cm[1,1]:4d}")
    if y_prob is not None:
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)
        print(f"\nROC-AUC: {roc_auc:.4f}")
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        pr_auc = auc(recall, precision)
        print(f"PR-AUC: {pr_auc:.4f}")
# Example
np.random.seed(42)
y_true = np.random.randint(0, 2, 200)
y_prob = np.clip(y_true + np.random.normal(0, 0.3, 200), 0, 1)
y_pred = (y_prob > 0.5).astype(int)
detailed_evaluation(y_true, y_pred, y_prob)
Summary
scikit-learn is the tool of choice for classical machine learning: its uniform API and Pipeline mechanism make model development efficient and reproducible. For classification, regression, and clustering on tabular data, sklearn's classic algorithms (random forests, gradient boosting, SVMs) remain strong baselines. Mastering the four core skills of feature engineering, model selection, cross-validation, and hyperparameter tuning covers most business scenarios.
Key Takeaways
- All models follow the uniform fit(X, y) -> predict(X) interface
- Pipeline wraps preprocessing and the model into one atomic unit, preventing data leakage
- cross_val_score estimates generalization ability via cross-validation
- GridSearchCV and RandomizedSearchCV automate hyperparameter search (see the RandomizedSearchCV sketch after this list)
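GridSearchCV is demonstrated in the regression section above; as a complement, here is a minimal RandomizedSearchCV sketch. It assumes an X_train/y_train like the churn example, and the parameter ranges are illustrative, not tuned recommendations.
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions={
        "n_estimators": randint(100, 500),
        "max_depth": randint(3, 15),
        "max_features": uniform(0.3, 0.7),  # samples from [0.3, 1.0]
    },
    n_iter=20,  # sample 20 random combinations instead of the full grid
    cv=5,
    scoring="f1",
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print(search.best_params_, f"{search.best_score_:.3f}")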
Production Perspective
- Persist trained models with joblib or pickle and deploy them behind an API service
- Treat the Pipeline object as the deployment unit so preprocessing stays consistent
- Use BentoML or MLflow to manage model versions and deployment
- Retrain regularly on fresh data and monitor for prediction-performance decay (a decay-check sketch follows this list)
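A hypothetical sketch of the decay check mentioned in the last item: compare a recent window's F1 against the offline baseline. The function name, threshold, and baseline values are illustrative assumptions, not part of any library API.
from sklearn.metrics import f1_score

def check_model_decay(y_true_recent, y_pred_recent, baseline_f1, tolerance=0.05):
    """Flag retraining if recent F1 drops more than `tolerance` below baseline."""
    recent_f1 = f1_score(y_true_recent, y_pred_recent)
    decayed = recent_f1 < baseline_f1 - tolerance
    print(f"baseline={baseline_f1:.3f} recent={recent_f1:.3f} retrain={decayed}")
    return decayed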
Common Pitfalls
- Preprocessing the full dataset before cross-validation, causing data leakage (shown wrong-vs-right after this list)
- Skipping feature scaling, which degrades distance-sensitive algorithms (SVM, KNN)
- Looking only at accuracy and ignoring recall, which is misleading under class imbalance
- Over-tuning hyperparameters until the model overfits: high cross-validation scores but poor real-world results
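The first pitfall, shown wrong and right on synthetic data: scaling before cross-validation leaks test-fold statistics into training, while a Pipeline refits the scaler inside each training fold.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

X = np.random.randn(200, 5)
y = (X[:, 0] > 0).astype(int)
# Wrong: the scaler sees the full dataset, including future test folds
X_leaky = StandardScaler().fit_transform(X)
leaky_scores = cross_val_score(SVC(), X_leaky, y, cv=5)
# Right: the scaler is refit on each CV training fold only
pipe = Pipeline([("scaler", StandardScaler()), ("model", SVC())])
safe_scores = cross_val_score(pipe, X, y, cv=5)
print(f"leaky: {leaky_scores.mean():.3f}, safe: {safe_scores.mean():.3f}")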
Going Further
- Learn high-performance gradient boosting libraries such as XGBoost and LightGBM
- Explore automated feature engineering (Featuretools, AutoFeat)
- Try AutoML tools (auto-sklearn, FLAML)
- Study model interpretability (SHAP, LIME)
Typical Use Cases
- Classification and regression on tabular data (churn prediction, credit scoring, sales forecasting)
- Customer segmentation and anomaly detection (clustering, Isolation Forest; a minimal sketch follows this list)
- Text classification and simple NLP tasks
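A minimal Isolation Forest sketch for the anomaly-detection case, since it is not shown in the implementation above (synthetic data; the contamination value is an assumption):
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
normal = rng.normal(0, 1, (950, 2))     # dense cluster of normal points
outliers = rng.uniform(-6, 6, (50, 2))  # scattered anomalies
X = np.vstack([normal, outliers])
iso = IsolationForest(contamination=0.05, random_state=42)
flags = iso.fit_predict(X)  # -1 = anomaly, 1 = normal
print(f"flagged {(flags == -1).sum()} anomalies")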
Adoption Tips
- Always wrap preprocessing and the model in a Pipeline so training and inference stay consistent
- Define evaluation criteria around business metrics, not just model metrics
- Log experiment parameters and metrics with MLflow for comparison and traceability (see the logging sketch after this list)
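A hedged MLflow logging sketch for the last item (requires pip install mlflow; the run name and values are illustrative, and best_pipe is assumed to be a fitted Pipeline like the one in the regression section):
import mlflow
import mlflow.sklearn

with mlflow.start_run(run_name="churn-gbdt-v1"):
    mlflow.log_param("n_estimators", 100)         # hyperparameters used
    mlflow.log_metric("cv_f1", 0.93)              # offline evaluation metric
    mlflow.sklearn.log_model(best_pipe, "model")  # the whole Pipeline as the artifact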
Troubleshooting Checklist
- Check for data leakage (full-dataset feature engineering before the train/test split)
- Decide whether class imbalance needs handling (oversampling / undersampling / class weights; a weight-based sketch follows this list)
- Watch for overfitting (strong training performance but weak test performance)
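For the imbalance item, a weight-based sketch using sklearn's built-in support (resampling would instead use imbalanced-learn's SMOTE; the 9:1 ratio is illustrative):
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 900 + [1] * 100)  # 9:1 imbalance
weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=y)
print(f"class weights: {weights}")   # the minority class is up-weighted ~9x
# class_weight="balanced" applies the same reweighting inside the model
clf = LogisticRegression(class_weight="balanced", max_iter=1000)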
Retrospective Questions
- How large is the gap between your model's production performance and its offline evaluation, and what causes it?
- Do the feature importances match business intuition? Are there any unexpected features?
- How often do you retrain the model, and how do you detect performance decay?
