🏷️ 特征编码
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
# Toy frame with one nominal feature (color) and one ordinal feature (size).
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue', 'red'],
    'size': ['S', 'M', 'L', 'M', 'S'],
})

# Ordinal encoding (ordered categories).
# LabelEncoder is intended for target labels and numbers the classes
# alphabetically (L→0, M→1, S→2), which does not follow S < M < L.
# OrdinalEncoder takes the order explicitly, so the integer codes follow
# the real ranking of the sizes.
SIZE_ORDER = ['S', 'M', 'L']
size_encoder = OrdinalEncoder(categories=[SIZE_ORDER], dtype=int)
# The encoder works on 2-D input; ravel() flattens the single output column.
df['size_encoded'] = size_encoder.fit_transform(df[['size']]).ravel()
# S→0, M→1, L→2

# One-hot encoding (unordered categories) with pandas.
df_encoded = pd.get_dummies(df, columns=['color'])
# 'color' is replaced by color_blue, color_green, color_red

# The same one-hot encoding with scikit-learn.
# sparse_output=False returns a dense array instead of a sparse matrix.
ohe = OneHotEncoder(sparse_output=False)
color_encoded = ohe.fit_transform(df[['color']])
✂️ 数据分割
训练集/验证集/测试集划分
from sklearn.model_selection import train_test_split
import numpy as np
# Synthetic binary-classification data: 1000 samples, 10 features.
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)

# Hold out the test set first (20% of all samples).
# stratify=y keeps the class ratio of y the same in both parts.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then carve the validation set out of the remaining data.
# 0.25 of the remaining 80% equals 20% of the full data set; the second
# split is stratified on y_temp, the labels that belong to X_temp.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Final proportions: 60% train, 20% val, 20% test
print(f"训练集: {len(X_train)}")
print(f"验证集: {len(X_val)}")
print(f"测试集: {len(X_test)}")
⚠️
分层采样
分类问题中,应在调用 train_test_split 时传入 stratify 参数(第一次划分用 stratify=y,第二次划分用 stratify=y_temp),以确保训练集、验证集和测试集中各类别的比例与原始数据一致。