💻 基本Pipeline使用
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA(n_components=10)),
('classifier', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
score = pipeline.score(X_test, y_test)
pipeline.named_steps['scaler']
pipeline['classifier']
📊 ColumnTransformer处理异构数据
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
]
)
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
🔧 Pipeline与网格搜索
from sklearn.model_selection import GridSearchCV
param_grid = {
'pca__n_components': [5, 10, 20],
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [5, 10, None]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'最佳参数: {grid_search.best_params_}')