# Scikit-Learn Cheatsheet
Model pipeline building, preprocessing, evaluation, tuning and production-ready ML patterns.
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler,
LabelEncoder, OrdinalEncoder, OneHotEncoder,
LabelBinarizer, MultiLabelBinarizer,
)
# ── StandardScaler (zero mean, unit variance) ──
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Each feature: mean=0, std=1
# Formula: z = (x - μ) / σ
scaler.mean_ # per-feature means
scaler.scale_ # per-feature stds
scaler.inverse_transform(X_scaled) # back to original
# ── MinMaxScaler (scale to [0, 1]) ──
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
# Formula: x_norm = (x - x_min) / (x_max - x_min)
scaler = MinMaxScaler(feature_range=(-1, 1)) # custom range
# ── RobustScaler (uses median & IQR, robust to outliers) ──
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# Formula: x_norm = (x - median) / IQR
# ── MaxAbsScaler (scale to [-1, 1], preserves sparsity) ──
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_sparse)
# ── Normalization (scale samples, not features) ──
from sklearn.preprocessing import Normalizer
norm = Normalizer(norm='l2') # l1, l2, or max
X_norm = norm.fit_transform(X) # each row has unit norm
# ── LabelEncoder (target variable: y) ──
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# ['cat', 'dog', 'bird'] → [0, 1, 2]
y_original = le.inverse_transform(y_encoded)
le.classes_ # ['bird', 'cat', 'dog']
# ── OrdinalEncoder (features with ordered categories) ──
oe = OrdinalEncoder(categories=[['low', 'medium', 'high']])
X_encoded = oe.fit_transform(X[['size']])
# ── OneHotEncoder (categorical features) ──
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = ohe.fit_transform(X[['city', 'dept']])
# drop='first' avoids multicollinearity
# drop='if_binary' drops first only for binary categories
ohe.get_feature_names_out() # ['city_LA', 'dept_Sales', ...]
ohe.categories_ # list of category arrays
# ── LabelBinarizer (target: multi-class → one-vs-rest) ──
lb = LabelBinarizer()
y_bin = lb.fit_transform(y)
# 3 classes → 3-column binary matrix
# ── MultiLabelBinarizer ──
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform([['a', 'b'], ['a'], ['b', 'c']])
# [[1,1,0], [1,0,0], [0,1,1]]
# ── Discretization ──
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_binned = kbd.fit_transform(X)
# strategy: 'uniform', 'quantile', 'kmeans'
# ── Polynomial features ──
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
# [x1, x2] → [x1, x2, x1², x1*x2, x2²]
poly.get_feature_names_out()
| Scaler | Center | Scale By | Best For |
|---|---|---|---|
| StandardScaler | Mean | Std dev | Normal-like distributions |
| MinMaxScaler | Min | Range (max-min) | Bounded features |
| RobustScaler | Median | IQR | Outlier-heavy data |
| MaxAbsScaler | 0 | Max abs value | Sparse data |
| Normalizer | — | Row norm | Text/similarity data |
| Encoder | Input | Output | Use For |
|---|---|---|---|
| LabelEncoder | 1D y | 1D int | Target labels |
| OrdinalEncoder | X columns | X columns (int) | Ordered categories |
| OneHotEncoder | X columns | Binary columns | Nominal categories |
| LabelBinarizer | 1D y | Binary matrix | Multi-class target |
| TargetEncoder | X columns | Numeric columns | High-cardinality cats |
💡 Always fit preprocessors on training data only: use .fit_transform(X_train) and .transform(X_test) separately. Fitting on the full dataset causes data leakage and optimistic bias.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
# ── Logistic Regression ──
model = LogisticRegression(
penalty='l2', # 'l1', 'l2', 'elasticnet', None
C=1.0, # inverse regularization (higher = less reg)
solver='lbfgs', # 'lbfgs', 'liblinear', 'saga'
max_iter=1000,
class_weight='balanced', # auto-balance classes
# note: multi_class was deprecated in sklearn 1.5 and removed in 1.7 — omit it
random_state=42,
)
model.fit(X_train, y_train)
model.predict(X_test)
model.predict_proba(X_test) # probability per class
model.predict_log_proba(X_test)
model.coef_ # feature weights
model.intercept_
model.classes_
# ── Random Forest ──
model = RandomForestClassifier(
n_estimators=100, # number of trees
max_depth=None, # max tree depth
min_samples_split=2,
min_samples_leaf=1,
max_features='sqrt', # 'sqrt', 'log2', int, float
criterion='gini', # 'gini' or 'entropy'
bootstrap=True,
class_weight='balanced',
n_jobs=-1, # use all cores
random_state=42,
)
# ── SVM ──
model = SVC(
C=1.0,
kernel='rbf', # 'linear', 'poly', 'rbf', 'sigmoid'
gamma='scale', # 'scale', 'auto'
probability=True, # enable predict_proba
class_weight='balanced',
)
# For large datasets, use LinearSVC (faster)
from sklearn.svm import LinearSVC
model = LinearSVC(C=1.0, max_iter=5000)
# ── KNN ──
model = KNeighborsClassifier(
n_neighbors=5,
weights='uniform', # 'uniform' or 'distance'
metric='minkowski',
p=2, # 2=euclidean, 1=manhattan
n_jobs=-1,
)
# ── Gradient Boosting ──
model = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=3,
min_samples_split=2,
subsample=1.0,
random_state=42,
) # for production, consider XGBoost/LightGBM
# ── Complete classification workflow ──
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# 1. Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
# 2. Build pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression(random_state=42)),
])
# 3. Cross-validate
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print(f'CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}')
# 4. Fit and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1] # probability of positive class
# 5. Evaluate
print(classification_report(y_test, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_test, y_pred)
# Predicted
# Neg Pos
# Actual Neg [TN, FP]
# Actual Pos [FN, TP]
# 6. ROC curve
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = auc(fpr, tpr)
RocCurveDisplay.from_predictions(y_test, y_prob)
| Model | Pros | Cons | Best For |
|---|---|---|---|
| Logistic Reg | Fast, interpretable | Linear boundary | Baseline, linear data |
| Random Forest | Robust, no scaling | Overfitting risk | Tabular, non-linear |
| SVM | High dim, kernels | Slow on large data | Medium datasets |
| KNN | Simple, non-parametric | Slow prediction | Small datasets |
| Gradient Boost | High accuracy | Slow training | Competition, tabular |
| Naive Bayes | Very fast | Strong assumptions | Text, high-dim sparse |
| Param | Higher = | Effect |
|---|---|---|
| C (SVM/LR) | Less regularization | More complex model |
| n_estimators | More trees | Better but slower (RF/GB) |
| max_depth | Deeper trees | More complex, overfit risk |
| n_neighbors | More neighbors | Smoother, simpler (KNN) |
| learning_rate | Larger steps | Faster but less stable (GB) |
| min_samples_leaf | More restriction | Prevents overfitting |
💡 Always use stratify=y in train_test_split() for classification. It preserves class proportions across splits, preventing imbalanced train/test sets.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# ── Linear Regression ──
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_ # feature weights
model.intercept_ # bias term
model.score(X_test, y_test) # R² score
# ── Ridge (L2 regularization) ──
model = Ridge(alpha=1.0) # alpha = λ (higher = more regularization)
# Loss = MSE + α * Σ(w²)
# Shrinks coefficients toward 0, never exactly 0
# ── Lasso (L1 regularization) ──
model = Lasso(alpha=0.1)
# Loss = MSE + α * Σ|w|
# Can zero out coefficients → automatic feature selection
# ── ElasticNet (L1 + L2) ──
model = ElasticNet(
alpha=1.0,
l1_ratio=0.5, # 0=Ridge, 0.5=mix, 1=Lasso
)
# ── Polynomial Regression ──
model = make_pipeline(
PolynomialFeatures(degree=2, include_bias=False),
LinearRegression()
)
# ── Random Forest Regressor ──
model = RandomForestRegressor(
n_estimators=200,
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
n_jobs=-1,
random_state=42,
)
# ── Gradient Boosting Regressor ──
model = GradientBoostingRegressor(
n_estimators=200,
learning_rate=0.05,
max_depth=4,
subsample=0.8,
loss='squared_error', # 'squared_error', 'absolute_error', 'huber'
random_state=42,
)
# ── SVR ──
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
# epsilon: margin of tolerance (no penalty within tube)
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, root_mean_squared_error,
)
from sklearn.model_selection import cross_val_score
# ── Metrics ──
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred) # MSE
rmse = root_mean_squared_error(y_test, y_pred) # RMSE (squared= kwarg removed in sklearn 1.6)
mae = mean_absolute_error(y_test, y_pred) # MAE
r2 = r2_score(y_test, y_pred) # R²
mape = mean_absolute_percentage_error(y_test, y_pred) # MAPE
print(f'R²: {r2:.4f}') # 1.0 = perfect, 0 = mean, <0 = worse
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
# ── Cross-validation ──
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
cv_mse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f'CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}')
# ── Residual analysis ──
residuals = y_test - y_pred
import matplotlib.pyplot as plt
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
# ── Feature importance (tree-based models) ──
importance = model.feature_importances_
feat_imp = pd.Series(importance, index=X.columns).sort_values(ascending=False)
feat_imp.head(10).plot(kind='barh')| Type | Penalty | Sparsity | Use Case |
|---|---|---|---|
| None (OLS) | — | No | Many features, no collinearity |
| Ridge (L2) | α·Σw² | No | Collinear features |
| Lasso (L1) | α·Σ|w| | Yes | Feature selection |
| ElasticNet | α·(r·|w|+(1-r)·w²) | Partial | Correlated groups |
| Metric | Formula | Robust to Outliers? |
|---|---|---|
| MSE | Σ(y-ŷ)²/n | No (squared errors) |
| RMSE | √MSE | No |
| MAE | Σ|y-ŷ|/n | Yes |
| MAPE | Σ|y-ŷ|/y/n × 100 | No (relative) |
| R² | 1 - SS_res/SS_tot | Context-dependent |
| MedAE | median(|y-ŷ|) | Yes (robust) |
💡 For non-linear relationships, try GradientBoostingRegressor or RandomForestRegressor. Always check residual plots for patterns (heteroscedasticity, non-linearity).
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.cluster import Birch, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score
# ── K-Means ──
kmeans = KMeans(
n_clusters=3,
init='k-means++', # smart initialization
n_init=10, # number of initializations
max_iter=300,
algorithm='lloyd', # 'lloyd' or 'elkan'
random_state=42,
)
kmeans.fit(X)
labels = kmeans.labels_
kmeans.predict(X_new) # assign new points
kmeans.cluster_centers_ # center coordinates
kmeans.inertia_ # sum of squared distances (WCSS)
# ── Find optimal k with Elbow method ──
inertias = []
K = range(1, 11)
for k in K:
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)
inertias.append(km.inertia_)
# Plot inertias — look for "elbow"
# ── Silhouette analysis ──
from sklearn.metrics import silhouette_samples
sil_scores = []
for k in range(2, 11):
km = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = km.fit_predict(X)
sil_scores.append(silhouette_score(X, labels))
# ── DBSCAN (density-based) ──
dbscan = DBSCAN(
eps=0.5, # neighborhood radius
min_samples=5, # minimum points in neighborhood
metric='euclidean',
n_jobs=-1,
)
labels = dbscan.fit_predict(X)
# -1 = noise points
n_clusters = len(set(labels) - {-1}) # exclude noise
n_noise = (labels == -1).sum()
# ── Agglomerative (hierarchical) ──
agg = AgglomerativeClustering(
n_clusters=3,
metric='euclidean',
linkage='ward', # 'ward', 'complete', 'average', 'single'
)
labels = agg.fit_predict(X)
# ── Gaussian Mixture Model ──
gmm = GaussianMixture(
n_components=3,
covariance_type='full', # 'full', 'tied', 'diag', 'spherical'
max_iter=200,
n_init=3,
random_state=42,
)
gmm.fit(X)
labels = gmm.predict(X)
probs = gmm.predict_proba(X) # soft assignments
gmm.bic(X) # Bayesian Info Criterion
gmm.aic(X) # Akaike Info Criterion
| Algorithm | Shape | Needs k? | Handles Outliers? |
|---|---|---|---|
| KMeans | Spherical | Yes | No |
| DBSCAN | Arbitrary | No | Yes (-1 label) |
| Agglomerative | Flexible | Yes | No |
| GaussianMixture | Ellipsoidal | Yes (BIC) | Partial |
| MeanShift | Arbitrary | No | No |
| Spectral | Complex | Yes | No |
| Birch | Spherical | No (auto) | Partial |
| Metric | Range | Higher = |
|---|---|---|
| Silhouette | [-1, 1] | Better (1=perfect) |
| Calinski-Harabasz | [0, ∞) | Better (well-separated) |
| Davies-Bouldin | [0, ∞) | Worse (lower is better) |
| Inertia (WCSS) | [0, ∞) | Lower = tighter clusters |
| BIC / AIC | Any | Lower = better (GMM) |
💡 Use DBSCAN when you don't know the number of clusters and have noise/outliers. Use KMeans for fast, spherical clusters. Use GaussianMixture for soft assignments and ellipsoidal shapes.
from sklearn.model_selection import (
train_test_split,
cross_val_score, cross_validate,
StratifiedKFold, KFold, GroupKFold,
LeaveOneOut, RepeatedStratifiedKFold,
learning_curve, validation_curve,
)
# ── Train/test split ──
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2, # 20% test
train_size=None, # auto (80%)
random_state=42,
shuffle=True,
stratify=y, # preserve class proportions (classification)
)
# ── Cross-validation ──
# Basic k-fold CV
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f'CV: {scores.mean():.4f} ± {scores.std():.4f}')
# Detailed CV results
results = cross_validate(
model, X, y, cv=5,
scoring=['accuracy', 'f1_macro', 'roc_auc_ovr'],
return_train_score=True,
return_estimator=True,
)
# ── Cross-validation splitters ──
# Standard K-Fold
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Stratified K-Fold (classification — preserves class ratio)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Group K-Fold (same group never in train AND test)
cv = GroupKFold(n_splits=5)
# Repeated CV (more stable estimates)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
# Leave-One-Out
cv = LeaveOneOut()
# ── Learning curves ──
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
model, X, y, cv=5,
train_sizes=np.linspace(0.1, 1.0, 10),
scoring='accuracy',
n_jobs=-1,
)
# Plot: train_scores vs test_scores per train_size
# High bias → both low, converge
# High variance → train high, test low, gap large
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
# ── GridSearchCV (exhaustive) ──
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
grid = GridSearchCV(
estimator=RandomForestClassifier(random_state=42),
param_grid=param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1,
refit=True, # refit on full train with best params
return_train_score=True,
)
grid.fit(X_train, y_train)
grid.best_params_ # best hyperparameter combo
grid.best_score_ # best CV score
grid.best_estimator_ # refitted model
grid.cv_results_ # full results dict
# ── RandomizedSearchCV (random sampling — faster) ──
param_dist = {
'n_estimators': randint(50, 500),
'max_depth': randint(3, 20),
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': uniform(0.1, 0.9),
}
rand_search = RandomizedSearchCV(  # renamed: 'random' would shadow the stdlib module
estimator=RandomForestClassifier(random_state=42),
param_distributions=param_dist,
n_iter=100, # number of random combinations
cv=5,
scoring='accuracy',
n_jobs=-1,
random_state=42,
verbose=1,
)
rand_search.fit(X_train, y_train)
rand_search.best_params_
# ── HalvingGridSearchCV (successive halving — efficient) ──
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
halving = HalvingGridSearchCV(
estimator=model,
param_grid=param_grid,
factor=3, # eliminate 1/3 candidates each round
cv=5,
scoring='accuracy',
random_state=42,
)
halving.fit(X_train, y_train)
| Scoring | Task | Direction |
|---|---|---|
| accuracy | Classification | Higher = better |
| f1 | Classification | Higher = better |
| f1_macro | Multi-class | Higher = better |
| precision | Classification | Higher = better |
| recall | Classification | Higher = better |
| roc_auc | Binary classif. | Higher = better |
| r2 | Regression | Higher = better |
| neg_mean_squared_error | Regression | Higher (less neg) |
| neg_mean_absolute_error | Regression | Higher (less neg) |
| neg_root_mean_squared_error | Regression | Higher (less neg) |
| Method | Speed | Coverage | Best For |
|---|---|---|---|
| GridSearchCV | Slow | Exhaustive | Small param space |
| RandomizedSearchCV | Fast | Random sample | Large param space |
| HalvingGridSearchCV | Medium | Iterative pruning | Many candidates |
| Optuna | Medium | Bayesian | Complex spaces |
from sklearn.feature_selection import (
SelectKBest, SelectFromModel, RFE, RFECV,
f_classif, f_regression, mutual_info_classif,
chi2, VarianceThreshold,
)
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
# ── VarianceThreshold (remove low-variance features) ──
selector = VarianceThreshold(threshold=0.01)
X_sel = selector.fit_transform(X)
# Removes features with variance below threshold
# ── SelectKBest (univariate, pick top k by score) ──
# For classification
selector = SelectKBest(f_classif, k=10) # ANOVA F-value
selector = SelectKBest(chi2, k=10) # Chi-squared (non-neg)
selector = SelectKBest(mutual_info_classif, k=10) # mutual information
# For regression
selector = SelectKBest(f_regression, k=10)
X_sel = selector.fit_transform(X, y)
selector.scores_ # scores per feature
selector.pvalues_ # p-values per feature
selector.get_feature_names_out()
# ── SelectFromModel (use model importance) ──
from sklearn.ensemble import RandomForestClassifier
selector = SelectFromModel(
RandomForestClassifier(n_estimators=100, random_state=42),
threshold='median', # or 'mean', or float
max_features=20,
)
X_sel = selector.fit_transform(X, y)
selector.get_support() # boolean mask of selected
# ── Recursive Feature Elimination ──
rfe = RFE(
estimator=LogisticRegression(max_iter=1000),
n_features_to_select=10,
step=1, # remove 1 feature per iteration
)
X_sel = rfe.fit_transform(X, y)
rfe.ranking_ # 1 = selected, higher = eliminated earlier
rfe.support_ # boolean mask
# ── RFECV (RFE with CV for optimal k) ──
rfecv = RFECV(
estimator=LogisticRegression(max_iter=1000),
cv=5,
scoring='accuracy',
step=1,
min_features_to_select=5,
)
rfecv.fit(X, y)
rfecv.n_features_ # optimal number of features
rfecv.cv_results_ # scores for each k
# ── PCA (Principal Component Analysis) ──
pca = PCA(
n_components=0.95, # keep 95% variance (auto-selects k)
# OR: n_components=10 # explicitly set
# OR: n_components='mle' # MLE estimate
whiten=False, # scale components to unit variance
random_state=42,
)
X_pca = pca.fit_transform(X)
pca.explained_variance_ratio_ # variance explained per component
pca.explained_variance_ratio_.sum() # total variance captured
pca.n_components_ # number of components
pca.components_ # principal axes (eigenvectors)
# ── PCA for visualization (2D) ──
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='viridis')
# ── Incremental PCA (for large datasets) ──
from sklearn.decomposition import IncrementalPCA
ipca = IncrementalPCA(n_components=50, batch_size=1000)
X_ipca = ipca.fit_transform(X)
# ── TruncatedSVD (for sparse data, works with scipy sparse) ──
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_sparse)
svd.explained_variance_ratio_
# ── t-SNE (visualization only, not for preprocessing) ──
tsne = TSNE(
n_components=2,
perplexity=30, # typical range: 5-50
learning_rate='auto',
max_iter=1000, # renamed from n_iter in sklearn 1.5
random_state=42,
)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis')
plt.title('t-SNE Visualization')
| Method | Type | Model-Free? | Speed |
|---|---|---|---|
| VarianceThreshold | Filter | Yes | Very fast |
| SelectKBest | Filter | Yes | Fast |
| mutual_info_classif | Filter | Yes | Medium |
| SelectFromModel | Embedded | No | Fast |
| RFE | Wrapper | No | Slow |
| RFECV | Wrapper | No | Very slow |
| Lasso (L1) | Embedded | No | Fast |
| Method | Type | Sparse? | Use Case |
|---|---|---|---|
| PCA | Linear | No | Dense features, noise reduction |
| TruncatedSVD | Linear | Yes | Sparse (text/TF-IDF) |
| IncrementalPCA | Linear | No | Large datasets (streaming) |
| t-SNE | Non-linear | No | 2D/3D visualization only |
| UMAP | Non-linear | No | Vis + general dim reduction |
| LDA | Linear (supervised) | No | Classification, n_classes-1 |
💡 Fit dimensionality reduction on training data only, then use transform() for new data. Use PCA for preprocessing; use t-SNE only for visualization.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# ── Basic Pipeline ──
pipe = Pipeline([
('scaler', StandardScaler()),
('model', LogisticRegression(random_state=42)),
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
pipe.score(X_test, y_test)
# ── make_pipeline (auto-names steps) ──
pipe = make_pipeline(
StandardScaler(),
LogisticRegression(random_state=42),
)
# Steps named: 'standardscaler', 'logisticregression'
# ── ColumnTransformer (different transforms per column type) ──
numeric_features = ['age', 'salary', 'years_experience']
categorical_features = ['department', 'city', 'education_level']
numeric_transformer = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
],
remainder='drop', # 'drop' or 'passthrough'
verbose_feature_names_out=False,
)
# ── Full pipeline with preprocessing + model ──
full_pipe = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
full_pipe.fit(X_train, y_train)
y_pred = full_pipe.predict(X_test)
# ── Access pipeline steps ──
pipe.named_steps['scaler'] # access a step
pipe.named_steps['model'].coef_ # model coefficients
pipe[:-1] # all steps except last
pipe[-1] # last step only
# ── Set parameters (nested with __) ──
pipe.set_params(classifier__n_estimators=200)
pipe.get_params() # all params with __ notation
# ── GridSearchCV with pipeline ──
param_grid = {
'preprocessor__num__scaler': [StandardScaler(), RobustScaler(), 'passthrough'],
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [3, 5, None],
'classifier__min_samples_split': [2, 5, 10],
}
grid = GridSearchCV(full_pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
# ── Feature names from pipeline ──
feature_names = full_pipe.named_steps['preprocessor'].get_feature_names_out()
# ── Caching steps (speed up GridSearch) ──
from tempfile import mkdtemp
cachedir = mkdtemp()
pipe = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(random_state=42)),
], memory=cachedir) # caches fitted transformers
# ── Custom transformer ──
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureRatioAdder(BaseEstimator, TransformerMixin):
    """Append a ratio feature ``col1 / col2`` to a DataFrame.

    Rows whose denominator is 0 yield NaN instead of raising a
    division error. Follows sklearn conventions: ``__init__`` stores
    its parameters verbatim and ``fit`` is stateless.
    """

    def __init__(self, col1, col2, name='ratio'):
        self.col1 = col1
        self.col2 = col2
        self.name = name

    def fit(self, X, y=None):
        # Nothing to learn — returning self keeps pipeline chaining intact.
        return self

    def transform(self, X):
        out = X.copy()  # never mutate the caller's frame
        denominator = out[self.col2].replace(0, np.nan)  # 0 → NaN avoids div-by-zero
        out[self.name] = out[self.col1] / denominator
        return out
# Use in pipeline
pipe = Pipeline([
('ratio', FeatureRatioAdder('height', 'weight', 'bmi')),
('scaler', StandardScaler()),
('model', LogisticRegression()),
])
| Benefit | Description |
|---|---|
| No data leakage | Fit on train, transform test automatically |
| Cleaner code | Single object for fit/predict |
| GridSearch | Search over ALL params (preprocessing + model) |
| Reproducibility | One object to pickle/deploy |
| Safety | Can't accidentally fit on test data |
| Pattern | Code | Use Case |
|---|---|---|
| Named columns | Transformer(cols=['a','b']) | Select specific columns |
| Type-based | Transformer(StandardScaler()) | All numeric columns |
| Selector | make_column_selector(dtype_include=number) | Auto-select by type |
| Passthrough | remainder='passthrough' | Keep unprocessed columns |
| Drop | remainder='drop' | Drop unprocessed columns |
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
classification_report, confusion_matrix,
roc_auc_score, average_precision_score,
roc_curve, precision_recall_curve,
ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay,
cohen_kappa_score, matthews_corrcoef,
balanced_accuracy_score,
)
# ── Basic metrics ──
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='binary')
rec = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')
# ── Multi-class metrics (average parameter) ──
# average='micro': globally (counts all TP, FP, FN)
# average='macro': per-class unweighted mean
# average='weighted': per-class weighted by support
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
prec_macro = precision_score(y_test, y_pred, average='macro')
# ── Classification report ──
print(classification_report(
y_test, y_pred,
target_names=['Class 0', 'Class 1'],
digits=4,
output_dict=False, # True for dict output
))
# precision recall f1-score support
# Class 0 0.95 0.93 0.94 150
# Class 1 0.91 0.94 0.92 100
# ── Confusion Matrix ──
cm = confusion_matrix(y_test, y_pred, normalize='true') # 'true', 'pred', 'all'
ConfusionMatrixDisplay(cm, display_labels=['Neg', 'Pos']).plot()
# ── ROC Curve & AUC ──
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
RocCurveDisplay.from_predictions(y_test, y_prob)
# ── Precision-Recall Curve ──
prec_vals, rec_vals, pr_thresholds = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)
PrecisionRecallDisplay.from_predictions(y_test, y_prob)
# ── Threshold tuning ──
from sklearn.metrics import fbeta_score
f2 = fbeta_score(y_test, (y_prob > 0.3).astype(int), beta=2) # emphasize recall
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, median_absolute_error,
max_error, explained_variance_score, root_mean_squared_error,
)
import matplotlib.pyplot as plt
# ── Regression metrics ──
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred) # squared= kwarg removed in sklearn 1.6
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
me = max_error(y_test, y_pred) # worst single prediction
medae = median_absolute_error(y_test, y_pred)
# ── Prediction error plot ──
from sklearn.metrics import PredictionErrorDisplay
PredictionErrorDisplay.from_predictions(
y_test, y_pred, kind='actual_vs_predicted'
)
PredictionErrorDisplay.from_predictions(
y_test, y_pred, kind='residual_vs_predicted'
)
# ── Cross-validated metrics ──
from sklearn.model_selection import cross_val_score, cross_validate
scoring = {
'r2': 'r2',
'neg_mse': 'neg_mean_squared_error',
'neg_mae': 'neg_mean_absolute_error',
}
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
print(f'CV MSE: {-cv_results["test_neg_mse"].mean():.4f}')
print(f'CV MAE: {-cv_results["test_neg_mae"].mean():.4f}')
# ── Custom scoring function ──
from sklearn.metrics import make_scorer
def custom_metric(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred) / y_true)
scorer = make_scorer(custom_metric, greater_is_better=False)
scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
# ── Dummy baseline (always compare to this!) ──
from sklearn.dummy import DummyClassifier, DummyRegressor
dummy_clf = DummyClassifier(strategy='most_frequent') # or 'stratified'
dummy_reg = DummyRegressor(strategy='mean') # or 'median'
dummy_clf.fit(X_train, y_train).score(X_test, y_test)
dummy_reg.fit(X_train, y_train).score(X_test, y_test)
| Metric | Focus | When to Use |
|---|---|---|
| Accuracy | Overall correctness | Balanced classes |
| Precision | Minimize false positives | Spam detection |
| Recall | Minimize false negatives | Disease screening |
| F1 | Balance P & R | Imbalanced classes |
| Fβ (β>1) | Weighted toward recall | Medical diagnosis |
| ROC AUC | Ranking quality | Threshold-independent |
| Avg Precision | PR curve area | Imbalanced (better than AUC) |
| Cohen's Kappa | Agreement beyond chance | Inter-rater |
💡 Always compare against a baseline: DummyClassifier(strategy='stratified') or DummyRegressor(strategy='mean') to verify your model actually learns. A high accuracy on imbalanced data might just mean it always predicts the majority class.