# Scikit-Learn Cheatsheet
Model pipeline building, preprocessing, evaluation, tuning and production-ready ML patterns.
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler,
LabelEncoder, OrdinalEncoder, OneHotEncoder,
LabelBinarizer, MultiLabelBinarizer,
)
# ── StandardScaler (zero mean, unit variance) ──
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Each feature: mean=0, std=1
# Formula: z = (x - μ) / σ
scaler.mean_ # per-feature means
scaler.scale_ # per-feature stds
scaler.inverse_transform(X_scaled) # back to original
# ── MinMaxScaler (scale to [0, 1]) ──
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
# Formula: x_norm = (x - x_min) / (x_max - x_min)
scaler = MinMaxScaler(feature_range=(-1, 1)) # custom range
# ── RobustScaler (uses median & IQR, robust to outliers) ──
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# Formula: x_norm = (x - median) / IQR
# ── MaxAbsScaler (scale to [-1, 1], preserves sparsity) ──
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_sparse)
# ── Normalization (scale samples, not features) ──
from sklearn.preprocessing import Normalizer
norm = Normalizer(norm='l2') # l1, l2, or max
X_norm = norm.fit_transform(X) # each row has unit norm
# ── LabelEncoder (target variable: y) ──
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# ['cat', 'dog', 'bird'] → [0, 1, 2]
y_original = le.inverse_transform(y_encoded)
le.classes_ # ['bird', 'cat', 'dog']
# ── OrdinalEncoder (features with ordered categories) ──
oe = OrdinalEncoder(categories=[['low', 'medium', 'high']])
X_encoded = oe.fit_transform(X[['size']])
# ── OneHotEncoder (categorical features) ──
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = ohe.fit_transform(X[['city', 'dept']])
# drop='first' avoids multicollinearity
# drop='if_binary' drops first only for binary categories
ohe.get_feature_names_out() # ['city_LA', 'dept_Sales', ...]
ohe.categories_ # list of category arrays
# ── LabelBinarizer (target: multi-class → one-vs-rest) ──
lb = LabelBinarizer()
y_bin = lb.fit_transform(y)
# 3 classes → 3-column binary matrix
# ── MultiLabelBinarizer ──
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform([['a', 'b'], ['a'], ['b', 'c']])
# [[1,1,0], [1,0,0], [0,1,1]]
# ── Discretization ──
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_binned = kbd.fit_transform(X)
# strategy: 'uniform', 'quantile', 'kmeans'
# ── Polynomial features ──
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
# [x1, x2] → [x1, x2, x1², x1*x2, x2²]
poly.get_feature_names_out()
| Scaler | Center | Scale By | Best For |
|---|---|---|---|
| StandardScaler | Mean | Std dev | Normal-like distributions |
| MinMaxScaler | Min | Range (max-min) | Bounded features |
| RobustScaler | Median | IQR | Outlier-heavy data |
| MaxAbsScaler | 0 | Max abs value | Sparse data |
| Normalizer | — | Row norm | Text/similarity data |
| Encoder | Input | Output | Use For |
|---|---|---|---|
| LabelEncoder | 1D y | 1D int | Target labels |
| OrdinalEncoder | X columns | X columns (int) | Ordered categories |
| OneHotEncoder | X columns | Binary columns | Nominal categories |
| LabelBinarizer | 1D y | Binary matrix | Multi-class target |
| TargetEncoder | X columns | Numeric columns | High-cardinality cats |
💡 Always fit preprocessors on training data only: use .fit_transform(X_train) and .transform(X_test) separately. Fitting on the full dataset causes data leakage and optimistic bias.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
# ── Logistic Regression ──
model = LogisticRegression(
penalty='l2', # 'l1', 'l2', 'elasticnet', None
C=1.0, # inverse regularization (higher = less reg)
solver='lbfgs', # 'lbfgs', 'liblinear', 'saga'
max_iter=1000,
class_weight='balanced', # auto-balance classes
# note: multi_class was deprecated in sklearn 1.5 and removed in 1.7 — omit it
random_state=42,
)
model.fit(X_train, y_train)
model.predict(X_test)
model.predict_proba(X_test) # probability per class
model.predict_log_proba(X_test)
model.coef_ # feature weights
model.intercept_
model.classes_
# ── Random Forest ──
model = RandomForestClassifier(
n_estimators=100, # number of trees
max_depth=None, # max tree depth
min_samples_split=2,
min_samples_leaf=1,
max_features='sqrt', # 'sqrt', 'log2', int, float
criterion='gini', # 'gini' or 'entropy'
bootstrap=True,
class_weight='balanced',
n_jobs=-1, # use all cores
random_state=42,
)
# ── SVM ──
model = SVC(
C=1.0,
kernel='rbf', # 'linear', 'poly', 'rbf', 'sigmoid'
gamma='scale', # 'scale', 'auto'
probability=True, # enable predict_proba
class_weight='balanced',
)
# For large datasets, use LinearSVC (faster)
from sklearn.svm import LinearSVC
model = LinearSVC(C=1.0, max_iter=5000)
# ── KNN ──
model = KNeighborsClassifier(
n_neighbors=5,
weights='uniform', # 'uniform' or 'distance'
metric='minkowski',
p=2, # 2=euclidean, 1=manhattan
n_jobs=-1,
)
# ── Gradient Boosting ──
model = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=3,
min_samples_split=2,
subsample=1.0,
random_state=42,
) # for production, consider XGBoost/LightGBM
# ── Complete classification workflow ──
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# 1. Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
# 2. Build pipeline
pipe = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression(random_state=42)),
])
# 3. Cross-validate
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print(f'CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}')
# 4. Fit and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1] # probability of positive class
# 5. Evaluate
print(classification_report(y_test, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_test, y_pred)
# Predicted
# Neg Pos
# Actual Neg [TN, FP]
# Actual Pos [FN, TP]
# 6. ROC curve
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = auc(fpr, tpr)
RocCurveDisplay.from_predictions(y_test, y_prob)
| Model | Pros | Cons | Best For |
|---|---|---|---|
| Logistic Reg | Fast, interpretable | Linear boundary | Baseline, linear data |
| Random Forest | Robust, no scaling | Overfitting risk | Tabular, non-linear |
| SVM | High dim, kernels | Slow on large data | Medium datasets |
| KNN | Simple, non-parametric | Slow prediction | Small datasets |
| Gradient Boost | High accuracy | Slow training | Competition, tabular |
| Naive Bayes | Very fast | Strong assumptions | Text, high-dim sparse |
| Param | Higher = | Effect |
|---|---|---|
| C (SVM/LR) | Less regularization | More complex model |
| n_estimators | More trees | Better but slower (RF/GB) |
| max_depth | Deeper trees | More complex, overfit risk |
| n_neighbors | More neighbors | Smoother, simpler (KNN) |
| learning_rate | Larger steps | Faster but less stable (GB) |
| min_samples_leaf | More restriction | Prevents overfitting |
💡 Always use stratify=y in train_test_split() for classification. It preserves class proportions across splits, preventing imbalanced train/test sets.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# ── Linear Regression ──
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_ # feature weights
model.intercept_ # bias term
model.score(X_test, y_test) # R² score
# ── Ridge (L2 regularization) ──
model = Ridge(alpha=1.0) # alpha = λ (higher = more regularization)
# Loss = MSE + α * Σ(w²)
# Shrinks coefficients toward 0, never exactly 0
# ── Lasso (L1 regularization) ──
model = Lasso(alpha=0.1)
# Loss = MSE + α * Σ|w|
# Can zero out coefficients → automatic feature selection
# ── ElasticNet (L1 + L2) ──
model = ElasticNet(
alpha=1.0,
l1_ratio=0.5, # 0=Ridge, 0.5=mix, 1=Lasso
)
# ── Polynomial Regression ──
model = make_pipeline(
PolynomialFeatures(degree=2, include_bias=False),
LinearRegression()
)
# ── Random Forest Regressor ──
model = RandomForestRegressor(
n_estimators=200,
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
n_jobs=-1,
random_state=42,
)
# ── Gradient Boosting Regressor ──
model = GradientBoostingRegressor(
n_estimators=200,
learning_rate=0.05,
max_depth=4,
subsample=0.8,
loss='squared_error', # 'squared_error', 'absolute_error', 'huber'
random_state=42,
)
# ── SVR ──
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
# epsilon: margin of tolerance (no penalty within tube)
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, root_mean_squared_error,
)
from sklearn.model_selection import cross_val_score
# ── Metrics ──
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred) # MSE
rmse = root_mean_squared_error(y_test, y_pred) # RMSE (squared= kwarg removed in sklearn 1.6)
mae = mean_absolute_error(y_test, y_pred) # MAE
r2 = r2_score(y_test, y_pred) # R²
mape = mean_absolute_percentage_error(y_test, y_pred) # MAPE
print(f'R²: {r2:.4f}') # 1.0 = perfect, 0 = mean, <0 = worse
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
# ── Cross-validation ──
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
cv_mse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f'CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}')
# ── Residual analysis ──
residuals = y_test - y_pred
import matplotlib.pyplot as plt
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
# ── Feature importance (tree-based models) ──
importance = model.feature_importances_
feat_imp = pd.Series(importance, index=X.columns).sort_values(ascending=False)
feat_imp.head(10).plot(kind='barh')| Type | Penalty | Sparsity | Use Case |
|---|---|---|---|
| None (OLS) | — | No | Many features, no collinearity |
| Ridge (L2) | α·Σw² | No | Collinear features |
| Lasso (L1) | α·Σ|w| | Yes | Feature selection |
| ElasticNet | α·(r·|w|+(1-r)·w²) | Partial | Correlated groups |
| Metric | Formula | Robust to Outliers? |
|---|---|---|
| MSE | Σ(y-ŷ)²/n | No (squared errors) |
| RMSE | √MSE | No |
| MAE | Σ|y-ŷ|/n | Yes |
| MAPE | Σ|y-ŷ|/y/n × 100 | No (relative) |
| R² | 1 - SS_res/SS_tot | Context-dependent |
| MedAE | median(|y-ŷ|) | Yes (robust) |
💡 For non-linear relationships, try GradientBoostingRegressor or RandomForestRegressor. Always check residual plots for patterns (heteroscedasticity, non-linearity).
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.cluster import Birch, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score
# ── K-Means ──
kmeans = KMeans(
n_clusters=3,
init='k-means++', # smart initialization
n_init=10, # number of initializations
max_iter=300,
algorithm='lloyd', # 'lloyd' or 'elkan'
random_state=42,
)
kmeans.fit(X)
labels = kmeans.labels_
kmeans.predict(X_new) # assign new points
kmeans.cluster_centers_ # center coordinates
kmeans.inertia_ # sum of squared distances (WCSS)
# ── Find optimal k with Elbow method ──
inertias = []
K = range(1, 11)
for k in K:
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)
inertias.append(km.inertia_)
# Plot inertias — look for "elbow"
# ── Silhouette analysis ──
from sklearn.metrics import silhouette_samples
sil_scores = []
for k in range(2, 11):
km = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = km.fit_predict(X)
sil_scores.append(silhouette_score(X, labels))
# ── DBSCAN (density-based) ──
dbscan = DBSCAN(
eps=0.5, # neighborhood radius
min_samples=5, # minimum points in neighborhood
metric='euclidean',
n_jobs=-1,
)
labels = dbscan.fit_predict(X)
# -1 = noise points
n_clusters = len(set(labels) - {-1}) # exclude noise
n_noise = (labels == -1).sum()
# ── Agglomerative (hierarchical) ──
agg = AgglomerativeClustering(
n_clusters=3,
metric='euclidean',
linkage='ward', # 'ward', 'complete', 'average', 'single'
)
labels = agg.fit_predict(X)
# ── Gaussian Mixture Model ──
gmm = GaussianMixture(
n_components=3,
covariance_type='full', # 'full', 'tied', 'diag', 'spherical'
max_iter=200,
n_init=3,
random_state=42,
)
gmm.fit(X)
labels = gmm.predict(X)
probs = gmm.predict_proba(X) # soft assignments
gmm.bic(X) # Bayesian Info Criterion
gmm.aic(X) # Akaike Info Criterion
| Algorithm | Shape | Needs k? | Handles Outliers? |
|---|---|---|---|
| KMeans | Spherical | Yes | No |
| DBSCAN | Arbitrary | No | Yes (-1 label) |
| Agglomerative | Flexible | Yes | No |
| GaussianMixture | Ellipsoidal | Yes (BIC) | Partial |
| MeanShift | Arbitrary | No | No |
| Spectral | Complex | Yes | No |
| Birch | Spherical | No (auto) | Partial |
| Metric | Range | Higher = |
|---|---|---|
| Silhouette | [-1, 1] | Better (1=perfect) |
| Calinski-Harabasz | [0, ∞) | Better (well-separated) |
| Davies-Bouldin | [0, ∞) | Worse (lower is better) |
| Inertia (WCSS) | [0, ∞) | Lower = tighter clusters |
| BIC / AIC | Any | Lower = better (GMM) |
💡 Use DBSCAN when you don't know the number of clusters and have noise/outliers. Use KMeans for fast, spherical clusters. Use GaussianMixture for soft assignments and ellipsoidal shapes.
from sklearn.model_selection import (
train_test_split,
cross_val_score, cross_validate,
StratifiedKFold, KFold, GroupKFold,
LeaveOneOut, RepeatedStratifiedKFold,
learning_curve, validation_curve,
)
# ── Train/test split ──
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2, # 20% test
train_size=None, # auto (80%)
random_state=42,
shuffle=True,
stratify=y, # preserve class proportions (classification)
)
# ── Cross-validation ──
# Basic k-fold CV
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f'CV: {scores.mean():.4f} ± {scores.std():.4f}')
# Detailed CV results
results = cross_validate(
model, X, y, cv=5,
scoring=['accuracy', 'f1_macro', 'roc_auc_ovr'],
return_train_score=True,
return_estimator=True,
)
# ── Cross-validation splitters ──
# Standard K-Fold
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Stratified K-Fold (classification — preserves class ratio)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Group K-Fold (same group never in train AND test)
cv = GroupKFold(n_splits=5)
# Repeated CV (more stable estimates)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
# Leave-One-Out
cv = LeaveOneOut()
# ── Learning curves ──
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
model, X, y, cv=5,
train_sizes=np.linspace(0.1, 1.0, 10),
scoring='accuracy',
n_jobs=-1,
)
# Plot: train_scores vs test_scores per train_size
# High bias → both low, converge
# High variance → train high, test low, gap large
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
# ── GridSearchCV (exhaustive) ──
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
grid = GridSearchCV(
estimator=RandomForestClassifier(random_state=42),
param_grid=param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1,
refit=True, # refit on full train with best params
return_train_score=True,
)
grid.fit(X_train, y_train)
grid.best_params_ # best hyperparameter combo
grid.best_score_ # best CV score
grid.best_estimator_ # refitted model
grid.cv_results_ # full results dict
# ── RandomizedSearchCV (random sampling — faster) ──
param_dist = {
'n_estimators': randint(50, 500),
'max_depth': randint(3, 20),
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': uniform(0.1, 0.9),
}
rand_search = RandomizedSearchCV(  # renamed: 'random' would shadow the stdlib module
estimator=RandomForestClassifier(random_state=42),
param_distributions=param_dist,
n_iter=100, # number of random combinations
cv=5,
scoring='accuracy',
n_jobs=-1,
random_state=42,
verbose=1,
)
rand_search.fit(X_train, y_train)
rand_search.best_params_
# ── HalvingGridSearchCV (successive halving — efficient) ──
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
halving = HalvingGridSearchCV(
estimator=model,
param_grid=param_grid,
factor=3, # eliminate 1/3 candidates each round
cv=5,
scoring='accuracy',
random_state=42,
)
halving.fit(X_train, y_train)
| Scoring | Task | Direction |
|---|---|---|
| accuracy | Classification | Higher = better |
| f1 | Classification | Higher = better |
| f1_macro | Multi-class | Higher = better |
| precision | Classification | Higher = better |
| recall | Classification | Higher = better |
| roc_auc | Binary classif. | Higher = better |
| r2 | Regression | Higher = better |
| neg_mean_squared_error | Regression | Higher (less neg) |
| neg_mean_absolute_error | Regression | Higher (less neg) |
| neg_root_mean_squared_error | Regression | Higher (less neg) |
| Method | Speed | Coverage | Best For |
|---|---|---|---|
| GridSearchCV | Slow | Exhaustive | Small param space |
| RandomizedSearchCV | Fast | Random sample | Large param space |
| HalvingGridSearchCV | Medium | Iterative pruning | Many candidates |
| Optuna | Medium | Bayesian | Complex spaces |
from sklearn.feature_selection import (
SelectKBest, SelectFromModel, RFE, RFECV,
f_classif, f_regression, mutual_info_classif,
chi2, VarianceThreshold,
)
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
# ── VarianceThreshold (remove low-variance features) ──
selector = VarianceThreshold(threshold=0.01)
X_sel = selector.fit_transform(X)
# Removes features with variance below threshold
# ── SelectKBest (univariate, pick top k by score) ──
# For classification
selector = SelectKBest(f_classif, k=10) # ANOVA F-value
selector = SelectKBest(chi2, k=10) # Chi-squared (non-neg)
selector = SelectKBest(mutual_info_classif, k=10) # mutual information
# For regression
selector = SelectKBest(f_regression, k=10)
X_sel = selector.fit_transform(X, y)
selector.scores_ # scores per feature
selector.pvalues_ # p-values per feature
selector.get_feature_names_out()
# ── SelectFromModel (use model importance) ──
from sklearn.ensemble import RandomForestClassifier
selector = SelectFromModel(
RandomForestClassifier(n_estimators=100, random_state=42),
threshold='median', # or 'mean', or float
max_features=20,
)
X_sel = selector.fit_transform(X, y)
selector.get_support() # boolean mask of selected
# ── Recursive Feature Elimination ──
rfe = RFE(
estimator=LogisticRegression(max_iter=1000),
n_features_to_select=10,
step=1, # remove 1 feature per iteration
)
X_sel = rfe.fit_transform(X, y)
rfe.ranking_ # 1 = selected, higher = eliminated earlier
rfe.support_ # boolean mask
# ── RFECV (RFE with CV for optimal k) ──
rfecv = RFECV(
estimator=LogisticRegression(max_iter=1000),
cv=5,
scoring='accuracy',
step=1,
min_features_to_select=5,
)
rfecv.fit(X, y)
rfecv.n_features_ # optimal number of features
rfecv.cv_results_ # scores for each k
# ── PCA (Principal Component Analysis) ──
pca = PCA(
n_components=0.95, # keep 95% variance (auto-selects k)
# OR: n_components=10 # explicitly set
# OR: n_components='mle' # MLE estimate
whiten=False, # scale components to unit variance
random_state=42,
)
X_pca = pca.fit_transform(X)
pca.explained_variance_ratio_ # variance explained per component
pca.explained_variance_ratio_.sum() # total variance captured
pca.n_components_ # number of components
pca.components_ # principal axes (eigenvectors)
# ── PCA for visualization (2D) ──
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='viridis')
# ── Incremental PCA (for large datasets) ──
from sklearn.decomposition import IncrementalPCA
ipca = IncrementalPCA(n_components=50, batch_size=1000)
X_ipca = ipca.fit_transform(X)
# ── TruncatedSVD (for sparse data, works with scipy sparse) ──
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_sparse)
svd.explained_variance_ratio_
# ── t-SNE (visualization only, not for preprocessing) ──
tsne = TSNE(
n_components=2,
perplexity=30, # typical range: 5-50
learning_rate='auto',
max_iter=1000, # renamed from n_iter in sklearn 1.5
random_state=42,
)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis')
plt.title('t-SNE Visualization')
| Method | Type | Model-Free? | Speed |
|---|---|---|---|
| VarianceThreshold | Filter | Yes | Very fast |
| SelectKBest | Filter | Yes | Fast |
| mutual_info_classif | Filter | Yes | Medium |
| SelectFromModel | Embedded | No | Fast |
| RFE | Wrapper | No | Slow |
| RFECV | Wrapper | No | Very slow |
| Lasso (L1) | Embedded | No | Fast |
| Method | Type | Sparse? | Use Case |
|---|---|---|---|
| PCA | Linear | No | Dense features, noise reduction |
| TruncatedSVD | Linear | Yes | Sparse (text/TF-IDF) |
| IncrementalPCA | Linear | No | Large datasets (streaming) |
| t-SNE | Non-linear | No | 2D/3D visualization only |
| UMAP | Non-linear | No | Vis + general dim reduction |
| LDA | Linear (supervised) | No | Classification, n_classes-1 |
💡 Fit dimensionality reduction on training data only, then use transform() for new data. Use PCA for preprocessing; use t-SNE only for visualization.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# ── Basic Pipeline ──
pipe = Pipeline([
('scaler', StandardScaler()),
('model', LogisticRegression(random_state=42)),
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
pipe.score(X_test, y_test)
# ── make_pipeline (auto-names steps) ──
pipe = make_pipeline(
StandardScaler(),
LogisticRegression(random_state=42),
)
# Steps named: 'standardscaler', 'logisticregression'
# ── ColumnTransformer (different transforms per column type) ──
numeric_features = ['age', 'salary', 'years_experience']
categorical_features = ['department', 'city', 'education_level']
numeric_transformer = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
],
remainder='drop', # 'drop' or 'passthrough'
verbose_feature_names_out=False,
)
# ── Full pipeline with preprocessing + model ──
full_pipe = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
full_pipe.fit(X_train, y_train)
y_pred = full_pipe.predict(X_test)
# ── Access pipeline steps ──
pipe.named_steps['scaler'] # access a step
pipe.named_steps['model'].coef_ # model coefficients
pipe[:-1] # all steps except last
pipe[-1] # last step only
# ── Set parameters (nested with __) ──
pipe.set_params(classifier__n_estimators=200)
pipe.get_params() # all params with __ notation
# ── GridSearchCV with pipeline ──
param_grid = {
'preprocessor__num__scaler': [StandardScaler(), RobustScaler(), 'passthrough'],
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [3, 5, None],
'classifier__min_samples_split': [2, 5, 10],
}
grid = GridSearchCV(full_pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
# ── Feature names from pipeline ──
feature_names = full_pipe.named_steps['preprocessor'].get_feature_names_out()
# ── Caching steps (speed up GridSearch) ──
from tempfile import mkdtemp
cachedir = mkdtemp()
pipe = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(random_state=42)),
], memory=cachedir) # caches fitted transformers
# ── Custom transformer ──
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureRatioAdder(BaseEstimator, TransformerMixin):
    """Append a ratio feature ``col1 / col2`` to a DataFrame.

    Rows whose denominator is 0 yield NaN instead of raising a
    division error. Follows sklearn conventions: ``__init__`` stores
    its parameters verbatim and ``fit`` is stateless.
    """

    def __init__(self, col1, col2, name='ratio'):
        self.col1 = col1
        self.col2 = col2
        self.name = name

    def fit(self, X, y=None):
        # Nothing to learn — returning self keeps pipeline chaining intact.
        return self

    def transform(self, X):
        out = X.copy()  # never mutate the caller's frame
        denominator = out[self.col2].replace(0, np.nan)  # 0 → NaN avoids div-by-zero
        out[self.name] = out[self.col1] / denominator
        return out
# Use in pipeline
pipe = Pipeline([
('ratio', FeatureRatioAdder('height', 'weight', 'bmi')),
('scaler', StandardScaler()),
('model', LogisticRegression()),
])
| Benefit | Description |
|---|---|
| No data leakage | Fit on train, transform test automatically |
| Cleaner code | Single object for fit/predict |
| GridSearch | Search over ALL params (preprocessing + model) |
| Reproducibility | One object to pickle/deploy |
| Safety | Can't accidentally fit on test data |
| Pattern | Code | Use Case |
|---|---|---|
| Named columns | Transformer(cols=['a','b']) | Select specific columns |
| Type-based | Transformer(StandardScaler()) | All numeric columns |
| Selector | make_column_selector(dtype_include=number) | Auto-select by type |
| Passthrough | remainder='passthrough' | Keep unprocessed columns |
| Drop | remainder='drop' | Drop unprocessed columns |
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
classification_report, confusion_matrix,
roc_auc_score, average_precision_score,
roc_curve, precision_recall_curve,
ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay,
cohen_kappa_score, matthews_corrcoef,
balanced_accuracy_score,
)
# ── Basic metrics ──
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='binary')
rec = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')
# ── Multi-class metrics (average parameter) ──
# average='micro': globally (counts all TP, FP, FN)
# average='macro': per-class unweighted mean
# average='weighted': per-class weighted by support
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
prec_macro = precision_score(y_test, y_pred, average='macro')
# ── Classification report ──
print(classification_report(
y_test, y_pred,
target_names=['Class 0', 'Class 1'],
digits=4,
output_dict=False, # True for dict output
))
# precision recall f1-score support
# Class 0 0.95 0.93 0.94 150
# Class 1 0.91 0.94 0.92 100
# ── Confusion Matrix ──
cm = confusion_matrix(y_test, y_pred, normalize='true') # 'true', 'pred', 'all'
ConfusionMatrixDisplay(cm, display_labels=['Neg', 'Pos']).plot()
# ── ROC Curve & AUC ──
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
RocCurveDisplay.from_predictions(y_test, y_prob)
# ── Precision-Recall Curve ──
prec_vals, rec_vals, pr_thresholds = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)
PrecisionRecallDisplay.from_predictions(y_test, y_prob)
# ── Threshold tuning ──
from sklearn.metrics import fbeta_score
f2 = fbeta_score(y_test, (y_prob > 0.3).astype(int), beta=2) # emphasize recall
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, median_absolute_error,
max_error, explained_variance_score, root_mean_squared_error,
)
import matplotlib.pyplot as plt
# ── Regression metrics ──
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred) # squared= kwarg removed in sklearn 1.6
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
me = max_error(y_test, y_pred) # worst single prediction
medae = median_absolute_error(y_test, y_pred)
# ── Prediction error plot ──
from sklearn.metrics import PredictionErrorDisplay
PredictionErrorDisplay.from_predictions(
y_test, y_pred, kind='actual_vs_predicted'
)
PredictionErrorDisplay.from_predictions(
y_test, y_pred, kind='residual_vs_predicted'
)
# ── Cross-validated metrics ──
from sklearn.model_selection import cross_val_score, cross_validate
scoring = {
'r2': 'r2',
'neg_mse': 'neg_mean_squared_error',
'neg_mae': 'neg_mean_absolute_error',
}
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
print(f'CV MSE: {-cv_results["test_neg_mse"].mean():.4f}')
print(f'CV MAE: {-cv_results["test_neg_mae"].mean():.4f}')
# ── Custom scoring function ──
from sklearn.metrics import make_scorer
def custom_metric(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred) / y_true)
scorer = make_scorer(custom_metric, greater_is_better=False)
scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
# ── Dummy baseline (always compare to this!) ──
from sklearn.dummy import DummyClassifier, DummyRegressor
dummy_clf = DummyClassifier(strategy='most_frequent') # or 'stratified'
dummy_reg = DummyRegressor(strategy='mean') # or 'median'
dummy_clf.fit(X_train, y_train).score(X_test, y_test)
dummy_reg.fit(X_train, y_train).score(X_test, y_test)
| Metric | Focus | When to Use |
|---|---|---|
| Accuracy | Overall correctness | Balanced classes |
| Precision | Minimize false positives | Spam detection |
| Recall | Minimize false negatives | Disease screening |
| F1 | Balance P & R | Imbalanced classes |
| Fβ (β>1) | Weighted toward recall | Medical diagnosis |
| ROC AUC | Ranking quality | Threshold-independent |
| Avg Precision | PR curve area | Imbalanced (better than AUC) |
| Cohen's Kappa | Agreement beyond chance | Inter-rater |
💡 Always compare against a baseline: DummyClassifier(strategy='stratified') or DummyRegressor(strategy='mean') to verify your model actually learns. A high accuracy on imbalanced data might just mean it always predicts the majority class.