Loading code…
Learn to properly evaluate your models — avoiding overfitting, ensuring reliable estimates, and preventing data leakage.
5 Topics — sklearn Implementation
🏠 Indian Context
Examples use Mumbai/Bangalore house prices and loan approval data — predicting real estate values and credit decisions!
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------
# Demo 1: the train-test split.
# Builds a synthetic Mumbai/Bangalore house-price dataset, then holds
# out 20% of the rows so model performance can later be measured on
# data the model never saw during training.
# ------------------------------------------------------------------

# Sample: Mumbai/Bangalore house price data
np.random.seed(42)  # fixed seed so the synthetic data is reproducible
n = 100
house_df = pd.DataFrame({
    'Size_Sqft': np.random.randint(500, 3000, n),
    'Bedrooms': np.random.randint(1, 5, n),
    'Age_Years': np.random.randint(0, 30, n),
    'City': np.random.choice(['Mumbai', 'Bangalore'], n),
    'Price_Lakhs': np.random.randint(30, 300, n)
})

print("=" * 60)
print("📊 WHY WE SPLIT DATA")
print("=" * 60)
print(f"\nTotal samples: {len(house_df)}")
print(house_df.head())

# Prepare features (X) and target (y); 'City' is intentionally left out
# (categorical — would need encoding before modelling).
X = house_df[['Size_Sqft', 'Bedrooms', 'Age_Years']]
y = house_df['Price_Lakhs']
print(f"\nFeatures (X): {X.shape}")
print(f"Target (y): {y.shape}")

# =====================================
# THE TRAIN-TEST SPLIT
# =====================================
print("\n" + "=" * 60)
print("✂️ TRAIN-TEST SPLIT")
print("=" * 60)

# 80-20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,   # 20% for testing
    random_state=42  # For reproducibility
)

print(f"\nTraining set:")
print(f" X_train: {X_train.shape} → {len(X_train)} samples")
print(f" y_train: {y_train.shape} → {len(y_train)} labels")
print(f"\nTest set:")
print(f" X_test: {X_test.shape} → {len(X_test)} samples")
print(f" y_test: {y_test.shape} → {len(y_test)} labels")
print(f"\n📊 Split ratio: {len(X_train)}/{len(X_test)} = {len(X_train)/len(house_df)*100:.0f}%/{len(X_test)/len(house_df)*100:.0f}%")

# The random_state ensures reproducibility
print("\n💡 TIP: Use random_state=42 for reproducible results")
print(" Without it, every run gives different splits!")

# NOTE(review): this import belonged to the next tutorial script; it was
# fused onto the previous line during extraction. Kept here so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------
# Demo 2: stratified sampling.
# Loan-approval data is imbalanced (~85% approved, ~15% rejected).
# A plain random split can distort the class ratio in train/test;
# stratify=y preserves the ratio in both sets.
# ------------------------------------------------------------------

# Simulating IMBALANCED dataset
# Loan approval: 85% approved, 15% rejected (realistic!)
np.random.seed(42)
n = 200
loan_df = pd.DataFrame({
    'Income': np.random.randint(200000, 2000000, n),
    'Credit_Score': np.random.randint(300, 900, n),
    'Loan_Amount': np.random.randint(100000, 5000000, n),
    'Approved': np.random.choice([1, 0], n, p=[0.85, 0.15])  # Imbalanced!
})
X = loan_df[['Income', 'Credit_Score', 'Loan_Amount']]
y = loan_df['Approved']

print("=" * 60)
print("📊 STRATIFIED SAMPLING")
print("=" * 60)

# Original class distribution
print("\n🎯 ORIGINAL CLASS DISTRIBUTION:")
print(f" Approved (1): {(y == 1).sum()} ({(y == 1).mean()*100:.1f}%)")
print(f" Rejected (0): {(y == 0).sum()} ({(y == 0).mean()*100:.1f}%)")

# =====================================
# WITHOUT STRATIFICATION
# =====================================
print("\n" + "=" * 60)
print("❌ WITHOUT STRATIFICATION")
print("=" * 60)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\nTraining Set Distribution:")
print(f" Approved: {(y_train == 1).sum()} ({(y_train == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_train == 0).sum()} ({(y_train == 0).mean()*100:.1f}%)")
print(f"\nTest Set Distribution:")
print(f" Approved: {(y_test == 1).sum()} ({(y_test == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_test == 0).sum()} ({(y_test == 0).mean()*100:.1f}%)")

# =====================================
# WITH STRATIFICATION
# =====================================
print("\n" + "=" * 60)
print("✅ WITH STRATIFICATION (stratify=y)")
print("=" * 60)
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # <-- Key change!
)
print(f"\nTraining Set Distribution:")
print(f" Approved: {(y_train_strat == 1).sum()} ({(y_train_strat == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_train_strat == 0).sum()} ({(y_train_strat == 0).mean()*100:.1f}%)")
print(f"\nTest Set Distribution:")
print(f" Approved: {(y_test_strat == 1).sum()} ({(y_test_strat == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_test_strat == 0).sum()} ({(y_test_strat == 0).mean()*100:.1f}%)")
print("\n💡 Both sets now have ~85% Approved, ~15% Rejected")
print(" This gives more reliable model evaluation!")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# ------------------------------------------------------------------
# Demo 3: K-fold cross-validation.
# Shows that a single train/test split gives a high-variance estimate
# (score depends on which rows land in the test set), while averaging
# over K folds gives a more reliable estimate.
# ------------------------------------------------------------------

# House price prediction data
np.random.seed(42)
n = 100
X = np.random.randint(500, 3000, (n, 1))  # Size in sqft
y = 50 + 0.1 * X.ravel() + np.random.normal(0, 20, n)  # Price in lakhs

print("=" * 60)
print("🔄 K-FOLD CROSS-VALIDATION")
print("=" * 60)
print(f"\nDataset: {n} samples")

# =====================================
# SINGLE TRAIN-TEST SPLIT
# =====================================
print("\n" + "=" * 60)
print("❌ SINGLE SPLIT (Unreliable)")
print("=" * 60)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

model = LinearRegression()
# Try different random states to see variation
scores_single = []
for rs in [42, 123, 456, 789, 101]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores_single.append(score)
    print(f" Random State {rs}: R² = {score:.4f}")
print(f"\n Mean: {np.mean(scores_single):.4f}")
print(f" Std Dev: {np.std(scores_single):.4f}")
print(f" ⚠️ High variance! Results depend on lucky/unlucky split.")

# =====================================
# K-FOLD CROSS-VALIDATION
# =====================================
print("\n" + "=" * 60)
print("✅ 5-FOLD CROSS-VALIDATION (Reliable)")
print("=" * 60)
# Method 1: Using KFold manually
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    fold_scores.append(score)
    print(f" Fold {fold}: R² = {score:.4f} | Train: {len(X_train)}, Test: {len(X_test)}")
print(f"\n Mean R²: {np.mean(fold_scores):.4f}")
print(f" Std Dev: {np.std(fold_scores):.4f}")

# Method 2: Using cross_val_score (simpler!)
print("\n" + "=" * 60)
print("🚀 EASIER: cross_val_score()")
print("=" * 60)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"\nScores: {cv_scores.round(4)}")
print(f"Mean ± Std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Different scoring metrics (error metrics are negated so that
# "greater is better" holds for every sklearn scorer)
print("\n📊 Different Metrics:")
for metric in ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']:
    scores = cross_val_score(model, X, y, cv=5, scoring=metric)
    print(f" {metric}: {scores.mean():.4f} ± {scores.std():.4f}")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

# ------------------------------------------------------------------
# Demo 4: StratifiedKFold for imbalanced classification.
# With only ~5% positives (fraud), plain KFold can put very few — or
# very many — positives into a given test fold; StratifiedKFold keeps
# the class ratio consistent across folds.
# ------------------------------------------------------------------

# Imbalanced classification: Fraud detection
# Only 5% are fraud (realistic!)
np.random.seed(42)
n = 500
X = np.random.randn(n, 2)
y = np.random.choice([0, 1], n, p=[0.95, 0.05])  # 5% fraud

print("=" * 60)
print("📊 STRATIFIED K-FOLD (for Classification)")
print("=" * 60)
print(f"\nDataset: {n} samples")
print(f"Class Distribution:")
print(f" Normal (0): {(y == 0).sum()} ({(y == 0).mean()*100:.1f}%)")
print(f" Fraud (1): {(y == 1).sum()} ({(y == 1).mean()*100:.1f}%)")

# =====================================
# REGULAR K-FOLD (Problem)
# =====================================
print("\n" + "=" * 60)
print("❌ REGULAR K-FOLD (Class imbalance issue)")
print("=" * 60)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    y_test_fold = y[test_idx]
    fraud_pct = (y_test_fold == 1).mean() * 100
    print(f" Fold {fold}: Fraud in test = {(y_test_fold == 1).sum()} ({fraud_pct:.1f}%)")
print(" ⚠️ Fraud percentage varies widely between folds!")

# =====================================
# STRATIFIED K-FOLD (Solution)
# =====================================
print("\n" + "=" * 60)
print("✅ STRATIFIED K-FOLD (Consistent)")
print("=" * 60)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):  # split() needs y for stratification
    y_test_fold = y[test_idx]
    fraud_pct = (y_test_fold == 1).mean() * 100
    print(f" Fold {fold}: Fraud in test = {(y_test_fold == 1).sum()} ({fraud_pct:.1f}%)")
print(" ✅ Each fold has ~5% fraud (matches original)")

# =====================================
# COMPARING SCORES
# =====================================
print("\n" + "=" * 60)
print("📊 COMPARING MODEL EVALUATION")
print("=" * 60)
# class_weight='balanced' compensates for the 95/5 imbalance during fitting
model = LogisticRegression(class_weight='balanced', random_state=42)

# Regular K-Fold
kf_scores = cross_val_score(model, X, y, cv=KFold(5, shuffle=True, random_state=42),
                            scoring='f1')
print(f"\nRegular KFold F1 scores: {kf_scores.round(3)}")
print(f" Mean ± Std: {kf_scores.mean():.3f} ± {kf_scores.std():.3f}")

# Stratified K-Fold
skf_scores = cross_val_score(model, X, y, cv=StratifiedKFold(5, shuffle=True, random_state=42),
                             scoring='f1')
print(f"\nStratified KFold F1 scores: {skf_scores.round(3)}")
print(f" Mean ± Std: {skf_scores.mean():.3f} ± {skf_scores.std():.3f}")
print("\n💡 Stratified gives more consistent scores (lower std dev)")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------------
# Demo 5: data-leakage prevention.
# Contrasts scaling BEFORE the split (test statistics leak into the
# scaler → optimistic scores) with scaling AFTER the split, and shows
# how a Pipeline inside cross-validation handles this automatically.
# ------------------------------------------------------------------

# House price data
np.random.seed(42)
n = 200
X = np.column_stack([
    np.random.randint(500, 3000, n),  # Size
    np.random.randint(1, 5, n),       # Bedrooms
    np.random.randint(0, 30, n)       # Age
])
# Target is a known linear function of the features plus Gaussian noise.
y = 30 + 0.08 * X[:, 0] + 10 * X[:, 1] - 0.5 * X[:, 2] + np.random.normal(0, 20, n)

print("=" * 60)
print("🚨 DATA LEAKAGE PREVENTION")
print("=" * 60)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================================
# ❌ WRONG: Scaling on entire dataset (LEAKAGE!)
# =====================================
print("\n" + "=" * 60)
print("❌ WRONG: Scale BEFORE split (Data Leakage!)")
print("=" * 60)
scaler_wrong = StandardScaler()
# This is WRONG! Test data influences the scaling
X_scaled_all = scaler_wrong.fit_transform(X)  # Uses ALL data
X_train_wrong, X_test_wrong, y_train_w, y_test_w = train_test_split(
    X_scaled_all, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train_wrong, y_train_w)
score_wrong = model.score(X_test_wrong, y_test_w)
print(f"R² Score (with leakage): {score_wrong:.4f}")
print("⚠️ This score is OPTIMISTIC and won't generalize!")

# =====================================
# ✅ CORRECT: Scale AFTER split
# =====================================
print("\n" + "=" * 60)
print("✅ CORRECT: Scale AFTER split (No Leakage)")
print("=" * 60)
scaler_correct = StandardScaler()
# Fit ONLY on training data
scaler_correct.fit(X_train)
# Transform both using the SAME fitted scaler
X_train_scaled = scaler_correct.transform(X_train)
X_test_scaled = scaler_correct.transform(X_test)
print(f"Training data mean: {X_train_scaled.mean(axis=0).round(4)}")
print(f"Test data mean: {X_test_scaled.mean(axis=0).round(4)}")
print("💡 Test mean ≠ 0 because we didn't fit on test data!")
model.fit(X_train_scaled, y_train)
score_correct = model.score(X_test_scaled, y_test)
print(f"\nR² Score (correct method): {score_correct:.4f}")

# =====================================
# 🚀 BEST: Use sklearn Pipeline
# =====================================
print("\n" + "=" * 60)
print("🚀 BEST PRACTICE: sklearn Pipeline")
print("=" * 60)
# Pipeline ensures correct order automatically!
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
# Cross-validation with pipeline (automatically handles leakage:
# the scaler is re-fitted on each fold's training portion only)
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print(f"\nCross-validation R² scores: {cv_scores.round(4)}")
print(f"Mean ± Std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print("\n✅ Pipeline automatically:")
print(" 1. Fits scaler on each fold's training data")
print(" 2. Transforms training and test separately")
print(" 3. Prevents any data leakage!")

# Summary
print("\n" + "=" * 60)
print("📋 SUMMARY: AVOIDING DATA LEAKAGE")
print("=" * 60)
print("""
| Step | Rule |
|-------------------------|--------------------------------|
| 1. Split data | ALWAYS do this FIRST |
| 2. Preprocessing | Fit on TRAIN only |
| 3. Feature Engineering | No target info in features |
| 4. Time Series | Split chronologically |
| 5. Best Practice | Use sklearn Pipeline! |
""")

# You've mastered the essential data skills. Now you're ready to build your first ML model!