Loading code…
Learn to properly evaluate your models — avoiding overfitting, ensuring reliable estimates, and preventing data leakage.
5 Topics — sklearn Implementation
🏠 Indian Context
Examples use Mumbai/Bangalore house prices and loan approval data — predicting real estate values and credit decisions!
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------
# Demo 1: the train-test split.
# Builds a synthetic Mumbai/Bangalore house-price dataset, then holds
# out 20% of the rows so model performance can later be measured on
# data the model never saw during training.
# ------------------------------------------------------------------

# Sample: Mumbai/Bangalore house price data
np.random.seed(42)  # fixed seed so the synthetic data is reproducible
n = 100
house_df = pd.DataFrame({
    'Size_Sqft': np.random.randint(500, 3000, n),
    'Bedrooms': np.random.randint(1, 5, n),
    'Age_Years': np.random.randint(0, 30, n),
    'City': np.random.choice(['Mumbai', 'Bangalore'], n),
    'Price_Lakhs': np.random.randint(30, 300, n)
})

print("=" * 60)
print("📊 WHY WE SPLIT DATA")
print("=" * 60)
print(f"\nTotal samples: {len(house_df)}")
print(house_df.head())

# Prepare features (X) and target (y); 'City' is intentionally left out
# (categorical — would need encoding before modelling).
X = house_df[['Size_Sqft', 'Bedrooms', 'Age_Years']]
y = house_df['Price_Lakhs']
print(f"\nFeatures (X): {X.shape}")
print(f"Target (y): {y.shape}")

# =====================================
# THE TRAIN-TEST SPLIT
# =====================================
print("\n" + "=" * 60)
print("✂️ TRAIN-TEST SPLIT")
print("=" * 60)

# 80-20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,   # 20% for testing
    random_state=42  # For reproducibility
)

print(f"\nTraining set:")
print(f" X_train: {X_train.shape} → {len(X_train)} samples")
print(f" y_train: {y_train.shape} → {len(y_train)} labels")
print(f"\nTest set:")
print(f" X_test: {X_test.shape} → {len(X_test)} samples")
print(f" y_test: {y_test.shape} → {len(y_test)} labels")
print(f"\n📊 Split ratio: {len(X_train)}/{len(X_test)} = {len(X_train)/len(house_df)*100:.0f}%/{len(X_test)/len(house_df)*100:.0f}%")

# The random_state ensures reproducibility
print("\n💡 TIP: Use random_state=42 for reproducible results")
print(" Without it, every run gives different splits!")

# NOTE(review): this import belonged to the next tutorial script; it was
# fused onto the previous line during extraction. Kept here so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------------
# Demo 2: stratified sampling.
# Loan-approval data is imbalanced (~85% approved, ~15% rejected).
# A plain random split can distort the class ratio in train/test;
# stratify=y preserves the ratio in both sets.
# ------------------------------------------------------------------

# Simulating IMBALANCED dataset
# Loan approval: 85% approved, 15% rejected (realistic!)
np.random.seed(42)
n = 200
loan_df = pd.DataFrame({
    'Income': np.random.randint(200000, 2000000, n),
    'Credit_Score': np.random.randint(300, 900, n),
    'Loan_Amount': np.random.randint(100000, 5000000, n),
    'Approved': np.random.choice([1, 0], n, p=[0.85, 0.15])  # Imbalanced!
})
X = loan_df[['Income', 'Credit_Score', 'Loan_Amount']]
y = loan_df['Approved']

print("=" * 60)
print("📊 STRATIFIED SAMPLING")
print("=" * 60)

# Original class distribution
print("\n🎯 ORIGINAL CLASS DISTRIBUTION:")
print(f" Approved (1): {(y == 1).sum()} ({(y == 1).mean()*100:.1f}%)")
print(f" Rejected (0): {(y == 0).sum()} ({(y == 0).mean()*100:.1f}%)")

# =====================================
# WITHOUT STRATIFICATION
# =====================================
print("\n" + "=" * 60)
print("❌ WITHOUT STRATIFICATION")
print("=" * 60)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\nTraining Set Distribution:")
print(f" Approved: {(y_train == 1).sum()} ({(y_train == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_train == 0).sum()} ({(y_train == 0).mean()*100:.1f}%)")
print(f"\nTest Set Distribution:")
print(f" Approved: {(y_test == 1).sum()} ({(y_test == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_test == 0).sum()} ({(y_test == 0).mean()*100:.1f}%)")

# =====================================
# WITH STRATIFICATION
# =====================================
print("\n" + "=" * 60)
print("✅ WITH STRATIFICATION (stratify=y)")
print("=" * 60)
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # <-- Key change!
)
print(f"\nTraining Set Distribution:")
print(f" Approved: {(y_train_strat == 1).sum()} ({(y_train_strat == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_train_strat == 0).sum()} ({(y_train_strat == 0).mean()*100:.1f}%)")
print(f"\nTest Set Distribution:")
print(f" Approved: {(y_test_strat == 1).sum()} ({(y_test_strat == 1).mean()*100:.1f}%)")
print(f" Rejected: {(y_test_strat == 0).sum()} ({(y_test_strat == 0).mean()*100:.1f}%)")
print("\n💡 Both sets now have ~85% Approved, ~15% Rejected")
print(" This gives more reliable model evaluation!")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# ------------------------------------------------------------------
# Demo 3: K-fold cross-validation.
# Shows that a single train/test split gives a high-variance estimate
# (score depends on which rows land in the test set), while averaging
# over K folds gives a more reliable estimate.
# ------------------------------------------------------------------

# House price prediction data
np.random.seed(42)
n = 100
X = np.random.randint(500, 3000, (n, 1))  # Size in sqft
y = 50 + 0.1 * X.ravel() + np.random.normal(0, 20, n)  # Price in lakhs

print("=" * 60)
print("🔄 K-FOLD CROSS-VALIDATION")
print("=" * 60)
print(f"\nDataset: {n} samples")

# =====================================
# SINGLE TRAIN-TEST SPLIT
# =====================================
print("\n" + "=" * 60)
print("❌ SINGLE SPLIT (Unreliable)")
print("=" * 60)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

model = LinearRegression()
# Try different random states to see variation
scores_single = []
for rs in [42, 123, 456, 789, 101]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores_single.append(score)
    print(f" Random State {rs}: R² = {score:.4f}")
print(f"\n Mean: {np.mean(scores_single):.4f}")
print(f" Std Dev: {np.std(scores_single):.4f}")
print(f" ⚠️ High variance! Results depend on lucky/unlucky split.")

# =====================================
# K-FOLD CROSS-VALIDATION
# =====================================
print("\n" + "=" * 60)
print("✅ 5-FOLD CROSS-VALIDATION (Reliable)")
print("=" * 60)
# Method 1: Using KFold manually
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    fold_scores.append(score)
    print(f" Fold {fold}: R² = {score:.4f} | Train: {len(X_train)}, Test: {len(X_test)}")
print(f"\n Mean R²: {np.mean(fold_scores):.4f}")
print(f" Std Dev: {np.std(fold_scores):.4f}")

# Method 2: Using cross_val_score (simpler!)
print("\n" + "=" * 60)
print("🚀 EASIER: cross_val_score()")
print("=" * 60)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"\nScores: {cv_scores.round(4)}")
print(f"Mean ± Std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Different scoring metrics (error metrics are negated so that
# "greater is better" holds for every sklearn scorer)
print("\n📊 Different Metrics:")
for metric in ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']:
    scores = cross_val_score(model, X, y, cv=5, scoring=metric)
    print(f" {metric}: {scores.mean():.4f} ± {scores.std():.4f}")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

# ------------------------------------------------------------------
# Demo 4: StratifiedKFold for imbalanced classification.
# With only ~5% positives (fraud), plain KFold can put very few — or
# very many — positives into a given test fold; StratifiedKFold keeps
# the class ratio consistent across folds.
# ------------------------------------------------------------------

# Imbalanced classification: Fraud detection
# Only 5% are fraud (realistic!)
np.random.seed(42)
n = 500
X = np.random.randn(n, 2)
y = np.random.choice([0, 1], n, p=[0.95, 0.05])  # 5% fraud

print("=" * 60)
print("📊 STRATIFIED K-FOLD (for Classification)")
print("=" * 60)
print(f"\nDataset: {n} samples")
print(f"Class Distribution:")
print(f" Normal (0): {(y == 0).sum()} ({(y == 0).mean()*100:.1f}%)")
print(f" Fraud (1): {(y == 1).sum()} ({(y == 1).mean()*100:.1f}%)")

# =====================================
# REGULAR K-FOLD (Problem)
# =====================================
print("\n" + "=" * 60)
print("❌ REGULAR K-FOLD (Class imbalance issue)")
print("=" * 60)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    y_test_fold = y[test_idx]
    fraud_pct = (y_test_fold == 1).mean() * 100
    print(f" Fold {fold}: Fraud in test = {(y_test_fold == 1).sum()} ({fraud_pct:.1f}%)")
print(" ⚠️ Fraud percentage varies widely between folds!")

# =====================================
# STRATIFIED K-FOLD (Solution)
# =====================================
print("\n" + "=" * 60)
print("✅ STRATIFIED K-FOLD (Consistent)")
print("=" * 60)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):  # split() needs y for stratification
    y_test_fold = y[test_idx]
    fraud_pct = (y_test_fold == 1).mean() * 100
    print(f" Fold {fold}: Fraud in test = {(y_test_fold == 1).sum()} ({fraud_pct:.1f}%)")
print(" ✅ Each fold has ~5% fraud (matches original)")

# =====================================
# COMPARING SCORES
# =====================================
print("\n" + "=" * 60)
print("📊 COMPARING MODEL EVALUATION")
print("=" * 60)
# class_weight='balanced' compensates for the 95/5 imbalance during fitting
model = LogisticRegression(class_weight='balanced', random_state=42)

# Regular K-Fold
kf_scores = cross_val_score(model, X, y, cv=KFold(5, shuffle=True, random_state=42),
                            scoring='f1')
print(f"\nRegular KFold F1 scores: {kf_scores.round(3)}")
print(f" Mean ± Std: {kf_scores.mean():.3f} ± {kf_scores.std():.3f}")

# Stratified K-Fold
skf_scores = cross_val_score(model, X, y, cv=StratifiedKFold(5, shuffle=True, random_state=42),
                             scoring='f1')
print(f"\nStratified KFold F1 scores: {skf_scores.round(3)}")
print(f" Mean ± Std: {skf_scores.mean():.3f} ± {skf_scores.std():.3f}")
print("\n💡 Stratified gives more consistent scores (lower std dev)")

# NOTE(review): fused import from the next tutorial script; kept so the
# following script still has numpy in scope.
import numpy as np
import numpy as np  # needed by this script; previously leaked in from the prior example
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------------
# Demo 5: data-leakage prevention.
# Contrasts scaling BEFORE the split (test statistics leak into the
# scaler → optimistic scores) with scaling AFTER the split, and shows
# how a Pipeline inside cross-validation handles this automatically.
# ------------------------------------------------------------------

# House price data
np.random.seed(42)
n = 200
X = np.column_stack([
    np.random.randint(500, 3000, n),  # Size
    np.random.randint(1, 5, n),       # Bedrooms
    np.random.randint(0, 30, n)       # Age
])
# Target is a known linear function of the features plus Gaussian noise.
y = 30 + 0.08 * X[:, 0] + 10 * X[:, 1] - 0.5 * X[:, 2] + np.random.normal(0, 20, n)

print("=" * 60)
print("🚨 DATA LEAKAGE PREVENTION")
print("=" * 60)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================================
# ❌ WRONG: Scaling on entire dataset (LEAKAGE!)
# =====================================
print("\n" + "=" * 60)
print("❌ WRONG: Scale BEFORE split (Data Leakage!)")
print("=" * 60)
scaler_wrong = StandardScaler()
# This is WRONG! Test data influences the scaling
X_scaled_all = scaler_wrong.fit_transform(X)  # Uses ALL data
X_train_wrong, X_test_wrong, y_train_w, y_test_w = train_test_split(
    X_scaled_all, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train_wrong, y_train_w)
score_wrong = model.score(X_test_wrong, y_test_w)
print(f"R² Score (with leakage): {score_wrong:.4f}")
print("⚠️ This score is OPTIMISTIC and won't generalize!")

# =====================================
# ✅ CORRECT: Scale AFTER split
# =====================================
print("\n" + "=" * 60)
print("✅ CORRECT: Scale AFTER split (No Leakage)")
print("=" * 60)
scaler_correct = StandardScaler()
# Fit ONLY on training data
scaler_correct.fit(X_train)
# Transform both using the SAME fitted scaler
X_train_scaled = scaler_correct.transform(X_train)
X_test_scaled = scaler_correct.transform(X_test)
print(f"Training data mean: {X_train_scaled.mean(axis=0).round(4)}")
print(f"Test data mean: {X_test_scaled.mean(axis=0).round(4)}")
print("💡 Test mean ≠ 0 because we didn't fit on test data!")
model.fit(X_train_scaled, y_train)
score_correct = model.score(X_test_scaled, y_test)
print(f"\nR² Score (correct method): {score_correct:.4f}")

# =====================================
# 🚀 BEST: Use sklearn Pipeline
# =====================================
print("\n" + "=" * 60)
print("🚀 BEST PRACTICE: sklearn Pipeline")
print("=" * 60)
# Pipeline ensures correct order automatically!
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
# Cross-validation with pipeline (automatically handles leakage:
# the scaler is re-fitted on each fold's training portion only)
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print(f"\nCross-validation R² scores: {cv_scores.round(4)}")
print(f"Mean ± Std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print("\n✅ Pipeline automatically:")
print(" 1. Fits scaler on each fold's training data")
print(" 2. Transforms training and test separately")
print(" 3. Prevents any data leakage!")

# Summary
print("\n" + "=" * 60)
print("📋 SUMMARY: AVOIDING DATA LEAKAGE")
print("=" * 60)
print("""
| Step | Rule |
|-------------------------|--------------------------------|
| 1. Split data | ALWAYS do this FIRST |
| 2. Preprocessing | Fit on TRAIN only |
| 3. Feature Engineering | No target info in features |
| 4. Time Series | Split chronologically |
| 5. Best Practice | Use sklearn Pipeline! |
""")

# You've mastered the essential data skills. Now you're ready to build your first ML model!