Loading Code...
Transform messy real-world data into clean, model-ready format — missing values, outliers, encoding, and scaling.
5 Topics
Python
+ sklearn
Indian Context
Examples use Ola/Uber ride data — handling missing pickup times, outlier fares, payment modes!
import pandas as pd
import numpy as np

# Simulate Ola/Uber ride data with REAL problems: missing values,
# an impossible outlier distance, and inconsistent category labels.
ride_data = {
    'Ride_ID': range(1, 11),
    'Pickup_Time': ['08:30', '09:15', None, '12:45', '14:00',
                    '18:30', None, '20:00', '22:15', '23:45'],
    'Distance_KM': [5.2, 12.5, 8.0, 3.5, 50000, 7.0, 15.2, 9.8, 4.5, 6.0],  # 50000 is outlier!
    'Fare': [85, 195, 125, 60, None, 110, 230, 155, 75, 95],
    'Payment': ['Card', 'Cash', 'UPI', 'cash', 'CARD', 'upi', 'Cash', 'Card', 'Upi', 'cash'],
    'Rating': [4.5, 4.0, None, 3.5, 4.8, 4.2, 3.0, 4.7, None, 4.3]
}
df = pd.DataFrame(ride_data)

print("=" * 60)
print("RAW OLA/UBER RIDE DATA (with problems)")
print("=" * 60)
print(df)

# Identify all the problems
print("\n" + "=" * 60)
print("DATA QUALITY ISSUES DETECTED")
print("=" * 60)

# 1. Missing Values
print("\nMissing Values:")
print(df.isnull().sum())

# 2. Inconsistent Categories ('Cash'/'cash'/'CARD'... should be one label each)
print("\nInconsistent Payment Types:")
print(df['Payment'].unique())

# 3. Outliers
print("\nDistance Statistics (check for outliers):")
print(f"   Max Distance: {df['Distance_KM'].max()} km")
print(f"   Mean Distance: {df['Distance_KM'].mean():.1f} km")
print("   Suspicious: 50000 km is impossible for a city ride!")

# 4. Summary: all missing cells + 1 outlier distance + 4 misspelled payment labels
total_issues = df.isnull().sum().sum() + 1 + 4  # missing + outlier + format
print(f"\nTotal Issues: {total_issues} problems to fix before ML")

import pandas as pd
import numpy as np

# Build a synthetic customer dataset with missing values injected into
# numerical (Age, Income) and categorical (City) columns.
np.random.seed(42)
n = 100
customer_df = pd.DataFrame({
    'Customer_ID': range(1, n + 1),
    'Age': np.random.choice([25, 30, 35, 40, 45, np.nan], n, p=[0.3, 0.25, 0.2, 0.1, 0.1, 0.05]),
    'Income': np.random.choice([30000, 50000, 80000, np.nan], n, p=[0.4, 0.3, 0.2, 0.1]),
    'City': np.random.choice(['Mumbai', 'Delhi', 'Bangalore', None], n, p=[0.35, 0.3, 0.25, 0.1]),
    'Purchase_Amount': np.random.uniform(100, 5000, n)
})

print("=" * 60)
print("HANDLING MISSING VALUES")
print("=" * 60)

# Step 1: Detect Missing Values
print("\n1. MISSING VALUE DETECTION:")
print("\nMissing per column:")
print(customer_df.isnull().sum())
print("\nPercentage missing:")
print((customer_df.isnull().sum() / len(customer_df) * 100).round(1))

# Step 2: Fill numerical Age with the MEDIAN (robust to skewed values).
# NOTE: assign the result back instead of calling fillna(..., inplace=True)
# on a column selection -- chained-assignment inplace is deprecated and is a
# silent no-op under pandas Copy-on-Write (default in pandas 3.0).
print("\n2. FILL NUMERICAL (Age) WITH MEDIAN:")
age_median = customer_df['Age'].median()
print(f"   Age Median: {age_median}")
customer_df['Age'] = customer_df['Age'].fillna(age_median)
print(f"   Missing Ages after fix: {customer_df['Age'].isnull().sum()}")

# Step 3: Fill numerical Income with the MEAN.
print("\n3. FILL NUMERICAL (Income) WITH MEAN:")
income_mean = customer_df['Income'].mean()
print(f"   Income Mean: ₹{income_mean:.0f}")
customer_df['Income'] = customer_df['Income'].fillna(income_mean)
print(f"   Missing Incomes after fix: {customer_df['Income'].isnull().sum()}")

# Step 4: Fill categorical City with the MODE (most frequent value).
print("\n4. FILL CATEGORICAL (City) WITH MODE:")
city_mode = customer_df['City'].mode()[0]
print(f"   Most common city: {city_mode}")
customer_df['City'] = customer_df['City'].fillna(city_mode)
print(f"   Missing Cities after fix: {customer_df['City'].isnull().sum()}")

# Final Check: no missing values should remain in any column.
print("\nAFTER HANDLING MISSING VALUES:")
print(customer_df.isnull().sum())
print(f"\nTotal missing: {customer_df.isnull().sum().sum()}")
# Advanced: sklearn's SimpleImputer performs the same imputation in a
# reusable, pipeline-friendly way (fit on train, transform on train AND test).
print("\n" + "=" * 60)
print("ADVANCED: Using sklearn's SimpleImputer")
print("=" * 60)
from sklearn.impute import SimpleImputer

# Imputer for numerical columns (median is robust to outliers; 'mean' also works)
num_imputer = SimpleImputer(strategy='median')
# Imputer for categorical columns (fills with the most frequent value, i.e. the mode)
cat_imputer = SimpleImputer(strategy='most_frequent')

print("\nSimpleImputer allows consistent imputation in ML pipelines")
print("   Usage: imputer.fit_transform(data)")

import pandas as pd
import numpy as np

# Sample Ola/Uber fare data: 95 normal fares plus 5 injected outliers.
np.random.seed(42)
fares = np.concatenate([
    np.random.normal(150, 30, 95),          # Normal fares (about ₹120-180)
    np.array([500, 800, 15, 1200, 2000])    # Outliers!
])
df = pd.DataFrame({'Fare': fares})

print("=" * 60)
print("OUTLIER DETECTION METHODS")
print("=" * 60)

# ===============================
# METHOD 1: IQR (Interquartile Range)
# ===============================
print("\n1. IQR METHOD (Interquartile Range)")
print("-" * 40)
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
# Tukey fences: anything beyond 1.5 * IQR outside the quartiles is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"Q1 (25th percentile): ₹{Q1:.2f}")
print(f"Q3 (75th percentile): ₹{Q3:.2f}")
print(f"IQR: ₹{IQR:.2f}")
print(f"\nLower Bound: ₹{lower_bound:.2f}")
print(f"Upper Bound: ₹{upper_bound:.2f}")

# Detect outliers
iqr_outliers = df[(df['Fare'] < lower_bound) | (df['Fare'] > upper_bound)]
print(f"\nOutliers detected (IQR): {len(iqr_outliers)}")
print(iqr_outliers['Fare'].values)

# ===============================
# METHOD 2: Z-Score
# ===============================
print("\n2. Z-SCORE METHOD")
print("-" * 40)
from scipy import stats

# Absolute Z-scores: distance from the mean in standard-deviation units.
z_scores = np.abs(stats.zscore(df['Fare']))
df['Z_Score'] = z_scores
# Outliers: Z-score > 2 (use 3 for a stricter rule)
threshold = 2
zscore_outliers = df[df['Z_Score'] > threshold]
print(f"Mean: ₹{df['Fare'].mean():.2f}")
print(f"Std Dev: ₹{df['Fare'].std():.2f}")
print(f"Threshold: |Z| > {threshold}")
print(f"\nOutliers detected (Z-Score): {len(zscore_outliers)}")
print(zscore_outliers[['Fare', 'Z_Score']].values)

# ===============================
# TREATMENT: Remove vs Cap
# ===============================
print("\n" + "=" * 60)
print("OUTLIER TREATMENT")
print("=" * 60)

# Option 1: Remove outliers (loses rows, shrinks the dataset)
df_removed = df[(df['Fare'] >= lower_bound) & (df['Fare'] <= upper_bound)]
print("\nOption 1 - REMOVE:")
print(f"   Before: {len(df)} rows")
print(f"   After:  {len(df_removed)} rows")

# Option 2: Cap outliers at the IQR fences (Winsorizing) -- keeps all rows
df_capped = df.copy()
df_capped['Fare'] = df_capped['Fare'].clip(lower=lower_bound, upper=upper_bound)
print("\nOption 2 - CAP (Winsorize):")
print(f"   Original max: ₹{df['Fare'].max():.2f}")
print(f"   Capped max: ₹{df_capped['Fare'].max():.2f}")

# Compare the effect of each treatment on the mean fare
print("\nImpact on Mean:")
print(f"   Original: ₹{df['Fare'].mean():.2f}")
print(f"   After Remove: ₹{df_removed['Fare'].mean():.2f}")
print(f"   After Cap: ₹{df_capped['Fare'].mean():.2f}")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Sample data with categorical features
# (nominal: City, Education; ordinal: Experience_Level)
job_data = {
    'Name': ['Rahul', 'Priya', 'Amit', 'Sneha', 'Vikram'],
    'City': ['Mumbai', 'Delhi', 'Bangalore', 'Mumbai', 'Delhi'],
    'Education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],
    'Experience_Level': ['Junior', 'Mid', 'Senior', 'Junior', 'Senior'],
    'Salary': [50000, 75000, 120000, 55000, 95000]
}
df = pd.DataFrame(job_data)

print("=" * 60)
print("ENCODING CATEGORICAL DATA")
print("=" * 60)
print("\nOriginal Data:")
print(df)

# ===============================
# METHOD 1: Label Encoding
# ===============================
print("\n" + "=" * 60)
print("1. LABEL ENCODING")
print("=" * 60)

# For ORDINAL data (has a natural order): Junior < Mid < Senior.
# An explicit mapping keeps the ordering under our control.
experience_order = {'Junior': 0, 'Mid': 1, 'Senior': 2}
df['Experience_Encoded'] = df['Experience_Level'].map(experience_order)
print("\nExperience Level (Ordinal - HAS order):")
print(df[['Experience_Level', 'Experience_Encoded']])

# sklearn LabelEncoder assigns integer codes alphabetically -- fine for
# targets, misleading for nominal features (see warning below).
le = LabelEncoder()
df['City_Label'] = le.fit_transform(df['City'])
print("\nCity (using LabelEncoder):")
print(df[['City', 'City_Label']])
print(f"Classes: {le.classes_}")  # Shows the index -> label mapping

print("\nWARNING: City has NO order, but Label Encoding gives:")
print("   Bangalore=0, Delhi=1, Mumbai=2")
print("   Model might think: Mumbai > Delhi > Bangalore (WRONG!)")

# ===============================
# METHOD 2: One-Hot Encoding
# ===============================
print("\n" + "=" * 60)
print("2. ONE-HOT ENCODING (for Nominal data)")
print("=" * 60)

# Using pandas get_dummies (easiest for quick exploration)
df_onehot = pd.get_dummies(df[['City', 'Salary']], columns=['City'], drop_first=False)
print("\nOne-Hot Encoded (City):")
print(df_onehot)

# drop_first=True removes one dummy column to avoid the dummy-variable trap
# (perfect multicollinearity in linear models).
df_onehot_dropped = pd.get_dummies(df[['City', 'Salary']], columns=['City'], drop_first=True)
print("\nWith drop_first=True (avoids multicollinearity):")
print(df_onehot_dropped)

# sklearn OneHotEncoder is preferred inside ML pipelines: it remembers the
# categories seen at fit() and applies them consistently at transform() time.
print("\n" + "=" * 60)
print("SKLEARN OneHotEncoder (for ML pipelines)")
print("=" * 60)
ohe = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids the trap
city_encoded = ohe.fit_transform(df[['City']])
print(f"\nShape: {city_encoded.shape}")
print(f"Feature names: {ohe.get_feature_names_out(['City'])}")
print(f"\nEncoded values:\n{city_encoded}")

# Summary: When to use what
print("\n" + "=" * 60)
print("SUMMARY: WHEN TO USE WHAT")
print("=" * 60)
print("""
| Feature Type        | Encoding Method   |
|---------------------|-------------------|
| Ordinal (has order) | Label/Ordinal     |
| Nominal (no order)  | One-Hot           |
| High cardinality    | Target Encoding   |
| Tree-based models   | Either works      |
| Linear models       | One-Hot ONLY      |
""")

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Sample data: employee features on wildly different numeric scales
np.random.seed(42)
employee_df = pd.DataFrame({
    'Age': np.random.randint(22, 60, 10),
    'Experience_Years': np.random.randint(0, 35, 10),
    'Salary': np.random.randint(300000, 5000000, 10),  # 3L to 50L
    'Projects_Completed': np.random.randint(1, 50, 10)
})

print("=" * 60)
print("FEATURE SCALING")
print("=" * 60)
print("\nOriginal Data:")
print(employee_df.describe().round(0))

# Look at the scale difference!
print("\nPROBLEM: Different Scales!")
print(f"   Age range: {employee_df['Age'].min()} - {employee_df['Age'].max()}")
print(f"   Salary range: ₹{employee_df['Salary'].min():,} - ₹{employee_df['Salary'].max():,}")
print("   Salary is 100,000x larger than Age!")

# ===============================
# METHOD 1: Normalization (Min-Max)
# ===============================
print("\n" + "=" * 60)
print("1. NORMALIZATION (Min-Max Scaling)")
print("=" * 60)
scaler_minmax = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(employee_df),
    columns=employee_df.columns
)
print("\nNormalized Data (0 to 1):")
print(df_normalized.round(3))
print("\nFormula: (x - min) / (max - min)")
print("   All values now between 0 and 1")

# ===============================
# METHOD 2: Standardization (Z-Score)
# ===============================
print("\n" + "=" * 60)
print("2. STANDARDIZATION (Z-Score Scaling)")
print("=" * 60)
scaler_std = StandardScaler()
df_standardized = pd.DataFrame(
    scaler_std.fit_transform(employee_df),
    columns=employee_df.columns
)
print("\nStandardized Data (mean=0, std=1):")
print(df_standardized.round(3))
print("\nFormula: (x - mean) / std")
print(f"   Mean of each column: {df_standardized.mean().round(5).tolist()}")
print(f"   Std of each column: {df_standardized.std().round(2).tolist()}")

# ===============================
# COMPARISON: Which to Choose?
# ===============================
print("\n" + "=" * 60)
print("COMPARISON: Normalization vs Standardization")
print("=" * 60)

# Inject an extreme outlier to see how each scaler reacts
employee_with_outlier = employee_df.copy()
employee_with_outlier.loc[0, 'Salary'] = 50000000  # 5 Crore outlier!
# Normalize
normalized_outlier = MinMaxScaler().fit_transform(employee_with_outlier)
# Standardize
standardized_outlier = StandardScaler().fit_transform(employee_with_outlier)
print("\nWith an OUTLIER (₹5 Crore salary):")
print("   Normalization - All other salaries compressed to < 0.1")
print("   Standardization - Other values less affected")
print("\nStandardization is more ROBUST to outliers!")

# Summary table
print("\n" + "=" * 60)
print("WHEN TO USE WHICH?")
print("=" * 60)
print("""
| Scenario                      | Method          |
|-------------------------------|-----------------|
| Unknown distribution          | Standardization |
| Need bounded values (0-1)     | Normalization   |
| Has outliers                  | Standardization |
| Neural network (ReLU/Sigmoid) | Normalization   |
| SVM, Logistic Regression      | Standardization |
| Image pixel values            | Normalization   |
""")

# Important: fit the scaler on TRAIN only, then transform both train and test
# with that same fitted scaler -- fitting on test data leaks information.
print("\nCRITICAL: Avoid Data Leakage!")
print("   scaler.fit(X_train) -> scaler.transform(X_train)")
print("   scaler.transform(X_test) -> Use SAME scaler, don't fit again!")

# Next step: now that the data is clean, learn how to split it properly for
# training and evaluation.