Loading Code...
Transform messy real-world data into clean, model-ready format — missing values, outliers, encoding, and scaling.
5 Topics
Python
+ sklearn
Indian Context
Examples use Ola/Uber ride data — handling missing pickup times, outlier fares, payment modes!
import pandas as pd
import numpy as np

# Simulate Ola/Uber ride data with REAL problems: missing values,
# an impossible outlier distance, and inconsistent category labels.
ride_data = {
    'Ride_ID': range(1, 11),
    'Pickup_Time': ['08:30', '09:15', None, '12:45', '14:00',
                    '18:30', None, '20:00', '22:15', '23:45'],
    'Distance_KM': [5.2, 12.5, 8.0, 3.5, 50000, 7.0, 15.2, 9.8, 4.5, 6.0],  # 50000 is outlier!
    'Fare': [85, 195, 125, 60, None, 110, 230, 155, 75, 95],
    'Payment': ['Card', 'Cash', 'UPI', 'cash', 'CARD', 'upi', 'Cash', 'Card', 'Upi', 'cash'],
    'Rating': [4.5, 4.0, None, 3.5, 4.8, 4.2, 3.0, 4.7, None, 4.3]
}
df = pd.DataFrame(ride_data)

print("=" * 60)
print("RAW OLA/UBER RIDE DATA (with problems)")
print("=" * 60)
print(df)

# Identify all the problems
print("\n" + "=" * 60)
print("DATA QUALITY ISSUES DETECTED")
print("=" * 60)

# 1. Missing Values
print("\nMissing Values:")
print(df.isnull().sum())

# 2. Inconsistent Categories ('Cash'/'cash'/'CARD'... should be one label each)
print("\nInconsistent Payment Types:")
print(df['Payment'].unique())

# 3. Outliers
print("\nDistance Statistics (check for outliers):")
print(f"   Max Distance: {df['Distance_KM'].max()} km")
print(f"   Mean Distance: {df['Distance_KM'].mean():.1f} km")
print("   Suspicious: 50000 km is impossible for a city ride!")

# 4. Summary: all missing cells + 1 outlier distance + 4 misspelled payment labels
total_issues = df.isnull().sum().sum() + 1 + 4  # missing + outlier + format
print(f"\nTotal Issues: {total_issues} problems to fix before ML")

import pandas as pd
import numpy as np

# Build a synthetic customer dataset with missing values injected into
# numerical (Age, Income) and categorical (City) columns.
np.random.seed(42)
n = 100
customer_df = pd.DataFrame({
    'Customer_ID': range(1, n + 1),
    'Age': np.random.choice([25, 30, 35, 40, 45, np.nan], n, p=[0.3, 0.25, 0.2, 0.1, 0.1, 0.05]),
    'Income': np.random.choice([30000, 50000, 80000, np.nan], n, p=[0.4, 0.3, 0.2, 0.1]),
    'City': np.random.choice(['Mumbai', 'Delhi', 'Bangalore', None], n, p=[0.35, 0.3, 0.25, 0.1]),
    'Purchase_Amount': np.random.uniform(100, 5000, n)
})

print("=" * 60)
print("HANDLING MISSING VALUES")
print("=" * 60)

# Step 1: Detect Missing Values
print("\n1. MISSING VALUE DETECTION:")
print("\nMissing per column:")
print(customer_df.isnull().sum())
print("\nPercentage missing:")
print((customer_df.isnull().sum() / len(customer_df) * 100).round(1))

# Step 2: Fill numerical Age with the MEDIAN (robust to skewed values).
# NOTE: assign the result back instead of calling fillna(..., inplace=True)
# on a column selection -- chained-assignment inplace is deprecated and is a
# silent no-op under pandas Copy-on-Write (default in pandas 3.0).
print("\n2. FILL NUMERICAL (Age) WITH MEDIAN:")
age_median = customer_df['Age'].median()
print(f"   Age Median: {age_median}")
customer_df['Age'] = customer_df['Age'].fillna(age_median)
print(f"   Missing Ages after fix: {customer_df['Age'].isnull().sum()}")

# Step 3: Fill numerical Income with the MEAN.
print("\n3. FILL NUMERICAL (Income) WITH MEAN:")
income_mean = customer_df['Income'].mean()
print(f"   Income Mean: ₹{income_mean:.0f}")
customer_df['Income'] = customer_df['Income'].fillna(income_mean)
print(f"   Missing Incomes after fix: {customer_df['Income'].isnull().sum()}")

# Step 4: Fill categorical City with the MODE (most frequent value).
print("\n4. FILL CATEGORICAL (City) WITH MODE:")
city_mode = customer_df['City'].mode()[0]
print(f"   Most common city: {city_mode}")
customer_df['City'] = customer_df['City'].fillna(city_mode)
print(f"   Missing Cities after fix: {customer_df['City'].isnull().sum()}")

# Final Check: no missing values should remain in any column.
print("\nAFTER HANDLING MISSING VALUES:")
print(customer_df.isnull().sum())
print(f"\nTotal missing: {customer_df.isnull().sum().sum()}")
# Advanced: sklearn's SimpleImputer performs the same imputation in a
# reusable, pipeline-friendly way (fit on train, transform on train AND test).
print("\n" + "=" * 60)
print("ADVANCED: Using sklearn's SimpleImputer")
print("=" * 60)
from sklearn.impute import SimpleImputer

# Imputer for numerical columns (median is robust to outliers; 'mean' also works)
num_imputer = SimpleImputer(strategy='median')
# Imputer for categorical columns (fills with the most frequent value, i.e. the mode)
cat_imputer = SimpleImputer(strategy='most_frequent')

print("\nSimpleImputer allows consistent imputation in ML pipelines")
print("   Usage: imputer.fit_transform(data)")

import pandas as pd
import numpy as np

# Sample Ola/Uber fare data: 95 normal fares plus 5 injected outliers.
np.random.seed(42)
fares = np.concatenate([
    np.random.normal(150, 30, 95),          # Normal fares (about ₹120-180)
    np.array([500, 800, 15, 1200, 2000])    # Outliers!
])
df = pd.DataFrame({'Fare': fares})

print("=" * 60)
print("OUTLIER DETECTION METHODS")
print("=" * 60)

# ===============================
# METHOD 1: IQR (Interquartile Range)
# ===============================
print("\n1. IQR METHOD (Interquartile Range)")
print("-" * 40)
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
# Tukey fences: anything beyond 1.5 * IQR outside the quartiles is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"Q1 (25th percentile): ₹{Q1:.2f}")
print(f"Q3 (75th percentile): ₹{Q3:.2f}")
print(f"IQR: ₹{IQR:.2f}")
print(f"\nLower Bound: ₹{lower_bound:.2f}")
print(f"Upper Bound: ₹{upper_bound:.2f}")

# Detect outliers
iqr_outliers = df[(df['Fare'] < lower_bound) | (df['Fare'] > upper_bound)]
print(f"\nOutliers detected (IQR): {len(iqr_outliers)}")
print(iqr_outliers['Fare'].values)

# ===============================
# METHOD 2: Z-Score
# ===============================
print("\n2. Z-SCORE METHOD")
print("-" * 40)
from scipy import stats

# Absolute Z-scores: distance from the mean in standard-deviation units.
z_scores = np.abs(stats.zscore(df['Fare']))
df['Z_Score'] = z_scores
# Outliers: Z-score > 2 (use 3 for a stricter rule)
threshold = 2
zscore_outliers = df[df['Z_Score'] > threshold]
print(f"Mean: ₹{df['Fare'].mean():.2f}")
print(f"Std Dev: ₹{df['Fare'].std():.2f}")
print(f"Threshold: |Z| > {threshold}")
print(f"\nOutliers detected (Z-Score): {len(zscore_outliers)}")
print(zscore_outliers[['Fare', 'Z_Score']].values)

# ===============================
# TREATMENT: Remove vs Cap
# ===============================
print("\n" + "=" * 60)
print("OUTLIER TREATMENT")
print("=" * 60)

# Option 1: Remove outliers (loses rows, shrinks the dataset)
df_removed = df[(df['Fare'] >= lower_bound) & (df['Fare'] <= upper_bound)]
print("\nOption 1 - REMOVE:")
print(f"   Before: {len(df)} rows")
print(f"   After:  {len(df_removed)} rows")

# Option 2: Cap outliers at the IQR fences (Winsorizing) -- keeps all rows
df_capped = df.copy()
df_capped['Fare'] = df_capped['Fare'].clip(lower=lower_bound, upper=upper_bound)
print("\nOption 2 - CAP (Winsorize):")
print(f"   Original max: ₹{df['Fare'].max():.2f}")
print(f"   Capped max: ₹{df_capped['Fare'].max():.2f}")

# Compare the effect of each treatment on the mean fare
print("\nImpact on Mean:")
print(f"   Original: ₹{df['Fare'].mean():.2f}")
print(f"   After Remove: ₹{df_removed['Fare'].mean():.2f}")
print(f"   After Cap: ₹{df_capped['Fare'].mean():.2f}")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Sample data with categorical features
# (nominal: City, Education; ordinal: Experience_Level)
job_data = {
    'Name': ['Rahul', 'Priya', 'Amit', 'Sneha', 'Vikram'],
    'City': ['Mumbai', 'Delhi', 'Bangalore', 'Mumbai', 'Delhi'],
    'Education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],
    'Experience_Level': ['Junior', 'Mid', 'Senior', 'Junior', 'Senior'],
    'Salary': [50000, 75000, 120000, 55000, 95000]
}
df = pd.DataFrame(job_data)

print("=" * 60)
print("ENCODING CATEGORICAL DATA")
print("=" * 60)
print("\nOriginal Data:")
print(df)

# ===============================
# METHOD 1: Label Encoding
# ===============================
print("\n" + "=" * 60)
print("1. LABEL ENCODING")
print("=" * 60)

# For ORDINAL data (has a natural order): Junior < Mid < Senior.
# An explicit mapping keeps the ordering under our control.
experience_order = {'Junior': 0, 'Mid': 1, 'Senior': 2}
df['Experience_Encoded'] = df['Experience_Level'].map(experience_order)
print("\nExperience Level (Ordinal - HAS order):")
print(df[['Experience_Level', 'Experience_Encoded']])

# sklearn LabelEncoder assigns integer codes alphabetically -- fine for
# targets, misleading for nominal features (see warning below).
le = LabelEncoder()
df['City_Label'] = le.fit_transform(df['City'])
print("\nCity (using LabelEncoder):")
print(df[['City', 'City_Label']])
print(f"Classes: {le.classes_}")  # Shows the index -> label mapping

print("\nWARNING: City has NO order, but Label Encoding gives:")
print("   Bangalore=0, Delhi=1, Mumbai=2")
print("   Model might think: Mumbai > Delhi > Bangalore (WRONG!)")

# ===============================
# METHOD 2: One-Hot Encoding
# ===============================
print("\n" + "=" * 60)
print("2. ONE-HOT ENCODING (for Nominal data)")
print("=" * 60)

# Using pandas get_dummies (easiest for quick exploration)
df_onehot = pd.get_dummies(df[['City', 'Salary']], columns=['City'], drop_first=False)
print("\nOne-Hot Encoded (City):")
print(df_onehot)

# drop_first=True removes one dummy column to avoid the dummy-variable trap
# (perfect multicollinearity in linear models).
df_onehot_dropped = pd.get_dummies(df[['City', 'Salary']], columns=['City'], drop_first=True)
print("\nWith drop_first=True (avoids multicollinearity):")
print(df_onehot_dropped)

# sklearn OneHotEncoder is preferred inside ML pipelines: it remembers the
# categories seen at fit() and applies them consistently at transform() time.
print("\n" + "=" * 60)
print("SKLEARN OneHotEncoder (for ML pipelines)")
print("=" * 60)
ohe = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids the trap
city_encoded = ohe.fit_transform(df[['City']])
print(f"\nShape: {city_encoded.shape}")
print(f"Feature names: {ohe.get_feature_names_out(['City'])}")
print(f"\nEncoded values:\n{city_encoded}")

# Summary: When to use what
print("\n" + "=" * 60)
print("SUMMARY: WHEN TO USE WHAT")
print("=" * 60)
print("""
| Feature Type        | Encoding Method   |
|---------------------|-------------------|
| Ordinal (has order) | Label/Ordinal     |
| Nominal (no order)  | One-Hot           |
| High cardinality    | Target Encoding   |
| Tree-based models   | Either works      |
| Linear models       | One-Hot ONLY      |
""")

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Sample data: employee features on wildly different numeric scales
np.random.seed(42)
employee_df = pd.DataFrame({
    'Age': np.random.randint(22, 60, 10),
    'Experience_Years': np.random.randint(0, 35, 10),
    'Salary': np.random.randint(300000, 5000000, 10),  # 3L to 50L
    'Projects_Completed': np.random.randint(1, 50, 10)
})

print("=" * 60)
print("FEATURE SCALING")
print("=" * 60)
print("\nOriginal Data:")
print(employee_df.describe().round(0))

# Look at the scale difference!
print("\nPROBLEM: Different Scales!")
print(f"   Age range: {employee_df['Age'].min()} - {employee_df['Age'].max()}")
print(f"   Salary range: ₹{employee_df['Salary'].min():,} - ₹{employee_df['Salary'].max():,}")
print("   Salary is 100,000x larger than Age!")

# ===============================
# METHOD 1: Normalization (Min-Max)
# ===============================
print("\n" + "=" * 60)
print("1. NORMALIZATION (Min-Max Scaling)")
print("=" * 60)
scaler_minmax = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(employee_df),
    columns=employee_df.columns
)
print("\nNormalized Data (0 to 1):")
print(df_normalized.round(3))
print("\nFormula: (x - min) / (max - min)")
print("   All values now between 0 and 1")

# ===============================
# METHOD 2: Standardization (Z-Score)
# ===============================
print("\n" + "=" * 60)
print("2. STANDARDIZATION (Z-Score Scaling)")
print("=" * 60)
scaler_std = StandardScaler()
df_standardized = pd.DataFrame(
    scaler_std.fit_transform(employee_df),
    columns=employee_df.columns
)
print("\nStandardized Data (mean=0, std=1):")
print(df_standardized.round(3))
print("\nFormula: (x - mean) / std")
print(f"   Mean of each column: {df_standardized.mean().round(5).tolist()}")
print(f"   Std of each column: {df_standardized.std().round(2).tolist()}")

# ===============================
# COMPARISON: Which to Choose?
# ===============================
print("\n" + "=" * 60)
print("COMPARISON: Normalization vs Standardization")
print("=" * 60)

# Inject an extreme outlier to see how each scaler reacts
employee_with_outlier = employee_df.copy()
employee_with_outlier.loc[0, 'Salary'] = 50000000  # 5 Crore outlier!
# Normalize
normalized_outlier = MinMaxScaler().fit_transform(employee_with_outlier)
# Standardize
standardized_outlier = StandardScaler().fit_transform(employee_with_outlier)
print("\nWith an OUTLIER (₹5 Crore salary):")
print("   Normalization - All other salaries compressed to < 0.1")
print("   Standardization - Other values less affected")
print("\nStandardization is more ROBUST to outliers!")

# Summary table
print("\n" + "=" * 60)
print("WHEN TO USE WHICH?")
print("=" * 60)
print("""
| Scenario                      | Method          |
|-------------------------------|-----------------|
| Unknown distribution          | Standardization |
| Need bounded values (0-1)     | Normalization   |
| Has outliers                  | Standardization |
| Neural network (ReLU/Sigmoid) | Normalization   |
| SVM, Logistic Regression      | Standardization |
| Image pixel values            | Normalization   |
""")

# Important: fit the scaler on TRAIN only, then transform both train and test
# with that same fitted scaler -- fitting on test data leaks information.
print("\nCRITICAL: Avoid Data Leakage!")
print("   scaler.fit(X_train) -> scaler.transform(X_train)")
print("   scaler.transform(X_test) -> Use SAME scaler, don't fit again!")

# Next step: now that the data is clean, learn how to split it properly for
# training and evaluation.