Loading Code...
Learn to explore, visualize, and understand your data — the foundation of every successful ML project.
5 Topics · Python · Code Examples
🍽️ Indian Context
All examples use Zomato restaurant data — analyzing ratings, prices, and cuisines across Indian cities!
import pandas as pd
import numpy as np

# =========================================
# STEP 1: LOAD AND OVERVIEW THE DATA
# =========================================
# Sample Zomato-like restaurant data (Indian cities).
# 10 rows x 6 columns; no missing values by construction.
data = {
    'Restaurant': ['Saravana Bhavan', 'Truffles', 'Barbeque Nation', 'Haldiram', 'Dominos',
                   'MTR', 'Cafe Coffee Day', 'McDonald India', 'Subway', 'Pizza Hut'],
    'City': ['Chennai', 'Bangalore', 'Mumbai', 'Delhi', 'Hyderabad',
             'Bangalore', 'Delhi', 'Mumbai', 'Chennai', 'Bangalore'],
    'Cuisine': ['South Indian', 'Continental', 'North Indian', 'North Indian', 'Italian',
                'South Indian', 'Cafe', 'Fast Food', 'Fast Food', 'Italian'],
    'Rating': [4.5, 4.7, 4.2, 4.3, 3.9, 4.6, 3.8, 3.7, 3.5, 4.0],
    'Price_for_Two': [300, 800, 1500, 400, 600, 350, 400, 500, 450, 700],
    'Votes': [12500, 8900, 6700, 9800, 5400, 7600, 4300, 6100, 3200, 5600]
}
df = pd.DataFrame(data)

# =========================================
# STEP 2: BASIC DATA OVERVIEW
# =========================================
print("=" * 50)
print("📊 DATASET OVERVIEW")
print("=" * 50)

# Shape: How many rows and columns?
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Data types (object for text columns, float64/int64 for numeric ones)
print("\nData Types:")
print(df.dtypes)

# First few rows
print("\nFirst 5 rows:")
print(df.head())

# Missing values check (CRITICAL!) — per-column NaN counts plus the grand total
print("\nMissing Values:")
print(df.isnull().sum())
print(f"Total Missing: {df.isnull().sum().sum()}")

# Memory usage — deep=True also counts the Python strings inside object columns
print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

import pandas as pd
import numpy as np
from statistics import mode, multimode  # moved to top (PEP 8); multimode handles ties

# Zomato-style rating data with an outlier
ratings = [4.2, 4.5, 4.3, 4.1, 4.4, 4.0, 4.2, 1.0, 4.3, 4.5]  # Note: 1.0 is an outlier

print("=" * 50)
print("📊 MEASURES OF CENTRAL TENDENCY")
print("=" * 50)

# MEAN (Average) — sensitive to outliers
mean_rating = np.mean(ratings)
print(f"Mean Rating: {mean_rating:.2f}")

# MEDIAN (Middle value) — robust to outliers
median_rating = np.median(ratings)
print(f"Median Rating: {median_rating:.2f}")

# MODE (Most frequent) — with ties, statistics.mode returns the first mode encountered
mode_rating = mode(ratings)
print(f"Mode Rating: {mode_rating}")

# See the difference when outlier is present!
print(f"\n⚠️ Notice: Mean ({mean_rating:.2f}) is pulled DOWN by the outlier 1.0")
print(f"✅ Median ({median_rating:.2f}) is more representative of typical ratings")

# PRACTICAL EXAMPLE: Zomato Price Analysis
print("\n" + "=" * 50)
print("🍽️ ZOMATO PRICE ANALYSIS (Price for Two)")
print("=" * 50)
prices = {
    'Restaurant': ['Saravana Bhavan', 'Truffles', 'Taj Restaurant', 'Haldiram', 'Local Dhaba'],
    'Price_for_Two': [300, 800, 5000, 400, 150]  # Taj is expensive outlier
}
df = pd.DataFrame(prices)
print(f"\nPrices: {df['Price_for_Two'].tolist()}")
print(f"Mean Price: ₹{df['Price_for_Two'].mean():.0f}")
print(f"Median Price: ₹{df['Price_for_Two'].median():.0f}")
print("\n💡 Insight: Taj Restaurant (₹5000) skews the mean")
print(" Mean suggests avg ₹1330, but most places are under ₹800!")
print(" Median (₹400) better represents 'typical' prices")

# Using describe() for quick statistics (count, mean, std, quartiles, min/max)
print("\n📋 COMPLETE STATISTICS (.describe()):")
print(df['Price_for_Two'].describe())

import pandas as pd
import numpy as np

print("=" * 50)
print("📊 MEASURES OF DISPERSION")
print("=" * 50)

# Restaurant ratings data
ratings_consistent = [4.0, 4.1, 4.0, 3.9, 4.2, 4.0, 4.1, 4.0]  # Low variance
ratings_varied = [2.0, 5.0, 3.0, 4.5, 1.5, 4.8, 3.2, 4.0]      # High variance

# VARIANCE & STANDARD DEVIATION
# NOTE: np.var / np.std default to ddof=0 (population, divide by N)
print("\n📈 CONSISTENT RESTAURANT RATINGS:")
print(f" Data: {ratings_consistent}")
print(f" Mean: {np.mean(ratings_consistent):.2f}")
print(f" Variance: {np.var(ratings_consistent):.4f}")
print(f" Std Dev: {np.std(ratings_consistent):.4f}")
print(f" Range: {max(ratings_consistent) - min(ratings_consistent):.1f}")

print("\n📉 VARIED RESTAURANT RATINGS:")
print(f" Data: {ratings_varied}")
print(f" Mean: {np.mean(ratings_varied):.2f}")
print(f" Variance: {np.var(ratings_varied):.4f}")
print(f" Std Dev: {np.std(ratings_varied):.4f}")
print(f" Range: {max(ratings_varied) - min(ratings_varied):.1f}")

# FIX: the original claimed "Same average (~3.5) ... 15x more variance", but the
# means are 4.04 vs 3.50 and the variance ratio is ~200x (std dev ratio ~14x).
# Compute the comparison from the data so the text can never drift out of sync.
var_ratio = np.var(ratings_varied) / np.var(ratings_consistent)
print(f"\n💡 Insight: Means are {np.mean(ratings_consistent):.2f} vs {np.mean(ratings_varied):.2f}, "
      f"but the varied set has ~{var_ratio:.0f}x the variance!")
print(" High variance = less predictable ratings")

# PRACTICAL: Detecting Outliers with Standard Deviation
print("\n" + "=" * 50)
print("🔍 OUTLIER DETECTION USING STD DEV")
print("=" * 50)
delivery_times = [25, 30, 28, 32, 27, 29, 31, 120, 26, 28]  # 120 is outlier
print(f"Delivery times (mins): {delivery_times}")
mean_time = np.mean(delivery_times)
std_time = np.std(delivery_times)
print(f"\nMean: {mean_time:.1f} mins")
print(f"Std Dev: {std_time:.1f} mins")
print(f"\nNormal Range (mean ± 2*std): {mean_time - 2*std_time:.1f} to {mean_time + 2*std_time:.1f}")

# Find outliers (beyond 2 std deviations from the mean)
outliers = [x for x in delivery_times if abs(x - mean_time) > 2 * std_time]
print(f"\n🚨 Outliers detected: {outliers}")
print(" 120 mins is clearly abnormal (maybe order was lost!)")

# COEFFICIENT OF VARIATION (for comparing spread across different scales/units)
print("\n" + "=" * 50)
print("📊 COEFFICIENT OF VARIATION (CV)")
print("=" * 50)
# Compare variability of different metrics: CV = std / mean, as a percentage
ratings = [4.2, 4.5, 4.3, 4.1, 4.4]
prices = [300, 800, 1500, 400, 600]
cv_ratings = (np.std(ratings) / np.mean(ratings)) * 100
cv_prices = (np.std(prices) / np.mean(prices)) * 100
print(f"Ratings CV: {cv_ratings:.1f}%")
print(f"Prices CV: {cv_prices:.1f}%")
print(f"\n💡 Prices are more variable (CV={cv_prices:.1f}%) than ratings (CV={cv_ratings:.1f}%)")

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): not used in this snippet; kept for parity with the others
from scipy import stats  # moved up from mid-script (PEP 8: imports at top)

# Create realistic Zomato data (seeded so results are reproducible)
np.random.seed(42)
n = 200
zomato_df = pd.DataFrame({
    'Rating': np.random.normal(3.8, 0.5, n).clip(1, 5),                # Normal distribution
    'Price_for_Two': np.random.exponential(500, n) + 200,              # Right skewed (most cheap, few expensive)
    'Delivery_Time': np.concatenate([np.random.normal(30, 5, 180),     # Normal deliveries
                                     np.random.normal(90, 10, 20)]),   # Some late ones (outliers)
    'City': np.random.choice(['Delhi', 'Mumbai', 'Bangalore', 'Chennai'], n)
})

print("=" * 50)
print("📊 DISTRIBUTION ANALYSIS")
print("=" * 50)

# HISTOGRAM: Rating Distribution (mean/median overlaid to show symmetry)
print("\n1️⃣ HISTOGRAM - Rating Distribution")
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.hist(zomato_df['Rating'], bins=20, color='#667eea', edgecolor='white', alpha=0.8)
plt.axvline(zomato_df['Rating'].mean(), color='red', linestyle='--', label=f'Mean: {zomato_df["Rating"].mean():.2f}')
plt.axvline(zomato_df['Rating'].median(), color='green', linestyle='--', label=f'Median: {zomato_df["Rating"].median():.2f}')
plt.title('Rating Distribution (Normal)', fontweight='bold')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.legend(fontsize=8)

# HISTOGRAM: Price Distribution (Right Skewed — mean sits right of median)
plt.subplot(1, 3, 2)
plt.hist(zomato_df['Price_for_Two'], bins=25, color='#f6ad55', edgecolor='white', alpha=0.8)
plt.axvline(zomato_df['Price_for_Two'].mean(), color='red', linestyle='--', label=f'Mean: {zomato_df["Price_for_Two"].mean():.0f}')
plt.axvline(zomato_df['Price_for_Two'].median(), color='green', linestyle='--', label=f'Median: {zomato_df["Price_for_Two"].median():.0f}')
plt.title('Price Distribution (Right Skewed)', fontweight='bold')
plt.xlabel('Price for Two (₹)')
plt.ylabel('Frequency')
plt.legend(fontsize=8)

# BOX PLOT: Delivery Time — the injected late deliveries show up as outlier points
plt.subplot(1, 3, 3)
plt.boxplot(zomato_df['Delivery_Time'], vert=True)
plt.title('Delivery Time (Box Plot)', fontweight='bold')
plt.ylabel('Minutes')
plt.tight_layout()
plt.show()

# INTERPRETING THE DISTRIBUTION
print("\n📈 INTERPRETATION:")
print(f"\nRatings: Mean={zomato_df['Rating'].mean():.2f}, Median={zomato_df['Rating'].median():.2f}")
print(" → Mean ≈ Median = Symmetric (Normal) distribution")
print(f"\nPrices: Mean={zomato_df['Price_for_Two'].mean():.0f}, Median={zomato_df['Price_for_Two'].median():.0f}")
print(" → Mean > Median = Right Skewed (few expensive restaurants)")

# SKEWNESS (quantitative measure: 0 = symmetric, >0 = right tail, <0 = left tail)
print("\n📐 Skewness Values:")
print(f" Ratings: {stats.skew(zomato_df['Rating']):.3f} (close to 0 = symmetric)")
print(f" Prices: {stats.skew(zomato_df['Price_for_Two']):.3f} (positive = right skewed)")

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create Zomato-like data with known correlations (seeded for reproducibility)
np.random.seed(42)
n = 100
# Price and Rating might be correlated (expensive = better quality)
price = np.random.uniform(200, 2000, n)
rating = 3.0 + 0.7 * (price / 1000) + np.random.normal(0, 0.3, n)
rating = np.clip(rating, 1, 5)
# Votes correlated with rating (good restaurants get more votes)
votes = 1000 + 2000 * (rating - 3) + np.random.normal(0, 500, n)
votes = np.clip(votes, 100, 10000)
# Delivery time NOT correlated with rating
delivery_time = np.random.normal(35, 10, n)

zomato_df = pd.DataFrame({
    'Price_for_Two': price,
    'Rating': rating,
    'Votes': votes,
    'Delivery_Time': delivery_time
})

print("=" * 50)
print("📊 CORRELATION ANALYSIS")
print("=" * 50)

# Calculate Pearson correlation matrix (pandas default method)
correlation_matrix = zomato_df.corr()
print("\n📈 CORRELATION MATRIX:")
print(correlation_matrix.round(3))

# Interpretation
print("\n💡 INTERPRETATION:")
print(f" Price vs Rating: r = {correlation_matrix.loc['Price_for_Two', 'Rating']:.3f}")
print(" → Moderate positive: Expensive restaurants tend to have higher ratings")
print(f"\n Rating vs Votes: r = {correlation_matrix.loc['Rating', 'Votes']:.3f}")
print(" → Strong positive: Higher rated restaurants get more votes")
print(f"\n Delivery Time vs Rating: r = {correlation_matrix.loc['Delivery_Time', 'Rating']:.3f}")
print(" → Near zero: Delivery time doesn't affect rating")

# HEATMAP VISUALIZATION
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix,
            annot=True,
            cmap='coolwarm',
            center=0,
            fmt='.2f',
            square=True,
            linewidths=0.5)
plt.title('Correlation Heatmap - Zomato Features', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()

# SCATTER PLOT for strong correlation vs no correlation
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.scatter(zomato_df['Price_for_Two'], zomato_df['Rating'], alpha=0.6, c='#667eea')
plt.xlabel('Price for Two (₹)')
plt.ylabel('Rating')
plt.title(f'Price vs Rating (r = {correlation_matrix.loc["Price_for_Two", "Rating"]:.2f})', fontweight='bold')
plt.subplot(1, 2, 2)
plt.scatter(zomato_df['Delivery_Time'], zomato_df['Rating'], alpha=0.6, c='#f6ad55')
plt.xlabel('Delivery Time (mins)')
plt.ylabel('Rating')
plt.title(f'Delivery vs Rating (r = {correlation_matrix.loc["Delivery_Time", "Rating"]:.2f})', fontweight='bold')
plt.tight_layout()
plt.show()

# ML INSIGHT: Feature Selection
print("\n" + "=" * 50)
print("🎯 ML INSIGHT: FEATURE SELECTION")
print("=" * 50)
# FIX: report the actually-computed correlations rather than the hard-coded
# r=0.70 / r=0.85 in the original, which could disagree with the matrix
# printed above if the simulation parameters change.
r_price = correlation_matrix.loc['Price_for_Two', 'Rating']
r_votes = correlation_matrix.loc['Votes', 'Rating']
print("\nIf predicting RATING, good features are:")
print(f" ✅ Price_for_Two (r={r_price:.2f}) - strong correlation")
print(f" ✅ Votes (r={r_votes:.2f}) - but may cause data leakage!")
print(" ❌ Delivery_Time (r≈0) - no predictive power")

Now that you can explore data, learn how to clean and preprocess it for ML models: