Loading Code...
Learn to explore, visualize, and understand your data — the foundation of every successful ML project.
5 Topics · Python · Code Examples
🍽️ Indian Context
All examples use Zomato restaurant data — analyzing ratings, prices, and cuisines across Indian cities!
import pandas as pd
import numpy as np

# =========================================
# STEP 1: LOAD AND OVERVIEW THE DATA
# =========================================
# Sample Zomato-like restaurant data (Indian cities).
# 10 rows x 6 columns; no missing values by construction.
data = {
    'Restaurant': ['Saravana Bhavan', 'Truffles', 'Barbeque Nation', 'Haldiram', 'Dominos',
                   'MTR', 'Cafe Coffee Day', 'McDonald India', 'Subway', 'Pizza Hut'],
    'City': ['Chennai', 'Bangalore', 'Mumbai', 'Delhi', 'Hyderabad',
             'Bangalore', 'Delhi', 'Mumbai', 'Chennai', 'Bangalore'],
    'Cuisine': ['South Indian', 'Continental', 'North Indian', 'North Indian', 'Italian',
                'South Indian', 'Cafe', 'Fast Food', 'Fast Food', 'Italian'],
    'Rating': [4.5, 4.7, 4.2, 4.3, 3.9, 4.6, 3.8, 3.7, 3.5, 4.0],
    'Price_for_Two': [300, 800, 1500, 400, 600, 350, 400, 500, 450, 700],
    'Votes': [12500, 8900, 6700, 9800, 5400, 7600, 4300, 6100, 3200, 5600]
}
df = pd.DataFrame(data)

# =========================================
# STEP 2: BASIC DATA OVERVIEW
# =========================================
print("=" * 50)
print("📊 DATASET OVERVIEW")
print("=" * 50)

# Shape: How many rows and columns?
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Data types (object for text columns, float64/int64 for numeric ones)
print("\nData Types:")
print(df.dtypes)

# First few rows
print("\nFirst 5 rows:")
print(df.head())

# Missing values check (CRITICAL!) — per-column NaN counts plus the grand total
print("\nMissing Values:")
print(df.isnull().sum())
print(f"Total Missing: {df.isnull().sum().sum()}")

# Memory usage — deep=True also counts the Python strings inside object columns
print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

import pandas as pd
import numpy as np
from statistics import mode, multimode  # moved to top (PEP 8); multimode handles ties

# Zomato-style rating data with an outlier
ratings = [4.2, 4.5, 4.3, 4.1, 4.4, 4.0, 4.2, 1.0, 4.3, 4.5]  # Note: 1.0 is an outlier

print("=" * 50)
print("📊 MEASURES OF CENTRAL TENDENCY")
print("=" * 50)

# MEAN (Average) — sensitive to outliers
mean_rating = np.mean(ratings)
print(f"Mean Rating: {mean_rating:.2f}")

# MEDIAN (Middle value) — robust to outliers
median_rating = np.median(ratings)
print(f"Median Rating: {median_rating:.2f}")

# MODE (Most frequent) — with ties, statistics.mode returns the first mode encountered
mode_rating = mode(ratings)
print(f"Mode Rating: {mode_rating}")

# See the difference when outlier is present!
print(f"\n⚠️ Notice: Mean ({mean_rating:.2f}) is pulled DOWN by the outlier 1.0")
print(f"✅ Median ({median_rating:.2f}) is more representative of typical ratings")

# PRACTICAL EXAMPLE: Zomato Price Analysis
print("\n" + "=" * 50)
print("🍽️ ZOMATO PRICE ANALYSIS (Price for Two)")
print("=" * 50)
prices = {
    'Restaurant': ['Saravana Bhavan', 'Truffles', 'Taj Restaurant', 'Haldiram', 'Local Dhaba'],
    'Price_for_Two': [300, 800, 5000, 400, 150]  # Taj is expensive outlier
}
df = pd.DataFrame(prices)
print(f"\nPrices: {df['Price_for_Two'].tolist()}")
print(f"Mean Price: ₹{df['Price_for_Two'].mean():.0f}")
print(f"Median Price: ₹{df['Price_for_Two'].median():.0f}")
print("\n💡 Insight: Taj Restaurant (₹5000) skews the mean")
print(" Mean suggests avg ₹1330, but most places are under ₹800!")
print(" Median (₹400) better represents 'typical' prices")

# Using describe() for quick statistics (count, mean, std, quartiles, min/max)
print("\n📋 COMPLETE STATISTICS (.describe()):")
print(df['Price_for_Two'].describe())

import pandas as pd
import numpy as np

print("=" * 50)
print("📊 MEASURES OF DISPERSION")
print("=" * 50)

# Restaurant ratings data
ratings_consistent = [4.0, 4.1, 4.0, 3.9, 4.2, 4.0, 4.1, 4.0]  # Low variance
ratings_varied = [2.0, 5.0, 3.0, 4.5, 1.5, 4.8, 3.2, 4.0]      # High variance

# VARIANCE & STANDARD DEVIATION
# NOTE: np.var / np.std default to ddof=0 (population, divide by N)
print("\n📈 CONSISTENT RESTAURANT RATINGS:")
print(f" Data: {ratings_consistent}")
print(f" Mean: {np.mean(ratings_consistent):.2f}")
print(f" Variance: {np.var(ratings_consistent):.4f}")
print(f" Std Dev: {np.std(ratings_consistent):.4f}")
print(f" Range: {max(ratings_consistent) - min(ratings_consistent):.1f}")

print("\n📉 VARIED RESTAURANT RATINGS:")
print(f" Data: {ratings_varied}")
print(f" Mean: {np.mean(ratings_varied):.2f}")
print(f" Variance: {np.var(ratings_varied):.4f}")
print(f" Std Dev: {np.std(ratings_varied):.4f}")
print(f" Range: {max(ratings_varied) - min(ratings_varied):.1f}")

# FIX: the original claimed "Same average (~3.5) ... 15x more variance", but the
# means are 4.04 vs 3.50 and the variance ratio is ~200x (std dev ratio ~14x).
# Compute the comparison from the data so the text can never drift out of sync.
var_ratio = np.var(ratings_varied) / np.var(ratings_consistent)
print(f"\n💡 Insight: Means are {np.mean(ratings_consistent):.2f} vs {np.mean(ratings_varied):.2f}, "
      f"but the varied set has ~{var_ratio:.0f}x the variance!")
print(" High variance = less predictable ratings")

# PRACTICAL: Detecting Outliers with Standard Deviation
print("\n" + "=" * 50)
print("🔍 OUTLIER DETECTION USING STD DEV")
print("=" * 50)
delivery_times = [25, 30, 28, 32, 27, 29, 31, 120, 26, 28]  # 120 is outlier
print(f"Delivery times (mins): {delivery_times}")
mean_time = np.mean(delivery_times)
std_time = np.std(delivery_times)
print(f"\nMean: {mean_time:.1f} mins")
print(f"Std Dev: {std_time:.1f} mins")
print(f"\nNormal Range (mean ± 2*std): {mean_time - 2*std_time:.1f} to {mean_time + 2*std_time:.1f}")

# Find outliers (beyond 2 std deviations from the mean)
outliers = [x for x in delivery_times if abs(x - mean_time) > 2 * std_time]
print(f"\n🚨 Outliers detected: {outliers}")
print(" 120 mins is clearly abnormal (maybe order was lost!)")

# COEFFICIENT OF VARIATION (for comparing spread across different scales/units)
print("\n" + "=" * 50)
print("📊 COEFFICIENT OF VARIATION (CV)")
print("=" * 50)
# Compare variability of different metrics: CV = std / mean, as a percentage
ratings = [4.2, 4.5, 4.3, 4.1, 4.4]
prices = [300, 800, 1500, 400, 600]
cv_ratings = (np.std(ratings) / np.mean(ratings)) * 100
cv_prices = (np.std(prices) / np.mean(prices)) * 100
print(f"Ratings CV: {cv_ratings:.1f}%")
print(f"Prices CV: {cv_prices:.1f}%")
print(f"\n💡 Prices are more variable (CV={cv_prices:.1f}%) than ratings (CV={cv_ratings:.1f}%)")

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): not used in this snippet; kept for parity with the others
from scipy import stats  # moved up from mid-script (PEP 8: imports at top)

# Create realistic Zomato data (seeded so results are reproducible)
np.random.seed(42)
n = 200
zomato_df = pd.DataFrame({
    'Rating': np.random.normal(3.8, 0.5, n).clip(1, 5),                # Normal distribution
    'Price_for_Two': np.random.exponential(500, n) + 200,              # Right skewed (most cheap, few expensive)
    'Delivery_Time': np.concatenate([np.random.normal(30, 5, 180),     # Normal deliveries
                                     np.random.normal(90, 10, 20)]),   # Some late ones (outliers)
    'City': np.random.choice(['Delhi', 'Mumbai', 'Bangalore', 'Chennai'], n)
})

print("=" * 50)
print("📊 DISTRIBUTION ANALYSIS")
print("=" * 50)

# HISTOGRAM: Rating Distribution (mean/median overlaid to show symmetry)
print("\n1️⃣ HISTOGRAM - Rating Distribution")
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.hist(zomato_df['Rating'], bins=20, color='#667eea', edgecolor='white', alpha=0.8)
plt.axvline(zomato_df['Rating'].mean(), color='red', linestyle='--', label=f'Mean: {zomato_df["Rating"].mean():.2f}')
plt.axvline(zomato_df['Rating'].median(), color='green', linestyle='--', label=f'Median: {zomato_df["Rating"].median():.2f}')
plt.title('Rating Distribution (Normal)', fontweight='bold')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.legend(fontsize=8)

# HISTOGRAM: Price Distribution (Right Skewed — mean sits right of median)
plt.subplot(1, 3, 2)
plt.hist(zomato_df['Price_for_Two'], bins=25, color='#f6ad55', edgecolor='white', alpha=0.8)
plt.axvline(zomato_df['Price_for_Two'].mean(), color='red', linestyle='--', label=f'Mean: {zomato_df["Price_for_Two"].mean():.0f}')
plt.axvline(zomato_df['Price_for_Two'].median(), color='green', linestyle='--', label=f'Median: {zomato_df["Price_for_Two"].median():.0f}')
plt.title('Price Distribution (Right Skewed)', fontweight='bold')
plt.xlabel('Price for Two (₹)')
plt.ylabel('Frequency')
plt.legend(fontsize=8)

# BOX PLOT: Delivery Time — the injected late deliveries show up as outlier points
plt.subplot(1, 3, 3)
plt.boxplot(zomato_df['Delivery_Time'], vert=True)
plt.title('Delivery Time (Box Plot)', fontweight='bold')
plt.ylabel('Minutes')
plt.tight_layout()
plt.show()

# INTERPRETING THE DISTRIBUTION
print("\n📈 INTERPRETATION:")
print(f"\nRatings: Mean={zomato_df['Rating'].mean():.2f}, Median={zomato_df['Rating'].median():.2f}")
print(" → Mean ≈ Median = Symmetric (Normal) distribution")
print(f"\nPrices: Mean={zomato_df['Price_for_Two'].mean():.0f}, Median={zomato_df['Price_for_Two'].median():.0f}")
print(" → Mean > Median = Right Skewed (few expensive restaurants)")

# SKEWNESS (quantitative measure: 0 = symmetric, >0 = right tail, <0 = left tail)
print("\n📐 Skewness Values:")
print(f" Ratings: {stats.skew(zomato_df['Rating']):.3f} (close to 0 = symmetric)")
print(f" Prices: {stats.skew(zomato_df['Price_for_Two']):.3f} (positive = right skewed)")

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create Zomato-like data with known correlations (seeded for reproducibility)
np.random.seed(42)
n = 100
# Price and Rating might be correlated (expensive = better quality)
price = np.random.uniform(200, 2000, n)
rating = 3.0 + 0.7 * (price / 1000) + np.random.normal(0, 0.3, n)
rating = np.clip(rating, 1, 5)
# Votes correlated with rating (good restaurants get more votes)
votes = 1000 + 2000 * (rating - 3) + np.random.normal(0, 500, n)
votes = np.clip(votes, 100, 10000)
# Delivery time NOT correlated with rating
delivery_time = np.random.normal(35, 10, n)

zomato_df = pd.DataFrame({
    'Price_for_Two': price,
    'Rating': rating,
    'Votes': votes,
    'Delivery_Time': delivery_time
})

print("=" * 50)
print("📊 CORRELATION ANALYSIS")
print("=" * 50)

# Calculate Pearson correlation matrix (pandas default method)
correlation_matrix = zomato_df.corr()
print("\n📈 CORRELATION MATRIX:")
print(correlation_matrix.round(3))

# Interpretation
print("\n💡 INTERPRETATION:")
print(f" Price vs Rating: r = {correlation_matrix.loc['Price_for_Two', 'Rating']:.3f}")
print(" → Moderate positive: Expensive restaurants tend to have higher ratings")
print(f"\n Rating vs Votes: r = {correlation_matrix.loc['Rating', 'Votes']:.3f}")
print(" → Strong positive: Higher rated restaurants get more votes")
print(f"\n Delivery Time vs Rating: r = {correlation_matrix.loc['Delivery_Time', 'Rating']:.3f}")
print(" → Near zero: Delivery time doesn't affect rating")

# HEATMAP VISUALIZATION
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix,
            annot=True,
            cmap='coolwarm',
            center=0,
            fmt='.2f',
            square=True,
            linewidths=0.5)
plt.title('Correlation Heatmap - Zomato Features', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()

# SCATTER PLOT for strong correlation vs no correlation
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.scatter(zomato_df['Price_for_Two'], zomato_df['Rating'], alpha=0.6, c='#667eea')
plt.xlabel('Price for Two (₹)')
plt.ylabel('Rating')
plt.title(f'Price vs Rating (r = {correlation_matrix.loc["Price_for_Two", "Rating"]:.2f})', fontweight='bold')
plt.subplot(1, 2, 2)
plt.scatter(zomato_df['Delivery_Time'], zomato_df['Rating'], alpha=0.6, c='#f6ad55')
plt.xlabel('Delivery Time (mins)')
plt.ylabel('Rating')
plt.title(f'Delivery vs Rating (r = {correlation_matrix.loc["Delivery_Time", "Rating"]:.2f})', fontweight='bold')
plt.tight_layout()
plt.show()

# ML INSIGHT: Feature Selection
print("\n" + "=" * 50)
print("🎯 ML INSIGHT: FEATURE SELECTION")
print("=" * 50)
# FIX: report the actually-computed correlations rather than the hard-coded
# r=0.70 / r=0.85 in the original, which could disagree with the matrix
# printed above if the simulation parameters change.
r_price = correlation_matrix.loc['Price_for_Two', 'Rating']
r_votes = correlation_matrix.loc['Votes', 'Rating']
print("\nIf predicting RATING, good features are:")
print(f" ✅ Price_for_Two (r={r_price:.2f}) - strong correlation")
print(f" ✅ Votes (r={r_votes:.2f}) - but may cause data leakage!")
print(" ❌ Delivery_Time (r≈0) - no predictive power")

Now that you can explore data, learn how to clean and preprocess it for ML models: