W Code - Pattern-Based DSA Learning Platform

Bhanu Bisht

Back to ML Theory

Python for Machine Learning

Master NumPy, Pandas, and Visualization — the essential Python toolkit for every ML engineer.

Topics

Python

Code Examples

🏏 Indian Context

Most examples use IPL cricket data — analyzing player stats, team performances, and match outcomes using Python! (The NumPy warm-up uses simple price and house-feature arrays.)

NumPy: The Foundation of ML

Foundation

Python

import numpy as np

# NumPy basics: array creation, vectorization, broadcasting and the dot product.

# Creating Arrays
arr = np.array([1, 2, 3, 4, 5])
print(f"1D Array: {arr}")

# 2D Array (Matrix) - Common in ML for datasets
matrix = np.array([[1, 2, 3],
                   [4, 5, 6]])
print(f"Shape: {matrix.shape}")  # Output: (2, 3) = 2 rows, 3 columns

# Creating special arrays
zeros = np.zeros((3, 3))  # 3x3 array of 0.0 - used for weight initialization
ones = np.ones((2, 4))    # 2x4 array of 1.0 - used for bias initialization
random = np.random.randn(3, 3)  # 3x3 samples from the standard normal distribution

# VECTORIZATION: Fast element-wise operations
prices = np.array([100, 200, 300, 400, 500])
gst = prices * 0.18  # Apply 18% GST to all prices at ONCE (no Python loop)
print(f"GST amounts: {gst}")

# BROADCASTING Example
# Add different values to each column
data = np.array([[1, 2, 3],
                 [4, 5, 6]])
bias = np.array([10, 20, 30])  # Shape (3,) - smaller than data's (2, 3)
result = data + bias  # Broadcasting: bias is "stretched" across both rows of data
print(f"After broadcasting:\n{result}")

# DOT PRODUCT - Heart of ML (weights × features)
features = np.array([1800, 3, 2])  # [sqft, bedrooms, bathrooms]
weights = np.array([50, 10000, 5000])  # Learned weights
# 1800*50 + 3*10000 + 2*5000 = 90000 + 30000 + 10000 = 130000
predicted_price = np.dot(features, weights)
# The "," format spec groups digits in threes (130,000); it does not produce
# Indian lakh-style grouping (1,30,000).
print(f"Predicted Price: ₹{predicted_price:,}")  # ₹130,000

Pandas: Data Manipulation Powerhouse

Data Handling

Python

import pandas as pd
import numpy as np

# Pandas basics: build a small DataFrame, inspect it, select/filter rows,
# impute a missing value and aggregate by team.

# Creating a DataFrame (like a spreadsheet)
ipl_data = {
    'Player': ['Virat Kohli', 'Rohit Sharma', 'MS Dhoni', 'Jasprit Bumrah'],
    'Team': ['RCB', 'MI', 'CSK', 'MI'],
    'Matches': [237, 243, 250, 120],
    'Runs': [7263, 6211, 5082, 56],
    'Average': [36.2, 29.7, 38.6, 6.2],
    'Role': ['Batsman', 'Batsman', 'Batsman', 'Bowler']
}
df = pd.DataFrame(ipl_data)

# Basic Inspection
print(df.head())           # First 5 rows
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line.
df.info()                  # Data types & null counts
print(df.describe())       # Statistics for numeric columns

# SELECTION: Multiple ways to select data
# Method 1: Column by name
runs = df['Runs']  # Returns a Series

# Method 2: Multiple columns
batting_stats = df[['Player', 'Runs', 'Average']]  # Returns a DataFrame

# Method 3: .loc[] - Label-based selection (boolean row mask + column labels)
dhoni_stats = df.loc[df['Player'] == 'MS Dhoni', ['Player', 'Team', 'Runs']]

# Method 4: .iloc[] - Integer position-based
first_3_rows = df.iloc[0:3]  # First 3 rows

# FILTERING: Find batsmen with average > 30
# Each condition needs its own parentheses because & binds tighter than == and >.
top_batsmen = df[(df['Role'] == 'Batsman') & (df['Average'] > 30)]
print("Top Batsmen (Avg > 30):")
print(top_batsmen)

# HANDLING MISSING VALUES (crucial in real datasets)
df_with_nulls = df.copy()  # Work on a copy so df itself stays intact
df_with_nulls.loc[0, 'Average'] = np.nan  # Simulate missing value

# Check for nulls
print(f"Null values:\n{df_with_nulls.isnull().sum()}")

# Fill missing with median (robust to outliers).
# Assign the result back to the column: calling fillna(inplace=True) on the
# selected column is a chained operation that warns on recent pandas and does
# not update the DataFrame under Copy-on-Write.
df_with_nulls['Average'] = df_with_nulls['Average'].fillna(
    df_with_nulls['Average'].median()
)

# GROUPBY: Aggregate statistics by category
team_stats = df.groupby('Team').agg({
    'Matches': 'sum',
    'Runs': 'sum',
    'Average': 'mean'
}).round(2)
print("\nTeam Statistics:")
print(team_stats)

Matplotlib & Seaborn: Visualization

Visualization

Python

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Visualization demo: build a synthetic batting dataset and draw four plots
# (histogram, box plot, scatter plot, correlation heatmap) on one 2x2 figure.

# Sample IPL batting data (synthetic; seeded so every run draws the same values)
np.random.seed(42)
data = {
    # 100 normal samples (mean 35, std 15), clipped into [0, 100]
    'Runs': np.random.normal(35, 15, 100).clip(0, 100),
    # 100 normal samples (mean 130, std 20), clipped into [80, 200]
    'Strike_Rate': np.random.normal(130, 20, 100).clip(80, 200),
    # 100 integers in 0..14 (upper bound 15 is exclusive)
    'Boundaries': np.random.randint(0, 15, 100),
    # 100 team labels drawn from the four listed teams
    'Team': np.random.choice(['CSK', 'MI', 'RCB', 'KKR'], 100)
}
df = pd.DataFrame(data)

# Set Seaborn style for beautiful plots
sns.set_style("darkgrid")
# One 12x8 figure; each plt.subplot(2, 2, k) call below selects a cell of a 2x2 grid
plt.figure(figsize=(12, 8))

# 1. HISTOGRAM: Distribution of runs
plt.subplot(2, 2, 1)
plt.hist(df['Runs'], bins=20, color='#4CAF50', edgecolor='white')
plt.title('Distribution of Runs', fontweight='bold')
plt.xlabel('Runs')
plt.ylabel('Frequency')

# 2. BOX PLOT: Detect outliers in Strike Rate by Team
plt.subplot(2, 2, 2)
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
sns.boxplot(x='Team', y='Strike_Rate', data=df, palette='Set2')
plt.title('Strike Rate by Team (with Outliers)', fontweight='bold')

# 3. SCATTER PLOT: Relationship between Runs and Strike Rate
plt.subplot(2, 2, 3)
# Point colour encodes the Boundaries value via the viridis colormap
plt.scatter(df['Runs'], df['Strike_Rate'], 
            c=df['Boundaries'], cmap='viridis', alpha=0.7)
plt.colorbar(label='Boundaries')
plt.title('Runs vs Strike Rate', fontweight='bold')
plt.xlabel('Runs')
plt.ylabel('Strike Rate')

# 4. CORRELATION HEATMAP: Essential for feature selection
plt.subplot(2, 2, 4)
# Keep only numeric columns (this drops the string 'Team' column) before corr()
numeric_df = df.select_dtypes(include=[np.number])
correlation = numeric_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap', fontweight='bold')

# Adjust subplot spacing, then display the figure
plt.tight_layout()
plt.show()

# PAIR PLOT: See all relationships at once (for EDA)
# Optional extra, left disabled; uncomment the two lines below to draw it.
# sns.pairplot(df, hue='Team', palette='husl')
# plt.show()

Putting It All Together: IPL Analysis

Complete Pipeline

Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# End-to-end example: generate a synthetic IPL dataset, explore it, clean it,
# engineer features and print a few summary insights.

# ========================================
# STEP 1: CREATE A REALISTIC IPL DATASET
# ========================================
np.random.seed(2024)  # Seeded so the generated dataset is reproducible
n_players = 50

ipl_df = pd.DataFrame({
    'Player': [f'Player_{i}' for i in range(1, n_players + 1)],
    'Team': np.random.choice(['CSK', 'MI', 'RCB', 'KKR', 'SRH', 'DC'], n_players),
    'Matches': np.random.randint(10, 200, n_players),
    'Runs': np.random.randint(100, 8000, n_players),
    'Balls_Faced': np.random.randint(80, 6000, n_players),
    'Wickets': np.random.randint(0, 150, n_players),
    # Stored as float up front so NaN can be assigned below without relying on
    # an implicit int -> float upcast.
    'Catches': np.random.randint(0, 100, n_players).astype(float),
    'Role': np.random.choice(['Batsman', 'Bowler', 'All-rounder'], n_players)
})

# Add some missing values (realistic scenario).
# replace=False guarantees 5 distinct rows; the default samples with
# replacement and could pick the same row more than once.
missing_idx = np.random.choice(n_players, 5, replace=False)
ipl_df.loc[missing_idx, 'Catches'] = np.nan

# ========================================
# STEP 2: EXPLORE THE DATA
# ========================================
print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Shape: {ipl_df.shape[0]} players × {ipl_df.shape[1]} features")
print(f"\nData Types:\n{ipl_df.dtypes}")
print(f"\nMissing Values:\n{ipl_df.isnull().sum()}")
print(f"\nStatistics:\n{ipl_df.describe().round(2)}")

# ========================================
# STEP 3: CLEAN THE DATA
# ========================================
# Fill missing catches with median.
# Assign the result back to the column: fillna(inplace=True) on a selected
# column is a chained operation that warns on recent pandas and does not
# update the DataFrame under Copy-on-Write.
ipl_df['Catches'] = ipl_df['Catches'].fillna(ipl_df['Catches'].median())

# ========================================
# STEP 4: FEATURE ENGINEERING
# ========================================
# Create new meaningful features using vectorized column arithmetic
ipl_df['Strike_Rate'] = (ipl_df['Runs'] / ipl_df['Balls_Faced'] * 100).round(2)
ipl_df['Runs_Per_Match'] = (ipl_df['Runs'] / ipl_df['Matches']).round(2)

# Handle infinite values (division by zero cases).
# Defensive only: with the ranges generated above (Balls_Faced >= 80,
# Matches >= 10) no denominator is zero, so no rows are dropped here.
ipl_df.replace([np.inf, -np.inf], np.nan, inplace=True)
ipl_df.dropna(inplace=True)

print("\n" + "=" * 50)
print("AFTER FEATURE ENGINEERING")
print("=" * 50)
print(ipl_df[['Player', 'Team', 'Runs', 'Strike_Rate', 'Runs_Per_Match']].head(10))

# ========================================
# STEP 5: ANALYSIS & INSIGHTS
# ========================================
# Top 5 run scorers
print("\n🏆 TOP 5 RUN SCORERS:")
print(ipl_df.nlargest(5, 'Runs')[['Player', 'Team', 'Runs', 'Strike_Rate']])

# Team-wise performance (mean runs per team, highest first)
print("\n📊 TEAM-WISE AVERAGE RUNS:")
team_avg = ipl_df.groupby('Team')['Runs'].mean().sort_values(ascending=False)
print(team_avg.round(0))

# Correlation between features
print("\n🔗 FEATURE CORRELATIONS:")
numeric_cols = ['Runs', 'Matches', 'Strike_Rate', 'Runs_Per_Match']
print(ipl_df[numeric_cols].corr().round(2))

🎯 Key Takeaways

• NumPy: Fast numerical operations, foundation for ML math
• Pandas: Data loading, cleaning, transformation (often cited as roughly 80% of ML work)
• Visualization: Understand data before modeling
• Feature Engineering: Create meaningful features from raw data

🎯 What's Next?

Now that you know the Python tools, learn how to explore and understand your data:

Exploratory Data Analysis · Data Preprocessing →

import numpy as np

# NumPy basics: array creation, vectorization, broadcasting and the dot product.
# (Line structure restored: collapsed onto one line, everything after the
# first "#" was a comment and none of the code ran.)

# Creating Arrays
arr = np.array([1, 2, 3, 4, 5])
print(f"1D Array: {arr}")

# 2D Array (Matrix) - Common in ML for datasets
matrix = np.array([[1, 2, 3],
                   [4, 5, 6]])
print(f"Shape: {matrix.shape}")  # Output: (2, 3) = 2 rows, 3 columns

# Creating special arrays
zeros = np.zeros((3, 3))  # 3x3 array of 0.0 - used for weight initialization
ones = np.ones((2, 4))    # 2x4 array of 1.0 - used for bias initialization
random = np.random.randn(3, 3)  # 3x3 samples from the standard normal distribution

# VECTORIZATION: Fast element-wise operations
prices = np.array([100, 200, 300, 400, 500])
gst = prices * 0.18  # Apply 18% GST to all prices at ONCE (no Python loop)
print(f"GST amounts: {gst}")

# BROADCASTING Example
# Add different values to each column
data = np.array([[1, 2, 3],
                 [4, 5, 6]])
bias = np.array([10, 20, 30])  # Shape (3,) - smaller than data's (2, 3)
result = data + bias  # Broadcasting: bias is "stretched" across both rows of data
print(f"After broadcasting:\n{result}")

# DOT PRODUCT - Heart of ML (weights × features)
features = np.array([1800, 3, 2])  # [sqft, bedrooms, bathrooms]
weights = np.array([50, 10000, 5000])  # Learned weights
# 1800*50 + 3*10000 + 2*5000 = 90000 + 30000 + 10000 = 130000
predicted_price = np.dot(features, weights)
# The "," format spec groups digits in threes (130,000); it does not produce
# Indian lakh-style grouping (1,30,000).
print(f"Predicted Price: ₹{predicted_price:,}")  # ₹130,000

import pandas as pd
import numpy as np

# Pandas basics: build a small DataFrame, inspect it, select/filter rows,
# impute a missing value and aggregate by team.
# (Line structure restored: the collapsed one-line form was not valid Python.)

# Creating a DataFrame (like a spreadsheet)
ipl_data = {
    'Player': ['Virat Kohli', 'Rohit Sharma', 'MS Dhoni', 'Jasprit Bumrah'],
    'Team': ['RCB', 'MI', 'CSK', 'MI'],
    'Matches': [237, 243, 250, 120],
    'Runs': [7263, 6211, 5082, 56],
    'Average': [36.2, 29.7, 38.6, 6.2],
    'Role': ['Batsman', 'Batsman', 'Batsman', 'Bowler']
}
df = pd.DataFrame(ipl_data)

# Basic Inspection
print(df.head())           # First 5 rows
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line.
df.info()                  # Data types & null counts
print(df.describe())       # Statistics for numeric columns

# SELECTION: Multiple ways to select data
# Method 1: Column by name
runs = df['Runs']  # Returns a Series

# Method 2: Multiple columns
batting_stats = df[['Player', 'Runs', 'Average']]  # Returns a DataFrame

# Method 3: .loc[] - Label-based selection (boolean row mask + column labels)
dhoni_stats = df.loc[df['Player'] == 'MS Dhoni', ['Player', 'Team', 'Runs']]

# Method 4: .iloc[] - Integer position-based
first_3_rows = df.iloc[0:3]  # First 3 rows

# FILTERING: Find batsmen with average > 30
# Each condition needs its own parentheses because & binds tighter than == and >.
top_batsmen = df[(df['Role'] == 'Batsman') & (df['Average'] > 30)]
print("Top Batsmen (Avg > 30):")
print(top_batsmen)

# HANDLING MISSING VALUES (crucial in real datasets)
df_with_nulls = df.copy()  # Work on a copy so df itself stays intact
df_with_nulls.loc[0, 'Average'] = np.nan  # Simulate missing value

# Check for nulls
print(f"Null values:\n{df_with_nulls.isnull().sum()}")

# Fill missing with median (robust to outliers).
# Assign the result back to the column: calling fillna(inplace=True) on the
# selected column is a chained operation that warns on recent pandas and does
# not update the DataFrame under Copy-on-Write.
df_with_nulls['Average'] = df_with_nulls['Average'].fillna(
    df_with_nulls['Average'].median()
)

# GROUPBY: Aggregate statistics by category
team_stats = df.groupby('Team').agg({
    'Matches': 'sum',
    'Runs': 'sum',
    'Average': 'mean'
}).round(2)
print("\nTeam Statistics:")
print(team_stats)

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Visualization demo: build a synthetic batting dataset and draw four plots
# (histogram, box plot, scatter plot, correlation heatmap) on one 2x2 figure.
# (Line structure restored: the collapsed one-line form was not valid Python.)

# Sample IPL batting data (synthetic; seeded so every run draws the same values)
np.random.seed(42)
data = {
    # 100 normal samples (mean 35, std 15), clipped into [0, 100]
    'Runs': np.random.normal(35, 15, 100).clip(0, 100),
    # 100 normal samples (mean 130, std 20), clipped into [80, 200]
    'Strike_Rate': np.random.normal(130, 20, 100).clip(80, 200),
    # 100 integers in 0..14 (upper bound 15 is exclusive)
    'Boundaries': np.random.randint(0, 15, 100),
    # 100 team labels drawn from the four listed teams
    'Team': np.random.choice(['CSK', 'MI', 'RCB', 'KKR'], 100)
}
df = pd.DataFrame(data)

# Set Seaborn style for beautiful plots
sns.set_style("darkgrid")
# One 12x8 figure; each plt.subplot(2, 2, k) call below selects a cell of a 2x2 grid
plt.figure(figsize=(12, 8))

# 1. HISTOGRAM: Distribution of runs
plt.subplot(2, 2, 1)
plt.hist(df['Runs'], bins=20, color='#4CAF50', edgecolor='white')
plt.title('Distribution of Runs', fontweight='bold')
plt.xlabel('Runs')
plt.ylabel('Frequency')

# 2. BOX PLOT: Detect outliers in Strike Rate by Team
plt.subplot(2, 2, 2)
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
sns.boxplot(x='Team', y='Strike_Rate', data=df, palette='Set2')
plt.title('Strike Rate by Team (with Outliers)', fontweight='bold')

# 3. SCATTER PLOT: Relationship between Runs and Strike Rate
plt.subplot(2, 2, 3)
# Point colour encodes the Boundaries value via the viridis colormap
plt.scatter(df['Runs'], df['Strike_Rate'],
            c=df['Boundaries'], cmap='viridis', alpha=0.7)
plt.colorbar(label='Boundaries')
plt.title('Runs vs Strike Rate', fontweight='bold')
plt.xlabel('Runs')
plt.ylabel('Strike Rate')

# 4. CORRELATION HEATMAP: Essential for feature selection
plt.subplot(2, 2, 4)
# Keep only numeric columns (this drops the string 'Team' column) before corr()
numeric_df = df.select_dtypes(include=[np.number])
correlation = numeric_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap', fontweight='bold')

# Adjust subplot spacing, then display the figure
plt.tight_layout()
plt.show()

# PAIR PLOT: See all relationships at once (for EDA)
# Optional extra, left disabled; uncomment the two lines below to draw it.
# sns.pairplot(df, hue='Team', palette='husl')
# plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# End-to-end example: generate a synthetic IPL dataset, explore it, clean it,
# engineer features and print a few summary insights.
# (Line structure restored: the collapsed form was not valid Python.)

# ========================================
# STEP 1: CREATE A REALISTIC IPL DATASET
# ========================================
np.random.seed(2024)  # Seeded so the generated dataset is reproducible
n_players = 50

ipl_df = pd.DataFrame({
    'Player': [f'Player_{i}' for i in range(1, n_players + 1)],
    'Team': np.random.choice(['CSK', 'MI', 'RCB', 'KKR', 'SRH', 'DC'], n_players),
    'Matches': np.random.randint(10, 200, n_players),
    'Runs': np.random.randint(100, 8000, n_players),
    'Balls_Faced': np.random.randint(80, 6000, n_players),
    'Wickets': np.random.randint(0, 150, n_players),
    # Stored as float up front so NaN can be assigned below without relying on
    # an implicit int -> float upcast.
    'Catches': np.random.randint(0, 100, n_players).astype(float),
    'Role': np.random.choice(['Batsman', 'Bowler', 'All-rounder'], n_players)
})

# Add some missing values (realistic scenario).
# replace=False guarantees 5 distinct rows; the default samples with
# replacement and could pick the same row more than once.
missing_idx = np.random.choice(n_players, 5, replace=False)
ipl_df.loc[missing_idx, 'Catches'] = np.nan

# ========================================
# STEP 2: EXPLORE THE DATA
# ========================================
print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Shape: {ipl_df.shape[0]} players × {ipl_df.shape[1]} features")
print(f"\nData Types:\n{ipl_df.dtypes}")
print(f"\nMissing Values:\n{ipl_df.isnull().sum()}")
print(f"\nStatistics:\n{ipl_df.describe().round(2)}")

# ========================================
# STEP 3: CLEAN THE DATA
# ========================================
# Fill missing catches with median.
# Assign the result back to the column: fillna(inplace=True) on a selected
# column is a chained operation that warns on recent pandas and does not
# update the DataFrame under Copy-on-Write.
ipl_df['Catches'] = ipl_df['Catches'].fillna(ipl_df['Catches'].median())

# ========================================
# STEP 4: FEATURE ENGINEERING
# ========================================
# Create new meaningful features using vectorized column arithmetic
ipl_df['Strike_Rate'] = (ipl_df['Runs'] / ipl_df['Balls_Faced'] * 100).round(2)
ipl_df['Runs_Per_Match'] = (ipl_df['Runs'] / ipl_df['Matches']).round(2)

# Handle infinite values (division by zero cases).
# Defensive only: with the ranges generated above (Balls_Faced >= 80,
# Matches >= 10) no denominator is zero, so no rows are dropped here.
ipl_df.replace([np.inf, -np.inf], np.nan, inplace=True)
ipl_df.dropna(inplace=True)

print("\n" + "=" * 50)
print("AFTER FEATURE ENGINEERING")
print("=" * 50)
print(ipl_df[['Player', 'Team', 'Runs', 'Strike_Rate', 'Runs_Per_Match']].head(10))

# ========================================
# STEP 5: ANALYSIS & INSIGHTS
# ========================================
# Top 5 run scorers
print("\n🏆 TOP 5 RUN SCORERS:")
print(ipl_df.nlargest(5, 'Runs')[['Player', 'Team', 'Runs', 'Strike_Rate']])

# Team-wise performance (mean runs per team, highest first)
print("\n📊 TEAM-WISE AVERAGE RUNS:")
team_avg = ipl_df.groupby('Team')['Runs'].mean().sort_values(ascending=False)
print(team_avg.round(0))

# Correlation between features
print("\n🔗 FEATURE CORRELATIONS:")
numeric_cols = ['Runs', 'Matches', 'Strike_Rate', 'Runs_Per_Match']
print(ipl_df[numeric_cols].corr().round(2))