Loading W Code...
Master NumPy, Pandas, and Visualization — the essential Python toolkit for every ML engineer.
4
Topics
Python
Code Examples
Indian Context
All examples use IPL cricket data — analyzing player stats, team performances, and match outcomes using Python!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NumPy basics: array creation, vectorization, broadcasting and the dot product.

# --- Creating arrays ---
arr = np.array([1, 2, 3, 4, 5])
print(f"1D Array: {arr}")

# 2D array (matrix) - common in ML for datasets
matrix = np.array([[1, 2, 3],
                   [4, 5, 6]])
print(f"Shape: {matrix.shape}")  # Output: (2, 3) = 2 rows, 3 columns

# --- Special arrays ---
zeros = np.zeros((3, 3))  # Used for weight initialization
ones = np.ones((2, 4))  # Used for bias initialization
random = np.random.randn(3, 3)  # Samples from the standard normal distribution

# --- Vectorization: fast element-wise operations ---
prices = np.array([100, 200, 300, 400, 500])
gst = prices * 0.18  # 18% GST amount for every price in a single operation
print(f"GST amounts: {gst}")

# --- Broadcasting: add a different value to each column ---
data = np.array([[1, 2, 3],
                 [4, 5, 6]])
bias = np.array([10, 20, 30])  # Shape (3,), smaller than data's (2, 3)
result = data + bias  # bias is "stretched" across both rows to match data
print(f"After broadcasting:\n{result}")

# --- Dot product: heart of ML (weights × features) ---
features = np.array([1800, 3, 2])  # [sqft, bedrooms, bathrooms]
weights = np.array([50, 10000, 5000])  # Learned weights
predicted_price = np.dot(features, weights)
# 1800*50 + 3*10000 + 2*5000 = 130000. The "," format spec groups digits in
# thousands (130,000), not in the Indian lakh style (1,30,000).
print(f"Predicted Price: ₹{predicted_price:,}")  # ₹130,000
import numpy as np
# Pandas basics: build a DataFrame, inspect it, select/filter rows, handle
# missing values and aggregate with groupby.

# Creating a DataFrame (like a spreadsheet) from a dict of equal-length columns
ipl_data = {
    'Player': ['Virat Kohli', 'Rohit Sharma', 'MS Dhoni', 'Jasprit Bumrah'],
    'Team': ['RCB', 'MI', 'CSK', 'MI'],
    'Matches': [237, 243, 250, 120],
    'Runs': [7263, 6211, 5082, 56],
    'Average': [36.2, 29.7, 38.6, 6.2],
    'Role': ['Batsman', 'Batsman', 'Batsman', 'Bowler'],
}
df = pd.DataFrame(ipl_data)

# --- Basic inspection ---
print(df.head())  # First 5 rows (this frame has only 4)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()  # Data types & null counts
print(df.describe())  # Statistics for numeric columns

# --- Selection: multiple ways to select data ---
# Method 1: column by name
runs = df['Runs']  # Returns a Series
# Method 2: multiple columns
batting_stats = df[['Player', 'Runs', 'Average']]
# Method 3: .loc[] - label-based selection (boolean row mask + column labels)
dhoni_stats = df.loc[df['Player'] == 'MS Dhoni', ['Player', 'Team', 'Runs']]
# Method 4: .iloc[] - integer position-based
first_3_rows = df.iloc[0:3]  # First 3 rows

# --- Filtering: find batsmen with average > 30 ---
top_batsmen = df[(df['Role'] == 'Batsman') & (df['Average'] > 30)]
print("Top Batsmen (Avg > 30):")
print(top_batsmen)

# --- Handling missing values (crucial in real datasets) ---
df_with_nulls = df.copy()
df_with_nulls.loc[0, 'Average'] = np.nan  # Simulate missing value
# Check for nulls
print(f"Null values:\n{df_with_nulls.isnull().sum()}")
# Fill missing with median (robust to outliers). The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: that chained
# in-place form is deprecated and does not update the frame under Copy-on-Write.
df_with_nulls['Average'] = df_with_nulls['Average'].fillna(
    df_with_nulls['Average'].median()
)

# --- GroupBy: aggregate statistics by category ---
team_stats = df.groupby('Team').agg({
    'Matches': 'sum',
    'Runs': 'sum',
    'Average': 'mean',
}).round(2)
print("\nTeam Statistics:")
print(team_stats)
import seaborn as sns
import pandas as pd
import numpy as np
# Visualization walkthrough: four common EDA charts on one 2x2 figure, drawn
# through explicit Axes objects.

# Synthetic IPL batting sample; the seed makes every run draw the same data.
np.random.seed(42)
sample_size = 100
runs_sample = np.random.normal(35, 15, sample_size).clip(0, 100)
strike_rate_sample = np.random.normal(130, 20, sample_size).clip(80, 200)
boundaries_sample = np.random.randint(0, 15, sample_size)
team_sample = np.random.choice(['CSK', 'MI', 'RCB', 'KKR'], sample_size)
data = {
    'Runs': runs_sample,
    'Strike_Rate': strike_rate_sample,
    'Boundaries': boundaries_sample,
    'Team': team_sample,
}
df = pd.DataFrame(data)

# Apply the Seaborn theme before any axes are created so all panels pick it up.
sns.set_style("darkgrid")
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(hist_ax, box_ax), (scatter_ax, heat_ax) = axes

# Panel 1 - histogram: distribution of runs
hist_ax.hist(df['Runs'], bins=20, color='#4CAF50', edgecolor='white')
hist_ax.set_title('Distribution of Runs', fontweight='bold')
hist_ax.set_xlabel('Runs')
hist_ax.set_ylabel('Frequency')

# Panel 2 - box plot: strike-rate spread (and outliers) per team
sns.boxplot(x='Team', y='Strike_Rate', data=df, palette='Set2', ax=box_ax)
box_ax.set_title('Strike Rate by Team (with Outliers)', fontweight='bold')

# Panel 3 - scatter plot: runs against strike rate, coloured by boundary count
points = scatter_ax.scatter(
    df['Runs'],
    df['Strike_Rate'],
    c=df['Boundaries'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(points, ax=scatter_ax, label='Boundaries')
scatter_ax.set_title('Runs vs Strike Rate', fontweight='bold')
scatter_ax.set_xlabel('Runs')
scatter_ax.set_ylabel('Strike Rate')

# Panel 4 - correlation heatmap over the numeric columns (feature selection aid)
numeric_df = df.select_dtypes(include=[np.number])
correlation = numeric_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=heat_ax)
heat_ax.set_title('Feature Correlation Heatmap', fontweight='bold')

fig.tight_layout()
plt.show()

# Optional pair plot: every pairwise relationship at once (for EDA)
# sns.pairplot(df, hue='Team', palette='husl')
# plt.show()
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Mini project: generate a synthetic IPL dataset, explore it, clean it,
# engineer features and print a few summary insights.

# ========================================
# STEP 1: CREATE A REALISTIC IPL DATASET
# ========================================
np.random.seed(2024)  # Fixed seed so the generated dataset is reproducible
n_players = 50
ipl_df = pd.DataFrame({
    'Player': [f'Player_{i}' for i in range(1, n_players + 1)],
    'Team': np.random.choice(['CSK', 'MI', 'RCB', 'KKR', 'SRH', 'DC'], n_players),
    'Matches': np.random.randint(10, 200, n_players),
    'Runs': np.random.randint(100, 8000, n_players),
    'Balls_Faced': np.random.randint(80, 6000, n_players),
    'Wickets': np.random.randint(0, 150, n_players),
    # Stored as float from the start: NaN is injected below, and writing NaN
    # into an int64 column relies on a silent upcast that pandas has deprecated.
    'Catches': np.random.randint(0, 100, n_players).astype(float),
    'Role': np.random.choice(['Batsman', 'Bowler', 'All-rounder'], n_players),
})
# Add some missing values (realistic scenario). The row positions are drawn
# with replacement, so up to 5 distinct players end up with a missing value.
ipl_df.loc[np.random.choice(n_players, 5), 'Catches'] = np.nan

# ========================================
# STEP 2: EXPLORE THE DATA
# ========================================
print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Shape: {ipl_df.shape[0]} players × {ipl_df.shape[1]} features")
print(f"\nData Types:\n{ipl_df.dtypes}")
print(f"\nMissing Values:\n{ipl_df.isnull().sum()}")
print(f"\nStatistics:\n{ipl_df.describe().round(2)}")

# ========================================
# STEP 3: CLEAN THE DATA
# ========================================
# Fill missing catches with the median. The result is assigned back instead of
# calling fillna(inplace=True) on the column selection: that chained in-place
# form is deprecated and does not update the frame under Copy-on-Write.
ipl_df['Catches'] = ipl_df['Catches'].fillna(ipl_df['Catches'].median())

# ========================================
# STEP 4: FEATURE ENGINEERING
# ========================================
# Create new meaningful features using vectorized column arithmetic
ipl_df['Strike_Rate'] = (ipl_df['Runs'] / ipl_df['Balls_Faced'] * 100).round(2)
ipl_df['Runs_Per_Match'] = (ipl_df['Runs'] / ipl_df['Matches']).round(2)
# Handle infinite values (division by zero cases). With the ranges generated
# above the divisors are never zero, so this is a defensive step.
ipl_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Only discard rows whose engineered features are undefined; unrelated columns
# should not cause a player to be dropped.
ipl_df.dropna(subset=['Strike_Rate', 'Runs_Per_Match'], inplace=True)

print("\n" + "=" * 50)
print("AFTER FEATURE ENGINEERING")
print("=" * 50)
print(ipl_df[['Player', 'Team', 'Runs', 'Strike_Rate', 'Runs_Per_Match']].head(10))

# ========================================
# STEP 5: ANALYSIS & INSIGHTS
# ========================================
# Top 5 run scorers
print("\nTOP 5 RUN SCORERS:")
print(ipl_df.nlargest(5, 'Runs')[['Player', 'Team', 'Runs', 'Strike_Rate']])

# Team-wise performance
print("\nTEAM-WISE AVERAGE RUNS:")
team_avg = ipl_df.groupby('Team')['Runs'].mean().sort_values(ascending=False)
print(team_avg.round(0))

# Correlation between features
print("\nFEATURE CORRELATIONS:")
numeric_cols = ['Runs', 'Matches', 'Strike_Rate', 'Runs_Per_Match']
print(ipl_df[numeric_cols].corr().round(2))

# Now that you know the Python tools, learn how to explore and understand your data: