W Code - Pattern-Based DSA Learning Platform

Bhanu Bisht

Information Theory

Entropy, cross-entropy, KL divergence — the information foundation of ML

Topics

Entropy

ML-Core

Python Implementation

import numpy as np

# Section banner for the entropy examples: rule, title, rule.
print("=" * 55, "ENTROPY", "=" * 55, sep="\n")

def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes ``-sum(p * log2(p))`` over the strictly positive entries of
    *probs*. Entries that are not positive are dropped, which implements
    the usual convention ``0 * log2(0) = 0``.

    Args:
        probs: Array-like of probabilities. The values are neither
            validated nor normalised; they are expected to sum to 1.

    Returns:
        The entropy as a NumPy float. A distribution with no uncertainty
        (or an empty input) yields ``0.0``, never ``-0.0``.
    """
    probs = np.asarray(probs, dtype=float)
    probs = probs[probs > 0]  # log2(0) is -inf; such entries contribute nothing
    # Negating an all-zero sum yields -0.0, which formats as "-0.0000".
    # Adding 0.0 maps -0.0 to +0.0 and leaves every other value unchanged.
    return -np.sum(probs * np.log2(probs)) + 0.0

# Binary entropy of a coin as the head probability becomes more lopsided.
print("\nCoin Flip Entropy:")
coin_cases = [
    ("Fair", 0.5),
    ("Biased (0.9)", 0.9),
    ("Biased (0.99)", 0.99),
    ("Certain", 1.0),
]
for label, p_heads in coin_cases:
    bits = entropy([p_heads, 1 - p_heads])
    print(f"  {label:20s}: P(H)={p_heads}, H = {bits:.4f} bits")

# A fair six-sided die: the uniform case, whose entropy equals log2(6).
die_probs = [1 / 6 for _ in range(6)]
die_entropy = entropy(die_probs)
print(f"\nFair die entropy: H = {die_entropy:.4f} bits")
print(f"  (= log₂(6) = {np.log2(6):.4f})")

# Information gain of two candidate decision-tree splits.
print("\n" + "=" * 55, "📊 DECISION TREE: INFORMATION GAIN", "=" * 55, sep="\n")

# Toy dataset: "Should I play cricket today?"
#   14 examples in total: 9 Yes, 5 No
#   Weather feature   -> Sunny (5): 2 Yes / 3 No
#                        Rainy (5): 3 Yes / 2 No
#                        Cloudy (4): 4 Yes / 0 No

# Entropy of the label before any split.
H_parent = entropy([9 / 14, 5 / 14])
print(f"\nParent entropy H(Play) = {H_parent:.4f}")

# Entropy of the label inside each Weather branch.
H_sunny = entropy([2 / 5, 3 / 5])
H_rainy = entropy([3 / 5, 2 / 5])
H_cloudy = entropy([4 / 4])  # pure branch: every example is Yes

# H(Play | Weather): branch entropies weighted by the share of examples
# that fall into each branch.
H_conditional = sum(
    share * branch_entropy
    for share, branch_entropy in (
        (5 / 14, H_sunny),
        (5 / 14, H_rainy),
        (4 / 14, H_cloudy),
    )
)
IG = H_parent - H_conditional

print(f"\nWeather split:")
print(f"  H(Play|Sunny) = {H_sunny:.4f} (2Y, 3N)")
print(f"  H(Play|Rainy) = {H_rainy:.4f} (3Y, 2N)")
print(f"  H(Play|Cloudy) = {H_cloudy:.4f} (4Y, 0N)")
print(f"\n  H(Play|Weather) = {H_conditional:.4f}")
print(f"  Information Gain = {H_parent:.4f} - {H_conditional:.4f} = {IG:.4f}")
print(f"  → Split on Weather reduces uncertainty by {IG:.4f} bits!")

# Second candidate feature, for comparison.
#   Temperature -> Hot (4): 2 Yes / 2 No
#                  Mild (6): 4 Yes / 2 No
#                  Cool (4): 3 Yes / 1 No
H_hot = entropy([2 / 4, 2 / 4])
H_mild = entropy([4 / 6, 2 / 6])
H_cool = entropy([3 / 4, 1 / 4])
H_temp = sum(
    share * branch_entropy
    for share, branch_entropy in (
        (4 / 14, H_hot),
        (6 / 14, H_mild),
        (4 / 14, H_cool),
    )
)
IG_temp = H_parent - H_temp

# The feature with the larger information gain is the better first split.
print(f"\n  Temperature IG = {IG_temp:.4f}")
print(f"  Weather IG = {IG:.4f}")
print(f"  → Split on {'Weather' if IG > IG_temp else 'Temperature'} first! (higher IG)")

Key Takeaways

Entropy H(X) = -Σ pᵢ log₂(pᵢ) measures uncertainty/randomness (in bits when the logarithm is base 2)

H = 0 means the outcome is certain; H = log₂(n) is the maximum, reached when all n outcomes are equally likely (uniform)

Information gain = H(parent) - weighted average of H(children) → used in decision trees

Higher IG → better split → chosen first in tree building

Entropy is always non-negative

import numpy as np

print("=" * 55)
print("ENTROPY")
print("=" * 55)


def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes ``-sum(p * log2(p))`` over the strictly positive entries of
    *probs*; entries that are not positive are dropped (``0 * log2(0) = 0``).

    Args:
        probs: Array-like of probabilities. The values are neither
            validated nor normalised; they are expected to sum to 1.

    Returns:
        The entropy as a NumPy float; ``0.0`` when there is no uncertainty.
    """
    probs = np.asarray(probs, dtype=float)
    probs = probs[probs > 0]  # Remove zeros
    # Adding 0.0 turns the -0.0 produced by negating a zero sum into +0.0,
    # so certain outcomes print as "0.0000" rather than "-0.0000".
    return -np.sum(probs * np.log2(probs)) + 0.0


# Coin examples
print("\nCoin Flip Entropy:")
for name, p in [("Fair", 0.5), ("Biased (0.9)", 0.9), ("Biased (0.99)", 0.99), ("Certain", 1.0)]:
    h = entropy([p, 1 - p])
    print(f"  {name:20s}: P(H)={p}, H = {h:.4f} bits")

# Die entropy
die_probs = [1 / 6] * 6
print(f"\nFair die entropy: H = {entropy(die_probs):.4f} bits")
print(f"  (= log₂(6) = {np.log2(6):.4f})")

# Decision Tree Information Gain
print("\n" + "=" * 55)
print("📊 DECISION TREE: INFORMATION GAIN")
print("=" * 55)

# Should I play cricket today?
# 14 examples: 9 Yes, 5 No
# Feature: Weather (Sunny: 5 total, Rainy: 5, Cloudy: 4)
# Sunny: 2 Yes, 3 No
# Rainy: 3 Yes, 2 No
# Cloudy: 4 Yes, 0 No

# Parent entropy
H_parent = entropy([9 / 14, 5 / 14])
print(f"\nParent entropy H(Play) = {H_parent:.4f}")

# Conditional entropy H(Play | Weather)
H_sunny = entropy([2 / 5, 3 / 5])
H_rainy = entropy([3 / 5, 2 / 5])
H_cloudy = entropy([4 / 4])  # All Yes = 0

# Branch entropies weighted by the fraction of examples in each branch.
H_conditional = (5 / 14) * H_sunny + (5 / 14) * H_rainy + (4 / 14) * H_cloudy
IG = H_parent - H_conditional

print(f"\nWeather split:")
print(f"  H(Play|Sunny) = {H_sunny:.4f} (2Y, 3N)")
print(f"  H(Play|Rainy) = {H_rainy:.4f} (3Y, 2N)")
print(f"  H(Play|Cloudy) = {H_cloudy:.4f} (4Y, 0N)")
print(f"\n  H(Play|Weather) = {H_conditional:.4f}")
print(f"  Information Gain = {H_parent:.4f} - {H_conditional:.4f} = {IG:.4f}")
print(f"  → Split on Weather reduces uncertainty by {IG:.4f} bits!")

# Compare with another feature
# Temperature: Hot (4), Mild (6), Cool (4)
# Hot: 2Y, 2N; Mild: 4Y, 2N; Cool: 3Y, 1N
H_hot = entropy([2 / 4, 2 / 4])
H_mild = entropy([4 / 6, 2 / 6])
H_cool = entropy([3 / 4, 1 / 4])
H_temp = (4 / 14) * H_hot + (6 / 14) * H_mild + (4 / 14) * H_cool
IG_temp = H_parent - H_temp
print(f"\n  Temperature IG = {IG_temp:.4f}")
print(f"  Weather IG = {IG:.4f}")
print(f"  → Split on {'Weather' if IG > IG_temp else 'Temperature'} first! (higher IG)")