# Loading W Code...
# 6
# Topics
# sklearn
# Implementation
# Indian Context
# SMS Spam Detection, Loan Default Prediction, Disease Screening in Indian Healthcare
# Concept Level: Beginner
# THE PROBLEM WITH LINEAR REGRESSION FOR CLASSIFICATION
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression

# Example: predicting whether a student passes (1) or fails (0)
# based on the number of hours studied.
hours = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
passed = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# scikit-learn estimators take a 2-D feature matrix (n_samples, n_features),
# so reshape the 1-D hours vector once and reuse it for both models.
hours_2d = hours.reshape(-1, 1)

# Inputs outside the training range (1-10 hours); both models are evaluated
# on exactly these rows so the comparison below is like-for-like.
test_hours = np.array([0, 12]).reshape(-1, 1)

# Linear Regression fits an unbounded straight line through the 0/1 labels...
lin_reg = LinearRegression()
lin_reg.fit(hours_2d, passed)

# ...so its predictions fall outside [0, 1] and cannot be read as probabilities.
lin_preds = lin_reg.predict(test_hours)
print("Linear Regression predictions:")
print(f"0 hours: {lin_preds[0]:.2f}")  # Negative!
print(f"12 hours: {lin_preds[1]:.2f}")  # Greater than 1!

# Logistic Regression fixes this: predict_proba returns class probabilities,
# which always lie in [0, 1].
log_reg = LogisticRegression()
log_reg.fit(hours_2d, passed)

# Column 1 of predict_proba is the probability of the positive class (passed == 1).
pass_probs = log_reg.predict_proba(test_hours)[:, 1]
print("\nLogistic Regression predictions:")
print(f"0 hours: {pass_probs[0]:.2f}")  # Near 0
print(f"12 hours: {pass_probs[1]:.2f}")  # Near 1