# Loading W Code...
# 6
# Topics
# sklearn
# Implementation
# Indian Context: Aadhaar face recognition, ISRO satellite image compression, recommendation systems
# Concept Level: Beginner
# THE CURSE OF DIMENSIONALITY
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time
# Compare training time and test accuracy as the number of features grows.
# Every dataset has 1000 samples and exactly 10 informative features
# (n_redundant=0), so raising `d` only widens the input.
dimensions = [10, 50, 100, 200, 500, 1000]
times = []        # fit duration in seconds, one entry per value in `dimensions`
accuracies = []   # held-out accuracy, one entry per value in `dimensions`
for d in dimensions:
    # Generate data (fixed seed keeps each run reproducible)
    X, y = make_classification(
        n_samples=1000, n_features=d,
        n_informative=10, n_redundant=0,
        random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Train model and measure time.
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can jump if the system clock
    # is adjusted and may be too coarse for sub-second fits.
    start = time.perf_counter()
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    times.append(elapsed)
    # Prediction is deliberately outside the timed region: only fitting
    # cost is being compared.
    accuracies.append(accuracy_score(y_test, model.predict(X_test)))
# Plot results: one panel per metric, both drawn against `dimensions`.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# (series, line format, y-axis label, panel title) for the left and right panel
panels = (
    (times, 'ro-', 'Training Time (seconds)',
     'Training Time vs Dimensions'),
    (accuracies, 'bo-', 'Accuracy',
     'Accuracy vs Dimensions (only 10 informative)'),
)
for ax, (series, fmt, y_label, title) in zip(axes, panels):
    ax.plot(dimensions, series, fmt, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Dimensions')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Print a banner followed by the takeaways from the experiment.
rule = "=" * 60
print(rule)
print("OBSERVATIONS")
print(rule)
for note in (
    "• Training time increases with dimensions",
    "• Accuracy may drop despite more features (overfitting)",
    "• Only 10 features were truly informative!",
):
    print(note)