5 min read
Logistic Regression
Learn to classify with Logistic Regression
Logistic Regression
What is Logistic Regression?
Despite the name, it's for classification (not regression)!
Predicts probability of belonging to a class (0 to 1).
Simple Example
code.py
from sklearn.linear_model import LogisticRegression
import numpy as np

# Toy data set: pass (1) / fail (0) as a function of hours studied.
study_hours = np.arange(1, 9).reshape(-1, 1)
outcomes = np.array([0] * 4 + [1] * 4)

# Fit the classifier on the eight labelled examples.
model = LogisticRegression()
model.fit(study_hours, outcomes)

# Classify an unseen student who studied 4.5 hours.
new_student = np.array([[4.5]])
prediction = model.predict(new_student)
probability = model.predict_proba(new_student)
print(f"Prediction: {'Pass' if prediction[0] == 1 else 'Fail'}")
print(f"Probability: {probability[0][1]:.1%}")
How It Works
- Calculate linear combination: z = wx + b
- Apply sigmoid function: p = 1 / (1 + e^(-z))
- If p ≥ 0.5, predict class 1; else class 0
code.py
# Plot the sigmoid (logistic) curve that squashes any real z into (0, 1).
import matplotlib.pyplot as plt

zs = np.linspace(-10, 10, 100)
curve = 1 / (1 + np.exp(-zs))

plt.plot(zs, curve)
plt.axhline(y=0.5, color='r', linestyle='--')  # the 0.5 decision boundary
plt.xlabel('z')
plt.ylabel('Probability')
plt.title('Sigmoid Function')
plt.show()
Binary Classification
code.py
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Binary task: classify tumours using the breast-cancer data set.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# max_iter is raised so the solver converges on these unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Mean accuracy on the held-out split.
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy:.1%}")
Multiclass Classification
code.py
from sklearn.datasets import load_iris

# Three-class problem; scikit-learn handles multiclass automatically,
# so the API is identical to the binary case.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# One probability per class for the first test sample.
proba = model.predict_proba(X_test[:1])
print(f"Probabilities: {proba[0]}")
print(f"Classes: {iris.target_names}")
Predictions and Probabilities
code.py
# Hard class labels vs. the full per-class probability distribution.
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Inspect the first test sample.
sample_idx = 0
print(f"Predicted class: {predictions[sample_idx]}")
print(f"Probabilities: {probabilities[sample_idx]}")
print(f"Most likely: {iris.target_names[predictions[0]]}")
Confusion Matrix
See where model makes mistakes:
code.py
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Rows are true classes, columns are predicted classes;
# off-diagonal entries are the model's mistakes.
y_pred = model.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)

# The same information rendered as a heatmap.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title('Confusion Matrix')
plt.show()
Classification Metrics
code.py
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X_test)
# average='weighted' aggregates per-class scores — needed here because
# this is the multiclass (3-class iris) model. For a plain binary task
# the average argument can be omitted (it defaults to 'binary').
print(f"Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.1%}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.1%}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.1%}")
Definitions:
- Accuracy: Overall correct predictions
- Precision: Of predicted positive, how many are actually positive
- Recall: Of actual positive, how many were predicted positive
- F1: Balance between precision and recall
Classification Report
All metrics at once:
code.py
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=iris.target_names))
Adjusting Threshold
Default threshold is 0.5, but you can change it:
code.py
# Get the probability of class 1 (second column of predict_proba).
# NOTE(review): thresholding a single class column only makes sense for
# a binary model; the model in scope here is the 3-class iris classifier
# trained above, so this comparison is illustrative only — confirm the
# intended model for this section.
proba = model.predict_proba(X_test)[:, 1] # Probability of class 1
# Custom threshold: require 70% confidence before predicting class 1.
threshold = 0.7
custom_pred = (proba >= threshold).astype(int)
print(f"Default predictions: {model.predict(X_test[:5])}")
print(f"Custom predictions: {custom_pred[:5]}")
Feature Importance
code.py
# Coefficients show feature importance
for feature, coef in zip(iris.feature_names, model.coef_[0]):
    print(f"{feature}: {coef:.3f}")
Complete Example
code.py
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_breast_cancer
import numpy as np

# End-to-end binary classification: load, split, scale, fit, report.
data = load_breast_cancer()
X, y = data.data, data.target
print(f"Features: {len(data.feature_names)}")
print(f"Classes: {data.target_names}")

# Stratified split keeps the class ratio identical in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise each feature; fit the scaler on the training rows only
# so no information leaks from the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("\n=== Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Rank features by coefficient magnitude, largest first.
print("\nTop 5 Important Features:")
importance = np.abs(model.coef_[0])
top_indices = np.argsort(importance)[::-1][:5]
for idx in top_indices:
    print(f" {data.feature_names[idx]}: {model.coef_[0][idx]:.3f}")
Key Points
- Logistic Regression is for classification
- Outputs probabilities (0 to 1)
- Default threshold is 0.5
- Check confusion matrix for mistakes
- Use precision/recall for imbalanced data
- Scale features for better performance
What's Next?
Learn about model evaluation metrics in detail.