20 min read
Customer Churn Prediction
Build a machine learning model to predict customer churn
Customer Churn Prediction Project

Project Overview
Churn = When a customer stops using your service.
In this project, you will:
- Analyze customer data to find churn patterns
- Build a machine learning model to predict who will churn
- Identify which factors cause customers to leave
Skills you'll practice:
- Data cleaning and preparation
- Exploratory data analysis
- Feature engineering
- Machine learning (Logistic Regression, Random Forest)
- Model evaluation
Step 1: Setup and Import Libraries
code.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Settings
# Global plotting defaults: a matplotlib style sheet plus a seaborn palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
# Seed NumPy's global RNG so the np.random-based sample data is reproducible.
np.random.seed(42)
print("Libraries imported successfully!")
Step 2: Create Sample Data
We'll create realistic telecom customer data:
code.py
# Build the synthetic telecom customer table used by the rest of the tutorial.
n_customers = 1000

# Keyword order is significant: it fixes the order of the random draws and the
# column order of the resulting DataFrame.
data = dict(
    CustomerID=range(1, n_customers + 1),
    Gender=np.random.choice(['Male', 'Female'], n_customers),
    Age=np.random.randint(18, 70, n_customers),
    Tenure=np.random.randint(1, 72, n_customers),  # months with company
    MonthlyCharges=np.round(np.random.uniform(20, 100, n_customers), 2),
    TotalCharges=np.zeros(n_customers),  # placeholder, filled in below
    Contract=np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2]
    ),
    PaymentMethod=np.random.choice(
        ['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'], n_customers
    ),
    TechSupport=np.random.choice(['Yes', 'No'], n_customers),
    OnlineSecurity=np.random.choice(['Yes', 'No'], n_customers),
)
df = pd.DataFrame(data)

# Total spend is tenure (months) times the monthly charge.
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Churn target: each risk flag adds a fixed weight to a score, uniform noise
# in [0, 0.25) is added on top, and scores above 0.5 are labelled as churn.
risk_weights = (
    (df['Contract'] == 'Month-to-month', 0.3),
    (df['Tenure'] < 12, 0.2),
    (df['MonthlyCharges'] > 70, 0.15),
    (df['TechSupport'] == 'No', 0.1),
)
churn_probability = sum(flag.astype(int) * weight for flag, weight in risk_weights)
churn_probability = churn_probability + np.random.uniform(0, 0.25, n_customers)
df['Churn'] = (churn_probability > 0.5).astype(int)

print("Dataset created!")
print(f"Total customers: {len(df)}")
print(f"Churned customers: {df['Churn'].sum()}")
print(f"Churn rate: {df['Churn'].mean():.1%}")
Step 3: Explore the Data
code.py
# First look
print("=== First 5 Rows ===")
print(df.head())
| CustomerID | Gender | Age | Tenure | MonthlyCharges | Contract | Churn |
|---|---|---|---|---|---|---|
| 1 | Male | 45 | 24 | 65.50 | One year | 0 |
| 2 | Female | 32 | 5 | 89.20 | Month-to-month | 1 |
| 3 | Male | 28 | 48 | 45.00 | Two year | 0 |
| 4 | Female | 55 | 12 | 72.30 | Month-to-month | 1 |
| 5 | Male | 41 | 36 | 55.80 | One year | 0 |
code.py
# Data types and info
print("\n=== Data Info ===")
print(df.dtypes)
| Column | Type |
|---|---|
| CustomerID | int64 |
| Gender | object |
| Age | int64 |
| Tenure | int64 |
| MonthlyCharges | float64 |
| TotalCharges | float64 |
| Contract | object |
| PaymentMethod | object |
| TechSupport | object |
| OnlineSecurity | object |
| Churn | int64 |
code.py
# Statistics for numerical columns
print("\n=== Statistics ===")
print(df.describe())
| Stat | Age | Tenure | MonthlyCharges | TotalCharges |
|---|---|---|---|---|
| mean | 43.5 | 36.2 | 60.15 | 2178.40 |
| std | 14.8 | 20.5 | 23.10 | 1890.50 |
| min | 18 | 1 | 20.00 | 20.00 |
| max | 69 | 71 | 99.95 | 7096.45 |
Step 4: Analyze Churn Patterns
Churn Rate Overview
code.py
# Overall churn distribution: absolute counts and percentage share per class.
churn_flags = df['Churn']
churn_counts = churn_flags.value_counts()
churn_pct = churn_flags.value_counts(normalize=True) * 100

print("=== Churn Distribution ===")
print(f"Stayed (0): {churn_counts[0]} ({churn_pct[0]:.1f}%)")
print(f"Churned (1): {churn_counts[1]} ({churn_pct[1]:.1f}%)")
| Status | Count | Percentage |
|---|---|---|
| Stayed | 650 | 65.0% |
| Churned | 350 | 35.0% |
code.py
# Visualize churn distribution
plt.figure(figsize=(8, 5))
colors = ['#2ecc71', '#e74c3c']  # green = stayed, red = churned
# value_counts() orders rows by frequency, not by label. Sorting by index
# guarantees bar 0 is class 0 ("Stayed") and bar 1 is class 1 ("Churned"), so
# the colors and tick labels below stay correct even if churners ever
# outnumber retained customers.
df['Churn'].value_counts().sort_index().plot(kind='bar', color=colors, edgecolor='black')
plt.title('Customer Churn Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Churn (0=Stayed, 1=Left)')
plt.ylabel('Number of Customers')
plt.xticks([0, 1], ['Stayed', 'Churned'], rotation=0)
plt.tight_layout()
plt.show()
Churn by Contract Type
code.py
# Churn rate (in percent) for each contract type.
churn_by_contract = df.groupby('Contract')['Churn']
contract_churn = churn_by_contract.mean() * 100

print("=== Churn Rate by Contract ===")
print(contract_churn.sort_values(ascending=False))
| Contract | Churn Rate |
|---|---|
| Month-to-month | 52.3% |
| One year | 21.5% |
| Two year | 8.2% |
code.py
# Horizontal bar chart of churn rate per contract type, sorted ascending.
plt.figure(figsize=(8, 5))
ranked_contract_churn = contract_churn.sort_values()
ax = ranked_contract_churn.plot.barh(color='coral', edgecolor='black')
ax.set_title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Churn Rate (%)')
plt.tight_layout()
plt.show()
Insight: Month-to-month customers churn 6x more than two-year contract customers!
Churn by Tenure
code.py
# Create tenure groups
# pd.cut uses right-closed intervals by default, so the buckets are
# (0, 12], (12, 24], (24, 48] and (48, 72] months.
df['TenureGroup'] = pd.cut(
    df['Tenure'],
    bins=[0, 12, 24, 48, 72],
    labels=['0-12 mo', '12-24 mo', '24-48 mo', '48+ mo'],
)
# TenureGroup is categorical: pass observed explicitly so every bucket is
# reported (as it is today) and pandas does not emit its FutureWarning about
# the changing default.
tenure_churn = df.groupby('TenureGroup', observed=False)['Churn'].mean() * 100
print("=== Churn Rate by Tenure ===")
print(tenure_churn)
| Tenure Group | Churn Rate |
|---|---|
| 0-12 mo | 48.5% |
| 12-24 mo | 35.2% |
| 24-48 mo | 28.1% |
| 48+ mo | 18.3% |
code.py
# Bar chart of churn rate per tenure bucket.
plt.figure(figsize=(8, 5))
ax = tenure_churn.plot.bar(color='steelblue', edgecolor='black')
ax.set_title('Churn Rate by Customer Tenure', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure Group')
ax.set_ylabel('Churn Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Insight: New customers (0-12 months) are most likely to churn!
Churn by Monthly Charges
code.py
# Compare charges for churned vs stayed
plt.figure(figsize=(10, 5))
# Draws one semi-transparent histogram of MonthlyCharges per Churn value
# on the current figure.
df.groupby('Churn')['MonthlyCharges'].plot(kind='hist', alpha=0.7,
bins=20, legend=True)
plt.title('Monthly Charges: Churned vs Stayed', fontsize=14, fontweight='bold')
plt.xlabel('Monthly Charges ($)')
# Replaces the automatic legend entries with readable names.
# NOTE(review): assumes the groups were plotted in the order 0 (stayed),
# then 1 (churned) — verify against groupby's key ordering.
plt.legend(['Stayed', 'Churned'])
plt.tight_layout()
plt.show()
# Average charges
print("=== Average Monthly Charges ===")
print(df.groupby('Churn')['MonthlyCharges'].mean())
| Churn Status | Avg Monthly Charges |
|---|---|
| Stayed (0) | $55.20 |
| Churned (1) | $71.80 |
Insight: Churned customers pay about $16.60 more per month on average!
Churn by Tech Support
code.py
# Churn rate (in percent) split by whether the customer has tech support.
churn_by_support = df.groupby('TechSupport')['Churn']
support_churn = churn_by_support.mean() * 100

print("=== Churn Rate by Tech Support ===")
print(support_churn)
| Tech Support | Churn Rate |
|---|---|
| Yes | 25.8% |
| No | 44.2% |
Insight: Customers without tech support are about 1.7x as likely to churn!
Step 5: Prepare Data for Machine Learning
Encode Categorical Variables
code.py
# Modelling copy of the data: drop() returns a new frame, so the exploratory
# dataframe keeps its readable labels. CustomerID and TenureGroup are left out.
df_ml = df.drop(columns=['CustomerID', 'TenureGroup'])

# Encode categorical variables as integer codes, one fitted encoder per column.
# The encoders are kept so new records can be encoded the same way later.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; confirm it is the desired choice for input features here.
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']
label_encoders = {column: LabelEncoder() for column in categorical_cols}
for column, encoder in label_encoders.items():
    df_ml[column] = encoder.fit_transform(df_ml[column])

print("=== Encoded Data ===")
print(df_ml.head())
| Gender | Age | Tenure | MonthlyCharges | Contract | TechSupport | Churn |
|---|---|---|---|---|---|---|
| 1 | 45 | 24 | 65.50 | 1 | 1 | 0 |
| 0 | 32 | 5 | 89.20 | 0 | 0 | 1 |
| 1 | 28 | 48 | 45.00 | 2 | 1 | 0 |
| 0 | 55 | 12 | 72.30 | 0 | 0 | 1 |
| 1 | 41 | 36 | 55.80 | 1 | 1 | 0 |
Split Features and Target
code.py
# Target (y) is the Churn column; features (X) are every remaining column.
y = df_ml['Churn']
X = df_ml.drop(columns='Churn')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")
| Info | Value |
|---|---|
| Features shape | (1000, 9) |
| Target shape | (1000,) |
| Number of features | 9 |
Train-Test Split
code.py
# Hold out 20% of the rows for testing. stratify=y keeps the churn ratio the
# same in both splits; random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("=== Data Split ===")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Training churn rate: {y_train.mean():.1%}")
print(f"Test churn rate: {y_test.mean():.1%}")
| Set | Samples | Churn Rate |
|---|---|---|
| Training | 800 | 35.0% |
| Test | 200 | 35.0% |
Scale Features
code.py
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled successfully!")
Step 6: Build Machine Learning Models
Model 1: Logistic Regression
code.py
# Fit a logistic regression on the scaled training features.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Score the held-out test split and measure plain accuracy.
lr_pred = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)

print("=== Logistic Regression Results ===")
print(f"Accuracy: {lr_accuracy:.1%}")
code.py
# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, lr_pred, target_names=['Stayed', 'Churned']))
| Class | Precision | Recall | F1-Score |
|---|---|---|---|
| Stayed | 0.82 | 0.88 | 0.85 |
| Churned | 0.75 | 0.65 | 0.70 |
| Accuracy | | | 0.80 |
Model 2: Random Forest
code.py
# Fit a 100-tree random forest on the same scaled training features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Score the held-out test split and measure plain accuracy.
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("=== Random Forest Results ===")
print(f"Accuracy: {rf_accuracy:.1%}")
code.py
# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, rf_pred, target_names=['Stayed', 'Churned']))
| Class | Precision | Recall | F1-Score |
|---|---|---|---|
| Stayed | 0.86 | 0.90 | 0.88 |
| Churned | 0.80 | 0.73 | 0.76 |
| Accuracy | | | 0.84 |
Compare Models
code.py
# Print both accuracies side by side.
print("=== Model Comparison ===")
for model_name, model_accuracy in (
    ("Logistic Regression", lr_accuracy),
    ("Random Forest", rf_accuracy),
):
    print(f"{model_name}: {model_accuracy:.1%}")
print(f"\nBest Model: {'Random Forest' if rf_accuracy > lr_accuracy else 'Logistic Regression'}")
| Model | Accuracy |
|---|---|
| Logistic Regression | 80.0% |
| Random Forest | 84.0% |
| Winner | Random Forest |

Step 7: Confusion Matrix
code.py
# Confusion matrix of the Random Forest predictions on the test split,
# drawn as an annotated heatmap (rows = actual, columns = predicted).
class_names = ['Stayed', 'Churned']
cm = confusion_matrix(y_test, rf_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()
| | Predicted: Stayed | Predicted: Churned |
|---|---|---|
| Actual: Stayed | 117 (TN) | 13 (FP) |
| Actual: Churned | 19 (FN) | 51 (TP) |
Reading the matrix:
- 117 customers correctly predicted to stay
- 51 customers correctly predicted to churn
- 13 false alarms (predicted churn, but stayed)
- 19 missed churners (predicted stay, but churned)
Step 8: Feature Importance
code.py
# Pair each feature name with the Random Forest's importance score and rank
# the features from most to least important.
importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
)
importance = importance_table.sort_values(by='Importance', ascending=False)

print("=== Feature Importance ===")
print(importance)
| Rank | Feature | Importance |
|---|---|---|
| 1 | Contract | 0.25 |
| 2 | Tenure | 0.22 |
| 3 | MonthlyCharges | 0.18 |
| 4 | TotalCharges | 0.12 |
| 5 | TechSupport | 0.08 |
| 6 | Age | 0.06 |
| 7 | PaymentMethod | 0.04 |
| 8 | OnlineSecurity | 0.03 |
| 9 | Gender | 0.02 |
code.py
# Horizontal bar chart of the ranked feature importances.
feature_names = importance['Feature']
feature_scores = importance['Importance']

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_scores, color='teal', edgecolor='black')
plt.xlabel('Importance')
plt.title('Feature Importance for Churn Prediction', fontsize=14, fontweight='bold')
# Flip the y-axis so the first (highest-ranked) row is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
Key Finding: Contract type, Tenure, and Monthly Charges are the top 3 factors!
Step 9: Predict New Customers
code.py
# Function to predict churn for new customer
def predict_churn(gender, age, tenure, monthly_charges, contract,
                  payment_method, tech_support, online_security):
    """Score a single customer with the trained Random Forest.

    Categorical arguments are given as raw labels (for example 'Male' or
    'Two year') and are encoded with the fitted ``label_encoders``. The
    one-row frame is then scaled with ``scaler`` and passed to ``rf_model``.

    Returns:
        A ``(prediction, probability)`` tuple: the predicted class and the
        value at index 1 of the model's ``predict_proba`` output.

    Raises:
        Whatever the fitted encoder raises when a label was not seen during
        fitting.
    """
    encoded_columns = ('Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity')

    # Raw values, in the column order the feature frame is built with.
    raw_values = {
        'Gender': gender,
        'Age': age,
        'Tenure': tenure,
        'MonthlyCharges': monthly_charges,
        'TotalCharges': tenure * monthly_charges,
        'Contract': contract,
        'PaymentMethod': payment_method,
        'TechSupport': tech_support,
        'OnlineSecurity': online_security,
    }

    single_row = {}
    for column, value in raw_values.items():
        if column in encoded_columns:
            value = label_encoders[column].transform([value])[0]
        single_row[column] = [value]
    new_customer = pd.DataFrame(single_row)

    # Scale and predict
    new_scaled = scaler.transform(new_customer)
    predicted_class = rf_model.predict(new_scaled)[0]
    churn_proba = rf_model.predict_proba(new_scaled)[0][1]
    return predicted_class, churn_proba
# Try the helper on two hand-written customer profiles.
print("=== Predict New Customers ===")

# High risk customer
high_risk_profile = dict(
    gender='Female', age=25, tenure=3, monthly_charges=85,
    contract='Month-to-month', payment_method='Electronic Check',
    tech_support='No', online_security='No',
)
pred, prob = predict_churn(**high_risk_profile)
print(f"Customer 1 (High Risk): Churn={pred}, Probability={prob:.1%}")

# Low risk customer
low_risk_profile = dict(
    gender='Male', age=45, tenure=48, monthly_charges=50,
    contract='Two year', payment_method='Credit Card',
    tech_support='Yes', online_security='Yes',
)
pred, prob = predict_churn(**low_risk_profile)
print(f"Customer 2 (Low Risk): Churn={pred}, Probability={prob:.1%}")
| Customer | Profile | Churn Prediction | Probability |
|---|---|---|---|
| 1 | Young, new, high charges, month-to-month | Yes (1) | 78% |
| 2 | Older, loyal, moderate charges, 2-year | No (0) | 12% |
Step 10: Business Recommendations
Based on our analysis:
| Finding | Recommendation |
|---|---|
| Month-to-month = high churn | Offer discounts for annual contracts |
| New customers churn most | Improve onboarding experience |
| High charges = more churn | Review pricing for premium plans |
| No tech support = risky | Bundle tech support with plans |
| First year is critical | Focus retention efforts on new customers |
Step 11: Save Model and Results
code.py
import joblib

# Save the model together with every fitted preprocessing object needed to
# score raw customer records after reloading.
joblib.dump(rf_model, 'churn_model.pkl')
joblib.dump(scaler, 'churn_scaler.pkl')
# predict_churn() encodes categorical inputs through label_encoders, so a
# reloaded model cannot be fed raw labels unless the encoders are saved too.
joblib.dump(label_encoders, 'churn_label_encoders.pkl')

# Save results: actual vs. predicted class for each test-split row.
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': rf_pred,
})
results.to_csv('churn_predictions.csv', index=False)
print("Model and results saved!")
Complete Code
code.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Create data
# Seed first so every draw below is reproducible. The keyword order fixes both
# the order of the random draws and the DataFrame column order.
np.random.seed(42)
n_customers = 1000
data = dict(
    CustomerID=range(1, n_customers + 1),
    Gender=np.random.choice(['Male', 'Female'], n_customers),
    Age=np.random.randint(18, 70, n_customers),
    Tenure=np.random.randint(1, 72, n_customers),
    MonthlyCharges=np.round(np.random.uniform(20, 100, n_customers), 2),
    Contract=np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2]
    ),
    PaymentMethod=np.random.choice(
        ['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'], n_customers
    ),
    TechSupport=np.random.choice(['Yes', 'No'], n_customers),
    OnlineSecurity=np.random.choice(['Yes', 'No'], n_customers),
)
df = pd.DataFrame(data)
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Create churn based on patterns: weighted risk flags plus uniform noise,
# labelled as churn when the score exceeds 0.5.
risk_weights = (
    (df['Contract'] == 'Month-to-month', 0.3),
    (df['Tenure'] < 12, 0.2),
    (df['MonthlyCharges'] > 70, 0.15),
    (df['TechSupport'] == 'No', 0.1),
)
churn_prob = (
    sum(flag.astype(int) * weight for flag, weight in risk_weights)
    + np.random.uniform(0, 0.25, n_customers)
)
df['Churn'] = (churn_prob > 0.5).astype(int)

# Prepare for ML: drop the identifier and integer-encode the text columns.
df_ml = df.drop(columns=['CustomerID'])
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']
for column in categorical_cols:
    df_ml[column] = LabelEncoder().fit_transform(df_ml[column])

# Split (stratified so both splits keep the same churn ratio)
y = df_ml['Churn']
X = df_ml.drop(columns='Churn')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale: fit on the training split only, then transform both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance, ranked from most to least important
importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(importance)
print("\nProject Complete!")
What You Learned
- Analyzing customer churn patterns
- Encoding categorical variables
- Building classification models
- Evaluating model performance
- Interpreting feature importance
- Making predictions on new data
- Deriving business recommendations
Congratulations! You've built your first churn prediction model!
What's Next?
Try the Stock Price Analysis project to work with time series data.