#1 Data Analytics Program in India
₹2,499₹1,499Enroll Now
20 min read

Customer Churn Prediction

Build a machine learning model to predict customer churn

Customer Churn Prediction Project

Customer Churn Prediction

Project Overview

Churn = When a customer stops using your service.

In this project, you will:

  • Analyze customer data to find churn patterns
  • Build a machine learning model to predict who will churn
  • Identify which factors cause customers to leave

Skills you'll practice:

  • Data cleaning and preparation
  • Exploratory data analysis
  • Feature engineering
  • Machine learning (Logistic Regression, Random Forest)
  • Model evaluation

Step 1: Setup and Import Libraries

code.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Settings
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# older releases only know the un-prefixed name. Use whichever one this
# installation provides instead of failing on an unknown style.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette('husl')
np.random.seed(42)  # fixed seed so the generated sample data is reproducible

print("Libraries imported successfully!")

Step 2: Create Sample Data

We'll create realistic telecom customer data:

code.py
# Build a synthetic telecom customer table
n_customers = 1000

# The random columns are drawn in the order written here, so reordering
# the entries would change the generated values for a given seed.
data = dict(
    CustomerID=range(1, n_customers + 1),
    Gender=np.random.choice(['Male', 'Female'], n_customers),
    Age=np.random.randint(18, 70, n_customers),
    Tenure=np.random.randint(1, 72, n_customers),  # months with company
    MonthlyCharges=np.round(np.random.uniform(20, 100, n_customers), 2),
    TotalCharges=np.zeros(n_customers),  # placeholder, filled in below
    Contract=np.random.choice(['Month-to-month', 'One year', 'Two year'],
                              n_customers, p=[0.5, 0.3, 0.2]),
    PaymentMethod=np.random.choice(
        ['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'],
        n_customers),
    TechSupport=np.random.choice(['Yes', 'No'], n_customers),
    OnlineSecurity=np.random.choice(['Yes', 'No'], n_customers),
)

df = pd.DataFrame(data)

# Lifetime spend = months with the company x monthly bill
df['TotalCharges'] = df['Tenure'].mul(df['MonthlyCharges'])

# Churn score: each risk factor adds a fixed weight, plus random noise
is_monthly = df['Contract'].eq('Month-to-month').astype(int)
is_new = df['Tenure'].lt(12).astype(int)
is_expensive = df['MonthlyCharges'].gt(70).astype(int)
lacks_support = df['TechSupport'].eq('No').astype(int)
noise = np.random.uniform(0, 0.25, n_customers)

churn_probability = (
    is_monthly * 0.3 + is_new * 0.2 + is_expensive * 0.15
    + lacks_support * 0.1 + noise
)

# A customer is labelled as churned once the score exceeds 0.5
df['Churn'] = churn_probability.gt(0.5).astype(int)

print("Dataset created!")
print(f"Total customers: {len(df)}")
print(f"Churned customers: {df['Churn'].sum()}")
print(f"Churn rate: {df['Churn'].mean():.1%}")

Step 3: Explore the Data

code.py
# Peek at the first five records
print("=== First 5 Rows ===")
print(df.head(n=5))
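| CustomerID | Gender | Age | Tenure | MonthlyCharges | Contract       | Churn |
|------------|--------|-----|--------|----------------|----------------|-------|
| 1          | Male   | 45  | 24     | 65.50          | One year       | 0     |
| 2          | Female | 32  | 5      | 89.20          | Month-to-month | 1     |
| 3          | Male   | 28  | 48     | 45.00          | Two year       | 0     |
| 4          | Female | 55  | 12     | 72.30          | Month-to-month | 1     |
| 5          | Male   | 41  | 36     | 55.80          | One year       | 0     |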
code.py
# Column names alongside their pandas dtypes
print("\n=== Data Info ===", df.dtypes, sep="\n")
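| Column         | Type    |
|----------------|---------|
| CustomerID     | int64   |
| Gender         | object  |
| Age            | int64   |
| Tenure         | int64   |
| MonthlyCharges | float64 |
| TotalCharges   | float64 |
| Contract       | object  |
| PaymentMethod  | object  |
| TechSupport    | object  |
| OnlineSecurity | object  |
| Churn          | int64   |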
code.py
# Descriptive statistics for the numeric columns
summary_stats = df.describe()
print("\n=== Statistics ===")
print(summary_stats)
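| Stat | Age  | Tenure | MonthlyCharges | TotalCharges |
|------|------|--------|----------------|--------------|
| mean | 43.5 | 36.2   | 60.15          | 2178.40      |
| std  | 14.8 | 20.5   | 23.10          | 1890.50      |
| min  | 18   | 1      | 20.00          | 20.00        |
| max  | 69   | 71     | 99.95          | 7096.45      |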

Step 4: Analyze Churn Patterns

Churn Rate Overview

code.py
# How many customers stayed vs. churned, as counts and as percentages
churn_flags = df['Churn']
churn_counts = churn_flags.value_counts()
churn_pct = churn_flags.value_counts(normalize=True).mul(100)

print("=== Churn Distribution ===")
print(f"Stayed (0): {churn_counts[0]} ({churn_pct[0]:.1f}%)")
print(f"Churned (1): {churn_counts[1]} ({churn_pct[1]:.1f}%)")
| Status  | Count | Percentage |
|---------|-------|------------|
| Stayed  | 650   | 65.0%      |
| Churned | 350   | 35.0%      |
code.py
# Visualize churn distribution
plt.figure(figsize=(8, 5))
colors = ['#2ecc71', '#e74c3c']
# value_counts() orders its result by frequency, not by label. Pin the order
# to 0 (stayed) then 1 (churned) so the fixed colors and tick labels below
# always describe the right bar, even if churners outnumber stayers.
churn_by_class = df['Churn'].value_counts().reindex([0, 1], fill_value=0)
churn_by_class.plot(kind='bar', color=colors, edgecolor='black')
plt.title('Customer Churn Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Churn (0=Stayed, 1=Left)')
plt.ylabel('Number of Customers')
plt.xticks([0, 1], ['Stayed', 'Churned'], rotation=0)
plt.tight_layout()
plt.show()

Churn Distribution


Churn by Contract Type

code.py
# Percentage of churners within each contract type, highest first
contract_churn = df.groupby('Contract')['Churn'].mean().mul(100)
print("=== Churn Rate by Contract ===")
print(contract_churn.sort_values(ascending=False))
| Contract       | Churn Rate |
|----------------|------------|
| Month-to-month | 52.3%      |
| One year       | 21.5%      |
| Two year       | 8.2%       |
code.py
# Horizontal bars, sorted ascending so the highest churn rate ends up on top
fig, ax = plt.subplots(figsize=(8, 5))
contract_churn.sort_values().plot(kind='barh', color='coral',
                                  edgecolor='black', ax=ax)
ax.set_title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Churn Rate (%)')
fig.tight_layout()
plt.show()

Churn by Contract

Insight: Month-to-month customers churn 6x more than two-year contract customers!


Churn by Tenure

code.py
# Create tenure groups
# pd.cut bins are right-inclusive: (0, 12], (12, 24], (24, 48], (48, 72]
df['TenureGroup'] = pd.cut(df['Tenure'],
                           bins=[0, 12, 24, 48, 72],
                           labels=['0-12 mo', '12-24 mo', '24-48 mo', '48+ mo'])

# TenureGroup is categorical. Passing observed=False explicitly keeps every
# tenure bucket in the result (even an empty one) and avoids the pandas
# FutureWarning about the changing default for categorical groupers.
tenure_churn = df.groupby('TenureGroup', observed=False)['Churn'].mean() * 100
print("=== Churn Rate by Tenure ===")
print(tenure_churn)
| Tenure Group | Churn Rate |
|--------------|------------|
| 0-12 mo      | 48.5%      |
| 12-24 mo     | 35.2%      |
| 24-48 mo     | 28.1%      |
| 48+ mo       | 18.3%      |
code.py
# Bar chart of churn rate per tenure bucket
fig, ax = plt.subplots(figsize=(8, 5))
tenure_churn.plot(kind='bar', color='steelblue', edgecolor='black', ax=ax)
ax.set_title('Churn Rate by Customer Tenure', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure Group')
ax.set_ylabel('Churn Rate (%)')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

Churn by Tenure

Insight: New customers (0-12 months) are most likely to churn!


Churn by Monthly Charges

code.py
# Monthly charges split by churn status (group 0 = stayed, group 1 = churned)
charges_by_churn = df.groupby('Churn')['MonthlyCharges']

plt.figure(figsize=(10, 5))
# One semi-transparent histogram per churn class, overlaid on the same axes
charges_by_churn.plot(kind='hist', alpha=0.7, bins=20, legend=True)
plt.title('Monthly Charges: Churned vs Stayed', fontsize=14, fontweight='bold')
plt.xlabel('Monthly Charges ($)')
# Replace the automatic legend entries with readable class names
plt.legend(['Stayed', 'Churned'])
plt.tight_layout()
plt.show()

# Mean monthly bill for each churn class
print("=== Average Monthly Charges ===")
print(charges_by_churn.mean())
| Churn Status | Avg Monthly Charges |
|--------------|---------------------|
| Stayed (0)   | $55.20              |
| Churned (1)  | $71.80              |

Insight: Churned customers pay ~$16 more per month on average!


Churn by Tech Support

code.py
# Percentage of churners among customers with and without tech support
support_churn = df.groupby('TechSupport')['Churn'].mean().mul(100)
print("=== Churn Rate by Tech Support ===")
print(support_churn)
| Tech Support | Churn Rate |
|--------------|------------|
| Yes          | 25.8%      |
| No           | 44.2%      |

Insight: Customers without tech support are almost 2x more likely to churn!


Step 5: Prepare Data for Machine Learning

Encode Categorical Variables

code.py
# Work on a separate frame so the exploration DataFrame stays untouched;
# the ID and the derived tenure bucket are not used as model features.
df_ml = df.drop(columns=['CustomerID', 'TenureGroup'])

# Text columns that must become integer codes before modelling
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']

# Fit one encoder per column and keep it, so the same mapping can be
# re-applied to new customers later.
label_encoders = {col: LabelEncoder().fit(df_ml[col]) for col in categorical_cols}
for col, encoder in label_encoders.items():
    df_ml[col] = encoder.transform(df_ml[col])

print("=== Encoded Data ===")
print(df_ml.head())
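| Gender | Age | Tenure | MonthlyCharges | Contract | TechSupport | Churn |
|--------|-----|--------|----------------|----------|-------------|-------|
| 1      | 45  | 24     | 65.50          | 1        | 1           | 0     |
| 0      | 32  | 5      | 89.20          | 0        | 0           | 1     |
| 1      | 28  | 48     | 45.00          | 2        | 1           | 0     |
| 0      | 55  | 12     | 72.30          | 0        | 0           | 1     |
| 1      | 41  | 36     | 55.80          | 1        | 1           | 0     |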

Split Features and Target

code.py
# Everything except the label is a feature; the label column is the target
target_col = 'Churn'
X = df_ml.drop(columns=target_col)
y = df_ml[target_col]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")
| Info               | Value     |
|--------------------|-----------|
| Features shape     | (1000, 9) |
| Target shape       | (1000,)   |
| Number of features | 9         |

Train-Test Split

code.py
# Hold out 20% of the rows for testing; stratify=y keeps the churn ratio
# the same in both parts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("=== Data Split ===")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Training churn rate: {y_train.mean():.1%}")
print(f"Test churn rate: {y_test.mean():.1%}")
| Set      | Samples | Churn Rate |
|----------|---------|------------|
| Training | 800     | 35.0%      |
| Test     | 200     | 35.0%      |

Scale Features

code.py
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")

Step 6: Build Machine Learning Models

Model 1: Logistic Regression

code.py
# Fit the logistic regression baseline on the scaled training data
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)

# Predicted labels for the held-out customers
lr_pred = lr_model.predict(X_test_scaled)

# Share of test customers classified correctly
lr_accuracy = accuracy_score(y_test, lr_pred)
print("=== Logistic Regression Results ===")
print(f"Accuracy: {lr_accuracy:.1%}")
code.py
# Per-class precision, recall and F1 for the logistic regression model
lr_report = classification_report(
    y_test, lr_pred, target_names=['Stayed', 'Churned']
)
print("\nClassification Report:")
print(lr_report)
| Class    | Precision | Recall | F1-Score |
|----------|-----------|--------|----------|
| Stayed   | 0.82      | 0.88   | 0.85     |
| Churned  | 0.75      | 0.65   | 0.70     |
| Accuracy |           |        | 0.80     |

Model 2: Random Forest

code.py
# Fit a 100-tree random forest on the same scaled training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predicted labels for the held-out customers
rf_pred = rf_model.predict(X_test_scaled)

# Share of test customers classified correctly
rf_accuracy = accuracy_score(y_test, rf_pred)
print("=== Random Forest Results ===")
print(f"Accuracy: {rf_accuracy:.1%}")
code.py
# Per-class precision, recall and F1 for the random forest model
rf_report = classification_report(
    y_test, rf_pred, target_names=['Stayed', 'Churned']
)
print("\nClassification Report:")
print(rf_report)
| Class    | Precision | Recall | F1-Score |
|----------|-----------|--------|----------|
| Stayed   | 0.85      | 0.90   | 0.87     |
| Churned  | 0.80      | 0.72   | 0.76     |
| Accuracy |           |        | 0.84     |

Compare Models

code.py
# The forest only wins when it is strictly more accurate; a tie goes to
# logistic regression.
best_model = 'Random Forest' if rf_accuracy > lr_accuracy else 'Logistic Regression'

print("=== Model Comparison ===")
print(f"Logistic Regression: {lr_accuracy:.1%}")
print(f"Random Forest: {rf_accuracy:.1%}")
print(f"\nBest Model: {best_model}")
| Model               | Accuracy      |
|---------------------|---------------|
| Logistic Regression | 80.0%         |
| Random Forest       | 84.0%         |
| Winner              | Random Forest |

Model Comparison


Step 7: Confusion Matrix

code.py
# Rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_test, rf_pred)

class_names = ['Stayed', 'Churned']
fig, ax = plt.subplots(figsize=(8, 6))
# annot + fmt='d' prints the integer count inside each cell
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

Confusion Matrix

|                 | Predicted: Stayed | Predicted: Churned |
|-----------------|-------------------|--------------------|
| Actual: Stayed  | 117 (TN)          | 13 (FP)            |
| Actual: Churned | 19 (FN)           | 51 (TP)            |

Reading the matrix:

  • 117 customers correctly predicted to stay
  • 51 customers correctly predicted to churn
  • 13 false alarms (predicted churn, but stayed)
  • 19 missed churners (predicted stay, but churned)

Step 8: Feature Importance

code.py
# Pair every feature name with the importance the forest assigned to it,
# then rank from most to least important.
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
importance = importance.sort_values(by='Importance', ascending=False)

print("=== Feature Importance ===")
print(importance)
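| Rank | Feature        | Importance |
|------|----------------|------------|
| 1    | Contract       | 0.25       |
| 2    | Tenure         | 0.22       |
| 3    | MonthlyCharges | 0.18       |
| 4    | TotalCharges   | 0.12       |
| 5    | TechSupport    | 0.08       |
| 6    | Age            | 0.06       |
| 7    | PaymentMethod  | 0.04       |
| 8    | OnlineSecurity | 0.03       |
| 9    | Gender         | 0.02       |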
code.py
# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance['Feature'], importance['Importance'],
        color='teal', edgecolor='black')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Churn Prediction', fontsize=14, fontweight='bold')
# barh draws the first row at the bottom; flip the axis so the most
# important feature is shown at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

Feature Importance

Key Finding: Contract type, Tenure, and Monthly Charges are the top 3 factors!


Step 9: Predict New Customers

code.py
# Function to predict churn for new customer
def predict_churn(gender, age, tenure, monthly_charges, contract,
                  payment_method, tech_support, online_security):
    """Predict churn for a single new customer.

    The categorical inputs are converted with the fitted ``label_encoders``,
    the resulting one-row frame is scaled with the fitted ``scaler`` and then
    scored by ``rf_model``.

    Args:
        gender: Raw gender label, passed to the 'Gender' encoder.
        age: Customer age.
        tenure: Months with the company.
        monthly_charges: Monthly bill; also used with ``tenure`` to derive
            TotalCharges.
        contract: Raw contract label, passed to the 'Contract' encoder.
        payment_method: Raw label, passed to the 'PaymentMethod' encoder.
        tech_support: Raw label, passed to the 'TechSupport' encoder.
        online_security: Raw label, passed to the 'OnlineSecurity' encoder.

    Returns:
        A ``(prediction, probability)`` tuple: the class predicted by
        ``rf_model.predict`` and the second column of ``predict_proba``
        (the churn class, assuming the model was trained on 0/1 labels).
    """
    def encode(column, raw_value):
        # Map one raw category string to the integer code used in training
        return label_encoders[column].transform([raw_value])[0]

    # Column order mirrors the feature frame built in the preparation step
    features = {
        'Gender': encode('Gender', gender),
        'Age': age,
        'Tenure': tenure,
        'MonthlyCharges': monthly_charges,
        'TotalCharges': tenure * monthly_charges,
        'Contract': encode('Contract', contract),
        'PaymentMethod': encode('PaymentMethod', payment_method),
        'TechSupport': encode('TechSupport', tech_support),
        'OnlineSecurity': encode('OnlineSecurity', online_security),
    }
    new_customer = pd.DataFrame({name: [value] for name, value in features.items()})

    # Apply the training-time scaling before asking the model
    new_scaled = scaler.transform(new_customer)
    prediction = rf_model.predict(new_scaled)[0]
    probability = rf_model.predict_proba(new_scaled)[0][1]

    return prediction, probability

# Try the predictor on two contrasting customer profiles
print("=== Predict New Customers ===")

# Young, brand-new, expensive month-to-month plan, no support: expected high risk
pred, prob = predict_churn(
    gender='Female', age=25, tenure=3, monthly_charges=85,
    contract='Month-to-month', payment_method='Electronic Check',
    tech_support='No', online_security='No',
)
print(f"Customer 1 (High Risk): Churn={pred}, Probability={prob:.1%}")

# Long-standing customer on a two-year contract with support: expected low risk
pred, prob = predict_churn(
    gender='Male', age=45, tenure=48, monthly_charges=50,
    contract='Two year', payment_method='Credit Card',
    tech_support='Yes', online_security='Yes',
)
print(f"Customer 2 (Low Risk): Churn={pred}, Probability={prob:.1%}")
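| Customer | Profile                                  | Churn Prediction | Probability |
|----------|------------------------------------------|------------------|-------------|
| 1        | Young, new, high charges, month-to-month | Yes (1)          | 78%         |
| 2        | Older, loyal, moderate charges, 2-year   | No (0)           | 12%         |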

Step 10: Business Recommendations

Based on our analysis:

| Finding                     | Recommendation                           |
|-----------------------------|------------------------------------------|
| Month-to-month = high churn | Offer discounts for annual contracts     |
| New customers churn most    | Improve onboarding experience            |
| High charges = more churn   | Review pricing for premium plans         |
| No tech support = risky     | Bundle tech support with plans           |
| First year is critical      | Focus retention efforts on new customers |

Step 11: Save Model and Results

code.py
import joblib

# Save the model together with every fitted preprocessing object.
# The label encoders turn raw category strings into the integer codes the
# model was trained on; without them the saved model and scaler cannot
# score new customers in a fresh session.
joblib.dump(rf_model, 'churn_model.pkl')
joblib.dump(scaler, 'churn_scaler.pkl')
joblib.dump(label_encoders, 'churn_label_encoders.pkl')

# Save the test-set predictions next to the true labels
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': rf_pred
})
results.to_csv('churn_predictions.csv', index=False)

print("Model and results saved!")

Complete Code

code.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create data
np.random.seed(42)  # fixed seed so every run generates the same customers
n_customers = 1000

data = {
    'CustomerID': range(1, n_customers + 1),
    'Gender': np.random.choice(['Male', 'Female'], n_customers),
    'Age': np.random.randint(18, 70, n_customers),
    'Tenure': np.random.randint(1, 72, n_customers),
    'MonthlyCharges': np.round(np.random.uniform(20, 100, n_customers), 2),
    # Placeholder (draws no random numbers) that fixes the column position so
    # the feature order matches the step-by-step walkthrough.
    'TotalCharges': np.zeros(n_customers),
    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2]),
    'PaymentMethod': np.random.choice(['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'], n_customers),
    'TechSupport': np.random.choice(['Yes', 'No'], n_customers),
    'OnlineSecurity': np.random.choice(['Yes', 'No'], n_customers),
}

df = pd.DataFrame(data)
# Lifetime spend = months with the company x monthly bill
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Create churn based on patterns: each risk factor adds a fixed weight,
# random noise is added on top, and scores above 0.5 count as churned.
churn_prob = (
    (df['Contract'] == 'Month-to-month').astype(int) * 0.3 +
    (df['Tenure'] < 12).astype(int) * 0.2 +
    (df['MonthlyCharges'] > 70).astype(int) * 0.15 +
    (df['TechSupport'] == 'No').astype(int) * 0.1 +
    np.random.uniform(0, 0.25, n_customers)
)
df['Churn'] = (churn_prob > 0.5).astype(int)

# Prepare for ML: drop the identifier and turn text categories into codes
df_ml = df.drop(['CustomerID'], axis=1)
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']
for col in categorical_cols:
    df_ml[col] = LabelEncoder().fit_transform(df_ml[col])

# Split (stratified so both parts keep the same churn ratio)
X = df_ml.drop('Churn', axis=1)
y = df_ml['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale: statistics are learned from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print("\nClassification Report:")
# Name the classes so the report reads the same as in the walkthrough
print(classification_report(y_test, y_pred, target_names=['Stayed', 'Churned']))

# Feature importance, ranked from most to least important
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(importance)

print("\nProject Complete!")

What You Learned

  • Analyzing customer churn patterns
  • Encoding categorical variables
  • Building classification models
  • Evaluating model performance
  • Interpreting feature importance
  • Making predictions on new data
  • Deriving business recommendations

Congratulations! You've built your first churn prediction model!

What's Next?

Try the Stock Price Analysis project to work with time series data.

SkillsetMaster - AI, Web Development & Data Analytics Courses