Customer Churn Prediction Project

Customer Churn Prediction

Project Overview

Churn = When a customer stops using your service.

In this project, you will:

Analyze customer data to find churn patterns
Build a machine learning model to predict who will churn
Identify which factors cause customers to leave

Skills you'll practice:

Data cleaning and preparation
Exploratory data analysis
Feature engineering
Machine learning (Logistic Regression, Random Forest)
Model evaluation

Step 1: Setup and Import Libraries

code.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reproducibility: fix NumPy's global random seed
np.random.seed(42)

# Plot appearance: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries imported successfully!")

Step 2: Create Sample Data

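We'll generate a synthetic telecom customer dataset with churn patterns deliberately built in. (The output tables shown throughout this tutorial are illustrative; the exact numbers you get when running the code may differ.)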

code.py

# Build a synthetic customer table.
# NOTE: the np.random calls below run in a fixed order; keep that order so a
# seeded run reproduces the same dataset.
n_customers = 1000

gender_options = ['Male', 'Female']
contract_options = ['Month-to-month', 'One year', 'Two year']
payment_options = ['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check']
yes_no = ['Yes', 'No']

data = {
    'CustomerID': range(1, n_customers + 1),
    'Gender': np.random.choice(gender_options, n_customers),
    'Age': np.random.randint(18, 70, n_customers),
    'Tenure': np.random.randint(1, 72, n_customers),  # months with company
    'MonthlyCharges': np.round(np.random.uniform(20, 100, n_customers), 2),
    'TotalCharges': np.zeros(n_customers),  # placeholder, filled in below
    'Contract': np.random.choice(contract_options, n_customers, p=[0.5, 0.3, 0.2]),
    'PaymentMethod': np.random.choice(payment_options, n_customers),
    'TechSupport': np.random.choice(yes_no, n_customers),
    'OnlineSecurity': np.random.choice(yes_no, n_customers),
}

df = pd.DataFrame(data)

# Total spend so far = months with the company x monthly charge
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Target variable: each risk factor adds a fixed weight to a churn score,
# plus uniform noise; a customer churns when the score exceeds 0.5.
is_month_to_month = (df['Contract'] == 'Month-to-month').astype(int)
is_new_customer = (df['Tenure'] < 12).astype(int)
has_high_charges = (df['MonthlyCharges'] > 70).astype(int)
lacks_tech_support = (df['TechSupport'] == 'No').astype(int)
noise = np.random.uniform(0, 0.25, n_customers)

churn_probability = (
    is_month_to_month * 0.3 +
    is_new_customer * 0.2 +
    has_high_charges * 0.15 +
    lacks_tech_support * 0.1 +
    noise
)

df['Churn'] = churn_probability.gt(0.5).astype(int)

print("Dataset created!")
print(f"Total customers: {len(df)}")
print(f"Churned customers: {df['Churn'].sum()}")
print(f"Churn rate: {df['Churn'].mean():.1%}")

Step 3: Explore the Data

code.py

# Peek at the top of the table
print("=== First 5 Rows ===", df.head(), sep="\n")

CustomerID	Gender	Age	Tenure	MonthlyCharges	Contract	Churn
1	Male	45	24	65.50	One year	0
2	Female	32	5	89.20	Month-to-month	1
3	Male	28	48	45.00	Two year	0
4	Female	55	12	72.30	Month-to-month	1
5	Male	41	36	55.80	One year	0

code.py

# Column-by-column data types
print("\n=== Data Info ===", df.dtypes, sep="\n")

Column	Type
CustomerID	int64
Gender	object
Age	int64
Tenure	int64
MonthlyCharges	float64
TotalCharges	float64
Contract	object
PaymentMethod	object
TechSupport	object
OnlineSecurity	object
Churn	int64

code.py

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
print("\n=== Statistics ===", df.describe(), sep="\n")

Stat	Age	Tenure	MonthlyCharges	TotalCharges
mean	43.5	36.2	60.15	2178.40
std	14.8	20.5	23.10	1890.50
min	18	1	20.00	20.00
max	69	71	99.95	7096.45

Step 4: Analyze Churn Patterns

Churn Rate Overview

code.py

# How many customers stayed (0) vs churned (1), as counts and as percentages
churn_flags = df['Churn']
churn_counts = churn_flags.value_counts()
churn_pct = churn_flags.value_counts(normalize=True).mul(100)

print("=== Churn Distribution ===")
print(f"Stayed (0): {churn_counts[0]} ({churn_pct[0]:.1f}%)")
print(f"Churned (1): {churn_counts[1]} ({churn_pct[1]:.1f}%)")

Status	Count	Percentage
Stayed	650	65.0%
Churned	350	35.0%

code.py

# Visualize churn distribution
# value_counts() orders classes by frequency, not by label. Pin the order to
# [0, 1] so the hard-coded 'Stayed'/'Churned' tick labels and colors always
# match their bars, even if churners ever outnumber the customers who stayed.
class_counts = df['Churn'].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(8, 5))
colors = ['#2ecc71', '#e74c3c']  # green = stayed, red = churned
class_counts.plot(kind='bar', color=colors, edgecolor='black')
plt.title('Customer Churn Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Churn (0=Stayed, 1=Left)')
plt.ylabel('Number of Customers')
plt.xticks([0, 1], ['Stayed', 'Churned'], rotation=0)
plt.tight_layout()
plt.show()

Churn Distribution

Churn by Contract Type

code.py

# Share of churned customers (in %) within each contract type
contract_churn = df.groupby('Contract')['Churn'].mean().mul(100)

print("=== Churn Rate by Contract ===")
ranked_contract_churn = contract_churn.sort_values(ascending=False)
print(ranked_contract_churn)

Contract	Churn Rate
Month-to-month	52.3%
One year	21.5%
Two year	8.2%

code.py

# Horizontal bar chart of churn rate per contract type, smallest rate first
fig, ax = plt.subplots(figsize=(8, 5))
contract_churn.sort_values().plot(kind='barh', ax=ax, color='coral', edgecolor='black')
ax.set_title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Churn Rate (%)')
fig.tight_layout()
plt.show()

Churn by Contract

Insight: Month-to-month customers churn 6x more than two-year contract customers!

Churn by Tenure

code.py

# Create tenure groups
# Bins are right-inclusive: (0, 12], (12, 24], (24, 48], (48, 72]
df['TenureGroup'] = pd.cut(df['Tenure'],
                           bins=[0, 12, 24, 48, 72],
                           labels=['0-12 mo', '12-24 mo', '24-48 mo', '48+ mo'])

# TenureGroup is categorical. Pass `observed` explicitly instead of relying on
# the pandas default: observed=False keeps every tenure group in the result
# (even an empty one) and avoids the FutureWarning newer pandas versions emit
# when the argument is omitted.
tenure_churn = df.groupby('TenureGroup', observed=False)['Churn'].mean() * 100
print("=== Churn Rate by Tenure ===")
print(tenure_churn)

Tenure Group	Churn Rate
0-12 mo	48.5%
12-24 mo	35.2%
24-48 mo	28.1%
48+ mo	18.3%

code.py

# Bar chart of churn rate per tenure group
fig, ax = plt.subplots(figsize=(8, 5))
tenure_churn.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Churn Rate by Customer Tenure', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure Group')
ax.set_ylabel('Churn Rate (%)')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the group labels
fig.tight_layout()
plt.show()

Churn by Tenure

Insight: New customers (0-12 months) are most likely to churn!

Churn by Monthly Charges

code.py

# Compare charges for churned vs stayed
# Both groups share one set of bin edges so their bars line up and can be
# compared directly. Each histogram carries its own label, so the legend is
# tied to the data it describes rather than to plotting order.
plt.figure(figsize=(10, 5))
charges = df['MonthlyCharges']
bin_edges = np.linspace(charges.min(), charges.max(), 21)  # 20 equal-width bins
for churn_flag, label in [(0, 'Stayed'), (1, 'Churned')]:
    plt.hist(charges[df['Churn'] == churn_flag], bins=bin_edges,
             alpha=0.7, label=label)
plt.title('Monthly Charges: Churned vs Stayed', fontsize=14, fontweight='bold')
plt.xlabel('Monthly Charges ($)')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

# Average charges
print("=== Average Monthly Charges ===")
print(df.groupby('Churn')['MonthlyCharges'].mean())

Churn Status	Avg Monthly Charges
Stayed (0)	$55.20
Churned (1)	$71.80

Insight: Churned customers pay ~$17 more per month on average!

Churn by Tech Support

code.py

# Share of churned customers (in %) with and without tech support
support_churn = df.groupby('TechSupport')['Churn'].mean().mul(100)
print("=== Churn Rate by Tech Support ===", support_churn, sep="\n")

Tech Support	Churn Rate
Yes	25.8%
No	44.2%

Insight: Customers without tech support are about 1.7x more likely to churn!

Step 5: Prepare Data for Machine Learning

Encode Categorical Variables

code.py

# Work on a separate frame for modelling, without the identifier and the
# analysis-only tenure bucket (drop returns a new DataFrame; df is untouched)
df_ml = df.drop(columns=['CustomerID', 'TenureGroup'])

# Turn each text category into an integer code. One fitted encoder per column
# is kept so the same mapping can be applied to new customers later.
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']
label_encoders = {col: LabelEncoder().fit(df_ml[col]) for col in categorical_cols}

for col, le in label_encoders.items():
    df_ml[col] = le.transform(df_ml[col])

print("=== Encoded Data ===")
print(df_ml.head())

Gender	Age	Tenure	MonthlyCharges	Contract	TechSupport	Churn
1	45	24	65.50	1	1	0
0	32	5	89.20	0	0	1
1	28	48	45.00	2	1	0
0	55	12	72.30	0	0	1
1	41	36	55.80	1	1	0

Split Features and Target

code.py

# y is the Churn column; X is every other column
y = df_ml['Churn']
X = df_ml.drop(columns='Churn')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

Info	Value
Features shape	(1000, 9)
Target shape	(1000,)
Number of features	9

Train-Test Split

code.py

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same churn rate, and fix random_state so the split is repeatable
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print("=== Data Split ===")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Training churn rate: {y_train.mean():.1%}")
print(f"Test churn rate: {y_test.mean():.1%}")

Set	Samples	Churn Rate
Training	800	35.0%
Test	200	35.0%

Scale Features

code.py

# Learn the scaling parameters from the training data only, then apply the
# same transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")

Step 6: Build Machine Learning Models

Model 1: Logistic Regression

code.py

# Fit a Logistic Regression classifier on the scaled training features
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Predict churn for the held-out customers and measure accuracy
lr_pred = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)

print("=== Logistic Regression Results ===")
print(f"Accuracy: {lr_accuracy:.1%}")

code.py

# Per-class precision / recall / F1 for Logistic Regression
lr_report = classification_report(y_test, lr_pred, target_names=['Stayed', 'Churned'])
print("\nClassification Report:")
print(lr_report)

Class	Precision	Recall	F1-Score
Stayed	0.82	0.88	0.85
Churned	0.75	0.65	0.70
Accuracy			0.80

Model 2: Random Forest

code.py

# Fit a 100-tree Random Forest on the same scaled training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Predict churn for the held-out customers and measure accuracy
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("=== Random Forest Results ===")
print(f"Accuracy: {rf_accuracy:.1%}")

code.py

# Per-class precision / recall / F1 for Random Forest
rf_report = classification_report(y_test, rf_pred, target_names=['Stayed', 'Churned'])
print("\nClassification Report:")
print(rf_report)

Class	Precision	Recall	F1-Score
Stayed	0.85	0.90	0.87
Churned	0.80	0.72	0.76
Accuracy			0.84

Compare Models

code.py

# Random Forest wins only if it is strictly more accurate; a tie goes to
# Logistic Regression
if rf_accuracy > lr_accuracy:
    best_model_name = 'Random Forest'
else:
    best_model_name = 'Logistic Regression'

print("=== Model Comparison ===")
print(f"Logistic Regression: {lr_accuracy:.1%}")
print(f"Random Forest: {rf_accuracy:.1%}")
print(f"\nBest Model: {best_model_name}")

Model	Accuracy
Logistic Regression	80.0%
Random Forest	84.0%
Winner	Random Forest

Model Comparison

Step 7: Confusion Matrix

code.py

# Confusion matrix of the Random Forest predictions
# (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, rf_pred)
class_names = ['Stayed', 'Churned']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

Confusion Matrix

	Predicted: Stayed	Predicted: Churned
Actual: Stayed	117 (TN)	13 (FP)
Actual: Churned	19 (FN)	51 (TP)

Reading the matrix:

117 customers correctly predicted to stay
51 customers correctly predicted to churn
13 false alarms (predicted churn, but stayed)
19 missed churners (predicted stay, but churned)

Step 8: Feature Importance

code.py

# Pair each feature name with the importance the Random Forest assigned to it,
# then rank from most to least important
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
importance = importance_table.sort_values(by='Importance', ascending=False)

print("=== Feature Importance ===")
print(importance)

Rank	Feature	Importance
1	Contract	0.25
2	Tenure	0.22
3	MonthlyCharges	0.18
4	TotalCharges	0.12
5	TechSupport	0.08
6	Age	0.06
7	PaymentMethod	0.04
8	OnlineSecurity	0.03
9	Gender	0.02

code.py

# Horizontal bars; the y-axis is inverted so the first (most important) row
# of the table ends up at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance['Feature'], importance['Importance'], color='teal', edgecolor='black')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Churn Prediction', fontsize=14, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

Feature Importance

Key Finding: Contract type, Tenure, and Monthly Charges are the top 3 factors!

Step 9: Predict New Customers

code.py

# Function to predict churn for new customer
def predict_churn(gender, age, tenure, monthly_charges, contract,
                  payment_method, tech_support, online_security):
    """Predict whether a single new customer will churn.

    Uses the module-level ``label_encoders``, ``scaler`` and ``rf_model``
    fitted in the earlier steps.

    Args:
        gender: Category label as used in the 'Gender' column.
        age: Customer age.
        tenure: Months with the company.
        monthly_charges: Monthly charge; TotalCharges is derived as
            ``tenure * monthly_charges``.
        contract: Category label as used in the 'Contract' column.
        payment_method: Category label as used in the 'PaymentMethod' column.
        tech_support: Category label as used in the 'TechSupport' column.
        online_security: Category label as used in the 'OnlineSecurity' column.

    Returns:
        Tuple ``(prediction, probability)``: the predicted class and the
        second entry of the model's ``predict_proba`` output for this customer.

    Raises:
        ValueError: If a categorical value was not seen when the encoders
            were fitted.
    """
    categorical_values = {
        'Gender': gender,
        'Contract': contract,
        'PaymentMethod': payment_method,
        'TechSupport': tech_support,
        'OnlineSecurity': online_security,
    }

    # Encode categories with the encoders fitted on the training data and
    # fail with a readable message when a value was never seen
    encoded = {}
    for col, value in categorical_values.items():
        encoder = label_encoders[col]
        if value not in encoder.classes_:
            raise ValueError(
                f"Unknown {col} value {value!r}; expected one of {list(encoder.classes_)}"
            )
        encoded[col] = encoder.transform([value])[0]

    # Create dataframe
    new_customer = pd.DataFrame({
        'Gender': [encoded['Gender']],
        'Age': [age],
        'Tenure': [tenure],
        'MonthlyCharges': [monthly_charges],
        'TotalCharges': [tenure * monthly_charges],
        'Contract': [encoded['Contract']],
        'PaymentMethod': [encoded['PaymentMethod']],
        'TechSupport': [encoded['TechSupport']],
        'OnlineSecurity': [encoded['OnlineSecurity']]
    })

    # Put the columns in the order the scaler was fitted on, when the scaler
    # recorded it, so this function does not depend on the literal order above
    fitted_columns = getattr(scaler, 'feature_names_in_', None)
    if fitted_columns is not None:
        new_customer = new_customer[list(fitted_columns)]

    # Scale and predict
    new_scaled = scaler.transform(new_customer)
    prediction = rf_model.predict(new_scaled)[0]
    probability = rf_model.predict_proba(new_scaled)[0][1]

    return prediction, probability

# Try the predictor on two hand-made customer profiles
print("=== Predict New Customers ===")

# Profile expected to be high risk: young, short tenure, high charges, no lock-in
pred, prob = predict_churn(
    gender='Female', age=25, tenure=3, monthly_charges=85,
    contract='Month-to-month', payment_method='Electronic Check',
    tech_support='No', online_security='No',
)
print(f"Customer 1 (High Risk): Churn={pred}, Probability={prob:.1%}")

# Profile expected to be low risk: long tenure, moderate charges, two-year contract
pred, prob = predict_churn(
    gender='Male', age=45, tenure=48, monthly_charges=50,
    contract='Two year', payment_method='Credit Card',
    tech_support='Yes', online_security='Yes',
)
print(f"Customer 2 (Low Risk): Churn={pred}, Probability={prob:.1%}")

Customer	Profile	Churn Prediction	Probability
1	Young, new, high charges, month-to-month	Yes (1)	78%
2	Older, loyal, moderate charges, 2-year	No (0)	12%

Step 10: Business Recommendations

Based on our analysis:

Finding	Recommendation
Month-to-month = high churn	Offer discounts for annual contracts
New customers churn most	Improve onboarding experience
High charges = more churn	Review pricing for premium plans
No tech support = risky	Bundle tech support with plans
First year is critical	Focus retention efforts on new customers

Step 11: Save Model and Results

code.py

import joblib

# Save the model together with every fitted preprocessing object it needs.
# The model expects scaled, label-encoded inputs, so the scaler AND the label
# encoders must be saved for the model to be usable in a new session.
joblib.dump(rf_model, 'churn_model.pkl')
joblib.dump(scaler, 'churn_scaler.pkl')
joblib.dump(label_encoders, 'churn_label_encoders.pkl')

# Save results: actual vs predicted churn for the test customers
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': rf_pred
})
results.to_csv('churn_predictions.csv', index=False)

print("Model and results saved!")

Complete Code

code.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create data (synthetic; seeded so every run produces the same dataset)
np.random.seed(42)
n_customers = 1000

data = {
    'CustomerID': range(1, n_customers + 1),
    'Gender': np.random.choice(['Male', 'Female'], n_customers),
    'Age': np.random.randint(18, 70, n_customers),
    'Tenure': np.random.randint(1, 72, n_customers),
    'MonthlyCharges': np.round(np.random.uniform(20, 100, n_customers), 2),
    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2]),
    'PaymentMethod': np.random.choice(['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'], n_customers),
    'TechSupport': np.random.choice(['Yes', 'No'], n_customers),
    'OnlineSecurity': np.random.choice(['Yes', 'No'], n_customers),
}

df = pd.DataFrame(data)
# Place TotalCharges right after MonthlyCharges so the feature columns are in
# the same order as in the step-by-step version of this project
df.insert(df.columns.get_loc('MonthlyCharges') + 1, 'TotalCharges',
          df['Tenure'] * df['MonthlyCharges'])

# Create churn based on patterns: each risk factor adds a fixed weight to a
# score, plus uniform noise; a customer churns when the score exceeds 0.5
churn_prob = (
    (df['Contract'] == 'Month-to-month').astype(int) * 0.3 +
    (df['Tenure'] < 12).astype(int) * 0.2 +
    (df['MonthlyCharges'] > 70).astype(int) * 0.15 +
    (df['TechSupport'] == 'No').astype(int) * 0.1 +
    np.random.uniform(0, 0.25, n_customers)
)
df['Churn'] = (churn_prob > 0.5).astype(int)

# Prepare for ML
df_ml = df.drop(['CustomerID'], axis=1)
categorical_cols = ['Gender', 'Contract', 'PaymentMethod', 'TechSupport', 'OnlineSecurity']
# Keep the fitted encoders (as the step-by-step version does) so new customers
# can later be encoded with the same category-to-integer mapping
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df_ml[col] = label_encoders[col].fit_transform(df_ml[col])

# Split (stratified so train and test keep the same churn rate)
X = df_ml.drop('Churn', axis=1)
y = df_ml['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale (fit on the training data only, then apply to both sets)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(importance)

print("\nProject Complete!")

What You Learned

Analyzing customer churn patterns
Encoding categorical variables
Building classification models
Evaluating model performance
Interpreting feature importance
Making predictions on new data
Deriving business recommendations

Congratulations! You've built your first churn prediction model!

What's Next?

Try the Stock Price Analysis project to work with time series data.