5 min read
Creating Summary Reports
Learn to create complete data summaries
Creating Summary Reports
Why Summary Reports?
After EDA, you need to share findings. A good report includes:
- Data overview
- Key statistics
- Important patterns
- Data quality issues
Basic Data Overview
code.py
import pandas as pd
# Sample dataset used by every report below; 'Name' and 'Age' each
# contain one missing entry so the missing-data sections have output.
df = pd.DataFrame(
    data={
        'Name': ['John', 'Sarah', 'Mike', 'Emma', None],
        'Age': [25, 30, None, 35, 28],
        'City': ['NYC', 'LA', 'NYC', 'Chicago', 'LA'],
        'Salary': [50000, 60000, 55000, 70000, 45000],
    },
)
def data_overview(df):
    """Print a short structural overview of a DataFrame.

    Writes the row count, column count, list of column names and the
    dtype of every column to standard output.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        None. The overview is printed, not returned.
    """
    banner = "=" * 40
    print(banner)
    print("DATA OVERVIEW")
    print(banner)
    print(f"Rows: {len(df)}")
    print(f"Columns: {len(df.columns)}")
    print(f"\nColumn names: {list(df.columns)}")
    print("\nData types:")
    print(df.dtypes)
data_overview(df)
Missing Data Report
code.py
def missing_report(df):
    """Print the count and percentage of missing values per column.

    Only columns with at least one missing value are listed. When the
    frame has no missing values a short message is printed instead of an
    empty table. The total number of missing cells is always printed last.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None. The report is printed, not returned.
    """
    print("=" * 40)
    print("MISSING DATA")
    print("=" * 40)
    missing = df.isna().sum()
    total_missing = missing.sum()
    if total_missing > 0:
        affected = missing[missing > 0]
        report = pd.DataFrame({
            'Missing': affected,
            'Percent': (affected / len(df)) * 100,
        })
        print(report)
    else:
        # Filtering an all-zero table would print pandas' "Empty DataFrame"
        # repr; say so explicitly instead.
        print("No missing values!")
    print(f"\nTotal missing values: {total_missing}")
missing_report(df)
Numeric Summary
code.py
def numeric_summary(df):
    """Print mean, median, min and max for every numeric column.

    Columns are chosen with ``select_dtypes(include=['number'])``. If no
    column matches, a one-line note is printed so the section is never
    silently empty.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The summary is printed, not returned.
    """
    print("=" * 40)
    print("NUMERIC COLUMNS")
    print("=" * 40)
    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) == 0:
        print("No numeric columns found.")
        return
    for col in numeric_cols:
        series = df[col]
        print(f"\n{col}:")
        print(f" Mean: {series.mean():.2f}")
        print(f" Median: {series.median():.2f}")
        print(f" Min: {series.min()}")
        print(f" Max: {series.max()}")
numeric_summary(df)
Categorical Summary
code.py
def categorical_summary(df):
    """Print unique count, most common value and top counts per text column.

    Columns are chosen with ``select_dtypes(include=['object'])``. For each
    one the number of distinct values, the most frequent value and up to
    five value counts are printed. A column with no non-missing values is
    reported with ``Most common: N/A`` and no value counts.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The summary is printed, not returned.
    """
    print("=" * 40)
    print("CATEGORICAL COLUMNS")
    print("=" * 40)
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        series = df[col]
        print(f"\n{col}:")
        print(f" Unique values: {series.nunique()}")
        modes = series.mode()
        if modes.empty:
            # mode() is empty when every value is missing, so taking its
            # first element would raise; report the gap and move on.
            print(" Most common: N/A")
            continue
        print(f" Most common: {modes.iloc[0]}")
        print(" Value counts:")
        print(series.value_counts().head(5).to_string())
categorical_summary(df)
Complete EDA Report Function
code.py
def full_eda_report(df):
    """Print a five-section exploratory data analysis report.

    Sections, in order: data shape, missing values per column, summary
    statistics, value counts for object columns, and the correlation
    matrix of the numeric columns. Sections with nothing to show print a
    short note instead of being left blank.

    Args:
        df: The pandas DataFrame to report on.

    Returns:
        None. The report is printed, not returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("EXPLORATORY DATA ANALYSIS REPORT")
    print(rule)

    # 1. Overview
    print("\n1. DATA SHAPE")
    print(f" Rows: {len(df)}")
    print(f" Columns: {len(df.columns)}")

    # 2. Missing data
    print("\n2. MISSING DATA")
    missing = df.isna().sum()
    if missing.sum() > 0:
        for col in df.columns:
            if missing[col] > 0:
                pct = (missing[col] / len(df)) * 100
                print(f" {col}: {missing[col]} ({pct:.1f}%)")
    else:
        print(" No missing values!")

    # 3. Numeric columns
    print("\n3. NUMERIC SUMMARY")
    numeric_df = df.select_dtypes(include=['number'])
    if len(numeric_df.columns) > 0:
        print(df.describe().round(2))
    else:
        # Without numeric columns describe() would summarise the object
        # columns under this heading, or raise on a column-less frame.
        print(" No numeric columns.")

    # 4. Categorical columns
    print("\n4. CATEGORICAL SUMMARY")
    for col in df.select_dtypes(include=['object']).columns:
        print(f"\n {col}:")
        print(df[col].value_counts().to_string())

    # 5. Correlations
    print("\n5. CORRELATIONS")
    if len(numeric_df.columns) > 1:
        print(numeric_df.corr().round(2))
    else:
        print(" Not enough numeric columns for correlations.")

    print("\n" + rule)
    print("END OF REPORT")
    print(rule)
# Run the report
full_eda_report(df)
Save Report to File
code.py
import sys
# Save print output to file
# Explicit UTF-8 so non-ASCII values in the data do not depend on the
# platform's default encoding.
with open('eda_report.txt', 'w', encoding='utf-8') as f:
    # Redirect print to file
    old_stdout = sys.stdout
    sys.stdout = f
    try:
        full_eda_report(df)
    finally:
        # Restore stdout even if the report raises; otherwise every later
        # print would keep targeting the (closed) file.
        sys.stdout = old_stdout
print("Report saved to eda_report.txt")
Key Points
- Start with data overview (shape, types)
- Report missing values
- Summarize numeric columns (mean, median)
- Summarize categorical columns (counts)
- Note any unusual findings
- Save report for sharing
EDA Checklist
- Data size and shape
- Data types
- Missing values
- Duplicates
- Numeric statistics
- Categorical distributions
- Correlations
- Outliers
- Key findings
What's Next?
Congratulations! You've completed EDA. Next, learn Data Visualization to show your findings in charts.