5 min read
Probability Distributions
Learn about common probability distributions
Probability Distributions
What is a Distribution?
A distribution shows how values are spread out.
Think of it as: "What values are likely? What values are rare?"
Normal Distribution
The famous bell curve:
- Most values near the center
- Fewer values at the edges
- Symmetric
code.py
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Draw 1000 samples from a normal distribution.
# NumPy names the mean `loc` and the standard deviation `scale`.
sample_mean, sample_std, sample_size = 100, 15, 1000
data = np.random.normal(loc=sample_mean, scale=sample_std, size=sample_size)

# density=True normalizes the histogram so the bars integrate to 1.
hist_options = {"bins": 30, "density": True, "alpha": 0.7}
plt.hist(data, **hist_options)
plt.title('Normal Distribution')
plt.show()
Key Properties
code.py
# Empirical (68-95-99.7) rule for a normal distribution
mean, std = 100, 15

# Roughly 68% of values fall within one standard deviation of the mean
low, high = mean - std, mean + std
print(f"68%: {low} to {high}")  # 85 to 115

# Roughly 95% fall within two standard deviations
low, high = mean - 2 * std, mean + 2 * std
print(f"95%: {low} to {high}")  # 70 to 130

# 99.7% of data within 3 std
print(f"99.7%: {mean - 3*std} to {mean + 3*std}") # 55 to 145
Calculating Normal Probabilities
code.py
from scipy.stats import norm
# IQ scores follow a normal distribution with mean=100, std=15.
# "Freeze" the distribution once so the parameters are not repeated.
iq = norm(loc=100, scale=15)

# P(IQ < 115): the CDF gives the probability of falling below a value
p = iq.cdf(115)
print(f"P(IQ < 115): {p:.2%}")  # 84.13%

# P(IQ > 130): complement of the CDF
p = 1 - iq.cdf(130)
print(f"P(IQ > 130): {p:.2%}")  # 2.28%

# P(85 < IQ < 115): difference of two CDF values
p = iq.cdf(115) - iq.cdf(85)
print(f"P(85 < IQ < 115): {p:.2%}") # 68.27%
Standard Normal (Z-scores)
Convert any normal to standard (mean=0, std=1):
code.py
# Imported here so this snippet runs on its own, like the other
# distribution examples that import what they use.
from scipy.stats import norm

# Z-score formula: z = (x - mean) / std
# The z-score says how many standard deviations x lies from the mean.
x = 115     # observed value
mean = 100  # distribution mean
std = 15    # distribution standard deviation
z = (x - mean) / std
print(f"Z-score: {z}") # 1.0

# Find percentile from z-score.
# With no loc/scale arguments, norm is the standard normal (mean=0, std=1),
# so its CDF at z is the fraction of values below x.
percentile = norm.cdf(z)
print(f"Percentile: {percentile:.2%}") # 84.13%
Binomial Distribution
For yes/no outcomes repeated n times:
code.py
from scipy.stats import binom
# Experiment: flip a fair coin 10 times and count the heads.
n = 10   # number of trials
p = 0.5  # probability of success (heads) on each trial
coin_flips = binom(n, p)  # frozen distribution: parameters fixed once

# P(X = 7): probability of exactly 7 heads
prob = coin_flips.pmf(7)
print(f"P(7 heads): {prob:.2%}")  # 11.72%

# P(X >= 7) = 1 - P(X <= 6)
prob = 1 - coin_flips.cdf(6)
print(f"P(7+ heads): {prob:.2%}") # 17.19%
Poisson Distribution
For the number of events in a fixed interval of time or space:
code.py
from scipy.stats import poisson
# Scenario: customers arrive at an average of 3 per hour.
lambda_rate = 3  # average rate
arrivals = poisson(lambda_rate)  # frozen distribution with that rate

# P(X = 5): exactly 5 customers in one hour
prob = arrivals.pmf(5)
print(f"P(5 customers): {prob:.2%}")  # 10.08%

# P(X >= 5) = 1 - P(X <= 4)
prob = 1 - arrivals.cdf(4)
print(f"P(5+ customers): {prob:.2%}") # 18.47%
Uniform Distribution
All values equally likely:
code.py
from scipy.stats import uniform
# Random number drawn uniformly between 0 and 10.
a = 0   # min
b = 10  # max

# SciPy parameterizes uniform as [loc, loc + scale], so scale is the width.
width = b - a
dist = uniform(loc=a, scale=width)

# P(3 < X < 7)
prob = dist.cdf(7) - dist.cdf(3)
print(f"P(3 < X < 7): {prob:.2%}") # 40.00%
Exponential Distribution
Time between events:
code.py
from scipy.stats import expon
# Customers arrive on average once every 20 minutes.
scale = 20  # average time between arrivals (SciPy's scale = 1 / rate)
waiting_time = expon(scale=scale)

# P(next customer in < 10 min)
prob = waiting_time.cdf(10)
print(f"P(< 10 min): {prob:.2%}") # 39.35%
Comparing Distributions
| Distribution | Use When |
|---|---|
| Normal | Continuous data, bell-shaped |
| Binomial | Yes/no trials, fixed number of trials |
| Poisson | Count of events in a fixed interval |
| Uniform | All values equally likely |
| Exponential | Time between events |
Complete Example
code.py
import numpy as np
from scipy import stats
# Exam scores are modelled as normally distributed.
mean = 75  # average score
std = 10   # standard deviation
exam = stats.norm(mean, std)  # frozen distribution shared by the queries below

# Score needed for the top 10%: the 90th percentile (inverse CDF)
top_10_score = exam.ppf(0.90)
print(f"Top 10% score: {top_10_score:.1f}")  # 87.8

# Share of students scoring between 65 and 85
percent = exam.cdf(85) - exam.cdf(65)
print(f"Between 65-85: {percent:.1%}")  # 68.3%

# Simulate a sample to check the theoretical value
np.random.seed(42)  # fixed seed so the run is reproducible
scores = np.random.normal(mean, std, 1000)
in_range = (scores >= 65) & (scores <= 85)
actual_percent = in_range.mean()  # fraction of scores inside the range
print(f"Actual sample: {actual_percent:.1%}")
Key Points
- Normal: Bell curve, most common
- Binomial: Count successes in n trials
- Poisson: Count events in fixed period
- Use cdf for P(X ≤ x)
- Use ppf for finding cutoff values
- Use pmf for exact probability (discrete)
- Use pdf for density (continuous)
What's Next?
Learn about hypothesis testing.