#1 Data Analytics Program in India
₹2,499 ₹1,499 — Enroll Now
5 min read

Feature Engineering Basics

Learn to create better features for your models

Feature Engineering Basics

What is Feature Engineering?

Creating new features from existing data to improve model performance.

Good features = Better predictions!

Handling Categorical Data

Label Encoding

Convert categories to numbers:

code.py
from sklearn.preprocessing import LabelEncoder

# Map each distinct category to an integer id (assigned alphabetically).
colors = ['red', 'blue', 'green', 'red', 'blue']

encoded = LabelEncoder().fit_transform(colors)
print(encoded)  # [2, 0, 1, 2, 0]

Problem: Model may think blue(0) < green(1) < red(2)

One-Hot Encoding

Create binary columns:

code.py
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red']})

# Method 1: pandas — one indicator column per distinct category
encoded = pd.get_dummies(df, columns=['color'])
print(encoded)

# Method 2: sklearn — dense array of 0/1 indicators
one_hot = OneHotEncoder(sparse_output=False)
encoded = one_hot.fit_transform(df[['color']])
print(encoded)

Feature Scaling

Standardization (Z-score)

Mean=0, Std=1:

code.py
from sklearn.preprocessing import StandardScaler

# Two features on very different scales.
data = [[100, 0.5], [200, 1.0], [150, 0.8]]

# Rescale each column to mean 0 and standard deviation 1 (z-scores).
scaled = StandardScaler().fit_transform(data)
print(scaled)

Use for: Most algorithms (SVM, Neural Networks, KNN)

Min-Max Scaling

Scale to range [0, 1]:

code.py
from sklearn.preprocessing import MinMaxScaler

# Same sample as the standardization example, repeated here so this
# snippet runs on its own (it previously relied on `data` being defined
# by the earlier snippet and raised NameError when run standalone).
data = [[100, 0.5], [200, 1.0], [150, 0.8]]

scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)  # each column mapped to the [0, 1] range
print(scaled)

Use for: Neural networks, image data

Creating New Features

Mathematical Combinations

code.py
import pandas as pd

# Rectangle dimensions to derive geometric features from.
df = pd.DataFrame({
    'length': [10, 20, 15],
    'width': [5, 8, 6],
})

# Derived features: combine the raw columns into richer signals.
df = df.assign(
    area=lambda d: d['length'] * d['width'],
    perimeter=lambda d: 2 * (d['length'] + d['width']),
    ratio=lambda d: d['length'] / d['width'],
)

print(df)

Binning (Discretization)

Convert continuous to categorical:

code.py
import pandas as pd

df = pd.DataFrame({'age': [22, 35, 45, 18, 60, 28]})

# Bucket the continuous ages into ordered categories.
# pd.cut uses right-inclusive intervals by default: (0,25], (25,40], ...
edges = [0, 25, 40, 60, 100]
names = ['Young', 'Adult', 'Middle', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=edges, labels=names)
print(df)

Polynomial Features

code.py
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

X = np.array([[2, 3], [4, 5]])

# Expand to every monomial of the inputs up to degree 2,
# without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)

print(f"Original: {X.shape}")  # (2, 2)
print(f"Polynomial: {X_poly.shape}")  # (2, 5)
# Columns produced: x1, x2, x1^2, x1*x2, x2^2

Handling Missing Values

code.py
from sklearn.impute import SimpleImputer
import numpy as np

# One missing value in each column.
data = [[1, 2], [np.nan, 3], [7, np.nan]]

# Replace every NaN with its column's mean.
imputer = SimpleImputer(strategy='mean')
filled = imputer.fit_transform(data)
print(filled)

# Other strategies: 'median', 'most_frequent', 'constant'

Date Features

code.py
import pandas as pd

df = pd.DataFrame({
    'date': pd.to_datetime(['2024-01-15', '2024-06-20', '2024-12-25'])
})

# Decompose each timestamp into model-friendly numeric parts.
dt = df['date'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['dayofweek'] = dt.dayofweek  # Monday == 0
df['is_weekend'] = dt.dayofweek >= 5  # Saturday (5) or Sunday (6)

print(df)

Text Features

code.py
import pandas as pd

df = pd.DataFrame({
    'text': ['Hello world', 'Machine learning is fun', 'Python']
})

# Simple length-based statistics extracted from the raw strings.
words = df['text'].str.split()
df['word_count'] = words.str.len()
df['char_count'] = df['text'].str.len()
df['avg_word_length'] = df['char_count'] / df['word_count']

print(df)

Feature Selection

Remove irrelevant features:

code.py
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

# Keep only the 2 features with the highest ANOVA F-scores vs. the target.
selector = SelectKBest(f_classif, k=2)
selector.fit(X, y)
X_new = selector.transform(X)

print(f"Original: {X.shape}")  # (150, 4)
print(f"Selected: {X_new.shape}")  # (150, 2)

# Boolean mask: True marks the columns that survived selection
print(f"Selected features: {selector.get_support()}")

Using Pipelines

Combine all preprocessing:

code.py
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Toy data so the snippet runs on its own — previously X_train, y_train
# and X_test were used without being defined anywhere in this example.
rng = np.random.default_rng(0)
X_train = rng.normal(size=(20, 3))
y_train = (X_train[:, 0] > 0).astype(int)
X_test = rng.normal(size=(5, 3))

# Chain preprocessing with the model: the pipeline fits each step in order,
# and reuses the SAME fitted transforms at predict time (no train/test leakage).
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Fit everything at once
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)

Complete Example

code.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # was missing: SimpleImputer is used below
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Sample data
df = pd.DataFrame({
    'age': [25, 35, 45, np.nan, 30, 55],
    'income': [50000, 80000, 120000, 60000, 70000, 150000],
    'city': ['NYC', 'LA', 'NYC', 'Chicago', 'LA', 'NYC'],
    'purchased': [0, 1, 1, 0, 0, 1]
})

# Feature engineering: income relative to age. NaN where age is missing;
# the numeric imputer in the pipeline fills it in.
df['income_per_year'] = df['income'] / df['age']

# Separate features and target (the engineered column is now actually used —
# it was previously created and then dropped from X)
X = df[['age', 'income', 'income_per_year', 'city']]
y = df['purchased']

# Define transformers
numeric_features = ['age', 'income', 'income_per_year']
categorical_features = ['city']

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Split and train. random_state makes the tiny demo reproducible; stratify
# guarantees both classes appear in train AND test (6 rows, 30% test split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
pipeline.fit(X_train, y_train)

print(f"Accuracy: {pipeline.score(X_test, y_test):.0%}")

Key Points

  • One-hot encode categorical features
  • Scale numerical features
  • Create new features from existing ones
  • Handle missing values before training
  • Extract features from dates and text
  • Use pipelines to combine all steps
  • Select only useful features

Module Complete!

You've learned Statistics and Machine Learning basics:

  • Descriptive statistics
  • Probability
  • Hypothesis testing
  • ML fundamentals
  • Model evaluation
  • Cross-validation
  • Feature engineering

Now you're ready for real data science projects!