name	Cohort Analysis
description	Track and analyze user cohorts over time, calculate retention rates, and identify behavioral patterns for customer lifecycle and retention analysis

Cohort Analysis

Cohort analysis tracks groups of users with shared characteristics over time, revealing patterns in retention, engagement, and lifetime value.

Core Concepts

Cohort: Group of users sharing a characteristic (signup date, region, etc.)
Cohort Size: Initial group size
Retention Rate: Percentage remaining active
Churn Rate: Percentage who left
Retention Curve: How a cohort's retention rate declines over time

Cohort Types

Acquisition Date: Users grouped by signup period
Behavioral: Users grouped by actions taken
Revenue: Users grouped by purchase value
Geographic: Users grouped by location
Demographic: Users grouped by characteristics

Implementation with Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create sample user lifecycle data: one row per user per active month.
np.random.seed(42)  # fixed seed so the generated sample is reproducible

# Generate user data
n_users = 5000
users = []

for user_id in range(n_users):
    # Signup month is drawn from 1..12; lifetime is Poisson(6) + 1, so every
    # user has at least one active month.
    signup_month = np.random.choice(range(1, 13))
    lifetime_months = np.random.poisson(6) + 1

    # `month` counts months since signup, starting at 1 for the signup month.
    users.extend(
        {
            'user_id': user_id,
            'signup_month': signup_month,
            'month': month,
            'active': 1,
        }
        for month in range(1, lifetime_months + 1)
    )

df = pd.DataFrame(users)

# Add derived columns
df['cohort_month'] = df['signup_month']
df['cohort_age'] = df['month']  # Could be day, week, etc.

# Calendar month in which each activity row falls (first day of that month).
# The date must advance with `month`; deriving it from `signup_month` alone
# would stamp every row of a user with the same date, so grouping by date
# could never show activity over time.
months_since_jan_2023 = df['signup_month'] + df['month'] - 2  # 0 == Jan 2023
df['date'] = pd.to_datetime(pd.DataFrame({
    'year': 2023 + months_since_jan_2023 // 12,
    'month': months_since_jan_2023 % 12 + 1,
    'day': 1,
}))

print("User Data Summary:")
print(df.head(10))

# 1. Cohort Table (Retention Matrix)
# Count distinct users for every (cohort_month, cohort_age) pair.
cohort_data = (
    df.groupby(['cohort_month', 'cohort_age'])['user_id']
    .nunique()
    .reset_index(name='unique_users')
)

# Reshape to one row per cohort and one column per cohort age.
cohort_pivot = cohort_data.pivot(
    index='cohort_month',
    columns='cohort_age',
    values='unique_users',
)

print("\nCohort Sizes (Raw User Counts):")
print(cohort_pivot)

# 2. Cohort Retention (as percentage of cohort size)
# The first age column is taken as each cohort's starting size.
first_age = cohort_pivot.columns[0]
cohort_size = cohort_pivot[first_age]

# Row-wise division: each cohort's counts relative to its own starting size.
retention_table = cohort_pivot.div(cohort_size, axis='index').mul(100)

print("\nCohort Retention Rate (%):")
print(retention_table.round(1))

# 3. Visualize Retention Matrix
# Two stacked heatmaps: raw user counts on top, retention percentages below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))
counts_ax, rates_ax = axes

# Heatmap of raw counts
sns.heatmap(
    cohort_pivot,
    annot=True,
    fmt='g',
    cmap='YlOrRd',
    cbar_kws={'label': 'User Count'},
    ax=counts_ax,
)
counts_ax.set_title('Cohort Sizes - User Counts')
counts_ax.set_xlabel('Cohort Age (Months)')
counts_ax.set_ylabel('Cohort Month')

# Heatmap of retention rates, colour scale pinned to the 0-100 % range
sns.heatmap(
    retention_table,
    annot=True,
    fmt='.0f',
    cmap='RdYlGn',
    vmin=0,
    vmax=100,
    cbar_kws={'label': 'Retention %'},
    ax=rates_ax,
)
rates_ax.set_title('Cohort Retention Rates (%)')
rates_ax.set_xlabel('Cohort Age (Months)')
rates_ax.set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 4. Retention Curve
fig, ax = plt.subplots(figsize=(12, 6))

# One line per cohort, limited to the first 8 cohorts in the table.
plotted_cohorts = cohort_pivot.index[:8]
for cohort_month, cohort_retention in retention_table.loc[plotted_cohorts].iterrows():
    ax.plot(
        cohort_retention.index,
        cohort_retention.values,
        marker='o',
        label=f'Cohort {cohort_month}',
    )

ax.set_title('Retention Curves by Cohort')
ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_ylim([0, 105])
ax.grid(True, alpha=0.3)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# 5. Average Retention Curve
fig, ax = plt.subplots(figsize=(10, 6))

# Column-wise mean and standard deviation across cohorts at each age.
avg_retention = retention_table.mean()
std_retention = retention_table.std()

ax.plot(
    avg_retention.index,
    avg_retention.values,
    marker='o',
    linewidth=2,
    markersize=8,
    color='navy',
)
ax.fill_between(avg_retention.index, avg_retention.values, alpha=0.3, color='navy')

# Shade mean ± 1 standard deviation. This shows spread between cohorts;
# it is not a statistical confidence interval.
lower_band = avg_retention - std_retention
upper_band = avg_retention + std_retention
ax.fill_between(
    std_retention.index,
    lower_band,
    upper_band,
    alpha=0.2,
    color='navy',
    label='±1 Std Dev',
)

ax.set_title('Average Retention Curve with Confidence Band')
ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_ylim([0, 105])
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

# 6. Churn Rate
# Churn is the complement of retention: 100 minus each retention percentage.
churn_rate = retention_table.rsub(100)

print("\nChurn Rates (%):")
print(churn_rate.round(1).head())

# 7. Revenue Cohort Analysis
# Attach a synthetic revenue figure to every activity row
# (exponential draw with scale 50, re-seeded for reproducibility).
np.random.seed(42)
df['revenue'] = np.random.exponential(50, len(df))

# Total revenue and distinct users per (cohort, age) cell.
revenue_data = (
    df.groupby(['cohort_month', 'cohort_age'])
    .agg(revenue=('revenue', 'sum'), user_id=('user_id', 'nunique'))
    .reset_index()
)
revenue_data['revenue_per_user'] = revenue_data['revenue'].div(revenue_data['user_id'])

revenue_pivot = revenue_data.pivot(index='cohort_month', columns='cohort_age', values='revenue')
rpu_pivot = revenue_data.pivot(index='cohort_month', columns='cohort_age', values='revenue_per_user')

# Visualize revenue: total on top, per-user below.
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

heatmap_specs = [
    (revenue_pivot, '.0f', 'Total Revenue ($)', 'Total Revenue by Cohort'),
    (rpu_pivot, '.2f', 'Revenue per User ($)', 'Revenue per User by Cohort'),
]
for panel, (table, number_format, bar_label, panel_title) in zip(axes, heatmap_specs):
    sns.heatmap(
        table,
        annot=True,
        fmt=number_format,
        cmap='YlGnBu',
        ax=panel,
        cbar_kws={'label': bar_label},
    )
    panel.set_title(panel_title)
    panel.set_xlabel('Cohort Age (Months)')
    panel.set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 8. Lifetime Value Calculation
df['month_since_signup'] = df['cohort_age']

# Collapse to one row per user: summed revenue, the user's cohort, and the
# largest month-since-signup value seen for that user.
ltv_data = (
    df.groupby('user_id')
    .agg(
        lifetime_value=('revenue', 'sum'),
        cohort_month=('cohort_month', 'first'),
        lifetime_months=('month_since_signup', 'max'),
    )
    .reset_index()
)

# Average LTV by cohort
ltv_by_cohort = (
    ltv_data.groupby('cohort_month')['lifetime_value']
    .agg(['mean', 'median', 'std'])
)

print("\nLifetime Value by Cohort:")
print(ltv_by_cohort.round(2))

# Bar chart of the mean LTV per cohort.
fig, ax = plt.subplots(figsize=(10, 6))
ltv_by_cohort['mean'].plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Average Lifetime Value by Cohort')
ax.set_xlabel('Cohort Month')
ax.set_ylabel('Lifetime Value ($)')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

# 9. Cohort Composition Over Time
fig, ax = plt.subplots(figsize=(12, 6))

# Count activity rows for every (date, cohort_month) pair ...
active_by_month = (
    df.groupby(['date', 'cohort_month'])
    .size()
    .reset_index(name='active_users')
)
# ... then spread cohorts into columns so each cohort becomes its own line.
pivot_active = (
    active_by_month
    .set_index(['date', 'cohort_month'])['active_users']
    .unstack('cohort_month')
)

pivot_active.plot(ax=ax, marker='o')
ax.set_title('Active Users Per Month by Cohort')
ax.set_xlabel('Month')
ax.set_ylabel('Active Users')
ax.grid(True, alpha=0.3)
ax.legend(title='Cohort Month', bbox_to_anchor=(1.05, 1))

plt.tight_layout()
plt.show()

# 10. Cohort Summary Metrics
# Positional index of the third age column, clamped to the last column when
# the retention table has fewer than three columns.
month_3_position = min(2, retention_table.shape[1] - 1)
month_1_column = retention_table.iloc[:, 0]
month_3_column = retention_table.iloc[:, month_3_position]

# One row per cohort; values are combined positionally via `.values`.
summary_metrics = pd.DataFrame({
    'Cohort Month': cohort_size.index,
    'Initial Size': cohort_size.values,
    'Month 1 Retention': month_1_column.values,
    'Month 3 Retention': month_3_column.values,
    'Avg LTV': ltv_by_cohort['mean'].values,
})

print("\nCohort Summary Metrics:")
print(summary_metrics.round(2))

# 11. Visualization comparison
# Three side-by-side panels: milestone retention, cohort size, and mean LTV.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Month 1 vs Month 3 retention
ax_plot = axes[0]
months = ['Month 1', 'Month 3']
month_1_ret = retention_table.iloc[:, 0].mean()
# Third age column, clamped to the last column if fewer than three exist.
month_3_ret = retention_table.iloc[:, min(2, retention_table.shape[1]-1)].mean()
ax_plot.bar(months, [month_1_ret, month_3_ret], color=['#1f77b4', '#ff7f0e'], edgecolor='black')
ax_plot.set_ylabel('Retention Rate (%)')
ax_plot.set_title('Average Retention by Milestone')
# Value labels are drawn 2 units above each bar. A bar can reach 100 %, so the
# axis needs headroom above 100 or the label lands outside the plot area.
ax_plot.set_ylim([0, 110])
for i, v in enumerate([month_1_ret, month_3_ret]):
    ax_plot.text(i, v + 2, f'{v:.1f}%', ha='center')

# Cohort size trend
axes[1].plot(cohort_size.index, cohort_size.values, marker='o', linewidth=2, markersize=8)
axes[1].set_xlabel('Cohort Month')
axes[1].set_ylabel('Cohort Size')
axes[1].set_title('Cohort Sizes Over Time')
axes[1].grid(True, alpha=0.3)

# LTV trend
axes[2].plot(ltv_by_cohort.index, ltv_by_cohort['mean'].values, marker='o', linewidth=2, markersize=8, color='green')
axes[2].set_xlabel('Cohort Month')
axes[2].set_ylabel('Average Lifetime Value ($)')
axes[2].set_title('LTV Trend by Cohort')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nCohort analysis complete!")

Key Metrics

Retention Rate: % of cohort active
Churn Rate: % of cohort lost
Day/Month 1 Retention: Early engagement
Lifetime Value: Total revenue per user
Payback Period: Time to recover customer acquisition cost (CAC)

Insights to Look For

Early retention predictors
Differences between cohorts
Seasonal patterns
Engagement degradation
Revenue trends

Deliverables

Cohort retention matrix
Retention curve visualization
Churn rate analysis
Lifetime value calculations
Revenue per cohort
Executive summary with insights
Actionable recommendations

Cohort Analysis

Cohort analysis tracks groups of users with shared characteristics over time, revealing patterns in retention, engagement, and lifetime value.

Core Concepts

Cohort: Group of users sharing a characteristic (signup date, region, etc.)

Cohort Size: Initial group size

Retention Rate: Percentage remaining active

Churn Rate: Percentage who left

Retention Curve: How a cohort's retention rate declines over time

Cohort Types

Acquisition Date: Users grouped by signup period

Behavioral: Users grouped by actions taken

Revenue: Users grouped by purchase value

Geographic: Users grouped by location

Demographic: Users grouped by characteristics

Implementation with Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# End-to-end cohort analysis on synthetic data: builds a per-user monthly
# activity log, derives retention / churn / revenue / LTV tables by signup
# cohort, and plots each of them.

# Create sample user lifecycle data
np.random.seed(42)

# Generate user data: one row per user per active month
n_users = 5000
users = []

for user_id in range(n_users):
    signup_month = np.random.choice(range(1, 13))
    lifetime_months = np.random.poisson(6) + 1  # at least one active month

    for month in range(1, lifetime_months + 1):
        users.append({
            'user_id': user_id,
            'signup_month': signup_month,
            'month': month,
            'active': 1,
        })

df = pd.DataFrame(users)

# Add derived columns
df['cohort_month'] = df['signup_month']
df['cohort_age'] = df['month']  # Could be day, week, etc.

# Calendar month of each activity row (first day of that month). The date
# advances with `month`; a date based on `signup_month` alone would be
# identical for every row of a user.
months_since_jan_2023 = df['signup_month'] + df['month'] - 2  # 0 == Jan 2023
df['date'] = pd.to_datetime(pd.DataFrame({
    'year': 2023 + months_since_jan_2023 // 12,
    'month': months_since_jan_2023 % 12 + 1,
    'day': 1,
}))

print("User Data Summary:")
print(df.head(10))

# 1. Cohort Table (Retention Matrix)
cohort_data = df.groupby(['cohort_month', 'cohort_age']).agg({
    'user_id': 'nunique'
}).reset_index()
cohort_data.columns = ['cohort_month', 'cohort_age', 'unique_users']

# Create pivot table
cohort_pivot = cohort_data.pivot(index='cohort_month', columns='cohort_age', values='unique_users')

print("\nCohort Sizes (Raw User Counts):")
print(cohort_pivot)

# 2. Cohort Retention (as percentage of cohort size)
cohort_size = cohort_pivot.iloc[:, 0]
retention_table = cohort_pivot.divide(cohort_size, axis=0) * 100

print("\nCohort Retention Rate (%):")
print(retention_table.round(1))

# 3. Visualize Retention Matrix
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Heatmap of raw counts
sns.heatmap(cohort_pivot, annot=True, fmt='g', cmap='YlOrRd', ax=axes[0],
            cbar_kws={'label': 'User Count'})
axes[0].set_title('Cohort Sizes - User Counts')
axes[0].set_xlabel('Cohort Age (Months)')
axes[0].set_ylabel('Cohort Month')

# Heatmap of retention rates
sns.heatmap(retention_table, annot=True, fmt='.0f', cmap='RdYlGn', vmin=0, vmax=100,
            ax=axes[1], cbar_kws={'label': 'Retention %'})
axes[1].set_title('Cohort Retention Rates (%)')
axes[1].set_xlabel('Cohort Age (Months)')
axes[1].set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 4. Retention Curve
fig, ax = plt.subplots(figsize=(12, 6))

# Plot retention curves for each cohort
for cohort_month in cohort_pivot.index[:8]:  # First 8 cohorts
    cohort_retention = retention_table.loc[cohort_month]
    ax.plot(cohort_retention.index, cohort_retention.values, marker='o', label=f'Cohort {cohort_month}')

ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_title('Retention Curves by Cohort')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 105])

plt.tight_layout()
plt.show()

# 5. Average Retention Curve
fig, ax = plt.subplots(figsize=(10, 6))

# Calculate average retention at each age
avg_retention = retention_table.mean()
ax.plot(avg_retention.index, avg_retention.values, marker='o', linewidth=2, markersize=8, color='navy')
ax.fill_between(avg_retention.index, avg_retention.values, alpha=0.3, color='navy')

# Add a mean ± 1 standard deviation band (spread across cohorts)
std_retention = retention_table.std()
ax.fill_between(std_retention.index,
                avg_retention - std_retention,
                avg_retention + std_retention,
                alpha=0.2, color='navy', label='±1 Std Dev')

ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_title('Average Retention Curve with Confidence Band')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 105])

plt.tight_layout()
plt.show()

# 6. Churn Rate
churn_rate = 100 - retention_table
print("\nChurn Rates (%):")
print(churn_rate.round(1).head())

# 7. Revenue Cohort Analysis
# Add revenue data
np.random.seed(42)
df['revenue'] = np.random.exponential(50, len(df))

# Revenue by cohort
revenue_data = df.groupby(['cohort_month', 'cohort_age']).agg({
    'revenue': 'sum',
    'user_id': 'nunique'
}).reset_index()
revenue_data['revenue_per_user'] = revenue_data['revenue'] / revenue_data['user_id']

revenue_pivot = revenue_data.pivot(index='cohort_month', columns='cohort_age', values='revenue')
rpu_pivot = revenue_data.pivot(index='cohort_month', columns='cohort_age', values='revenue_per_user')

# Visualize revenue
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

sns.heatmap(revenue_pivot, annot=True, fmt='.0f', cmap='YlGnBu', ax=axes[0],
            cbar_kws={'label': 'Total Revenue ($)'})
axes[0].set_title('Total Revenue by Cohort')
axes[0].set_xlabel('Cohort Age (Months)')
axes[0].set_ylabel('Cohort Month')

sns.heatmap(rpu_pivot, annot=True, fmt='.2f', cmap='YlGnBu', ax=axes[1],
            cbar_kws={'label': 'Revenue per User ($)'})
axes[1].set_title('Revenue per User by Cohort')
axes[1].set_xlabel('Cohort Age (Months)')
axes[1].set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 8. Lifetime Value Calculation
df['month_since_signup'] = df['cohort_age']
ltv_data = df.groupby('user_id').agg({
    'revenue': 'sum',
    'cohort_month': 'first',
    'month_since_signup': 'max',
}).reset_index()
ltv_data.columns = ['user_id', 'lifetime_value', 'cohort_month', 'lifetime_months']

# Average LTV by cohort
ltv_by_cohort = ltv_data.groupby('cohort_month')['lifetime_value'].agg(['mean', 'median', 'std'])

print("\nLifetime Value by Cohort:")
print(ltv_by_cohort.round(2))

fig, ax = plt.subplots(figsize=(10, 6))
ltv_by_cohort['mean'].plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Average Lifetime Value by Cohort')
ax.set_xlabel('Cohort Month')
ax.set_ylabel('Lifetime Value ($)')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

# 9. Cohort Composition Over Time
fig, ax = plt.subplots(figsize=(12, 6))

# Active users per month by cohort
active_by_month = df.groupby(['date', 'cohort_month']).size().reset_index(name='active_users')
pivot_active = active_by_month.pivot(index='date', columns='cohort_month', values='active_users')

pivot_active.plot(ax=ax, marker='o')
ax.set_title('Active Users Per Month by Cohort')
ax.set_xlabel('Month')
ax.set_ylabel('Active Users')
ax.legend(title='Cohort Month', bbox_to_anchor=(1.05, 1))
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 10. Cohort Summary Metrics
summary_metrics = pd.DataFrame({
    'Cohort Month': cohort_size.index,
    'Initial Size': cohort_size.values,
    'Month 1 Retention': retention_table.iloc[:, 0].values,
    'Month 3 Retention': retention_table.iloc[:, min(2, retention_table.shape[1]-1)].values,
    'Avg LTV': ltv_by_cohort['mean'].values,
})

print("\nCohort Summary Metrics:")
print(summary_metrics.round(2))

# 11. Visualization comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Month 1 vs Month 3 retention
ax_plot = axes[0]
months = ['Month 1', 'Month 3']
month_1_ret = retention_table.iloc[:, 0].mean()
month_3_ret = retention_table.iloc[:, min(2, retention_table.shape[1]-1)].mean()
ax_plot.bar(months, [month_1_ret, month_3_ret], color=['#1f77b4', '#ff7f0e'], edgecolor='black')
ax_plot.set_ylabel('Retention Rate (%)')
ax_plot.set_title('Average Retention by Milestone')
# Headroom above 100 so the value labels (drawn at v + 2) stay inside the axes.
ax_plot.set_ylim([0, 110])
for i, v in enumerate([month_1_ret, month_3_ret]):
    ax_plot.text(i, v + 2, f'{v:.1f}%', ha='center')

# Cohort size trend
axes[1].plot(cohort_size.index, cohort_size.values, marker='o', linewidth=2, markersize=8)
axes[1].set_xlabel('Cohort Month')
axes[1].set_ylabel('Cohort Size')
axes[1].set_title('Cohort Sizes Over Time')
axes[1].grid(True, alpha=0.3)

# LTV trend
axes[2].plot(ltv_by_cohort.index, ltv_by_cohort['mean'].values, marker='o', linewidth=2, markersize=8, color='green')
axes[2].set_xlabel('Cohort Month')
axes[2].set_ylabel('Average Lifetime Value ($)')
axes[2].set_title('LTV Trend by Cohort')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nCohort analysis complete!")

Key Metrics

Retention Rate: % of cohort active

Churn Rate: % of cohort lost

Day/Month 1 Retention: Early engagement

Lifetime Value: Total revenue per user

Payback Period: Time to recover customer acquisition cost (CAC)

Insights to Look For

Early retention predictors

Differences between cohorts

Seasonal patterns

Engagement degradation

Revenue trends

Deliverables

Cohort retention matrix

Retention curve visualization

Churn rate analysis

Lifetime value calculations

Revenue per cohort

Executive summary with insights

Actionable recommendations

cohort-analysis

Cohort Analysis

Core Concepts

Cohort Types

Implementation with Python

Key Metrics

Insights to Look For

Deliverables

Cohort Analysis

Core Concepts

Cohort Types

Implementation with Python

Key Metrics

Insights to Look For

Deliverables