"""
Final Project: Comprehensive Retail Sales Analysis
Course: Data Visualization Analysis
Author: [Your Name]
Date: [Submission Date]
Business Question:
Analyze 2024 sales performance across products, regions, and time periods
to identify growth opportunities and operational improvements for 2025 planning.
Dataset: Retail sales data with 10,000+ transactions
"""
# ============================================================================
# SECTION 1: SETUP AND DATA LOADING
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Set visualization styles
sns.set_theme(style='whitegrid', context='talk', palette='colorblind')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
print("="*70)
print("RETAIL SALES ANALYSIS PROJECT - 2024")
print("="*70)
# ============================================================================
# SECTION 2: DATA IMPORT AND INITIAL EXPLORATION
# ============================================================================
# Load data (in actual project, this would be from file)
# For demonstration, we'll generate realistic sample data
np.random.seed(42)
# Generate comprehensive retail dataset
n_transactions = 10000
dates = pd.date_range('2024-01-01', '2024-12-31', freq='H')
products = ['Electronics', 'Clothing', 'Home Goods', 'Sports Equipment',
'Books', 'Toys', 'Food & Beverage', 'Health & Beauty']
regions = ['North', 'South', 'East', 'West', 'Central']
channels = ['Online', 'In-Store', 'Mobile App']
transactions = []
for i in range(n_transactions):
product = np.random.choice(products, p=[0.15, 0.18, 0.12, 0.10, 0.08, 0.09, 0.15, 0.13])
region = np.random.choice(regions)
channel = np.random.choice(channels, p=[0.45, 0.40, 0.15])
# Price varies by product category
price_ranges = {
'Electronics': (50, 500), 'Clothing': (20, 150),
'Home Goods': (30, 300), 'Sports Equipment': (40, 400),
'Books': (10, 50), 'Toys': (15, 100),
'Food & Beverage': (5, 50), 'Health & Beauty': (10, 100)
}
price = np.random.uniform(*price_ranges[product])
# Quantity varies by channel
if channel == 'Online':
quantity = np.random.randint(1, 5)
else:
quantity = np.random.randint(1, 3)
# Customer satisfaction varies
satisfaction = np.random.uniform(3.0, 5.0)
# Shipping time (online only)
shipping_days = np.random.randint(2, 10) if channel == 'Online' else 0
transactions.append({
'transaction_id': f'TXN{i:06d}',
'date': np.random.choice(dates),
'product_category': product,
'region': region,
'channel': channel,
'unit_price': price,
'quantity': quantity,
'customer_satisfaction': satisfaction,
'shipping_days': shipping_days
})
df = pd.DataFrame(transactions)
# Calculate derived fields
df['total_amount'] = df['unit_price'] * df['quantity']
df['year_month'] = df['date'].dt.to_period('M')
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour
print("\n1. Dataset Overview:")
print("-" * 70)
print(f"Total Transactions: {len(df):,}")
print(f"Date Range: {df['date'].min()} to {df['date'].max()}")
print(f"Total Revenue: ${df['total_amount'].sum():,.2f}")
print(f"Average Transaction Value: ${df['total_amount'].mean():.2f}")
print("\n2. Data Structure:")
print(df.info())
print("\n3. Sample Records:")
print(df.head())
# ============================================================================
# SECTION 3: DATA CLEANING AND PREPARATION
# ============================================================================
print("\n" + "="*70)
print("DATA CLEANING AND PREPARATION")
print("="*70)
# Check for missing values
print("\n1. Missing Values Check:")
print(df.isnull().sum())
# Check for duplicates
duplicates = df.duplicated(subset=['transaction_id']).sum()
print(f"\n2. Duplicate Transactions: {duplicates}")
# Data quality checks
print("\n3. Data Quality Checks:")
print(f" - Negative prices: {(df['unit_price'] < 0).sum()}")
print(f" - Zero quantities: {(df['quantity'] == 0).sum()}")
print(f" - Invalid satisfaction scores: {((df['customer_satisfaction'] < 1) | (df['customer_satisfaction'] > 5)).sum()}")
# Outlier detection (IQR method)
Q1 = df['total_amount'].quantile(0.25)
Q3 = df['total_amount'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['total_amount'] < Q1 - 1.5*IQR) | (df['total_amount'] > Q3 + 1.5*IQR)]
print(f"\n4. Statistical Outliers Detected: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")
# Clean dataset
df_clean = df.copy()
# Keep outliers but flag them
df_clean['is_outlier'] = ((df_clean['total_amount'] < Q1 - 1.5*IQR) |
(df_clean['total_amount'] > Q3 + 1.5*IQR))
print(f"\n5. Clean Dataset: {len(df_clean):,} transactions ready for analysis")
# ============================================================================
# SECTION 4: EXPLORATORY DATA ANALYSIS
# ============================================================================
print("\n" + "="*70)
print("EXPLORATORY DATA ANALYSIS")
print("="*70)
# Summary statistics
print("\n1. Summary Statistics:")
print(df_clean[['unit_price', 'quantity', 'total_amount', 'customer_satisfaction']].describe())
# Category analysis
print("\n2. Performance by Product Category:")
category_summary = df_clean.groupby('product_category').agg({
'total_amount': ['sum', 'mean', 'count'],
'customer_satisfaction': 'mean'
}).round(2)
category_summary.columns = ['Total_Revenue', 'Avg_Transaction', 'Count', 'Avg_Satisfaction']
print(category_summary.sort_values('Total_Revenue', ascending=False))
# Regional analysis
print("\n3. Performance by Region:")
regional_summary = df_clean.groupby('region').agg({
'total_amount': ['sum', 'mean'],
'transaction_id': 'count'
}).round(2)
regional_summary.columns = ['Total_Revenue', 'Avg_Transaction', 'Transaction_Count']
print(regional_summary.sort_values('Total_Revenue', ascending=False))
# Channel analysis
print("\n4. Performance by Channel:")
channel_summary = df_clean.groupby('channel').agg({
'total_amount': ['sum', 'mean'],
'customer_satisfaction': 'mean'
}).round(2)
channel_summary.columns = ['Total_Revenue', 'Avg_Transaction', 'Avg_Satisfaction']
print(channel_summary.sort_values('Total_Revenue', ascending=False))
# Correlation analysis
print("\n5. Correlation Analysis:")
correlation_vars = df_clean[['unit_price', 'quantity', 'total_amount',
'customer_satisfaction', 'shipping_days']].corr()
print(correlation_vars.round(3))
# ============================================================================
# SECTION 5: STATIC VISUALIZATIONS (Matplotlib & Seaborn)
# ============================================================================
print("\n" + "="*70)
print("CREATING STATIC VISUALIZATIONS")
print("="*70)
# Create output directory for charts
import os
os.makedirs('FinalProject_Outputs', exist_ok=True)
# Visualization 1: Revenue trend over time
print("\n1. Creating: Monthly revenue trend...")
monthly_revenue = df_clean.groupby('year_month')['total_amount'].sum()
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(monthly_revenue.index.astype(str), monthly_revenue.values,
marker='o', linewidth=3, markersize=8, color='#2E86AB')
ax.fill_between(range(len(monthly_revenue)), monthly_revenue.values, alpha=0.3, color='#2E86AB')
ax.set_title('Monthly Revenue Trend - 2024', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1e6:.2f}M'))
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_01_Revenue_Trend.png', dpi=300, bbox_inches='tight')
plt.close()
# Visualization 2: Category performance comparison
print("2. Creating: Category performance bars...")
category_rev = df_clean.groupby('product_category')['total_amount'].sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(12, 8))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(category_rev)))
ax.barh(category_rev.index, category_rev.values, color=colors, edgecolor='black', linewidth=1.5)
ax.set_title('Total Revenue by Product Category', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Revenue ($)', fontsize=12)
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1e6:.1f}M'))
# Add value labels
for i, v in enumerate(category_rev.values):
ax.text(v + 50000, i, f'${v/1e6:.2f}M', va='center', fontsize=10, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_02_Category_Revenue.png', dpi=300, bbox_inches='tight')
plt.close()
# Visualization 3: Customer satisfaction distribution
print("3. Creating: Satisfaction distribution...")
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Overall distribution
sns.histplot(data=df_clean, x='customer_satisfaction', bins=30, kde=True,
color='#06A77D', ax=axes[0])
axes[0].set_title('Overall Customer Satisfaction Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Satisfaction Score (1-5)', fontsize=11)
axes[0].axvline(df_clean['customer_satisfaction'].mean(), color='red',
linestyle='--', linewidth=2, label=f"Mean: {df_clean['customer_satisfaction'].mean():.2f}")
axes[0].legend()
# By channel
sns.boxplot(data=df_clean, x='channel', y='customer_satisfaction',
palette='Set2', ax=axes[1])
axes[1].set_title('Satisfaction by Channel', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Channel', fontsize=11)
axes[1].set_ylabel('Satisfaction Score', fontsize=11)
axes[1].axhline(4.0, color='red', linestyle='--', linewidth=2, alpha=0.5, label='Target: 4.0')
axes[1].legend()
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_03_Satisfaction_Analysis.png', dpi=300, bbox_inches='tight')
plt.close()
# Visualization 4: Regional performance heatmap
print("4. Creating: Regional performance heatmap...")
regional_channel = df_clean.pivot_table(values='total_amount',
index='region',
columns='channel',
aggfunc='sum')
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(regional_channel, annot=True, fmt=',.0f', cmap='YlGnBu',
linewidths=1, cbar_kws={'label': 'Revenue ($)'}, ax=ax)
ax.set_title('Revenue by Region and Channel', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Channel', fontsize=12)
ax.set_ylabel('Region', fontsize=12)
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_04_Regional_Heatmap.png', dpi=300, bbox_inches='tight')
plt.close()
# Visualization 5: Price vs Quantity relationship
print("5. Creating: Price-quantity scatter...")
fig, ax = plt.subplots(figsize=(12, 7))
for category in df_clean['product_category'].unique():
df_cat = df_clean[df_clean['product_category'] == category]
ax.scatter(df_cat['unit_price'], df_cat['quantity'],
alpha=0.6, s=50, label=category)
ax.set_title('Unit Price vs Quantity Sold by Category', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Unit Price ($)', fontsize=12)
ax.set_ylabel('Quantity', fontsize=12)
ax.legend(title='Product Category', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_05_Price_Quantity_Scatter.png', dpi=300, bbox_inches='tight')
plt.close()
# Visualization 6: Correlation heatmap
print("6. Creating: Correlation matrix...")
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_vars, annot=True, fmt='.3f', cmap='coolwarm',
center=0, square=True, linewidths=1, cbar_kws={'label': 'Correlation'}, ax=ax)
ax.set_title('Correlation Matrix: Key Metrics', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('FinalProject_Outputs/FP_06_Correlation_Matrix.png', dpi=300, bbox_inches='tight')
plt.close()
print("\n All static visualizations saved to FinalProject_Outputs/")
# ============================================================================
# SECTION 6: INTERACTIVE DASHBOARD (Plotly)
# ============================================================================
print("\n" + "="*70)
print("CREATING INTERACTIVE DASHBOARD")
print("="*70)
# Prepare aggregated data for dashboard
monthly_data = df_clean.groupby('year_month').agg({
'total_amount': 'sum',
'transaction_id': 'count',
'customer_satisfaction': 'mean'
}).reset_index()
monthly_data['year_month'] = monthly_data['year_month'].astype(str)
category_data = df_clean.groupby('product_category').agg({
'total_amount': 'sum',
'customer_satisfaction': 'mean',
'transaction_id': 'count'
}).reset_index()
# Create comprehensive dashboard
fig = make_subplots(
rows=3, cols=3,
row_heights=[0.15, 0.40, 0.45],
column_widths=[0.33, 0.33, 0.34],
specs=[
[{'type': 'indicator'}, {'type': 'indicator'}, {'type': 'indicator'}],
[{'type': 'scatter', 'colspan': 2}, None, {'type': 'bar'}],
[{'type': 'bar', 'colspan': 2}, None, {'type': 'pie'}]
],
subplot_titles=['', '', '',
'Monthly Revenue Trend', '', 'Top Categories',
'Regional Performance', '', 'Channel Distribution'],
vertical_spacing=0.10,
horizontal_spacing=0.08
)
# Row 1: KPI Indicators
total_revenue = df_clean['total_amount'].sum()
total_transactions = len(df_clean)
avg_satisfaction = df_clean['customer_satisfaction'].mean()
fig.add_trace(go.Indicator(
mode='number',
value=total_revenue,
title={'text': 'Total Revenue'},
number={'prefix': '$', 'valueformat': ',.0f'},
domain={'x': [0, 1], 'y': [0, 1]}
), row=1, col=1)
fig.add_trace(go.Indicator(
mode='number',
value=total_transactions,
title={'text': 'Total Transactions'},
number={'valueformat': ','},
domain={'x': [0, 1], 'y': [0, 1]}
), row=1, col=2)
fig.add_trace(go.Indicator(
mode='gauge+number',
value=avg_satisfaction,
title={'text': 'Avg Satisfaction'},
gauge={
'axis': {'range': [1, 5]},
'bar': {'color': '#06A77D'},
'threshold': {
'line': {'color': 'red', 'width': 4},
'thickness': 0.75,
'value': 4.0
}
},
domain={'x': [0, 1], 'y': [0, 1]}
), row=1, col=3)
# Row 2: Monthly trend and Top categories
fig.add_trace(go.Scatter(
x=monthly_data['year_month'],
y=monthly_data['total_amount'],
mode='lines+markers',
line=dict(color='#2E86AB', width=3),
marker=dict(size=10),
name='Revenue',
hovertemplate='<b>%{x}</b><br>Revenue: $%{y:,.0f}<extra></extra>'
), row=2, col=1)
top_categories = category_data.nlargest(5, 'total_amount')
fig.add_trace(go.Bar(
x=top_categories['total_amount'],
y=top_categories['product_category'],
orientation='h',
marker_color='#FF6B6B',
name='Categories',
hovertemplate='<b>%{y}</b><br>$%{x:,.0f}<extra></extra>'
), row=2, col=3)
# Row 3: Regional bars and Channel pie
regional_totals = df_clean.groupby('region')['total_amount'].sum().sort_values(ascending=False)
fig.add_trace(go.Bar(
x=regional_totals.index,
y=regional_totals.values,
marker_color='#4ECDC4',
name='Regions',
hovertemplate='<b>%{x}</b><br>$%{y:,.0f}<extra></extra>'
), row=3, col=1)
channel_totals = df_clean.groupby('channel')['total_amount'].sum()
fig.add_trace(go.Pie(
labels=channel_totals.index,
values=channel_totals.values,
marker_colors=['#45B7D1', '#FFA07A', '#98D8C8'],
name='Channels',
hovertemplate='<b>%{label}</b><br>$%{value:,.0f}<br>%{percent}<extra></extra>'
), row=3, col=3)
# Update layout
fig.update_layout(
title_text='<b>Retail Sales Analytics Dashboard - 2024</b><br>' +
'<sub>Interactive Performance Overview</sub>',
title_x=0.5,
title_font_size=20,
showlegend=False,
height=1100,
plot_bgcolor='#F8F9FA',
paper_bgcolor='white'
)
# Update axes
fig.update_xaxes(showgrid=True, gridcolor='white')
fig.update_yaxes(showgrid=True, gridcolor='white')
fig.update_yaxes(tickformat='$,.0f', row=2, col=1)
fig.update_xaxes(tickformat='$,.0f', row=2, col=3)
fig.update_yaxes(tickformat='$,.0f', row=3, col=1)
# Save dashboard
fig.write_html('FinalProject_Outputs/FP_Interactive_Dashboard.html')
print("\n Interactive dashboard saved: FP_Interactive_Dashboard.html")
# ============================================================================
# SECTION 7: KEY FINDINGS AND RECOMMENDATIONS
# ============================================================================
print("\n" + "="*70)
print("KEY FINDINGS AND RECOMMENDATIONS")
print("="*70)
print("\n📊 KEY FINDINGS:")
print("-" * 70)
# Finding 1: Revenue trends
total_rev = df_clean['total_amount'].sum()
print(f"\n1. REVENUE PERFORMANCE:")
print(f" • Total 2024 revenue: ${total_rev:,.2f}")
print(f" • Average monthly revenue: ${total_rev/12:,.2f}")
best_month = df_clean.groupby('year_month')['total_amount'].sum().idxmax()
print(f" • Best performing month: {best_month}")
# Finding 2: Category insights
top_category = category_data.nlargest(1, 'total_amount').iloc[0]
print(f"\n2. PRODUCT CATEGORIES:")
print(f" • Top category: {top_category['product_category']}")
print(f" • Category revenue: ${top_category['total_amount']:,.2f}")
print(f" • Category satisfaction: {top_category['customer_satisfaction']:.2f}/5.0")
# Finding 3: Regional performance
best_region = df_clean.groupby('region')['total_amount'].sum().idxmax()
worst_region = df_clean.groupby('region')['total_amount'].sum().idxmin()
print(f"\n3. REGIONAL ANALYSIS:")
print(f" • Best region: {best_region}")
print(f" • Underperforming region: {worst_region}")
# Finding 4: Channel effectiveness
channel_perf = df_clean.groupby('channel').agg({
'total_amount': 'sum',
'customer_satisfaction': 'mean'
})
best_channel = channel_perf['total_amount'].idxmax()
print(f"\n4. CHANNEL PERFORMANCE:")
print(f" • Leading channel: {best_channel}")
print(f" • Overall satisfaction: {avg_satisfaction:.2f}/5.0")
print("\n" + "="*70)
print("💡 STRATEGIC RECOMMENDATIONS:")
print("="*70)
print(f"""
1. INVESTMENT PRIORITIES:
• Expand {top_category['product_category']} inventory (top performer)
• Increase {best_channel} capacity (highest revenue channel)
• Deploy best practices from {best_region} to other regions
2. IMPROVEMENT AREAS:
• Focus on {worst_region} region - investigate and address gap
• Improve satisfaction in lower-rated categories
• Optimize shipping times for online orders
3. GROWTH OPPORTUNITIES:
• Launch targeted campaigns in underperforming regions
• Develop mobile app features (growing channel)
• Create bundle offers for complementary categories
4. OPERATIONAL EXCELLENCE:
• Maintain satisfaction above 4.0 benchmark
• Standardize processes across high-performing regions
• Monitor and address outlier transactions
5. 2025 STRATEGIC INITIATIVES:
• Set aggressive growth targets for {worst_region} (+30%)
• Expand {top_category['product_category']} product line
• Enhance {best_channel} customer experience
• Implement predictive analytics for inventory management
""")
print("\n" + "="*70)
print("PROJECT COMPLETE - All deliverables generated!")
print("="*70)
print("\nDeliverables created:")
print(" 6 static visualizations (Matplotlib/Seaborn)")
print(" 1 interactive dashboard (Plotly)")
print(" Comprehensive analysis script")
print(" Business insights and recommendations")
print("\nAll files saved to: FinalProject_Outputs/")