Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import io | |
import os | |
from reportlab.lib.pagesizes import letter | |
from reportlab.pdfgen import canvas | |
from reportlab.lib.utils import ImageReader | |
# Path of the call-center log CSV read by analyze_data(); relative to the
# working directory (the file is assumed to be uploaded to the Space).
CSV_FILE_PATH = "call_center_logs.csv"
# Data cleanup function | |
def clean_data(df):
    """Filter a raw call-center log frame down to usable rows.

    Rows are dropped, in this order, when they:

    1. have a missing value in ``query``, ``resolution``,
       ``duration_minutes`` or ``satisfaction_score`` (a duration or score
       that cannot be parsed as a number counts as missing),
    2. repeat a ``call_id`` that already appeared earlier,
    3. have a ``query`` or ``resolution`` shorter than five characters,
    4. have a ``query`` containing two or more consecutive ``!``/``?``
       characters, or the word ``Invalid`` or ``N/A`` (case-insensitive),
    5. have a ``timestamp`` that ``pd.to_datetime`` cannot parse.

    Missing ``language`` values are then filled with ``'en'`` and the result
    is written to ``cleaned_call_center_logs.csv`` in the working directory.
    The caller's frame is never modified.

    Args:
        df: DataFrame with the columns ``call_id``, ``query``, ``resolution``,
            ``duration_minutes``, ``satisfaction_score``, ``timestamp`` and
            ``language``.

    Returns:
        A tuple ``(cleaned_df, cleanup_details, cleaned_path)`` where
        ``cleanup_details`` maps each cleanup step to the number of rows it
        removed (plain ``int`` values) plus the ``original``, ``cleaned`` and
        ``removed`` totals, and ``cleaned_path`` is the CSV path written.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    original_count = len(df)
    cleanup_details = {
        'original': original_count,
        'nulls_removed': 0,
        'duplicates_removed': 0,
        'short_removed': 0,
        'malformed_removed': 0,
        'invalid_timestamps': 0,
    }

    # Work on a private copy so the column assignments below never touch the
    # caller's frame.
    df = df.copy()

    # Coerce the numeric columns *before* the null check: values that cannot
    # be parsed become NaN here and are then removed together with the rows
    # that were missing to begin with, instead of leaking NaN into the output.
    for column in ('duration_minutes', 'satisfaction_score'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Remove nulls in critical columns
    critical_columns = ['query', 'resolution', 'duration_minutes', 'satisfaction_score']
    null_rows = df[critical_columns].isna().any(axis=1)
    cleanup_details['nulls_removed'] = int(null_rows.sum())
    df = df[~null_rows]

    # Remove duplicates based on call_id (the first occurrence is kept)
    duplicate_rows = df['call_id'].duplicated()
    cleanup_details['duplicates_removed'] = int(duplicate_rows.sum())
    df = df[~duplicate_rows]

    # Remove short queries/resolutions. astype(str) keeps the .str accessor
    # usable even when a column was parsed as a non-string dtype.
    query_length = df['query'].astype(str).str.len()
    resolution_length = df['resolution'].astype(str).str.len()
    short_rows = (query_length < 5) | (resolution_length < 5)
    cleanup_details['short_removed'] = int(short_rows.sum())
    df = df[~short_rows]

    # Remove malformed queries. The group is non-capturing so pandas does not
    # warn about unused match groups; the set of matching rows is unchanged.
    malformed_rows = df['query'].astype(str).str.contains(
        r'[!?]{2,}|\b(?:Invalid|N/A)\b', regex=True, case=False, na=False
    )
    cleanup_details['malformed_removed'] = int(malformed_rows.sum())
    df = df[~malformed_rows]

    # Validate timestamps; the column itself is left in its original form.
    invalid_timestamps = pd.to_datetime(df['timestamp'], errors='coerce').isna()
    cleanup_details['invalid_timestamps'] = int(invalid_timestamps.sum())
    # Explicit copy: the filtered frame is assigned to below.
    df = df[~invalid_timestamps].copy()

    # Standardize language (fill missing with 'en')
    df['language'] = df['language'].fillna('en')

    cleaned_count = len(df)
    cleanup_details['cleaned'] = cleaned_count
    cleanup_details['removed'] = original_count - cleaned_count

    # Save cleaned CSV for SageMaker/Azure AI
    cleaned_path = 'cleaned_call_center_logs.csv'
    df.to_csv(cleaned_path, index=False)
    return df, cleanup_details, cleaned_path
# Statistical plotting function | |
def plot_statistics(df):
    """Render three summary charts for *df* and save each one as a PNG.

    The charts are a histogram (with KDE) of ``duration_minutes``, a box plot
    of ``satisfaction_score`` per ``agent_id``, and a count plot of rows per
    ``language``. Files are written to the working directory.

    Args:
        df: DataFrame providing the ``duration_minutes``, ``agent_id``,
            ``satisfaction_score`` and ``language`` columns.

    Returns:
        List of the three PNG file names, in the order described above.
    """
    figure_size = (10, 6)
    duration_png = 'duration_distribution.png'
    satisfaction_png = 'satisfaction_by_agent.png'
    language_png = 'query_by_language.png'

    # Chart 1: distribution of call durations
    fig, ax = plt.subplots(figsize=figure_size)
    sns.histplot(df['duration_minutes'], bins=20, kde=True, color='skyblue', ax=ax)
    ax.set_title('Distribution of Call Durations')
    ax.set_xlabel('Duration (minutes)')
    ax.set_ylabel('Frequency')
    fig.savefig(duration_png)
    plt.close(fig)

    # Chart 2: satisfaction scores grouped by agent
    fig, ax = plt.subplots(figsize=figure_size)
    sns.boxplot(x='agent_id', y='satisfaction_score', data=df, color='lightblue', ax=ax)
    ax.set_title('Satisfaction Scores by Agent')
    ax.set_xlabel('Agent ID')
    ax.set_ylabel('Satisfaction Score')
    fig.savefig(satisfaction_png)
    plt.close(fig)

    # Chart 3: number of queries per language
    fig, ax = plt.subplots(figsize=figure_size)
    sns.countplot(x='language', data=df, color='skyblue', ax=ax)
    ax.set_title('Query Frequency by Language')
    ax.set_xlabel('Language')
    ax.set_ylabel('Number of Queries')
    fig.savefig(language_png)
    plt.close(fig)

    return [duration_png, satisfaction_png, language_png]
# Generate PDF report | |
def generate_pdf_report(cleanup_details, plot_paths):
    """Write a letter-size PDF holding the cleanup statistics and the plots.

    The first page starts with a title and one line per entry of
    *cleanup_details*. Each existing image in *plot_paths* is then drawn
    500 points wide with its aspect ratio preserved; a new page is started
    whenever the next image would extend below the 50-point bottom margin.
    Paths that do not exist on disk are skipped.

    Args:
        cleanup_details: Mapping of statistic name to value; keys are shown
            with underscores replaced by spaces and title-cased.
        plot_paths: Iterable of image file paths to embed, in order.

    Returns:
        The path of the PDF that was written (``data_analysis_report.pdf``).
    """
    pdf_path = 'data_analysis_report.pdf'
    _, page_height = letter
    left_margin = 50
    bottom_margin = 50
    image_width = 500
    report = canvas.Canvas(pdf_path, pagesize=letter)

    # Title
    report.setFont("Helvetica-Bold", 16)
    report.drawString(left_margin, page_height - 50, "Call Center Data Analysis Report")

    # Cleanup statistics, one line each
    report.setFont("Helvetica", 12)
    cursor_y = page_height - 80
    report.drawString(left_margin, cursor_y, "Data Cleanup Statistics:")
    cursor_y -= 20
    for stat_name, stat_value in cleanup_details.items():
        report.drawString(70, cursor_y, f"{stat_name.replace('_', ' ').title()}: {stat_value}")
        cursor_y -= 15

    # Plots, stacked top to bottom
    cursor_y -= 30
    for plot_path in plot_paths:
        if not os.path.exists(plot_path):
            continue
        image = ImageReader(plot_path)
        pixel_width, pixel_height = image.getSize()
        image_height = image_width * (pixel_height / float(pixel_width))
        # Not enough room left on this page: continue at the top of a new one.
        if cursor_y - image_height < bottom_margin:
            report.showPage()
            cursor_y = page_height - 50
        report.drawImage(
            image,
            left_margin,
            cursor_y - image_height,
            width=image_width,
            height=image_height,
        )
        cursor_y -= image_height + 20

    report.save()
    return pdf_path
# Main analysis function | |
def analyze_data():
    """Run the full pipeline and return the values for the seven UI outputs.

    Reads ``CSV_FILE_PATH``, cleans it with ``clean_data``, renders the charts
    with ``plot_statistics`` and builds the PDF with ``generate_pdf_report``.

    Returns:
        A 7-tuple: HTML table of the first 50 cleaned rows, the cleanup
        statistics as newline-separated text, the three plot image paths,
        and two ``gr.File`` components for the cleaned CSV and the PDF.
        If any step raises, the tuple is instead
        ``("Error: <message>", "", None, None, None, None, None)``.
    """
    try:
        raw_df = pd.read_csv(CSV_FILE_PATH)
        cleaned_df, cleanup_details, cleaned_path = clean_data(raw_df)
        plot_paths = plot_statistics(cleaned_df)
        pdf_path = generate_pdf_report(cleanup_details, plot_paths)

        # One "Title Cased Key: value" line per statistic
        stat_lines = []
        for key, value in cleanup_details.items():
            stat_lines.append(f"{key.replace('_', ' ').title()}: {value}")
        cleanup_stats = "\n".join(stat_lines)

        # Only the first 50 rows are rendered
        preview_html = cleaned_df.head(50).to_html()
        duration_plot = plot_paths[0]
        satisfaction_plot = plot_paths[1]
        language_plot = plot_paths[2]
        csv_file = gr.File(value=cleaned_path, label="Download Cleaned CSV")
        pdf_file = gr.File(value=pdf_path, label="Download PDF Report")
        return (
            preview_html,
            cleanup_stats,
            duration_plot,
            satisfaction_plot,
            language_plot,
            csv_file,
            pdf_file,
        )
    except Exception as e:
        # UI boundary: report the failure in the first output, blank the rest.
        return f"Error: {str(e)}", "", None, None, None, None, None
# Gradio interface | |
# Dark-theme stylesheet passed to gr.Blocks(css=...). "#app-container" and
# ".text-center" match the elem_id / elem_classes used in the layout below.
custom_css = """
body {
    background: linear-gradient(135deg, #1a1a1a 0%, #2a2a2a 100%);
    color: #e0e0e0;
    font-family: 'Arial', sans-serif;
    display: flex;
    justify-content: center;
    align-items: center;
    min-height: 100vh;
    margin: 0;
}
.gr-box {
    background: #3a3a3a;
    border: 1px solid #4a4a4a;
    border-radius: 8px;
    padding: 20px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
}
.gr-button {
    background: #1e90ff;
    color: white;
    border-radius: 5px;
    padding: 12px 20px;
    margin: 8px 0;
    width: 100%;
    text-align: center;
    transition: background 0.3s ease;
    font-size: 16px;
}
.gr-button:hover {
    background: #1c86ee;
    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
}
.gr-textbox {
    background: #2f2f2f;
    color: #e0e0e0;
    border: 1px solid #4a4a4a;
    border-radius: 5px;
    margin-bottom: 15px;
    font-size: 16px;
    padding: 15px;
    min-height: 120px;
    width: 100%;
}
.gr-image {
    width: 100%;
    height: auto;
    max-height: 400px;
}
#app-container {
    max-width: 900px;
    width: 100%;
    padding: 20px;
    background: #252525;
    border-radius: 12px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);
}
.text-center {
    text-align: center;
    margin-bottom: 20px;
}
"""
# Single-column page: a button that runs analyze_data() and seven outputs
# that receive its return values in order.
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_id="app-container"):
        gr.Markdown("# Call Center Data Analysis", elem_classes="text-center")
        gr.Markdown(
            "Analyze call center logs, view statistics, and export cleaned data for SageMaker/Azure AI.",
            elem_classes="text-center",
        )

        # Button to trigger analysis
        analyze_button = gr.Button("Analyze Data")

        # Outputs. The table shows the first 50 rows of the *cleaned* frame
        # returned by analyze_data(), so it is labelled accordingly.
        raw_data_output = gr.HTML(label="Cleaned Data (First 50 Rows)")
        cleanup_stats_output = gr.Textbox(label="Data Cleanup Statistics")
        duration_plot_output = gr.Image(label="Distribution of Call Durations")
        satisfaction_plot_output = gr.Image(label="Satisfaction Scores by Agent")
        language_plot_output = gr.Image(label="Query Frequency by Language")
        csv_download = gr.File(label="Download Cleaned CSV")
        pdf_download = gr.File(label="Download PDF Report")

        # Connect the button to the analysis function; the order of `outputs`
        # must match the order of the tuple analyze_data() returns.
        analyze_button.click(
            fn=analyze_data,
            inputs=None,
            outputs=[
                raw_data_output,
                cleanup_stats_output,
                duration_plot_output,
                satisfaction_plot_output,
                language_plot_output,
                csv_download,
                pdf_download,
            ],
        )

# Start the server only when this file is executed as a script, so importing
# the module (tests, tooling, reload mode) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()