ghostai1 committed on
Commit
6ff6074
·
verified ·
1 Parent(s): 50a6c2d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import io
7
+ import os
8
+ from reportlab.lib.pagesizes import letter
9
+ from reportlab.pdfgen import canvas
10
+ from reportlab.lib.utils import ImageReader
11
+
12
# Path of the call center logs CSV that analyze_data() reads. It is a
# relative path, so it is resolved against the working directory (the file
# is assumed to have been uploaded to the Space next to this script).
CSV_FILE_PATH = "call_center_logs.csv"
14
+
15
+ # Data cleanup function
16
def clean_data(df):
    """Clean raw call center logs and write the result to a CSV file.

    Rows are dropped in the order below; each step only counts rows that
    survived the previous steps:

    1. A critical column (``query``, ``resolution``, ``duration_minutes``,
       ``satisfaction_score``) is missing, or ``duration_minutes`` /
       ``satisfaction_score`` holds a value that is not numeric.
    2. ``call_id`` repeats an earlier row (the first occurrence is kept).
    3. ``query`` or ``resolution`` is shorter than 5 characters.
    4. ``query`` contains two or more consecutive ``!``/``?`` characters,
       or the word "Invalid" or "N/A" (case-insensitive).
    5. ``timestamp`` cannot be parsed by ``pd.to_datetime``.

    Afterwards missing ``language`` values are filled with ``'en'`` and the
    two numeric columns are converted to a numeric dtype.

    Args:
        df: DataFrame containing the columns ``call_id``, ``query``,
            ``resolution``, ``duration_minutes``, ``satisfaction_score``,
            ``timestamp`` and ``language``. The caller's frame is not
            modified.

    Returns:
        A tuple ``(cleaned_df, cleanup_details, cleaned_path)``:
        the cleaned DataFrame (original index labels are kept), a dict of
        plain-``int`` row counts per cleanup step plus ``original``,
        ``cleaned`` and ``removed`` totals, and the path of the CSV file
        that was written.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    critical_columns = ['query', 'resolution', 'duration_minutes', 'satisfaction_score']
    numeric_columns = ['duration_minutes', 'satisfaction_score']

    original_count = len(df)
    cleanup_details = {
        'original': original_count,
        'nulls_removed': 0,
        'duplicates_removed': 0,
        'short_removed': 0,
        'malformed_removed': 0,
        'invalid_timestamps': 0
    }

    # Remove nulls in critical columns. Values that cannot be converted to a
    # number are treated as missing here; otherwise the numeric conversion
    # at the end would turn them into NaN and leave them in the "cleaned"
    # output.
    coerced = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    null_rows = df[critical_columns].isna().any(axis=1) | coerced.isna().any(axis=1)
    cleanup_details['nulls_removed'] = int(null_rows.sum())
    df = df[~null_rows]

    # Remove duplicates based on call_id
    duplicate_rows = df['call_id'].duplicated()
    cleanup_details['duplicates_removed'] = int(duplicate_rows.sum())
    df = df[~duplicate_rows]

    # Remove short queries / resolutions
    short_rows = (df['query'].str.len() < 5) | (df['resolution'].str.len() < 5)
    cleanup_details['short_removed'] = int(short_rows.sum())
    df = df[~short_rows]

    # Remove malformed queries. The group is non-capturing because
    # str.contains warns when the pattern contains match groups.
    malformed_rows = df['query'].str.contains(
        r'[!?]{2,}|\b(?:Invalid|N/A)\b', regex=True, case=False, na=False
    )
    cleanup_details['malformed_removed'] = int(malformed_rows.sum())
    df = df[~malformed_rows]

    # Validate timestamps (unparseable values become NaT and are dropped)
    invalid_timestamps = pd.to_datetime(df['timestamp'], errors='coerce').isna()
    cleanup_details['invalid_timestamps'] = int(invalid_timestamps.sum())
    # Take an explicit copy so the column assignments below write to an
    # independent frame instead of a slice of the caller's data.
    df = df[~invalid_timestamps].copy()

    # Standardize language (fill missing with 'en')
    df['language'] = df['language'].fillna('en')

    # Convert duration and satisfaction score to numeric
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    cleaned_count = len(df)
    cleanup_details['cleaned'] = cleaned_count
    cleanup_details['removed'] = original_count - cleaned_count

    # Save cleaned CSV for SageMaker/Azure AI
    cleaned_path = 'cleaned_call_center_logs.csv'
    df.to_csv(cleaned_path, index=False)

    return df, cleanup_details, cleaned_path
69
+
70
+ # Statistical plotting function
71
def plot_statistics(df):
    """Render three summary charts to PNG files and return their paths.

    The charts are a histogram of ``duration_minutes``, a box plot of
    ``satisfaction_score`` per ``agent_id``, and a count plot of
    ``language``. Each one is saved under a fixed file name in the current
    working directory and its figure is closed afterwards.

    Args:
        df: DataFrame providing the ``duration_minutes``, ``agent_id``,
            ``satisfaction_score`` and ``language`` columns.

    Returns:
        A list with the three PNG file names, in the order listed above.
    """

    def _render(draw, title, x_label, y_label, filename):
        # One 10x6 figure per chart: draw, label, save, then release it.
        fig, ax = plt.subplots(figsize=(10, 6))
        draw(ax)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        fig.savefig(filename)
        plt.close(fig)
        return filename

    charts = [
        # Plot 1: Distribution of Call Durations
        (
            lambda ax: sns.histplot(
                df['duration_minutes'], bins=20, kde=True, color='skyblue', ax=ax
            ),
            'Distribution of Call Durations',
            'Duration (minutes)',
            'Frequency',
            'duration_distribution.png',
        ),
        # Plot 2: Satisfaction Scores by Agent
        (
            lambda ax: sns.boxplot(
                x='agent_id', y='satisfaction_score', data=df, color='lightblue', ax=ax
            ),
            'Satisfaction Scores by Agent',
            'Agent ID',
            'Satisfaction Score',
            'satisfaction_by_agent.png',
        ),
        # Plot 3: Query Frequency by Language
        (
            lambda ax: sns.countplot(x='language', data=df, color='skyblue', ax=ax),
            'Query Frequency by Language',
            'Language',
            'Number of Queries',
            'query_by_language.png',
        ),
    ]

    return [_render(*chart) for chart in charts]
100
+
101
+ # Generate PDF report
102
def generate_pdf_report(cleanup_details, plot_paths):
    """Write a letter-size PDF with the cleanup statistics and the plots.

    The report starts with a title, followed by one line per entry of
    ``cleanup_details`` and then each plot image scaled to a width of 500
    points with its aspect ratio preserved. A new page is started whenever
    the next image would extend below the 50-point bottom margin. Plot
    paths that do not exist on disk are skipped.

    Args:
        cleanup_details: Mapping of statistic name to value; keys are shown
            with underscores replaced by spaces and title-cased.
        plot_paths: Iterable of image file paths to embed, in order.

    Returns:
        The path of the PDF file that was written.
    """
    left_margin = 50
    bottom_margin = 50
    image_width = 500

    report_path = 'data_analysis_report.pdf'
    _, page_height = letter
    pdf = canvas.Canvas(report_path, pagesize=letter)

    # Title
    pdf.setFont("Helvetica-Bold", 16)
    pdf.drawString(left_margin, page_height - 50, "Call Center Data Analysis Report")

    # Cleanup stats: a heading, then one indented line per statistic.
    pdf.setFont("Helvetica", 12)
    cursor = page_height - 80
    pdf.drawString(left_margin, cursor, "Data Cleanup Statistics:")
    cursor -= 20
    for name, count in cleanup_details.items():
        pdf.drawString(70, cursor, f"{name.replace('_', ' ').title()}: {count}")
        cursor -= 15

    # Plots, stacked top to bottom below the statistics.
    cursor -= 30
    for path in plot_paths:
        if not os.path.exists(path):
            continue

        image = ImageReader(path)
        pixel_width, pixel_height = image.getSize()
        image_height = image_width * (pixel_height / float(pixel_width))

        # Not enough room left on this page: continue at the top of a new one.
        if cursor - image_height < bottom_margin:
            pdf.showPage()
            cursor = page_height - 50

        pdf.drawImage(
            image,
            left_margin,
            cursor - image_height,
            width=image_width,
            height=image_height,
        )
        cursor -= image_height + 20

    pdf.save()
    return report_path
137
+
138
+ # Main analysis function
139
def analyze_data():
    """Run the full pipeline: load, clean, plot, and build the PDF report.

    Reads the CSV at ``CSV_FILE_PATH``, passes it through ``clean_data``,
    ``plot_statistics`` and ``generate_pdf_report``, and packages the
    results for the seven Gradio output components.

    Returns:
        A 7-tuple, in this order: HTML table of the first 50 cleaned rows,
        newline-separated cleanup statistics, the three plot image paths
        (durations, satisfaction by agent, queries by language), a
        ``gr.File`` for the cleaned CSV and a ``gr.File`` for the PDF.
        If any step raises, the first element is an HTML-escaped
        ``"Error: ..."`` message, the second is an empty string and the
        remaining five are ``None``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    import html

    try:
        # Load the CSV
        df = pd.read_csv(CSV_FILE_PATH)

        # Clean the data
        cleaned_df, cleanup_details, cleaned_path = clean_data(df)

        # Generate statistical plots
        plot_paths = plot_statistics(cleaned_df)

        # Generate PDF report
        pdf_path = generate_pdf_report(cleanup_details, plot_paths)

        # Prepare cleanup stats for display
        cleanup_stats = "\n".join(
            f"{key.replace('_', ' ').title()}: {value}"
            for key, value in cleanup_details.items()
        )

        return (
            cleaned_df.head(50).to_html(),  # Display first 50 rows as a table
            cleanup_stats,
            plot_paths[0],  # Duration distribution
            plot_paths[1],  # Satisfaction by agent
            plot_paths[2],  # Query by language
            gr.File(value=cleaned_path, label="Download Cleaned CSV"),
            gr.File(value=pdf_path, label="Download PDF Report")
        )
    except Exception as e:
        # UI boundary: report the failure in the page instead of crashing.
        # The message is shown by an HTML component and may contain text
        # taken from the CSV, so escape it before rendering.
        return f"Error: {html.escape(str(e))}", "", None, None, None, None, None
167
+
168
+ # Gradio interface
169
# Stylesheet passed to gr.Blocks(css=...): dark page background, styled
# boxes, buttons, textboxes and images, plus the centred #app-container
# column and the .text-center helper class used by the layout below.
# NOTE(review): the .gr-* selectors presumably target Gradio's own class
# names; confirm they match the installed Gradio version.
custom_css = """
body {
background: linear-gradient(135deg, #1a1a1a 0%, #2a2a2a 100%);
color: #e0e0e0;
font-family: 'Arial', sans-serif;
display: flex;
justify-content: center;
align-items: center;
min-height: 100vh;
margin: 0;
}
.gr-box {
background: #3a3a3a;
border: 1px solid #4a4a4a;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
}
.gr-button {
background: #1e90ff;
color: white;
border-radius: 5px;
padding: 12px 20px;
margin: 8px 0;
width: 100%;
text-align: center;
transition: background 0.3s ease;
font-size: 16px;
}
.gr-button:hover {
background: #1c86ee;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
}
.gr-textbox {
background: #2f2f2f;
color: #e0e0e0;
border: 1px solid #4a4a4a;
border-radius: 5px;
margin-bottom: 15px;
font-size: 16px;
padding: 15px;
min-height: 120px;
width: 100%;
}
.gr-image {
width: 100%;
height: auto;
max-height: 400px;
}
#app-container {
max-width: 900px;
width: 100%;
padding: 20px;
background: #252525;
border-radius: 12px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);
}
.text-center {
text-align: center;
margin-bottom: 20px;
}
"""
231
+
232
# Page layout: a single centred column with a heading, one trigger button
# and the seven output components that analyze_data() fills.
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_id="app-container"):
        gr.Markdown("# Call Center Data Analysis", elem_classes="text-center")
        gr.Markdown("Analyze call center logs, view statistics, and export cleaned data for SageMaker/Azure AI.", elem_classes="text-center")

        # Button to trigger analysis
        analyze_button = gr.Button("Analyze Data")

        # Outputs. The table shows rows of the *cleaned* frame returned by
        # analyze_data(), so it is labelled accordingly.
        raw_data_output = gr.HTML(label="Cleaned Data (First 50 Rows)")
        cleanup_stats_output = gr.Textbox(label="Data Cleanup Statistics")
        duration_plot_output = gr.Image(label="Distribution of Call Durations")
        satisfaction_plot_output = gr.Image(label="Satisfaction Scores by Agent")
        language_plot_output = gr.Image(label="Query Frequency by Language")
        csv_download = gr.File(label="Download Cleaned CSV")
        pdf_download = gr.File(label="Download PDF Report")

        # Connect the button to the analysis function. The order of the
        # outputs must match the order of the tuple analyze_data() returns.
        analyze_button.click(
            fn=analyze_data,
            inputs=None,
            outputs=[
                raw_data_output,
                cleanup_stats_output,
                duration_plot_output,
                satisfaction_plot_output,
                language_plot_output,
                csv_download,
                pdf_download
            ]
        )

# Start the server only when run as a script, so importing this module
# (e.g. from a test) does not launch it.
if __name__ == "__main__":
    demo.launch()