aiqcamp committed on
Commit
66afa9e
·
verified ·
1 Parent(s): 92535df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -148
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import numpy as np
3
  import matplotlib.pyplot as plt
4
  import time
 
5
  from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
6
  import pandas as pd
7
  from sklearn.feature_extraction.text import CountVectorizer
@@ -11,30 +12,47 @@ import re
11
 
12
  # Download necessary NLTK data
13
  try:
14
- nltk.data.find('tokenizers/punkt')
15
- except LookupError:
 
 
 
 
 
 
 
 
 
 
 
16
  nltk.download('punkt')
17
- try:
18
- nltk.data.find('taggers/averaged_perceptron_tagger')
19
- except LookupError:
20
  nltk.download('averaged_perceptron_tagger')
21
 
22
- # Load Whisper for ASR
23
- asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
 
 
24
 
25
- # Load Grammar Scoring Model (CoLA)
26
- cola_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
27
- cola_tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
28
- grammar_pipeline = pipeline("text-classification", model=cola_model, tokenizer=cola_tokenizer)
29
 
30
- # Load Grammar Correction Model (T5)
31
- correction_pipeline = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")
32
 
33
- # Add sentiment analysis
34
- sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
35
 
36
- # Add fluency analysis (using BERT)
37
- fluency_pipeline = pipeline("text-classification", model="textattack/bert-base-uncased-CoLA")
 
 
 
 
 
 
 
38
 
39
  # Common English filler words to detect
40
  FILLER_WORDS = ["um", "uh", "like", "you know", "actually", "basically", "literally",
@@ -57,38 +75,56 @@ def calculate_speaking_rate(text, duration):
57
 
58
  def analyze_vocabulary_richness(text):
59
  """Analyze vocabulary richness"""
60
- words = word_tokenize(text.lower())
 
 
 
 
 
 
 
61
  if not words:
62
- return 0, 0
63
 
64
  # Vocabulary richness (unique words / total words)
65
  unique_words = set(words)
66
  richness = len(unique_words) / len(words)
67
 
68
- # POS tagging to see variety of word types used
69
- pos_tags = nltk.pos_tag(words)
70
- pos_counts = {}
71
- for _, tag in pos_tags:
72
- pos_counts[tag] = pos_counts.get(tag, 0) + 1
 
 
 
 
73
 
74
  return richness, pos_counts
75
 
76
  def analyze_sentence_complexity(text):
77
- """Analyze sentence complexity"""
78
- sentences = re.split(r'[.!?]+', text)
79
- sentences = [s.strip() for s in sentences if s.strip()]
80
-
81
- if not sentences:
82
- return 0, 0
83
-
84
- # Average words per sentence
85
- words_per_sentence = [len(s.split()) for s in sentences]
86
- avg_words = sum(words_per_sentence) / len(sentences)
87
-
88
- # Sentence length variation (standard deviation)
89
- sentence_length_variation = np.std(words_per_sentence) if len(sentences) > 1 else 0
90
-
91
- return avg_words, sentence_length_variation
 
 
 
 
 
 
 
92
 
93
  def create_detailed_feedback(transcription, grammar_score, corrected_text,
94
  sentiment, fluency, filler_ratio, speaking_rate,
@@ -152,120 +188,208 @@ def process_audio(audio):
152
 
153
  start_time = time.time()
154
 
155
- # Get audio duration (assuming audio[1] contains the sample rate)
156
- sample_rate = 16000 # Default if we can't determine
157
- if isinstance(audio, tuple) and len(audio) > 1:
158
- sample_rate = audio[1]
159
-
160
- # For file uploads, we need to handle differently
161
- if isinstance(audio, str):
162
- # This is a file path
163
- import librosa
164
- y, sr = librosa.load(audio, sr=None)
165
- duration = librosa.get_duration(y=y, sr=sr)
166
- else:
167
- # Assuming a tuple with (samples, sample_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  try:
169
- duration = len(audio[0]) / sample_rate if sample_rate > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  except:
171
- duration = 0
172
-
173
- # Step 1: Transcription
174
- transcription_result = asr_pipeline(audio)
175
- transcription = transcription_result["text"]
176
-
177
- # Step 2: Grammar Scoring
178
- score_output = grammar_pipeline(transcription)[0]
179
- label = score_output["label"]
180
- confidence = score_output["score"]
181
- grammar_score = f"{label} ({confidence:.2f})"
182
-
183
- # Step 3: Grammar Correction
184
- corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]
185
-
186
- # Step 4: Sentiment Analysis
187
- sentiment_result = sentiment_pipeline(transcription)[0]
188
- sentiment = sentiment_result["label"]
189
- sentiment_score = sentiment_result["score"]
190
-
191
- # Step 5: Fluency Analysis
192
- fluency_result = fluency_pipeline(transcription)[0]
193
- fluency_score = fluency_result["score"] if fluency_result["label"] == "acceptable" else 1 - fluency_result["score"]
194
-
195
- # Step 6: Filler Words Analysis
196
- filler_count, filler_ratio = count_filler_words(transcription)
197
-
198
- # Step 7: Speaking Rate
199
- speaking_rate = calculate_speaking_rate(transcription, duration)
200
-
201
- # Step 8: Vocabulary Richness
202
- vocab_richness, pos_counts = analyze_vocabulary_richness(transcription)
203
-
204
- # Step 9: Sentence Complexity
205
- avg_words, sentence_variation = analyze_sentence_complexity(transcription)
206
-
207
- # Create feedback
208
- feedback = create_detailed_feedback(
209
- transcription, grammar_score, corrected, sentiment,
210
- fluency_score, filler_ratio, speaking_rate, vocab_richness, avg_words
211
- )
212
-
213
- # Create metrics visualization
214
- fig, ax = plt.subplots(figsize=(10, 6))
215
-
216
- # Define metrics for radar chart
217
- categories = ['Grammar', 'Fluency', 'Vocabulary', 'Speaking Rate', 'Clarity']
218
-
219
- # Normalize scores between 0 and 1
220
- grammar_norm = confidence if label == "acceptable" else 1 - confidence
221
- speaking_rate_norm = max(0, min(1, 1 - abs((speaking_rate - 140) / 100))) # Optimal around 140 wpm
222
-
223
- values = [
224
- grammar_norm,
225
- fluency_score,
226
- vocab_richness,
227
- speaking_rate_norm,
228
- 1 - filler_ratio # Lower filler ratio is better
229
- ]
230
-
231
- # Complete the loop for the radar chart
232
- values += values[:1]
233
- categories += categories[:1]
234
-
235
- # Convert to radians and plot
236
- angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
237
- angles += angles[:1]
238
-
239
- ax.plot(angles, values, linewidth=2, linestyle='solid')
240
- ax.fill(angles, values, alpha=0.25)
241
- ax.set_yticklabels([])
242
- ax.set_xticks(angles[:-1])
243
- ax.set_xticklabels(categories[:-1])
244
- ax.grid(True)
245
- plt.title('Speaking Performance Metrics', size=15, color='navy', y=1.1)
246
-
247
- # Create detailed analysis text
248
- processing_time = time.time() - start_time
249
- detailed_analysis = f"""
250
- ## Detailed Speech Analysis
251
 
252
- **Processing Time:** {processing_time:.2f} seconds
253
- **Audio Duration:** {duration:.2f} seconds
254
 
255
- ### Metrics:
256
- - **Grammar Score:** {confidence:.2f} ({label})
257
- - **Fluency Score:** {fluency_score:.2f}
258
- - **Speaking Rate:** {speaking_rate:.1f} words per minute
259
- - **Vocabulary Richness:** {vocab_richness:.2f} (higher is better)
260
- - **Filler Words:** {filler_count} occurrences ({filler_ratio:.1%} of speech)
261
- - **Avg Words Per Sentence:** {avg_words:.1f}
262
- - **Sentiment:** {sentiment} ({sentiment_score:.2f})
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
- ### Word Types Used:
265
- {', '.join([f"{k}: {v}" for k, v in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True)[:5]])}
266
- """
267
-
268
- return transcription, grammar_score, corrected, feedback, fig, detailed_analysis
269
 
270
  # Create theme
271
  theme = gr.themes.Soft(
 
2
  import numpy as np
3
  import matplotlib.pyplot as plt
4
  import time
5
+ import os
6
  from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
7
  import pandas as pd
8
  from sklearn.feature_extraction.text import CountVectorizer
 
12
 
13
  # Download necessary NLTK data
14
  try:
15
+ # Make the download more reliable by specifying download directory
16
+ nltk_data_dir = '/home/user/nltk_data'
17
+ os.makedirs(nltk_data_dir, exist_ok=True)
18
+
19
+ # Download all required resources
20
+ nltk.download('punkt', download_dir=nltk_data_dir)
21
+ nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
22
+
23
+ # Set the data path to include our custom directory
24
+ nltk.data.path.insert(0, nltk_data_dir)
25
+ except Exception as e:
26
+ print(f"NLTK download issue: {e}")
27
+ # Fall back to NLTK's default download location if the custom-directory approach fails
28
  nltk.download('punkt')
 
 
 
29
  nltk.download('averaged_perceptron_tagger')
30
 
31
+ # Add error handling around model loading
32
+ try:
33
+ # Load Whisper for ASR
34
+ asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
35
 
36
+ # Load Grammar Scoring Model (CoLA)
37
+ cola_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
38
+ cola_tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
39
+ grammar_pipeline = pipeline("text-classification", model=cola_model, tokenizer=cola_tokenizer)
40
 
41
+ # Load Grammar Correction Model (T5)
42
+ correction_pipeline = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")
43
 
44
+ # Add sentiment analysis
45
+ sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
46
 
47
+ # Add fluency analysis (using BERT)
48
+ fluency_pipeline = pipeline("text-classification", model="textattack/bert-base-uncased-CoLA")
49
+
50
+ # Set variables to track loaded models
51
+ MODELS_LOADED = True
52
+ except Exception as e:
53
+ print(f"Error loading models: {e}")
54
+ # Set variable to track failed model loading
55
+ MODELS_LOADED = False
56
 
57
  # Common English filler words to detect
58
  FILLER_WORDS = ["um", "uh", "like", "you know", "actually", "basically", "literally",
 
75
 
76
  def analyze_vocabulary_richness(text):
77
  """Analyze vocabulary richness"""
78
+ # Tokenize with NLTK's word_tokenize, falling back to a simple regex split if NLTK raises LookupError (e.g. missing tokenizer data)
79
+ try:
80
+ # Try using word_tokenize first
81
+ words = word_tokenize(text.lower())
82
+ except LookupError:
83
+ # Fallback to simple regex-based tokenization if NLTK fails
84
+ words = re.findall(r'\b\w+\b', text.lower())
85
+
86
  if not words:
87
+ return 0, {}
88
 
89
  # Vocabulary richness (unique words / total words)
90
  unique_words = set(words)
91
  richness = len(unique_words) / len(words)
92
 
93
+ # Use simple POS tagging or skip it if NLTK fails
94
+ try:
95
+ pos_tags = nltk.pos_tag(words)
96
+ pos_counts = {}
97
+ for _, tag in pos_tags:
98
+ pos_counts[tag] = pos_counts.get(tag, 0) + 1
99
+ except Exception:
100
+ # Return simplified count if POS tagging fails
101
+ pos_counts = {"WORD": len(words), "UNIQUE": len(unique_words)}
102
 
103
  return richness, pos_counts
104
 
105
  def analyze_sentence_complexity(text):
106
+ """Analyze sentence complexity with error handling"""
107
+ try:
108
+ # Simple sentence splitting by punctuation
109
+ sentences = re.split(r'[.!?]+', text)
110
+ sentences = [s.strip() for s in sentences if s.strip()]
111
+
112
+ if not sentences:
113
+ return 0, 0
114
+
115
+ # Average words per sentence
116
+ words_per_sentence = [len(s.split()) for s in sentences]
117
+ avg_words = sum(words_per_sentence) / len(sentences)
118
+
119
+ # Sentence length variation (standard deviation)
120
+ sentence_length_variation = np.std(words_per_sentence) if len(sentences) > 1 else 0
121
+
122
+ return avg_words, sentence_length_variation
123
+ except Exception:
124
+ # In case of any error, return simple defaults
125
+ word_count = len(text.split())
126
+ # Estimate words per sentence by dividing the word count by the number of '.', '!' and '?' marks (at least 1)
127
+ return word_count / max(1, text.count('.') + text.count('!') + text.count('?')), 0
128
 
129
  def create_detailed_feedback(transcription, grammar_score, corrected_text,
130
  sentiment, fluency, filler_ratio, speaking_rate,
 
188
 
189
  start_time = time.time()
190
 
191
+ # Check if models loaded properly
192
+ if 'MODELS_LOADED' in globals() and not MODELS_LOADED:
193
+ return ("Models failed to load. Please check the logs for details.",
194
+ "Error", "Error", "Unable to process audio due to model loading issues.",
195
+ None, "## Error\nThe required models couldn't be loaded. Please check the system configuration.")
196
+
197
+ try:
198
+ # Get audio duration (assuming audio[1] contains the sample rate)
199
+ sample_rate = 16000 # Default if we can't determine
200
+ if isinstance(audio, tuple) and len(audio) > 1:
201
+ sample_rate = audio[1]
202
+
203
+ # For file uploads, we need to handle differently
204
+ duration = 0
205
+ if isinstance(audio, str):
206
+ # This is a file path
207
+ try:
208
+ import librosa
209
+ y, sr = librosa.load(audio, sr=None)
210
+ duration = librosa.get_duration(y=y, sr=sr)
211
+ except Exception as e:
212
+ print(f"Error getting duration: {e}")
213
+ # Estimate duration based on file size
214
+ try:
215
+ file_size = os.path.getsize(audio)
216
+ # Rough estimate: 16kHz, 16-bit audio is about 32KB per second
217
+ duration = file_size / 32000
218
+ except:
219
+ duration = 10 # Default to 10 seconds if we can't determine
220
+ else:
221
+ # Assuming a tuple with (samples, sample_rate)
222
+ try:
223
+ duration = len(audio[0]) / sample_rate if sample_rate > 0 else 0
224
+ except:
225
+ duration = 10 # Default duration
226
+
227
+ # Step 1: Transcription
228
+ try:
229
+ transcription_result = asr_pipeline(audio)
230
+ transcription = transcription_result["text"]
231
+ except Exception as e:
232
+ print(f"Transcription error: {e}")
233
+ return ("Error in speech recognition. Please try again.",
234
+ "Error", "Error", "There was an error processing your audio.",
235
+ None, f"## Error\nError in speech recognition: {str(e)[:100]}...")
236
+
237
+ if not transcription or transcription.strip() == "":
238
+ return ("No speech detected. Please speak louder or check your microphone.",
239
+ "N/A", "N/A", "No speech detected in the audio.",
240
+ None, "## No Speech Detected\nPlease try recording again with clearer speech.")
241
+
242
+ # Step 2: Grammar Scoring
243
  try:
244
+ score_output = grammar_pipeline(transcription)[0]
245
+ label = score_output["label"]
246
+ confidence = score_output["score"]
247
+ grammar_score = f"{label} ({confidence:.2f})"
248
+ except Exception as e:
249
+ print(f"Grammar scoring error: {e}")
250
+ label = "UNKNOWN"
251
+ confidence = 0.5
252
+ grammar_score = "Could not analyze grammar"
253
+
254
+ # Step 3: Grammar Correction
255
+ try:
256
+ corrected = correction_pipeline(transcription, max_length=128)[0]["generated_text"]
257
+ except Exception as e:
258
+ print(f"Grammar correction error: {e}")
259
+ corrected = transcription
260
+
261
+ # Step 4: Sentiment Analysis
262
+ try:
263
+ sentiment_result = sentiment_pipeline(transcription)[0]
264
+ sentiment = sentiment_result["label"]
265
+ sentiment_score = sentiment_result["score"]
266
+ except Exception as e:
267
+ print(f"Sentiment analysis error: {e}")
268
+ sentiment = "NEUTRAL"
269
+ sentiment_score = 0.5
270
+
271
+ # Step 5: Fluency Analysis
272
+ try:
273
+ fluency_result = fluency_pipeline(transcription)[0]
274
+ fluency_score = fluency_result["score"] if fluency_result["label"] == "acceptable" else 1 - fluency_result["score"]
275
+ except Exception as e:
276
+ print(f"Fluency analysis error: {e}")
277
+ fluency_score = 0.5
278
+
279
+ # Step 6: Filler Words Analysis
280
+ try:
281
+ filler_count, filler_ratio = count_filler_words(transcription)
282
+ except Exception as e:
283
+ print(f"Filler word analysis error: {e}")
284
+ filler_count, filler_ratio = 0, 0
285
+
286
+ # Step 7: Speaking Rate
287
+ try:
288
+ speaking_rate = calculate_speaking_rate(transcription, duration)
289
+ except Exception as e:
290
+ print(f"Speaking rate calculation error: {e}")
291
+ speaking_rate = 0
292
+
293
+ # Step 8: Vocabulary Richness
294
+ try:
295
+ vocab_richness, pos_counts = analyze_vocabulary_richness(transcription)
296
+ except Exception as e:
297
+ print(f"Vocabulary analysis error: {e}")
298
+ vocab_richness, pos_counts = 0.5, {"N/A": 1}
299
+
300
+ # Step 9: Sentence Complexity
301
+ try:
302
+ avg_words, sentence_variation = analyze_sentence_complexity(transcription)
303
+ except Exception as e:
304
+ print(f"Sentence complexity analysis error: {e}")
305
+ avg_words, sentence_variation = 0, 0
306
+
307
+ # Create feedback
308
+ try:
309
+ feedback = create_detailed_feedback(
310
+ transcription, grammar_score, corrected, sentiment,
311
+ fluency_score, filler_ratio, speaking_rate, vocab_richness, avg_words
312
+ )
313
+ except Exception as e:
314
+ print(f"Feedback creation error: {e}")
315
+ feedback = "Error generating detailed feedback."
316
+
317
+ # Create metrics visualization
318
+ try:
319
+ fig, ax = plt.subplots(figsize=(10, 6))
320
+
321
+ # Define metrics for radar chart
322
+ categories = ['Grammar', 'Fluency', 'Vocabulary', 'Speaking Rate', 'Clarity']
323
+
324
+ # Normalize scores between 0 and 1
325
+ grammar_norm = confidence if label == "acceptable" else 1 - confidence
326
+ speaking_rate_norm = max(0, min(1, 1 - abs((speaking_rate - 140) / 100))) # Optimal around 140 wpm
327
+
328
+ values = [
329
+ grammar_norm,
330
+ fluency_score,
331
+ vocab_richness,
332
+ speaking_rate_norm,
333
+ 1 - filler_ratio # Lower filler ratio is better
334
+ ]
335
+
336
+ # Complete the loop for the radar chart
337
+ values += values[:1]
338
+ categories += categories[:1]
339
+
340
+ # Convert to radians and plot
341
+ angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
342
+ angles += angles[:1]
343
+
344
+ ax.plot(angles, values, linewidth=2, linestyle='solid')
345
+ ax.fill(angles, values, alpha=0.25)
346
+ ax.set_yticklabels([])
347
+ ax.set_xticks(angles[:-1])
348
+ ax.set_xticklabels(categories[:-1])
349
+ ax.grid(True)
350
+ plt.title('Speaking Performance Metrics', size=15, color='navy', y=1.1)
351
+ except Exception as e:
352
+ print(f"Visualization error: {e}")
353
+ # Create a simple error figure
354
+ fig, ax = plt.subplots(figsize=(6, 3))
355
+ ax.text(0.5, 0.5, "Error creating visualization",
356
+ horizontalalignment='center', verticalalignment='center')
357
+ ax.axis('off')
358
+
359
+ # Create detailed analysis text
360
+ processing_time = time.time() - start_time
361
+ try:
362
+ pos_counts_str = ', '.join([f"{k}: {v}" for k, v in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True)[:5]])
363
  except:
364
+ pos_counts_str = "N/A"
365
+
366
+ detailed_analysis = f"""
367
+ ## Detailed Speech Analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
+ **Processing Time:** {processing_time:.2f} seconds
370
+ **Audio Duration:** {duration:.2f} seconds
371
 
372
+ ### Metrics:
373
+ - **Grammar Score:** {confidence:.2f} ({label})
374
+ - **Fluency Score:** {fluency_score:.2f}
375
+ - **Speaking Rate:** {speaking_rate:.1f} words per minute
376
+ - **Vocabulary Richness:** {vocab_richness:.2f} (higher is better)
377
+ - **Filler Words:** {filler_count} occurrences ({filler_ratio:.1%} of speech)
378
+ - **Avg Words Per Sentence:** {avg_words:.1f}
379
+ - **Sentiment:** {sentiment} ({sentiment_score:.2f})
380
+
381
+ ### Word Types Used:
382
+ {pos_counts_str}
383
+ """
384
+
385
+ return transcription, grammar_score, corrected, feedback, fig, detailed_analysis
386
+
387
+ except Exception as e:
388
+ print(f"Unexpected error in process_audio: {e}")
389
+ return ("An unexpected error occurred during processing.",
390
+ "Error", "Error", "There was an unexpected error processing your audio.",
391
+ None, f"## Unexpected Error\n\nAn error occurred: {str(e)[:200]}...")
392
 
 
 
 
 
 
393
 
394
  # Create theme
395
  theme = gr.themes.Soft(