vukosi committed on
Commit
1be67ab
·
verified ·
1 Parent(s): 6432040

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +423 -573
app.py CHANGED
@@ -1,627 +1,477 @@
1
  import gradio as gr
2
- import logging
 
 
3
  import time
4
- import json
5
- import csv
6
- import io
7
- from transformers import pipeline
8
- from typing import Tuple, Optional, List, Dict
9
- import traceback
10
- from datetime import datetime
11
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Configure logging for debugging
14
- logging.basicConfig(level=logging.INFO)
15
- logger = logging.getLogger(__name__)
16
 
17
- class LinguisticTranslationApp:
18
- def __init__(self):
19
- self.translators = {}
20
- self.translation_history = []
21
- self.load_models()
22
 
23
- def load_models(self):
24
- """Load translation models with error handling"""
25
- try:
26
- logger.info("Loading translation models...")
27
- self.translators['en_to_ss'] = pipeline(
28
- "translation",
29
- model="dsfsi/en-ss-m2m100-combo",
30
- src_lang="en",
31
- tgt_lang="ss"
32
- )
33
- self.translators['ss_to_en'] = pipeline(
34
- "translation",
35
- model="dsfsi/ss-en-m2m100-combo",
36
- src_lang="ss",
37
- tgt_lang="en"
38
- )
39
- logger.info("Models loaded successfully!")
40
- except Exception as e:
41
- logger.error(f"Error loading models: {str(e)}")
42
- raise e
43
 
44
- def analyze_text_complexity(self, text: str, lang: str) -> Dict:
45
- """Analyze linguistic features of the input text"""
46
- words = text.split()
47
- sentences = re.split(r'[.!?]+', text)
48
- sentences = [s.strip() for s in sentences if s.strip()]
49
-
50
- # Basic linguistic metrics
51
- analysis = {
52
- 'character_count': len(text),
53
- 'word_count': len(words),
54
- 'sentence_count': len(sentences),
55
- 'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0,
56
- 'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
57
- 'unique_words': len(set(word.lower() for word in words)),
58
- 'lexical_diversity': len(set(word.lower() for word in words)) / len(words) if words else 0
59
- }
60
-
61
- # Language-specific features
62
- if lang == 'ss': # Siswati
63
- # Check for common Siswati features
64
- analysis['potential_agglutination'] = sum(1 for word in words if len(word) > 10)
65
- analysis['click_consonants'] = sum(text.count(click) for click in ['c', 'q', 'x'])
66
- analysis['tone_markers'] = text.count('́') + text.count('̀') # Acute and grave accents
67
-
68
- return analysis
69
 
70
- def translate_text(self, text: str, direction: str, save_to_history: bool = True) -> Tuple[str, str, bool, Dict]:
71
- """
72
- Translate text with comprehensive linguistic analysis
73
-
74
- Returns:
75
- Tuple[str, str, bool, Dict]: (translated_text, status_message, success, analysis)
76
- """
77
- if not text or not text.strip():
78
- return "", "⚠️ Please enter some text to translate", False, {}
79
-
80
- if not direction:
81
- return "", "⚠️ Please select a translation direction", False, {}
82
-
83
- # Input validation
84
- if len(text) > 2000: # Increased limit for linguistic work
85
- return "", "⚠️ Text is too long. Please limit to 2000 characters.", False, {}
86
-
87
- try:
88
- start_time = time.time()
89
-
90
- # Determine source and target languages
91
- if direction == 'English → Siswati':
92
- translator = self.translators['en_to_ss']
93
- source_lang = "English"
94
- target_lang = "Siswati"
95
- source_code = "en"
96
- target_code = "ss"
97
- else:
98
- translator = self.translators['ss_to_en']
99
- source_lang = "Siswati"
100
- target_lang = "English"
101
- source_code = "ss"
102
- target_code = "en"
103
-
104
- logger.info(f"Translating from {source_lang} to {target_lang}")
105
-
106
- # Analyze source text
107
- source_analysis = self.analyze_text_complexity(text, source_code)
108
-
109
- # Perform translation
110
- result = translator(
111
- text,
112
- max_length=512,
113
- early_stopping=True,
114
- do_sample=False,
115
- num_beams=4 # Better quality for linguistic analysis
116
- )
117
-
118
- translation = result[0]['translation_text']
119
-
120
- # Analyze translated text
121
- target_analysis = self.analyze_text_complexity(translation, target_code)
122
-
123
- # Calculate processing time
124
- processing_time = time.time() - start_time
125
-
126
- # Linguistic comparison
127
- analysis = {
128
- 'source': source_analysis,
129
- 'target': target_analysis,
130
- 'translation_ratio': len(translation) / len(text) if text else 0,
131
- 'word_ratio': target_analysis['word_count'] / source_analysis['word_count'] if source_analysis['word_count'] else 0,
132
- 'processing_time': processing_time,
133
- 'timestamp': datetime.now().isoformat()
134
- }
135
 
136
- # Save to history for linguistic research
137
- if save_to_history:
138
- history_entry = {
139
- 'source_text': text,
140
- 'translated_text': translation,
141
- 'direction': direction,
142
- 'source_lang': source_lang,
143
- 'target_lang': target_lang,
144
- 'analysis': analysis,
145
- 'timestamp': datetime.now().isoformat()
146
- }
147
- self.translation_history.append(history_entry)
148
 
149
- # Success message with linguistic metadata
150
- status_msg = f"✅ Translation completed in {processing_time:.2f}s | Word ratio: {analysis['word_ratio']:.2f} | Character ratio: {analysis['translation_ratio']:.2f}"
 
 
151
 
152
- logger.info(f"Translation completed: {processing_time:.2f}s")
 
 
153
 
154
- return translation, status_msg, True, analysis
 
155
 
156
- except Exception as e:
157
- error_msg = f"❌ Translation failed: {str(e)}"
158
- logger.error(f"Translation error: {str(e)}")
159
- logger.error(traceback.format_exc())
160
- return "", error_msg, False, {}
161
-
162
- def batch_translate(self, text_list: List[str], direction: str) -> List[Dict]:
163
- """Translate multiple texts for corpus analysis"""
164
- results = []
165
- for i, text in enumerate(text_list):
166
- if text.strip():
167
- translation, status, success, analysis = self.translate_text(text, direction, False)
168
- results.append({
169
- 'index': i + 1,
170
- 'source': text,
171
- 'translation': translation,
172
- 'success': success,
173
- 'analysis': analysis
174
- })
175
- return results
176
-
177
- def export_history_csv(self) -> str:
178
- """Export translation history as CSV for linguistic analysis"""
179
- if not self.translation_history:
180
- return None
181
 
182
- output = io.StringIO()
183
- writer = csv.writer(output)
184
 
185
- # Headers
186
- writer.writerow([
187
- 'Timestamp', 'Source Language', 'Target Language', 'Source Text',
188
- 'Translation', 'Source Words', 'Target Words', 'Word Ratio',
189
- 'Source Characters', 'Target Characters', 'Character Ratio',
190
- 'Lexical Diversity (Source)', 'Lexical Diversity (Target)',
191
- 'Processing Time (s)'
192
- ])
193
 
194
- # Data rows
195
- for entry in self.translation_history:
196
- analysis = entry['analysis']
197
- writer.writerow([
198
- entry['timestamp'],
199
- entry['source_lang'],
200
- entry['target_lang'],
201
- entry['source_text'],
202
- entry['translated_text'],
203
- analysis['source']['word_count'],
204
- analysis['target']['word_count'],
205
- analysis['word_ratio'],
206
- analysis['source']['character_count'],
207
- analysis['target']['character_count'],
208
- analysis['translation_ratio'],
209
- analysis['source']['lexical_diversity'],
210
- analysis['target']['lexical_diversity'],
211
- analysis['processing_time']
212
- ])
213
 
214
- return output.getvalue()
215
-
216
- # Initialize the app
217
- app = LinguisticTranslationApp()
218
-
219
- # Custom CSS for linguistic interface
220
- custom_css = """
221
- #logo {
222
- display: block;
223
- margin: 0 auto 20px auto;
224
- }
225
 
226
- .linguistic-panel {
227
- background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
228
- border: 1px solid #0891b2;
229
- border-radius: 12px;
230
- padding: 20px;
231
- margin: 10px 0;
232
- }
233
 
234
- .analysis-metric {
235
- background: white;
236
- padding: 10px;
237
- border-radius: 8px;
238
- margin: 5px;
239
- border-left: 4px solid #0891b2;
240
- }
241
 
242
- .status-success {
243
- color: #059669 !important;
244
- font-weight: 500;
245
- }
 
 
 
 
246
 
247
- .status-error {
248
- color: #DC2626 !important;
249
- font-weight: 500;
250
- }
 
 
 
 
 
 
251
 
252
- .gradient-text {
253
- background: linear-gradient(45deg, #059669, #0891b2);
254
- -webkit-background-clip: text;
255
- -webkit-text-fill-color: transparent;
256
- background-clip: text;
257
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- .linguistic-header {
260
- text-align: center;
261
- margin-bottom: 30px;
262
- padding: 20px;
263
- background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
264
- border-radius: 16px;
265
- border: 1px solid #cbd5e1;
266
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
- .comparison-grid {
269
- display: grid;
270
- grid-template-columns: 1fr 1fr;
271
- gap: 15px;
272
- margin: 15px 0;
273
- }
 
 
 
 
 
274
 
275
- .metric-card {
276
- background: white;
277
- padding: 15px;
278
- border-radius: 8px;
279
- border: 1px solid #e2e8f0;
280
- text-align: center;
281
- }
282
- """
283
-
284
- # Create the Gradio interface
285
- with gr.Blocks(css=custom_css, title="Linguistic Translation Analysis Tool", theme=gr.themes.Soft()) as demo:
286
-
287
- # Header section
288
- with gr.Row():
289
- with gr.Column():
290
- gr.HTML("""
291
- <div class='linguistic-header'>
292
- <h1 class='gradient-text' style='font-size: 2.5em; margin-bottom: 10px;'>
293
- 🔬 Siswati ⇄ English Linguistic Analysis Tool
294
- </h1>
295
- <p style='font-size: 1.1em; color: #475569; max-width: 800px; margin: 0 auto;'>
296
- Advanced translation system with comprehensive linguistic analysis for researchers,
297
- linguists, and language documentation projects. Includes morphological insights,
298
- statistical analysis, and corpus management features.
299
- </p>
300
- </div>
301
- """)
302
 
303
- # Main translation interface
304
- with gr.Row():
305
- with gr.Column(scale=2):
306
- # Input section
307
- with gr.Group():
308
- gr.HTML("<h3>📝 Translation Input</h3>")
309
- direction = gr.Radio(
310
- choices=['English Siswati', 'Siswati English'],
311
- label="Translation Direction",
312
- value='English Siswati',
313
- interactive=True
314
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- input_text = gr.Textbox(
317
- lines=6,
318
- placeholder="Enter your text here for linguistic analysis... (maximum 2000 characters)",
319
- label="Source Text",
320
- max_lines=12,
321
- show_copy_button=True
322
  )
323
 
324
- char_count = gr.HTML("Character count: 0/2000")
 
 
 
 
 
 
 
 
 
325
 
326
- with gr.Row():
327
- translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
328
- clear_btn = gr.Button("🗑️ Clear", variant="secondary")
329
-
330
- # Output section
331
- with gr.Group():
332
- gr.HTML("<h3>✨ Translation Output</h3>")
333
- output_text = gr.Textbox(
334
- label="Translation",
335
- lines=6,
336
- max_lines=12,
337
- show_copy_button=True,
338
- interactive=False
339
  )
340
- status_display = gr.HTML()
341
-
342
- # Linguistic analysis panel
343
- with gr.Column(scale=1):
344
- with gr.Group():
345
- gr.HTML("<h3>📊 Linguistic Analysis</h3>")
346
-
347
- # Real-time metrics
348
- with gr.Accordion("📈 Text Metrics", open=True):
349
- metrics_display = gr.HTML("""
350
- <div style='text-align: center; color: #64748b; padding: 20px;'>
351
- <em>Translate text to see linguistic analysis</em>
352
- </div>
353
- """)
354
 
355
- # Language-specific features
356
- with gr.Accordion("🔍 Language Features", open=False):
357
- features_display = gr.HTML("")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
- # Translation quality indicators
360
- with gr.Accordion("⚖️ Translation Ratios", open=False):
361
- ratios_display = gr.HTML("")
362
-
363
- # Batch processing section
364
- with gr.Accordion("📚 Batch Translation & Corpus Analysis", open=False):
365
- with gr.Row():
366
- with gr.Column():
367
- gr.HTML("<h4>Upload text file or enter multiple lines:</h4>")
368
- batch_input = gr.File(
369
- label="Upload .txt file",
370
- file_types=[".txt"],
371
- type="filepath"
372
- )
373
- batch_text = gr.Textbox(
374
- lines=8,
375
- placeholder="Or paste multiple lines here (one per line)...",
376
- label="Batch Text Input",
377
- show_copy_button=True
378
- )
379
- batch_direction = gr.Radio(
380
- choices=['English → Siswati', 'Siswati → English'],
381
- label="Batch Translation Direction",
382
- value='English → Siswati'
383
  )
384
- batch_btn = gr.Button("🔄 Process Batch", variant="primary")
385
 
386
- with gr.Column():
387
- batch_results = gr.Dataframe(
388
- headers=["Index", "Source", "Translation", "Words (S→T)", "Chars (S→T)"],
389
- label="Batch Results",
390
- interactive=False
391
- )
392
-
393
- # Research tools section
394
- with gr.Accordion("🔬 Research & Export Tools", open=False):
395
- with gr.Row():
396
- with gr.Column():
397
- gr.HTML("<h4>Translation History & Export</h4>")
398
- history_display = gr.Dataframe(
399
- headers=["Timestamp", "Direction", "Source", "Translation"],
400
- label="Translation History",
401
- interactive=False
402
- )
403
 
404
  with gr.Row():
405
- refresh_history_btn = gr.Button("🔄 Refresh History")
406
- export_csv_btn = gr.Button("📊 Export CSV", variant="secondary")
407
- clear_history_btn = gr.Button("🗑️ Clear History", variant="stop")
 
 
 
 
 
 
 
 
 
 
408
 
409
- csv_download = gr.File(label="Download CSV", visible=False)
410
-
411
- with gr.Column():
412
- gr.HTML("<h4>Linguistic Resources</h4>")
413
- gr.HTML("""
414
- <div style='background: #f8fafc; padding: 20px; border-radius: 8px; border: 1px solid #e2e8f0;'>
415
- <h5>📖 Siswati Language Notes:</h5>
416
- <ul style='text-align: left; margin: 10px 0;'>
417
- <li><strong>Script:</strong> Latin alphabet</li>
418
- <li><strong>Family:</strong> Niger-Congo, Bantu</li>
419
- <li><strong>Features:</strong> Agglutinative, click consonants</li>
420
- <li><strong>Speakers:</strong> ~2.3 million (Eswatini, South Africa)</li>
421
- </ul>
422
- <h5>🔧 Research Features:</h5>
423
- <ul style='text-align: left; margin: 10px 0;'>
424
- <li>Morphological complexity analysis</li>
425
- <li>Translation ratio tracking</li>
426
- <li>Lexical diversity measurement</li>
427
- <li>Batch processing for corpora</li>
428
- <li>Export capabilities for further analysis</li>
429
- </ul>
430
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
431
  """)
432
-
433
- # Examples for linguists
434
- with gr.Accordion("💡 Linguistic Examples", open=False):
435
- examples = gr.Examples(
436
- examples=[
437
- ["The child is playing with traditional toys.", "English → Siswati"],
438
- ["Umntfwana udlala ngetinsisimane tesintu.", "Siswati → English"],
439
- ["Agglutination demonstrates morphological complexity in Bantu languages.", "English → Siswati"],
440
- ["Lolimi lune-morphology leyinkimbinkimbi.", "Siswati → English"],
441
- ["What are the phonological features of this language?", "English → Siswati"],
442
- ["Yini tinchubo te-phonology talolimi?", "Siswati → English"],
443
- ],
444
- inputs=[input_text, direction],
445
- label="Click examples to analyze linguistic features:"
446
- )
447
-
448
- # Footer
449
- with gr.Row():
450
- with gr.Column():
451
- gr.HTML("""
452
- <div style='text-align: center; margin-top: 40px; padding: 30px; border-top: 1px solid #E5E7EB; background: #f8fafc;'>
453
- <div style='margin-bottom: 20px;'>
454
- <a href='https://github.com/dsfsi/en-ss-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 En→Ss Model Repository</a>
455
- <a href='https://github.com/dsfsi/ss-en-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 Ss→En Model Repository</a>
456
- <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>💬 Research Feedback</a>
457
- </div>
458
- <div style='color: #475569; font-size: 0.95em;'>
459
- <strong>Research Team:</strong> Vukosi Marivate, Richard Lastrucci<br>
460
- <em>Supporting African language documentation and computational linguistics research</em><br>
461
- <small style='color: #64748b; margin-top: 10px; display: block;'>
462
- For academic use: Please cite the original models in your publications
463
- </small>
464
- </div>
465
- </div>
466
- """)
467
-
468
- # Event handlers
469
- def update_char_count(text):
470
- count = len(text) if text else 0
471
- color = "#DC2626" if count > 2000 else "#059669" if count > 1600 else "#64748b"
472
- return f"<span style='color: {color}; font-weight: 500;'>Character count: {count}/2000</span>"
473
-
474
- def clear_all():
475
- return "", "", "Character count: 0/2000", "", "", "", ""
476
-
477
- def translate_with_analysis(text, direction):
478
- translation, status, success, analysis = app.translate_text(text, direction)
479
- status_html = f"<div class='{'status-success' if success else 'status-error'}'>{status}</div>"
480
 
481
- if success and analysis:
482
- # Create metrics display
483
- source_metrics = analysis['source']
484
- target_metrics = analysis['target']
485
-
486
- metrics_html = f"""
487
- <div class='comparison-grid'>
488
- <div class='metric-card'>
489
- <h5>📊 Source Text</h5>
490
- <p><strong>Words:</strong> {source_metrics['word_count']}</p>
491
- <p><strong>Characters:</strong> {source_metrics['character_count']}</p>
492
- <p><strong>Sentences:</strong> {source_metrics['sentence_count']}</p>
493
- <p><strong>Lexical Diversity:</strong> {source_metrics['lexical_diversity']:.3f}</p>
494
- </div>
495
- <div class='metric-card' style='border-left: 4px solid #059669;'>
496
- <h5>📊 Translation</h5>
497
- <p><strong>Words:</strong> {target_metrics['word_count']}</p>
498
- <p><strong>Characters:</strong> {target_metrics['character_count']}</p>
499
- <p><strong>Sentences:</strong> {target_metrics['sentence_count']}</p>
500
- <p><strong>Lexical Diversity:</strong> {target_metrics['lexical_diversity']:.3f}</p>
501
- </div>
502
- </div>
503
- """
504
-
505
- # Language features
506
- features_html = ""
507
- if 'potential_agglutination' in source_metrics:
508
- features_html = f"""
509
- <div class='analysis-metric'>
510
- <h5>🔍 Siswati Features Detected:</h5>
511
- <p><strong>Potential agglutinated words:</strong> {source_metrics['potential_agglutination']}</p>
512
- <p><strong>Click consonants (c,q,x):</strong> {source_metrics['click_consonants']}</p>
513
- <p><strong>Tone markers:</strong> {source_metrics['tone_markers']}</p>
514
- </div>
515
- """
516
-
517
- # Translation ratios
518
- ratios_html = f"""
519
- <div class='analysis-metric'>
520
- <h5>⚖️ Translation Ratios:</h5>
521
- <p><strong>Word ratio:</strong> {analysis['word_ratio']:.3f}</p>
522
- <p><strong>Character ratio:</strong> {analysis['translation_ratio']:.3f}</p>
523
- <p><strong>Processing time:</strong> {analysis['processing_time']:.3f}s</p>
524
- </div>
525
- """
526
-
527
- return translation, status_html, metrics_html, features_html, ratios_html
528
 
529
- return translation, status_html, "", "", ""
530
-
531
- def process_batch(file_path, batch_text, direction):
532
- texts = []
533
 
534
- if file_path:
535
- try:
536
- with open(file_path, 'r', encoding='utf-8') as f:
537
- texts = [line.strip() for line in f.readlines() if line.strip()]
538
- except Exception as e:
539
- return [[f"Error reading file: {str(e)}", "", "", "", ""]]
540
- elif batch_text:
541
- texts = [line.strip() for line in batch_text.split('\n') if line.strip()]
542
 
543
- if not texts:
544
- return [["No text provided", "", "", "", ""]]
545
 
546
- results = app.batch_translate(texts, direction)
 
547
 
548
- # Format for display
549
- display_data = []
550
- for r in results:
551
- if r['success']:
552
- word_ratio = f"{r['analysis']['source']['word_count']}→{r['analysis']['target']['word_count']}"
553
- char_ratio = f"{r['analysis']['source']['character_count']}→{r['analysis']['target']['character_count']}"
554
- else:
555
- word_ratio = "Error"
556
- char_ratio = "Error"
557
-
558
- display_data.append([
559
- r['index'],
560
- r['source'][:50] + "..." if len(r['source']) > 50 else r['source'],
561
- r['translation'][:50] + "..." if len(r['translation']) > 50 else r['translation'],
562
- word_ratio,
563
- char_ratio
564
- ])
565
 
566
- return display_data
567
-
568
- def get_history():
569
- if not app.translation_history:
570
- return []
571
 
572
- return [[
573
- entry['timestamp'][:19], # Remove microseconds
574
- entry['direction'],
575
- entry['source_text'][:50] + "..." if len(entry['source_text']) > 50 else entry['source_text'],
576
- entry['translated_text'][:50] + "..." if len(entry['translated_text']) > 50 else entry['translated_text']
577
- ] for entry in app.translation_history[-20:]] # Show last 20
578
-
579
- def export_csv():
580
- csv_content = app.export_history_csv()
581
- if csv_content:
582
- filename = f"translation_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
583
- return gr.File.update(value=csv_content, visible=True, label=f"📊 {filename}")
584
- return gr.File.update(visible=False)
585
-
586
- def clear_history():
587
- app.translation_history = []
588
- return []
589
-
590
- # Wire up events
591
- input_text.change(fn=update_char_count, inputs=input_text, outputs=char_count)
592
-
593
- translate_btn.click(
594
- fn=translate_with_analysis,
595
- inputs=[input_text, direction],
596
- outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
597
- )
598
-
599
- clear_btn.click(
600
- fn=clear_all,
601
- outputs=[input_text, output_text, char_count, status_display, metrics_display, features_display, ratios_display]
602
- )
603
-
604
- batch_btn.click(
605
- fn=process_batch,
606
- inputs=[batch_input, batch_text, batch_direction],
607
- outputs=batch_results
608
- )
609
-
610
- refresh_history_btn.click(fn=get_history, outputs=history_display)
611
- export_csv_btn.click(fn=export_csv, outputs=csv_download)
612
- clear_history_btn.click(fn=clear_history, outputs=history_display)
613
 
614
- # Auto-translate on Enter
615
- input_text.submit(
616
- fn=translate_with_analysis,
617
- inputs=[input_text, direction],
618
- outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
619
- )
620
 
 
621
  if __name__ == "__main__":
 
622
  demo.launch(
 
623
  server_name="0.0.0.0",
624
  server_port=7860,
625
- share=False,
626
- debug=True
627
  )
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
+ import pandas as pd
5
  import time
 
 
 
 
 
 
 
6
  import re
7
+ from datetime import datetime
8
+ import json
9
+
10
+ # Model loading and caching
11
# Successfully loaded (en->ss, ss->en) pipeline pair; filled on first success so
# repeated calls do not reload the models.
_LOADED_TRANSLATORS = []


def _build_translation_pipeline(model_name, src_lang, tgt_lang):
    """Build one translation pipeline for ``model_name`` with fixed languages."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # The language codes are passed explicitly: the M2M100 translation path
    # needs both a source and a target language to build its inputs.
    return pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )


def load_translation_models():
    """Load both translation pipelines and remember them for later calls.

    Gradio has no ``cache_model`` decorator, so caching is done with a small
    module-level list instead of a decorator.

    Returns:
        A ``(en_ss_pipeline, ss_en_pipeline)`` tuple, or ``(None, None)`` when
        loading fails (the error is printed, not raised, so the app can still
        start and report the problem to the user).
    """
    if _LOADED_TRANSLATORS:
        return _LOADED_TRANSLATORS[0]

    try:
        # English to Siswati
        en_ss_pipeline = _build_translation_pipeline(
            "dsfsi/en-ss-m2m100-combo", "en", "ss"
        )
        # Siswati to English
        ss_en_pipeline = _build_translation_pipeline(
            "dsfsi/ss-en-m2m100-combo", "ss", "en"
        )
    except Exception as e:
        # Best-effort startup: failures are reported and signalled with Nones;
        # they are not cached, so a later call can retry.
        print(f"Error loading models: {e}")
        return None, None

    _LOADED_TRANSLATORS.append((en_ss_pipeline, ss_en_pipeline))
    return _LOADED_TRANSLATORS[0]
29
 
30
+ # Load models at startup
31
+ en_ss_translator, ss_en_translator = load_translation_models()
 
32
 
33
def analyze_siswati_features(text):
    """Count simple surface features associated with Siswati text.

    Args:
        text: The text to inspect.

    Returns:
        A dict with:
        - ``click_consonants``: number of ``c``/``q``/``x`` letters
          (case-insensitive).
        - ``tone_markers``: number of acute or grave accents, whether they
          are written as precomposed letters (``á``) or as combining marks
          (``a`` + U+0301).
        - ``potential_agglutination``: number of whitespace-separated words
          longer than 10 characters.
        - ``long_words``: up to the first five of those long words.
    """
    import unicodedata

    lowered = text.lower()

    # Click consonants (c, q, x sounds)
    click_count = len(re.findall(r'[cqx]', lowered))

    # Tone markers: decompose first so that precomposed and combining forms
    # of the same accent are counted alike.
    decomposed = unicodedata.normalize("NFD", lowered)
    tone_count = decomposed.count("\u0301") + decomposed.count("\u0300")

    # Potential agglutination (words longer than 10 characters)
    long_words = [word for word in text.split() if len(word) > 10]

    return {
        'click_consonants': click_count,
        'tone_markers': tone_count,
        'potential_agglutination': len(long_words),
        'long_words': long_words[:5],  # Show first 5 examples
    }
 
 
 
 
 
52
 
53
def calculate_linguistic_metrics(text):
    """Compute basic size and diversity statistics for ``text``.

    Words are whitespace-separated tokens (punctuation stays attached and
    case is significant); sentences are the non-blank pieces left after
    splitting on runs of ``.``, ``!`` or ``?``.

    Returns:
        An empty dict for blank text, otherwise a dict with ``char_count``,
        ``word_count``, ``sentence_count``, ``lexical_diversity`` (unique
        words / words), ``avg_word_length`` and ``unique_words``.
    """
    if text.strip() == "":
        return {}

    tokens = text.split()
    total_tokens = len(tokens)
    distinct_tokens = set(tokens)

    sentence_total = sum(
        1 for chunk in re.split(r'[.!?]+', text) if chunk.strip()
    )

    if total_tokens > 0:
        diversity = len(distinct_tokens) / total_tokens
        mean_length = sum(map(len, tokens)) / total_tokens
    else:
        diversity = 0
        mean_length = 0

    metrics = {}
    metrics['char_count'] = len(text)
    metrics['word_count'] = total_tokens
    metrics['sentence_count'] = sentence_total
    metrics['lexical_diversity'] = diversity
    metrics['avg_word_length'] = mean_length
    metrics['unique_words'] = len(distinct_tokens)
    return metrics
 
77
 
78
def translate_text(text, direction):
    """Translate ``text`` and build the accompanying linguistic analysis.

    Args:
        text: Source text; ``None``, empty and whitespace-only values are
            rejected with a prompt message instead of raising.
        direction: ``"English → Siswati"`` selects the English-to-Siswati
            model; any other value is treated as Siswati → English.

    Returns:
        A 3-tuple ``(translation_or_message, analysis_report, metrics_table)``.
        On any failure the first item carries the message and the other two
        are empty strings.
    """
    if not text or not text.strip():
        return "Please enter text to translate.", "", ""

    start_time = time.time()

    try:
        to_siswati = direction == "English → Siswati"
        translator = en_ss_translator if to_siswati else ss_en_translator
        if translator is None:
            return "Translation model not loaded. Please try again.", "", ""

        # Perform translation
        result = translator(text, max_length=512)
        translated_text = result[0]['translation_text']

        source_metrics = calculate_linguistic_metrics(text)
        target_metrics = calculate_linguistic_metrics(translated_text)
        # Siswati-specific features are always measured on the Siswati side:
        # the output when translating into Siswati, otherwise the input.
        siswati_text = translated_text if to_siswati else text
        siswati_features = analyze_siswati_features(siswati_text)

        processing_time = time.time() - start_time

        # Create linguistic analysis report
        analysis_report = create_analysis_report(
            source_metrics, target_metrics, siswati_features,
            processing_time, direction
        )

        # Create metrics table
        metrics_table = create_metrics_table(
            source_metrics, target_metrics, processing_time
        )

        return translated_text, analysis_report, metrics_table

    except Exception as e:
        # UI boundary: surface the failure in the output box rather than crash.
        return f"Translation error: {str(e)}", "", ""
 
 
 
 
 
 
 
126
 
127
def create_analysis_report(source_metrics, target_metrics, siswati_features, processing_time, direction):
    """Render the linguistic analysis as a Markdown report string.

    The report has three sections: translation details (direction, elapsed
    time, current timestamp), a source/target comparison table with a
    target-to-source ratio per metric, and the Siswati feature counts. A
    final bullet listing long-word examples is appended only when
    ``siswati_features`` has a non-empty ``long_words`` entry. Missing
    metric keys are shown as 0.
    """
    src = source_metrics.get
    tgt = target_metrics.get

    def ratio(key, floor):
        # The denominator is clamped to ``floor`` so an absent or zero
        # source value cannot cause a division by zero.
        return tgt(key, 0) / max(src(key, floor), floor)

    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    lines = [
        "",
        "## 📊 Linguistic Analysis Report",
        "",
        "### Translation Details",
        f"- **Direction**: {direction}",
        f"- **Processing Time**: {processing_time:.2f} seconds",
        f"- **Timestamp**: {stamp}",
        "",
        "### Text Complexity Metrics",
        "| Metric | Source | Target | Ratio |",
        "|--------|--------|--------|-------|",
        f"| Word Count | {src('word_count', 0)} | {tgt('word_count', 0)} | {ratio('word_count', 1):.2f} |",
        f"| Character Count | {src('char_count', 0)} | {tgt('char_count', 0)} | {ratio('char_count', 1):.2f} |",
        f"| Sentence Count | {src('sentence_count', 0)} | {tgt('sentence_count', 0)} | {ratio('sentence_count', 1):.2f} |",
        f"| Avg Word Length | {src('avg_word_length', 0):.1f} | {tgt('avg_word_length', 0):.1f} | {ratio('avg_word_length', 1):.2f} |",
        f"| Lexical Diversity | {src('lexical_diversity', 0):.3f} | {tgt('lexical_diversity', 0):.3f} | {ratio('lexical_diversity', 0.001):.2f} |",
        "",
        "### Siswati-Specific Features",
        f"- **Click Consonants**: {siswati_features.get('click_consonants', 0)} detected",
        f"- **Tone Markers**: {siswati_features.get('tone_markers', 0)} detected",
        f"- **Potential Agglutination**: {siswati_features.get('potential_agglutination', 0)} words longer than 10 characters",
        "",
    ]
    report = "\n".join(lines)

    examples = siswati_features.get('long_words')
    if examples:
        report += f"- **Long Word Examples**: {', '.join(siswati_features['long_words'])}\n"

    return report
156
 
157
def create_metrics_table(source_metrics, target_metrics, processing_time):
    """Build a three-column DataFrame comparing source and target metrics.

    Columns are ``Metric``, ``Source Text`` and ``Target Text``. Count
    metrics are kept as numbers; average word length and lexical diversity
    are pre-formatted strings (1 and 3 decimals). Missing keys show as 0.
    ``processing_time`` is accepted for call compatibility but not used.
    """
    count_keys = ('word_count', 'char_count', 'sentence_count', 'unique_words')

    def column_for(metrics):
        # One cell per row label, in the same order as the Metric column.
        cells = [metrics.get(key, 0) for key in count_keys]
        cells.append(f"{metrics.get('avg_word_length', 0):.1f}")
        cells.append(f"{metrics.get('lexical_diversity', 0):.3f}")
        return cells

    row_labels = [
        'Words',
        'Characters',
        'Sentences',
        'Unique Words',
        'Avg Word Length',
        'Lexical Diversity',
    ]

    return pd.DataFrame({
        'Metric': row_labels,
        'Source Text': column_for(source_metrics),
        'Target Text': column_for(target_metrics),
    })
180
 
181
def batch_translate(file_obj, direction):
    """Translate up to the first 10 entries of an uploaded file.

    Args:
        file_obj: The upload, given either as a filesystem path (``str`` or
            path-like) or as an object exposing the path as ``.name``.
            ``.csv`` files (any letter case) contribute the non-null values
            of their first column; any other file is read as UTF-8 text with
            one entry per non-blank line.
        direction: Passed through to ``translate_text`` for every entry.

    Returns:
        ``(summary, results)`` where ``results`` is a DataFrame with
        ``Original``, ``Translation`` and ``Index`` columns (texts longer
        than 100 characters are shortened with ``...``). When no file is
        given or processing fails, ``results`` is an empty string and
        ``summary`` carries the message.
    """
    import os

    if file_obj is None:
        return "Please upload a file.", ""

    def preview(value, limit=100):
        # Keep table cells short; mark truncation with an ellipsis.
        return value[:limit] + '...' if len(value) > limit else value

    try:
        # Accept both a plain path and a file-like object carrying ``.name``.
        if isinstance(file_obj, (str, os.PathLike)):
            path = os.fspath(file_obj)
        else:
            path = file_obj.name

        # Read file content
        if path.lower().endswith('.csv'):
            df = pd.read_csv(path)
            # Assume first column contains text to translate
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            # Plain text file
            with open(path, 'r', encoding='utf-8') as f:
                texts = [line.strip() for line in f if line.strip()]

        # Limit batch size for demo
        texts = texts[:10]  # Process first 10 entries

        results = []
        for index, text in enumerate(texts, start=1):
            translated, _, _ = translate_text(text, direction)
            results.append({
                'Original': preview(text),
                'Translation': preview(translated),
                'Index': index,
            })

        results_df = pd.DataFrame(results)
        summary = f"Processed {len(results)} texts successfully."

        return summary, results_df

    except Exception as e:
        # UI boundary: report the failure as the summary instead of raising.
        return f"Error processing file: {str(e)}", ""
217
 
218
+ # Define example texts
219
# Example inputs as [direction, text] pairs. Every direction label must match
# one of the two strings the translation code compares against exactly
# ("English → Siswati" / "Siswati → English"); a label without the arrow would
# be treated as Siswati → English.
TRANSLATION_EXAMPLES = [
    ["English → Siswati", "Hello, how are you today?"],
    ["English → Siswati", "The weather is beautiful this morning."],
    ["English → Siswati", "I am learning Siswati language."],
    ["English → Siswati", "Thank you for your help."],
    ["Siswati → English", "Sawubona, unjani namuhla?"],
    ["Siswati → English", "Siyabonga ngekusita kwakho."],
    ["Siswati → English", "Lolu luhle kakhulu."],
    ["Siswati → English", "Ngiyakuthanda."],
]
229
 
230
def create_gradio_interface():
    """Create the main Gradio interface.

    Builds a ``gr.Blocks`` app with a header, three tabs and a footer:

    * "Translation & Analysis" - single-text translation wired to
      ``translate_text``, which fills the translation box, the analysis
      report and the metrics table.
    * "Batch Processing" - file upload wired to ``batch_translate``.
    * "Research Tools" - free-text analysis wired to the nested
      ``detailed_analysis`` helper, plus background on the language.

    Returns:
        The assembled ``gr.Blocks`` object; the caller is responsible for
        launching it.
    """

    with gr.Blocks(
        title="🔬 Siswati-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .metric-table {font-size: 0.9em;}
        .feature-highlight {background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; margin: 1rem 0;}
        """
    ) as demo:

        # Header Section
        gr.HTML("""
        <div class="main-header">
        <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" width="400" alt="DSFSI Logo" style="margin-bottom: 1rem;">
        <h1>🔬 Siswati-English Linguistic Translation Tool</h1>
        <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
        Advanced AI-powered translation system with comprehensive linguistic analysis features,
        designed specifically for linguists, researchers, and language documentation projects.
        </p>
        </div>
        """)

        # Main Content Tabs
        with gr.Tabs():

            # Single Translation Tab
            with gr.Tab("🌐 Translation & Analysis"):
                gr.Markdown("""
                ### Real-time Translation with Linguistic Analysis
                Translate between English and Siswati while getting detailed linguistic insights including morphological complexity, lexical diversity, and Siswati-specific features.
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        direction = gr.Dropdown(
                            choices=["English → Siswati", "Siswati → English"],
                            label="Translation Direction",
                            value="English → Siswati"
                        )

                        input_text = gr.Textbox(
                            label="Input Text",
                            placeholder="Enter text to translate...",
                            lines=4
                        )

                        translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")

                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="Translation",
                            lines=4,
                            interactive=False
                        )

                        # Quick metrics display
                        # NOTE(review): no event handler in this function
                        # writes to this box - confirm whether it is still
                        # needed.
                        with gr.Row():
                            processing_info = gr.Textbox(
                                label="Processing Info",
                                lines=1,
                                interactive=False
                            )

                # Examples Section
                # Example rows are [direction, text], matching `inputs` below.
                gr.Markdown("### 📚 Example Translations")
                gr.Examples(
                    examples=TRANSLATION_EXAMPLES,
                    inputs=[direction, input_text],
                    label="Click an example to try it:"
                )

                # Analysis Results
                with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
                    analysis_output = gr.Markdown(label="Analysis Report")

                with gr.Accordion("📈 Metrics Table", open=False):
                    metrics_table = gr.Dataframe(
                        label="Comparative Metrics",
                        headers=["Metric", "Source Text", "Target Text"],
                        interactive=False
                    )

                # Connect translation function
                translate_btn.click(
                    fn=translate_text,
                    inputs=[input_text, direction],
                    outputs=[output_text, analysis_output, metrics_table]
                )

            # Batch Processing Tab
            with gr.Tab("📁 Batch Processing"):
                gr.Markdown("""
                ### Corpus Analysis & Batch Translation
                Upload text files or CSV files for batch translation and corpus analysis. Perfect for linguistic research and documentation projects.
                """)

                with gr.Row():
                    with gr.Column():
                        batch_direction = gr.Dropdown(
                            choices=["English → Siswati", "Siswati → English"],
                            label="Translation Direction",
                            value="English → Siswati"
                        )

                        file_upload = gr.File(
                            label="Upload File",
                            file_types=[".txt", ".csv"],
                            type="filepath"
                        )

                        batch_btn = gr.Button("🔄 Process Batch", variant="primary")

                        gr.Markdown("""
                        **Supported formats:**
                        - `.txt` files: One text per line
                        - `.csv` files: Text in first column
                        - **Limit**: First 10 entries for demo
                        """)

                    with gr.Column():
                        batch_summary = gr.Textbox(
                            label="Processing Summary",
                            lines=3,
                            interactive=False
                        )

                        batch_results = gr.Dataframe(
                            label="Translation Results",
                            interactive=False,
                            wrap=True
                        )

                batch_btn.click(
                    fn=batch_translate,
                    inputs=[file_upload, batch_direction],
                    outputs=[batch_summary, batch_results]
                )

            # Research Tools Tab
            with gr.Tab("🔬 Research Tools"):
                gr.Markdown("""
                ### Advanced Linguistic Analysis Tools
                Explore detailed linguistic features and export research data.
                """)

                with gr.Row():
                    with gr.Column():
                        research_text = gr.Textbox(
                            label="Text for Analysis",
                            lines=6,
                            placeholder="Enter Siswati or English text for detailed analysis..."
                        )

                        analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")

                    with gr.Column():
                        research_output = gr.JSON(
                            label="Detailed Analysis Results"
                        )

                def detailed_analysis(text):
                    """Perform detailed linguistic analysis.

                    Returns an empty dict for missing or whitespace-only
                    input; otherwise a dict holding the results of
                    ``calculate_linguistic_metrics`` and
                    ``analyze_siswati_features``, a preview of at most 100
                    characters, and an ISO-format timestamp.
                    """
                    # Guard against None as well as blank strings so the
                    # handler never fails on `.strip()`.
                    if not text or not text.strip():
                        return {}

                    metrics = calculate_linguistic_metrics(text)
                    siswati_features = analyze_siswati_features(text)

                    return {
                        "basic_metrics": metrics,
                        "siswati_features": siswati_features,
                        "text_preview": text[:100] + "..." if len(text) > 100 else text,
                        "analysis_timestamp": datetime.now().isoformat()
                    }

                analyze_btn.click(
                    fn=detailed_analysis,
                    inputs=research_text,
                    outputs=research_output
                )

                # Language Information
                gr.Markdown("""
                ### 🗣️ About Siswati Language

                **Siswati** (also known as **Swati** or **Swazi**) is a Bantu language spoken by approximately 2.3 million people, primarily in:
                - 🇸🇿 **Eswatini** (Kingdom of Eswatini) - Official language
                - 🇿🇦 **South Africa** - One of 11 official languages

                **Key Linguistic Features:**
                - **Language Family**: Niger-Congo → Bantu → Southeast Bantu
                - **Script**: Latin alphabet
                - **Characteristics**: Agglutinative morphology, click consonants, tonal
                - **ISO Code**: ss (ISO 639-1), ssw (ISO 639-3)
                """)

        # Footer Section
        gr.Markdown("""
        ---
        ### 📚 Model Information & Citation

        **Models Used:**
        - **English → Siswati**: [`dsfsi/en-ss-m2m100-combo`](https://huggingface.co/dsfsi/en-ss-m2m100-combo)
        - **Siswati → English**: [`dsfsi/ss-en-m2m100-combo`](https://huggingface.co/dsfsi/ss-en-m2m100-combo)

        Both models are based on Meta's M2M100 architecture, fine-tuned specifically for Siswati-English translation pairs by the **Data Science for Social Impact Research Group**.

        **Training Data**: Models trained on the Vuk'uzenzele and ZA-gov-multilingual South African corpora.

        ### 🙏 Acknowledgments
        We thank **Thapelo Sindanie** and **Unarine Netshifhefhe** for their contributions to this work.

        ### 📖 Citation
        ```bibtex
        @inproceedings{lastrucci2023preparing,
          title={Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora},
          author={Lastrucci, Richard and Rajab, Jenalea and Shingange, Matimba and Njini, Daniel and Marivate, Vukosi},
          booktitle={Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)},
          pages={18--25},
          year={2023}
        }
        ```

        **Links**:
        - [DSFSI](https://www.dsfsi.co.za/)
        - [En→Ss Model](https://huggingface.co/dsfsi/en-ss-m2m100-combo) | [Ss→En Model](https://huggingface.co/dsfsi/ss-en-m2m100-combo)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp) | [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)

        ---
        **Built with ❤️ for the African NLP community**
        """)

    return demo
 
 
 
 
 
468
 
469
# Script entry point: build the UI, then serve it.
if __name__ == "__main__":
    # Launch settings gathered in one place: request a public share link,
    # bind to all network interfaces on port 7860, and show errors in the UI.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }

    demo = create_gradio_interface()
    demo.launch(**launch_options)