Mgolo committed on
Commit
488ebab
·
verified ·
1 Parent(s): 366e052

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +178 -0
  2. localenpl5.jpeg +0 -0
  3. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
3
+ import torch
4
+ import unicodedata
5
+ import re
6
+ import whisper
7
+ import tempfile
8
+ import os
9
+
10
+ import nltk
11
+ nltk.download('punkt')
12
+ from nltk.tokenize import sent_tokenize
13
+
14
+ import fitz # PyMuPDF
15
+ import docx
16
+ from bs4 import BeautifulSoup
17
+ import markdown2
18
+ import chardet
19
+
20
# Device setup: use CUDA when torch reports it as available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Module-level caches for the Wolof translation pipeline and the Whisper
# model. Both start out empty; load_wolof_model() and load_whisper_model()
# fill them in on first use.
translator = whisper_model = None
26
+
27
def load_wolof_model():
    """Return the shared English-to-Wolof translation pipeline.

    On the first call the ``LocaleNLP/eng_wolof`` checkpoint is loaded, moved
    to ``device`` and wrapped in a ``"translation"`` pipeline, which is stored
    in the module-level ``translator`` global. Later calls return that same
    object without loading anything again.
    """
    global translator
    if translator is not None:
        return translator

    checkpoint = "LocaleNLP/eng_wolof"
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    # The pipeline is given a device index: 0 when running on CUDA, -1 otherwise.
    pipeline_device = 0 if device.type == 'cuda' else -1
    translator = pipeline(
        "translation",
        model=seq2seq_model,
        tokenizer=marian_tokenizer,
        device=pipeline_device,
    )
    return translator
35
+
36
def load_whisper_model():
    """Return the shared Whisper model, loading the "base" model on first use.

    The loaded model is kept in the module-level ``whisper_model`` global so
    that subsequent calls reuse it.
    """
    global whisper_model
    if whisper_model is not None:
        return whisper_model
    whisper_model = whisper.load_model("base")
    return whisper_model
41
+
42
def transcribe_audio(audio_file):
    """Transcribe an audio recording to text with the Whisper model.

    Args:
        audio_file: Either a filesystem path (``str``) or a binary file-like
            object exposing ``read()``.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    model = load_whisper_model()
    if isinstance(audio_file, str):
        return model.transcribe(audio_file)["text"]

    # The model is handed a path, so a stream is first copied into a
    # temporary ".wav" file.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with tmp:
            tmp.write(audio_file.read())
        return model.transcribe(tmp.name)["text"]
    finally:
        # Always remove the temporary copy, including when reading the
        # stream or transcribing raises; otherwise the file would be leaked.
        os.remove(tmp.name)
54
+
55
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded document.

    Supported extensions are ``pdf``, ``docx``, ``html``/``htm``, ``md``,
    ``srt`` and ``txt``/``text``; the extension is taken from the text after
    the last ``.`` in the file name, case-insensitively.

    Args:
        uploaded_file: Either a filesystem path (``str``) or a binary
            file-like object exposing ``name`` and ``read()``.

    Returns:
        The extracted text as a ``str``.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    import io  # Local import: only this function needs it.

    supported_types = ("pdf", "docx", "html", "htm", "md", "srt", "txt", "text")

    # Accept both a filepath (str) and a file-like object.
    if isinstance(uploaded_file, str):
        file_name = uploaded_file
        with open(uploaded_file, "rb") as f:
            content = f.read()
    else:
        file_name = uploaded_file.name
        content = uploaded_file.read()
    file_type = file_name.split('.')[-1].lower()

    # Reject unknown types before spending any work on parsing or decoding.
    if file_type not in supported_types:
        raise ValueError("Unsupported file type")

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)

    if file_type == "docx":
        # Parse from the bytes already in memory: a file-like upload has been
        # consumed by read() above, so handing the same stream to the parser
        # again would give it nothing to read.
        document = docx.Document(io.BytesIO(content))
        return "\n".join(para.text for para in document.paragraphs)

    # Remaining types are text based. Fall back to UTF-8 when detection
    # yields no encoding (e.g. empty input) or names a codec Python does not
    # know, so that the branches below always work on str, never bytes.
    encoding = chardet.detect(content)['encoding'] or "utf-8"
    try:
        text = content.decode(encoding, errors='ignore')
    except LookupError:
        text = content.decode("utf-8", errors='ignore')

    if file_type in ("html", "htm"):
        return BeautifulSoup(text, "html.parser").get_text()
    if file_type == "md":
        rendered = markdown2.markdown(text)
        return BeautifulSoup(rendered, "html.parser").get_text()
    if file_type == "srt":
        # Drop each cue's index line and timestamp line; "\r?" also covers
        # subtitle files saved with CRLF line endings.
        return re.sub(r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\r?\n", "", text)
    # "txt" / "text": the decoded content is the result.
    return text
92
+
93
def translate(text):
    """Translate English text to Wolof, one paragraph at a time.

    The text is split into paragraphs on ``"\n"`` and each paragraph into
    sentences on ``". "``. Every sentence is prefixed with the ``>>wol<<``
    tag, translated in one pipeline call per paragraph, and the translations
    are re-joined with ``". "``. Blank paragraphs are kept as empty lines.

    Args:
        text: The English source text.

    Returns:
        The translated text, with paragraphs joined by ``"\n"``.
    """
    pipe = load_wolof_model()
    lang_tag = ">>wol<<"
    # NOTE(review): max_length=5000 is kept from the original settings;
    # confirm it suits the model's own length limits.
    generation_options = dict(
        max_length=5000,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=1.5,
        length_penalty=1.2,
    )

    translated_output = []
    with torch.no_grad():
        for para in text.split("\n"):
            sentences = [s.strip() for s in para.split('. ') if s.strip()]
            if not sentences:
                # Covers blank paragraphs and ones made only of separators
                # (e.g. ". "), so the pipeline never gets an empty batch.
                translated_output.append("")
                continue

            formatted = [f"{lang_tag} {s}" for s in sentences]
            results = pipe(formatted, **generation_options)

            # Upper-case only the first character. str.capitalize() would
            # also lower-case the rest of the sentence, which destroys the
            # casing of proper nouns and acronyms in the translation.
            translated_sentences = [
                r['translation_text'][:1].upper() + r['translation_text'][1:]
                for r in results
            ]
            translated_output.append('. '.join(translated_sentences))

    return "\n".join(translated_output)
119
+
120
def process_input(input_mode, text, audio_file, file_obj):
    """Produce the source text for the selected input mode.

    Args:
        input_mode: One of ``"Text"``, ``"Audio"`` or ``"File"``.
        text: Text typed by the user; returned as-is in ``"Text"`` mode.
        audio_file: Audio input, transcribed in ``"Audio"`` mode when not None.
        file_obj: Uploaded document, parsed in ``"File"`` mode when not None.

    Returns:
        The resulting text, or ``""`` when the mode is not recognised or the
        input for the selected mode is missing.
    """
    if input_mode == "Text":
        return text
    if input_mode == "Audio" and audio_file is not None:
        return transcribe_audio(audio_file)
    if input_mode == "File" and file_obj is not None:
        return extract_text_from_file(file_obj)
    return ""
131
+
132
def translate_and_return(text):
    """Translate ``text``, or return a notice when it is only whitespace.

    Returns:
        The translation of ``text``, or the string
        ``"No input text to translate."`` when ``text.strip()`` is empty.
    """
    has_content = bool(text.strip())
    return translate(text) if has_content else "No input text to translate."
136
+
137
+ # Gradio UI components
138
with gr.Blocks() as demo:
    gr.Markdown("## LocaleNLP English-to-Wolof Translator")
    gr.Markdown("Upload English text, audio, or document to translate to Wolof using a custom MarianMT model.")

    with gr.Row():
        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")

    # One input widget per mode; only the one matching the selected mode is shown.
    input_text = gr.Textbox(label="Enter English text", lines=10, visible=True)
    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)

    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
    translate_button = gr.Button("Translate to Wolof")
    output_text = gr.Textbox(label="Translated Wolof Text", lines=10, interactive=False)

    def update_visibility(mode):
        """Show the input widget for ``mode`` and clear both result boxes."""
        return {
            input_text: gr.update(visible=(mode == "Text")),
            audio_input: gr.update(visible=(mode == "Audio")),
            file_input: gr.update(visible=(mode == "File")),
            extracted_text: gr.update(value="", visible=True),
            output_text: gr.update(value=""),
        }

    input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])

    def handle_translate_click(mode, text, audio, file_obj):
        """Extract the source text for ``mode`` and translate it.

        Returns a ``(extracted_text, translated_text)`` pair. When extraction
        fails, the first item is empty and the second carries the error
        message.
        """
        try:
            extracted = process_input(mode, text, audio, file_obj)
        except Exception as e:
            # UI boundary: show the failure in the output box rather than
            # letting it escape the handler.
            return "", f"Error: {str(e)}"
        return extracted, translate_and_return(extracted)

    # A single listener performs both steps in order. Registering two
    # separate click listeners made the translation step read the
    # "Extracted / Transcribed Text" box before the extraction step had
    # filled it, so it translated stale or empty text.
    translate_button.click(
        fn=handle_translate_click,
        inputs=[input_mode, input_text, audio_input, file_input],
        outputs=[extracted_text, output_text],
    )

demo.launch()
localenpl5.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
4
+ transformers
5
+ torch
6
+ openai-whisper
7
+ nltk
8
+ PyMuPDF
9
+ python-docx
10
+ beautifulsoup4
11
+ markdown2
12
+ chardet
13
+ sentencepiece
14
+ sacremoses