Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from transformers import VitsTokenizer, VitsModel, set_seed | |
import tempfile | |
from scipy.io.wavfile import write | |
import numpy as np | |
# Sample Marma sentences, kept in a list so they can be looked up by position.
marma_sentences = [
    "အဒေါ် မျက်နှာ",
    "ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
    "ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
    "အကျွန်ဧ အသက် ကို ဟြင်အောင်ပျင်ရေ။",
    "အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
    "မွတ်ကေ နာကယ် ငိုရေ။",
    "မိခင်(အဒေါ်)၏ အသန် တစ်ချက် ကြားကေ။",
    "အသက် မာမြာ့ ကျာလာရေ။",
    "အဒေါ် အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ သင်ပီးရေ။",
    "မသိ သကြား တစ်သက် ပတ်လုံး",
    "ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
    "ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
    "အဒေါ် ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
    "အဝေး တခေါက် တစ်ခါ လားကေ",
    "အဒေါ် လန်းကာ့ ကြည့်နီရေ။",
    "ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
    "အဒေါ် ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
    "အဒေါ် မျက်နှာကို တချက် မြင်ကေ",
    "ဒုတ်ခကိုလဲ မိလားရေ။",
]
# Hugging Face Hub id of the checkpoint used for synthesis.
_MODEL_NAME = "CLEAR-Global/marmaspeak-tts-v1"
# Inputs longer than this many characters are rejected before synthesis.
_MAX_TEXT_CHARS = 2000
# Tokenizer/model pair kept between requests so the checkpoint is only
# loaded on the first synthesis instead of on every call.
_loaded_model = {}


def _get_tokenizer_and_model():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
    if not _loaded_model:
        print("Loading model...")
        tokenizer = VitsTokenizer.from_pretrained(_MODEL_NAME)
        model = VitsModel.from_pretrained(_MODEL_NAME)
        # Store both together so a failed load never leaves a half-filled cache.
        _loaded_model.update(tokenizer=tokenizer, model=model)
        print("Model loaded. Processing text...")
    return _loaded_model["tokenizer"], _loaded_model["model"]


def tts(text):
    """
    Synthesize the given text.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected, as is text longer than 2000 characters.

    Returns:
        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
        path of a temporary ``.wav`` file and ``message`` is the input text.
        On rejection or failure ``audio_path`` is ``None`` and ``message``
        explains what went wrong; errors are reported, never raised.
    """
    if not text or not text.strip():
        return None, "Please enter text or select a sample sentence"
    if len(text) > _MAX_TEXT_CHARS:
        return None, (
            f"Text is too long ({len(text)} characters). "
            f"Please keep it under {_MAX_TEXT_CHARS} characters."
        )
    try:
        tokenizer, model = _get_tokenizer_and_model()
        # Preprocess the input text
        inputs = tokenizer(text=text, return_tensors="pt")
        # Make the speech synthesis deterministic
        set_seed(555)
        # Generate the audio waveform
        print("Generating audio...")
        with torch.no_grad():
            outputs = model(**inputs)
        waveform = outputs.waveform[0]
        sample_rate = model.config.sampling_rate
        # Reserve a temporary path, then close the handle before writing:
        # an open NamedTemporaryFile cannot be reopened by name on every
        # platform. delete=False keeps the file for the caller to read.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            waveform_file = f.name
        write(waveform_file, sample_rate, waveform.numpy())
        print("Audio generation complete.")
        return waveform_file, text
    except Exception as e:
        # UI boundary: report the failure as a message instead of raising.
        print(f"Error in TTS: {str(e)}")
        return None, f"Error synthesizing text: {str(e)}"
def use_sample(sample_idx):
    """Synthesize the sample sentence at ``sample_idx``.

    Returns the result of ``tts`` for that sentence, or
    ``(None, "Please select a valid sample")`` when the index is ``None``
    or outside the sample list.
    """
    if sample_idx is None or not 0 <= sample_idx < len(marma_sentences):
        return None, "Please select a valid sample"
    return tts(marma_sentences[sample_idx])
def update_input_text(sample_idx):
    """Return the sample sentence at ``sample_idx``, or ``""`` if it is invalid."""
    in_range = sample_idx is not None and 0 <= sample_idx < len(marma_sentences)
    return marma_sentences[sample_idx] if in_range else ""
def clear_outputs():
    """Return the blank output pair: no audio (``None``) and empty text."""
    no_audio, no_text = None, ""
    return no_audio, no_text
# Introductory text rendered under the page title.
_INTRO_MARKDOWN = """
This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis
for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people
in Bangladesh and Myanmar.
You can enter custom Marma text or select from the sample sentences.
*Note: Model will load when you submit text. This may take a minute on first run.*
"""


def _sample_label(position, sentence, max_chars=30):
    """Return the numbered dropdown label for a sample sentence.

    The sentence is cut to ``max_chars`` characters, and the trailing "..."
    is added only when something was actually cut off.
    """
    if len(sentence) > max_chars:
        preview = f"{sentence[:max_chars]}..."
    else:
        preview = sentence
    return f"{position}. {preview}"


# Create Gradio interface
with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
    gr.Markdown(_INTRO_MARKDOWN)
    with gr.Row():
        # Left column: free-text input, action buttons and the outputs.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Marma Text",
                placeholder="Enter Marma text here...",
                lines=3,
            )
            with gr.Row():
                submit_btn = gr.Button("Synthesize", variant="primary")
                clear_btn = gr.Button("Clear")
            audio_output = gr.Audio(label="Generated Speech")
            text_display = gr.Textbox(
                label="Text Being Synthesized", interactive=False
            )
        # Right column: picker for the predefined sample sentences.
        with gr.Column(scale=1):
            gr.Markdown("### Sample Sentences")
            sample_dropdown = gr.Dropdown(
                choices=[
                    _sample_label(i + 1, sent)
                    for i, sent in enumerate(marma_sentences)
                ],
                label="Select a sample sentence",
                # Handlers receive the selected position, not the label text.
                type="index",
            )
            use_sample_btn = gr.Button("Use Selected Sample")

    # Set up event handlers
    # The Synthesize button and pressing Enter in the textbox do the same thing.
    for synthesize_trigger in (submit_btn.click, text_input.submit):
        synthesize_trigger(
            fn=tts,
            inputs=text_input,
            outputs=[audio_output, text_display],
        )
    use_sample_btn.click(
        fn=use_sample,
        inputs=sample_dropdown,
        outputs=[audio_output, text_display],
    )
    clear_btn.click(
        fn=clear_outputs,
        inputs=None,
        outputs=[audio_output, text_display],
    )
    # Selecting a sample copies its full text into the input box.
    sample_dropdown.change(
        fn=update_input_text,
        inputs=sample_dropdown,
        outputs=text_input,
    )

# Launch the app
demo.launch()