import tempfile
from functools import lru_cache
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write
from transformers import VitsTokenizer, VitsModel, set_seed

# Hugging Face Hub identifier of the Marma VITS checkpoint.
MODEL_NAME = "CLEAR-Global/marmaspeak-tts-v1"

# Upper bound on the input length accepted by tts().
MAX_TEXT_LENGTH = 2000

# Number of characters of each sample sentence shown in the dropdown.
SAMPLE_PREVIEW_LENGTH = 30

# Predefined Marma sentences
marma_sentences = [
    "အဒေါ် မျက်နှာ",
    "ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
    "ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
    "အကျွန်ဧ အသက် ကို ဟြင်‌အောင်ပျင်ရေ။",
    "အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
    "မွတ်ကေ နာကယ် ငိုရေ။",
    "မိခင်(အဒေါ်)၏ အသန် တစ်ချက် ကြားကေ။",
    "အသက် မာမြာ့ ကျာလာရေ။",
    "အဒေါ် အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ သင်ပီးရေ။",
    "မသိ သကြား တစ်သက် ပတ်လုံး",
    "ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
    "ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
    "အဒေါ် ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
    "အဝေး တခေါက် တစ်ခါ လားကေ",
    "အဒေါ် လန်းကာ့ ကြည့်နီရေ။",
    "ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
    "အဒေါ် ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
    "အဒေါ် မျက်နှာကို တချက် မြင်ကေ",
    "ဒုတ်ခကိုလဲ မိလားရေ။",
]


@lru_cache(maxsize=1)
def _load_model():
    """Load the tokenizer and model once and reuse them for later requests.

    lru_cache does not store a result when the call raises, so a failed
    load is retried on the next request.

    Returns:
        A ``(tokenizer, model)`` tuple for ``MODEL_NAME``.
    """
    print("Loading model...")
    tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
    model = VitsModel.from_pretrained(MODEL_NAME)
    return tokenizer, model


def _is_valid_sample(sample_idx: Optional[int]) -> bool:
    """Return True when *sample_idx* is an index into ``marma_sentences``."""
    return sample_idx is not None and 0 <= sample_idx < len(marma_sentences)


def _sample_label(index: int, sentence: str) -> str:
    """Build the numbered dropdown label for one sample sentence.

    The ellipsis is appended only when the sentence was actually cut.
    """
    preview = sentence[:SAMPLE_PREVIEW_LENGTH]
    suffix = "..." if len(sentence) > SAMPLE_PREVIEW_LENGTH else ""
    return f"{index + 1}. {preview}{suffix}"


def tts(text: Optional[str]) -> Tuple[Optional[str], str]:
    """Synthesize the given text.

    Args:
        text: Text to synthesize.

    Returns:
        ``(wav_path, text)`` on success, where ``wav_path`` is a temporary
        WAV file holding the generated audio. On invalid input or failure,
        ``(None, message)`` where ``message`` describes the problem.
    """
    # None is treated like empty input instead of raising AttributeError.
    if text is None or not text.strip():
        return None, "Please enter text or select a sample sentence"

    if len(text) > MAX_TEXT_LENGTH:
        return None, f"Text is too long ({len(text)} characters). Please keep it under 2000 characters."

    try:
        # Cached after the first successful call; see _load_model().
        tokenizer, model = _load_model()
        print("Model loaded. Processing text...")

        # Preprocess the input text
        inputs = tokenizer(text=text, return_tensors="pt")

        # Make the speech synthesis deterministic
        set_seed(555)

        # Generate the audio waveform
        print("Generating audio...")
        with torch.no_grad():
            outputs = model(**inputs)

        waveform = outputs.waveform[0]
        sample_rate = model.config.sampling_rate

        # Reserve a temporary path, then write after the handle is closed:
        # the writer opens the file by name, which fails on Windows while
        # NamedTemporaryFile still holds it open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            waveform_file = f.name
        write(waveform_file, sample_rate, waveform.numpy())

        print("Audio generation complete.")
        return waveform_file, text

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        print(f"Error in TTS: {e}")
        return None, f"Error synthesizing text: {e}"


def use_sample(sample_idx: Optional[int]) -> Tuple[Optional[str], str]:
    """Handle sample selection.

    Args:
        sample_idx: Index into ``marma_sentences``, or None when nothing
            is selected.

    Returns:
        The result of ``tts`` for the selected sentence, or
        ``(None, message)`` when the index is missing or out of range.
    """
    if _is_valid_sample(sample_idx):
        return tts(marma_sentences[sample_idx])
    return None, "Please select a valid sample"


def update_input_text(sample_idx: Optional[int]) -> str:
    """Update input textbox with selected sample.

    Returns:
        The selected sentence, or an empty string when the index is
        missing or out of range.
    """
    if _is_valid_sample(sample_idx):
        return marma_sentences[sample_idx]
    return ""


def clear_outputs() -> Tuple[None, str]:
    """Clear outputs."""
    return None, ""


# Create Gradio interface
with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
    gr.Markdown("""
    This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people in Bangladesh and Myanmar.

    You can enter custom Marma text or select from the sample sentences.

    *Note: Model will load when you submit text. This may take a minute on first run.*
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Marma Text",
                placeholder="Enter Marma text here...",
                lines=3
            )

            with gr.Row():
                submit_btn = gr.Button("Synthesize", variant="primary")
                clear_btn = gr.Button("Clear")

            audio_output = gr.Audio(label="Generated Speech")
            text_display = gr.Textbox(label="Text Being Synthesized", interactive=False)

        with gr.Column(scale=1):
            gr.Markdown("### Sample Sentences")
            # type="index" makes the handlers receive the selected position
            # rather than the label text.
            sample_dropdown = gr.Dropdown(
                choices=[_sample_label(i, sent) for i, sent in enumerate(marma_sentences)],
                label="Select a sample sentence",
                type="index"
            )
            use_sample_btn = gr.Button("Use Selected Sample")

    # Set up event handlers
    submit_btn.click(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )

    text_input.submit(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )

    use_sample_btn.click(
        fn=use_sample,
        inputs=sample_dropdown,
        outputs=[audio_output, text_display]
    )

    clear_btn.click(
        fn=clear_outputs,
        inputs=None,
        outputs=[audio_output, text_display]
    )

    sample_dropdown.change(
        fn=update_input_text,
        inputs=sample_dropdown,
        outputs=text_input
    )

# Launch the app
demo.launch()