# Hugging Face Space: CLEAR-Global / MarmaSpeak-TTS
# Space status: Sleeping
# File size: 6,156 bytes

import gradio as gr
import torch
from transformers import VitsTokenizer, VitsModel, set_seed
import tempfile
from scipy.io.wavfile import write
import numpy as np

# Predefined Marma sentences.
# These populate the "Sample Sentences" dropdown and are looked up by list
# index in use_sample() and update_input_text(), so the order of entries
# determines which sentence each dropdown position maps to.
marma_sentences = [
    "အဒေါ်  မျက်နှာ",
    "ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
    "ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
    "အကျွန်ဧ အသက် ကို ဟြင်‌အောင်ပျင်ရေ။",
    "အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
    "မွတ်ကေ နာကယ် ငိုရေ။",
    "မိခင်(အဒေါ်)၏  အသန် တစ်ချက် ကြားကေ။",
    "အသက် မာမြာ့ ကျာလာရေ။",
    "အဒေါ်  အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ  သင်ပီးရေ။",
    "မသိ သကြား တစ်သက် ပတ်လုံး",
    "ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
    "ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
    "အဒေါ်  ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
    "အဝေး တခေါက် တစ်ခါ လားကေ",
    "အဒေါ်  လန်းကာ့ ကြည့်နီရေ။",
    "ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
    "အဒေါ်  ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
    "အဒေါ်  မျက်နှာကို တချက် မြင်ကေ",
    "ဒုတ်ခကိုလဲ မိလားရေ။"
]

# Loaded (tokenizer, model) pairs keyed by model name, so the checkpoint is
# loaded once per process instead of on every synthesis request.
_MODEL_CACHE = {}

def _load_model(model_name):
    """Return the (tokenizer, model) pair for *model_name*, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        print("Loading model...")
        tokenizer = VitsTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name)
        # Only cache after both loads succeed so a failed load is retried later.
        _MODEL_CACHE[model_name] = (tokenizer, model)
    return _MODEL_CACHE[model_name]

def tts(text):
    """
    Synthesize the given text

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected, as is text longer than 2000 characters.

    Returns:
        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
        path of a temporary ``.wav`` file and ``message`` is the input text.
        On rejection or failure ``audio_path`` is ``None`` and ``message``
        describes the problem.
    """
    # Guard against None as well as blank strings before calling .strip().
    if not text or not text.strip():
        return None, "Please enter text or select a sample sentence"

    if len(text) > 2000:
        return None, f"Text is too long ({len(text)} characters). Please keep it under 2000 characters."

    try:
        # Load the model and tokenizer (cached after the first call)
        model_name = "CLEAR-Global/marmaspeak-tts-v1"
        tokenizer, model = _load_model(model_name)

        print("Model loaded. Processing text...")

        # Preprocess the input text
        inputs = tokenizer(text=text, return_tensors="pt")

        # Make the speech synthesis deterministic
        set_seed(555)

        # Generate the audio waveform
        print("Generating audio...")
        with torch.no_grad():
            outputs = model(**inputs)

        waveform = outputs.waveform[0]
        sample_rate = model.config.sampling_rate

        # Reserve a temporary file name; delete=False keeps the file on disk
        # after the handle closes so the caller can read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            waveform_file = f.name

        # Write only after the handle is closed: reopening a still-open
        # NamedTemporaryFile by name fails on Windows.
        write(waveform_file, sample_rate, waveform.numpy())

        print("Audio generation complete.")
        return waveform_file, text

    except Exception as e:
        # UI boundary: report the failure in the text box instead of crashing.
        print(f"Error in TTS: {str(e)}")
        return None, f"Error synthesizing text: {str(e)}"

def use_sample(sample_idx):
    """Synthesize the sample sentence at position ``sample_idx``.

    Returns whatever ``tts`` returns for the chosen sentence, or
    ``(None, <message>)`` when the index is missing or out of range.
    """
    invalid = (
        sample_idx is None
        or sample_idx < 0
        or sample_idx >= len(marma_sentences)
    )
    if invalid:
        return None, "Please select a valid sample"
    return tts(marma_sentences[sample_idx])

def update_input_text(sample_idx):
    """Return the sample sentence for ``sample_idx`` to fill the input textbox.

    An empty string is returned when no sample is selected or the index
    falls outside ``marma_sentences``.
    """
    if sample_idx is None:
        return ""
    if not 0 <= sample_idx < len(marma_sentences):
        return ""
    return marma_sentences[sample_idx]

def clear_outputs():
    """Return blank values for the two output components: no audio, empty text."""
    cleared_audio = None
    cleared_text = ""
    return cleared_audio, cleared_text

# Create Gradio interface: a text box with synthesize/clear buttons and the
# outputs on the left, and a dropdown of sample sentences on the right.
with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
    gr.Markdown("""
    This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis 
    for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people 
    in Bangladesh and Myanmar.
    
    You can enter custom Marma text or select from the sample sentences.
    
    *Note: Model will load when you submit text. This may take a minute on first run.*
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Marma Text", 
                placeholder="Enter Marma text here...", 
                lines=3
            )
            
            with gr.Row():
                submit_btn = gr.Button("Synthesize", variant="primary")
                clear_btn = gr.Button("Clear")
            
            audio_output = gr.Audio(label="Generated Speech")
            text_display = gr.Textbox(label="Text Being Synthesized", interactive=False)
            
        with gr.Column(scale=1):
            gr.Markdown("### Sample Sentences")
            sample_dropdown = gr.Dropdown(
                # Numbered preview labels. The ellipsis is added only when the
                # sentence is actually cut at 30 characters, so short
                # sentences are not shown as if they were truncated.
                choices=[
                    f"{i}. {sent[:30]}..." if len(sent) > 30 else f"{i}. {sent}"
                    for i, sent in enumerate(marma_sentences, start=1)
                ],
                label="Select a sample sentence",
                # type="index" passes the selected position (not the label)
                # to the handlers, which index into marma_sentences.
                type="index"
            )
            use_sample_btn = gr.Button("Use Selected Sample")

    # Set up event handlers
    # Clicking "Synthesize" and pressing Enter in the textbox both run tts().
    submit_btn.click(
        fn=tts, 
        inputs=text_input, 
        outputs=[audio_output, text_display]
    )
    
    text_input.submit(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )
    
    # Synthesize the sentence currently selected in the dropdown.
    use_sample_btn.click(
        fn=use_sample,
        inputs=sample_dropdown,
        outputs=[audio_output, text_display]
    )
    
    # Reset the audio player and the synthesized-text box.
    clear_btn.click(
        fn=clear_outputs,
        inputs=None,
        outputs=[audio_output, text_display]
    )
    
    # Copy the selected sample into the input textbox when the selection changes.
    sample_dropdown.change(
        fn=update_input_text,
        inputs=sample_dropdown,
        outputs=text_input
    )

# Launch the app: runs at module level, so the Gradio interface defined as
# `demo` starts whenever this file is executed.
demo.launch()