File size: 6,156 Bytes
a106764
 
f33dae4
 
 
 
a106764
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ea0688
a106764
4ea0688
a106764
 
 
f33dae4
 
 
a106764
 
f33dae4
 
 
 
 
 
a106764
f33dae4
a106764
f33dae4
 
a106764
f33dae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a106764
 
f33dae4
a106764
 
4ea0688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a106764
f33dae4
a106764
 
 
 
 
 
 
f33dae4
 
a106764
 
 
 
 
 
 
 
 
 
 
 
 
 
f33dae4
a106764
 
 
 
 
 
 
 
 
 
 
 
 
f33dae4
4ea0688
a106764
 
 
f33dae4
 
4ea0688
f33dae4
 
 
a106764
4ea0688
 
a106764
 
 
 
4ea0688
a106764
 
 
 
 
4ea0688
a106764
 
 
 
f33dae4
a106764
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import functools
import tempfile

import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write
from transformers import VitsModel, VitsTokenizer, set_seed

# Predefined Marma sentences.
# These populate the "Sample Sentences" dropdown in the UI below; the dropdown
# reports the selected position, which use_sample() and update_input_text()
# use as an index into this list. Keep the order stable, since the dropdown
# labels are numbered from it.
marma_sentences = [
    "အဒေါ်  မျက်နှာ",
    "ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
    "ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
    "အကျွန်ဧ အသက် ကို ဟြင်‌အောင်ပျင်ရေ။",
    "အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
    "မွတ်ကေ နာကယ် ငိုရေ။",
    "မိခင်(အဒေါ်)၏  အသန် တစ်ချက် ကြားကေ။",
    "အသက် မာမြာ့ ကျာလာရေ။",
    "အဒေါ်  အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ  သင်ပီးရေ။",
    "မသိ သကြား တစ်သက် ပတ်လုံး",
    "ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
    "ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
    "အဒေါ်  ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
    "အဝေး တခေါက် တစ်ခါ လားကေ",
    "အဒေါ်  လန်းကာ့ ကြည့်နီရေ။",
    "ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
    "အဒေါ်  ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
    "အဒေါ်  မျက်နှာကို တချက် မြင်ကေ",
    "ဒုတ်ခကိုလဲ မိလားရေ။"
]

# Hugging Face Hub id of the checkpoint used for synthesis.
_TTS_MODEL_NAME = "CLEAR-Global/marmaspeak-tts-v1"
# Longest input, in characters, that tts() accepts.
_MAX_TEXT_CHARS = 2000
# Fixed seed so repeated requests for the same text produce the same audio.
_TTS_SEED = 555


@functools.lru_cache(maxsize=1)
def _load_tts_model(model_name):
    """Load the tokenizer and model for *model_name*, once per process.

    The result is memoized so only the first request pays the loading cost.
    A failed load raises and is not cached, so a later request retries.
    """
    print("Loading model...")
    tokenizer = VitsTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    print("Model loaded.")
    return tokenizer, model


def tts(text):
    """
    Synthesize the given text

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            rejected, as is text longer than ``_MAX_TEXT_CHARS`` characters.

    Returns:
        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
        path of a newly written ``.wav`` file and ``message`` is the input
        text. On rejection or failure ``audio_path`` is ``None`` and
        ``message`` explains what went wrong; no exception is propagated.
    """
    # A cleared or never-filled input may arrive as None rather than "".
    if text is None or not text.strip():
        return None, "Please enter text or select a sample sentence"

    if len(text) > _MAX_TEXT_CHARS:
        return None, f"Text is too long ({len(text)} characters). Please keep it under {_MAX_TEXT_CHARS} characters."

    try:
        # Load the model and tokenizer (cached after the first call)
        tokenizer, model = _load_tts_model(_TTS_MODEL_NAME)

        print("Processing text...")

        # Preprocess the input text
        inputs = tokenizer(text=text, return_tensors="pt")

        # Make the speech synthesis deterministic
        set_seed(_TTS_SEED)

        # Generate the audio waveform
        print("Generating audio...")
        with torch.no_grad():
            outputs = model(**inputs)

        waveform = outputs.waveform[0]
        sample_rate = model.config.sampling_rate

        # Reserve a unique temporary path, then close the handle before
        # scipy reopens the file by name. delete=False keeps the file on disk
        # because its path is returned to the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            waveform_file = f.name
        write(waveform_file, sample_rate, waveform.numpy())

        print("Audio generation complete.")
        return waveform_file, text

    except Exception as e:
        # UI boundary: report the failure in the text output instead of
        # letting the request crash.
        print(f"Error in TTS: {str(e)}")
        return None, f"Error synthesizing text: {str(e)}"

def use_sample(sample_idx):
    """Synthesize the sample sentence at position ``sample_idx``.

    Returns the result of ``tts`` for that sentence, or
    ``(None, "Please select a valid sample")`` when the index is missing
    or falls outside ``marma_sentences``.
    """
    # Guard clause: reject a missing or out-of-range selection up front.
    if sample_idx is None or sample_idx < 0 or sample_idx >= len(marma_sentences):
        return None, "Please select a valid sample"
    return tts(marma_sentences[sample_idx])

def update_input_text(sample_idx):
    """Return the sample sentence for ``sample_idx`` to fill the input box.

    An empty string is returned when nothing is selected or the index does
    not refer to an entry of ``marma_sentences``.
    """
    is_valid = sample_idx is not None and 0 <= sample_idx < len(marma_sentences)
    return marma_sentences[sample_idx] if is_valid else ""

def clear_outputs():
    """Return blank values for the two output components (audio, text)."""
    cleared_audio = None
    cleared_text = ""
    return cleared_audio, cleared_text

def _sample_label(position, sentence, max_chars=30):
    """Return the numbered dropdown label for a sample sentence.

    The sentence is shortened to ``max_chars`` characters, and an ellipsis is
    appended only when something was actually cut off, so short sentences are
    not shown as if they were truncated.
    """
    if len(sentence) > max_chars:
        preview = f"{sentence[:max_chars]}..."
    else:
        preview = sentence
    return f"{position}. {preview}"


# Create Gradio interface
# Layout: a wide left column with the text input, action buttons and outputs,
# and a narrower right column with the sample-sentence picker.
with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
    gr.Markdown("""
    This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis 
    for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people 
    in Bangladesh and Myanmar.
    
    You can enter custom Marma text or select from the sample sentences.
    
    *Note: Model will load when you submit text. This may take a minute on first run.*
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Marma Text", 
                placeholder="Enter Marma text here...", 
                lines=3
            )
            
            with gr.Row():
                submit_btn = gr.Button("Synthesize", variant="primary")
                clear_btn = gr.Button("Clear")
            
            audio_output = gr.Audio(label="Generated Speech")
            text_display = gr.Textbox(label="Text Being Synthesized", interactive=False)
            
        with gr.Column(scale=1):
            gr.Markdown("### Sample Sentences")
            # type="index" makes the dropdown pass the selected position (not
            # the label) to its handlers, so the labels are display-only.
            sample_dropdown = gr.Dropdown(
                choices=[
                    _sample_label(number, sent)
                    for number, sent in enumerate(marma_sentences, start=1)
                ],
                label="Select a sample sentence",
                type="index"
            )
            use_sample_btn = gr.Button("Use Selected Sample")

    # Set up event handlers
    # Clicking "Synthesize" and pressing Enter in the textbox both run tts().
    submit_btn.click(
        fn=tts, 
        inputs=text_input, 
        outputs=[audio_output, text_display]
    )
    
    text_input.submit(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )
    
    # Synthesize the currently selected sample directly.
    use_sample_btn.click(
        fn=use_sample,
        inputs=sample_dropdown,
        outputs=[audio_output, text_display]
    )
    
    # "Clear" resets the two outputs; the input textbox is left as is.
    clear_btn.click(
        fn=clear_outputs,
        inputs=None,
        outputs=[audio_output, text_display]
    )
    
    # Choosing a sample copies its full text into the input textbox.
    sample_dropdown.change(
        fn=update_input_text,
        inputs=sample_dropdown,
        outputs=text_input
    )

# Launch the app
demo.launch()