Spaces:
Sleeping
Sleeping
File size: 6,156 Bytes
a106764 f33dae4 a106764 4ea0688 a106764 4ea0688 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 a106764 4ea0688 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 a106764 f33dae4 4ea0688 a106764 f33dae4 4ea0688 f33dae4 a106764 4ea0688 a106764 4ea0688 a106764 4ea0688 a106764 f33dae4 a106764 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import functools
import tempfile

import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write
from transformers import VitsTokenizer, VitsModel, set_seed
# Predefined Marma sentences.
# They are offered in the "Sample Sentences" dropdown; the dropdown passes the
# selected position (an index into this list) to the handlers, so the order of
# the entries determines which sentence each dropdown choice maps to.
marma_sentences = [
    "အဒေါ် မျက်နှာ",
    "ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
    "ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
    "အကျွန်ဧ အသက် ကို ဟြင်အောင်ပျင်ရေ။",
    "အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
    "မွတ်ကေ နာကယ် ငိုရေ။",
    "မိခင်(အဒေါ်)၏ အသန် တစ်ချက် ကြားကေ။",
    "အသက် မာမြာ့ ကျာလာရေ။",
    "အဒေါ် အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ သင်ပီးရေ။",
    "မသိ သကြား တစ်သက် ပတ်လုံး",
    "ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
    "ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
    "အဒေါ် ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
    "အဝေး တခေါက် တစ်ခါ လားကေ",
    "အဒေါ် လန်းကာ့ ကြည့်နီရေ။",
    "ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
    "အဒေါ် ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
    "အဒေါ် မျက်နှာကို တချက် မြင်ကေ",
    "ဒုတ်ခကိုလဲ မိလားရေ။"
]
@functools.lru_cache(maxsize=1)
def _load_model(model_name="CLEAR-Global/marmaspeak-tts-v1"):
    """Load the tokenizer and model for ``model_name`` once and reuse them.

    The result is cached, so only the first successful call pays the loading
    cost. A call that raises is not cached and will be retried next time.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print("Loading model...")
    tokenizer = VitsTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    return tokenizer, model


def tts(text):
    """Synthesize the given text and save the audio to a temporary WAV file.

    Args:
        text: The text to synthesize. ``None``, empty, or whitespace-only
            input is rejected, as is text longer than 2000 characters.

    Returns:
        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
        path of the generated ``.wav`` file and ``message`` is the synthesized
        text. On failure ``audio_path`` is ``None`` and ``message`` describes
        the problem.
    """
    # Treat a missing value like empty input instead of crashing on .strip().
    if text is None or not text.strip():
        return None, "Please enter text or select a sample sentence"
    if len(text) > 2000:
        return None, f"Text is too long ({len(text)} characters). Please keep it under 2000 characters."
    try:
        # Cached after the first call, so later requests skip the reload.
        tokenizer, model = _load_model()
        print("Model loaded. Processing text...")
        # Preprocess the input text
        inputs = tokenizer(text=text, return_tensors="pt")
        # Make the speech synthesis deterministic
        set_seed(555)
        # Generate the audio waveform
        print("Generating audio...")
        with torch.no_grad():
            outputs = model(**inputs)
        waveform = outputs.waveform[0]
        sample_rate = model.config.sampling_rate
        # Reserve a unique path, then write after the handle is closed:
        # writing by name to a still-open temp file is not portable.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            waveform_file = f.name
        write(waveform_file, sample_rate, waveform.numpy())
        print("Audio generation complete.")
        return waveform_file, text
    except Exception as e:
        # UI boundary: report the failure in the status textbox rather than
        # letting the request crash.
        print(f"Error in TTS: {e}")
        return None, f"Error synthesizing text: {e}"
def use_sample(sample_idx):
    """Synthesize the sample sentence at position ``sample_idx``.

    Returns whatever ``tts`` returns for that sentence, or
    ``(None, "Please select a valid sample")`` when ``sample_idx`` is ``None``
    or outside the bounds of ``marma_sentences``.
    """
    # Nothing selected.
    if sample_idx is None:
        return None, "Please select a valid sample"
    # Selected position does not exist in the list.
    if not 0 <= sample_idx < len(marma_sentences):
        return None, "Please select a valid sample"
    return tts(marma_sentences[sample_idx])
def update_input_text(sample_idx):
    """Return the sample sentence at ``sample_idx`` for the input textbox.

    Yields an empty string when ``sample_idx`` is ``None`` or does not point
    at an entry of ``marma_sentences``.
    """
    has_valid_choice = sample_idx is not None and 0 <= sample_idx < len(marma_sentences)
    return marma_sentences[sample_idx] if has_valid_choice else ""
def clear_outputs():
    """Return the reset values for the two outputs: no audio and empty text."""
    cleared_audio = None
    cleared_text = ""
    return cleared_audio, cleared_text
# Dropdown labels: a 1-based number followed by a preview of at most 30
# characters. The ellipsis is appended only when the sentence was actually
# cut, so short sentences are not shown as if they were truncated.
sample_labels = [
    f"{number}. {sentence[:30]}..." if len(sentence) > 30 else f"{number}. {sentence}"
    for number, sentence in enumerate(marma_sentences, start=1)
]

# Create Gradio interface
with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
    gr.Markdown("""
This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis
for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people
in Bangladesh and Myanmar.
You can enter custom Marma text or select from the sample sentences.
*Note: Model will load when you submit text. This may take a minute on first run.*
""")
    with gr.Row():
        # Left column: free-text input, action buttons and the results.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Marma Text",
                placeholder="Enter Marma text here...",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Synthesize", variant="primary")
                clear_btn = gr.Button("Clear")
            audio_output = gr.Audio(label="Generated Speech")
            text_display = gr.Textbox(label="Text Being Synthesized", interactive=False)
        # Right column: predefined sample sentences.
        with gr.Column(scale=1):
            gr.Markdown("### Sample Sentences")
            sample_dropdown = gr.Dropdown(
                choices=sample_labels,
                label="Select a sample sentence",
                # Handlers receive the selected position, not the label text.
                type="index"
            )
            use_sample_btn = gr.Button("Use Selected Sample")

    # Set up event handlers
    # Clicking "Synthesize" and submitting the textbox both run synthesis.
    submit_btn.click(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )
    text_input.submit(
        fn=tts,
        inputs=text_input,
        outputs=[audio_output, text_display]
    )
    # Synthesize the sentence currently selected in the dropdown.
    use_sample_btn.click(
        fn=use_sample,
        inputs=sample_dropdown,
        outputs=[audio_output, text_display]
    )
    # Reset the audio player and the status textbox.
    clear_btn.click(
        fn=clear_outputs,
        inputs=None,
        outputs=[audio_output, text_display]
    )
    # Copy the selected sample into the input textbox.
    sample_dropdown.change(
        fn=update_input_text,
        inputs=sample_dropdown,
        outputs=text_input
    )

# Launch the app
demo.launch()