Spaces:

CLEAR-Global
/

MarmaSpeak-TTS

Sleeping

App Files Files Community

MarmaSpeak-TTS / app.py

alp

Update app.py

4ea0688 verified 6 months ago

raw

history blame contribute delete

6.16 kB

	import gradio as gr
	import torch
	from transformers import VitsTokenizer, VitsModel, set_seed
	import tempfile
	from scipy.io.wavfile import write
	import numpy as np

	# Predefined Marma sample sentences offered in the "Sample Sentences" dropdown.
	# The order of this list matters: the dropdown is created with type="index", so the
	# handlers use_sample() and update_input_text() receive a position in this list and
	# look the sentence up by that position.
	marma_sentences = [
	"အဒေါ် မျက်နှာ",
	"ဆေနိ ဆေရက် ဒို့ခခံရို့ ဧလောကတို့ ယူခါရေ အမိ။",
	"ရင်ဖတ်၏သွီးကို နို့ပျင်ရို့။",
	"အကျွန်ဧ အသက် ကို ဟြင်‌အောင်ပျင်ရေ။",
	"အကျွန့် အရှေခါ တစ်ခုလဲ မသိ။",
	"မွတ်ကေ နာကယ် ငိုရေ။",
	"မိခင်(အဒေါ်)၏ အသန် တစ်ချက် ကြားကေ။",
	"အသက် မာမြာ့ ကျာလာရေ။",
	"အဒေါ် အကျွန့် မှာ ပထမ ဆရာ ငို ပညာ သင်ပီးရေ။",
	"မသိ သကြား တစ်သက် ပတ်လုံး",
	"ဧသဲဇာ့ ကာ ဖျစ်ပီးရေ။",
	"ငို့ မာ တခါ ခန္ဒာ မကောင်း ဖြစ်ကေ",
	"အဒေါ် ယာခါ ဝေဆာရေ/စိတ်ဆိုးရေ။",
	"အဝေး တခေါက် တစ်ခါ လားကေ",
	"အဒေါ် လန်းကာ့ ကြည့်နီရေ။",
	"ဧ လောကမာ ကံကောင်ရယ် ကျေးဇု",
	"အဒေါ် ဆိုဗော်/အမိ ခေါ်ရာရေ၊",
	"အဒေါ် မျက်နှာကို တချက် မြင်ကေ",
	"ဒုတ်ခကိုလဲ မိလားရေ။"
	]

	# Hugging Face Hub id of the checkpoint used for synthesis.
	_MODEL_NAME = "CLEAR-Global/marmaspeak-tts-v1"
	# Longest input, in characters, that tts() accepts.
	_MAX_TEXT_LENGTH = 2000
	# Holds the (tokenizer, model) pair once loaded so later requests reuse it
	# instead of calling from_pretrained() again on every synthesis.
	_MODEL_CACHE = {}


	def _load_model():
	    """Return the (tokenizer, model) pair, loading it only on first use."""
	    if "pair" not in _MODEL_CACHE:
	        print("Loading model...")
	        tokenizer = VitsTokenizer.from_pretrained(_MODEL_NAME)
	        model = VitsModel.from_pretrained(_MODEL_NAME)
	        _MODEL_CACHE["pair"] = (tokenizer, model)
	        print("Model loaded.")
	    return _MODEL_CACHE["pair"]


	def tts(text):
	    """
	    Synthesize the given text.

	    Args:
	        text: The text to speak. ``None``, empty and whitespace-only values
	            are rejected, as is anything longer than 2000 characters.

	    Returns:
	        A ``(audio_path, message)`` tuple. On success ``audio_path`` is the
	        path of a temporary ``.wav`` file and ``message`` is the input text.
	        On rejection or failure ``audio_path`` is ``None`` and ``message``
	        explains what went wrong.
	    """
	    # Guard against None as well as blank strings so a missing value never
	    # reaches .strip() and raises AttributeError.
	    if not text or not text.strip():
	        return None, "Please enter text or select a sample sentence"

	    if len(text) > _MAX_TEXT_LENGTH:
	        return None, (
	            f"Text is too long ({len(text)} characters). "
	            f"Please keep it under {_MAX_TEXT_LENGTH} characters."
	        )

	    try:
	        tokenizer, model = _load_model()

	        print("Processing text...")

	        # Preprocess the input text
	        inputs = tokenizer(text=text, return_tensors="pt")

	        # Make the speech synthesis deterministic
	        set_seed(555)

	        # Generate the audio waveform
	        print("Generating audio...")
	        with torch.no_grad():
	            outputs = model(**inputs)

	        waveform = outputs.waveform[0]
	        sample_rate = model.config.sampling_rate

	        # Reserve a temporary file name, then close the handle before writing:
	        # writing by path while the handle is still open fails on platforms
	        # that lock open files.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            waveform_file = tmp.name
	        write(waveform_file, sample_rate, waveform.numpy())

	        print("Audio generation complete.")
	        return waveform_file, text

	    except Exception as e:
	        # UI boundary: report the failure as the message instead of raising.
	        print(f"Error in TTS: {str(e)}")
	        return None, f"Error synthesizing text: {str(e)}"

	def use_sample(sample_idx):
	    """Synthesize the sample sentence at position ``sample_idx``.

	    Returns whatever ``tts`` returns for that sentence, or
	    ``(None, "Please select a valid sample")`` when ``sample_idx`` is
	    ``None`` or falls outside ``marma_sentences``.
	    """
	    if sample_idx is None or not (0 <= sample_idx < len(marma_sentences)):
	        return None, "Please select a valid sample"
	    return tts(marma_sentences[sample_idx])

	def update_input_text(sample_idx):
	    """Return the sample sentence to place in the input textbox.

	    Gives the sentence at position ``sample_idx`` in ``marma_sentences``,
	    or an empty string when ``sample_idx`` is ``None`` or out of range.
	    """
	    if sample_idx is None or not (0 <= sample_idx < len(marma_sentences)):
	        return ""
	    chosen_sentence = marma_sentences[sample_idx]
	    return chosen_sentence

	def clear_outputs():
	    """Return the empty values that reset both outputs: no audio and blank text."""
	    cleared_audio = None
	    cleared_text = ""
	    return cleared_audio, cleared_text

	def _sample_label(position, sentence, max_chars=30):
	    """Build one dropdown label: a 1-based number followed by a sentence preview."""
	    preview = sentence[:max_chars]
	    # Add the ellipsis only when the sentence was actually cut, so short
	    # sentences are not shown as if they had been truncated.
	    ellipsis = "..." if len(sentence) > max_chars else ""
	    return f"{position}. {preview}{ellipsis}"


	# Create Gradio interface
	with gr.Blocks(title="MarmaSpeakTTS Demo") as demo:
	    gr.Markdown("# MarmaSpeakTTS: Marma Language Text-to-Speech Demo")
	    gr.Markdown("""
	This demo showcases the MarmaSpeakTTS model, which provides text-to-speech synthesis
	for the Marma language (ISO code: rmz), a Tibeto-Burman language spoken by the Marma people
	in Bangladesh and Myanmar.

	You can enter custom Marma text or select from the sample sentences.

	Note: Model will load when you submit text. This may take a minute on first run.
	""")

	    with gr.Row():
	        # Left column: free-text input, action buttons and the results.
	        with gr.Column(scale=2):
	            text_input = gr.Textbox(
	                label="Marma Text",
	                placeholder="Enter Marma text here...",
	                lines=3
	            )

	            with gr.Row():
	                submit_btn = gr.Button("Synthesize", variant="primary")
	                clear_btn = gr.Button("Clear")

	            audio_output = gr.Audio(label="Generated Speech")
	            text_display = gr.Textbox(label="Text Being Synthesized", interactive=False)

	        # Right column: predefined sample sentences.
	        with gr.Column(scale=1):
	            gr.Markdown("### Sample Sentences")
	            sample_dropdown = gr.Dropdown(
	                choices=[
	                    _sample_label(i + 1, sent)
	                    for i, sent in enumerate(marma_sentences)
	                ],
	                label="Select a sample sentence",
	                # Handlers receive the selected position, not the label text.
	                type="index"
	            )
	            use_sample_btn = gr.Button("Use Selected Sample")

	    # Every synthesis-related handler returns (audio, text) for these two components.
	    synthesis_outputs = [audio_output, text_display]

	    # Set up event handlers
	    submit_btn.click(
	        fn=tts,
	        inputs=text_input,
	        outputs=synthesis_outputs
	    )

	    # Pressing Enter in the textbox behaves like the Synthesize button.
	    text_input.submit(
	        fn=tts,
	        inputs=text_input,
	        outputs=synthesis_outputs
	    )

	    use_sample_btn.click(
	        fn=use_sample,
	        inputs=sample_dropdown,
	        outputs=synthesis_outputs
	    )

	    # Clear resets the two outputs only; the input textbox is left as it is.
	    clear_btn.click(
	        fn=clear_outputs,
	        inputs=None,
	        outputs=synthesis_outputs
	    )

	    # Selecting a sample copies its full sentence into the input textbox.
	    sample_dropdown.change(
	        fn=update_input_text,
	        inputs=sample_dropdown,
	        outputs=text_input
	    )

	# Launch the app
	demo.launch()