MohamedRashad commited on
Commit
69b575f
·
1 Parent(s): 2d772b8

Add initial implementation of Egyptian-Arabic TTS with Gradio interface

Browse files
Files changed (2) hide show
  1. app.py +80 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from TTS.tts.configs.xtts_config import XttsConfig
3
+ from TTS.tts.models.xtts import Xtts
4
+ from pathlib import Path
5
+ import gradio as gr
6
+ import spaces
7
+
8
# Model asset locations on the Hugging Face Hub.
CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'

base_path = Path(__file__).parent


def _download_if_missing(url: str, destination: Path) -> Path:
    """Download *url* to *destination* unless the file is already present.

    Returns *destination* so callers can chain the result.
    """
    if not destination.exists():
        # torch.hub.download_url_to_file documents a string destination path,
        # so convert the Path explicitly.
        torch.hub.download_url_to_file(url, str(destination))
    return destination


# Fetch the model assets next to this script (no-op on subsequent runs).
config_path = _download_if_missing(CONFIG_URL, base_path / 'config.json')
vocab_path = _download_if_missing(VOCAB_URL, base_path / 'vocab.json')
model_path = _download_if_missing(MODEL_URL, base_path / 'model.pth')
speaker_audio_path = _download_if_missing(SPEAKER_AUDIO_URL, base_path / 'speaker_reference.wav')

config_path = str(config_path)
vocab_path = str(vocab_path)
# NOTE: load_checkpoint below takes the *directory* containing model.pth,
# hence .parent here.
model_path = str(model_path.parent)
speaker_audio_path = str(speaker_audio_path)
33
+
34
print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)
# DeepSpeed inference requires CUDA; the original passed use_deepspeed=True
# unconditionally, which breaks startup on CPU-only hosts even though the
# device fallback above explicitly supports them. Enable it only on GPU.
model.load_checkpoint(
    config,
    checkpoint_dir=model_path,
    use_deepspeed=(device == "cuda"),
    vocab_path=vocab_path,
)
model.to(device)
41
+
42
@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    """Synthesize speech for *text* in the voice of the reference clip.

    Args:
        text: The text to synthesize (Egyptian Arabic).
        speaker_audio_path: Path to the reference audio of the target voice.
        temperature: Sampling temperature passed to the model.

    Returns:
        A ``(sample_rate, waveform)`` tuple as expected by ``gr.Audio``.
    """
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=[speaker_audio_path]
    )

    print("Inference...")
    result = model.inference(
        text,
        "ar",
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )

    # 24000 is the output sample rate this app assumes for the model;
    # presumably it matches config.audio.output_sample_rate — verify there.
    return 24000, result["wav"]
57
+
58
header = """<h1 style="text-align:center">Egyptian-Arabic-TTS (EGTTS)</h1>

## Instructions:

1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.

**This space was only possible because of the amazing work done by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
"""
with gr.Blocks(title="EGTTS") as app:
    gr.Markdown(header)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Text to synthesize", value="السلام عليكم ورحمة الله")
            speaker_reference = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
            # BUG FIX: gr.Slider takes `minimum`/`maximum`, not
            # `min_value`/`max_value` — the original kwargs raise a TypeError
            # when the UI is built.
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05)
            generate_btn = gr.Button(value="Generate", variant="primary")
            output = gr.Audio(label="Synthesized audio")

    generate_btn.click(infer_EGTTS, inputs=[text, speaker_reference, temperature], outputs=output)

app.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ git+https://github.com/coqui-ai/TTS
2
+ transformers
3
+ deepspeed
4
+ torch
5
+ torchaudio
6
+ spaces
7
+ gradio