barghavani committed
Commit 8edffb9 · 1 Parent(s): bc9d1ab

Update app.py

Files changed (1)
  1. app.py +695 -48
app.py CHANGED
@@ -1,65 +1,712 @@
- import tempfile
- from typing import Optional
- from TTS.config import load_config
  import gradio as gr
- import numpy as np
- from TTS.utils.manage import ModelManager
- from TTS.utils.synthesizer import Synthesizer
-
- MODELS = {}
- SPEAKERS = {}
- MAX_TXT_LEN = 100
-
- manager = ModelManager()
- MODEL_NAMES = ["saillab/xtts_v2_fa_revision1"]
-
- def tts(text: str):
-     model_name = "saillab/xtts_v2_fa_revision1"
-     if len(text) > MAX_TXT_LEN:
-         text = text[:MAX_TXT_LEN]
-         print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
-     print(text, model_name)
-     model_path, config_path, model_item = manager.download_model(model_name)
-     vocoder_name: Optional[str] = model_item["default_vocoder"]
-     vocoder_path = None
-     vocoder_config_path = None
-     if vocoder_name is not None:
-         vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
-     synthesizer = Synthesizer(model_path, config_path, None, None, vocoder_path, vocoder_config_path,)
-     if synthesizer is None:
-         raise NameError("model not found")
-     wavs = synthesizer.tts(text, None)
-     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-         synthesizer.save_wav(wavs, fp)
-     return fp.name
-
- title = """<h1 align="center">🐸💬 CoquiTTS Playground </h1>"""
  with gr.Blocks(analytics_enabled=False) as demo:
      with gr.Row():
          with gr.Column():
-             gr.Markdown("GitHub Markdown Details")
          with gr.Column():
-             gr.Markdown("GitHub Markdown Details")

      with gr.Row():
-         gr.Markdown("GitHub Markdown Details")

      with gr.Row():
          with gr.Column():
-             input_text = gr.inputs.Textbox(
-                 label="Input Text",
-                 default="This sentence has been generated by a speech synthesis system.",
              )
              tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
          with gr.Column():
-             output_audio = gr.outputs.Audio(label="Output", type="filepath")

-     tts_button.click(
-         tts,
-         inputs=[input_text],
-         outputs=[output_audio],
-     )

- demo.queue(concurrency_count=16).launch(debug=True)
+ import sys
+ import io, os, stat
+ import subprocess
+ import random
+ from zipfile import ZipFile
+ import uuid
+ import time
+ import torch
+ import torchaudio
+
+ # download the UniDic dictionary for MeCab (Japanese text processing)
+ os.system('python -m unidic download')
+
+ # By using XTTS you agree to the CPML license: https://coqui.ai/cpml
+ os.environ["COQUI_TOS_AGREED"] = "1"
+
+ # langid is used to detect the language of longer text.
+ # Most users expect the text to be in their own language; there is a checkbox to disable detection.
+ import langid
+ import base64
+ import csv
+ from io import StringIO
+ import datetime
+ import re
+
  import gradio as gr
+ from scipy.io.wavfile import write
+ from pydub import AudioSegment
+
+ from TTS.api import TTS
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ from huggingface_hub import HfApi
+ from huggingface_hub import hf_hub_download
+
+ # the API is used to restart the Space on an unrecoverable error
+ api = HfApi(token=HF_TOKEN)
+ repo_id = "saillab/xtts-base"
+
+ # Use a newer ffmpeg binary for Ubuntu 20 so denoising is available for microphone input
+ print("Export newer ffmpeg binary for denoise filter")
+ ZipFile("ffmpeg.zip").extractall()
+ print("Make ffmpeg binary executable")
+ st = os.stat("ffmpeg")
+ os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
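+ # (equivalent to `chmod u+x ffmpeg`: S_IEXEC sets the owner-execute bit)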
+
+ # This will trigger the model download
+ print("Downloading Coqui XTTS V2 (if not already downloaded)")
+ ### Vahid's modification - 12/9/2023 ###
+ #!mkdir SAIL_XTTS
+ SAIL_file = "checkpoint_30000.pth"  # checkpoint file name
+ SAIL_repo = "saillab/xtts_v2_fa_revision1"
+ model_file = hf_hub_download(repo_id=SAIL_repo, filename=SAIL_file, local_dir="SAIL_XTTS", use_auth_token=HF_TOKEN)
+ config_file = hf_hub_download(repo_id=SAIL_repo, filename='config.json', local_dir="SAIL_XTTS", use_auth_token=HF_TOKEN)
+ vocab_file = hf_hub_download(repo_id=SAIL_repo, filename='vocab.json', local_dir="SAIL_XTTS", use_auth_token=HF_TOKEN)
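+ # hf_hub_download returns the local path of each file; with local_dir="SAIL_XTTS"
+ # they land under ./SAIL_XTTS/, and already-downloaded files are reused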
+
+
+ # from TTS.utils.manage import ModelManager
+
+ # model_name = "saillab/xtts_v2_fa_revision1"
+ # ModelManager().download_model(model_name)
+ # model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+ print("XTTS downloaded")
+
+ config = XttsConfig()
+ # config.load_json(os.path.join(model_path, "config.json"))
+ config.load_json(config_file)
+
+ model = Xtts.init_from_config(config)
+ model.load_checkpoint(
+     config,
+     checkpoint_path=model_file,  # os.path.join(model_path, "model.pth")
+     vocab_path=vocab_file,  # os.path.join(model_path, "vocab.json")
+     eval=True,
+     use_deepspeed=True,
+ )
+ model.cuda()
+
+ # These globals are for debugging purposes only
+ DEVICE_ASSERT_DETECTED = 0
+ DEVICE_ASSERT_PROMPT = None
+ DEVICE_ASSERT_LANG = None
+
+ supported_languages = config.languages
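+ # config.languages is the list of language codes the checkpoint supports,
+ # e.g. ["en", "es", "fr", ...]; the exact list depends on the shipped config.json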
+
+ def predict(
+     prompt,
+     language,
+     audio_file_pth,
+     mic_file_path,
+     use_mic,
+     voice_cleanup,
+     no_lang_auto_detect,
+     agree,
+ ):
+     if agree == True:
+         if language not in supported_languages:
+             gr.Warning(
+                 f"The language you chose ({language}) is not in our supported languages, please choose one from the dropdown"
+             )
+
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+
+         language_predicted = langid.classify(prompt)[
+             0
+         ].strip()  # strip needed as there is a space at the end!
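+         # langid.classify returns a (language_code, score) pair, e.g. ("fa", -42.0);
+         # we keep only the code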
+
+         # tts expects Chinese as zh-cn
+         if language_predicted == "zh":
+             # we use zh-cn
+             language_predicted = "zh-cn"
+
+         print(f"Detected language:{language_predicted}, Chosen language:{language}")
+
+         # Trigger language detection only for text longer than 15 characters
+         if len(prompt) > 15:
+             # allow any language for short text, as some snippets are common to several languages
+             # If the user unchecks language auto-detection, it will not trigger
+             # You may remove this completely for your own use
+             if language_predicted != language and not no_lang_auto_detect:
+                 # Please duplicate this Space and remove this check if you really want this,
+                 # or if the auto-detector fails to identify the language (which it can on pretty short or mixed text)
+                 gr.Warning(
+                     "It looks like your text isn't in the language you chose. If you're sure it is, please check the 'disable language auto-detection' checkbox"
+                 )
+
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+         if use_mic == True:
+             if mic_file_path is not None:
+                 speaker_wav = mic_file_path
+             else:
+                 gr.Warning(
+                     "Please record your voice with the microphone, or uncheck Use Microphone to use reference audios"
+                 )
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+         else:
+             speaker_wav = audio_file_pth
+
+         # Filtering for microphone input, as it may have background noise and silence at the beginning and end
+         # This is fast filtering, not perfect
+
+         # Apply all on demand
+         lowpassfilter = denoise = trim = loudness = True
+
+         if lowpassfilter:
+             lowpass_highpass = "lowpass=8000,highpass=75,"
+         else:
+             lowpass_highpass = ""
+
+         if trim:
+             # better to remove silence at the beginning and end for microphone input
+             trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+         else:
+             trim_silence = ""
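+         # note on the trim chain above: silenceremove only trims leading silence, so the
+         # audio is reversed, trimmed, reversed back, and trimmed again to clean both ends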
+
+         if voice_cleanup:
+             try:
+                 out_filename = (
+                     speaker_wav + str(uuid.uuid4()) + ".wav"
+                 )  # the .wav extension lets ffmpeg know the output format
+
+                 # we use the newer ffmpeg binary, as it has the afftdn denoise filter
+                 shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
+                     " "
+                 )
+
+                 command_result = subprocess.run(
+                     [item for item in shell_command],
+                     capture_output=False,
+                     text=True,
+                     check=True,
+                 )
+                 speaker_wav = out_filename
+                 print("Filtered microphone input")
+             except subprocess.CalledProcessError:
+                 # the command exited with a non-zero code
+                 print("Error: filtering failed, using original microphone input")
+         else:
+             speaker_wav = speaker_wav
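+         # caveat: building the command via str.split(" ") breaks if the input path
+         # contains spaces; Gradio temp-file paths normally do not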
+
+         if len(prompt) < 2:
+             gr.Warning("Please give a longer prompt text")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         if len(prompt) > 200:
+             gr.Warning(
+                 "Text length is limited to 200 characters for this demo, please try shorter text. You can clone this Space and edit the code for your own usage"
+             )
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         global DEVICE_ASSERT_DETECTED
+         if DEVICE_ASSERT_DETECTED:
+             global DEVICE_ASSERT_PROMPT
+             global DEVICE_ASSERT_LANG
+             # It will likely never get here, as we now restart the Space on the first unrecoverable error
+             print(
+                 f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
+             )
+
+             # HF Space specific: this error is unrecoverable, the Space needs to be restarted
+             space = api.get_space_runtime(repo_id=repo_id)
+             if space.stage != "BUILDING":
+                 api.restart_space(repo_id=repo_id)
+             else:
+                 print("TRIED TO RESTART but space is building")
+
+         try:
+             metrics_text = ""
+             t_latent = time.time()
+
+             # note: diffusion_conditioning is not used on hifigan (the default mode); it will be empty but must still be passed to model.inference
+             try:
+                 (
+                     gpt_cond_latent,
+                     speaker_embedding,
+                 ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
+             except Exception as e:
+                 print("Speaker encoding error", str(e))
+                 gr.Warning(
+                     "It appears something is wrong with the reference audio, did you unmute your microphone?"
+                 )
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+             latent_calculation_time = time.time() - t_latent
+             # metrics_text = f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
+             # temporary comma fix
+             prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
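+             # e.g. "Hello. How are you?" becomes "Hello .. How are you ??",
+             # which nudges the model to pause at sentence boundaries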
+
+             wav_chunks = []
+             ## Direct mode
+
+             print("I: Generating new audio...")
+             t0 = time.time()
+             out = model.inference(
+                 prompt,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 repetition_penalty=5.0,
+                 temperature=0.75,
+             )
+             inference_time = time.time() - t0
+             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+             metrics_text += f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+             real_time_factor = (time.time() - t0) / out['wav'].shape[-1] * 24000
+             print(f"Real-time factor (RTF): {real_time_factor}")
+             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
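+             # RTF = generation time / audio duration (out["wav"] is 1-D at 24 kHz,
+             # so duration = n_samples / 24000); RTF < 1 means faster than real time.
+             # unsqueeze(0) adds the channel dimension torchaudio.save expects.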
+
+
+             """
+             print("I: Generating new audio in streaming mode...")
+             t0 = time.time()
+             chunks = model.inference_stream(
+                 prompt,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 repetition_penalty=7.0,
+                 temperature=0.85,
+             )
+             first_chunk = True
+             for i, chunk in enumerate(chunks):
+                 if first_chunk:
+                     first_chunk_time = time.time() - t0
+                     metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                     first_chunk = False
+                 wav_chunks.append(chunk)
+                 print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+             inference_time = time.time() - t0
+             print(
+                 f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
+             )
+             # metrics_text += (
+             #     f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+             # )
+             wav = torch.cat(wav_chunks, dim=0)
+             print(wav.shape)
+             real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
+             print(f"Real-time factor (RTF): {real_time_factor}")
+             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+             torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+             """
+
+         except RuntimeError as e:
+             if "device-side assert" in str(e):
+                 # cannot do anything about a CUDA device-side error, need to restart
+                 print(
+                     f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
+                     flush=True,
+                 )
+                 gr.Warning("Unhandled Exception encountered, please retry in a minute")
+                 print("Cuda device-assert Runtime encountered, need restart")
+                 if not DEVICE_ASSERT_DETECTED:
+                     DEVICE_ASSERT_DETECTED = 1
+                     DEVICE_ASSERT_PROMPT = prompt
+                     DEVICE_ASSERT_LANG = language
+
+                 # just before restarting, save what caused the issue so we can handle it in the future
+                 # uploading error data only happens for unrecoverable errors
+                 error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
+                 error_data = [
+                     error_time,
+                     prompt,
+                     language,
+                     audio_file_pth,
+                     mic_file_path,
+                     use_mic,
+                     voice_cleanup,
+                     no_lang_auto_detect,
+                     agree,
+                 ]
+                 error_data = [str(e) if type(e) != str else e for e in error_data]
+                 print(error_data)
+                 print(speaker_wav)
+                 write_io = StringIO()
+                 csv.writer(write_io).writerows([error_data])
+                 csv_upload = write_io.getvalue().encode()
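+                 # csv.writer fills the StringIO buffer; .getvalue().encode() turns it
+                 # into bytes, which upload_file accepts as path_or_fileobj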
+
+                 filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
+                 print("Writing error csv")
+                 error_api = HfApi()
+                 error_api.upload_file(
+                     path_or_fileobj=csv_upload,
+                     path_in_repo=filename,
+                     repo_id="coqui/xtts-flagged-dataset",
+                     repo_type="dataset",
+                 )
+
+                 # speaker_wav
+                 print("Writing error reference audio")
+                 speaker_filename = (
+                     error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
+                 )
+                 error_api = HfApi()
+                 error_api.upload_file(
+                     path_or_fileobj=speaker_wav,
+                     path_in_repo=speaker_filename,
+                     repo_id="coqui/xtts-flagged-dataset",
+                     repo_type="dataset",
+                 )
+
+                 # HF Space specific: this error is unrecoverable, the Space needs to be restarted
+                 space = api.get_space_runtime(repo_id=repo_id)
+                 if space.stage != "BUILDING":
+                     api.restart_space(repo_id=repo_id)
+                 else:
+                     print("TRIED TO RESTART but space is building")
+
+             else:
+                 if "Failed to decode" in str(e):
+                     print("Speaker encoding error", str(e))
+                     gr.Warning(
+                         "It appears something is wrong with the reference audio, did you unmute your microphone?"
+                     )
+                 else:
+                     print("RuntimeError: non device-side assert error:", str(e))
+                     gr.Warning("Something unexpected happened, please retry.")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         return (
+             gr.make_waveform(
+                 audio="output.wav",
+             ),
+             "output.wav",
+             metrics_text,
+             speaker_wav,
+         )
+     else:
+         gr.Warning("Please accept the Terms & Conditions!")
+         return (
+             None,
+             None,
+             None,
+             None,
+         )
+
+
+ title = "Coqui🐸 XTTS"
+
+ description = """
+ <br/>
+ This demo is currently running **XTTS v2.0.3**
+ <br/>
+ <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a text-to-speech model that lets you clone voices into different languages.
+ <br/>
+ This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
+ <br/>
+ There are 17 languages:
+ <p>
+ Arabic: ar, Brazilian Portuguese: pt, Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi <br/>
+ </p>
+ <br/>
+ Leave a star 🌟 on the GitHub repo <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
+ <br/>
+ """
+
+ links = """
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
+ |                                 |                                          |
+ | ------------------------------- | ---------------------------------------- |
+ | 🐸💬 **CoquiTTS**                | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a> |
+ | 💼 **Documentation**             | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
+ | 👩‍💻 **Questions**                | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
+ | 🗯 **Community**                 | [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
+ """
+
+ article = """
+ <div style='margin:20px auto;'>
+ <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
+ <p>We collect data only for error cases, to improve the service.</p>
+ </div>
+ """
+ examples = [
+     [
+         "Once when I was six years old I saw a magnificent picture",
+         "en",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
+         "fr",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
+         "de",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Cuando tenía seis años, vi una vez una imagen magnífica",
+         "es",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
+         "pt",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
+         "pl",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
+         "it",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
+         "tr",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Когда мне было шесть лет, я увидел однажды удивительную картинку",
+         "ru",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
+         "nl",
+         "examples/male.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
+         "cs",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "当我还只有六岁的时候, 看到了一副精彩的插画",
+         "zh-cn",
+         "examples/female.wav",
+         None,
+         False,
+         False,
+         False,
+         True,
+     ],
+     [
+         "かつて 六歳のとき、素晴らしい絵を見ました",
+         "ja",
+         "examples/female.wav",
+         None,
+         False,
+         True,
+         False,
+         True,
+     ],
+     [
+         "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
+         "ko",
+         "examples/female.wav",
+         None,
+         False,
+         True,
+         False,
+         True,
+     ],
+     [
+         "Egyszer hat éves koromban láttam egy csodálatos képet",
+         "hu",
+         "examples/male.wav",
+         None,
+         False,
+         True,
+         False,
+         True,
+     ],
+ ]
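+ # each row supplies the inputs wired to predict below, in order:
+ # [prompt, language, reference wav, mic wav, use_mic, voice_cleanup, disable auto-detect, agree]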
+
+
 
  with gr.Blocks(analytics_enabled=False) as demo:
      with gr.Row():
          with gr.Column():
+             gr.Markdown(
+                 """
+                 ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+                 """
+             )
          with gr.Column():
+             # placeholder to align the image
+             pass

      with gr.Row():
+         with gr.Column():
+             gr.Markdown(description)
+         with gr.Column():
+             gr.Markdown(links)

      with gr.Row():
          with gr.Column():
+             input_text_gr = gr.Textbox(
+                 label="Text Prompt",
+                 info="One or two sentences at a time is better. Up to 200 text characters.",
+                 value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
+             )
+             language_gr = gr.Dropdown(
+                 label="Language",
+                 info="Select an output language for the synthesised speech",
+                 choices=[
+                     "en",
+                     "es",
+                     "fr",
+                     "de",
+                     "it",
+                     "pt",
+                     "pl",
+                     "tr",
+                     "ru",
+                     "nl",
+                     "cs",
+                     "ar",
+                     "zh-cn",
+                     "hu",
+                     "ko",
+                     "ja",
+                     "hi",
+                     "fa",
+                 ],
+                 max_choices=1,
+                 value="en",
+             )
+             ref_gr = gr.Audio(
+                 label="Reference Audio",
+                 info="Click on the ✎ button to upload your own target speaker audio",
+                 type="filepath",
+                 value="examples/female.wav",
              )
+             mic_gr = gr.Audio(
+                 source="microphone",
+                 type="filepath",
+                 info="Use your microphone to record audio",
+                 label="Use Microphone for Reference",
+             )
+             use_mic_gr = gr.Checkbox(
+                 label="Use Microphone",
+                 value=False,
+                 info="Notice: Microphone input may not work properly under heavy traffic",
+             )
+             clean_ref_gr = gr.Checkbox(
+                 label="Cleanup Reference Voice",
+                 value=False,
+                 info="This check can improve output if your microphone or reference voice is noisy",
+             )
+             auto_det_lang_gr = gr.Checkbox(
+                 label="Do not use language auto-detect",
+                 value=False,
+                 info="Check to disable language auto-detection",
+             )
+             tos_gr = gr.Checkbox(
+                 label="Agree",
+                 value=False,
+                 info="I have purchased a commercial license from Coqui: [email protected]\nOtherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml",
+             )
+
              tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

          with gr.Column():
+             video_gr = gr.Video(label="Waveform Visual")
+             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+             out_text_gr = gr.Text(label="Metrics")
+             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+     with gr.Row():
+         gr.Examples(examples,
+                     label="Examples",
+                     inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+                     outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
+                     fn=predict,
+                     cache_examples=False,)

+     tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
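+     # predict returns (waveform video, output wav path, metrics text, reference wav used),
+     # matching outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr]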

+ demo.queue()
+ demo.launch(debug=True, show_api=True)