Den4ikAI committed
Commit 6ad1164 · verified · 1 Parent(s): 923bdc3

Update app.py

Files changed (1)
  1. app.py +250 -160

app.py CHANGED
@@ -1,13 +1,21 @@
  import gc
  import json
  import tempfile

  import gradio as gr
  import numpy as np
  import soundfile as sf
  import torch
  import torchaudio
- from cached_path import cached_path
  from ruaccent import RUAccent
  import onnx_asr

@@ -22,54 +30,131 @@ from f5_tts.infer.utils_infer import (
  )
  from f5_tts.model import DiT


- # --- Model configuration ---
- MODEL_CFG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

- # Paths for all models (fill in your own paths)
- MODEL_PATHS = {
-     "ESpeech-TTS-1 [RL] V2": "stripped_states/espeech_tts_rlv2.pt",
-     "ESpeech-TTS-1 [RL] V1": "stripped_states/espeech_tts_rlv1.pt",
-     "ESpeech-TTS-1 [SFT] 95K": "stripped_states/espeech_tts_95k.pt",
-     "ESpeech-TTS-1 [SFT] 265K": "stripped_states/espeech_tts_256k.pt",
-     "ESpeech-TTS-1 PODCASTER [SFT]": "stripped_states/espeech_tts_podcaster.pt"
  }

- # Shared vocabulary path (same for all models)
- VOCAB_PATH = "/media/denis/work/f5tts/F5-TTS/base_checkpoint1/vocab.txt"

- # Load vocoder (shared)
- vocoder = load_vocoder()

- # Dictionary to store loaded models
- loaded_models = {}

- # Initialize RUAccent
  print("Loading RUAccent...")
  accentizer = RUAccent()
  accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)
- print("RUAccent loaded successfully.")

- # Initialize ASR model
- print("Loading ASR model...")
  asr_model = onnx_asr.load_model("nemo-fastconformer-ru-rnnt")
- print("ASR model loaded successfully.")
-
- # Load all models at startup
- print("Loading models...")
- for model_name, model_path in MODEL_PATHS.items():
-     print(f"Loading {model_name}...")
-     loaded_models[model_name] = load_model(
-         DiT,
-         MODEL_CFG,
-         model_path,
-         vocab_file=VOCAB_PATH
-     )
-     print(f"{model_name} loaded successfully.")
-
- print("All models loaded successfully.")


  def synthesize(
      model_choice,
      ref_audio,
@@ -81,174 +166,180 @@ def synthesize(
      nfe_step=32,
      speed=1.0,
  ):
      if not ref_audio:
          gr.Warning("Please provide reference audio.")
          return None, None, ref_text

-     if seed < 0 or seed > 2**31 - 1:
          seed = np.random.randint(0, 2**31 - 1)
-     torch.manual_seed(seed)

-     if not gen_text.strip():
          gr.Warning("Please enter text to generate.")
          return None, None, ref_text

-     # If reference text is empty, use ASR to transcribe reference audio
-     # If reference text is empty, use ASR to transcribe reference audio
-     if not ref_text.strip():
          gr.Info("Reference text is empty. Running ASR to transcribe reference audio...")
          try:
-             # Load audio data from Gradio (correct order: waveform first, then sample_rate)
              waveform, sample_rate = torchaudio.load(ref_audio)
-
-             # Convert tensor to numpy
              waveform = waveform.numpy()
-
-             # Convert to the format expected by onnx-asr
              if waveform.dtype == np.int16:
                  waveform = waveform / 2**15
              elif waveform.dtype == np.int32:
                  waveform = waveform / 2**31
-             elif waveform.dtype == np.float32 or waveform.dtype == np.float64:
-                 pass  # already in the right range
-
-             # Convert to mono if stereo
              if waveform.ndim == 2:
-                 waveform = waveform.mean(axis=0)  # average across channels (first dimension)
-             elif waveform.ndim == 1:
-                 pass  # already mono
-             else:
-                 waveform = waveform.squeeze()
-
-             # Run ASR on the audio data directly
              transcribed_text = asr_model.recognize(waveform, sample_rate=sample_rate)
              ref_text = transcribed_text
              gr.Info(f"ASR transcription: {ref_text}")
-
          except Exception as e:
-             gr.Warning(f"ASR transcription failed: {str(e)}")
              return None, None, ref_text

-     # Apply accent marks to reference text and generation text
-     processed_ref_text = accentizer.process_all(ref_text) if ref_text.strip() else ref_text
      processed_gen_text = accentizer.process_all(gen_text)

-     # Select model based on choice
-     model = loaded_models[model_choice]
-
-     # Preprocess reference audio and text
-     ref_audio, processed_ref_text = preprocess_ref_audio_text(
-         ref_audio,
-         processed_ref_text,
-         show_info=gr.Info
-     )

-     # Generate speech
-     final_wave, final_sample_rate, combined_spectrogram = infer_process(
-         ref_audio,
-         processed_ref_text,
-         processed_gen_text,
-         model,
-         vocoder,
-         cross_fade_duration=cross_fade_duration,
-         nfe_step=nfe_step,
-         speed=speed,
-         show_info=gr.Info,
-         progress=gr.Progress(),
-     )

-     # Remove silence if requested
-     if remove_silence:
-         with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
-             temp_path = f.name
-             sf.write(temp_path, final_wave, final_sample_rate)
-             remove_silence_for_generated_wav(temp_path)
-             final_wave, _ = torchaudio.load(temp_path)
-             final_wave = final_wave.squeeze().cpu().numpy()

-     # Save spectrogram
-     with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
-         spectrogram_path = tmp_spectrogram.name
-         save_spectrogram(combined_spectrogram, spectrogram_path)

-     return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text


- # --- Gradio interface ---
- with gr.Blocks(title="ESpeech-TTS") as app:
      gr.Markdown("# ESpeech-TTS")
-     gr.Markdown("Text-to-Speech synthesis system with multiple model variants")
-     gr.Markdown("💡 **Tip:** If you leave the Reference Text empty, it will be automatically transcribed using ASR and then processed with accent marks!")

-     with gr.Row():
-         model_choice = gr.Dropdown(
-             choices=list(MODEL_PATHS.keys()),
-             label="Select Model",
-             value="ESpeech-TTS-1 [RL] V2",
-             interactive=True
-         )
-
      with gr.Row():
          with gr.Column():
-             ref_audio_input = gr.Audio(
-                 label="Reference Audio",
-                 type="filepath"
-             )
-             ref_text_input = gr.Textbox(
-                 label="Reference Text",
-                 lines=2,
-                 placeholder="Enter the transcription of the reference audio... (leave empty for automatic ASR transcription)"
-             )
-
          with gr.Column():
-             gen_text_input = gr.Textbox(
-                 label="Text to Generate",
-                 lines=5,
-                 max_lines=20,
-                 placeholder="Enter the text you want to synthesize..."
-             )
-
      with gr.Row():
          with gr.Column():
              with gr.Accordion("Advanced Settings", open=False):
-                 seed_input = gr.Number(
-                     label="Seed (-1 for random)",
-                     value=-1,
-                     precision=0
-                 )
-                 remove_silence = gr.Checkbox(
-                     label="Remove Silences",
-                     value=False
-                 )
-                 speed_slider = gr.Slider(
-                     label="Speed",
-                     minimum=0.3,
-                     maximum=2.0,
-                     value=1.0,
-                     step=0.1
-                 )
-                 nfe_slider = gr.Slider(
-                     label="NFE Steps (higher = better quality, slower)",
-                     minimum=4,
-                     maximum=64,
-                     value=48,
-                     step=2
-                 )
-                 cross_fade_slider = gr.Slider(
-                     label="Cross-Fade Duration (s)",
-                     minimum=0.0,
-                     maximum=1.0,
-                     value=0.15,
-                     step=0.01
-                 )
-
      generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
-
      with gr.Row():
          audio_output = gr.Audio(label="Generated Audio", type="numpy")
          spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
-
-
      generate_btn.click(
          synthesize,
          inputs=[
@@ -265,7 +356,6 @@ with gr.Blocks(title="ESpeech-TTS") as app:
          outputs=[audio_output, spectrogram_output, ref_text_input]
      )

-
  if __name__ == "__main__":
      #app.launch(server_name="0.0.0.0", server_port=7860)
      app.launch()
 
+ #!/usr/bin/env python3
+ # app.py - ESpeech-TTS with ZeroGPU support (Hugging Face Spaces)
+ import os
  import gc
  import json
  import tempfile
+ import traceback
+ from pathlib import Path

  import gradio as gr
  import numpy as np
  import soundfile as sf
  import torch
  import torchaudio
+
+ from huggingface_hub import hf_hub_download, HFValidationError
+
+ # Your dependencies / local imports
  from ruaccent import RUAccent
  import onnx_asr


  )
  from f5_tts.model import DiT

+ # ----------------- ZeroGPU / spaces import + fallback -----------------
+ # In the ZeroGPU environment the `spaces` package is available and provides the GPU decorator.
+ # For local debugging we fall back to a no-op decorator.
+ try:
+     import spaces  # provided by Spaces/ZeroGPU environment
+     GPU_DECORATOR = spaces.GPU
+     print("spaces module available — ZeroGPU features enabled")
+ except Exception:
+     # fallback: no-op decorator so everything still works locally
+     def GPU_DECO(duration: int = None):
+         def _decorator(fn):
+             return fn
+         return _decorator
+     GPU_DECORATOR = GPU_DECO
+     print("spaces module NOT available — running in local/CPU fallback mode")

+ # Explicitly enable lazy example caching so examples are not run at startup
+ # (ZeroGPU uses lazy caching by default; we just make it explicit).
+ os.environ.setdefault("GRADIO_CACHE_MODE", "lazy")
+ os.environ.setdefault("GRADIO_CACHE_EXAMPLES", "lazy")

+ # ----------------- HF Hub / models -----------------
+ # Configure the Hub repositories and file names for your own setup
+ MODEL_REPOS = {
+     "ESpeech-TTS-1 [RL] V2": {
+         "repo_id": "ESpeech/ESpeech-TTS-1_RL-V2",
+         "filename": "espeech_tts_rlv2.pt",
+     },
+     "ESpeech-TTS-1 [RL] V1": {
+         "repo_id": "ESpeech/ESpeech-TTS-1_RL-V1",
+         "filename": "espeech_tts_rlv1.pt",
+     },
+     "ESpeech-TTS-1 [SFT] 95K": {
+         "repo_id": "ESpeech/ESpeech-TTS-1_SFT-95K",
+         "filename": "espeech_tts_95k.pt",
+     },
+     "ESpeech-TTS-1 [SFT] 265K": {
+         "repo_id": "ESpeech/ESpeech-TTS-1_SFT-256K",
+         "filename": "espeech_tts_256k.pt",
+     },
+     "ESpeech-TTS-1 PODCASTER [SFT]": {
+         "repo_id": "ESpeech/ESpeech-TTS-1_podcaster",
+         "filename": "espeech_tts_podcaster.pt",
+     },
  }

+ # where the shared vocab lives on the Hub
+ VOCAB_REPO = "ESpeech/ESpeech-TTS-1_podcaster"
+ VOCAB_FILENAME = "vocab.txt"

+ # token for private repositories (in Spaces this usually comes from Secrets)
+ HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or None
+
+ MODEL_CFG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+
+ # cache of local paths returned by hf_hub_download
+ _cached_local_paths = {}
+ loaded_models = {}  # keeps model objects in memory (keyed by the dropdown choice)
+
+ # ----------------- HF helper functions -----------------
+ def hf_download_file(repo_id: str, filename: str, token: str = None):
+     try:
+         print(f"hf_hub_download: {repo_id}/{filename}")
+         p = hf_hub_download(repo_id=repo_id, filename=filename, token=token, repo_type="model")
+         print(" ->", p)
+         return p
+     except HFValidationError as e:
+         print("HFValidationError:", e)
+         raise
+     except Exception as e:
+         print("Download error:", e)
+         raise
+
+ def get_vocab_path():
+     key = f"{VOCAB_REPO}::{VOCAB_FILENAME}"
+     if key in _cached_local_paths and Path(_cached_local_paths[key]).exists():
+         return _cached_local_paths[key]
+     p = hf_download_file(VOCAB_REPO, VOCAB_FILENAME, token=HF_TOKEN)
+     _cached_local_paths[key] = p
+     return p
+
+ def get_model_local_path(choice: str):
+     if choice not in MODEL_REPOS:
+         raise KeyError("Unknown model choice: " + repr(choice))
+     repo = MODEL_REPOS[choice]
+     key = f"{repo['repo_id']}::{repo['filename']}"
+     if key in _cached_local_paths and Path(_cached_local_paths[key]).exists():
+         return _cached_local_paths[key]
+     p = hf_download_file(repo["repo_id"], repo["filename"], token=HF_TOKEN)
+     _cached_local_paths[key] = p
+     return p

+ def load_model_if_needed(choice: str):
+     """
+     Lazy loading: if the model is already in loaded_models, return it.
+     Otherwise download the file (if needed) and call your load_model (returns a PyTorch model on CPU).
+     We do not move it to the GPU here; that happens inside the GPU-decorated function.
+     """
+     if choice in loaded_models:
+         return loaded_models[choice]
+     model_file = get_model_local_path(choice)
+     vocab_file = get_vocab_path()
+     print(f"Loading model into CPU memory: {choice} from {model_file}")
+     model = load_model(DiT, MODEL_CFG, model_file, vocab_file=vocab_file)
+     loaded_models[choice] = model
+     return model

+ # ----------------- shared resources (vocoder, RUAccent, ASR) -----------------
  print("Loading RUAccent...")
  accentizer = RUAccent()
  accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)
+ print("RUAccent loaded.")

+ print("Loading ASR (onnx) ...")
  asr_model = onnx_asr.load_model("nemo-fastconformer-ru-rnnt")
+ print("ASR ready.")

+ print("Loading vocoder (CPU) ...")
+ vocoder = load_vocoder()
+ print("Vocoder loaded.")

+ # ----------------- Main synthesis function (GPU-aware) -----------------
+ # synthesize is decorated so that the Space allocates a GPU for the call (when available).
+ # duration is how many seconds of GPU time we request (tune it to your inference).
+ @GPU_DECORATOR(duration=90)
  def synthesize(
      model_choice,
      ref_audio,

      nfe_step=32,
      speed=1.0,
  ):
+     """
+     This function runs with a GPU allocated by ZeroGPU Spaces.
+     Approach:
+     - lazily load the model (on CPU) if needed
+     - move the model and, if required, the vocoder to cuda
+     - run inference
+     - move the models back to CPU and clear the cuda cache
+     """
      if not ref_audio:
          gr.Warning("Please provide reference audio.")
          return None, None, ref_text

+     if seed is None or seed < 0 or seed > 2**31 - 1:
          seed = np.random.randint(0, 2**31 - 1)
+     torch.manual_seed(int(seed))

+     if not gen_text or not gen_text.strip():
          gr.Warning("Please enter text to generate.")
          return None, None, ref_text

+     # ASR if needed
+     if not ref_text or not ref_text.strip():
          gr.Info("Reference text is empty. Running ASR to transcribe reference audio...")
          try:
              waveform, sample_rate = torchaudio.load(ref_audio)
              waveform = waveform.numpy()
              if waveform.dtype == np.int16:
                  waveform = waveform / 2**15
              elif waveform.dtype == np.int32:
                  waveform = waveform / 2**31
              if waveform.ndim == 2:
+                 waveform = waveform.mean(axis=0)
              transcribed_text = asr_model.recognize(waveform, sample_rate=sample_rate)
              ref_text = transcribed_text
              gr.Info(f"ASR transcription: {ref_text}")
          except Exception as e:
+             gr.Warning(f"ASR failed: {e}")
              return None, None, ref_text

+     # Accent placement
+     processed_ref_text = accentizer.process_all(ref_text) if ref_text and ref_text.strip() else ref_text
      processed_gen_text = accentizer.process_all(gen_text)

+     # Lazy model loading (on CPU)
+     try:
+         model = load_model_if_needed(model_choice)
+     except Exception as e:
+         gr.Warning(f"Failed to download/load model {model_choice}: {e}")
+         return None, None, ref_text

+     # Determine the device (on ZeroGPU, CUDA should be available inside the decorated call)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     moved_to_cuda = []
+
+     try:
+         # Move the model to the GPU (if available)
+         if device.type == "cuda":
+             try:
+                 model.to(device)
+                 moved_to_cuda.append(("model", model))
+                 # if the vocoder is torch-based, move it as well
+                 try:
+                     vocoder.to(device)
+                     moved_to_cuda.append(("vocoder", vocoder))
+                 except Exception:
+                     # if the vocoder is not a torch object, that is fine
+                     pass
+             except Exception as e:
+                 print("Warning: failed to move model/vocoder to cuda:", e)
+
+         # Preprocess the reference (it expects a path/file)
+         try:
+             ref_audio_proc, processed_ref_text = preprocess_ref_audio_text(
+                 ref_audio,
+                 processed_ref_text,
+                 show_info=gr.Info
+             )
+         except Exception as e:
+             gr.Warning(f"Preprocess failed: {e}")
+             traceback.print_exc()
+             return None, None, ref_text
+
+         # Inference (infer_process is assumed to also work correctly on the GPU)
+         try:
+             final_wave, final_sample_rate, combined_spectrogram = infer_process(
+                 ref_audio_proc,
+                 processed_ref_text,
+                 processed_gen_text,
+                 model,
+                 vocoder,
+                 cross_fade_duration=cross_fade_duration,
+                 nfe_step=nfe_step,
+                 speed=speed,
+                 show_info=gr.Info,
+                 progress=gr.Progress(),
+             )
+         except Exception as e:
+             gr.Warning(f"Infer failed: {e}")
+             traceback.print_exc()
+             return None, None, ref_text

+         # Remove silences (on CPU)
+         if remove_silence:
+             try:
+                 with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
+                     temp_path = f.name
+                     sf.write(temp_path, final_wave, final_sample_rate)
+                     remove_silence_for_generated_wav(temp_path)
+                     final_wave_tensor, _ = torchaudio.load(temp_path)
+                     final_wave = final_wave_tensor.squeeze().cpu().numpy()
+             except Exception as e:
+                 print("Remove silence failed:", e)

+         # Save the spectrogram
+         try:
+             with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
+                 spectrogram_path = tmp_spectrogram.name
+                 save_spectrogram(combined_spectrogram, spectrogram_path)
+         except Exception as e:
+             print("Save spectrogram failed:", e)
+             spectrogram_path = None

+         return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text

+     finally:
+         # Move everything back to the CPU and free GPU memory
+         if device.type == "cuda":
+             try:
+                 for name, obj in moved_to_cuda:
+                     try:
+                         obj.to("cpu")
+                     except Exception:
+                         pass
+                 torch.cuda.empty_cache()
+                 # a bit of garbage collection
+                 gc.collect()
+             except Exception as e:
+                 print("Warning during cuda cleanup:", e)

+ # ----------------- Gradio UI (same as before) -----------------
+ with gr.Blocks(title="ESpeech-TTS (ZeroGPU-ready)") as app:
      gr.Markdown("# ESpeech-TTS")
+     gr.Markdown("Text-to-Speech synthesis system with multiple model variants (models auto-download from HF Hub).")
+     gr.Markdown("💡 Tip: Leave Reference Text empty to transcribe with ASR. On ZeroGPU the heavy work runs on GPU only during the synthesize call.")
+
+     model_choice = gr.Dropdown(
+         choices=list(MODEL_REPOS.keys()),
+         label="Select Model",
+         value=list(MODEL_REPOS.keys())[0],
+         interactive=True
+     )

      with gr.Row():
          with gr.Column():
+             ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
+             ref_text_input = gr.Textbox(label="Reference Text", lines=2, placeholder="leave empty → ASR")
          with gr.Column():
+             gen_text_input = gr.Textbox(label="Text to Generate", lines=5, max_lines=20)
+
      with gr.Row():
          with gr.Column():
              with gr.Accordion("Advanced Settings", open=False):
+                 seed_input = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
+                 remove_silence = gr.Checkbox(label="Remove Silences", value=False)
+                 speed_slider = gr.Slider(label="Speed", minimum=0.3, maximum=2.0, value=1.0, step=0.1)
+                 nfe_slider = gr.Slider(label="NFE Steps", minimum=4, maximum=64, value=48, step=2)
+                 cross_fade_slider = gr.Slider(label="Cross-Fade Duration (s)", minimum=0.0, maximum=1.0, value=0.15, step=0.01)
+
      generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
+
      with gr.Row():
          audio_output = gr.Audio(label="Generated Audio", type="numpy")
          spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
+
      generate_btn.click(
          synthesize,
          inputs=[

          outputs=[audio_output, spectrogram_output, ref_text_input]
      )

  if __name__ == "__main__":
      #app.launch(server_name="0.0.0.0", server_port=7860)
      app.launch()
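
A minimal, hypothetical smoke test (not part of this commit) for the Hub configuration introduced above: it resolves each configured checkpoint plus the shared vocab file with hf_hub_download before the Space is launched. It assumes network access and, for private repositories, a HUGGINGFACE_TOKEN or HUGGINGFACE_HUB_TOKEN in the environment; the script name and the flat FILES list are illustrative, not taken from the repository.

# smoke_test_downloads.py (hypothetical helper, not part of app.py in this commit)
import os
from huggingface_hub import hf_hub_download

# Repo ids and filenames copied from the MODEL_REPOS / VOCAB_* configuration above,
# flattened into one list so this script is self-contained.
FILES = [
    ("ESpeech/ESpeech-TTS-1_RL-V2", "espeech_tts_rlv2.pt"),
    ("ESpeech/ESpeech-TTS-1_RL-V1", "espeech_tts_rlv1.pt"),
    ("ESpeech/ESpeech-TTS-1_SFT-95K", "espeech_tts_95k.pt"),
    ("ESpeech/ESpeech-TTS-1_SFT-256K", "espeech_tts_256k.pt"),
    ("ESpeech/ESpeech-TTS-1_podcaster", "espeech_tts_podcaster.pt"),
    ("ESpeech/ESpeech-TTS-1_podcaster", "vocab.txt"),
]

token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")

for repo_id, filename in FILES:
    # hf_hub_download caches the file locally and returns the cached path
    path = hf_hub_download(repo_id=repo_id, filename=filename, token=token)
    print(f"{repo_id}/{filename} -> {path}")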