NeuralFalcon committed
Commit 86e1cdd · verified · 1 Parent(s): b182ef7

Delete srt_dubbing.py

Files changed (1):
  srt_dubbing.py  +0 -557
srt_dubbing.py DELETED

from KOKORO.models import build_model
from KOKORO.utils import tts, tts_file_name, podcast
import sys
sys.path.append('.')
import torch
import gc

print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")

def tts_maker(text, voice_name="af_bella", speed=0.8, trim=0, pad_between=0,
              save_path="temp.wav", remove_silence=False, minimum_silence=50):
    # Sanitize the save path: strip newline characters that would break file creation
    save_path = save_path.replace('\n', '').replace('\r', '')
    global MODEL
    audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
                     pad_between_segments=pad_between, output_file=save_path,
                     remove_silence=remove_silence, minimum_silence=minimum_silence)
    return audio_path

model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]

def update_model(model_name):
    """
    Updates the TTS model only if the specified model is not already loaded.
    """
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"  # No need to reload
    model_path = f"./KOKORO/{model_name}"  # Default model path
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"  # The fp16 checkpoint lives in a subfolder
    del MODEL  # Clean up the existing model
    gc.collect()
    torch.cuda.empty_cache()  # Ensure GPU memory is cleared
    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"
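
# Example (sketch): switching checkpoints at runtime; the second call is a
# no-op because the requested model is already loaded:
# print(update_model("kokoro-v0_19-half.pth"))  # loads ./KOKORO/fp16/kokoro-v0_19-half.pth
# print(update_model("kokoro-v0_19-half.pth"))  # "Model already set to ..."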

def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0,
                   trim=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20):
    """
    Converts text to speech with the given parameters, reloading the model only
    when necessary.
    """
    update_model(model_name)  # Load the requested model only if it is not already active
    if not minimum_silence:
        minimum_silence = 0.05
    keep_silence = int(minimum_silence * 1000)  # Seconds -> milliseconds
    save_at = tts_file_name(text)
    audio_path = tts_maker(
        text,
        voice_name,
        speed,
        trim,
        pad_between_segments,
        save_at,
        remove_silence,
        keep_silence
    )
    return audio_path
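
# Example (sketch): calling text_to_speech directly, assuming the default
# checkpoint and an "af" voice file are present under ./KOKORO:
# wav_path = text_to_speech("Hello world!", model_name="kokoro-v0_19.pth",
#                           voice_name="af", speed=1.0)
# print(f"Audio saved at: {wav_path}")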

import gradio as gr
import os

# Previously hardcoded voice list, kept for reference; the live list is
# discovered from disk below:
# voice_list = [
#     'af',  # Default voice is a 50-50 mix of af_bella & af_sarah
#     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
#     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
# ]

# Get the list of voice names without file extensions
voice_list = [
    os.path.splitext(filename)[0]
    for filename in os.listdir("./KOKORO/voices")
    if filename.endswith('.pt')
]

# Sort the list by name length
voice_list = sorted(voice_list, key=len)

def toggle_autoplay(autoplay):
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)

with gr.Blocks() as demo1:
    gr.Markdown("# Batched TTS")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label='Enter Text',
                lines=3,
                placeholder="Type your text here..."
            )
            with gr.Row():
                voice = gr.Dropdown(
                    voice_list,
                    value='af',
                    allow_custom_value=False,
                    label='Voice',
                    info='Starred voices are more stable'
                )
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                minimum_silence = gr.Number(
                    label="Keep Silence Up To (in seconds)",
                    value=0.05
                )
                speed = gr.Slider(
                    minimum=0.25, maximum=2, value=1, step=0.1,
                    label='⚡️ Speed', info='Adjust the speaking speed'
                )
                trim = gr.Slider(
                    minimum=0, maximum=1, value=0, step=0.1,
                    label='🔪 Trim', info='How much to cut from both ends of each segment'
                )
                pad_between = gr.Slider(
                    minimum=0, maximum=2, value=0, step=0.1,
                    label='🔇 Pad Between', info='Silent duration between segments (for large text)'
                )

        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    text.submit(
        text_to_speech,
        inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
        outputs=[audio]
    )
    generate_btn.click(
        text_to_speech,
        inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
        outputs=[audio]
    )

def podcast_maker(text, remove_silence=False, minimum_silence=0.05, model_name="kokoro-v0_19.pth"):
    # minimum_silence is in seconds; it is converted to milliseconds below
    global MODEL, device
    update_model(model_name)
    if not minimum_silence:
        minimum_silence = 0.05
    keep_silence = int(minimum_silence * 1000)
    podcast_save_at = podcast(MODEL, device, text, remove_silence=remove_silence, minimum_silence=keep_silence)
    return podcast_save_at

dummy_example = """{af} Hello, I'd like to order a sandwich please.
{af_sky} What do you mean you're out of bread?
{af_bella} I really wanted a sandwich though...
{af_nicole} You know what, darn you and your little shop!
{bm_george} I'll just go back home and cry now.
{am_adam} Why me?"""

with gr.Blocks() as demo2:
    gr.Markdown(
        """
        # Multiple Speech-Type Generation

        This section lets you generate speech with multiple speech types or
        multiple speakers' voices. Enter your text in the format shown below,
        and the system will synthesize each line with the requested voice. If
        unspecified, the "af" voice is used.

        Format:
        {voice_name} your text here
        """
    )
    with gr.Row():
        gr.Markdown(
            """
            **Example Input:**
            {af} Hello, I'd like to order a sandwich please.
            {af_sky} What do you mean you're out of bread?
            {af_bella} I really wanted a sandwich though...
            {af_nicole} You know what, darn you and your little shop!
            {bm_george} I'll just go back home and cry now.
            {am_adam} Why me?!
            """
        )
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label='Enter Text',
                lines=7,
                placeholder=dummy_example
            )
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                minimum_silence = gr.Number(
                    label="Keep Silence Up To (in seconds)",
                    value=0.20
                )
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    text.submit(
        podcast_maker,
        inputs=[text, remove_silence, minimum_silence],
        outputs=[audio]
    )
    generate_btn.click(
        podcast_maker,
        inputs=[text, remove_silence, minimum_silence],
        outputs=[audio]
    )

import shutil

# Ensure the output directory exists
output_dir = "./temp_audio"
os.makedirs(output_dir, exist_ok=True)

# Generate an audio file from a subtitle (.srt) file
from tqdm import tqdm  # use tqdm.notebook in a notebook environment
import subprocess
import json
import pysrt
from pydub import AudioSegment
import uuid
import re
import time

def your_tts(text, audio_path, actual_duration, speed=1.0):
    global srt_voice_name
    model_name = "kokoro-v0_19.pth"
    tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speed)
    print(tts_path)
    tts_audio = AudioSegment.from_file(tts_path)
    tts_duration = len(tts_audio)  # milliseconds
    if actual_duration > 0 and tts_duration > actual_duration:
        # Too long for the subtitle's time slot: regenerate at a higher speed
        speedup_factor = tts_duration / actual_duration
        tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speedup_factor)
        print(tts_path)
    shutil.copy(tts_path, audio_path)

base_path = "."
import datetime

def get_current_time():
    # Return the current time as a string in the format HH_MM_AM/PM
    return datetime.datetime.now().strftime("%I_%M_%p")

def get_subtitle_Dub_path(srt_file_path, Language="en"):
    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
    if not os.path.exists(f"{base_path}/TTS_DUB"):
        os.mkdir(f"{base_path}/TTS_DUB")
    random_string = str(uuid.uuid4())[:6]
    new_path = f"{base_path}/TTS_DUB/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
    return new_path

def clean_srt(input_path):
    file_name = os.path.basename(input_path)
    output_folder = f"{base_path}/save_srt"
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    output_path = f"{output_folder}/{file_name}"

    def clean_srt_line(text):
        # Strip bracketed tags, music symbols, and line breaks so each cue's
        # text fits on a single line
        bad_list = ["[", "]", "♫", "\n"]
        for i in bad_list:
            text = text.replace(i, "")
        return text.strip()

    # Load the subtitle file
    subs = pysrt.open(input_path)

    # Rewrite each cue as exactly four lines: index, timing, text, blank
    with open(output_path, "w", encoding='utf-8') as file:
        for sub in subs:
            file.write(f"{sub.index}\n")
            file.write(f"{sub.start} --> {sub.end}\n")
            file.write(f"{clean_srt_line(sub.text)}\n")
            file.write("\n")
    return output_path
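
# Example: for a cue "Hello, I'd like to order a sandwich please." starting at
# 00:00:01,000 and ending at 00:00:03,200 (values illustrative), the cleaned
# file contains:
#   1
#   00:00:01,000 --> 00:00:03,200
#   Hello, I'd like to order a sandwich please.
#   (blank line)
# read_srt_file below relies on this fixed four-line layout.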

class SRTDubbing:
    def __init__(self):
        pass

    @staticmethod
    def text_to_speech_srt(text, audio_path, language, actual_duration):
        os.makedirs("./cache", exist_ok=True)  # The temp files below live here
        tts_filename = "./cache/temp.wav"
        your_tts(text, tts_filename, actual_duration, speed=1.0)
        # Check the duration of the generated TTS audio
        tts_audio = AudioSegment.from_file(tts_filename)
        tts_duration = len(tts_audio)

        if actual_duration == 0:
            # If the target duration is zero, use the TTS audio unmodified
            shutil.move(tts_filename, audio_path)
            return
        if tts_duration > actual_duration:
            # TTS audio is too long: speed it up with ffmpeg's atempo filter
            speedup_factor = tts_duration / actual_duration
            speedup_filename = "./cache/speedup_temp.wav"
            subprocess.run([
                "ffmpeg",
                "-i", tts_filename,
                "-filter:a", f"atempo={speedup_factor}",
                speedup_filename,
                "-y"
            ], check=True)
            # Replace the original TTS audio with the sped-up version
            shutil.move(speedup_filename, audio_path)
        elif tts_duration < actual_duration:
            # TTS audio is too short: pad with silence to match the duration
            silence_gap = actual_duration - tts_duration
            silence = AudioSegment.silent(duration=int(silence_gap))
            new_audio = tts_audio + silence
            new_audio.export(audio_path, format="wav")
        else:
            # Durations already match: use the TTS audio as-is
            shutil.move(tts_filename, audio_path)
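
    # Note (assumption): ffmpeg's atempo filter historically accepted factors
    # only in [0.5, 2.0]; on such builds a larger factor can be chained,
    # e.g. "-filter:a atempo=2.0,atempo=1.5" for a 3x speed-up.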

    @staticmethod
    def make_silence(pause_time, pause_save_path):
        silence = AudioSegment.silent(duration=pause_time)
        silence.export(pause_save_path, format="wav")
        return pause_save_path

    @staticmethod
    def create_folder_for_srt(srt_file_path):
        srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
        random_uuid = str(uuid.uuid4())[:4]
        dummy_folder_path = f"{base_path}/dummy"
        if not os.path.exists(dummy_folder_path):
            os.makedirs(dummy_folder_path)
        folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
        os.makedirs(folder_path, exist_ok=True)
        return folder_path

    @staticmethod
    def concatenate_audio_files(audio_paths, output_path):
        concatenated_audio = AudioSegment.silent(duration=0)
        for audio_path in audio_paths:
            audio_segment = AudioSegment.from_file(audio_path)
            concatenated_audio += audio_segment
        concatenated_audio.export(output_path, format="wav")

    def srt_to_dub(self, srt_file_path, dub_save_path, language='en'):
        result = self.read_srt_file(srt_file_path)
        new_folder_path = self.create_folder_for_srt(srt_file_path)
        join_path = []
        for i in tqdm(result):
            text = i['text']
            actual_duration = i['end_time'] - i['start_time']
            pause_time = i['pause_time']
            # Render the silence that precedes this cue, then the cue itself
            silent_path = f"{new_folder_path}/{i['previous_pause']}"
            self.make_silence(pause_time, silent_path)
            join_path.append(silent_path)
            tts_path = f"{new_folder_path}/{i['audio_name']}"
            self.text_to_speech_srt(text, tts_path, language, actual_duration)
            join_path.append(tts_path)
        self.concatenate_audio_files(join_path, dub_save_path)

    @staticmethod
    def convert_to_millisecond(time_str):
        # Convert an SRT timestamp "HH:MM:SS,mmm" to total milliseconds
        if isinstance(time_str, str):
            hours, minutes, second_millisecond = time_str.split(':')
            seconds, milliseconds = second_millisecond.split(",")
            total_milliseconds = (
                int(hours) * 3600000 +
                int(minutes) * 60000 +
                int(seconds) * 1000 +
                int(milliseconds)
            )
            return total_milliseconds
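
    # Worked example: "00:01:02,500" -> 0*3600000 + 1*60000 + 2*1000 + 500 = 62500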

    @staticmethod
    def read_srt_file(file_path):
        entries = []
        default_start = 0
        previous_end_time = default_start
        entry_number = 1
        audio_name_template = "{}.wav"
        previous_pause_template = "{}_before_pause.wav"

        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            # Each cue occupies exactly four lines (index, timing, text, blank),
            # as produced by clean_srt above
            for i in range(0, len(lines), 4):
                time_info = re.findall(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)', lines[i + 1])
                start_time = SRTDubbing.convert_to_millisecond(time_info[0][0])
                end_time = SRTDubbing.convert_to_millisecond(time_info[0][1])

                current_entry = {
                    'entry_number': entry_number,
                    'start_time': start_time,
                    'end_time': end_time,
                    'text': lines[i + 2].strip(),
                    'pause_time': start_time - previous_end_time if entry_number != 1 else start_time - default_start,
                    'audio_name': audio_name_template.format(entry_number),
                    'previous_pause': previous_pause_template.format(entry_number),
                }

                entries.append(current_entry)
                previous_end_time = end_time
                entry_number += 1

        with open("entries.json", "w") as file:
            json.dump(entries, file, indent=4)
        return entries
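
# For reference, each record in entries.json looks like this (timing values
# illustrative):
# {
#     "entry_number": 1,
#     "start_time": 1000,
#     "end_time": 3200,
#     "text": "Hello, I'd like to order a sandwich please.",
#     "pause_time": 1000,
#     "audio_name": "1.wav",
#     "previous_pause": "1_before_pause.wav"
# }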

srt_voice_name = "am_adam"

def srt_process(srt_file_path, voice_name, dest_language="en"):
    # Note: read_srt_file assumes four-line cues; run clean_srt first for
    # files with multi-line cue text
    global srt_voice_name
    srt_voice_name = voice_name
    srt_dubbing = SRTDubbing()
    dub_save_path = get_subtitle_Dub_path(srt_file_path, dest_language)
    srt_dubbing.srt_to_dub(srt_file_path, dub_save_path, dest_language)
    return dub_save_path

# Example usage:
# srt_file_path = "./long.srt"
# dub_audio_path = srt_process(srt_file_path, voice_name="am_adam")
# print(f"Audio file saved at: {dub_audio_path}")

with gr.Blocks() as demo3:
    gr.Markdown(
        """
        # Generate Audio File From Subtitle [Single Speaker Only]

        To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
        """
    )
    with gr.Row():
        with gr.Column():
            srt_file = gr.File(label='Upload .srt Subtitle File Only')
            with gr.Row():
                voice = gr.Dropdown(
                    voice_list,
                    value='af',
                    allow_custom_value=False,
                    label='Voice',
                )
            with gr.Row():
                generate_btn_ = gr.Button('Generate', variant='primary')

        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # srt_file.submit(
    #     srt_process,
    #     inputs=[srt_file, voice],
    #     outputs=[audio]
    # )
    generate_btn_.click(
        srt_process,
        inputs=[srt_file, voice],
        outputs=[audio]
    )

display_text = " \n".join(voice_list)

with gr.Blocks() as demo4:
    gr.Markdown(f"# Voice Names \n{display_text}")

import click

@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(debug, share):
    demo = gr.TabbedInterface(
        [demo1, demo2, demo3, demo4],
        ["Batched TTS", "Multiple Speech-Type Generation", "SRT Dubbing", "Available Voice Names"],
        title="Kokoro TTS"
    )
    demo.queue().launch(debug=debug, share=share)
    # To run on the local network instead:
    # laptop_ip = "192.168.0.30"
    # port = 8080
    # demo.queue().launch(debug=debug, share=share, server_name=laptop_ip, server_port=port)

if __name__ == "__main__":
    main()

## For client-side use:
# from gradio_client import Client
# import shutil
# import os
#
# os.makedirs("temp_audio", exist_ok=True)
# client = Client("http://127.0.0.1:7860/")
# result = client.predict(
#     text="Hello!!",
#     model_name="kokoro-v0_19.pth",
#     voice_name="af_bella",
#     speed=1,
#     trim=0,
#     pad_between_segments=0,
#     remove_silence=False,
#     minimum_silence=0.05,
#     api_name="/text_to_speech"
# )
#
# save_at = f"./temp_audio/{os.path.basename(result)}"
# shutil.move(result, save_at)
# print(f"Saved at {save_at}")