NeuralFalcon committed · verified · Commit 49ef1cb · 1 Parent(s): bf5ea2a

Update app.py

Files changed (1): app.py (+308 −12)

app.py CHANGED
@@ -2,14 +2,15 @@ from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
 sys.path.append('.')
+import os
+os.system("python download_model.py")
 import torch
 import gc
 print("Loading model...")
-import os
-os.system("python download_model.py")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
-MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
+# MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
+MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
 print("Model loaded successfully.")
 
 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
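This hunk moves the `download_model.py` step before the model import and switches the default checkpoint to the half-precision file, so the load assumes the fp16 download succeeded. A minimal defensive sketch — the existence check and the fp32 fallback are my own assumption, not part of this commit:

```python
import os
import torch
from KOKORO.models import build_model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hypothetical guard: prefer the fp16 checkpoint, fall back to fp32 if the
# download step did not produce it (both paths are taken from this diff).
fp16_path = './KOKORO/fp16/kokoro-v0_19-half.pth'
fp32_path = './KOKORO/kokoro-v0_19.pth'
MODEL = build_model(fp16_path if os.path.exists(fp16_path) else fp32_path, device)
```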
@@ -42,7 +43,8 @@ def update_model(model_name):
     return f"Model updated to {model_name}"
 
 
-def text_to_speech(text, model_name, voice_name, speed, trim, pad_between_segments, remove_silence, minimum_silence):
+
+def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", speed=1.0, trim=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
@@ -218,17 +220,311 @@ with gr.Blocks() as demo2:
         outputs=[audio]
     )
 
-display_text = " \n".join(voice_list)
+
+
+
+import shutil
+import os
+
+# Ensure the output directory exists
+output_dir = "./temp_audio"
+os.makedirs(output_dir, exist_ok=True)
+
+
+
+
+
+
+
+
+
+#@title Generate Audio File From Subtitle
+# from tqdm.notebook import tqdm
+from tqdm import tqdm
+import subprocess
+import json
+import pysrt
+import os
+from pydub import AudioSegment
+import shutil
+import uuid
+import re
+import time
+
+# os.chdir(install_path)
+
+def your_tts(text,audio_path,actual_duration,speed=1.0):
+    global srt_voice_name
+    model_name="kokoro-v0_19.pth"
+    tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed)
+    print(tts_path)
+    tts_audio = AudioSegment.from_file(tts_path)
+    tts_duration = len(tts_audio)
+    if tts_duration > actual_duration:
+        speedup_factor = tts_duration / actual_duration
+        tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor)
+        print(tts_path)
+    shutil.copy(tts_path,audio_path)
+
+
+base_path="."
+import datetime
+def get_current_time():
+    # Return current time as a string in the format HH_MM_AM/PM
+    return datetime.datetime.now().strftime("%I_%M_%p")
+
+def get_subtitle_Dub_path(srt_file_path,Language="en"):
+    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+    if not os.path.exists(f"{base_path}/TTS_DUB"):
+        os.mkdir(f"{base_path}/TTS_DUB")
+    random_string = str(uuid.uuid4())[:6]
+    new_path=f"{base_path}/TTS_DUB/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
+    return new_path
+
+
+
+
+
+
+
+
+def clean_srt(input_path):
+    file_name = os.path.basename(input_path)
+    output_folder = f"{base_path}/save_srt"
+    if not os.path.exists(output_folder):
+        os.mkdir(output_folder)
+    output_path = f"{output_folder}/{file_name}"
+
+    def clean_srt_line(text):
+        bad_list = ["[", "]", "♫", "\n"]
+        for i in bad_list:
+            text = text.replace(i, "")
+        return text.strip()
+
+    # Load the subtitle file
+    subs = pysrt.open(input_path)
+
+    # Iterate through each subtitle and print its details
+    with open(output_path, "w", encoding='utf-8') as file:
+        for sub in subs:
+            file.write(f"{sub.index}\n")
+            file.write(f"{sub.start} --> {sub.end}\n")
+            file.write(f"{clean_srt_line(sub.text)}\n")
+            file.write("\n")
+        file.close()
+    # print(f"Clean SRT saved at: {output_path}")
+    return output_path
+# Example usage
+
+
+
+
+
+
+class SRTDubbing:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def text_to_speech_srt(text, audio_path, language, actual_duration):
+        tts_filename = "./cache/temp.wav"
+        your_tts(text,tts_filename,actual_duration,speed=1.0)
+        # Check the duration of the generated TTS audio
+        tts_audio = AudioSegment.from_file(tts_filename)
+        tts_duration = len(tts_audio)
+
+        if actual_duration == 0:
+            # If actual duration is zero, use the original TTS audio without modifications
+            shutil.move(tts_filename, audio_path)
+            return
+        # If TTS audio duration is longer than actual duration, speed up the audio
+        if tts_duration > actual_duration:
+            speedup_factor = tts_duration / actual_duration
+            speedup_filename = "./cache/speedup_temp.wav"
+            # Use ffmpeg to change audio speed
+            subprocess.run([
+                "ffmpeg",
+                "-i", tts_filename,
+                "-filter:a", f"atempo={speedup_factor}",
+                speedup_filename,
+                "-y"
+            ], check=True)
+
+            # Replace the original TTS audio with the sped-up version
+            shutil.move(speedup_filename, audio_path)
+        elif tts_duration < actual_duration:
+            # If TTS audio duration is less than actual duration, add silence to match the duration
+            silence_gap = actual_duration - tts_duration
+            silence = AudioSegment.silent(duration=int(silence_gap))
+            new_audio = tts_audio + silence
+
+            # Save the new audio with added silence
+            new_audio.export(audio_path, format="wav")
+        else:
+            # If TTS audio duration is equal to actual duration, use the original TTS audio
+            shutil.move(tts_filename, audio_path)
+
+    @staticmethod
+    def make_silence(pause_time, pause_save_path):
+        silence = AudioSegment.silent(duration=pause_time)
+        silence.export(pause_save_path, format="wav")
+        return pause_save_path
+
+    @staticmethod
+    def create_folder_for_srt(srt_file_path):
+        srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+        random_uuid = str(uuid.uuid4())[:4]
+        dummy_folder_path = f"{base_path}/dummy"
+        if not os.path.exists(dummy_folder_path):
+            os.makedirs(dummy_folder_path)
+        folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
+        os.makedirs(folder_path, exist_ok=True)
+        return folder_path
+
+    @staticmethod
+    def concatenate_audio_files(audio_paths, output_path):
+        concatenated_audio = AudioSegment.silent(duration=0)
+        for audio_path in audio_paths:
+            audio_segment = AudioSegment.from_file(audio_path)
+            concatenated_audio += audio_segment
+        concatenated_audio.export(output_path, format="wav")
+
+    def srt_to_dub(self, srt_file_path,dub_save_path,language='en'):
+        result = self.read_srt_file(srt_file_path)
+        new_folder_path = self.create_folder_for_srt(srt_file_path)
+        join_path = []
+        for i in tqdm(result):
+        # for i in result:
+            text = i['text']
+            actual_duration = i['end_time'] - i['start_time']
+            pause_time = i['pause_time']
+            slient_path = f"{new_folder_path}/{i['previous_pause']}"
+            self.make_silence(pause_time, slient_path)
+            join_path.append(slient_path)
+            tts_path = f"{new_folder_path}/{i['audio_name']}"
+            self.text_to_speech_srt(text, tts_path, language, actual_duration)
+            join_path.append(tts_path)
+        self.concatenate_audio_files(join_path, dub_save_path)
+
+    @staticmethod
+    def convert_to_millisecond(time_str):
+        if isinstance(time_str, str):
+            hours, minutes, second_millisecond = time_str.split(':')
+            seconds, milliseconds = second_millisecond.split(",")
+
+            total_milliseconds = (
+                int(hours) * 3600000 +
+                int(minutes) * 60000 +
+                int(seconds) * 1000 +
+                int(milliseconds)
+            )
+
+            return total_milliseconds
+    @staticmethod
+    def read_srt_file(file_path):
+        entries = []
+        default_start = 0
+        previous_end_time = default_start
+        entry_number = 1
+        audio_name_template = "{}.wav"
+        previous_pause_template = "{}_before_pause.wav"
+
+        with open(file_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            # print(lines)
+            for i in range(0, len(lines), 4):
+                time_info = re.findall(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)', lines[i + 1])
+                start_time = SRTDubbing.convert_to_millisecond(time_info[0][0])
+                end_time = SRTDubbing.convert_to_millisecond(time_info[0][1])
+
+                current_entry = {
+                    'entry_number': entry_number,
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'text': lines[i + 2].strip(),
+                    'pause_time': start_time - previous_end_time if entry_number != 1 else start_time - default_start,
+                    'audio_name': audio_name_template.format(entry_number),
+                    'previous_pause': previous_pause_template.format(entry_number),
+                }
+
+                entries.append(current_entry)
+                previous_end_time = end_time
+                entry_number += 1
+
+        with open("entries.json", "w") as file:
+            json.dump(entries, file, indent=4)
+        return entries
+srt_voice_name="am_adam"
+def srt_process(srt_file_path,voice_name,dest_language="en"):
+    global srt_voice_name
+    srt_voice_name=voice_name
+    srt_dubbing = SRTDubbing()
+    dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
+    srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
+    return dub_save_path
+
+#
+# srt_file_path="./long.srt"
+# dub_audio_path=srt_process(srt_file_path)
+# print(f"Audio file saved at: {dub_audio_path}")
+
+
 
 with gr.Blocks() as demo3:
+
+    gr.Markdown(
+        """
+        # Generate Audio File From Subtitle [Single Speaker Only]
+
+        To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
+
+        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            srt_file = gr.File(label='Upload .srt Subtitle File Only')
+            with gr.Row():
+                voice = gr.Dropdown(
+                    voice_list,
+                    value='af',
+                    allow_custom_value=False,
+                    label='Voice',
+                )
+            with gr.Row():
+                generate_btn_ = gr.Button('Generate', variant='primary')
+
+        with gr.Column():
+            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+            with gr.Accordion('Enable Autoplay', open=False):
+                autoplay = gr.Checkbox(value=True, label='Autoplay')
+                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+    # srt_file.submit(
+    #     srt_process,
+    #     inputs=[srt_file, voice],
+    #     outputs=[audio]
+    # )
+    generate_btn_.click(
+        srt_process,
+        inputs=[srt_file,voice],
+        outputs=[audio]
+    )
+
+
+display_text = " \n".join(voice_list)
+
+with gr.Blocks() as demo4:
     gr.Markdown(f"# Voice Names \n{display_text}")
 
-import click
-@click.command()
-@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
-@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
-def main(debug, share):
-    demo = gr.TabbedInterface([demo1, demo2,demo3], ["Batched TTS", "Multiple Speech-Type Generation","Available Voice Names"],title="Kokoro TTS")
+
+# import click
+# @click.command()
+# @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+# @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug=False, share=False):
+    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS")
 
     demo.queue().launch(debug=debug, share=share)
 #Run on local network
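One caveat worth flagging on the speed-up path in `text_to_speech_srt`: in many ffmpeg builds the `atempo` filter only accepts factors in the 0.5–2.0 range per instance, so a TTS clip more than twice as long as its subtitle slot would make the `subprocess.run` call fail. A hedged sketch of the usual workaround, chaining `atempo` stages — the helper name is mine, not part of this commit:

```python
import subprocess

def build_atempo_chain(factor: float) -> str:
    """Split a tempo factor into chained atempo stages, each within [0.5, 2.0]."""
    stages = []
    while factor > 2.0:
        stages.append("atempo=2.0")
        factor /= 2.0
    while factor < 0.5:
        stages.append("atempo=0.5")
        factor *= 2.0
    stages.append(f"atempo={factor}")
    return ",".join(stages)

# e.g. a 4.5x speed-up becomes "atempo=2.0,atempo=2.0,atempo=1.125"
subprocess.run(
    ["ffmpeg", "-i", "./cache/temp.wav",
     "-filter:a", build_atempo_chain(4.5),
     "./cache/speedup_temp.wav", "-y"],
    check=True,
)
```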
 
@@ -261,4 +557,4 @@ if __name__ == "__main__":
 
     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
-    # print(f"Saved at {save_at}")
+    # print(f"Saved at {save_at}")