Ktk

Sleeping

App Files Files Community

NeuralFalcon committed on Jan 20

Commit

49ef1cb

verified ·

1 Parent(s): bf5ea2a

Update app.py

Browse files

Files changed (1) hide show

app.py +308 -12

app.py CHANGED Viewed

@@ -2,14 +2,15 @@ from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
 sys.path.append('.')
 import torch
 import gc
 print("Loading model...")
-import os
-os.system("python download_model.py")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
-MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
 print("Model loaded successfully.")
 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
@@ -42,7 +43,8 @@ def update_model(model_name):
     return f"Model updated to {model_name}"
-def text_to_speech(text, model_name, voice_name, speed, trim, pad_between_segments, remove_silence, minimum_silence):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
@@ -218,17 +220,311 @@ with gr.Blocks() as demo2:
         outputs=[audio]
     )
-display_text = "  \n".join(voice_list)
 with gr.Blocks() as demo3:
     gr.Markdown(f"# Voice Names \n{display_text}")
-import click
-@click.command()
-@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
-@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
-def main(debug, share):
-    demo = gr.TabbedInterface([demo1, demo2,demo3], ["Batched TTS", "Multiple Speech-Type Generation","Available Voice Names"],title="Kokoro TTS")
     demo.queue().launch(debug=debug, share=share)
     #Run on local network
@@ -261,4 +557,4 @@ if __name__ == "__main__":
 # save_at=f"./temp_audio/{os.path.basename(result)}"
 # shutil.move(result, save_at)
-# print(f"Saved at {save_at}")

 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
 sys.path.append('.')
+import os
+os.system("python download_model.py")
 import torch
 import gc
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
+# MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
+MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
 print("Model loaded successfully.")
 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
     return f"Model updated to {model_name}"
+def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", speed=1.0, trim=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
         outputs=[audio]
     )
+import shutil
+import os
+# Ensure the output directory exists
+output_dir = "./temp_audio"
+os.makedirs(output_dir, exist_ok=True)
+#@title Generate Audio File From Subtitle
+# from tqdm.notebook import tqdm
+from tqdm import tqdm
+import subprocess
+import json
+import pysrt
+import os
+from pydub import AudioSegment
+import shutil
+import uuid
+import re
+import time
+# os.chdir(install_path)
+def your_tts(text,audio_path,actual_duration,speed=1.0):
+  global srt_voice_name
+  model_name="kokoro-v0_19.pth"
+  tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed)
+  print(tts_path)
+  tts_audio = AudioSegment.from_file(tts_path)
+  tts_duration = len(tts_audio)
+  if tts_duration > actual_duration:
+    speedup_factor = tts_duration / actual_duration
+    tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor)
+  print(tts_path)
+  shutil.copy(tts_path,audio_path)
+base_path="."
+import datetime
+def get_current_time():
+    # Return current time as a string in the format HH_MM_AM/PM
+    return datetime.datetime.now().strftime("%I_%M_%p")
+def get_subtitle_Dub_path(srt_file_path,Language="en"):
+  file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+  if not os.path.exists(f"{base_path}/TTS_DUB"):
+    os.mkdir(f"{base_path}/TTS_DUB")
+  random_string = str(uuid.uuid4())[:6]
+  new_path=f"{base_path}/TTS_DUB/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
+  return new_path
+def clean_srt(input_path):
+    file_name = os.path.basename(input_path)
+    output_folder = f"{base_path}/save_srt"
+    if not os.path.exists(output_folder):
+        os.mkdir(output_folder)
+    output_path = f"{output_folder}/{file_name}"
+    def clean_srt_line(text):
+        bad_list = ["[", "]", "♫", "\n"]
+        for i in bad_list:
+            text = text.replace(i, "")
+        return text.strip()
+    # Load the subtitle file
+    subs = pysrt.open(input_path)
+    # Iterate through each subtitle and print its details
+    with open(output_path, "w", encoding='utf-8') as file:
+        for sub in subs:
+            file.write(f"{sub.index}\n")
+            file.write(f"{sub.start} --> {sub.end}\n")
+            file.write(f"{clean_srt_line(sub.text)}\n")
+            file.write("\n")
+        file.close()
+    # print(f"Clean SRT saved at: {output_path}")
+    return output_path
+# Example usage
+class SRTDubbing:
+    def __init__(self):
+        pass
+    @staticmethod
+    def text_to_speech_srt(text, audio_path, language, actual_duration):
+        tts_filename = "./cache/temp.wav"
+        your_tts(text,tts_filename,actual_duration,speed=1.0)
+        # Check the duration of the generated TTS audio
+        tts_audio = AudioSegment.from_file(tts_filename)
+        tts_duration = len(tts_audio)
+        if actual_duration == 0:
+            # If actual duration is zero, use the original TTS audio without modifications
+            shutil.move(tts_filename, audio_path)
+            return
+        # If TTS audio duration is longer than actual duration, speed up the audio
+        if tts_duration > actual_duration:
+            speedup_factor = tts_duration / actual_duration
+            speedup_filename = "./cache/speedup_temp.wav"
+            # Use ffmpeg to change audio speed
+            subprocess.run([
+                "ffmpeg",
+                "-i", tts_filename,
+                "-filter:a", f"atempo={speedup_factor}",
+                speedup_filename,
+                "-y"
+            ], check=True)
+            # Replace the original TTS audio with the sped-up version
+            shutil.move(speedup_filename, audio_path)
+        elif tts_duration < actual_duration:
+            # If TTS audio duration is less than actual duration, add silence to match the duration
+            silence_gap = actual_duration - tts_duration
+            silence = AudioSegment.silent(duration=int(silence_gap))
+            new_audio = tts_audio + silence
+            # Save the new audio with added silence
+            new_audio.export(audio_path, format="wav")
+        else:
+            # If TTS audio duration is equal to actual duration, use the original TTS audio
+            shutil.move(tts_filename, audio_path)
+    @staticmethod
+    def make_silence(pause_time, pause_save_path):
+        silence = AudioSegment.silent(duration=pause_time)
+        silence.export(pause_save_path, format="wav")
+        return pause_save_path
+    @staticmethod
+    def create_folder_for_srt(srt_file_path):
+        srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+        random_uuid = str(uuid.uuid4())[:4]
+        dummy_folder_path = f"{base_path}/dummy"
+        if not os.path.exists(dummy_folder_path):
+            os.makedirs(dummy_folder_path)
+        folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
+        os.makedirs(folder_path, exist_ok=True)
+        return folder_path
+    @staticmethod
+    def concatenate_audio_files(audio_paths, output_path):
+        concatenated_audio = AudioSegment.silent(duration=0)
+        for audio_path in audio_paths:
+            audio_segment = AudioSegment.from_file(audio_path)
+            concatenated_audio += audio_segment
+        concatenated_audio.export(output_path, format="wav")
+    def srt_to_dub(self, srt_file_path,dub_save_path,language='en'):
+        result = self.read_srt_file(srt_file_path)
+        new_folder_path = self.create_folder_for_srt(srt_file_path)
+        join_path = []
+        for i in tqdm(result):
+        # for i in result:
+            text = i['text']
+            actual_duration = i['end_time'] - i['start_time']
+            pause_time = i['pause_time']
+            slient_path = f"{new_folder_path}/{i['previous_pause']}"
+            self.make_silence(pause_time, slient_path)
+            join_path.append(slient_path)
+            tts_path = f"{new_folder_path}/{i['audio_name']}"
+            self.text_to_speech_srt(text, tts_path, language, actual_duration)
+            join_path.append(tts_path)
+        self.concatenate_audio_files(join_path, dub_save_path)
+    @staticmethod
+    def convert_to_millisecond(time_str):
+      if isinstance(time_str, str):
+          hours, minutes, second_millisecond = time_str.split(':')
+          seconds, milliseconds = second_millisecond.split(",")
+          total_milliseconds = (
+              int(hours) * 3600000 +
+              int(minutes) * 60000 +
+              int(seconds) * 1000 +
+              int(milliseconds)
+          )
+          return total_milliseconds
+    @staticmethod
+    def read_srt_file(file_path):
+        entries = []
+        default_start = 0
+        previous_end_time = default_start
+        entry_number = 1
+        audio_name_template = "{}.wav"
+        previous_pause_template = "{}_before_pause.wav"
+        with open(file_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+            # print(lines)
+            for i in range(0, len(lines), 4):
+                time_info = re.findall(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)', lines[i + 1])
+                start_time = SRTDubbing.convert_to_millisecond(time_info[0][0])
+                end_time = SRTDubbing.convert_to_millisecond(time_info[0][1])
+                current_entry = {
+                    'entry_number': entry_number,
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'text': lines[i + 2].strip(),
+                    'pause_time': start_time - previous_end_time if entry_number != 1 else start_time - default_start,
+                    'audio_name': audio_name_template.format(entry_number),
+                    'previous_pause': previous_pause_template.format(entry_number),
+                }
+                entries.append(current_entry)
+                previous_end_time = end_time
+                entry_number += 1
+        with open("entries.json", "w") as file:
+            json.dump(entries, file, indent=4)
+        return entries
+srt_voice_name="am_adam"
+def srt_process(srt_file_path,voice_name,dest_language="en"):
+  global srt_voice_name
+  srt_voice_name=voice_name
+  srt_dubbing = SRTDubbing()
+  dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
+  srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
+  return dub_save_path
+#
+# srt_file_path="./long.srt"
+# dub_audio_path=srt_process(srt_file_path)
+# print(f"Audio file saved at: {dub_audio_path}")
 with gr.Blocks() as demo3:
+    # "SRT Dubbing" tab: upload a subtitle file, pick a voice, get one dubbed audio track.
+    gr.Markdown(
+        """
+        # Generate Audio File From Subtitle [Single Speaker Only]
+        To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
+        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
+        """
+    )
+    with gr.Row():
+        # Left column: inputs (subtitle file, voice choice, generate button).
+        with gr.Column():
+            srt_file = gr.File(label='Upload .srt Subtitle File Only')
+            with gr.Row():
+                voice = gr.Dropdown(
+                    voice_list,
+                    value='af',
+                    allow_custom_value=False,
+                    label='Voice',
+                )
+            with gr.Row():
+                generate_btn_ = gr.Button('Generate', variant='primary')
+        # Right column: output player plus a checkbox wired to toggle_autoplay.
+        with gr.Column():
+            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+            with gr.Accordion('Enable Autoplay', open=False):
+                autoplay = gr.Checkbox(value=True, label='Autoplay')
+                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+    # srt_file.submit(
+    #     srt_process,
+    #     inputs=[srt_file, voice],
+    #     outputs=[audio]
+    # )
+    # Clicking the button runs srt_process(srt_file, voice); its return value feeds the audio player.
+    generate_btn_.click(
+        srt_process,
+        inputs=[srt_file,voice],
+        outputs=[audio]
+    )
+display_text = "  \n".join(voice_list)
+with gr.Blocks() as demo4:
     gr.Markdown(f"# Voice Names \n{display_text}")
+# import click
+# @click.command()
+# @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+# @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug=False, share=False):
+    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS")
     demo.queue().launch(debug=debug, share=share)
     #Run on local network
 # save_at=f"./temp_audio/{os.path.basename(result)}"
 # shutil.move(result, save_at)
+# print(f"Saved at {save_at}")