Spaces:

sbapan41
/

Quantum_Dubbing

Running

App Files Files Community

sbapan41 committed 7 days ago

Commit

b8d1b97

verified ·

1 Parent(s): 75101d3

Upload 12 files

Browse files

Files changed (12) hide show

quantum_dubbing/audio_segments.py +141 -0
quantum_dubbing/language_configuration.py +551 -0
quantum_dubbing/languages_gui.py +0 -0
quantum_dubbing/logging_setup.py +68 -0
quantum_dubbing/mdx_net.py +594 -0
quantum_dubbing/postprocessor.py +231 -0
quantum_dubbing/preprocessor.py +309 -0
quantum_dubbing/speech_segmentation.py +499 -0
quantum_dubbing/text_multiformat_processor.py +987 -0
quantum_dubbing/text_to_speech.py +1574 -0
quantum_dubbing/translate_segments.py +457 -0
quantum_dubbing/utils.py +487 -0

quantum_dubbing/audio_segments.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from pydub import AudioSegment
+from tqdm import tqdm
+from .utils import run_command
+from .logging_setup import logger
+import numpy as np
+class Mixer:
+    """Collect audio segments at time offsets and mix them in a single pass.
+    Segments are queued with overlay()/append() and only combined when
+    to_audio_segment() is called, which avoids re-rendering the whole
+    track for every overlay.
+    """
+    def __init__(self):
+        # (position_ms, segment) pairs in the order they were queued.
+        self.parts = []
+    def __len__(self):
+        """Return the mixed length in milliseconds (0 if nothing is queued)."""
+        parts = self._sync()
+        if not parts:
+            return 0
+        first = parts[0][1]
+        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
+        return int(1000.0 * frame_count / first.frame_rate)
+    def overlay(self, sound, position=0):
+        """Queue `sound` to start at `position` milliseconds; returns self."""
+        self.parts.append((position, sound))
+        return self
+    def _sync(self):
+        """Return [(frame_offset, segment)] with all segments in one format.
+        Offsets are derived from the frame rate the segments have AFTER
+        AudioSegment._sync() converted them to a common format, so they
+        stay correct when the first queued segment gets resampled.
+        """
+        if not self.parts:
+            return []
+        positions, segs = zip(*self.parts)
+        segs = AudioSegment.empty()._sync(*segs)
+        frame_rate = segs[0].frame_rate
+        offsets = [int(frame_rate * pos / 1000.0) for pos in positions]
+        return list(zip(offsets, segs))
+    def append(self, sound):
+        """Queue `sound` right after the current end of the mix."""
+        self.overlay(sound, position=len(self))
+    def to_audio_segment(self):
+        """Render all queued parts into one normalized AudioSegment.
+        Each part is peak-normalized to the 16-bit range, summed into a
+        32-bit buffer, and the result is normalized with no headroom.
+        Returns an empty AudioSegment when nothing has been queued.
+        """
+        parts = self._sync()
+        if not parts:
+            return AudioSegment.empty()
+        first = parts[0][1]
+        channels = first.channels
+        frame_count = max(offset + seg.frame_count() for offset, seg in parts)
+        sample_count = int(frame_count * channels)
+        output = np.zeros(sample_count, dtype="int32")
+        for offset, seg in parts:
+            raw = seg.get_array_of_samples()
+            # Read the samples with the array's own item type instead of
+            # assuming 32-bit; a fixed int32 view misreads 8/16-bit audio.
+            samples = np.frombuffer(raw, dtype=raw.typecode)
+            if samples.size == 0:
+                continue
+            # int64 keeps abs() of the most negative int32 value positive.
+            peak = np.max(np.abs(samples.astype(np.int64)))
+            if peak == 0:
+                # Pure silence adds nothing and would divide by zero below.
+                continue
+            scaled = np.int16(samples / peak * 32767)
+            start = offset * channels
+            end = start + len(scaled)
+            output[start:end] += scaled
+        return first._spawn(
+            output, overrides={"sample_width": 4}).normalize(headroom=0.0)
+def create_translated_audio(
+    result_diarize, audio_files, final_file, concat=False, avoid_overlap=False,
+):
+    """Combine the per-segment audio clips into a single output file.
+    Args:
+        result_diarize: Mapping with a "segments" list. Each segment is read
+            for "start" (and "speaker" when avoid_overlap is set); the last
+            segment's "end" gives the total duration in seconds.
+        audio_files: Clip paths, paired with the segments by position.
+        final_file: Path of the file to write.
+        concat: If true, join the clips back to back with ffmpeg's concat
+            demuxer and ignore the segment timings.
+        avoid_overlap: Timed mode only. If true, a clip that would start
+            more than 0.5 s before the previous clip ends is pushed later.
+    """
+    if concat:
+        import shlex  # local import: only this branch builds a command line
+        # The concat demuxer reads a list file with one entry per line:
+        #   file 'audio/1.ogg'
+        #   file 'audio/2.ogg'
+        # Paths are single-quoted so spaces and special characters survive;
+        # a literal quote inside a path is written as '\''.
+        entries = [
+            "file '{}'".format(str(audio_file).replace("'", "'\\''"))
+            for audio_file in audio_files
+        ]
+        # No trailing newline after the last entry, as before.
+        with open("list.txt", "w") as file:
+            file.write("\n".join(entries))
+        # NOTE(review): assumes run_command tokenizes the string POSIX-style
+        # (shell or shlex.split) - confirm in utils.run_command.
+        command = (
+            "ffmpeg -f concat -safe 0 -i list.txt -c:a pcm_s16le "
+            f"{shlex.quote(str(final_file))}"
+        )
+        run_command(command)
+        return
+    total_duration = result_diarize["segments"][-1]["end"]  # in seconds
+    # Silent base track so the mix spans the whole timeline.
+    # NOTE(review): 41000 Hz is not a standard sample rate; 44100 may have
+    # been intended - confirm before changing, it can affect the output rate.
+    base_audio = AudioSegment.silent(
+        duration=int(total_duration * 1000), frame_rate=41000
+    )
+    combined_audio = Mixer()
+    combined_audio.overlay(base_audio)
+    logger.debug(
+        f"Audio duration: {int(total_duration // 60)} "
+        f"minutes and {int(total_duration % 60)} seconds"
+    )
+    last_end_time = 0
+    previous_speaker = ""
+    for line, audio_file in tqdm(
+        zip(result_diarize["segments"], audio_files)
+    ):
+        start = float(line["start"])
+        # Best effort: a clip that fails to load is logged and skipped so
+        # the remaining clips are still mixed.
+        try:
+            audio = AudioSegment.from_file(audio_file)
+            if avoid_overlap:
+                speaker = line["speaker"]
+                if (last_end_time - 0.500) > start:
+                    overlap_time = last_end_time - start
+                    # Allow a 0.5 s overlap when the speaker changes and
+                    # 0.2 s when the same speaker continues.
+                    if previous_speaker and previous_speaker != speaker:
+                        start = (last_end_time - 0.500)
+                    else:
+                        start = (last_end_time - 0.200)
+                    # Large overlaps get 0.3 s of the delay taken back.
+                    if overlap_time > 2.5:
+                        start = start - 0.3
+                    logger.info(
+                          f"Avoid overlap for {str(audio_file)} "
+                          f"with {str(start)}"
+                    )
+                previous_speaker = speaker
+                duration_tts_seconds = len(audio) / 1000.0  # to sec
+                last_end_time = (start + duration_tts_seconds)
+            start_time = start * 1000  # to ms
+            combined_audio = combined_audio.overlay(
+                audio, position=start_time
+            )
+        except Exception as error:
+            logger.debug(str(error))
+            logger.error(f"Error audio file {audio_file}")
+    # Render the mix and write it out as WAV.
+    combined_audio_data = combined_audio.to_audio_segment()
+    combined_audio_data.export(
+        final_file, format="wav"
+    )  # better than ogg; change if the audio is anomalous

quantum_dubbing/language_configuration.py ADDED Viewed

	@@ -0,0 +1,551 @@

+from .logging_setup import logger
+# Display label -> language code for the "unidirectional" language group.
+# These entries are merged into LANGUAGES below via ** unpacking.
+# NOTE(review): presumably these languages are usable as a translation
+# target only - confirm against the callers of UNIDIRECTIONAL_L_LIST.
+LANGUAGES_UNIDIRECTIONAL = {
+    "Aymara (ay)": "ay",
+    "Bambara (bm)": "bm",
+    "Cebuano (ceb)": "ceb",
+    "Chichewa (ny)": "ny",
+    "Divehi (dv)": "dv",
+    "Dogri (doi)": "doi",
+    "Ewe (ee)": "ee",
+    "Guarani (gn)": "gn",
+    "Iloko (ilo)": "ilo",
+    "Kinyarwanda (rw)": "rw",
+    "Krio (kri)": "kri",
+    "Kurdish (ku)": "ku",
+    "Kirghiz (ky)": "ky",
+    "Ganda (lg)": "lg",
+    "Maithili (mai)": "mai",
+    "Oriya (or)": "or",
+    "Oromo (om)": "om",
+    "Quechua (qu)": "qu",
+    "Samoan (sm)": "sm",
+    "Tigrinya (ti)": "ti",
+    "Tsonga (ts)": "ts",
+    "Akan (ak)": "ak",
+    "Uighur (ug)": "ug"
+}
+# Live keys view (not a list) over the labels above.
+UNIDIRECTIONAL_L_LIST = LANGUAGES_UNIDIRECTIONAL.keys()
+# Display label -> language code for every selectable language.
+# "Automatic detection" is a sentinel entry whose code is its own label;
+# the unidirectional group is appended at the end.
+LANGUAGES = {
+    "Automatic detection": "Automatic detection",
+    "Arabic (ar)": "ar",
+    "Chinese - Simplified (zh-CN)": "zh",
+    "Czech (cs)": "cs",
+    "Danish (da)": "da",
+    "Dutch (nl)": "nl",
+    "English (en)": "en",
+    "Finnish (fi)": "fi",
+    "French (fr)": "fr",
+    "German (de)": "de",
+    "Greek (el)": "el",
+    "Hebrew (he)": "he",
+    "Hungarian (hu)": "hu",
+    "Italian (it)": "it",
+    "Japanese (ja)": "ja",
+    "Korean (ko)": "ko",
+    "Persian (fa)": "fa",  # no aux gTTS
+    "Polish (pl)": "pl",
+    "Portuguese (pt)": "pt",
+    "Russian (ru)": "ru",
+    "Spanish (es)": "es",
+    "Turkish (tr)": "tr",
+    "Ukrainian (uk)": "uk",
+    "Urdu (ur)": "ur",
+    "Vietnamese (vi)": "vi",
+    "Hindi (hi)": "hi",
+    "Indonesian (id)": "id",
+    "Bengali (bn)": "bn",
+    "Telugu (te)": "te",
+    "Marathi (mr)": "mr",
+    "Tamil (ta)": "ta",
+    "Javanese (jw|jv)": "jw",
+    "Catalan (ca)": "ca",
+    "Nepali (ne)": "ne",
+    "Thai (th)": "th",
+    "Swedish (sv)": "sv",
+    "Amharic (am)": "am",
+    "Welsh (cy)": "cy",  # no aux gTTS
+    "Estonian (et)": "et",
+    "Croatian (hr)": "hr",
+    "Icelandic (is)": "is",
+    "Georgian (ka)": "ka",  # no aux gTTS
+    "Khmer (km)": "km",
+    "Slovak (sk)": "sk",
+    "Albanian (sq)": "sq",
+    "Serbian (sr)": "sr",
+    "Azerbaijani (az)": "az",  # no aux gTTS
+    "Bulgarian (bg)": "bg",
+    "Galician (gl)": "gl",  # no aux gTTS
+    "Gujarati (gu)": "gu",
+    "Kazakh (kk)": "kk",  # no aux gTTS
+    "Kannada (kn)": "kn",
+    "Lithuanian (lt)": "lt",  # no aux gTTS
+    "Latvian (lv)": "lv",
+    "Macedonian (mk)": "mk",  # no aux gTTS # error get align model
+    "Malayalam (ml)": "ml",
+    "Malay (ms)": "ms",  # error get align model
+    "Romanian (ro)": "ro",
+    "Sinhala (si)": "si",
+    "Sundanese (su)": "su",
+    "Swahili (sw)": "sw",  # error align
+    "Afrikaans (af)": "af",
+    "Bosnian (bs)": "bs",
+    "Latin (la)": "la",
+    "Myanmar Burmese (my)": "my",
+    "Norwegian (no|nb)": "no",
+    "Chinese - Traditional (zh-TW)": "zh-TW",
+    "Assamese (as)": "as",
+    "Basque (eu)": "eu",
+    "Hausa (ha)": "ha",
+    "Haitian Creole (ht)": "ht",
+    "Armenian (hy)": "hy",
+    "Lao (lo)": "lo",
+    "Malagasy (mg)": "mg",
+    "Mongolian (mn)": "mn",
+    "Maltese (mt)": "mt",
+    "Punjabi (pa)": "pa",
+    "Pashto (ps)": "ps",
+    "Slovenian (sl)": "sl",
+    "Shona (sn)": "sn",
+    "Somali (so)": "so",
+    "Tajik (tg)": "tg",
+    "Turkmen (tk)": "tk",
+    "Tatar (tt)": "tt",
+    "Uzbek (uz)": "uz",
+    "Yoruba (yo)": "yo",
+    **LANGUAGES_UNIDIRECTIONAL
+}
+# Live keys view over all labels, in insertion order.
+BASE_L_LIST = LANGUAGES.keys()
+# Labels for display: the first entry ("Automatic detection") stays on top,
+# the remaining labels are sorted alphabetically.
+LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:])
+# Reverse lookup: language code -> display label.
+INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()}
+# Language code -> wav2vec2 model id (Hugging Face "owner/name" form).
+# An empty string means no model is assigned for that code; where present,
+# the trailing comment names a candidate model that is not enabled.
+# NOTE(review): presumably consumed by the alignment step for languages
+# without a built-in align model - confirm against the caller.
+EXTRA_ALIGN = {
+    "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian",
+    "bn": "arijitx/wav2vec2-large-xlsr-bengali",
+    "mr": "sumedh/wav2vec2-large-xlsr-marathi",
+    "ta": "Amrrs/wav2vec2-large-xlsr-53-tamil",
+    "jw": "cahya/wav2vec2-large-xlsr-javanese",
+    "ne": "shniranjan/wav2vec2-large-xlsr-300m-nepali",
+    "th": "sakares/wav2vec2-large-xlsr-thai-demo",
+    "sv": "KBLab/wav2vec2-large-voxrex-swedish",
+    "am": "agkphysics/wav2vec2-large-xlsr-53-amharic",
+    "cy": "Srulikbdd/Wav2Vec2-large-xlsr-welsh",
+    "et": "anton-l/wav2vec2-large-xlsr-53-estonian",
+    "hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
+    "is": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",
+    "ka": "MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Georgian",
+    "km": "vitouphy/wav2vec2-xls-r-300m-khmer",
+    "sk": "infinitejoy/wav2vec2-large-xls-r-300m-slovak",
+    "sq": "Alimzhan/wav2vec2-large-xls-r-300m-albanian-colab",
+    "sr": "dnikolic/wav2vec2-xlsr-530-serbian-colab",
+    "az": "nijatzeynalov/wav2vec2-large-mms-1b-azerbaijani-common_voice15.0",
+    "bg": "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian",
+    "gl": "ifrz/wav2vec2-large-xlsr-galician",
+    "gu": "Harveenchadha/vakyansh-wav2vec2-gujarati-gnm-100",
+    "kk": "aismlv/wav2vec2-large-xlsr-kazakh",
+    "kn": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560",
+    "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian",
+    "lv": "anton-l/wav2vec2-large-xlsr-53-latvian",
+    "mk": "",  # Konstantin-Bogdanoski/wav2vec2-macedonian-base
+    "ml": "gvs/wav2vec2-large-xlsr-malayalam",
+    "ms": "",  # Duy/wav2vec2_malay
+    "ro": "anton-l/wav2vec2-large-xlsr-53-romanian",
+    "si": "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala",
+    "su": "cahya/wav2vec2-large-xlsr-sundanese",
+    "sw": "",  # Lians/fine-tune-wav2vec2-large-swahili
+    "af": "",  # ylacombe/wav2vec2-common_voice-af-demo
+    "bs": "",
+    "la": "",
+    "my": "",
+    "no": "NbAiLab/wav2vec2-xlsr-300m-norwegian",
+    # zh-TW currently reuses the zh-cn model.
+    "zh-TW": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
+    "as": "",
+    "eu": "", # cahya/wav2vec2-large-xlsr-basque # verify
+    "ha": "infinitejoy/wav2vec2-large-xls-r-300m-hausa",
+    "ht": "",
+    "hy": "infinitejoy/wav2vec2-large-xls-r-300m-armenian", # no (.)
+    "lo": "",
+    "mg": "",
+    "mn": "tugstugi/wav2vec2-large-xlsr-53-mongolian",
+    "mt": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-maltese-64h",
+    "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi",
+    "ps": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab",
+    "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
+    "sn": "",
+    "so": "",
+    "tg": "",
+    "tk": "",  # Ragav/wav2vec2-tk
+    "tt": "anton-l/wav2vec2-large-xlsr-53-tatar",
+    "uz": "",  # Mekhriddin/wav2vec2-large-xls-r-300m-uzbek-colab
+    "yo": "ogbi/wav2vec2-large-mms-1b-yoruba-test",
+}
+def fix_code_language(translate_to, syntax="google"):
+    """Translate a language code into the spelling a given backend expects.
+    Args:
+        translate_to: Language code as stored in LANGUAGES (e.g. "zh").
+        syntax: "google" (google-translator, gTTS) or "coqui" (coqui-xtts).
+            Any other value leaves the code unchanged.
+    Returns:
+        The backend-specific code, or `translate_to` itself when no
+        replacement is defined for it.
+    """
+    if syntax == "google":
+        # google-translator, gTTS
+        replace_lang_code = {"zh": "zh-CN", "he": "iw", "zh-cn": "zh-CN"}
+    elif syntax == "coqui":
+        # coqui-xtts
+        replace_lang_code = {"zh": "zh-cn", "zh-CN": "zh-cn", "zh-TW": "zh-cn"}
+    else:
+        # An unknown syntax used to crash with UnboundLocalError; pass the
+        # code through untouched instead.
+        logger.warning(f"Unknown syntax '{syntax}', code left unchanged")
+        replace_lang_code = {}
+    new_code_lang = replace_lang_code.get(translate_to, translate_to)
+    logger.debug(f"Fix code {translate_to} -> {new_code_lang}")
+    return new_code_lang
+# Gender of each Bark v2 speaker preset, one letter per speaker index 0-9
+# ("M" = Male, "F" = Female), listed per language prefix in display order.
+_BARK_SPEAKER_GENDERS = (
+    ("de", "MMMFMMMMFM"),
+    ("en", "MMMMMMMMMF"),
+    ("es", "MMMMMMMMFF"),
+    ("fr", "MFFMMFMMMM"),
+    ("hi", "FFMFFMMMMF"),
+    ("it", "MMFMMMMFMF"),
+    ("ja", "FFMFFFMFFF"),
+    ("ko", "FMMMMMMMMM"),
+    ("pl", "MMMMFMFMMF"),
+    ("pt", "MMMMMMMMMM"),
+    ("ru", "MMMMMFFMMF"),
+    ("tr", "MMMMFFMMMM"),
+    ("zh", "MMMMFMFFMF"),
+)
+_BARK_GENDER_LABELS = {"M": "Male", "F": "Female"}
+# Display label "<lang>_speaker_<n>-<Gender> BARK" -> preset id
+# "v2/<lang>_speaker_<n>", ten presets per language.
+BARK_VOICES_LIST = {
+    f"{lang}_speaker_{index}-{_BARK_GENDER_LABELS[flag]} BARK":
+        f"v2/{lang}_speaker_{index}"
+    for lang, flags in _BARK_SPEAKER_GENDERS
+    for index, flag in enumerate(flags)
+}
+# Display label "<code>[_variant]-facebook-mms VITS" -> model repository id
+# for the VITS voices. Commented-out entries are voices that are listed
+# but currently disabled.
+VITS_VOICES_LIST = {
+    "ar-facebook-mms VITS": "facebook/mms-tts-ara",
+    # 'zh-facebook-mms VITS': 'facebook/mms-tts-cmn',
+    "zh_Hakka-facebook-mms VITS": "facebook/mms-tts-hak",
+    "zh_MinNan-facebook-mms VITS": "facebook/mms-tts-nan",
+    # 'cs-facebook-mms VITS': 'facebook/mms-tts-ces',
+    # 'da-facebook-mms VITS': 'facebook/mms-tts-dan',
+    "nl-facebook-mms VITS": "facebook/mms-tts-nld",
+    "en-facebook-mms VITS": "facebook/mms-tts-eng",
+    "fi-facebook-mms VITS": "facebook/mms-tts-fin",
+    "fr-facebook-mms VITS": "facebook/mms-tts-fra",
+    "de-facebook-mms VITS": "facebook/mms-tts-deu",
+    "el-facebook-mms VITS": "facebook/mms-tts-ell",
+    "el_Ancient-facebook-mms VITS": "facebook/mms-tts-grc",
+    "he-facebook-mms VITS": "facebook/mms-tts-heb",
+    "hu-facebook-mms VITS": "facebook/mms-tts-hun",
+    # 'it-facebook-mms VITS': 'facebook/mms-tts-ita',
+    # 'ja-facebook-mms VITS': 'facebook/mms-tts-jpn',
+    "ko-facebook-mms VITS": "facebook/mms-tts-kor",
+    "fa-facebook-mms VITS": "facebook/mms-tts-fas",
+    "pl-facebook-mms VITS": "facebook/mms-tts-pol",
+    "pt-facebook-mms VITS": "facebook/mms-tts-por",
+    "ru-facebook-mms VITS": "facebook/mms-tts-rus",
+    "es-facebook-mms VITS": "facebook/mms-tts-spa",
+    "tr-facebook-mms VITS": "facebook/mms-tts-tur",
+    "uk-facebook-mms VITS": "facebook/mms-tts-ukr",
+    "ur_arabic-facebook-mms VITS": "facebook/mms-tts-urd-script_arabic",
+    "ur_devanagari-facebook-mms VITS": "facebook/mms-tts-urd-script_devanagari",
+    "ur_latin-facebook-mms VITS": "facebook/mms-tts-urd-script_latin",
+    "vi-facebook-mms VITS": "facebook/mms-tts-vie",
+    "hi-facebook-mms VITS": "facebook/mms-tts-hin",
+    "hi_Fiji-facebook-mms VITS": "facebook/mms-tts-hif",
+    "id-facebook-mms VITS": "facebook/mms-tts-ind",
+    "bn-facebook-mms VITS": "facebook/mms-tts-ben",
+    "te-facebook-mms VITS": "facebook/mms-tts-tel",
+    "mr-facebook-mms VITS": "facebook/mms-tts-mar",
+    "ta-facebook-mms VITS": "facebook/mms-tts-tam",
+    "jw-facebook-mms VITS": "facebook/mms-tts-jav",
+    "jw_Suriname-facebook-mms VITS": "facebook/mms-tts-jvn",
+    "ca-facebook-mms VITS": "facebook/mms-tts-cat",
+    "ne-facebook-mms VITS": "facebook/mms-tts-nep",
+    "th-facebook-mms VITS": "facebook/mms-tts-tha",
+    "th_Northern-facebook-mms VITS": "facebook/mms-tts-nod",
+    "sv-facebook-mms VITS": "facebook/mms-tts-swe",
+    "am-facebook-mms VITS": "facebook/mms-tts-amh",
+    "cy-facebook-mms VITS": "facebook/mms-tts-cym",
+    # "et-facebook-mms VITS": "facebook/mms-tts-est",
+    # "ht-facebook-mms VITS": "facebook/mms-tts-hrv",
+    "is-facebook-mms VITS": "facebook/mms-tts-isl",
+    "km-facebook-mms VITS": "facebook/mms-tts-khm",
+    "km_Northern-facebook-mms VITS": "facebook/mms-tts-kxm",
+    # "sk-facebook-mms VITS": "facebook/mms-tts-slk",
+    "sq_Northern-facebook-mms VITS": "facebook/mms-tts-sqi",
+    "az_South-facebook-mms VITS": "facebook/mms-tts-azb",
+    "az_North_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-azj-script_cyrillic",
+    "az_North_script_latin-facebook-mms VITS": "facebook/mms-tts-azj-script_latin",
+    "bg-facebook-mms VITS": "facebook/mms-tts-bul",
+    # "gl-facebook-mms VITS": "facebook/mms-tts-glg",
+    "gu-facebook-mms VITS": "facebook/mms-tts-guj",
+    "kk-facebook-mms VITS": "facebook/mms-tts-kaz",
+    "kn-facebook-mms VITS": "facebook/mms-tts-kan",
+    # "lt-facebook-mms VITS": "facebook/mms-tts-lit",
+    "lv-facebook-mms VITS": "facebook/mms-tts-lav",
+    # "mk-facebook-mms VITS": "facebook/mms-tts-mkd",
+    "ml-facebook-mms VITS": "facebook/mms-tts-mal",
+    "ms-facebook-mms VITS": "facebook/mms-tts-zlm",
+    "ms_Central-facebook-mms VITS": "facebook/mms-tts-pse",
+    "ms_Manado-facebook-mms VITS": "facebook/mms-tts-xmm",
+    "ro-facebook-mms VITS": "facebook/mms-tts-ron",
+    # "si-facebook-mms VITS": "facebook/mms-tts-sin",
+    "sw-facebook-mms VITS": "facebook/mms-tts-swh",
+    # "af-facebook-mms VITS": "facebook/mms-tts-afr",
+    # "bs-facebook-mms VITS": "facebook/mms-tts-bos",
+    "la-facebook-mms VITS": "facebook/mms-tts-lat",
+    "my-facebook-mms VITS": "facebook/mms-tts-mya",
+    # "no_Bokmål-facebook-mms VITS": "thomasht86/mms-tts-nob",  # verify
+    "as-facebook-mms VITS": "facebook/mms-tts-asm",
+    "as_Nagamese-facebook-mms VITS": "facebook/mms-tts-nag",
+    "eu-facebook-mms VITS": "facebook/mms-tts-eus",
+    "ha-facebook-mms VITS": "facebook/mms-tts-hau",
+    "ht-facebook-mms VITS": "facebook/mms-tts-hat",
+    "hy_Western-facebook-mms VITS": "facebook/mms-tts-hyw",
+    "lo-facebook-mms VITS": "facebook/mms-tts-lao",
+    "mg-facebook-mms VITS": "facebook/mms-tts-mlg",
+    "mn-facebook-mms VITS": "facebook/mms-tts-mon",
+    # "mt-facebook-mms VITS": "facebook/mms-tts-mlt",
+    "pa_Eastern-facebook-mms VITS": "facebook/mms-tts-pan",
+    # "pa_Western-facebook-mms VITS": "facebook/mms-tts-pnb",
+    # "ps-facebook-mms VITS": "facebook/mms-tts-pus",
+    # "sl-facebook-mms VITS": "facebook/mms-tts-slv",
+    "sn-facebook-mms VITS": "facebook/mms-tts-sna",
+    "so-facebook-mms VITS": "facebook/mms-tts-son",  # NOTE(review): Somali is usually coded "som" - verify this repo id
+    "tg-facebook-mms VITS": "facebook/mms-tts-tgk",
+    "tk_script_arabic-facebook-mms VITS": "facebook/mms-tts-tuk-script_arabic",
+    "tk_script_latin-facebook-mms VITS": "facebook/mms-tts-tuk-script_latin",
+    "tt-facebook-mms VITS": "facebook/mms-tts-tat",
+    "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh",
+    "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic",
+    "yo-facebook-mms VITS": "facebook/mms-tts-yor",
+    "ay-facebook-mms VITS": "facebook/mms-tts-ayr",
+    "bm-facebook-mms VITS": "facebook/mms-tts-bam",
+    "ceb-facebook-mms VITS": "facebook/mms-tts-ceb",
+    "ny-facebook-mms VITS": "facebook/mms-tts-nya",
+    "dv-facebook-mms VITS": "facebook/mms-tts-div",
+    "doi-facebook-mms VITS": "facebook/mms-tts-dgo",
+    "ee-facebook-mms VITS": "facebook/mms-tts-ewe",
+    "gn-facebook-mms VITS": "facebook/mms-tts-grn",
+    "ilo-facebook-mms VITS": "facebook/mms-tts-ilo",
+    "rw-facebook-mms VITS": "facebook/mms-tts-kin",
+    "kri-facebook-mms VITS": "facebook/mms-tts-kri",
+    "ku_script_arabic-facebook-mms VITS": "facebook/mms-tts-kmr-script_arabic",
+    "ku_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-kmr-script_cyrillic",
+    "ku_script_latin-facebook-mms VITS": "facebook/mms-tts-kmr-script_latin",
+    "ckb-facebook-mms VITS": "razhan/mms-tts-ckb",  # Verify w
+    "ky-facebook-mms VITS": "facebook/mms-tts-kir",
+    "lg-facebook-mms VITS": "facebook/mms-tts-lug",
+    "mai-facebook-mms VITS": "facebook/mms-tts-mai",
+    "or-facebook-mms VITS": "facebook/mms-tts-ory",
+    "om-facebook-mms VITS": "facebook/mms-tts-orm",
+    "qu_Huallaga-facebook-mms VITS": "facebook/mms-tts-qub",
+    "qu_Lambayeque-facebook-mms VITS": "facebook/mms-tts-quf",
+    "qu_South_Bolivian-facebook-mms VITS": "facebook/mms-tts-quh",
+    "qu_North_Bolivian-facebook-mms VITS": "facebook/mms-tts-qul",
+    "qu_Tena_Lowland-facebook-mms VITS": "facebook/mms-tts-quw",
+    "qu_Ayacucho-facebook-mms VITS": "facebook/mms-tts-quy",
+    "qu_Cusco-facebook-mms VITS": "facebook/mms-tts-quz",
+    "qu_Cajamarca-facebook-mms VITS": "facebook/mms-tts-qvc",
+    "qu_Eastern_Apurímac-facebook-mms VITS": "facebook/mms-tts-qve",
+    "qu_Huamalíes_Dos_de_Mayo_Huánuco-facebook-mms VITS": "facebook/mms-tts-qvh",
+    "qu_Margos_Yarowilca_Lauricocha-facebook-mms VITS": "facebook/mms-tts-qvm",
+    "qu_North_Junín-facebook-mms VITS": "facebook/mms-tts-qvn",
+    "qu_Napo-facebook-mms VITS": "facebook/mms-tts-qvo",
+    "qu_San_Martín-facebook-mms VITS": "facebook/mms-tts-qvs",
+    "qu_Huaylla_Wanca-facebook-mms VITS": "facebook/mms-tts-qvw",
+    "qu_Northern_Pastaza-facebook-mms VITS": "facebook/mms-tts-qvz",
+    "qu_Huaylas_Ancash-facebook-mms VITS": "facebook/mms-tts-qwh",
+    "qu_Panao-facebook-mms VITS": "facebook/mms-tts-qxh",
+    "qu_Salasaca_Highland-facebook-mms VITS": "facebook/mms-tts-qxl",
+    "qu_Northern_Conchucos_Ancash-facebook-mms VITS": "facebook/mms-tts-qxn",
+    "qu_Southern_Conchucos-facebook-mms VITS": "facebook/mms-tts-qxo",
+    "qu_Cañar_Highland-facebook-mms VITS": "facebook/mms-tts-qxr",
+    "sm-facebook-mms VITS": "facebook/mms-tts-smo",
+    "ti-facebook-mms VITS": "facebook/mms-tts-tir",
+    "ts-facebook-mms VITS": "facebook/mms-tts-tso",
+    "ak-facebook-mms VITS": "facebook/mms-tts-aka",
+    "ug_script_arabic-facebook-mms VITS": "facebook/mms-tts-uig-script_arabic",
+    "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic",
+}
+# Language codes accepted for the OpenAI TTS voices, kept in their
+# original order.
+OPENAI_TTS_CODES = (
+    "af ar hy az be bs bg ca zh hr cs da "
+    "nl en et fi fr gl de el he hi hu is "
+    "id it ja kn kk ko lv lt mk ms mr mi "
+    "ne no fa pl pt ro ru sr sk sl es sw "
+    "sv tl ta th tr uk ur vi cy zh-TW"
+).split()
+# Selectable OpenAI TTS voices: the six standard entries first, followed by
+# the same six voices in their "HD" variant.
+_OPENAI_TTS_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
+OPENAI_TTS_MODELS = [
+    f">{voice}{quality} OpenAI-TTS"
+    for quality in ("", " HD")
+    for voice in _OPENAI_TTS_VOICES
+]
+# Language code (as used for the values of LANGUAGES) -> three-letter code.
+# Only part of LANGUAGES is covered: the entries follow its order from
+# "Automatic detection" through "sr", and "ka" is not present.
+LANGUAGE_CODE_IN_THREE_LETTERS = {
+    "Automatic detection": "aut",
+    "ar": "ara",
+    "zh": "chi",
+    "cs": "cze",
+    "da": "dan",
+    "nl": "dut",
+    "en": "eng",
+    "fi": "fin",
+    "fr": "fre",
+    "de": "ger",
+    "el": "gre",
+    "he": "heb",
+    "hu": "hun",
+    "it": "ita",
+    "ja": "jpn",
+    "ko": "kor",
+    "fa": "per",
+    "pl": "pol",
+    "pt": "por",
+    "ru": "rus",
+    "es": "spa",
+    "tr": "tur",
+    "uk": "ukr",
+    "ur": "urd",
+    "vi": "vie",
+    "hi": "hin",
+    "id": "ind",
+    "bn": "ben",
+    "te": "tel",
+    "mr": "mar",
+    "ta": "tam",
+    "jw": "jav",
+    "ca": "cat",
+    "ne": "nep",
+    "th": "tha",
+    "sv": "swe",
+    "am": "amh",
+    "cy": "cym",
+    "et": "est",
+    "hr": "hrv",
+    "is": "isl",
+    "km": "khm",
+    "sk": "slk",
+    "sq": "sqi",
+    "sr": "srp",
+}

quantum_dubbing/languages_gui.py ADDED Viewed

The diff for this file is too large to render. See raw diff

quantum_dubbing/logging_setup.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import logging
+import sys
+import warnings
+import os
+def configure_logging_libs(debug=False):
+    """Quiet down chatty third-party libraries.
+    Ignores pyannote UserWarnings, raises a fixed set of library loggers to
+    WARNING, sets TF_CPP_MIN_LOG_LEVEL ("1" when debug is true, otherwise
+    "3") and replaces pyannote's check_version with a no-op. Any failure
+    along the way is logged as an error instead of being raised.
+    """
+    warnings.filterwarnings(
+        action="ignore", category=UserWarning, module="pyannote"
+    )
+    noisy_loggers = (
+        "numba",
+        "httpx",
+        "markdown_it",
+        "speechbrain",
+        "fairseq",
+        "pyannote",
+        "faiss",
+        "pytorch_lightning.utilities.migration.utils",
+        "pytorch_lightning.utilities.migration",
+        "pytorch_lightning",
+        "lightning",
+        "lightning.pytorch.utilities.migration.utils",
+    )
+    try:
+        for logger_name in noisy_loggers:
+            third_party_logger = logging.getLogger(logger_name)
+            third_party_logger.setLevel(logging.WARNING)
+        tf_log_level = "1" if debug else "3"
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = tf_log_level
+        # Stand-in that accepts the same call and does nothing, used to
+        # silence pyannote's version check output.
+        def fix_verbose_pyannote(*args, what=""):
+            pass
+        import pyannote.audio.core.model # noqa
+        pyannote.audio.core.model.check_version = fix_verbose_pyannote
+    except Exception as error:
+        logger.error(str(error))
+def setup_logger(name_log):
+    """Return the logger `name_log`, configured to write to stderr.
+    The logger is set to INFO, does not propagate to the root logger and
+    formats records as "[LEVEL] >> message". Calling this again for the
+    same name reuses the existing handler instead of stacking another one,
+    so records are not emitted twice.
+    """
+    logger = logging.getLogger(name_log)
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:
+        # StreamHandler() without arguments writes to sys.stderr.
+        stream_handler = logging.StreamHandler()
+        stream_handler.flush = sys.stderr.flush
+        logger.addHandler(stream_handler)
+    logger.propagate = False
+    formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
+    for handler in logger.handlers:
+        handler.setFormatter(formatter)
+    return logger
+# Package-wide logger; sibling modules import it with
+# `from .logging_setup import logger`.
+logger = setup_logger("quantum_dubbing")
+logger.setLevel(logging.INFO)  # setup_logger already sets INFO; explicit default
+def set_logging_level(verbosity_level):
+    """Set the package logger's level from a lowercase level name.
+    Recognized names are "debug", "info", "warning", "error" and
+    "critical"; anything else falls back to INFO.
+    """
+    level_names = ("debug", "info", "warning", "error", "critical")
+    levels_by_name = {
+        name: getattr(logging, name.upper()) for name in level_names
+    }
+    chosen_level = levels_by_name.get(verbosity_level, logging.INFO)
+    logger.setLevel(chosen_level)

quantum_dubbing/mdx_net.py ADDED Viewed

	@@ -0,0 +1,594 @@

+import gc
+import hashlib
+import os
+import queue
+import threading
+import json
+import shlex
+import sys
+import subprocess
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from tqdm import tqdm
+try:
+    from .utils import (
+        remove_directory_contents,
+        create_directories,
+    )
+except:  # noqa
+    from utils import (
+        remove_directory_contents,
+        create_directories,
+    )
+from .logging_setup import logger
+try:
+    import onnxruntime as ort
+except Exception as error:
+    logger.error(str(error))
+# import warnings
+# warnings.filterwarnings("ignore")
+# Maps a primary stem name to the name given to its complementary
+# (inverted) stem file; run_mdx falls back to "<stem>_diff" for names
+# that are not listed here.
+stem_naming = {
+    "Vocals": "Instrumental",
+    "Other": "Instruments",
+    "Instrumental": "Vocals",
+    "Drums": "Drumless",
+    "Bass": "Bassless",
+}
+class MDXModel:
+    def __init__(
+        self,
+        device,
+        dim_f,
+        dim_t,
+        n_fft,
+        hop=1024,
+        stem_name=None,
+        compensation=1.000,
+    ):
+        self.dim_f = dim_f
+        self.dim_t = dim_t
+        self.dim_c = 4
+        self.n_fft = n_fft
+        self.hop = hop
+        self.stem_name = stem_name
+        self.compensation = compensation
+        self.n_bins = self.n_fft // 2 + 1
+        self.chunk_size = hop * (self.dim_t - 1)
+        self.window = torch.hann_window(
+            window_length=self.n_fft, periodic=True
+        ).to(device)
+        out_c = self.dim_c
+        self.freq_pad = torch.zeros(
+            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
+        ).to(device)
+    def stft(self, x):
+        x = x.reshape([-1, self.chunk_size])
+        x = torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop,
+            window=self.window,
+            center=True,
+            return_complex=True,
+        )
+        x = torch.view_as_real(x)
+        x = x.permute([0, 3, 1, 2])
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+            [-1, 4, self.n_bins, self.dim_t]
+        )
+        return x[:, :, : self.dim_f]
+    def istft(self, x, freq_pad=None):
+        freq_pad = (
+            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
+            if freq_pad is None
+            else freq_pad
+        )
+        x = torch.cat([x, freq_pad], -2)
+        # c = 4*2 if self.target_name=='*' else 2
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+            [-1, 2, self.n_bins, self.dim_t]
+        )
+        x = x.permute([0, 2, 3, 1])
+        x = x.contiguous()
+        x = torch.view_as_complex(x)
+        x = torch.istft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop,
+            window=self.window,
+            center=True,
+        )
+        return x.reshape([-1, 2, self.chunk_size])
+class MDX:
+    DEFAULT_SR = 44100
+    # Unit: seconds
+    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
+    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
+    def __init__(
+        self, model_path: str, params: MDXModel, processor=0
+    ):
+        # Set the device and the provider (CPU or CUDA)
+        self.device = (
+            torch.device(f"cuda:{processor}")
+            if processor >= 0
+            else torch.device("cpu")
+        )
+        self.provider = (
+            ["CUDAExecutionProvider"]
+            if processor >= 0
+            else ["CPUExecutionProvider"]
+        )
+        self.model = params
+        # Load the ONNX model using ONNX Runtime
+        self.ort = ort.InferenceSession(model_path, providers=self.provider)
+        # Preload the model for faster performance
+        self.ort.run(
+            None,
+            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
+        )
+        self.process = lambda spec: self.ort.run(
+            None, {"input": spec.cpu().numpy()}
+        )[0]
+        self.prog = None
+    @staticmethod
+    def get_hash(model_path):
+        try:
+            with open(model_path, "rb") as f:
+                f.seek(-10000 * 1024, 2)
+                model_hash = hashlib.md5(f.read()).hexdigest()
+        except: # noqa
+            model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
+        return model_hash
+    @staticmethod
+    def segment(
+        wave,
+        combine=True,
+        chunk_size=DEFAULT_CHUNK_SIZE,
+        margin_size=DEFAULT_MARGIN_SIZE,
+    ):
+        """
+        Segment or join segmented wave array
+        Args:
+            wave: (np.array) Wave array to be segmented or joined
+            combine: (bool) If True, combines segmented wave array.
+                If False, segments wave array.
+            chunk_size: (int) Size of each segment (in samples)
+            margin_size: (int) Size of margin between segments (in samples)
+        Returns:
+            numpy array: Segmented or joined wave array
+        """
+        if combine:
+            # Initializing as None instead of [] for later numpy array concatenation
+            processed_wave = None
+            for segment_count, segment in enumerate(wave):
+                start = 0 if segment_count == 0 else margin_size
+                end = None if segment_count == len(wave) - 1 else -margin_size
+                if margin_size == 0:
+                    end = None
+                if processed_wave is None:  # Create array for first segment
+                    processed_wave = segment[:, start:end]
+                else:  # Concatenate to existing array for subsequent segments
+                    processed_wave = np.concatenate(
+                        (processed_wave, segment[:, start:end]), axis=-1
+                    )
+        else:
+            processed_wave = []
+            sample_count = wave.shape[-1]
+            if chunk_size <= 0 or chunk_size > sample_count:
+                chunk_size = sample_count
+            if margin_size > chunk_size:
+                margin_size = chunk_size
+            for segment_count, skip in enumerate(
+                range(0, sample_count, chunk_size)
+            ):
+                margin = 0 if segment_count == 0 else margin_size
+                end = min(skip + chunk_size + margin_size, sample_count)
+                start = skip - margin
+                cut = wave[:, start:end].copy()
+                processed_wave.append(cut)
+                if end == sample_count:
+                    break
+        return processed_wave
+    def pad_wave(self, wave):
+        """
+        Pad the wave array to match the required chunk size
+        Args:
+            wave: (np.array) Wave array to be padded
+        Returns:
+            tuple: (padded_wave, pad, trim)
+                - padded_wave: Padded wave array
+                - pad: Number of samples that were padded
+                - trim: Number of samples that were trimmed
+        """
+        n_sample = wave.shape[1]
+        trim = self.model.n_fft // 2
+        gen_size = self.model.chunk_size - 2 * trim
+        pad = gen_size - n_sample % gen_size
+        # Padded wave
+        wave_p = np.concatenate(
+            (
+                np.zeros((2, trim)),
+                wave,
+                np.zeros((2, pad)),
+                np.zeros((2, trim)),
+            ),
+            1,
+        )
+        mix_waves = []
+        for i in range(0, n_sample + pad, gen_size):
+            waves = np.array(wave_p[:, i:i + self.model.chunk_size])
+            mix_waves.append(waves)
+        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(
+            self.device
+        )
+        return mix_waves, pad, trim
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
+        """
+        Process each wave segment in a multi-threaded environment
+        Args:
+            mix_waves: (torch.Tensor) Wave segments to be processed
+            trim: (int) Number of samples trimmed during padding
+            pad: (int) Number of samples padded during padding
+            q: (queue.Queue) Queue to hold the processed wave segments
+            _id: (int) Identifier of the processed wave segment
+        Returns:
+            numpy array: Processed wave segment
+        """
+        mix_waves = mix_waves.split(1)
+        with torch.no_grad():
+            pw = []
+            for mix_wave in mix_waves:
+                self.prog.update()
+                spec = self.model.stft(mix_wave)
+                processed_spec = torch.tensor(self.process(spec))
+                processed_wav = self.model.istft(
+                    processed_spec.to(self.device)
+                )
+                processed_wav = (
+                    processed_wav[:, :, trim:-trim]
+                    .transpose(0, 1)
+                    .reshape(2, -1)
+                    .cpu()
+                    .numpy()
+                )
+                pw.append(processed_wav)
+        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
+        q.put({_id: processed_signal})
+        return processed_signal
+    def process_wave(self, wave: np.array, mt_threads=1):
+        """
+        Process the wave array in a multi-threaded environment
+        Args:
+            wave: (np.array) Wave array to be processed
+            mt_threads: (int) Number of threads to be used for processing
+        Returns:
+            numpy array: Processed wave array
+        """
+        self.prog = tqdm(total=0)
+        chunk = wave.shape[-1] // mt_threads
+        waves = self.segment(wave, False, chunk)
+        # Create a queue to hold the processed wave segments
+        q = queue.Queue()
+        threads = []
+        for c, batch in enumerate(waves):
+            mix_waves, pad, trim = self.pad_wave(batch)
+            self.prog.total = len(mix_waves) * mt_threads
+            thread = threading.Thread(
+                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
+            )
+            thread.start()
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+        self.prog.close()
+        processed_batches = []
+        while not q.empty():
+            processed_batches.append(q.get())
+        processed_batches = [
+            list(wave.values())[0]
+            for wave in sorted(
+                processed_batches, key=lambda d: list(d.keys())[0]
+            )
+        ]
+        assert len(processed_batches) == len(
+            waves
+        ), "Incomplete processed batches, please reduce batch size!"
+        return self.segment(processed_batches, True, chunk)
+def run_mdx(
+    model_params,
+    output_dir,
+    model_path,
+    filename,
+    exclude_main=False,
+    exclude_inversion=False,
+    suffix=None,
+    invert_suffix=None,
+    denoise=False,
+    keep_orig=True,
+    m_threads=2,
+    device_base="cuda",
+):
+    if device_base == "cuda":
+        device = torch.device("cuda:0")
+        processor_num = 0
+        device_properties = torch.cuda.get_device_properties(device)
+        vram_gb = device_properties.total_memory / 1024**3
+        m_threads = 1 if vram_gb < 8 else 2
+    else:
+        device = torch.device("cpu")
+        processor_num = -1
+        m_threads = 1
+    if os.environ.get("ZERO_GPU") == "TRUE":
+        duration = librosa.get_duration(filename=filename)
+        if duration < 60:
+            pass
+        elif duration >= 60 and duration <= 900:
+            m_threads = 4
+        elif duration > 900:
+            m_threads = 16
+    logger.info(f"MDX-NET Threads: {m_threads}, duration {duration}")
+    model_hash = MDX.get_hash(model_path)
+    mp = model_params.get(model_hash)
+    model = MDXModel(
+        device,
+        dim_f=mp["mdx_dim_f_set"],
+        dim_t=2 ** mp["mdx_dim_t_set"],
+        n_fft=mp["mdx_n_fft_scale_set"],
+        stem_name=mp["primary_stem"],
+        compensation=mp["compensate"],
+    )
+    mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(filename, mono=False, sr=44100)
+    # normalizing input wave gives better output
+    peak = max(np.max(wave), abs(np.min(wave)))
+    wave /= peak
+    if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
+            mdx_sess.process_wave(wave, m_threads)
+        )
+        wave_processed *= 0.5
+    else:
+        wave_processed = mdx_sess.process_wave(wave, m_threads)
+    # return to previous peak
+    wave_processed *= peak
+    stem_name = model.stem_name if suffix is None else suffix
+    main_filepath = None
+    if not exclude_main:
+        main_filepath = os.path.join(
+            output_dir,
+            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
+        )
+        sf.write(main_filepath, wave_processed.T, sr)
+    invert_filepath = None
+    if not exclude_inversion:
+        diff_stem_name = (
+            stem_naming.get(stem_name)
+            if invert_suffix is None
+            else invert_suffix
+        )
+        stem_name = (
+            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
+        )
+        invert_filepath = os.path.join(
+            output_dir,
+            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
+        )
+        sf.write(
+            invert_filepath,
+            (-wave_processed.T * model.compensation) + wave.T,
+            sr,
+        )
+    if not keep_orig:
+        os.remove(filename)
+    del mdx_sess, wave_processed, wave
+    gc.collect()
+    torch.cuda.empty_cache()
+    return main_filepath, invert_filepath
+# Base URL the model files listed in UVR_MODELS are downloaded from.
+MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
+# ONNX model files used by process_uvr_task.
+UVR_MODELS = [
+    "UVR-MDX-NET-Voc_FT.onnx",
+    "UVR_MDXNET_KARA_2.onnx",
+    "Reverb_HQ_By_FoxJoy.onnx",
+    "UVR-MDX-NET-Inst_HQ_4.onnx",
+]
+BASE_DIR = "."  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Directory holding the ONNX models and the data.json parameter file.
+mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
+# Root directory for separated stems; each task writes into a song_id
+# subfolder of it.
+output_dir = os.path.join(BASE_DIR, "clean_song_output")
+def convert_to_stereo_and_wav(audio_path):
+    wave, sr = librosa.load(audio_path, mono=False, sr=44100)
+    # check if mono
+    if type(wave[0]) != np.ndarray or audio_path[-4:].lower() != ".wav": # noqa
+        stereo_path = f"{os.path.splitext(audio_path)[0]}_stereo.wav"
+        stereo_path = os.path.join(output_dir, stereo_path)
+        command = shlex.split(
+            f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 2 -f wav "{stereo_path}"'
+        )
+        sub_params = {
+            "stdout": subprocess.PIPE,
+            "stderr": subprocess.PIPE,
+            "creationflags": subprocess.CREATE_NO_WINDOW
+            if sys.platform == "win32"
+            else 0,
+        }
+        process_wav = subprocess.Popen(command, **sub_params)
+        output, errors = process_wav.communicate()
+        if process_wav.returncode != 0 or not os.path.exists(stereo_path):
+            raise Exception("Error processing audio to stereo wav")
+        return stereo_path
+    else:
+        return audio_path
+def process_uvr_task(
+    orig_song_path: str = "aud_test.mp3",
+    main_vocals: bool = False,
+    dereverb: bool = True,
+    song_id: str = "mdx",  # folder output name
+    only_voiceless: bool = False,
+    remove_files_output_dir: bool = False,
+):
+    if os.environ.get("QUANTUM_DEVICE") == "cpu":
+        device_base = "cpu"
+    else:
+        device_base = "cuda" if torch.cuda.is_available() else "cpu"
+    if remove_files_output_dir:
+        remove_directory_contents(output_dir)
+    with open(os.path.join(mdxnet_models_dir, "data.json")) as infile:
+        mdx_model_params = json.load(infile)
+    song_output_dir = os.path.join(output_dir, song_id)
+    create_directories(song_output_dir)
+    orig_song_path = convert_to_stereo_and_wav(orig_song_path)
+    logger.debug(f"onnxruntime device >> {ort.get_device()}")
+    if only_voiceless:
+        logger.info("Voiceless Track Separation...")
+        return run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
+            orig_song_path,
+            suffix="Voiceless",
+            denoise=False,
+            keep_orig=True,
+            exclude_inversion=True,
+            device_base=device_base,
+        )
+    logger.info("Vocal Track Isolation and Voiceless Track Separation...")
+    vocals_path, instrumentals_path = run_mdx(
+        mdx_model_params,
+        song_output_dir,
+        os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
+        orig_song_path,
+        denoise=True,
+        keep_orig=True,
+        device_base=device_base,
+    )
+    if main_vocals:
+        logger.info("Main Voice Separation from Supporting Vocals...")
+        backup_vocals_path, main_vocals_path = run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
+            vocals_path,
+            suffix="Backup",
+            invert_suffix="Main",
+            denoise=True,
+            device_base=device_base,
+        )
+    else:
+        backup_vocals_path, main_vocals_path = None, vocals_path
+    if dereverb:
+        logger.info("Vocal Clarity Enhancement through De-Reverberation...")
+        _, vocals_dereverb_path = run_mdx(
+            mdx_model_params,
+            song_output_dir,
+            os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
+            main_vocals_path,
+            invert_suffix="DeReverb",
+            exclude_main=True,
+            denoise=True,
+            device_base=device_base,
+        )
+    else:
+        vocals_dereverb_path = main_vocals_path
+    return (
+        vocals_path,
+        instrumentals_path,
+        backup_vocals_path,
+        main_vocals_path,
+        vocals_dereverb_path,
+    )
+if __name__ == "__main__":
+    from utils import download_manager
+    for id_model in UVR_MODELS:
+        download_manager(
+            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
+        )
+    (
+        vocals_path_,
+        instrumentals_path_,
+        backup_vocals_path_,
+        main_vocals_path_,
+        vocals_dereverb_path_,
+    ) = process_uvr_task(
+        orig_song_path="aud.mp3",
+        main_vocals=True,
+        dereverb=True,
+        song_id="mdx",
+        remove_files_output_dir=True,
+    )

quantum_dubbing/postprocessor.py ADDED Viewed

	@@ -0,0 +1,231 @@

+from .utils import remove_files, run_command
+from .text_multiformat_processor import get_subtitle
+from .logging_setup import logger
+import unicodedata
+import shutil
+import copy
+import os
+import re
+# Labels of the supported output types for media input.
+# NOTE(review): presumably offered as choices in the GUI - confirm against
+# the code that consumes this list.
+OUTPUT_TYPE_OPTIONS = [
+    "video (mp4)",
+    "video (mkv)",
+    "audio (mp3)",
+    "audio (ogg)",
+    "audio (wav)",
+    "subtitle",
+    "subtitle [by speaker]",
+    "video [subtitled] (mp4)",
+    "video [subtitled] (mkv)",
+    "audio [original vocal sound]",
+    "audio [original background sound]",
+    "audio [original vocal and background sound]",
+    "audio [original vocal-dereverb sound]",
+    "audio [original vocal-dereverb and background sound]",
+    "raw media",
+]
+# Labels of the supported output types for document input (same caveat).
+DOCS_OUTPUT_TYPE_OPTIONS = [
+    "videobook (mp4)",
+    "videobook (mkv)",
+    "audiobook (wav)",
+    "audiobook (mp3)",
+    "audiobook (ogg)",
+    "book (txt)",
+]  # Add DOCX and etc.
+def get_no_ext_filename(file_path):
+    file_name_with_extension = os.path.basename(rf"{file_path}")
+    filename_without_extension, _ = os.path.splitext(file_name_with_extension)
+    return filename_without_extension
+def get_video_info(link):
+    aux_name = f"video_url_{link}"
+    params_dlp = {"quiet": True, "no_warnings": True, "noplaylist": True}
+    try:
+        from yt_dlp import YoutubeDL
+        with YoutubeDL(params_dlp) as ydl:
+            if link.startswith(("www.youtube.com/", "m.youtube.com/")):
+                link = "https://" + link
+            info_dict = ydl.extract_info(link, download=False, process=False)
+            video_id = info_dict.get("id", aux_name)
+            video_title = info_dict.get("title", video_id)
+            if "youtube.com" in link and "&list=" in link:
+                video_title = ydl.extract_info(
+                    "https://m.youtube.com/watch?v="+video_id,
+                    download=False,
+                    process=False
+                ).get("title", video_title)
+    except Exception as error:
+        logger.error(str(error))
+        video_title, video_id = aux_name, "NO_ID"
+    return video_title, video_id
+def sanitize_file_name(file_name):
+    # Normalize the string to NFKD form to separate combined
+    # characters into base characters and diacritics
+    normalized_name = unicodedata.normalize("NFKD", file_name)
+    # Replace any non-ASCII characters or special symbols with an underscore
+    sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name)
+    return sanitized_name
+def get_output_file(
+        original_file,
+        new_file_name,
+        soft_subtitles,
+        output_directory="",
+):
+    directory_base = "."  # default directory
+    if output_directory and os.path.isdir(output_directory):
+        new_file_path = os.path.join(output_directory, new_file_name)
+    else:
+        new_file_path = os.path.join(directory_base, "outputs", new_file_name)
+    remove_files(new_file_path)
+    cm = None
+    if soft_subtitles and original_file.endswith(".mp4"):
+        if new_file_path.endswith(".mp4"):
+            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s mov_text "{new_file_path}"'
+        else:
+            cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s srt -movflags use_metadata_tags -map_metadata 0 "{new_file_path}"'
+    elif new_file_path.endswith(".mkv"):
+        cm = f'ffmpeg -i "{original_file}" -c:v copy -c:a copy "{new_file_path}"'
+    elif new_file_path.endswith(".wav") and not original_file.endswith(".wav"):
+        cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"'
+    elif new_file_path.endswith(".ogg"):
+        cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"'
+    elif new_file_path.endswith(".mp3") and not original_file.endswith(".mp3"):
+        cm = f'ffmpeg -y -i "{original_file}" -codec:a libmp3lame -qscale:a 2 "{new_file_path}"'
+    if cm:
+        try:
+            run_command(cm)
+        except Exception as error:
+            logger.error(str(error))
+            remove_files(new_file_path)
+            shutil.copy2(original_file, new_file_path)
+    else:
+        shutil.copy2(original_file, new_file_path)
+    return os.path.abspath(new_file_path)
+def media_out(
+    media_file,
+    lang_code,
+    media_out_name="",
+    extension="mp4",
+    file_obj="video_dub.mp4",
+    soft_subtitles=False,
+    subtitle_files="disable",
+):
+    if media_out_name:
+        base_name = media_out_name + "_origin"
+    else:
+        if os.path.exists(media_file):
+            base_name = get_no_ext_filename(media_file)
+        else:
+            base_name, _ = get_video_info(media_file)
+        media_out_name = f"{base_name}__{lang_code}"
+    f_name = f"{sanitize_file_name(media_out_name)}.{extension}"
+    if subtitle_files != "disable":
+        final_media = [get_output_file(file_obj, f_name, soft_subtitles)]
+        name_tra = f"{sanitize_file_name(media_out_name)}.{subtitle_files}"
+        name_ori = f"{sanitize_file_name(base_name)}.{subtitle_files}"
+        tgt_subs = f"sub_tra.{subtitle_files}"
+        ori_subs = f"sub_ori.{subtitle_files}"
+        final_subtitles = [
+            get_output_file(tgt_subs, name_tra, False),
+            get_output_file(ori_subs, name_ori, False)
+        ]
+        return final_media + final_subtitles
+    else:
+        return get_output_file(file_obj, f_name, soft_subtitles)
+def get_subtitle_speaker(media_file, result, language, extension, base_name):
+    segments_base = copy.deepcopy(result)
+    # Sub segments by speaker
+    segments_by_speaker = {}
+    for segment in segments_base["segments"]:
+        if segment["speaker"] not in segments_by_speaker.keys():
+            segments_by_speaker[segment["speaker"]] = [segment]
+        else:
+            segments_by_speaker[segment["speaker"]].append(segment)
+    if not base_name:
+        if os.path.exists(media_file):
+            base_name = get_no_ext_filename(media_file)
+        else:
+            base_name, _ = get_video_info(media_file)
+    files_subs = []
+    for name_sk, segments in segments_by_speaker.items():
+        subtitle_speaker = get_subtitle(
+            language,
+            {"segments": segments},
+            extension,
+            filename=name_sk,
+        )
+        media_out_name = f"{base_name}_{language}_{name_sk}"
+        output = media_out(
+            media_file,  # no need
+            language,
+            media_out_name,
+            extension,
+            file_obj=subtitle_speaker,
+        )
+        files_subs.append(output)
+    return files_subs
+def sound_separate(media_file, task_uvr):
+    from .mdx_net import process_uvr_task
+    outputs = []
+    if "vocal" in task_uvr:
+        try:
+            _, _, _, _, vocal_audio = process_uvr_task(
+                orig_song_path=media_file,
+                main_vocals=False,
+                dereverb=True if "dereverb" in task_uvr else False,
+                remove_files_output_dir=True,
+            )
+            outputs.append(vocal_audio)
+        except Exception as error:
+            logger.error(str(error))
+    if "background" in task_uvr:
+        try:
+            background_audio, _ = process_uvr_task(
+                orig_song_path=media_file,
+                song_id="voiceless",
+                only_voiceless=True,
+                remove_files_output_dir=False if "vocal" in task_uvr else True,
+            )
+            # copy_files(background_audio, ".")
+            outputs.append(background_audio)
+        except Exception as error:
+            logger.error(str(error))
+    if not outputs:
+        raise Exception("Error in uvr process")
+    return outputs

quantum_dubbing/preprocessor.py ADDED Viewed

	@@ -0,0 +1,309 @@

+from .utils import remove_files
+import os, shutil, subprocess, time, shlex, sys # noqa
+from .logging_setup import logger
+import json
+# Video codecs (with their typical container) flagged as problematic.
+# NOTE(review): not referenced in the visible part of this file; the
+# "fix final merge" note suggests they break the final merge - confirm.
+ERROR_INCORRECT_CODEC_PARAMETERS = [
+    "prores",  # mov
+    "ffv1",  # mkv
+    "msmpeg4v3",  # avi
+    "wmv2",  # wmv
+    "theora",  # ogv
+]  # fix final merge
+# Video codecs that audio_video_preprocessor accepts without re-encoding
+# to h264.
+TESTED_CODECS = [
+    "h264",  # mp4
+    "h265",  # mp4
+    "hevc",  # test
+    "vp9",  # webm
+    "mpeg4",  # mp4
+    "mpeg2video",  # mpg
+    "mjpeg",  # avi
+]
+class OperationFailedError(Exception):
+    def __init__(self, message="The operation did not complete successfully."):
+        self.message = message
+        super().__init__(self.message)
+def get_video_codec(video_file):
+    command_base = rf'ffprobe -v error -select_streams v:0 -show_entries stream=codec_name -of json "{video_file}"'
+    command = shlex.split(command_base)
+    try:
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            creationflags=subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0,
+        )
+        output, _ = process.communicate()
+        codec_info = json.loads(output.decode('utf-8'))
+        codec_name = codec_info['streams'][0]['codec_name']
+        return codec_name
+    except Exception as error:
+        logger.debug(str(error))
+        return None
+def audio_preprocessor(preview, base_audio, audio_wav, use_cuda=False):
+    base_audio = base_audio.strip()
+    previous_files_to_remove = [audio_wav]
+    remove_files(previous_files_to_remove)
+    if preview:
+        logger.warning(
+            "Creating a preview video of 10 seconds, to disable "
+            "this option, go to advanced settings and turn off preview."
+        )
+        wav_ = f'ffmpeg -y -i "{base_audio}" -ss 00:00:20 -t 00:00:10 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav'
+    else:
+        wav_ = f'ffmpeg -y -i "{base_audio}" -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav'
+    # Run cmd process
+    sub_params = {
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+        "creationflags": subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    }
+    wav_ = shlex.split(wav_)
+    result_convert_audio = subprocess.Popen(wav_, **sub_params)
+    output, errors = result_convert_audio.communicate()
+    time.sleep(1)
+    if result_convert_audio.returncode in [1, 2] or not os.path.exists(
+        audio_wav
+    ):
+        raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}")
+def audio_video_preprocessor(
+    preview, video, OutputFile, audio_wav, use_cuda=False
+):
+    video = video.strip()
+    previous_files_to_remove = [OutputFile, "audio.webm", audio_wav]
+    remove_files(previous_files_to_remove)
+    if os.path.exists(video):
+        if preview:
+            logger.warning(
+                "Creating a preview video of 10 seconds, "
+                "to disable this option, go to advanced "
+                "settings and turn off preview."
+            )
+            mp4_ = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4'
+        else:
+            video_codec = get_video_codec(video)
+            if not video_codec:
+                logger.debug("No video codec found in video")
+            else:
+                logger.info(f"Video codec: {video_codec}")
+            # Check if the file ends with ".mp4" extension or is valid codec
+            if video.endswith(".mp4") or video_codec in TESTED_CODECS:
+                destination_path = os.path.join(os.getcwd(), "Video.mp4")
+                shutil.copy(video, destination_path)
+                time.sleep(0.5)
+                if os.path.exists(OutputFile):
+                    mp4_ = "ffmpeg -h"
+                else:
+                    mp4_ = f'ffmpeg -y -i "{video}" -c copy Video.mp4'
+            else:
+                logger.warning(
+                    "File does not have the '.mp4' extension  or a "
+                    "supported codec. Converting video to mp4 (codec: h264)."
+                )
+                mp4_ = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4'
+    else:
+        if preview:
+            logger.warning(
+                "Creating a preview from the link, 10 seconds "
+                "to disable this option, go to advanced "
+                "settings and turn off preview."
+            )
+            # https://github.com/yt-dlp/yt-dlp/issues/2220
+            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+        else:
+            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+            wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}"
+    # Run cmd process
+    mp4_ = shlex.split(mp4_)
+    sub_params = {
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+        "creationflags": subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    }
+    if os.path.exists(video):
+        logger.info("Process video...")
+        result_convert_video = subprocess.Popen(mp4_, **sub_params)
+        # result_convert_video.wait()
+        output, errors = result_convert_video.communicate()
+        time.sleep(1)
+        if result_convert_video.returncode in [1, 2] or not os.path.exists(
+            OutputFile
+        ):
+            raise OperationFailedError(f"Error processing video:\n{errors.decode('utf-8')}")
+        logger.info("Process audio...")
+        wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+        wav_ = shlex.split(wav_)
+        result_convert_audio = subprocess.Popen(wav_, **sub_params)
+        output, errors = result_convert_audio.communicate()
+        time.sleep(1)
+        if result_convert_audio.returncode in [1, 2] or not os.path.exists(
+            audio_wav
+        ):
+            raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}")
+    else:
+        wav_ = shlex.split(wav_)
+        if preview:
+            result_convert_video = subprocess.Popen(mp4_, **sub_params)
+            output, errors = result_convert_video.communicate()
+            time.sleep(0.5)
+            result_convert_audio = subprocess.Popen(wav_, **sub_params)
+            output, errors = result_convert_audio.communicate()
+            time.sleep(0.5)
+            if result_convert_audio.returncode in [1, 2] or not os.path.exists(
+                audio_wav
+            ):
+                raise OperationFailedError(
+                    f"Error can't create the preview file:\n{errors.decode('utf-8')}"
+                )
+        else:
+            logger.info("Process audio...")
+            result_convert_audio = subprocess.Popen(wav_, **sub_params)
+            output, errors = result_convert_audio.communicate()
+            time.sleep(1)
+            if result_convert_audio.returncode in [1, 2] or not os.path.exists(
+                audio_wav
+            ):
+                raise OperationFailedError(f"Error can't download the audio:\n{errors.decode('utf-8')}")
+            logger.info("Process video...")
+            result_convert_video = subprocess.Popen(mp4_, **sub_params)
+            output, errors = result_convert_video.communicate()
+            time.sleep(1)
+            if result_convert_video.returncode in [1, 2] or not os.path.exists(
+                OutputFile
+            ):
+                raise OperationFailedError(f"Error can't download the video:\n{errors.decode('utf-8')}")
def _old_preprocessor_run(args):
    """Run an external tool from an argument list (no shell), capturing output."""
    try:
        return subprocess.run(args, capture_output=True, text=True)
    except OSError as error:
        # Typically the executable is not installed / not on PATH.
        raise OperationFailedError(
            f"Error can't run '{args[0]}': {error}"
        ) from error


def _old_preprocessor_wait(
    condition, progress_message, error_message, attempts=120
):
    """Poll `condition` once per second, logging progress, until it is true."""
    for _ in range(attempts):
        time.sleep(1)
        logger.info(progress_message)
        if condition():
            return
    raise OperationFailedError(error_message)


def old_audio_video_preprocessor(preview, video, OutputFile, audio_wav):
    """
    Legacy routine that produces a video file and a WAV track from either a
    local media file or a link downloaded with yt-dlp.
    Parameters:
    - preview (bool): When true, only a 10 second clip starting at 00:00:20
        is produced.
    - video (str): Path of a local media file; when no such path exists the
        value is stripped and handed to yt-dlp as a link.
    - OutputFile (str): Path removed at start, used as the yt-dlp video
        output and polled for existence after local conversion.
    - audio_wav (str): Path removed at start, used as the yt-dlp audio
        output and polled for existence after extraction.
    Raises:
    - OperationFailedError: When a command exits with code 1 or 2, when a
        required executable cannot be started, or when an expected file does
        not appear within 120 polling attempts.
    Notes:
    - Commands are passed as argument lists without a shell, so paths and
        links containing quotes, spaces or '&' are forwarded verbatim
        instead of being interpreted by the shell.
    - The ffmpeg steps always write "Video.mp4" and "audio.wav" in the
        current working directory.
    """
    remove_files([OutputFile, "audio.webm", audio_wav])

    failed_codes = (1, 2)
    to_h264_mp4 = [
        "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
        "Video.mp4",
    ]
    extract_audio = [
        "ffmpeg", "-y", "-i", "Video.mp4", "-vn", "-acodec", "pcm_s16le",
        "-ar", "44100", "-ac", "2", "audio.wav",
    ]

    if os.path.exists(video):
        if preview:
            logger.warning(
                "Creating a preview video of 10 seconds, "
                "to disable this option, go to advanced "
                "settings and turn off preview."
            )
            convert_code = _old_preprocessor_run(
                ["ffmpeg", "-y", "-i", video, "-ss", "00:00:20",
                 "-t", "00:00:10"] + to_h264_mp4
            ).returncode
        elif video.endswith(".mp4"):
            # Already an mp4: a plain copy replaces the conversion step.
            shutil.copy(video, os.path.join(os.getcwd(), "Video.mp4"))
            convert_code = 0
        else:
            logger.warning(
                "File does not have the '.mp4' extension. Converting video."
            )
            convert_code = _old_preprocessor_run(
                ["ffmpeg", "-y", "-i", video] + to_h264_mp4
            ).returncode
        if convert_code in failed_codes:
            raise OperationFailedError("Error can't convert the video")

        _old_preprocessor_wait(
            lambda: os.path.exists(OutputFile),
            "Process video...",
            "Error processing video",
        )
        time.sleep(1)
        audio_result = _old_preprocessor_run(extract_audio)
        time.sleep(1)
        if audio_result.returncode in failed_codes:
            raise OperationFailedError(
                f"Error can't create the audio file: {audio_result.stderr}"
            )
        _old_preprocessor_wait(
            lambda: os.path.exists(audio_wav),
            "Process audio...",
            "Error can't create the audio file",
        )
        return

    # Not a local path: treat the value as a link for yt-dlp.
    video = video.strip()
    download_flags = [
        "--force-overwrites", "--max-downloads", "1", "--no-warnings",
        "--no-abort-on-error", "--ignore-no-formats-error",
    ]
    save_video = ["--restrict-filenames", "-o", OutputFile, video]

    if preview:
        logger.warning(
            "Creating a preview from the link, 10 "
            "seconds to disable this option, go to "
            "advanced settings and turn off preview."
        )
        # https://github.com/yt-dlp/yt-dlp/issues/2220
        _old_preprocessor_run(
            ["yt-dlp", "-f", "mp4", "--downloader", "ffmpeg",
             "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10"]
            + download_flags + save_video
        )
        # Only the audio extraction result decides success here.
        audio_result = _old_preprocessor_run(extract_audio)
        if audio_result.returncode in failed_codes:
            raise OperationFailedError("Error can't download a preview")
        return

    audio_result = _old_preprocessor_run(
        ["python", "-m", "yt_dlp", "--output", audio_wav]
        + download_flags
        + ["--extract-audio", "--audio-format", "wav", video]
    )
    if audio_result.returncode in failed_codes:
        raise OperationFailedError("Error can't download the audio")
    # Wait until the wav exists and the intermediate "audio.webm" is gone.
    _old_preprocessor_wait(
        lambda: os.path.exists(audio_wav)
        and not os.path.exists("audio.webm"),
        "Process audio...",
        "Error downloading the audio",
    )
    time.sleep(1)
    video_result = _old_preprocessor_run(
        ["yt-dlp", "-f", "mp4"] + download_flags + save_video
    )
    if video_result.returncode in failed_codes:
        raise OperationFailedError("Error can't download the video")

quantum_dubbing/speech_segmentation.py ADDED Viewed

	@@ -0,0 +1,499 @@

+from whisperx.alignment import (
+    DEFAULT_ALIGN_MODELS_TORCH as DAMT,
+    DEFAULT_ALIGN_MODELS_HF as DAMHF,
+)
+from whisperx.utils import TO_LANGUAGE_CODE
+import whisperx
+import torch
+import gc
+import os
+import soundfile as sf
+from IPython.utils import capture # noqa
+from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES
+from .logging_setup import logger
+from .postprocessor import sanitize_file_name
+from .utils import remove_directory_contents, run_command
+# ZERO GPU CONFIG
+import spaces
+import copy
+import random
+import time
def random_sleep():
    """
    Pause for a random interval when the ZERO_GPU environment variable
    equals "TRUE"; otherwise return immediately.
    Notes:
    - The delay is drawn uniformly between 7.2 and 9.9 seconds and rounded
        to one decimal place.
    """
    if os.environ.get("ZERO_GPU") != "TRUE":
        return
    print("Random sleep")
    delay = round(random.uniform(7.2, 9.9), 1)
    time.sleep(delay)
@spaces.GPU
def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit):
    """
    Load a whisperx model, transcribe the audio with it and release the model.
    Parameters:
    - asr_model: Model identifier forwarded to whisperx.load_model.
    - audio: Audio input forwarded to model.transcribe.
    - compute_type: Forwarded as compute_type to whisperx.load_model.
    - language: Forwarded as language to whisperx.load_model.
    - asr_options: Forwarded as asr_options to whisperx.load_model.
    - batch_size: Forwarded as batch_size to model.transcribe.
    - segment_duration_limit: Forwarded as chunk_size to model.transcribe.
    Returns:
    - The value returned by model.transcribe.
    """
    # With ZERO_GPU == "TRUE" the model is always placed on "cuda";
    # otherwise the device comes from QUANTUM_DEVICE.
    if os.environ.get("ZERO_GPU") == "TRUE":
        device = "cuda"
    else:
        device = os.environ.get("QUANTUM_DEVICE")

    whisper_model = whisperx.load_model(
        asr_model,
        device,
        compute_type=compute_type,
        language=language,
        asr_options=asr_options,
    )
    transcription = whisper_model.transcribe(
        audio,
        batch_size=batch_size,
        chunk_size=segment_duration_limit,
        print_progress=True,
    )

    # Drop the model before returning so its memory can be reclaimed.
    del whisper_model
    gc.collect()
    torch.cuda.empty_cache()  # noqa
    return transcription
def load_align_and_align_segments(result, audio, DAMHF):
    """
    Load a whisperx alignment model for the result's language and align the
    segments against the audio.
    Parameters:
    - result (dict): Must provide "language" and "segments".
    - audio: Audio input forwarded to whisperx.align.
    - DAMHF (dict): Languages that have a default alignment model; for any
        other language the model name is looked up in EXTRA_ALIGN.
    Returns:
    - The value returned by whisperx.align (requested with character
        alignments).
    """
    # With ZERO_GPU == "TRUE" alignment runs on "cpu"; otherwise the device
    # comes from QUANTUM_DEVICE.
    if os.environ.get("ZERO_GPU") == "TRUE":
        device = "cpu"
    else:
        device = os.environ.get("QUANTUM_DEVICE")

    language = result["language"]
    custom_model = None if language in DAMHF else EXTRA_ALIGN[language]

    align_model, align_metadata = whisperx.load_align_model(
        language_code=language,
        device=device,
        model_name=custom_model,
    )
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        audio,
        device,
        return_char_alignments=True,
        print_progress=False,
    )

    # Drop the model before returning so its memory can be reclaimed.
    del align_model
    gc.collect()
    torch.cuda.empty_cache()  # noqa
    return aligned
@spaces.GPU
def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers):
    """
    Run the diarization pipeline on the audio.
    Parameters:
    - diarize_model: Callable pipeline; its `.model` is moved to CUDA first
        when ZERO_GPU equals "TRUE".
    - audio_wav: Audio input forwarded to the pipeline.
    - min_speakers: Forwarded as min_speakers.
    - max_speakers: Forwarded as max_speakers.
    Returns:
    - The value returned by the pipeline call.
    """
    running_on_zero_gpu = os.environ.get("ZERO_GPU") == "TRUE"
    if running_on_zero_gpu:
        diarize_model.model.to(torch.device("cuda"))
    return diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
    )
+# ZERO GPU CONFIG
# Model names recognised directly by transcribe_speech; any other name is
# treated there as a model to convert and store under WHISPER_MODELS_PATH.
ASR_MODEL_OPTIONS = [
    # names without the ".en" suffix
    "tiny", "base", "small", "medium",
    "large", "large-v1", "large-v2", "large-v3",
    "distil-large-v2",
    "Systran/faster-distil-whisper-large-v3",
    # names ending in ".en"
    "tiny.en", "base.en", "small.en", "medium.en",
    "distil-small.en", "distil-medium.en",
    # routed to openai_api_whisper instead of a local model
    "OpenAI_API_Whisper",
]

# compute_type choices (presumably offered when running on a GPU).
COMPUTE_TYPE_GPU = [
    "default", "auto",
    "int8", "int8_float32", "int8_float16", "int8_bfloat16",
    "float16", "bfloat16", "float32",
]

# compute_type choices (presumably offered when running on the CPU).
COMPUTE_TYPE_CPU = [
    "default", "auto",
    "int8", "int8_float32", "int16",
    "float32",
]

# Directory scanned by find_whisper_models and used for converted models.
WHISPER_MODELS_PATH = "./WHISPER_MODELS"
def openai_api_whisper(
    input_audio_file,
    source_lang=None,
    chunk_duration=1800
):
    """
    Transcribe an audio file with OpenAI's hosted "whisper-1" model.
    Parameters:
    - input_audio_file (str): Path of the audio file to transcribe.
    - source_lang (str | None): Language passed to the API; when falsy, the
        language reported for the first chunk is used for the rest.
    - chunk_duration (int): Maximum chunk length passed to ffmpeg's
        -segment_time (default 1800, i.e. 30-minute chunks).
    Returns:
    - Tuple containing:
        - audio: The input loaded with whisperx.load_audio.
        - result: Dict with "segments" (text/start/end, with start and end
            shifted by each chunk's offset) and "language".
    Notes:
    - Chunks are re-encoded to Ogg Vorbis inside
        "./whisper_api_audio_parts", which is emptied first.
    """
    duration = sf.info(input_audio_file).duration

    output_directory = "./whisper_api_audio_parts"
    os.makedirs(output_directory, exist_ok=True)
    remove_directory_contents(output_directory)

    if duration > chunk_duration:
        # Split the audio file into chunks of at most chunk_duration
        cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"'
        run_command(cm)
        # Sorted so that the index of a chunk gives its time offset
        chunk_files = sorted(
            f"{output_directory}/{f}"
            for f in os.listdir(output_directory)
            if f.endswith('.ogg')
        )
    else:
        one_file = f"{output_directory}/output000.ogg"
        cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}'
        run_command(cm)
        chunk_files = [one_file]

    # Imported lazily so the dependency is only needed for this backend;
    # one client is reused for every chunk.
    from openai import OpenAI
    client = OpenAI()

    segments = []
    language = source_lang if source_lang else None
    for i, chunk in enumerate(chunk_files):
        # The context manager closes the chunk even if the request fails.
        with open(chunk, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=language,
                response_format="verbose_json",
                timestamp_granularities=["segment"],
            )
        try:
            transcript_dict = transcription.model_dump()
        except Exception:
            # Fallback for response objects without a working model_dump()
            transcript_dict = transcription.to_dict()

        if language is None:
            detected = transcript_dict["language"]
            logger.info(f'Language detected: {detected}')
            # Keep the reported value when it has no entry in the table
            # instead of failing with a KeyError.
            language = TO_LANGUAGE_CODE.get(detected, detected)

        chunk_time = chunk_duration * i
        for seg in transcript_dict["segments"]:
            if "start" in seg:
                segments.append(
                    {
                        "text": seg["text"],
                        "start": seg["start"] + chunk_time,
                        "end": seg["end"] + chunk_time,
                    }
                )

    audio = whisperx.load_audio(input_audio_file)
    result = {"segments": segments, "language": language}
    return audio, result
def find_whisper_models():
    """
    List the sub-folders of WHISPER_MODELS_PATH that contain a "model.bin".
    Returns:
    - list of folder names (not full paths), in os.listdir order; empty
        when WHISPER_MODELS_PATH does not exist.
    """
    root = WHISPER_MODELS_PATH
    if not os.path.exists(root):
        return []

    found = []
    for name in os.listdir(root):
        candidate = os.path.join(root, name)
        if not os.path.isdir(candidate):
            continue
        if 'model.bin' in os.listdir(candidate):
            found.append(name)
    return found
def transcribe_speech(
    audio_wav,
    asr_model,
    compute_type,
    batch_size,
    SOURCE_LANGUAGE,
    literalize_numbers=True,
    segment_duration_limit=15,
):
    """
    Transcribe speech with a whisper model or with OpenAI's API.
    Parameters:
    - audio_wav (str): Path to the audio file.
    - asr_model (str): One of ASR_MODEL_OPTIONS, or another model name that
        is converted with ctranslate2 into WHISPER_MODELS_PATH on first use.
    - compute_type (str): Type of compute to be used (e.g., 'int8',
        'float16').
    - batch_size (int): Batch size for transcription.
    - SOURCE_LANGUAGE (str): Source language; "zh-TW" is transcribed as "zh".
    - literalize_numbers (bool): Passed as the "suppress_numerals" ASR
        option; not supported by the OpenAI API backend.
    - segment_duration_limit (int): Passed as chunk_size to the transcriber.
    Returns:
    - Tuple containing:
        - audio: Loaded audio file.
        - result: Transcription result as a dictionary.
    """
    # Hosted backend: nothing below applies.
    if asr_model == "OpenAI_API_Whisper":
        if literalize_numbers:
            logger.info(
                "OpenAI's API Whisper does not support "
                "the literalization of numbers."
            )
        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)

    # https://github.com/openai/whisper/discussions/277
    prompt = None
    if SOURCE_LANGUAGE == "zh":
        prompt = "以下是普通话的句子。"
    if SOURCE_LANGUAGE == "zh-TW":
        SOURCE_LANGUAGE = "zh"

    asr_options = {
        "initial_prompt": prompt,
        "suppress_numerals": literalize_numbers
    }

    is_custom_model = asr_model not in ASR_MODEL_OPTIONS
    if is_custom_model:
        models_root = WHISPER_MODELS_PATH
        if not os.path.exists(models_root):
            os.makedirs(models_root)
        model_dir = os.path.join(models_root, sanitize_file_name(asr_model))

        if not os.path.exists(model_dir):
            from ctranslate2.converters import TransformersConverter
            quantization = "float32"
            # Download and convert the new model
            try:
                converter = TransformersConverter(
                    asr_model,
                    low_cpu_mem_usage=True,
                    copy_files=[
                        "tokenizer_config.json", "preprocessor_config.json"
                    ]
                )
                converter.convert(
                    model_dir,
                    quantization=quantization,
                    force=False
                )
            except Exception as error:
                missing_config = "File tokenizer_config.json does not exist"
                if missing_config not in str(error):
                    raise error
                # Retry copying tokenizer.json instead of the missing file
                converter._copy_files = [
                    "tokenizer.json", "preprocessor_config.json"
                ]
                converter.convert(
                    model_dir,
                    quantization=quantization,
                    force=True
                )

        asr_model = model_dir
        logger.info(f"ASR Model: {str(model_dir)}")

    audio = whisperx.load_audio(audio_wav)
    result = load_and_transcribe_audio(
        asr_model, audio, compute_type, SOURCE_LANGUAGE, asr_options, batch_size, segment_duration_limit
    )

    # "zh" obtained without the Mandarin prompt is reported as zh-TW
    if not prompt and result["language"] == "zh":
        result["language"] = "zh-TW"
        logger.info("Chinese - Traditional (zh-TW)")
    return audio, result
def align_speech(audio, result):
    """
    Align the transcribed segments with the audio.
    Parameters:
    - audio (array): The audio data in a suitable format for alignment.
    - result (dict): Metadata containing the segments and the language.
    Returns:
    - result (dict): The aligned result, or the input unchanged when the
        language is listed in EXTRA_ALIGN with an empty model name.
    Raises:
    - ValueError: When the language has no entry in the default alignment
        tables nor in EXTRA_ALIGN.
    Notes:
    - The torch default-model table is merged into the Hugging Face one on
        every call before the lookup.
    """
    DAMHF.update(DAMT)  # lang align
    language = result["language"]

    has_default_model = language in DAMHF
    has_extra_entry = language in EXTRA_ALIGN
    if not (has_default_model or has_extra_entry):
        logger.warning(
            "Automatic detection: Source language not compatible with align"
        )
        raise ValueError(
            f"Detected language {result['language']}  incompatible, "
            "you can select the source language to avoid this error."
        )

    # An empty model name marks a language that cannot be aligned.
    if has_extra_entry and EXTRA_ALIGN[language] == "":
        lang_name = INVERTED_LANGUAGES.get(language, language)
        logger.warning(
            "No compatible wav2vec2 model found "
            f"for the language '{lang_name}', skipping alignment."
        )
        return result

    # random_sleep()
    return load_align_and_align_segments(result, audio, DAMHF)
# Maps the option name to the Hugging Face model id handed to the
# diarization pipeline; an empty id disables diarization in diarize_speech.
diarization_models = {
    "pyannote_3.1": "pyannote/speaker-diarization-3.1",
    # Restored from the e-mail-obfuscated "[email protected]" placeholder.
    "pyannote_2.1": "pyannote/speaker-diarization@2.1",
    "disable": "",
}
def reencode_speakers(result):
    """
    Renumber speaker labels in order of first appearance.
    Parameters:
    - result (dict): Must provide "segments", each with a "speaker" key.
    Returns:
    - result (dict): The same object; segments are relabelled in place as
        SPEAKER_00, SPEAKER_01, ... unless the list is empty or the first
        segment is already "SPEAKER_00".
    """
    segments = result["segments"]
    # Nothing to relabel (an empty list used to raise IndexError here).
    if not segments or segments[0]["speaker"] == "SPEAKER_00":
        return result

    logger.debug("Reencode speakers")
    speaker_mapping = {}
    for segment in segments:
        old_speaker = segment["speaker"]
        if old_speaker not in speaker_mapping:
            # The next free index is the number of speakers seen so far.
            speaker_mapping[old_speaker] = (
                f"SPEAKER_{len(speaker_mapping):02d}"
            )
        segment["speaker"] = speaker_mapping[old_speaker]
    return result
def diarize_speech(
    audio_wav,
    result,
    min_speakers,
    max_speakers,
    YOUR_HF_TOKEN,
    model_name="pyannote/speaker-diarization@2.1",
):
    """
    Performs speaker diarization on speech segments.
    Parameters:
    - audio_wav: Audio input forwarded to the diarization pipeline.
    - result (dict): Metadata containing information about speech segments
        and alignments.
    - min_speakers (int): Minimum number of speakers expected in the audio.
    - max_speakers (int): Maximum number of speakers expected in the audio.
    - YOUR_HF_TOKEN (str): Your Hugging Face API token for model
        authentication.
    - model_name (str): Name of the speaker diarization model to be used
        (default: "pyannote/speaker-diarization@2.1"). An empty name
        disables diarization.
    Returns:
    - result_diarize (dict): Updated metadata after assigning speaker
        labels to segments, renumbered by reencode_speakers.
    Raises:
    - ValueError: When the pipeline fails to load one of the known pyannote
        models with the "'NoneType' object has no attribute 'to'" error,
        which is reported as a missing license acceptance.
    - Exception: Any other pipeline loading error is re-raised unchanged.
    Notes:
    - Segments without a detected speaker are assigned to "SPEAKER_00".
    - Cleans up memory by releasing resources after diarization.
    - If at most one speaker is requested, or no model is given, each
        segment is assigned to the first speaker without running inference.
    """
    if max(min_speakers, max_speakers) > 1 and model_name:
        try:
            diarize_model = whisperx.DiarizationPipeline(
                model_name=model_name,
                use_auth_token=YOUR_HF_TOKEN,
                device=os.environ.get("QUANTUM_DEVICE"),
            )
        except Exception as error:
            gc.collect()
            torch.cuda.empty_cache()  # noqa
            if "'NoneType' object has no attribute 'to'" not in str(error):
                raise
            if model_name == diarization_models["pyannote_2.1"]:
                raise ValueError(
                    "Accept the license agreement for using Pyannote 2.1."
                    " You need to have an account on Hugging Face and "
                    "accept the license to use the models: "
                    "https://huggingface.co/pyannote/speaker-diarization "
                    "and https://huggingface.co/pyannote/segmentation "
                    "Get your KEY TOKEN here: "
                    "https://hf.co/settings/tokens "
                ) from error
            if model_name == diarization_models["pyannote_3.1"]:
                raise ValueError(
                    "New Licence Pyannote 3.1: You need to have an account"
                    " on Hugging Face and accept the license to use the "
                    "models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa
                    "and https://huggingface.co/pyannote/segmentation-3.0 "
                ) from error
            # Any other model: surface the original error instead of
            # continuing with an unbound pipeline.
            raise

        random_sleep()
        diarize_segments = diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers)
        result_diarize = whisperx.assign_word_speakers(
            diarize_segments, result
        )
        for segment in result_diarize["segments"]:
            if "speaker" not in segment:
                segment["speaker"] = "SPEAKER_00"
                logger.warning(
                    f"No speaker detected in {segment['start']}. First TTS "
                    f"will be used for the segment text: {segment['text']} "
                )
        del diarize_model
        gc.collect()
        torch.cuda.empty_cache()  # noqa
    else:
        result_diarize = result
        result_diarize["segments"] = [
            {**item, "speaker": "SPEAKER_00"}
            for item in result_diarize["segments"]
        ]
    return reencode_speakers(result_diarize)

quantum_dubbing/text_multiformat_processor.py ADDED Viewed

	@@ -0,0 +1,987 @@

+from .logging_setup import logger
+from whisperx.utils import get_writer
+from .utils import remove_files, run_command, remove_directory_contents
+from typing import List
+import srt
+import re
+import os
+import copy
+import string
+import soundfile as sf
+from PIL import Image, ImageOps, ImageDraw, ImageFont
# Single characters treated as punctuation: ASCII punctuation followed by
# additional quotation marks and brackets.
punctuation_list = [
    *string.punctuation,
    *"¡¿«»„”“”‚‘’「」『』《》（）【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩",
]
# Texts that clean_text discards when nothing else is left.
symbol_list = [*punctuation_list, "", "..", "..."]
def extract_from_srt(file_path):
    """
    Read a UTF-8 subtitle file and parse it with the srt library.
    Parameters:
    - file_path (str): Path of the .srt file.
    Returns:
    - list of the subtitle objects produced by srt.parse.
    """
    with open(file_path, "r", encoding="utf-8") as srt_file:
        raw_subtitles = srt_file.read()
    return list(srt.parse(raw_subtitles))
def clean_text(text):
    """
    Strip markup and noise from one subtitle text.
    Parameters:
    - text (str): Raw subtitle content.
    Returns:
    - str: The cleaned text, or "" when a music symbol remains or only an
        entry of symbol_list is left.
    """
    # Remove content within square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Remove content within <comment> tags
    text = re.sub(r'<comment>.*?</comment>', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove content enclosed between a pair of "♫" or "♪"
    text = re.sub(r'♫.*?♫', '', text)
    text = re.sub(r'♪.*?♪', '', text)
    # Turn line breaks into sentence breaks
    text = text.replace("\n", ". ")
    # Remove double quotation marks
    text = text.replace('"', '')
    # Collapse multiple spaces and replace with a single space
    text = re.sub(r"\s+", " ", text)
    # Normalize spaces around periods
    text = re.sub(r"[\s\.]+(?=\s)", ". ", text)
    # The lookahead above leaves the following space in place, so every
    # rewritten period ends up followed by two spaces; collapse them.
    text = re.sub(r" {2,}", " ", text)
    # An unpaired ♫ or ♪ still present marks the text as music
    if '♫' in text or '♪' in text:
        return ""
    text = text.strip()
    # Valid text
    return text if text not in symbol_list else ""
def srt_file_to_segments(file_path, speaker=False):
    """
    Convert a subtitle file into a {"segments": [...]} dictionary.
    Parameters:
    - file_path (str): Path of the subtitle file.
    - speaker (bool): When true, every segment gets "speaker" set to
        "SPEAKER_00".
    Returns:
    - dict: {"segments": [...]} where each segment has "text", "start" and
        "end" (seconds as float); subtitles whose cleaned text is empty are
        dropped.
    Raises:
    - Exception: When no subtitle with usable text is found.
    Notes:
    - If parsing fails, the file is rewritten by ffmpeg as "fixed_sub.srt"
        and parsed again.
    """
    try:
        subtitles = extract_from_srt(file_path)
    except Exception as error:
        logger.error(str(error))
        fixed_file = "fixed_sub.srt"
        remove_files(fixed_file)
        fix_sub = f'ffmpeg -i "{file_path}" "{fixed_file}" -y'
        run_command(fix_sub)
        subtitles = extract_from_srt(fixed_file)

    segments = []
    for subtitle in subtitles:
        text = clean_text(str(subtitle.content))
        if not text:
            continue
        entry = {
            "text": text,
            "start": float(subtitle.start.total_seconds()),
            "end": float(subtitle.end.total_seconds()),
        }
        segments.append(entry)

    if not segments:
        raise Exception("No data found in srt subtitle file")

    if speaker:
        segments = [{**seg, "speaker": "SPEAKER_00"} for seg in segments]
    return {"segments": segments}
+# documents
def dehyphenate(lines: List[str], line_no: int) -> List[str]:
    """
    Join the word split across lines[line_no] and the following line.

    The last character of lines[line_no] is dropped and the first
    space-delimited word of the next line is moved up. The list is modified
    in place and returned.
    """
    head, separator, tail = lines[line_no + 1].partition(" ")
    lines[line_no] = lines[line_no][:-1] + head
    # Whatever followed the moved word stays, including its leading space.
    lines[line_no + 1] = separator + tail
    return lines
def remove_hyphens(text: str) -> str:
    """
    Re-join words hyphenated across line breaks.

    Every line except the last that ends with "-" (after stripping trailing
    whitespace) is merged with the first word of the next line.

    This fails for:
    * Natural dashes: well-known, self-replication, use-cases, non-semantic,
                      Post-processing, Window-wise, viewpoint-dependent
    * Trailing math operands: 2 - 4
    * Names: Lopez-Ferreras, VGG-19, CIFAR-100
    """
    lines = [raw_line.rstrip() for raw_line in text.split("\n")]
    # Collect the candidates first, then rewrite them in order
    hyphenated = [
        index
        for index, content in enumerate(lines[:-1])
        if content.endswith("-")
    ]
    for index in hyphenated:
        lines = dehyphenate(lines, index)
    return "\n".join(lines)
def pdf_to_txt(pdf_file, start_page, end_page):
    """
    Extract the text of a page range from a PDF.
    Parameters:
    - pdf_file (str): Path of the PDF file.
    - start_page (int): First page, 1-based; values below 1 select the
        first page.
    - end_page (int): Last page, inclusive; capped at the page count.
    Returns:
    - str: The text of the selected pages concatenated, each page passed
        through remove_hyphens.
    """
    from pypdf import PdfReader
    with open(pdf_file, "rb") as file:
        reader = PdfReader(file)
        total_pages = reader.get_num_pages()
        logger.debug(f"Total pages: {total_pages}")
        start_page_idx = max((start_page-1), 0)
        end_page_inx = min(end_page, total_pages)
        document_pages = reader.pages[start_page_idx:end_page_inx]
        logger.info(
            f"Selected pages from {start_page_idx} to {end_page_inx}: "
            f"{len(document_pages)}"
        )
        text = "".join(
            remove_hyphens(page.extract_text()) for page in document_pages
        )
    return text
def docx_to_txt(docx_file):
    """
    Return the text of every paragraph of a .docx file, each followed by a
    newline.
    """
    # https://github.com/AlJohri/docx2pdf update
    from docx import Document
    document = Document(docx_file)
    return "".join(
        paragraph.text + "\n" for paragraph in document.paragraphs
    )
def replace_multiple_elements(text, replacements):
    """
    Apply several literal replacements in one pass and collapse whitespace.
    Parameters:
    - text (str): Input text.
    - replacements (dict): Maps each literal substring to its replacement.
    Returns:
    - str: The text with the replacements applied and every run of
        whitespace reduced to a single space.
    """
    if replacements:
        pattern = re.compile("|".join(map(re.escape, replacements.keys())))
        text = pattern.sub(
            lambda match: replacements[match.group(0)], text
        )
    # With an empty mapping the alternation would be the empty pattern,
    # which matches everywhere and has no entry to look up, so the
    # substitution is skipped and only the whitespace is normalized.
    # Remove multiple spaces
    return re.sub(r"\s+", " ", text)
def document_preprocessor(file_path, is_string, start_page, end_page):
    """
    Load a document as plain text and save it to "./text_preprocessor.txt".
    Parameters:
    - file_path (str): Path of a .pdf, .docx or .txt file, or the text
        itself when is_string is true.
    - is_string (bool): Treat file_path as the text instead of a path.
    - start_page (int): First page, used for PDF files only.
    - end_page (int): Last page, used for PDF files only.
    Returns:
    - Tuple containing:
        - txt_file_path: Path of the written text file.
        - text: The processed text.
    Raises:
    - Exception: When the file extension is not supported.
    """
    if is_string:
        text = file_path
    else:
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            text = pdf_to_txt(file_path, start_page, end_page)
        elif file_ext == ".docx":
            text = docx_to_txt(file_path)
        elif file_ext == ".txt":
            with open(
                file_path, "r", encoding='utf-8', errors='replace'
            ) as source:
                text = source.read()
        else:
            raise Exception("Unsupported file format")

    # Add space to break segments more easily later
    text = replace_multiple_elements(
        text,
        {
            "、": "、 ",
            "。": "。 ",
        },
    )

    # Save text to a .txt file
    txt_file_path = "./text_preprocessor.txt"
    with open(
        txt_file_path, "w", encoding='utf-8', errors='replace'
    ) as txt_file:
        txt_file.write(text)
    return txt_file_path, text
def split_text_into_chunks(text, chunk_size):
    """
    Group the words of a text into chunks of limited length.
    Parameters:
    - text (str): Input text; only runs of word characters are kept, so
        punctuation is dropped.
    - chunk_size (int): Length limit used when adding a word to a chunk.
    Returns:
    - list of str: Chunks of space-joined words. A word that does not fit
        the limit on its own still gets a chunk of its own.
    """
    words = re.findall(r"\b\w+\b", text)
    chunks = []
    current_chunk = ""
    for word in words:
        if (
            len(current_chunk) + len(word) + 1 <= chunk_size
        ):  # Adding 1 for the space between words
            if current_chunk:
                current_chunk += " "
            current_chunk += word
        else:
            # Nothing is pending when the very first word is too long;
            # skip the flush so no empty chunk is emitted.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def determine_chunk_size(file_name):
    """
    Pick the text chunk size for a voice / file name.
    Parameters:
    - file_name (str): Name matched against the known suffix patterns.
    Returns:
    - int: The size of the first matching pattern, or 100 by default.
    """
    # Checked in order; the first match wins.
    size_by_pattern = (
        (r".*-(Male|Female)$", 1024),  # by character
        (r".* BARK$", 100),  # t 64 256
        (r".* VITS$", 500),
        (r".+\.(wav|mp3|ogg|m4a)$", 150),  # t 250 400 api automatic split
        (r".* VITS-onnx$", 250),  # automatic sentence split
        (r".* OpenAI-TTS$", 1024),  # max characters 4096
    )
    for regex, chunk_size in size_by_pattern:
        if re.match(regex, file_name):
            return chunk_size
    # Default chunk size if the file doesn't match any pattern; max 1800
    return 100
def plain_text_to_segments(result_text=None, chunk_size=None):
    """
    Split plain text into chunks and wrap them as segments.
    Parameters:
    - result_text (str): Text to split.
    - chunk_size (int): Chunk size; 100 is used when falsy.
    Returns:
    - dict: {"segments": [...]} where chunk number n (from 0) has "start"
        1.0 + n, "end" 2.0 + n and "speaker" "SPEAKER_00".
    """
    text_chunks = split_text_into_chunks(result_text, chunk_size or 100)
    segments_chunks = [
        {
            "text": chunk,
            "start": (1.0 + position),
            "end": (2.0 + position),
            "speaker": "SPEAKER_00",
        }
        for position, chunk in enumerate(text_chunks)
    ]
    return {"segments": segments_chunks}
def segments_to_plain_text(result_diarize):
    """
    Join the text of all segments and save it to "./text_translation.txt".
    Parameters:
    - result_diarize (dict): Must provide "segments", each with "text".
    Returns:
    - Tuple containing:
        - txt_file_path: Path of the written text file.
        - complete_text: Every segment text followed by one space.
    """
    complete_text = "".join(
        segment["text"] + " " for segment in result_diarize["segments"]
    )  # issue
    # Save text to a .txt file
    txt_file_path = "./text_translation.txt"
    with open(
        txt_file_path, "w", encoding='utf-8', errors='replace'
    ) as output:
        output.write(complete_text)
    return txt_file_path, complete_text
+# doc to video
# Named colors as (red, green, blue) tuples.
COLORS = dict(
    black=(0, 0, 0),
    white=(255, 255, 255),
    red=(255, 0, 0),
    green=(0, 255, 0),
    blue=(0, 0, 255),
    yellow=(255, 255, 0),
    light_gray=(200, 200, 200),
    light_blue=(173, 216, 230),
    light_green=(144, 238, 144),
    light_yellow=(255, 255, 224),
    light_pink=(255, 182, 193),
    lavender=(230, 230, 250),
    peach=(255, 218, 185),
    light_cyan=(224, 255, 255),
    light_salmon=(255, 160, 122),
    light_green_yellow=(173, 255, 47),
)
# Border choices: "dynamic" followed by every named color.
BORDER_COLORS = ["dynamic", *COLORS]
+def calculate_average_color(img):
+    # Resize the image to a small size for faster processing
+    img_small = img.resize((50, 50))
+    # Calculate the average color
+    average_color = img_small.convert("RGB").resize((1, 1)).getpixel((0, 0))
+    return average_color
+def add_border_to_image(
+    image_path,
+    target_width,
+    target_height,
+    border_color=None
+):
+    img = Image.open(image_path)
+    # Calculate the width and height for the new image with borders
+    original_width, original_height = img.size
+    original_aspect_ratio = original_width / original_height
+    target_aspect_ratio = target_width / target_height
+    # Resize the image to fit the target resolution retaining aspect ratio
+    if original_aspect_ratio > target_aspect_ratio:
+        # Image is wider, calculate new height
+        new_height = int(target_width / original_aspect_ratio)
+        resized_img = img.resize((target_width, new_height))
+    else:
+        # Image is taller, calculate new width
+        new_width = int(target_height * original_aspect_ratio)
+        resized_img = img.resize((new_width, target_height))
+    # Calculate padding for borders
+    padding = (0, 0, 0, 0)
+    if resized_img.size[0] != target_width or resized_img.size[1] != target_height:
+        if original_aspect_ratio > target_aspect_ratio:
+            # Add borders vertically
+            padding = (0, (target_height - resized_img.size[1]) // 2, 0, (target_height - resized_img.size[1]) // 2)
+        else:
+            # Add borders horizontally
+            padding = ((target_width - resized_img.size[0]) // 2, 0, (target_width - resized_img.size[0]) // 2, 0)
+    # Add borders with specified color
+    if not border_color or border_color == "dynamic":
+        border_color = calculate_average_color(resized_img)
+    else:
+        border_color = COLORS.get(border_color, (0, 0, 0))
+    bordered_img = ImageOps.expand(resized_img, padding, fill=border_color)
+    bordered_img.save(image_path)
+    return image_path
+def resize_and_position_subimage(
+    subimage,
+    max_width,
+    max_height,
+    subimage_position,
+    main_width,
+    main_height
+):
+    subimage_width, subimage_height = subimage.size
+    # Resize subimage if it exceeds maximum dimensions
+    if subimage_width > max_width or subimage_height > max_height:
+        # Calculate scaling factor
+        width_scale = max_width / subimage_width
+        height_scale = max_height / subimage_height
+        scale = min(width_scale, height_scale)
+        # Resize subimage
+        subimage = subimage.resize(
+            (int(subimage_width * scale), int(subimage_height * scale))
+        )
+    # Calculate position to place the subimage
+    if subimage_position == "top-left":
+        subimage_x = 0
+        subimage_y = 0
+    elif subimage_position == "top-right":
+        subimage_x = main_width - subimage.width
+        subimage_y = 0
+    elif subimage_position == "bottom-left":
+        subimage_x = 0
+        subimage_y = main_height - subimage.height
+    elif subimage_position == "bottom-right":
+        subimage_x = main_width - subimage.width
+        subimage_y = main_height - subimage.height
+    else:
+        raise ValueError(
+            "Invalid subimage_position. Choose from 'top-left', 'top-right',"
+            " 'bottom-left', or 'bottom-right'."
+        )
+    return subimage, subimage_x, subimage_y
+def create_image_with_text_and_subimages(
+    text,
+    subimages,
+    width,
+    height,
+    text_color,
+    background_color,
+    output_file
+):
+    # Create an image with the specified resolution and background color
+    image = Image.new('RGB', (width, height), color=background_color)
+    # Initialize ImageDraw object
+    draw = ImageDraw.Draw(image)
+    # Load a font
+    font = ImageFont.load_default()  # You can specify your font file here
+    # Calculate text size and position
+    text_bbox = draw.textbbox((0, 0), text, font=font)
+    text_width = text_bbox[2] - text_bbox[0]
+    text_height = text_bbox[3] - text_bbox[1]
+    text_x = (width - text_width) / 2
+    text_y = (height - text_height) / 2
+    # Draw text on the image
+    draw.text((text_x, text_y), text, fill=text_color, font=font)
+    # Paste subimages onto the main image
+    for subimage_path, subimage_position in subimages:
+        # Open the subimage
+        subimage = Image.open(subimage_path)
+        # Convert subimage to RGBA mode if it doesn't have an alpha channel
+        if subimage.mode != 'RGBA':
+            subimage = subimage.convert('RGBA')
+        # Resize and position the subimage
+        subimage, subimage_x, subimage_y = resize_and_position_subimage(
+            subimage, width / 4, height / 4, subimage_position, width, height
+        )
+        # Paste the subimage onto the main image
+        image.paste(subimage, (int(subimage_x), int(subimage_y)), subimage)
+    image.save(output_file)
+    return output_file
+def doc_to_txtximg_pages(
+    document,
+    width,
+    height,
+    start_page,
+    end_page,
+    bcolor
+):
+    from pypdf import PdfReader
+    images_folder = "pdf_images/"
+    os.makedirs(images_folder, exist_ok=True)
+    remove_directory_contents(images_folder)
+    # First image
+    text_image = os.path.basename(document)[:-4]
+    subimages = [("./assets/logo.jpeg", "top-left")]
+    text_color = (255, 255, 255) if bcolor == "black" else (0, 0, 0)  # w|b
+    background_color = COLORS.get(bcolor, (255, 255, 255))  # dynamic white
+    first_image = "pdf_images/0000_00_aaa.png"
+    create_image_with_text_and_subimages(
+        text_image,
+        subimages,
+        width,
+        height,
+        text_color,
+        background_color,
+        first_image
+    )
+    reader = PdfReader(document)
+    logger.debug(f"Total pages: {reader.get_num_pages()}")
+    start_page_idx = max((start_page-1), 0)
+    end_page_inx = min((end_page), (reader.get_num_pages()))
+    document_pages = reader.pages[start_page_idx:end_page_inx]
+    logger.info(
+        f"Selected pages from {start_page_idx} to {end_page_inx}: "
+        f"{len(document_pages)}"
+    )
+    data_doc = {}
+    for i, page in enumerate(document_pages):
+        count = 0
+        images = []
+        for image_file_object in page.images:
+            img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}"
+            images.append(img_name)
+            with open(img_name, "wb") as fp:
+                fp.write(image_file_object.data)
+                count += 1
+            img_name = add_border_to_image(img_name, width, height, bcolor)
+        data_doc[i] = {
+            "text": remove_hyphens(page.extract_text()),
+            "images": images
+        }
+    return data_doc
+def page_data_to_segments(result_text=None, chunk_size=None):
+    if not chunk_size:
+        chunk_size = 100
+    segments_chunks = []
+    time_global = 0
+    for page, result_data in result_text.items():
+        # result_image = result_data["images"]
+        result_text = result_data["text"]
+        text_chunks = split_text_into_chunks(result_text, chunk_size)
+        if not text_chunks:
+            text_chunks = [" "]
+        for chunk in text_chunks:
+            chunk_dict = {
+                "text": chunk,
+                "start": (1.0 + time_global),
+                "end": (2.0 + time_global),
+                "speaker": "SPEAKER_00",
+                "page": page,
+            }
+            segments_chunks.append(chunk_dict)
+            time_global += 1
+    result_diarize = {"segments": segments_chunks}
+    return result_diarize
+def update_page_data(result_diarize, doc_data):
+    complete_text = ""
+    current_page = result_diarize["segments"][0]["page"]
+    text_page = ""
+    for seg in result_diarize["segments"]:
+        text = seg["text"] + " "  # issue
+        complete_text += text
+        page = seg["page"]
+        if page == current_page:
+            text_page += text
+        else:
+            doc_data[current_page]["text"] = text_page
+            # Next
+            text_page = text
+            current_page = page
+    if doc_data[current_page]["text"] != text_page:
+        doc_data[current_page]["text"] = text_page
+    return doc_data
+def fix_timestamps_docs(result_diarize, audio_files):
+    current_start = 0.0
+    for seg, audio in zip(result_diarize["segments"], audio_files):
+        duration = round(sf.info(audio).duration, 2)
+        seg["start"] = current_start
+        current_start += duration
+        seg["end"] = current_start
+    return result_diarize
+def create_video_from_images(
+    doc_data,
+    result_diarize
+):
+    # First image path
+    first_image = "pdf_images/0000_00_aaa.png"
+    # Time segments and images
+    max_pages_idx = len(doc_data) - 1
+    current_page = result_diarize["segments"][0]["page"]
+    duration_page = 0.0
+    last_image = None
+    for seg in result_diarize["segments"]:
+        start = seg["start"]
+        end = seg["end"]
+        duration_seg = end - start
+        page = seg["page"]
+        if page == current_page:
+            duration_page += duration_seg
+        else:
+            images = doc_data[current_page]["images"]
+            if first_image:
+                images = [first_image] + images
+                first_image = None
+            if not doc_data[min(max_pages_idx, (current_page+1))]["text"].strip():
+                images = images + doc_data[min(max_pages_idx, (current_page+1))]["images"]
+            if not images and last_image:
+                images = [last_image]
+            # Calculate images duration
+            time_duration_per_image = round((duration_page / len(images)), 2)
+            doc_data[current_page]["time_per_image"] = time_duration_per_image
+            # Next values
+            doc_data[current_page]["images"] = images
+            last_image = images[-1]
+            duration_page = duration_seg
+            current_page = page
+    if "time_per_image" not in doc_data[current_page].keys():
+        images = doc_data[current_page]["images"]
+        if first_image:
+            images = [first_image] + images
+        if not images:
+            images = [last_image]
+        time_duration_per_image = round((duration_page / len(images)), 2)
+        doc_data[current_page]["time_per_image"] = time_duration_per_image
+    # Timestamped image video.
+    with open("list.txt", "w") as file:
+        for i, page in enumerate(doc_data.values()):
+            duration = page["time_per_image"]
+            for img in page["images"]:
+                if i == len(doc_data) - 1 and img == page["images"][-1]:  # Check if it's the last item
+                    file.write(f"file {img}\n")
+                    file.write(f"outpoint {duration}")
+                else:
+                    file.write(f"file {img}\n")
+                    file.write(f"outpoint {duration}\n")
+    out_video = "video_from_images.mp4"
+    remove_files(out_video)
+    cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}"
+    cm_alt = f"ffmpeg -f concat -i list.txt -c:v libx264 -r 30 -pix_fmt yuv420p -y {out_video}"
+    try:
+        run_command(cm)
+    except Exception as error:
+        logger.error(str(error))
+        remove_files(out_video)
+        run_command(cm_alt)
+    return out_video
+def merge_video_and_audio(video_doc, final_wav_file):
+    fixed_audio = "fixed_audio.mp3"
+    remove_files(fixed_audio)
+    cm = f"ffmpeg -i {final_wav_file} -c:a libmp3lame {fixed_audio}"
+    run_command(cm)
+    vid_out = "video_book.mp4"
+    remove_files(vid_out)
+    cm = f"ffmpeg -i {video_doc} -i {fixed_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {vid_out}"
+    run_command(cm)
+    return vid_out
+# subtitles
+def get_subtitle(
+    language,
+    segments_data,
+    extension,
+    filename=None,
+    highlight_words=False,
+):
+    if not filename:
+        filename = "task_subtitle"
+    is_ass_extension = False
+    if extension == "ass":
+        is_ass_extension = True
+        extension = "srt"
+    sub_file = filename + "." + extension
+    support_name = filename + ".mp3"
+    remove_files(sub_file)
+    writer = get_writer(extension, output_dir=".")
+    word_options = {
+        "highlight_words": highlight_words,
+        "max_line_count": None,
+        "max_line_width": None,
+    }
+    # Get data subs
+    subtitle_data = copy.deepcopy(segments_data)
+    subtitle_data["language"] = (
+        "ja" if language in ["ja", "zh", "zh-TW"] else language
+    )
+    # Clean
+    if not highlight_words:
+        subtitle_data.pop("word_segments", None)
+        for segment in subtitle_data["segments"]:
+            for key in ["speaker", "chars", "words"]:
+                segment.pop(key, None)
+    writer(
+        subtitle_data,
+        support_name,
+        word_options,
+    )
+    if is_ass_extension:
+        temp_name = filename + ".ass"
+        remove_files(temp_name)
+        convert_sub = f'ffmpeg -i "{sub_file}" "{temp_name}" -y'
+        run_command(convert_sub)
+        sub_file = temp_name
+    return sub_file
+def process_subtitles(
+    deep_copied_result,
+    align_language,
+    result_diarize,
+    output_format_subtitle,
+    TRANSLATE_AUDIO_TO,
+):
+    name_ori = "sub_ori."
+    name_tra = "sub_tra."
+    remove_files(
+        [name_ori + output_format_subtitle, name_tra + output_format_subtitle]
+    )
+    writer = get_writer(output_format_subtitle, output_dir=".")
+    word_options = {
+        "highlight_words": False,
+        "max_line_count": None,
+        "max_line_width": None,
+    }
+    # original lang
+    subs_copy_result = copy.deepcopy(deep_copied_result)
+    subs_copy_result["language"] = (
+        "zh" if align_language == "zh-TW" else align_language
+    )
+    for segment in subs_copy_result["segments"]:
+        segment.pop("speaker", None)
+    try:
+        writer(
+            subs_copy_result,
+            name_ori[:-1] + ".mp3",
+            word_options,
+        )
+    except Exception as error:
+        logger.error(str(error))
+        if str(error) == "list indices must be integers or slices, not str":
+            logger.error(
+                "Related to poor word segmentation"
+                " in segments after alignment."
+            )
+        subs_copy_result["segments"][0].pop("words")
+        writer(
+            subs_copy_result,
+            name_ori[:-1] + ".mp3",
+            word_options,
+        )
+    # translated lang
+    subs_tra_copy_result = copy.deepcopy(result_diarize)
+    subs_tra_copy_result["language"] = (
+        "ja" if TRANSLATE_AUDIO_TO in ["ja", "zh", "zh-TW"] else align_language
+    )
+    subs_tra_copy_result.pop("word_segments", None)
+    for segment in subs_tra_copy_result["segments"]:
+        for key in ["speaker", "chars", "words"]:
+            segment.pop(key, None)
+    writer(
+        subs_tra_copy_result,
+        name_tra[:-1] + ".mp3",
+        word_options,
+    )
+    return name_tra + output_format_subtitle
+def linguistic_level_segments(
+    result_base,
+    linguistic_unit="word",  # word or char
+):
+    linguistic_unit = linguistic_unit[:4]
+    linguistic_unit_key = linguistic_unit + "s"
+    result = copy.deepcopy(result_base)
+    if linguistic_unit_key not in result["segments"][0].keys():
+        raise ValueError("No alignment detected, can't process")
+    segments_by_unit = []
+    for segment in result["segments"]:
+        segment_units = segment[linguistic_unit_key]
+        # segment_speaker = segment.get("speaker", "SPEAKER_00")
+        for unit in segment_units:
+            text = unit[linguistic_unit]
+            if "start" in unit.keys():
+                segments_by_unit.append(
+                    {
+                        "start": unit["start"],
+                        "end": unit["end"],
+                        "text": text,
+                        # "speaker": segment_speaker,
+                    }
+                    )
+            elif not segments_by_unit:
+                pass
+            else:
+                segments_by_unit[-1]["text"] += text
+    return {"segments": segments_by_unit}
+def break_aling_segments(
+    result: dict,
+    break_characters: str = "",  # ":|,|.|"
+):
+    result_align = copy.deepcopy(result)
+    break_characters_list = break_characters.split("|")
+    break_characters_list = [i for i in break_characters_list if i != '']
+    if not break_characters_list:
+        logger.info("No valid break characters were specified.")
+        return result
+    logger.info(f"Redivide text segments by: {str(break_characters_list)}")
+    # create new with filters
+    normal = []
+    def process_chars(chars, letter_new_start, num, text):
+        start_key, end_key = "start", "end"
+        start_value = end_value = None
+        for char in chars:
+            if start_key in char:
+                start_value = char[start_key]
+                break
+        for char in reversed(chars):
+            if end_key in char:
+                end_value = char[end_key]
+                break
+        if not start_value or not end_value:
+            raise Exception(
+                f"Unable to obtain a valid timestamp for chars: {str(chars)}"
+            )
+        return {
+            "start": start_value,
+            "end": end_value,
+            "text": text,
+            "words": chars,
+        }
+    for i, segment in enumerate(result_align['segments']):
+        logger.debug(f"- Process segment: {i}, text: {segment['text']}")
+        # start = segment['start']
+        letter_new_start = 0
+        for num, char in enumerate(segment['chars']):
+            if char["char"] is None:
+                continue
+            # if "start" in char:
+            #     start = char["start"]
+            # if "end" in char:
+            #     end = char["end"]
+            # Break by character
+            if char['char'] in break_characters_list:
+                text = segment['text'][letter_new_start:num+1]
+                logger.debug(
+                    f"Break in: {char['char']}, position: {num}, text: {text}"
+                )
+                chars = segment['chars'][letter_new_start:num+1]
+                if not text:
+                    logger.debug("No text")
+                    continue
+                if num == 0 and not text.strip():
+                    logger.debug("blank space in start")
+                    continue
+                if len(text) == 1:
+                    logger.debug(f"Short char append, num: {num}")
+                    normal[-1]["text"] += text
+                    normal[-1]["words"].append(chars)
+                    continue
+                # logger.debug(chars)
+                normal_dict = process_chars(chars, letter_new_start, num, text)
+                letter_new_start = num+1
+                normal.append(normal_dict)
+            # If we reach the end of the segment, add the last part of chars.
+            if num == len(segment["chars"]) - 1:
+                text = segment['text'][letter_new_start:num+1]
+                # If remain text len is not default len text
+                if num not in [len(text)-1, len(text)] and text:
+                    logger.debug(f'Remaining text: {text}')
+                if not text:
+                    logger.debug("No remaining text.")
+                    continue
+                if len(text) == 1:
+                    logger.debug(f"Short char append, num: {num}")
+                    normal[-1]["text"] += text
+                    normal[-1]["words"].append(chars)
+                    continue
+                chars = segment['chars'][letter_new_start:num+1]
+                normal_dict = process_chars(chars, letter_new_start, num, text)
+                letter_new_start = num+1
+                normal.append(normal_dict)
+    # Rename char to word
+    for item in normal:
+        words_list = item['words']
+        for word_item in words_list:
+            if 'char' in word_item:
+                word_item['word'] = word_item.pop('char')
+    # Convert to dict default
+    break_segments = {"segments": normal}
+    msg_count = (
+        f"Segment count before: {len(result['segments'])}, "
+        f"after: {len(break_segments['segments'])}."
+    )
+    logger.info(msg_count)
+    return break_segments

quantum_dubbing/text_to_speech.py ADDED Viewed

	@@ -0,0 +1,1574 @@

+from gtts import gTTS
+import edge_tts, asyncio, json, glob # noqa
+from tqdm import tqdm
+import librosa, os, re, torch, gc, subprocess # noqa
+from .language_configuration import (
+    fix_code_language,
+    BARK_VOICES_LIST,
+    VITS_VOICES_LIST,
+)
+from .utils import (
+    download_manager,
+    create_directories,
+    copy_files,
+    rename_file,
+    remove_directory_contents,
+    remove_files,
+    run_command,
+)
+import numpy as np
+from typing import Any, Dict
+from pathlib import Path
+import soundfile as sf
+import platform
+import logging
+import traceback
+from .logging_setup import logger
+class TTS_OperationError(Exception):
+    def __init__(self, message="The operation did not complete successfully."):
+        self.message = message
+        super().__init__(self.message)
+def verify_saved_file_and_size(filename):
+    if not os.path.exists(filename):
+        raise TTS_OperationError(f"File '{filename}' was not saved.")
+    if os.path.getsize(filename) == 0:
+        raise TTS_OperationError(
+            f"File '{filename}' has a zero size. "
+            "Related to incorrect TTS for the target language"
+        )
+def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
+    """Produce fallback audio for a segment whose TTS failed.
+
+    gTTS is tried first; if that also fails, a silent clip as long as the
+    segment is written instead. The audio is saved to ``filename`` as
+    OGG/Vorbis.
+
+    Args:
+        error: Exception raised by the primary TTS engine (logged).
+        segment: Segment dict with "text", "tts_name", "start" and "end".
+        TRANSLATE_AUDIO_TO: Target language code for gTTS.
+        filename: Output audio path.
+
+    Raises:
+        TTS_OperationError: If even the silent file could not be saved.
+    """
+    traceback.print_exc()
+    logger.error(f"Error: {str(error)}")
+    try:
+        from tempfile import TemporaryFile
+        tts = gTTS(segment["text"], lang=fix_code_language(TRANSLATE_AUDIO_TO))
+        # The context manager closes the temporary file even when gTTS or
+        # soundfile raises.
+        with TemporaryFile() as f:
+            tts.write_to_fp(f)
+            # Reset the file pointer to the beginning of the file
+            f.seek(0)
+            # Read audio data from the TemporaryFile using soundfile
+            audio_data, samplerate = sf.read(f)
+        sf.write(
+            filename, audio_data, samplerate, format="ogg", subtype="vorbis"
+        )
+        logger.warning(
+            'TTS auxiliary will be utilized '
+            f'rather than TTS: {segment["tts_name"]}'
+        )
+        verify_saved_file_and_size(filename)
+    except Exception as fallback_error:
+        # A distinct name keeps the ``error`` parameter from being rebound
+        logger.critical(f"Error: {str(fallback_error)}")
+        sample_rate_aux = 22050
+        # Clamp at zero: a negative length would make np.zeros raise here
+        duration = max(0.0, float(segment["end"]) - float(segment["start"]))
+        data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32)
+        sf.write(
+            filename, data, sample_rate_aux, format="ogg", subtype="vorbis"
+        )
+        logger.error("Audio will be replaced -> [silent audio].")
+        verify_saved_file_and_size(filename)
+def pad_array(array, sr):
+    if isinstance(array, list):
+        array = np.array(array)
+    if not array.shape[0]:
+        raise ValueError("The generated audio does not contain any data")
+    valid_indices = np.where(np.abs(array) > 0.001)[0]
+    if len(valid_indices) == 0:
+        logger.debug(f"No valid indices: {array}")
+        return array
+    try:
+        pad_indice = int(0.1 * sr)
+        start_pad = max(0, valid_indices[0] - pad_indice)
+        end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice)
+        padded_array = array[start_pad:end_pad]
+        return padded_array
+    except Exception as error:
+        logger.error(str(error))
+        return array
+# =====================================
+# EDGE TTS
+# =====================================
+def edge_tts_voices_list():
+    try:
+        completed_process = subprocess.run(
+            ["edge-tts", "--list-voices"], capture_output=True, text=True
+        )
+        lines = completed_process.stdout.strip().split("\n")
+    except Exception as error:
+        logger.debug(str(error))
+        lines = []
+    voices = []
+    for line in lines:
+        if line.startswith("Name: "):
+            voice_entry = {}
+            voice_entry["Name"] = line.split(": ")[1]
+        elif line.startswith("Gender: "):
+            voice_entry["Gender"] = line.split(": ")[1]
+            voices.append(voice_entry)
+    formatted_voices = [
+        f"{entry['Name']}-{entry['Gender']}" for entry in voices
+    ]
+    if not formatted_voices:
+        logger.warning(
+            "The list of Edge TTS voices could not be obtained, "
+            "switching to an alternative method"
+        )
+        tts_voice_list = asyncio.new_event_loop().run_until_complete(
+            edge_tts.list_voices()
+        )
+        formatted_voices = sorted(
+            [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+        )
+    if not formatted_voices:
+        logger.error("Can't get EDGE TTS - list voices")
+    return formatted_voices
+def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui):
+    for segment in tqdm(filtered_edge_segments["segments"]):
+        speaker = segment["speaker"] # noqa
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        temp_file = filename[:-3] + "mp3"
+        logger.info(f"{text} >> {filename}")
+        try:
+            if is_gui:
+                asyncio.run(
+                    edge_tts.Communicate(
+                        text, "-".join(tts_name.split("-")[:-1])
+                    ).save(temp_file)
+                )
+            else:
+                # nest_asyncio.apply() if not is_gui else None
+                command = f'edge-tts -t "{text}" -v "{tts_name.replace("-Male", "").replace("-Female", "")}" --write-media "{temp_file}"'
+                run_command(command)
+            verify_saved_file_and_size(temp_file)
+            data, sample_rate = sf.read(temp_file)
+            data = pad_array(data, sample_rate)
+            # os.remove(temp_file)
+            # Save file
+            sf.write(
+                file=filename,
+                samplerate=sample_rate,
+                data=data,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+# =====================================
+# BARK TTS
+# =====================================
+def segments_bark_tts(
+    filtered_bark_segments, TRANSLATE_AUDIO_TO, model_id_bark="suno/bark-small"
+):
+    from transformers import AutoProcessor, BarkModel
+    from optimum.bettertransformer import BetterTransformer
+    device = os.environ.get("QUANTUM_DEVICE")
+    torch_dtype_env = torch.float16 if device == "cuda" else torch.float32
+    # load model bark
+    model = BarkModel.from_pretrained(
+        model_id_bark, torch_dtype=torch_dtype_env
+    ).to(device)
+    model = model.to(device)
+    processor = AutoProcessor.from_pretrained(
+        model_id_bark, return_tensors="pt"
+    )  # , padding=True
+    if device == "cuda":
+        # convert to bettertransformer
+        model = BetterTransformer.transform(model, keep_original_model=False)
+        # enable CPU offload
+        # model.enable_cpu_offload()
+    sampling_rate = model.generation_config.sample_rate
+    # filtered_segments = filtered_bark_segments['segments']
+    # Sorting the segments by 'tts_name'
+    # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name'])
+    # logger.debug(sorted_segments)
+    for segment in tqdm(filtered_bark_segments["segments"]):
+        speaker = segment["speaker"] # noqa
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+        inputs = processor(text, voice_preset=BARK_VOICES_LIST[tts_name]).to(
+            device
+        )
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Infer
+            with torch.inference_mode():
+                speech_output = model.generate(
+                    **inputs,
+                    do_sample=True,
+                    fine_temperature=0.4,
+                    coarse_temperature=0.8,
+                    pad_token_id=processor.tokenizer.pad_token_id,
+                )
+            # Save file
+            data_tts = pad_array(
+                speech_output.cpu().numpy().squeeze().astype(np.float32),
+                sampling_rate,
+            )
+            sf.write(
+                file=filename,
+                samplerate=sampling_rate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+        gc.collect()
+        torch.cuda.empty_cache()
+    try:
+        del processor
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+# =====================================
+# VITS TTS
+# =====================================
+def uromanize(input_string):
+    """Convert non-Roman strings to Roman using the `uroman` perl package."""
+    # script_path = os.path.join(uroman_path, "bin", "uroman.pl")
+    if not os.path.exists("./uroman"):
+        logger.info(
+            "Clonning repository uroman https://github.com/isi-nlp/uroman.git"
+            " for romanize the text"
+        )
+        process = subprocess.Popen(
+            ["git", "clone", "https://github.com/isi-nlp/uroman.git"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate()
+    script_path = os.path.join("./uroman", "uroman", "uroman.pl")
+    command = ["perl", script_path]
+    process = subprocess.Popen(
+        command,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    # Execute the perl command
+    stdout, stderr = process.communicate(input=input_string.encode())
+    if process.returncode != 0:
+        raise ValueError(f"Error {process.returncode}: {stderr.decode()}")
+    # Return the output as a string and skip the new-line character at the end
+    return stdout.decode()[:-1]
+def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
+    from transformers import VitsModel, AutoTokenizer
+    filtered_segments = filtered_vits_segments["segments"]
+    # Sorting the segments by 'tts_name'
+    sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"])
+    logger.debug(sorted_segments)
+    model_name_key = None
+    for segment in tqdm(sorted_segments):
+        speaker = segment["speaker"] # noqa
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+        if tts_name != model_name_key:
+            model_name_key = tts_name
+            model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name])
+            tokenizer = AutoTokenizer.from_pretrained(
+                VITS_VOICES_LIST[tts_name]
+            )
+            sampling_rate = model.config.sampling_rate
+        if tokenizer.is_uroman:
+            romanize_text = uromanize(text)
+            logger.debug(f"Romanize text: {romanize_text}")
+            inputs = tokenizer(romanize_text, return_tensors="pt")
+        else:
+            inputs = tokenizer(text, return_tensors="pt")
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Infer
+            with torch.no_grad():
+                speech_output = model(**inputs).waveform
+            data_tts = pad_array(
+                speech_output.cpu().numpy().squeeze().astype(np.float32),
+                sampling_rate,
+            )
+            # Save file
+            sf.write(
+                file=filename,
+                samplerate=sampling_rate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+        gc.collect()
+        torch.cuda.empty_cache()
+    try:
+        del tokenizer
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+# =====================================
+# Coqui XTTS
+# =====================================
+def coqui_xtts_voices_list():
+    main_folder = "_XTTS_"
+    pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
+    pattern_automatic_speaker = re.compile(r"AUTOMATIC_SPEAKER_\d+\.wav$")
+    # List only files in the directory matching the pattern but not matching
+    # AUTOMATIC_SPEAKER_00.wav, AUTOMATIC_SPEAKER_01.wav, etc.
+    wav_voices = [
+        "_XTTS_/" + f
+        for f in os.listdir(main_folder)
+        if os.path.isfile(os.path.join(main_folder, f))
+        and pattern_coqui.match(f)
+        and not pattern_automatic_speaker.match(f)
+    ]
+    return ["_XTTS_/AUTOMATIC.wav"] + wav_voices
+def seconds_to_hhmmss_ms(seconds):
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    milliseconds = int((seconds - int(seconds)) * 1000)
+    return "%02d:%02d:%02d.%03d" % (hours, minutes, int(seconds), milliseconds)
+def audio_trimming(audio_path, destination, start, end):
+    if isinstance(start, (int, float)):
+        start = seconds_to_hhmmss_ms(start)
+    if isinstance(end, (int, float)):
+        end = seconds_to_hhmmss_ms(end)
+    if destination:
+        file_directory = destination
+    else:
+        file_directory = os.path.dirname(audio_path)
+    file_name = os.path.splitext(os.path.basename(audio_path))[0]
+    file_ = f"{file_name}_trim.wav"
+    # file_ = f'{os.path.splitext(audio_path)[0]}_trim.wav'
+    output_path = os.path.join(file_directory, file_)
+    # -t (duration from -ss) | -to (time stop) | -af silenceremove=1:0:-50dB (remove silence)
+    command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ss {start} -to {end} -acodec pcm_s16le -f wav "{output_path}"'
+    run_command(command)
+    return output_path
+def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""):
+    if destination:
+        file_directory = destination
+    else:
+        file_directory = os.path.dirname(audio_path)
+    file_name = os.path.splitext(os.path.basename(audio_path))[0]
+    file_ = f"{file_name}_good_sample.wav"
+    # file_ = f'{os.path.splitext(audio_path)[0]}_good_sample.wav'
+    mono_path = os.path.join(file_directory, file_)  # get root
+    command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 1 -ar 22050 -sample_fmt s16 -f wav "{mono_path}"'
+    run_command(command)
+    return mono_path
+def sanitize_file_name(file_name):
+    import unicodedata
+    # Normalize the string to NFKD form to separate combined characters into
+    # base characters and diacritics
+    normalized_name = unicodedata.normalize("NFKD", file_name)
+    # Replace any non-ASCII characters or special symbols with an underscore
+    sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name)
+    return sanitized_name
+def create_wav_file_vc(
+    sample_name="",  # name final file
+    audio_wav="",  # path
+    start=None,  # trim start
+    end=None,  # trim end
+    output_final_path="_XTTS_",
+    get_vocals_dereverb=True,
+):
+    sample_name = sample_name if sample_name else "default_name"
+    sample_name = sanitize_file_name(sample_name)
+    audio_wav = audio_wav if isinstance(audio_wav, str) else audio_wav.name
+    BASE_DIR = (
+        "."  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    )
+    output_dir = os.path.join(BASE_DIR, "clean_song_output")  # remove content
+    # remove_directory_contents(output_dir)
+    if start or end:
+        # Cut file
+        audio_segment = audio_trimming(audio_wav, output_dir, start, end)
+    else:
+        # Complete file
+        audio_segment = audio_wav
+    from .mdx_net import process_uvr_task
+    try:
+        _, _, _, _, audio_segment = process_uvr_task(
+            orig_song_path=audio_segment,
+            main_vocals=True,
+            dereverb=get_vocals_dereverb,
+        )
+    except Exception as error:
+        logger.error(str(error))
+    sample = convert_to_xtts_good_sample(audio_segment)
+    sample_name = f"{sample_name}.wav"
+    sample_rename = rename_file(sample, sample_name)
+    copy_files(sample_rename, output_final_path)
+    final_sample = os.path.join(output_final_path, sample_name)
+    if os.path.exists(final_sample):
+        logger.info(final_sample)
+        return final_sample
+    else:
+        raise Exception(f"Error wav: {final_sample}")
+def create_new_files_for_vc(
+    speakers_coqui,
+    segments_base,
+    dereverb_automatic=True
+):
+    # before function delete automatic delete_previous_automatic
+    output_dir = os.path.join(".", "clean_song_output")  # remove content
+    remove_directory_contents(output_dir)
+    for speaker in speakers_coqui:
+        filtered_speaker = [
+            segment
+            for segment in segments_base
+            if segment["speaker"] == speaker
+        ]
+        if len(filtered_speaker) > 4:
+            filtered_speaker = filtered_speaker[1:]
+        if filtered_speaker[0]["tts_name"] == "_XTTS_/AUTOMATIC.wav":
+            name_automatic_wav = f"AUTOMATIC_{speaker}"
+            if os.path.exists(f"_XTTS_/{name_automatic_wav}.wav"):
+                logger.info(f"WAV automatic {speaker} exists")
+                # path_wav = path_automatic_wav
+                pass
+            else:
+                # create wav
+                wav_ok = False
+                for seg in filtered_speaker:
+                    duration = float(seg["end"]) - float(seg["start"])
+                    if duration > 7.0 and duration < 12.0:
+                        logger.info(
+                            f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
+                        )
+                        create_wav_file_vc(
+                            sample_name=name_automatic_wav,
+                            audio_wav="audio.wav",
+                            start=(float(seg["start"]) + 1.0),
+                            end=(float(seg["end"]) - 1.0),
+                            get_vocals_dereverb=dereverb_automatic,
+                        )
+                        wav_ok = True
+                        break
+                if not wav_ok:
+                    logger.info("Taking the first segment")
+                    seg = filtered_speaker[0]
+                    logger.info(
+                        f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
+                    )
+                    max_duration = float(seg["end"]) - float(seg["start"])
+                    max_duration = max(2.0, min(max_duration, 9.0))
+                    create_wav_file_vc(
+                        sample_name=name_automatic_wav,
+                        audio_wav="audio.wav",
+                        start=(float(seg["start"])),
+                        end=(float(seg["start"]) + max_duration),
+                        get_vocals_dereverb=dereverb_automatic,
+                    )
+def segments_coqui_tts(
+    filtered_coqui_segments,
+    TRANSLATE_AUDIO_TO,
+    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
+    speakers_coqui=None,
+    delete_previous_automatic=True,
+    dereverb_automatic=True,
+    emotion=None,
+):
+    """XTTS
+    Install:
+    pip install -q TTS==0.21.1
+    pip install -q numpy==1.23.5
+    Notes:
+    - tts_name is the wav|mp3|ogg|m4a file for VC
+    """
+    from TTS.api import TTS
+    TRANSLATE_AUDIO_TO = fix_code_language(TRANSLATE_AUDIO_TO, syntax="coqui")
+    supported_lang_coqui = [
+        "zh-cn",
+        "en",
+        "fr",
+        "de",
+        "it",
+        "pt",
+        "pl",
+        "tr",
+        "ru",
+        "nl",
+        "cs",
+        "ar",
+        "es",
+        "hu",
+        "ko",
+        "ja",
+    ]
+    if TRANSLATE_AUDIO_TO not in supported_lang_coqui:
+        raise TTS_OperationError(
+            f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS"
+        )
+    # Emotion and speed can only be used with Coqui Studio models. discontinued
+    # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"]
+    if delete_previous_automatic:
+        for spk in speakers_coqui:
+            remove_files(f"_XTTS_/AUTOMATIC_{spk}.wav")
+    directory_audios_vc = "_XTTS_"
+    create_directories(directory_audios_vc)
+    create_new_files_for_vc(
+        speakers_coqui,
+        filtered_coqui_segments["segments"],
+        dereverb_automatic,
+    )
+    # Init TTS
+    device = os.environ.get("QUANTUM_DEVICE")
+    model = TTS(model_id_coqui).to(device)
+    sampling_rate = 24000
+    # filtered_segments = filtered_coqui_segments['segments']
+    # Sorting the segments by 'tts_name'
+    # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name'])
+    # logger.debug(sorted_segments)
+    for segment in tqdm(filtered_coqui_segments["segments"]):
+        speaker = segment["speaker"]
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+        if tts_name == "_XTTS_/AUTOMATIC.wav":
+            tts_name = f"_XTTS_/AUTOMATIC_{speaker}.wav"
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Infer
+            wav = model.tts(
+                text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO
+            )
+            data_tts = pad_array(
+                wav,
+                sampling_rate,
+            )
+            # Save file
+            sf.write(
+                file=filename,
+                samplerate=sampling_rate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+        gc.collect()
+        torch.cuda.empty_cache()
+    try:
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+# =====================================
+# PIPER TTS
+# =====================================
+def piper_tts_voices_list():
+    file_path = download_manager(
+        url="https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json",
+        path="./PIPER_MODELS",
+    )
+    with open(file_path, "r", encoding="utf8") as file:
+        data = json.load(file)
+    piper_id_models = [key + " VITS-onnx" for key in data.keys()]
+    return piper_id_models
+def replace_text_in_json(file_path, key_to_replace, new_text, condition=None):
+    # Read the JSON file
+    with open(file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+    # Modify the specified key's value with the new text
+    if key_to_replace in data:
+        if condition:
+            value_condition = condition
+        else:
+            value_condition = data[key_to_replace]
+        if data[key_to_replace] == value_condition:
+            data[key_to_replace] = new_text
+    # Write the modified content back to the JSON file
+    with open(file_path, "w") as file:
+        json.dump(
+            data, file, indent=2
+        )  # Write the modified data back to the file with indentation for readability
+def load_piper_model(
+    model: str,
+    data_dir: list,
+    download_dir: str = "",
+    update_voices: bool = False,
+):
+    from piper import PiperVoice
+    from piper.download import ensure_voice_exists, find_voice, get_voices
+    try:
+        import onnxruntime as rt
+        if rt.get_device() == "GPU" and os.environ.get("QUANTUM_DEVICE") == "cuda":
+            logger.debug("onnxruntime device > GPU")
+            cuda = True
+        else:
+            logger.info(
+                "onnxruntime device > CPU"
+            )  # try pip install onnxruntime-gpu
+            cuda = False
+    except Exception as error:
+        raise TTS_OperationError(f"onnxruntime error: {str(error)}")
+    # Disable CUDA in Windows
+    if platform.system() == "Windows":
+        logger.info("Employing CPU exclusivity with Piper TTS")
+        cuda = False
+    if not download_dir:
+        # Download to first data directory by default
+        download_dir = data_dir[0]
+    else:
+        data_dir = [os.path.join(data_dir[0], download_dir)]
+    # Download voice if file doesn't exist
+    model_path = Path(model)
+    if not model_path.exists():
+        # Load voice info
+        voices_info = get_voices(download_dir, update_voices=update_voices)
+        # Resolve aliases for backwards compatibility with old voice names
+        aliases_info: Dict[str, Any] = {}
+        for voice_info in voices_info.values():
+            for voice_alias in voice_info.get("aliases", []):
+                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
+        voices_info.update(aliases_info)
+        ensure_voice_exists(model, data_dir, download_dir, voices_info)
+        model, config = find_voice(model, data_dir)
+        replace_text_in_json(
+            config, "phoneme_type", "espeak", "PhonemeType.ESPEAK"
+        )
+    # Load voice
+    voice = PiperVoice.load(model, config_path=config, use_cuda=cuda)
+    return voice
+def synthesize_text_to_audio_np_array(voice, text, synthesize_args):
+    audio_stream = voice.synthesize_stream_raw(text, **synthesize_args)
+    # Collect the audio bytes into a single NumPy array
+    audio_data = b""
+    for audio_bytes in audio_stream:
+        audio_data += audio_bytes
+    # Ensure correct data type and convert audio bytes to NumPy array
+    audio_np = np.frombuffer(audio_data, dtype=np.int16)
+    return audio_np
+def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO):
+    """
+    Install:
+    pip install -q piper-tts==1.2.0 onnxruntime-gpu # for cuda118
+    """
+    data_dir = [
+        str(Path.cwd())
+    ]  # "Data directory to check for downloaded models (default: current directory)"
+    download_dir = "PIPER_MODELS"
+    # model_name = "en_US-lessac-medium" tts_name in a dict like VITS
+    update_voices = True  # "Download latest voices.json during startup",
+    synthesize_args = {
+        "speaker_id": None,
+        "length_scale": 1.0,
+        "noise_scale": 0.667,
+        "noise_w": 0.8,
+        "sentence_silence": 0.0,
+    }
+    filtered_segments = filtered_onnx_vits_segments["segments"]
+    # Sorting the segments by 'tts_name'
+    sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"])
+    logger.debug(sorted_segments)
+    model_name_key = None
+    for segment in tqdm(sorted_segments):
+        speaker = segment["speaker"] # noqa
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"].replace(" VITS-onnx", "")
+        if tts_name != model_name_key:
+            model_name_key = tts_name
+            model = load_piper_model(
+                tts_name, data_dir, download_dir, update_voices
+            )
+            sampling_rate = model.config.sample_rate
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Infer
+            speech_output = synthesize_text_to_audio_np_array(
+                model, text, synthesize_args
+            )
+            data_tts = pad_array(
+                speech_output,  # .cpu().numpy().squeeze().astype(np.float32),
+                sampling_rate,
+            )
+            # Save file
+            sf.write(
+                file=filename,
+                samplerate=sampling_rate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+        gc.collect()
+        torch.cuda.empty_cache()
+    try:
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+# =====================================
+# OpenAI TTS
+# =====================================
+def segments_openai_tts(
+    filtered_openai_tts_segments, TRANSLATE_AUDIO_TO
+):
+    from openai import OpenAI
+    client = OpenAI()
+    sampling_rate = 24000
+    # filtered_segments = filtered_openai_tts_segments['segments']
+    # Sorting the segments by 'tts_name'
+    # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name'])
+    for segment in tqdm(filtered_openai_tts_segments["segments"]):
+        speaker = segment["speaker"] # noqa
+        text = segment["text"].strip()
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Request
+            response = client.audio.speech.create(
+                model="tts-1-hd" if "HD" in tts_name else "tts-1",
+                voice=tts_name.split()[0][1:],
+                response_format="wav",
+                input=text
+            )
+            audio_bytes = b''
+            for data in response.iter_bytes(chunk_size=4096):
+                audio_bytes += data
+            speech_output = np.frombuffer(audio_bytes, dtype=np.int16)
+            # Save file
+            data_tts = pad_array(
+                speech_output[240:],
+                sampling_rate,
+            )
+            sf.write(
+                file=filename,
+                samplerate=sampling_rate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis",
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+# =====================================
+# Select task TTS
+# =====================================
+def find_spkr(pattern, speaker_to_voice, segments):
+    return [
+        speaker
+        for speaker, voice in speaker_to_voice.items()
+        if pattern.match(voice) and any(
+            segment["speaker"] == speaker for segment in segments
+        )
+    ]
+def filter_by_speaker(speakers, segments):
+    return {
+        "segments": [
+            segment
+            for segment in segments
+            if segment["speaker"] in speakers
+        ]
+    }
+def audio_segmentation_to_voice(
+    result_diarize,
+    TRANSLATE_AUDIO_TO,
+    is_gui,
+    tts_voice00,
+    tts_voice01="",
+    tts_voice02="",
+    tts_voice03="",
+    tts_voice04="",
+    tts_voice05="",
+    tts_voice06="",
+    tts_voice07="",
+    tts_voice08="",
+    tts_voice09="",
+    tts_voice10="",
+    tts_voice11="",
+    dereverb_automatic=True,
+    model_id_bark="suno/bark-small",
+    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
+    delete_previous_automatic=True,
+):
+    remove_directory_contents("audio")
+    # Mapping speakers to voice variables
+    speaker_to_voice = {
+        "SPEAKER_00": tts_voice00,
+        "SPEAKER_01": tts_voice01,
+        "SPEAKER_02": tts_voice02,
+        "SPEAKER_03": tts_voice03,
+        "SPEAKER_04": tts_voice04,
+        "SPEAKER_05": tts_voice05,
+        "SPEAKER_06": tts_voice06,
+        "SPEAKER_07": tts_voice07,
+        "SPEAKER_08": tts_voice08,
+        "SPEAKER_09": tts_voice09,
+        "SPEAKER_10": tts_voice10,
+        "SPEAKER_11": tts_voice11,
+    }
+    # Assign 'SPEAKER_00' to segments without a 'speaker' key
+    for segment in result_diarize["segments"]:
+        if "speaker" not in segment:
+            segment["speaker"] = "SPEAKER_00"
+            logger.warning(
+                "NO SPEAKER DETECT IN SEGMENT: First TTS will be used in the"
+                f" segment time {segment['start'], segment['text']}"
+            )
+        # Assign the TTS name
+        segment["tts_name"] = speaker_to_voice[segment["speaker"]]
+    # Find TTS method
+    pattern_edge = re.compile(r".*-(Male|Female)$")
+    pattern_bark = re.compile(r".* BARK$")
+    pattern_vits = re.compile(r".* VITS$")
+    pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
+    pattern_vits_onnx = re.compile(r".* VITS-onnx$")
+    pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
+    all_segments = result_diarize["segments"]
+    speakers_edge = find_spkr(pattern_edge, speaker_to_voice, all_segments)
+    speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
+    speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
+    speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
+    speakers_vits_onnx = find_spkr(
+        pattern_vits_onnx, speaker_to_voice, all_segments
+    )
+    speakers_openai_tts = find_spkr(
+        pattern_openai_tts, speaker_to_voice, all_segments
+    )
+    # Filter method in segments
+    filtered_edge = filter_by_speaker(speakers_edge, all_segments)
+    filtered_bark = filter_by_speaker(speakers_bark, all_segments)
+    filtered_vits = filter_by_speaker(speakers_vits, all_segments)
+    filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
+    filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
+    filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
+    # Infer
+    if filtered_edge["segments"]:
+        logger.info(f"EDGE TTS: {speakers_edge}")
+        segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui)  # mp3
+    if filtered_bark["segments"]:
+        logger.info(f"BARK TTS: {speakers_bark}")
+        segments_bark_tts(
+            filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark
+        )  # wav
+    if filtered_vits["segments"]:
+        logger.info(f"VITS TTS: {speakers_vits}")
+        segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO)  # wav
+    if filtered_coqui["segments"]:
+        logger.info(f"Coqui TTS: {speakers_coqui}")
+        segments_coqui_tts(
+            filtered_coqui,
+            TRANSLATE_AUDIO_TO,
+            model_id_coqui,
+            speakers_coqui,
+            delete_previous_automatic,
+            dereverb_automatic,
+        )  # wav
+    if filtered_vits_onnx["segments"]:
+        logger.info(f"PIPER TTS: {speakers_vits_onnx}")
+        segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO)  # wav
+    if filtered_openai_tts["segments"]:
+        logger.info(f"OpenAI TTS: {speakers_openai_tts}")
+        segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO)  # wav
+    [result.pop("tts_name", None) for result in result_diarize["segments"]]
+    return [
+        speakers_edge,
+        speakers_bark,
+        speakers_vits,
+        speakers_coqui,
+        speakers_vits_onnx,
+        speakers_openai_tts
+    ]
+def accelerate_segments(
+    result_diarize,
+    max_accelerate_audio,
+    valid_speakers,
+    acceleration_rate_regulation=False,
+    folder_output="audio2",
+):
+    logger.info("Apply acceleration")
+    (
+        speakers_edge,
+        speakers_bark,
+        speakers_vits,
+        speakers_coqui,
+        speakers_vits_onnx,
+        speakers_openai_tts
+    ) = valid_speakers
+    create_directories(f"{folder_output}/audio/")
+    remove_directory_contents(f"{folder_output}/audio/")
+    audio_files = []
+    speakers_list = []
+    max_count_segments_idx = len(result_diarize["segments"]) - 1
+    for i, segment in tqdm(enumerate(result_diarize["segments"])):
+        text = segment["text"] # noqa
+        start = segment["start"]
+        end = segment["end"]
+        speaker = segment["speaker"]
+        # find name audio
+        # if speaker in speakers_edge:
+        filename = f"audio/{start}.ogg"
+        # elif speaker in speakers_bark + speakers_vits + speakers_coqui + speakers_vits_onnx:
+        #    filename = f"audio/{start}.wav" # wav
+        # duration
+        duration_true = end - start
+        duration_tts = librosa.get_duration(filename=filename)
+        # Accelerate percentage
+        acc_percentage = duration_tts / duration_true
+        # Smoth
+        if acceleration_rate_regulation and acc_percentage >= 1.3:
+            try:
+                next_segment = result_diarize["segments"][
+                    min(max_count_segments_idx, i + 1)
+                ]
+                next_start = next_segment["start"]
+                next_speaker = next_segment["speaker"]
+                duration_with_next_start = next_start - start
+                if duration_with_next_start > duration_true:
+                    extra_time = duration_with_next_start - duration_true
+                    if speaker == next_speaker:
+                        # half
+                        smoth_duration = duration_true + (extra_time * 0.5)
+                    else:
+                        # 7/10
+                        smoth_duration = duration_true + (extra_time * 0.7)
+                    logger.debug(
+                        f"Base acc: {acc_percentage}, "
+                        f"smoth acc: {duration_tts / smoth_duration}"
+                    )
+                    acc_percentage = max(1.2, (duration_tts / smoth_duration))
+            except Exception as error:
+                logger.error(str(error))
+        if acc_percentage > max_accelerate_audio:
+            acc_percentage = max_accelerate_audio
+        elif acc_percentage <= 1.15 and acc_percentage >= 0.8:
+            acc_percentage = 1.0
+        elif acc_percentage <= 0.79:
+            acc_percentage = 0.8
+        # Round
+        acc_percentage = round(acc_percentage + 0.0, 1)
+        # Format read if need
+        if speaker in speakers_edge:
+            info_enc = sf.info(filename).format
+        else:
+            info_enc = "OGG"
+        # Apply aceleration or opposite to the audio file in folder_output folder
+        if acc_percentage == 1.0 and info_enc == "OGG":
+            copy_files(filename, f"{folder_output}{os.sep}audio")
+        else:
+            os.system(
+                f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={acc_percentage} {folder_output}/{filename}"
+            )
+        if logger.isEnabledFor(logging.DEBUG):
+            duration_create = librosa.get_duration(
+                filename=f"{folder_output}/{filename}"
+            )
+            logger.debug(
+                f"acc_percen is {acc_percentage}, tts duration "
+                f"is {duration_tts}, new duration is {duration_create}"
+                f", for {filename}"
+            )
+        audio_files.append(f"{folder_output}/{filename}")
+        speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1)
+        speakers_list.append(speaker)
+    return audio_files, speakers_list
+# =====================================
+# Tone color converter
+# =====================================
+def se_process_audio_segments(
+    source_seg, tone_color_converter, device, remove_previous_processed=True
+):
+    # list wav seg
+    source_audio_segs = glob.glob(f"{source_seg}/*.wav")
+    if not source_audio_segs:
+        raise ValueError(
+            f"No audio segments found in {str(source_audio_segs)}"
+        )
+    source_se_path = os.path.join(source_seg, "se.pth")
+    # if exist not create wav
+    if os.path.isfile(source_se_path):
+        se = torch.load(source_se_path).to(device)
+        logger.debug(f"Previous created {source_se_path}")
+    else:
+        se = tone_color_converter.extract_se(source_audio_segs, source_se_path)
+    return se
+def create_wav_vc(
+    valid_speakers,
+    segments_base,
+    audio_name,
+    max_segments=10,
+    target_dir="processed",
+    get_vocals_dereverb=False,
+):
+    # valid_speakers = list({item['speaker'] for item in segments_base})
+    # Before function delete automatic delete_previous_automatic
+    output_dir = os.path.join(".", target_dir)  # remove content
+    # remove_directory_contents(output_dir)
+    path_source_segments = []
+    path_target_segments = []
+    for speaker in valid_speakers:
+        filtered_speaker = [
+            segment
+            for segment in segments_base
+            if segment["speaker"] == speaker
+        ]
+        if len(filtered_speaker) > 4:
+            filtered_speaker = filtered_speaker[1:]
+        dir_name_speaker = speaker + audio_name
+        dir_name_speaker_tts = "tts" + speaker + audio_name
+        dir_path_speaker = os.path.join(output_dir, dir_name_speaker)
+        dir_path_speaker_tts = os.path.join(output_dir, dir_name_speaker_tts)
+        create_directories([dir_path_speaker, dir_path_speaker_tts])
+        path_target_segments.append(dir_path_speaker)
+        path_source_segments.append(dir_path_speaker_tts)
+        # create wav
+        max_segments_count = 0
+        for seg in filtered_speaker:
+            duration = float(seg["end"]) - float(seg["start"])
+            if duration > 3.0 and duration < 18.0:
+                logger.info(
+                    f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
+                )
+                name_new_wav = str(seg["start"])
+                check_segment_audio_target_file = os.path.join(
+                    dir_path_speaker, f"{name_new_wav}.wav"
+                )
+                if os.path.exists(check_segment_audio_target_file):
+                    logger.debug(
+                        "Segment vc source exists: "
+                        f"{check_segment_audio_target_file}"
+                    )
+                    pass
+                else:
+                    create_wav_file_vc(
+                        sample_name=name_new_wav,
+                        audio_wav="audio.wav",
+                        start=(float(seg["start"]) + 1.0),
+                        end=(float(seg["end"]) - 1.0),
+                        output_final_path=dir_path_speaker,
+                        get_vocals_dereverb=get_vocals_dereverb,
+                    )
+                    file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg"
+                    # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts)
+                    convert_to_xtts_good_sample(
+                        file_name_tts, dir_path_speaker_tts
+                    )
+                max_segments_count += 1
+                if max_segments_count == max_segments:
+                    break
+        if max_segments_count == 0:
+            logger.info("Taking the first segment")
+            seg = filtered_speaker[0]
+            logger.info(
+                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
+            )
+            max_duration = float(seg["end"]) - float(seg["start"])
+            max_duration = max(1.0, min(max_duration, 18.0))
+            name_new_wav = str(seg["start"])
+            create_wav_file_vc(
+                sample_name=name_new_wav,
+                audio_wav="audio.wav",
+                start=(float(seg["start"])),
+                end=(float(seg["start"]) + max_duration),
+                output_final_path=dir_path_speaker,
+                get_vocals_dereverb=get_vocals_dereverb,
+            )
+            file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg"
+            # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts)
+            convert_to_xtts_good_sample(file_name_tts, dir_path_speaker_tts)
+    logger.debug(f"Base: {str(path_source_segments)}")
+    logger.debug(f"Target: {str(path_target_segments)}")
+    return path_source_segments, path_target_segments
+def toneconverter_openvoice(
+    result_diarize,
+    preprocessor_max_segments,
+    remove_previous_process=True,
+    get_vocals_dereverb=False,
+    model="openvoice",
+):
+    """
+    Convert the tone color of the dubbed segments with OpenVoice.
+    Parameters:
+    - result_diarize: Mapping with a "segments" list; each segment is read
+        for its "speaker" and "start" keys.
+    - preprocessor_max_segments: Forwarded to `create_wav_vc` as
+        `max_segments`.
+    - remove_previous_process (bool): When true, the contents of the
+        "processed" directory are removed before the reference clips are
+        created.
+    - get_vocals_dereverb (bool): Forwarded to `create_wav_vc`.
+    - model (str): If it contains "v2" the OpenVoiceV2 converter checkpoint
+        is used, otherwise the v1 converter.
+    Notes:
+    - Every "audio2/audio/<start>.ogg" file is converted and overwritten in
+        place; nothing is returned.
+    - The device string is read from the QUANTUM_DEVICE environment variable.
+    """
+    audio_path = "audio.wav"
+    # se_path = "se.pth"
+    target_dir = "processed"
+    create_directories(target_dir)
+    # Imported lazily so the module loads without openvoice installed
+    from openvoice import se_extractor
+    from openvoice.api import ToneColorConverter
+    # Name used for the per-speaker directories: "<audio stem>_<hash>"
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}"
+    # se_path = os.path.join(target_dir, audio_name, 'se.pth')
+    # create wav seg original and target
+    # Unique speaker labels; built from a set, so the order is not guaranteed
+    valid_speakers = list(
+        {item["speaker"] for item in result_diarize["segments"]}
+    )
+    logger.info("Openvoice preprocessor...")
+    if remove_previous_process:
+        remove_directory_contents(target_dir)
+    path_source_segments, path_target_segments = create_wav_vc(
+        valid_speakers,
+        result_diarize["segments"],
+        audio_name,
+        max_segments=preprocessor_max_segments,
+        get_vocals_dereverb=get_vocals_dereverb,
+    )
+    logger.info("Openvoice loading model...")
+    model_path_openvoice = "./OPENVOICE_MODELS"
+    url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter"
+    if "v2" in model:
+        model_path = os.path.join(model_path_openvoice, "v2")
+        # The V2 repository keeps the converter outside "checkpoints/"
+        url_model_openvoice = url_model_openvoice.replace(
+            "OpenVoice", "OpenVoiceV2"
+        ).replace("checkpoints/", "")
+    else:
+        model_path = os.path.join(model_path_openvoice, "v1")
+    create_directories(model_path)
+    config_url = f"{url_model_openvoice}/config.json"
+    checkpoint_url = f"{url_model_openvoice}/checkpoint.pth"
+    config_path = download_manager(url=config_url, path=model_path)
+    checkpoint_path = download_manager(
+        url=checkpoint_url, path=model_path
+    )
+    # NOTE(review): device is None when QUANTUM_DEVICE is unset - confirm
+    # that ToneColorConverter accepts that.
+    device = os.environ.get("QUANTUM_DEVICE")
+    tone_color_converter = ToneColorConverter(config_path, device=device)
+    tone_color_converter.load_ckpt(checkpoint_path)
+    logger.info("Openvoice tone color converter:")
+    global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress")
+    # The two path lists are paired with valid_speakers by position
+    for source_seg, target_seg, speaker in zip(
+        path_source_segments, path_target_segments, valid_speakers
+    ):
+        # source_se_path = os.path.join(source_seg, 'se.pth')
+        source_se = se_process_audio_segments(source_seg, tone_color_converter, device)
+        # target_se_path = os.path.join(target_seg, 'se.pth')
+        target_se = se_process_audio_segments(target_seg, tone_color_converter, device)
+        # Iterate through the segments of this speaker
+        encode_message = "@MyShell"
+        filtered_speaker = [
+            segment
+            for segment in result_diarize["segments"]
+            if segment["speaker"] == speaker
+        ]
+        for seg in filtered_speaker:
+            # Input and output are the same file: the dubbed clip is replaced
+            src_path = (
+                save_path
+            ) = f"audio2/audio/{str(seg['start'])}.ogg"  # overwrite
+            logger.debug(f"{src_path}")
+            tone_color_converter.convert(
+                audio_src_path=src_path,
+                src_se=source_se,
+                tgt_se=target_se,
+                output_path=save_path,
+                message=encode_message,
+            )
+            global_progress_bar.update(1)
+    global_progress_bar.close()
+    # Drop the model and release cached GPU memory; errors are only logged
+    try:
+        del tone_color_converter
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+def toneconverter_freevc(
+    result_diarize,
+    remove_previous_process=True,
+    get_vocals_dereverb=False,
+):
+    """
+    Convert the voice of the dubbed segments with the Coqui FreeVC model.
+    Parameters:
+    - result_diarize: Mapping with a "segments" list; each segment is read
+        for its "speaker" and "start" keys.
+    - remove_previous_process (bool): When true, the contents of the
+        "processed" directory are removed before the reference clips are
+        created.
+    - get_vocals_dereverb (bool): Forwarded to `create_wav_vc`.
+    Notes:
+    - One reference clip per speaker is requested (`max_segments=1`).
+    - Every "audio2/audio/<start>.ogg" file is converted and overwritten in
+        place; nothing is returned.
+    - If the model cannot be loaded the error is logged and the function
+        returns without converting anything.
+    """
+    audio_path = "audio.wav"
+    target_dir = "processed"
+    create_directories(target_dir)
+    # Only the hashing helper of openvoice is used here
+    from openvoice import se_extractor
+    # Name used for the per-speaker directories: "<audio stem>_<hash>"
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}"
+    # create wav seg; original is target and dubbing is source
+    # Unique speaker labels; built from a set, so the order is not guaranteed
+    valid_speakers = list(
+        {item["speaker"] for item in result_diarize["segments"]}
+    )
+    logger.info("FreeVC preprocessor...")
+    if remove_previous_process:
+        remove_directory_contents(target_dir)
+    path_source_segments, path_target_segments = create_wav_vc(
+        valid_speakers,
+        result_diarize["segments"],
+        audio_name,
+        max_segments=1,
+        get_vocals_dereverb=get_vocals_dereverb,
+    )
+    logger.info("FreeVC loading model...")
+    device_id = os.environ.get("QUANTUM_DEVICE")
+    # "cpu" is mapped to None before calling .to()
+    # NOTE(review): presumably .to(None) keeps the model on CPU - confirm
+    device = None if device_id == "cpu" else device_id
+    try:
+        from TTS.api import TTS
+        tts = TTS(
+            model_name="voice_conversion_models/multilingual/vctk/freevc24",
+            progress_bar=False
+        ).to(device)
+    except Exception as error:
+        logger.error(str(error))
+        logger.error("Error loading the FreeVC model.")
+        return
+    logger.info("FreeVC process:")
+    global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress")
+    # The two path lists are paired with valid_speakers by position;
+    # source_seg is not used by FreeVC
+    for source_seg, target_seg, speaker in zip(
+        path_source_segments, path_target_segments, valid_speakers
+    ):
+        filtered_speaker = [
+            segment
+            for segment in result_diarize["segments"]
+            if segment["speaker"] == speaker
+        ]
+        files_and_directories = os.listdir(target_seg)
+        wav_files = [file for file in files_and_directories if file.endswith(".wav")]
+        # NOTE(review): raises IndexError if the directory holds no .wav file
+        original_wav_audio_segment = os.path.join(target_seg, wav_files[0])
+        for seg in filtered_speaker:
+            # Input and output are the same file: the dubbed clip is replaced
+            src_path = (
+                  save_path
+              ) = f"audio2/audio/{str(seg['start'])}.ogg"  # overwrite
+            logger.debug(f"{src_path} - {original_wav_audio_segment}")
+            wav = tts.voice_conversion(
+                source_wav=src_path,
+                target_wav=original_wav_audio_segment,
+            )
+            sf.write(
+                file=save_path,
+                samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
+                data=wav,
+                format="ogg",
+                subtype="vorbis",
+            )
+            global_progress_bar.update(1)
+    global_progress_bar.close()
+    # Drop the model and release cached GPU memory; errors are only logged
+    try:
+        del tts
+        gc.collect()
+        torch.cuda.empty_cache()
+    except Exception as error:
+        logger.error(str(error))
+        gc.collect()
+        torch.cuda.empty_cache()
+def toneconverter(
+    result_diarize,
+    preprocessor_max_segments,
+    remove_previous_process=True,
+    get_vocals_dereverb=False,
+    method_vc="freevc"
+):
+    if method_vc == "freevc":
+        if preprocessor_max_segments > 1:
+            logger.info("FreeVC only uses one segment.")
+        return toneconverter_freevc(
+                    result_diarize,
+                    remove_previous_process=remove_previous_process,
+                    get_vocals_dereverb=get_vocals_dereverb,
+                )
+    elif "openvoice" in method_vc:
+        return toneconverter_openvoice(
+                    result_diarize,
+                    preprocessor_max_segments,
+                    remove_previous_process=remove_previous_process,
+                    get_vocals_dereverb=get_vocals_dereverb,
+                    model=method_vc,
+                )
+# Manual smoke run: synthesize English voices for a diarization result that
+# is imported from a local `segments` module.
+if __name__ == "__main__":
+    from segments import result_diarize
+    audio_segmentation_to_voice(
+        result_diarize,
+        TRANSLATE_AUDIO_TO="en",
+        max_accelerate_audio=2.1,
+        is_gui=True,
+        tts_voice00="en-facebook-mms VITS",
+        tts_voice01="en-CA-ClaraNeural-Female",
+        tts_voice02="en-GB-ThomasNeural-Male",
+        tts_voice03="en-GB-SoniaNeural-Female",
+        tts_voice04="en-NZ-MitchellNeural-Male",
+        tts_voice05="en-GB-MaisieNeural-Female",
+    )

quantum_dubbing/translate_segments.py ADDED Viewed

	@@ -0,0 +1,457 @@

+from tqdm import tqdm
+from deep_translator import GoogleTranslator
+from itertools import chain
+import copy
+from .language_configuration import fix_code_language, INVERTED_LANGUAGES
+from .logging_setup import logger
+import re
+import json
+import time
+# Translation process names; `translate_text` dispatches on these values.
+# A "_batch" suffix selects the chunked/batched variant of the same backend.
+TRANSLATION_PROCESS_OPTIONS = [
+    "google_translator_batch",
+    "google_translator",
+    "gpt-3.5-turbo-0125_batch",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-turbo-preview_batch",
+    "gpt-4-turbo-preview",
+    "disable_translation",
+]
+# Same list without the "_batch" variants.
+# NOTE(review): presumably the options offered for document translation.
+DOCS_TRANSLATION_PROCESS_OPTIONS = [
+    "google_translator",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-turbo-preview",
+    "disable_translation",
+]
+def translate_iterative(segments, target, source=None):
+    """
+    Translate text segments individually to the specified language.
+    Parameters:
+    - segments (list): A list of dictionaries with 'text' as a key for
+        segment text.
+    - target (str): Target language code.
+    - source (str, optional): Source language code. Defaults to None.
+    Returns:
+    - list: Translated text segments in the target language.
+    Notes:
+    - Translates each segment using Google Translate.
+    Example:
+    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
+    translated_segments = translate_iterative(segments, 'es')
+    """
+    segments_ = copy.deepcopy(segments)
+    if (
+        not source
+    ):
+        logger.debug("No source language")
+        source = "auto"
+    translator = GoogleTranslator(source=source, target=target)
+    for line in tqdm(range(len(segments_))):
+        text = segments_[line]["text"]
+        translated_line = translator.translate(text.strip())
+        segments_[line]["text"] = translated_line
+    return segments_
+def verify_translate(
+    segments,
+    segments_copy,
+    translated_lines,
+    target,
+    source
+):
+    """
+    Verify integrity and translate segments if lengths match, otherwise
+    switch to iterative translation.
+    """
+    if len(segments) == len(translated_lines):
+        for line in range(len(segments_copy)):
+            logger.debug(
+                f"{segments_copy[line]['text']} >> "
+                f"{translated_lines[line].strip()}"
+            )
+            segments_copy[line]["text"] = translated_lines[
+                line].replace("\t", "").replace("\n", "").strip()
+        return segments_copy
+    else:
+        logger.error(
+            "The translation failed, switching to google_translate iterative. "
+            f"{len(segments), len(translated_lines)}"
+        )
+        return translate_iterative(segments, target, source)
+def translate_batch(segments, target, chunk_size=2000, source=None):
+    """
+    Translate a batch of text segments into the specified language in chunks,
+        respecting the character limit.
+    Parameters:
+    - segments (list): List of dictionaries with 'text' as a key for segment
+        text.
+    - target (str): Target language code.
+    - chunk_size (int, optional): Maximum character limit for each translation
+        chunk (default is 2000; max 5000).
+    - source (str, optional): Source language code. Defaults to None.
+    Returns:
+    - list: Translated text segments in the target language.
+    Notes:
+    - Splits input segments into chunks respecting the character limit for
+        translation.
+    - Translates the chunks using Google Translate.
+    - If chunked translation fails, switches to iterative translation using
+        `translate_iterative()`.
+    Example:
+    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
+    translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
+    """
+    segments_copy = copy.deepcopy(segments)
+    if (
+        not source
+    ):
+        logger.debug("No source language")
+        source = "auto"
+    # Get text
+    text_lines = []
+    for line in range(len(segments_copy)):
+        text = segments_copy[line]["text"].strip()
+        text_lines.append(text)
+    # chunk limit
+    text_merge = []
+    actual_chunk = ""
+    global_text_list = []
+    actual_text_list = []
+    for one_line in text_lines:
+        one_line = " " if not one_line else one_line
+        if (len(actual_chunk) + len(one_line)) <= chunk_size:
+            if actual_chunk:
+                actual_chunk += " ||||| "
+            actual_chunk += one_line
+            actual_text_list.append(one_line)
+        else:
+            text_merge.append(actual_chunk)
+            actual_chunk = one_line
+            global_text_list.append(actual_text_list)
+            actual_text_list = [one_line]
+    if actual_chunk:
+        text_merge.append(actual_chunk)
+        global_text_list.append(actual_text_list)
+    # translate chunks
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    translator = GoogleTranslator(source=source, target=target)
+    split_list = []
+    try:
+        for text, text_iterable in zip(text_merge, global_text_list):
+            translated_line = translator.translate(text.strip())
+            split_text = translated_line.split("|||||")
+            if len(split_text) == len(text_iterable):
+                progress_bar.update(len(split_text))
+            else:
+                logger.debug(
+                    "Chunk fixing iteratively. Len chunk: "
+                    f"{len(split_text)}, expected: {len(text_iterable)}"
+                )
+                split_text = []
+                for txt_iter in text_iterable:
+                    translated_txt = translator.translate(txt_iter.strip())
+                    split_text.append(translated_txt)
+                    progress_bar.update(1)
+            split_list.append(split_text)
+        progress_bar.close()
+    except Exception as error:
+        progress_bar.close()
+        logger.error(str(error))
+        logger.warning(
+            "The translation in chunks failed, switching to iterative."
+            " Related: too many request"
+        )  # use proxy or less chunk size
+        return translate_iterative(segments, target, source)
+    # un chunk
+    translated_lines = list(chain.from_iterable(split_list))
+    return verify_translate(
+        segments, segments_copy, translated_lines, target, source
+    )
+def call_gpt_translate(
+    client,
+    model,
+    system_prompt,
+    user_prompt,
+    original_text=None,
+    batch_lines=None,
+):
+    """
+    Send one chat completion request in JSON mode and extract the
+    translation from the JSON answer.
+    Parameters:
+    - client: Object exposing `chat.completions.create` (the callers in this
+        file pass an `openai.OpenAI` instance).
+    - model (str): Model name sent with the request.
+    - system_prompt (str): Content of the system message.
+    - user_prompt (str): Content of the user message.
+    - original_text (dict, optional): The untranslated batch, read as
+        `original_text["conversation"]`; only used when `batch_lines` is set.
+    - batch_lines (int, optional): Expected number of lines; a truthy value
+        selects batch mode.
+    Returns:
+    - list: In batch mode, single-key dictionaries `{speaker_code: text}`.
+    - str: Otherwise, the translated text.
+    Raises:
+    - ValueError: In single-text mode when no string can be extracted.
+    - Exception: The JSON parsing error when the answer is not JSON and
+        contains no `{...}` fragment.
+    """
+    # https://platform.openai.com/docs/guides/text-generation/json-mode
+    response = client.chat.completions.create(
+        model=model,
+        response_format={"type": "json_object"},
+        messages=[
+          {"role": "system", "content": system_prompt},
+          {"role": "user", "content": user_prompt}
+        ]
+    )
+    result = response.choices[0].message.content
+    logger.debug(f"Result: {str(result)}")
+    try:
+        translation = json.loads(result)
+    except Exception as error:
+        # Fallback: parse the first "{...}" fragment found in the answer.
+        # NOTE(review): the pattern is non-greedy and does not span
+        # newlines, so nested or multi-line JSON is not recovered.
+        match_result = re.search(r'\{.*?\}', result)
+        if match_result:
+            logger.error(str(error))
+            json_str = match_result.group(0)
+            translation = json.loads(json_str)
+        else:
+            raise error
+    # Get valid data
+    if batch_lines:
+        # Pick a candidate among the top-level values: skip one whose first
+        # text equals the first original text (an untranslated echo) and stop
+        # at the first one with the expected length. If none matches, the
+        # last value inspected is used.
+        # NOTE(review): `conversation` is unbound if `translation` is empty.
+        for conversation in translation.values():
+            if isinstance(conversation, dict):
+                conversation = list(conversation.values())[0]
+            if (
+                list(
+                    original_text["conversation"][0].values()
+                )[0].strip() ==
+                list(conversation[0].values())[0].strip()
+            ):
+                continue
+            if len(conversation) == batch_lines:
+                break
+        # Flatten so every returned item holds exactly one speaker code
+        fix_conversation_length = []
+        for line in conversation:
+            for speaker_code, text_tr in line.items():
+                fix_conversation_length.append({speaker_code: text_tr})
+        logger.debug(f"Data batch: {str(fix_conversation_length)}")
+        logger.debug(
+            f"Lines Received: {len(fix_conversation_length)},"
+            f" expected: {batch_lines}"
+        )
+        return fix_conversation_length
+    else:
+        # Unwrap dict -> first value, list -> first item, set -> first item
+        if isinstance(translation, dict):
+            translation = list(translation.values())[0]
+        if isinstance(translation, list):
+            translation = translation[0]
+        if isinstance(translation, set):
+            translation = list(translation)[0]
+        if not isinstance(translation, str):
+            raise ValueError(f"No valid response received: {str(translation)}")
+        return translation
+def gpt_sequential(segments, model, target, source=None):
+    from openai import OpenAI
+    translated_segments = copy.deepcopy(segments)
+    client = OpenAI()
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
+    lang_sc = ""
+    if source:
+        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
+    fixed_target = fix_code_language(target)
+    fixed_source = fix_code_language(source) if source else "auto"
+    system_prompt = "Machine translation designed to output the translated_text JSON."
+    for i, line in enumerate(translated_segments):
+        text = line["text"].strip()
+        start = line["start"]
+        user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"
+        time.sleep(0.5)
+        try:
+            translated_text = call_gpt_translate(
+                client,
+                model,
+                system_prompt,
+                user_prompt,
+            )
+        except Exception as error:
+            logger.error(
+                f"{str(error)} >> The text of segment {start} "
+                "is being corrected with Google Translate"
+            )
+            translator = GoogleTranslator(
+                source=fixed_source, target=fixed_target
+            )
+            translated_text = translator.translate(text.strip())
+        translated_segments[i]["text"] = translated_text.strip()
+        progress_bar.update(1)
+    progress_bar.close()
+    return translated_segments
+def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
+    from openai import OpenAI
+    import tiktoken
+    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
+    progress_bar = tqdm(total=len(segments), desc="Translating")
+    segments_copy = copy.deepcopy(segments)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    client = OpenAI()
+    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
+    lang_sc = ""
+    if source:
+        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
+    fixed_target = fix_code_language(target)
+    fixed_source = fix_code_language(source) if source else "auto"
+    name_speaker = "ABCDEFGHIJKL"
+    translated_lines = []
+    text_data_dict = []
+    num_tokens = 0
+    count_sk = {char: 0 for char in "ABCDEFGHIJKL"}
+    for i, line in enumerate(segments_copy):
+        text = line["text"]
+        speaker = line["speaker"]
+        last_start = line["start"]
+        # text_data_dict.append({str(int(speaker[-1])+1): text})
+        index_sk = int(speaker[-2:])
+        character_sk = name_speaker[index_sk]
+        count_sk[character_sk] += 1
+        code_sk = character_sk+str(count_sk[character_sk])
+        text_data_dict.append({code_sk: text})
+        num_tokens += len(encoding.encode(text)) + 7
+        if num_tokens >= token_batch_limit or i == len(segments_copy)-1:
+            try:
+                batch_lines = len(text_data_dict)
+                batch_conversation = {"conversation": copy.deepcopy(text_data_dict)}
+                # Reset vars
+                num_tokens = 0
+                text_data_dict = []
+                count_sk = {char: 0 for char in "ABCDEFGHIJKL"}
+                # Process translation
+                # https://arxiv.org/pdf/2309.03409.pdf
+                system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
+                user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
+                logger.debug(f"Prompt: {str(user_prompt)}")
+                conversation = call_gpt_translate(
+                    client,
+                    model,
+                    system_prompt,
+                    user_prompt,
+                    original_text=batch_conversation,
+                    batch_lines=batch_lines,
+                )
+                if len(conversation) < batch_lines:
+                    raise ValueError(
+                        "Incomplete result received. Batch lines: "
+                        f"{len(conversation)}, expected: {batch_lines}"
+                    )
+                for i, translated_text in enumerate(conversation):
+                    if i+1 > batch_lines:
+                        break
+                    translated_lines.append(list(translated_text.values())[0])
+                progress_bar.update(batch_lines)
+            except Exception as error:
+                logger.error(str(error))
+                first_start = segments_copy[max(0, i-(batch_lines-1))]["start"]
+                logger.warning(
+                    f"The batch from {first_start} to {last_start} "
+                    "failed, is being corrected with Google Translate"
+                )
+                translator = GoogleTranslator(
+                    source=fixed_source,
+                    target=fixed_target
+                )
+                for txt_source in batch_conversation["conversation"]:
+                    translated_txt = translator.translate(
+                        list(txt_source.values())[0].strip()
+                    )
+                    translated_lines.append(translated_txt.strip())
+                    progress_bar.update(1)
+    progress_bar.close()
+    return verify_translate(
+        segments, segments_copy, translated_lines, fixed_target, fixed_source
+    )
+def translate_text(
+    segments,
+    target,
+    translation_process="google_translator_batch",
+    chunk_size=4500,
+    source=None,
+    token_batch_limit=1000,
+):
+    """Translates text segments using a specified process."""
+    match translation_process:
+        case "google_translator_batch":
+            return translate_batch(
+                segments,
+                fix_code_language(target),
+                chunk_size,
+                fix_code_language(source)
+            )
+        case "google_translator":
+            return translate_iterative(
+                segments,
+                fix_code_language(target),
+                fix_code_language(source)
+            )
+        case model if model in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]:
+            return gpt_sequential(segments, model, target, source)
+        case model if model in ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch",]:
+            return gpt_batch(
+                segments,
+                translation_process.replace("_batch", ""),
+                target,
+                token_batch_limit,
+                source
+            )
+        case "disable_translation":
+            return segments
+        case _:
+            raise ValueError("No valid translation process")

quantum_dubbing/utils.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import os, zipfile, rarfile, shutil, subprocess, shlex, sys # noqa
+from .logging_setup import logger
+from urllib.parse import urlparse
+from IPython.utils import capture
+import re
+# Lower-case file suffixes (leading dot included) used by the is_*_file
+# helpers in this module to classify paths.
+VIDEO_EXTENSIONS = [
+    ".mp4",
+    ".avi",
+    ".mov",
+    ".mkv",
+    ".wmv",
+    ".flv",
+    ".webm",
+    ".m4v",
+    ".mpeg",
+    ".mpg",
+    ".3gp"
+]
+# Suffixes treated as audio files.
+AUDIO_EXTENSIONS = [
+    ".mp3",
+    ".wav",
+    ".aiff",
+    ".aif",
+    ".flac",
+    ".aac",
+    ".ogg",
+    ".wma",
+    ".m4a",
+    ".alac",
+    ".pcm",
+    ".opus",
+    ".ape",
+    ".amr",
+    ".ac3",
+    ".vox",
+    ".caf"
+]
+# Suffixes treated as subtitle files.
+SUBTITLE_EXTENSIONS = [
+    ".srt",
+    ".vtt",
+    ".ass"
+]
+def run_command(command):
+    logger.debug(command)
+    if isinstance(command, str):
+        command = shlex.split(command)
+    sub_params = {
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+        "creationflags": subprocess.CREATE_NO_WINDOW
+        if sys.platform == "win32"
+        else 0,
+    }
+    process_command = subprocess.Popen(command, **sub_params)
+    output, errors = process_command.communicate()
+    if (
+        process_command.returncode != 0
+    ):  # or not os.path.exists(mono_path) or os.path.getsize(mono_path) == 0:
+        logger.error("Error comnand")
+        raise Exception(errors.decode())
+def print_tree_directory(root_dir, indent=""):
+    if not os.path.exists(root_dir):
+        logger.error(f"{indent} Invalid directory or file: {root_dir}")
+        return
+    items = os.listdir(root_dir)
+    for index, item in enumerate(sorted(items)):
+        item_path = os.path.join(root_dir, item)
+        is_last_item = index == len(items) - 1
+        if os.path.isfile(item_path) and item_path.endswith(".zip"):
+            with zipfile.ZipFile(item_path, "r") as zip_file:
+                print(
+                    f"{indent}{'└──' if is_last_item else '├──'} {item} (zip file)"
+                )
+                zip_contents = zip_file.namelist()
+                for zip_item in sorted(zip_contents):
+                    print(
+                        f"{indent}{'    ' if is_last_item else '│   '}{zip_item}"
+                    )
+        else:
+            print(f"{indent}{'└──' if is_last_item else '├──'} {item}")
+            if os.path.isdir(item_path):
+                new_indent = indent + ("    " if is_last_item else "│   ")
+                print_tree_directory(item_path, new_indent)
+def upload_model_list():
+    weight_root = "weights"
+    models = []
+    for name in os.listdir(weight_root):
+        if name.endswith(".pth"):
+            models.append("weights/" + name)
+    if models:
+        logger.debug(models)
+    index_root = "logs"
+    index_paths = [None]
+    for name in os.listdir(index_root):
+        if name.endswith(".index"):
+            index_paths.append("logs/" + name)
+    if index_paths:
+        logger.debug(index_paths)
+    return models, index_paths
+def manual_download(url, dst):
+    if "drive.google" in url:
+        logger.info("Drive url")
+        if "folders" in url:
+            logger.info("folder")
+            os.system(f'gdown --folder "{url}" -O {dst} --fuzzy -c')
+        else:
+            logger.info("single")
+            os.system(f'gdown "{url}" -O {dst} --fuzzy -c')
+    elif "huggingface" in url:
+        logger.info("HuggingFace url")
+        if "/blob/" in url or "/resolve/" in url:
+            if "/blob/" in url:
+                url = url.replace("/blob/", "/resolve/")
+            download_manager(url=url, path=dst, overwrite=True, progress=True)
+        else:
+            os.system(f"git clone {url} {dst+'repo/'}")
+    elif "http" in url:
+        logger.info("URL")
+        download_manager(url=url, path=dst, overwrite=True, progress=True)
+    elif os.path.exists(url):
+        logger.info("Path")
+        copy_files(url, dst)
+    else:
+        logger.error(f"No valid URL: {url}")
+def download_list(text_downloads):
+    if os.environ.get("ZERO_GPU") == "TRUE":
+        raise RuntimeError("This option is disabled in this demo.")
+    try:
+        urls = [elem.strip() for elem in text_downloads.split(",")]
+    except Exception as error:
+        raise ValueError(f"No valid URL. {str(error)}")
+    create_directories(["downloads", "logs", "weights"])
+    path_download = "downloads/"
+    for url in urls:
+        manual_download(url, path_download)
+    # Tree
+    print("####################################")
+    print_tree_directory("downloads", indent="")
+    print("####################################")
+    # Place files
+    select_zip_and_rar_files("downloads/")
+    models, _ = upload_model_list()
+    # hf space models files delete
+    remove_directory_contents("downloads/repo")
+    return f"Downloaded = {models}"
+def select_zip_and_rar_files(directory_path="downloads/"):
+    # filter
+    zip_files = []
+    rar_files = []
+    for file_name in os.listdir(directory_path):
+        if file_name.endswith(".zip"):
+            zip_files.append(file_name)
+        elif file_name.endswith(".rar"):
+            rar_files.append(file_name)
+    # extract
+    for file_name in zip_files:
+        file_path = os.path.join(directory_path, file_name)
+        with zipfile.ZipFile(file_path, "r") as zip_ref:
+            zip_ref.extractall(directory_path)
+    for file_name in rar_files:
+        file_path = os.path.join(directory_path, file_name)
+        with rarfile.RarFile(file_path, "r") as rar_ref:
+            rar_ref.extractall(directory_path)
+    # set in path
+    def move_files_with_extension(src_dir, extension, destination_dir):
+        for root, _, files in os.walk(src_dir):
+            for file_name in files:
+                if file_name.endswith(extension):
+                    source_file = os.path.join(root, file_name)
+                    destination = os.path.join(destination_dir, file_name)
+                    shutil.move(source_file, destination)
+    move_files_with_extension(directory_path, ".index", "logs/")
+    move_files_with_extension(directory_path, ".pth", "weights/")
+    return "Download complete"
+def is_file_with_extensions(string_path, extensions):
+    return any(string_path.lower().endswith(ext) for ext in extensions)
+def is_video_file(string_path):
+    return is_file_with_extensions(string_path, VIDEO_EXTENSIONS)
+def is_audio_file(string_path):
+    return is_file_with_extensions(string_path, AUDIO_EXTENSIONS)
+def is_subtitle_file(string_path):
+    return is_file_with_extensions(string_path, SUBTITLE_EXTENSIONS)
+def get_directory_files(directory):
+    """List the media files found directly inside ``directory``.
+    Only regular files are considered and sub-directories are not
+    descended into. Each file goes into the first category it matches,
+    tested in the order audio, video, subtitle; other files are ignored.
+    Returns:
+    tuple: (audio_files, video_files, sub_files), three lists of paths
+    built by joining ``directory`` with each entry name.
+    """
+    audio_files, video_files, sub_files = [], [], []
+    # (predicate, bucket) pairs; order matters because the first match wins.
+    classifiers = (
+        (is_audio_file, audio_files),
+        (is_video_file, video_files),
+        (is_subtitle_file, sub_files),
+    )
+    for entry in os.listdir(directory):
+        entry_path = os.path.join(directory, entry)
+        if not os.path.isfile(entry_path):
+            continue
+        for matches, bucket in classifiers:
+            if matches(entry_path):
+                bucket.append(entry_path)
+                break
+    found = audio_files + video_files + sub_files
+    logger.info(f"Files in path ({directory}): {str(found)}")
+    return audio_files, video_files, sub_files
+def get_valid_files(paths):
+    """Expand ``paths`` into a flat list of file paths.
+    A directory is replaced by the audio, video and subtitle files that
+    ``get_directory_files`` reports for it, in that order. Any other
+    entry is kept unchanged, without checking that it exists.
+    """
+    collected = []
+    for candidate in paths:
+        if not os.path.isdir(candidate):
+            collected.append(candidate)
+            continue
+        audio_group, video_group, sub_group = get_directory_files(candidate)
+        for group in (audio_group, video_group, sub_group):
+            collected.extend(group)
+    return collected
+def extract_video_links(link):
+    """Resolve ``link`` into the list of video URLs it refers to.
+    yt-dlp extracts the link without downloading while its console output
+    is captured; the URLs are read from the
+    "[youtube] Extracting URL: ..." lines of the captured stdout.
+    Parameters:
+    link (str): URL to inspect (playlists are allowed: "noplaylist" is
+    False).
+    Returns:
+    list: The URLs found. ``[link]`` is returned when extraction raises
+    or when no URL line is found, so the input link is never dropped.
+    """
+    # "quiet" has to stay False: the URLs are parsed from the printed output.
+    params_dlp = {"quiet": False, "no_warnings": True, "noplaylist": False}
+    try:
+        from yt_dlp import YoutubeDL
+        with capture.capture_output() as cap:
+            with YoutubeDL(params_dlp) as ydl:
+                # The return value is not needed, only the printed lines.
+                ydl.extract_info(link, download=False, process=True)
+        urls = re.findall(r'\[youtube\] Extracting URL: (.*?)\n', cap.stdout)
+        logger.info(f"List of videos in ({link}): {str(urls)}")
+    except Exception as error:
+        # Best effort: any failure falls back to the link as given.
+        logger.error(f"{link} >> {str(error)}")
+        urls = [link]
+    if not urls:
+        # Nothing matched the expected output format; returning an empty
+        # list would make the caller silently discard the link.
+        logger.warning(f"No video URLs found in ({link}), using it as is")
+        urls = [link]
+    return urls
+def get_link_list(urls):
+    """Flatten ``urls`` into a list of individual links.
+    An entry that contains "youtube.com" but not "/watch?v=" is expanded
+    with ``extract_video_links``; every other entry is kept as given.
+    """
+    valid_links = []
+    for url_video in urls:
+        is_direct = (
+            "youtube.com" not in url_video or "/watch?v=" in url_video
+        )
+        if is_direct:
+            valid_links.append(url_video)
+            continue
+        valid_links.extend(extract_video_links(url_video))
+    return valid_links
+# =====================================
+# Download Manager
+# =====================================
+def load_file_from_url(
+    url: str,
+    model_dir: str,
+    file_name: str | None = None,
+    overwrite: bool = False,
+    progress: bool = True,
+) -> str:
+    """Fetch `url` into `model_dir`, reusing an existing copy when possible.
+    `model_dir` is created if needed. Without `file_name`, the base name
+    of the URL path is used. An existing file is deleted first when
+    `overwrite` is set or when it is empty (0 bytes).
+    Returns the absolute path to the local file.
+    """
+    os.makedirs(model_dir, exist_ok=True)
+    target_name = file_name or os.path.basename(urlparse(url).path)
+    cached_file = os.path.abspath(os.path.join(model_dir, target_name))
+    # Drop a copy that must not be reused (forced refresh or empty file).
+    if os.path.exists(cached_file) and (
+        overwrite or os.path.getsize(cached_file) == 0
+    ):
+        remove_files(cached_file)
+    if os.path.exists(cached_file):
+        logger.debug(cached_file)
+        return cached_file
+    logger.info(f'Downloading: "{url}" to {cached_file}\n')
+    # torch is imported only when a download is actually needed.
+    from torch.hub import download_url_to_file
+    download_url_to_file(url, cached_file, progress=progress)
+    return cached_file
+def friendly_name(file: str):
+    """Split the base name of a path or URL into ``(name, extension)``.
+    For input starting with "http" only the URL path component is used,
+    so query string and fragment are dropped before the base name is taken.
+    """
+    location = urlparse(file).path if file.startswith("http") else file
+    return os.path.splitext(os.path.basename(location))
+def download_manager(
+    url: str,
+    path: str,
+    extension: str = "",
+    overwrite: bool = False,
+    progress: bool = True,
+):
+    """Download ``url`` into the directory ``path`` via load_file_from_url.
+    The local file name comes from ``friendly_name(url)``; a non-empty
+    ``extension`` replaces the extension taken from the URL.
+    Returns the value of ``load_file_from_url`` for input starting with
+    "http"; any other input is not downloaded and ``path`` is returned.
+    """
+    url = url.strip()
+    if not url.startswith("http"):
+        # NOTE(review): this hands back ``path`` rather than ``url`` for
+        # non-URL input; confirm that callers rely on this.
+        return path
+    stem, url_ext = friendly_name(url)
+    suffix = f".{extension}" if extension else url_ext
+    return load_file_from_url(
+        url=url,
+        model_dir=path,
+        file_name=stem + suffix,
+        overwrite=overwrite,
+        progress=progress,
+    )
+# =====================================
+# File management
+# =====================================
+# only remove files
+def remove_files(file_list):
+    """Delete the given path, or every path in the given list.
+    Paths that do not exist are skipped; existing ones are passed to
+    ``os.remove``.
+    """
+    targets = [file_list] if isinstance(file_list, str) else file_list
+    for target in targets:
+        if not os.path.exists(target):
+            continue
+        os.remove(target)
+def remove_directory_contents(directory_path):
+    """
+    Removes all files, symbolic links and subdirectories within a
+    directory; the directory itself is kept.
+    A failure on one entry is logged and the remaining entries are still
+    processed. An error is logged when the directory does not exist.
+    Parameters:
+    directory_path (str): Path to the directory whose
+    contents need to be removed.
+    """
+    if not os.path.exists(directory_path):
+        logger.error(f"Directory '{directory_path}' does not exist.")
+        return
+    for filename in os.listdir(directory_path):
+        file_path = os.path.join(directory_path, filename)
+        try:
+            # Links are tested first: shutil.rmtree refuses a link to a
+            # directory, and a broken link is neither a file nor a
+            # directory, so both kinds would otherwise be left behind.
+            if os.path.islink(file_path) or os.path.isfile(file_path):
+                os.remove(file_path)
+            elif os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+        except Exception as e:
+            logger.error(f"Failed to delete {file_path}. Reason: {e}")
+    logger.info(f"Content in '{directory_path}' removed.")
+# Create directory if not exists
+def create_directories(directory_path):
+    """Create every given directory that does not exist yet.
+    ``directory_path`` is either one path string or an iterable of
+    paths. Missing parent directories are created as well
+    (``os.makedirs``); each creation is logged at debug level.
+    """
+    targets = (
+        [directory_path] if isinstance(directory_path, str) else directory_path
+    )
+    for target in targets:
+        if os.path.exists(target):
+            continue
+        os.makedirs(target)
+        logger.debug(f"Directory '{target}' created.")
+def move_files(source_dir, destination_dir, extension=""):
+    """
+    Moves the entries of a source directory into a destination directory.
+    The destination is created if missing and each entry is moved with
+    ``os.replace`` under the same name.
+    Parameters:
+    source_dir (str): Path to the source directory.
+    destination_dir (str): Path to the destination directory.
+    extension (str): When non-empty, only entries whose name ends with
+    this text are moved.
+    """
+    create_directories(destination_dir)
+    for entry in os.listdir(source_dir):
+        if not extension or entry.endswith(extension):
+            os.replace(
+                os.path.join(source_dir, entry),
+                os.path.join(destination_dir, entry),
+            )
+def copy_files(source_path, destination_path):
+    """
+    Copies a file or multiple files from a source path to a destination path.
+    Parameters:
+    source_path (str or list): Path or list of paths to the source
+    file(s) or directory. When the first entry is a directory, the
+    regular files directly inside it are copied instead. An empty list
+    copies nothing.
+    destination_path (str): Path to the destination directory,
+    created if it does not exist.
+    """
+    create_directories(destination_path)
+    if isinstance(source_path, str):
+        source_path = [source_path]
+    # The emptiness check keeps an empty list from raising IndexError.
+    if source_path and os.path.isdir(source_path[0]):
+        # Copy all files from the source directory to the destination directory
+        base_path = source_path[0]
+        candidates = [
+            os.path.join(base_path, file_name)
+            for file_name in os.listdir(base_path)
+        ]
+        # shutil.copy2 cannot copy a directory, so nested ones are skipped.
+        source_path = [
+            candidate
+            for candidate in candidates
+            if os.path.isfile(candidate)
+        ]
+    for one_source_path in source_path:
+        if os.path.exists(one_source_path):
+            shutil.copy2(one_source_path, destination_path)
+            logger.debug(
+                f"File '{one_source_path}' copied to '{destination_path}'."
+            )
+        else:
+            logger.error(f"File '{one_source_path}' does not exist.")
+def rename_file(current_name, new_name):
+    """Rename ``current_name`` to ``new_name``.
+    ``new_name`` is joined onto the directory part of ``current_name``.
+    Returns the resulting path, or None (after logging an error) when
+    ``current_name`` does not exist.
+    """
+    if not os.path.exists(current_name):
+        logger.error(f"File '{current_name}' does not exist.")
+        return None
+    renamed_path = os.path.join(os.path.dirname(current_name), new_name)
+    os.rename(current_name, renamed_path)
+    logger.debug(f"File '{current_name}' renamed to '{new_name}'.")
+    return renamed_path