Update app.py
Browse files
app.py
CHANGED
@@ -2,14 +2,15 @@ from KOKORO.models import build_model
|
|
2 |
from KOKORO.utils import tts,tts_file_name,podcast
|
3 |
import sys
|
4 |
sys.path.append('.')
|
|
|
|
|
5 |
import torch
|
6 |
import gc
|
7 |
print("Loading model...")
|
8 |
-
import os
|
9 |
-
os.system("python download_model.py")
|
10 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
11 |
print(f'Using device: {device}')
|
12 |
-
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
|
|
|
13 |
print("Model loaded successfully.")
|
14 |
|
15 |
def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
|
@@ -42,7 +43,8 @@ def update_model(model_name):
|
|
42 |
return f"Model updated to {model_name}"
|
43 |
|
44 |
|
45 |
-
|
|
|
46 |
"""
|
47 |
Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
|
48 |
"""
|
@@ -218,17 +220,311 @@ with gr.Blocks() as demo2:
|
|
218 |
outputs=[audio]
|
219 |
)
|
220 |
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
with gr.Blocks() as demo3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
gr.Markdown(f"# Voice Names \n{display_text}")
|
225 |
|
226 |
-
|
227 |
-
|
228 |
-
@click.
|
229 |
-
@click.option("--
|
230 |
-
|
231 |
-
|
|
|
232 |
|
233 |
demo.queue().launch(debug=debug, share=share)
|
234 |
#Run on local network
|
@@ -261,4 +557,4 @@ if __name__ == "__main__":
|
|
261 |
|
262 |
# save_at=f"./temp_audio/{os.path.basename(result)}"
|
263 |
# shutil.move(result, save_at)
|
264 |
-
# print(f"Saved at {save_at}")
|
|
|
2 |
from KOKORO.utils import tts,tts_file_name,podcast
|
3 |
import sys
|
4 |
sys.path.append('.')
|
5 |
+
import os
# Run the download script through the shell before anything tries to load the
# checkpoint. The exit status returned by os.system is discarded here, so a
# failed download is not detected at this point.
os.system("python download_model.py")
import torch
import gc
print("Loading model...")
# Prefer the GPU when torch reports CUDA as available, otherwise use the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
# Previous checkpoint, kept for reference:
# MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
# NOTE(review): judging by the path this is a half-precision (fp16) checkpoint - confirm.
MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
print("Model loaded successfully.")
|
15 |
|
16 |
def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
|
|
|
43 |
return f"Model updated to {model_name}"
|
44 |
|
45 |
|
46 |
+
|
47 |
+
def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", speed=1.0, trim=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20):
|
48 |
"""
|
49 |
Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
|
50 |
"""
|
|
|
220 |
outputs=[audio]
|
221 |
)
|
222 |
|
223 |
+
|
224 |
+
|
225 |
+
|
226 |
+
import shutil
|
227 |
+
import os
|
228 |
+
|
229 |
+
# Ensure the output directory exists
|
230 |
+
output_dir = "./temp_audio"
|
231 |
+
os.makedirs(output_dir, exist_ok=True)
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
|
237 |
+
|
238 |
+
|
239 |
+
|
240 |
+
|
241 |
+
#@title Generate Audio File From Subtitle
|
242 |
+
# from tqdm.notebook import tqdm
|
243 |
+
from tqdm import tqdm
|
244 |
+
import subprocess
|
245 |
+
import json
|
246 |
+
import pysrt
|
247 |
+
import os
|
248 |
+
from pydub import AudioSegment
|
249 |
+
import shutil
|
250 |
+
import uuid
|
251 |
+
import re
|
252 |
+
import time
|
253 |
+
|
254 |
+
# os.chdir(install_path)
|
255 |
+
|
256 |
+
def your_tts(text,audio_path,actual_duration,speed=1.0):
    """Synthesize ``text`` and copy the generated audio file to ``audio_path``.

    The text is rendered once at ``speed``. If that rendering is longer than
    ``actual_duration`` it is rendered a second time at a proportionally
    higher speed so that it fits the slot better.

    Args:
        text: Text to synthesize.
        audio_path: Destination the generated audio file is copied to.
        actual_duration: Target length, in the unit returned by ``len()`` of a
            pydub ``AudioSegment`` (milliseconds). Zero or a negative value
            means there is no usable target and disables the second pass.
        speed: Speed passed to ``text_to_speech`` for the first pass.
    """
    model_name="kokoro-v0_19.pth"
    # srt_voice_name is a module-level setting that srt_process updates.
    tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed)
    print(tts_path)
    # Without a positive target there is nothing to fit to; this also avoids a
    # division by zero for subtitle entries whose start and end coincide.
    if actual_duration > 0:
        tts_duration = len(AudioSegment.from_file(tts_path))
        if tts_duration > actual_duration:
            # The first pass already ran at `speed`, so scale relative to it
            # (assumes output length shrinks proportionally with speed).
            speedup_factor = speed * tts_duration / actual_duration
            tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor)
            print(tts_path)
    shutil.copy(tts_path,audio_path)
|
268 |
+
|
269 |
+
|
270 |
+
|
271 |
+
base_path="."
|
272 |
+
import datetime
|
273 |
+
def get_current_time():
    """Return the current local time as a string shaped like ``HH_MM_AM/PM``."""
    now = datetime.datetime.now()
    # 12-hour clock, zero-padded, with underscores so the result is file-name safe.
    return now.strftime("%I_%M_%p")
|
276 |
+
|
277 |
+
def get_subtitle_Dub_path(srt_file_path,Language="en"):
    """Build a unique output path for the dubbed audio of a subtitle file.

    The file is placed in ``{base_path}/TTS_DUB`` (created if missing) and is
    named after the subtitle file, the language, the current time and a short
    random suffix.

    Args:
        srt_file_path: Path of the subtitle file; only its base name without
            extension is used.
        Language: Language tag embedded in the file name.

    Returns:
        The path of the ``.wav`` file to write. The file itself is not created.
    """
    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
    dub_folder = f"{base_path}/TTS_DUB"
    # makedirs(exist_ok=True) is safe when two requests arrive at once and
    # also creates base_path itself if it does not exist yet.
    os.makedirs(dub_folder, exist_ok=True)
    random_string = str(uuid.uuid4())[:6]
    new_path=f"{dub_folder}/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
    return new_path
|
284 |
+
|
285 |
+
|
286 |
+
|
287 |
+
|
288 |
+
|
289 |
+
|
290 |
+
|
291 |
+
|
292 |
+
def clean_srt(input_path):
    """Write a cleaned copy of an SRT file and return the copy's path.

    Every subtitle is rewritten as exactly four lines (index, timing, one text
    line, blank line). Square brackets and the music-note symbol are removed
    from the text, and multi-line text is folded into a single line.

    Args:
        input_path: Path of the subtitle file to clean.

    Returns:
        Path of the cleaned file inside ``{base_path}/save_srt``.
    """
    file_name = os.path.basename(input_path)
    output_folder = f"{base_path}/save_srt"
    os.makedirs(output_folder, exist_ok=True)
    output_path = f"{output_folder}/{file_name}"

    def clean_srt_line(text):
        # Turn line breaks into spaces first; deleting them outright would
        # glue the last word of one line to the first word of the next.
        text = text.replace("\n", " ")
        for unwanted in ("[", "]", "♫"):
            text = text.replace(unwanted, "")
        # Collapse the whitespace left behind by the removals.
        return " ".join(text.split())

    # Load the subtitle file
    subs = pysrt.open(input_path)

    # Write each subtitle back out in the normalized four-line layout.
    # The with-statement closes the file, so no explicit close() is needed.
    with open(output_path, "w", encoding='utf-8') as file:
        for sub in subs:
            file.write(f"{sub.index}\n")
            file.write(f"{sub.start} --> {sub.end}\n")
            file.write(f"{clean_srt_line(sub.text)}\n")
            file.write("\n")
    return output_path
|
318 |
+
# Example usage
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
|
324 |
+
|
325 |
+
class SRTDubbing:
    """Build one dubbed audio track from an SRT subtitle file.

    Each subtitle entry is synthesized, fitted to its time slot (sped up with
    ffmpeg or padded with silence) and joined, together with the silent gaps
    between entries, into a single WAV file.
    """

    def __init__(self):
        pass

    @staticmethod
    def text_to_speech_srt(text, audio_path, language, actual_duration):
        """Synthesize ``text`` into ``audio_path``, fitted to ``actual_duration``.

        Args:
            text: Subtitle text to speak.
            audio_path: Where the fitted WAV file is written.
            language: Not used by this method; kept so callers need not change.
            actual_duration: Slot length in milliseconds. Zero or a negative
                value means there is no usable slot, so the audio is kept
                exactly as generated.

        Raises:
            subprocess.CalledProcessError: If ffmpeg exits with an error while
                speeding the audio up.
        """
        tts_filename = "./cache/temp.wav"
        speedup_filename = "./cache/speedup_temp.wav"
        # Both scratch files live in ./cache; make sure it exists before
        # anything is written there.
        os.makedirs(os.path.dirname(tts_filename), exist_ok=True)
        your_tts(text,tts_filename,actual_duration,speed=1.0)
        # Check the duration of the generated TTS audio
        tts_audio = AudioSegment.from_file(tts_filename)
        tts_duration = len(tts_audio)

        if actual_duration <= 0:
            # No usable slot (zero-length or inverted timing): keep the audio
            # untouched instead of computing a meaningless tempo factor.
            shutil.move(tts_filename, audio_path)
            return

        if tts_duration > actual_duration:
            # Too long for the slot: speed it up by the overshoot ratio.
            speedup_factor = tts_duration / actual_duration
            subprocess.run([
                "ffmpeg",
                "-y",  # overwrite the scratch file left by a previous entry
                "-i", tts_filename,
                "-filter:a", f"atempo={speedup_factor}",
                speedup_filename,
            ], check=True)
            shutil.move(speedup_filename, audio_path)
        elif tts_duration < actual_duration:
            # Too short for the slot: pad the end with silence.
            silence_gap = actual_duration - tts_duration
            silence = AudioSegment.silent(duration=int(silence_gap))
            new_audio = tts_audio + silence
            new_audio.export(audio_path, format="wav")
        else:
            # Exact fit: use the generated audio as is.
            shutil.move(tts_filename, audio_path)

    @staticmethod
    def make_silence(pause_time, pause_save_path):
        """Write ``pause_time`` milliseconds of silence to ``pause_save_path``.

        Negative values (overlapping subtitles) are treated as zero.

        Returns:
            ``pause_save_path``.
        """
        silence = AudioSegment.silent(duration=max(0, pause_time))
        silence.export(pause_save_path, format="wav")
        return pause_save_path

    @staticmethod
    def create_folder_for_srt(srt_file_path):
        """Create and return a fresh working folder for one subtitle file.

        The folder is ``{base_path}/dummy/<srt name>_<4 random characters>``.
        """
        srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
        random_uuid = str(uuid.uuid4())[:4]
        dummy_folder_path = f"{base_path}/dummy"
        folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
        # makedirs also creates the parent "dummy" folder when it is missing.
        os.makedirs(folder_path, exist_ok=True)
        return folder_path

    @staticmethod
    def concatenate_audio_files(audio_paths, output_path):
        """Join the audio files in ``audio_paths`` in order into one WAV file."""
        concatenated_audio = AudioSegment.silent(duration=0)
        for audio_path in audio_paths:
            concatenated_audio += AudioSegment.from_file(audio_path)
        concatenated_audio.export(output_path, format="wav")

    def srt_to_dub(self, srt_file_path,dub_save_path,language='en'):
        """Generate the dubbed track for ``srt_file_path`` at ``dub_save_path``.

        For every subtitle entry a silence file (the gap before the entry) and
        a speech file are written to a working folder; all of them are then
        concatenated in order.
        """
        result = self.read_srt_file(srt_file_path)
        new_folder_path = self.create_folder_for_srt(srt_file_path)
        join_path = []
        for entry in tqdm(result):
            actual_duration = entry['end_time'] - entry['start_time']
            silence_path = f"{new_folder_path}/{entry['previous_pause']}"
            self.make_silence(entry['pause_time'], silence_path)
            join_path.append(silence_path)
            tts_path = f"{new_folder_path}/{entry['audio_name']}"
            self.text_to_speech_srt(entry['text'], tts_path, language, actual_duration)
            join_path.append(tts_path)
        self.concatenate_audio_files(join_path, dub_save_path)

    @staticmethod
    def convert_to_millisecond(time_str):
        """Convert an SRT timestamp ``HH:MM:SS,mmm`` to milliseconds.

        Returns:
            The total number of milliseconds as an int, or ``None`` when
            ``time_str`` is not a string.
        """
        if not isinstance(time_str, str):
            return None
        hours, minutes, second_millisecond = time_str.split(':')
        seconds, milliseconds = second_millisecond.split(",")
        return (
            int(hours) * 3600000 +
            int(minutes) * 60000 +
            int(seconds) * 1000 +
            int(milliseconds)
        )

    @staticmethod
    def read_srt_file(file_path):
        """Parse an SRT file into a list of entry dictionaries.

        Entries are located by their timing line rather than by a fixed
        four-line stride, so trailing blank lines, missing index lines and
        multi-line text are all accepted. Multi-line text is joined with
        single spaces.

        Each entry has the keys ``entry_number``, ``start_time``, ``end_time``
        (milliseconds), ``text``, ``pause_time`` (gap since the previous entry
        ended, or since 0 for the first entry), ``audio_name`` and
        ``previous_pause`` (file names for the speech and the preceding
        silence).

        The parsed entries are also dumped to ``entries.json`` in the current
        working directory.

        Returns:
            The list of entries, in file order.
        """
        timing_pattern = re.compile(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)')
        audio_name_template = "{}.wav"
        previous_pause_template = "{}_before_pause.wav"

        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.read().splitlines()

        entries = []
        previous_end_time = 0
        position = 0
        line_count = len(lines)
        while position < line_count:
            match = timing_pattern.search(lines[position])
            position += 1
            if match is None:
                # Index lines, blank lines and anything else that is not a
                # timing line carry no information needed here.
                continue
            start_time = SRTDubbing.convert_to_millisecond(match.group(1))
            end_time = SRTDubbing.convert_to_millisecond(match.group(2))

            # The text runs until the blank line that ends the entry.
            text_lines = []
            while position < line_count and lines[position].strip():
                text_lines.append(lines[position].strip())
                position += 1

            entry_number = len(entries) + 1
            entries.append({
                'entry_number': entry_number,
                'start_time': start_time,
                'end_time': end_time,
                'text': " ".join(text_lines),
                'pause_time': start_time - previous_end_time,
                'audio_name': audio_name_template.format(entry_number),
                'previous_pause': previous_pause_template.format(entry_number),
            })
            previous_end_time = end_time

        with open("entries.json", "w") as file:
            json.dump(entries, file, indent=4)
        return entries
|
458 |
+
srt_voice_name="am_adam"
|
459 |
+
def srt_process(srt_file_path,voice_name,dest_language="en"):
    """Dub a subtitle file with the given voice and return the audio path.

    Args:
        srt_file_path: Path of the uploaded ``.srt`` file, or an object that
            exposes the path as its ``name`` attribute.
        voice_name: Voice to use for every subtitle entry.
        dest_language: Language tag used in the output file name.

    Returns:
        Path of the generated dubbed WAV file.

    Raises:
        ValueError: If no subtitle file was provided.
    """
    global srt_voice_name
    if srt_file_path is None:
        raise ValueError("No subtitle file provided. Please upload an .srt file.")
    # File inputs may arrive as a wrapper object rather than a plain path
    # (presumably depending on the Gradio version - verify); plain strings
    # have no `name` attribute and pass through unchanged.
    srt_file_path = getattr(srt_file_path, "name", srt_file_path)
    # NOTE(review): the voice is handed to your_tts through this module-level
    # variable, so overlapping requests would share it - confirm that requests
    # are processed one at a time.
    srt_voice_name=voice_name
    srt_dubbing = SRTDubbing()
    dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
    srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
    return dub_save_path
|
466 |
+
|
467 |
+
#
|
468 |
+
# srt_file_path="./long.srt"
|
469 |
+
# dub_audio_path=srt_process(srt_file_path)
|
470 |
+
# print(f"Audio file saved at: {dub_audio_path}")
|
471 |
+
|
472 |
+
|
473 |
|
474 |
# "SRT Dubbing" tab: upload a subtitle file, pick a voice, get one dubbed audio track.
with gr.Blocks() as demo3:

    # NOTE(review): the last link below has empty link text, so it may render
    # as nothing - confirm whether a badge image was intended.
    gr.Markdown(
        """
# Generate Audio File From Subtitle [Single Speaker Only]

To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

[](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
"""
    )
    with gr.Row():
        # Left column: inputs (subtitle file, voice) and the trigger button.
        with gr.Column():
            srt_file = gr.File(label='Upload .srt Subtitle File Only')
            with gr.Row():
                voice = gr.Dropdown(
                    voice_list,
                    value='af',
                    allow_custom_value=False,
                    label='Voice',
                )
            with gr.Row():
                generate_btn_ = gr.Button('Generate', variant='primary')

        # Right column: the generated audio and its autoplay switch.
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                # toggle_autoplay is defined elsewhere in this file; it receives
                # the checkbox value and its result is applied to the audio player.
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # srt_file.submit(
    #     srt_process,
    #     inputs=[srt_file, voice],
    #     outputs=[audio]
    # )
    # Clicking "Generate" runs srt_process(srt_file, voice) and sends the
    # returned audio path to the player.
    generate_btn_.click(
        srt_process,
        inputs=[srt_file,voice],
        outputs=[audio]
    )
|
514 |
+
|
515 |
+
|
516 |
+
# Markdown body for the voice list: all names from voice_list joined with a
# space followed by a newline.
display_text = " \n".join(voice_list)

# Tab that only displays the available voice names as Markdown.
with gr.Blocks() as demo4:
    gr.Markdown(f"# Voice Names \n{display_text}")
|
520 |
|
521 |
+
|
522 |
+
# import click
|
523 |
+
# @click.command()
|
524 |
+
# @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
|
525 |
+
# @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
|
526 |
+
def main(debug=False, share=False):
|
527 |
+
demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS")
|
528 |
|
529 |
demo.queue().launch(debug=debug, share=share)
|
530 |
#Run on local network
|
|
|
557 |
|
558 |
# save_at=f"./temp_audio/{os.path.basename(result)}"
|
559 |
# shutil.move(result, save_at)
|
560 |
+
# print(f"Saved at {save_at}")
|