Update app.py
app.py
CHANGED
@@ -5,19 +5,22 @@ LangBridge Restricted
 '''
 
 import os
+
+import deepl
 import gradio as gr
-import whisperx
+import librosa
+import moviepy as mp
 import numpy as np
-import moviepy.editor as mp
-from moviepy.audio.AudioClip import AudioArrayClip
-from pytube import YouTube
-import deepl
-import torch
 import pyrubberband as pyrb
 import soundfile as sf
-import librosa
+import torch
+import whisperx
+from moviepy import afx
+from moviepy.audio.AudioClip import AudioArrayClip
+from pytube import YouTube
 from TTS.api import TTS
 
+
 HF_TOKEN = os.environ["HF_TOKEN"]
 DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
 
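The import reshuffle above tracks the MoviePy 2.x API break: the `moviepy.editor` entry point is gone, audio effects live in `moviepy.afx`, and the `set_*`/`subclip` mutators were renamed. A minimal before/after sketch of the renames this commit relies on (the file name is a placeholder):

```python
import moviepy as mp                  # MoviePy 2.x: was `import moviepy.editor as mp`
from moviepy import afx

clip = mp.VideoFileClip("input.mp4")                          # placeholder path
clip = clip.subclipped(0, 5)                                  # 1.x: clip.subclip(0, 5)
quiet = clip.audio.with_effects([afx.MultiplyVolume(0.15)])   # 1.x: clip.audio.volumex(0.15)
mixed = mp.CompositeAudioClip([quiet.with_start(0)])          # 1.x: .set_start(0)
clip = clip.with_audio(mixed)                                 # 1.x: clip.set_audio(mixed)
```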
@@ -25,11 +28,11 @@ DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
 os.environ["COQUI_TOS_AGREED"] = "1"
 
 # Extract audio from video
-def extract_audio(video_path):
-    ...
+def extract_audio(video_path: str):
+    clip = mp.VideoFileClip(video_path)
+    audio_path = os.path.splitext(video_path)[0] + ".wav"
+    clip.audio.write_audiofile(audio_path)
+    return audio_path
 
 
 # Perform speech diarization
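The new `extract_audio` writes a WAV next to the source video; `write_audiofile` infers the PCM codec from the `.wav` extension. A quick sanity check using the `soundfile` import already present in this file (the video path is a placeholder):

```python
import soundfile as sf

audio_path = extract_audio("sample.mp4")  # placeholder input video
data, sample_rate = sf.read(audio_path)   # float samples, shape (n_samples,) or (n_samples, channels)
print(data.shape, sample_rate)
```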
@@ -65,7 +68,6 @@ def speech_diarization(audio_path, hf_token):
 
     return result["segments"]
 
-
 # Create per speaker voice clips for tts voice cloning
 def speaker_voice_clips(transcription, audio_path):
     # Create 3 uninterrupted per speaker timecodes
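Only the tail of `speech_diarization` appears in this diff. For orientation, a typical whisperx transcribe-align-diarize pipeline ending in `result["segments"]` looks roughly like the sketch below; the model size, batch size, and device are assumptions, not values from this commit:

```python
import whisperx

def speech_diarization_sketch(audio_path, hf_token, device="cuda"):
    # Transcribe (model size and batch size are assumptions)
    model = whisperx.load_model("large-v2", device)
    audio = whisperx.load_audio(audio_path)
    result = model.transcribe(audio, batch_size=16)

    # Align word-level timestamps for the detected language
    align_model, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], align_model, metadata, audio, device)

    # Label speakers with the pyannote-backed pipeline (the HF token grants model access)
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
    result = whisperx.assign_word_speakers(diarize_model(audio), result)
    return result["segments"]
```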
@@ -92,7 +94,7 @@ def speaker_voice_clips(transcription, audio_path):
     subclips = []
     for snippet in speaker_snippets:
         start, end = snippet['start'], snippet['end']
-        subclip = original_audio.subclip(start, end)
+        subclip = original_audio.subclipped(start, end)
         subclips.append(subclip)
 
     concatenated_clip = mp.concatenate_audioclips(subclips)
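The concatenated per-speaker clip above is the reference audio for voice cloning; the `TTS.api` import plus the `COQUI_TOS_AGREED` flag point at Coqui XTTS. A hedged sketch of that synthesis call, where the model name, text, and paths are illustrative:

```python
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

tts.tts_to_file(
    text="Example translated sentence.",   # placeholder translated segment text
    speaker_wav="speaker_00_clip.wav",     # placeholder per-speaker reference clip
    language="es",                         # placeholder target language code
    file_path="speaker_00_speech.wav",
)
```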
@@ -201,7 +203,7 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
     audio_clip = AudioArrayClip(audio_stereo, fps=44100)
 
     # Cut out possible glitch from AudioArrayClip end
-    audio_clip = audio_clip.subclip(0, audio_clip.duration - 0.2)
+    audio_clip = audio_clip.subclipped(0, audio_clip.duration - 0.2)
     clips.append(audio_clip)
     print(f"Added speech: Start={speech_item['start']}, Final duration={audio_clip.duration}, Original duration={speech_item_duration}")
 
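The `Final duration` vs `Original duration` print suggests each synthesized clip is fitted to its source segment length, which is what the `pyrubberband`/`soundfile` imports are for. A sketch of that fitting step under that assumption (function and variable names are illustrative):

```python
import pyrubberband as pyrb
import soundfile as sf

def fit_to_duration(speech_path, target_duration):
    y, sr = sf.read(speech_path)
    rate = (len(y) / sr) / target_duration   # rate > 1 shortens, < 1 lengthens
    sf.write(speech_path, pyrb.time_stretch(y, sr, rate), sr)
```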
@@ -223,10 +225,10 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
 
 def dub_video(video_path, translated_audio_track, target_language):
     video = mp.VideoFileClip(video_path)
-    video = video.subclip(0, translated_audio_track.duration)
-    original_audio = video.audio.volumex(0.15)
-    dubbed_audio = mp.CompositeAudioClip([original_audio, translated_audio_track.set_start(0)])
-    video_with_dubbing = video.set_audio(dubbed_audio)
+    video = video.subclipped(0, translated_audio_track.duration)
+    original_audio = video.audio.with_effects([afx.MultiplyVolume(0.15)])
+    dubbed_audio = mp.CompositeAudioClip([original_audio, translated_audio_track.with_start(0)])
+    video_with_dubbing = video.with_audio(dubbed_audio)
 
     video_with_dubbing_path = os.path.splitext(video_path)[0] + "_" + target_language + ".mp4"
     video_with_dubbing.write_videofile(video_with_dubbing_path)
@@ -236,7 +238,7 @@ def dub_video(video_path, translated_audio_track, target_language):
 
 # Perform video translation
 def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
-
+
     original_audio_path = extract_audio(video_path)
 
     transcription = speech_diarization(original_audio_path, hf_token)
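Only the first two stages of `video_translation` sit inside this hunk. Pieced together from the helpers defined elsewhere in the file, the full pipeline plausibly reads as follows; the translation stage and the exact argument shapes are not shown in this diff, so they stay elided:

```python
def video_translation_sketch(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
    original_audio_path = extract_audio(video_path)
    transcription = speech_diarization(original_audio_path, hf_token)

    # Stages inferred from this file's other helpers (bodies not in this diff)
    voice_clips = speaker_voice_clips(transcription, original_audio_path)
    translated = ...  # DeepL translation of the segments, not shown here
    audio_track = voice_cloning_translation(translated, voice_clips, target_language_codes)
    return dub_video(video_path, audio_track, target_language_codes)
```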
@@ -336,11 +338,11 @@ def translate_video(video_path, youtube_link, target_language, speaker_model):
     if video_path is None:
         gr.Warning("Video input did not process well, try again")
         return translation_limit(), None
-
+
     if check_video_duration(video_path):
         gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
         return translation_limit(), None
-
+
     target_language_codes = language_codes[target_language]
     dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
     limit_info = translation_limit()