vitaliy-sharandin committed on
Commit
9c9cefd
·
verified ·
1 Parent(s): a055e93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -22
app.py CHANGED
@@ -5,19 +5,22 @@ LangBridge Restricted
5
  '''
6
 
7
  import os
 
 
8
  import gradio as gr
9
- import whisperx
 
10
  import numpy as np
11
- import moviepy.editor as mp
12
- from moviepy.audio.AudioClip import AudioArrayClip
13
- from pytube import YouTube
14
- import deepl
15
- import torch
16
  import pyrubberband as pyrb
17
  import soundfile as sf
18
- import librosa
 
 
 
 
19
  from TTS.api import TTS
20
 
 
21
  HF_TOKEN = os.environ["HF_TOKEN"]
22
  DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
23
 
@@ -25,11 +28,11 @@ DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
25
  os.environ["COQUI_TOS_AGREED"] = "1"
26
 
27
# Extract audio from video
def extract_audio(video_path):
    """Write the video's audio track to a sibling .wav file.

    Args:
        video_path: Path to the source video file.

    Returns:
        The path of the written .wav file (same basename as the video,
        with a .wav extension).
    """
    # Use a context manager so the underlying ffmpeg reader/subprocess
    # is released even if write_audiofile raises (the original leaked
    # the clip by never calling close()).
    with mp.VideoFileClip(video_path) as clip:
        audio_path = os.path.splitext(video_path)[0] + ".wav"
        clip.audio.write_audiofile(audio_path)
    return audio_path
33
 
34
 
35
  # Perform speech diarization
@@ -65,7 +68,6 @@ def speech_diarization(audio_path, hf_token):
65
 
66
  return result["segments"]
67
 
68
-
69
  # Create per speaker voice clips for tts voice cloning
70
  def speaker_voice_clips(transcription, audio_path):
71
  # Create 3 uninterrupted per speaker timecodes
@@ -92,7 +94,7 @@ def speaker_voice_clips(transcription, audio_path):
92
  subclips = []
93
  for snippet in speaker_snippets:
94
  start, end = snippet['start'], snippet['end']
95
- subclip = original_audio.subclip(start, end)
96
  subclips.append(subclip)
97
 
98
  concatenated_clip = mp.concatenate_audioclips(subclips)
@@ -201,7 +203,7 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
201
  audio_clip = AudioArrayClip(audio_stereo, fps=44100)
202
 
203
  # Cut out possible glitch from AudioArrayClip end
204
- audio_clip = audio_clip.subclip(0, audio_clip.duration - 0.2)
205
  clips.append(audio_clip)
206
  print(f"Added speech: Start={speech_item['start']}, Final duration={audio_clip.duration}, Original duration={speech_item_duration}")
207
 
@@ -223,10 +225,10 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
223
 
224
  def dub_video(video_path, translated_audio_track, target_language):
225
  video = mp.VideoFileClip(video_path)
226
- video = video.subclip(0, translated_audio_track.duration)
227
- original_audio = video.audio.volumex(0.15)
228
- dubbed_audio = mp.CompositeAudioClip([original_audio, translated_audio_track.set_start(0)])
229
- video_with_dubbing = video.set_audio(dubbed_audio)
230
 
231
  video_with_dubbing_path = os.path.splitext(video_path)[0] + "_" + target_language + ".mp4"
232
  video_with_dubbing.write_videofile(video_with_dubbing_path)
@@ -236,7 +238,7 @@ def dub_video(video_path, translated_audio_track, target_language):
236
 
237
  # Perform video translation
238
  def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
239
-
240
  original_audio_path = extract_audio(video_path)
241
 
242
  transcription = speech_diarization(original_audio_path, hf_token)
@@ -336,11 +338,11 @@ def translate_video(video_path, youtube_link, target_language, speaker_model):
336
  if video_path is None:
337
  gr.Warning("Video input did not process well, try again")
338
  return translation_limit(), None
339
-
340
  if check_video_duration(video_path):
341
  gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
342
  return translation_limit(), None
343
-
344
  target_language_codes = language_codes[target_language]
345
  dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
346
  limit_info = translation_limit()
 
5
  '''
6
 
7
  import os
8
+
9
+ import deepl
10
  import gradio as gr
11
+ import librosa
12
+ import moviepy as mp
13
  import numpy as np
 
 
 
 
 
14
  import pyrubberband as pyrb
15
  import soundfile as sf
16
+ import torch
17
+ import whisperx
18
+ from moviepy import afx
19
+ from moviepy.audio.AudioClip import AudioArrayClip
20
+ from pytube import YouTube
21
  from TTS.api import TTS
22
 
23
+
24
  HF_TOKEN = os.environ["HF_TOKEN"]
25
  DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
26
 
 
28
  os.environ["COQUI_TOS_AGREED"] = "1"
29
 
30
# Extract audio from video
def extract_audio(video_path: str):
    """Write the video's audio track to a sibling .wav file.

    Args:
        video_path: Path to the source video file.

    Returns:
        The path of the written .wav file (same basename as the video,
        with a .wav extension).
    """
    # Use a context manager so the underlying ffmpeg reader/subprocess
    # is released even if write_audiofile raises (the original leaked
    # the clip by never calling close()).
    with mp.VideoFileClip(video_path) as clip:
        audio_path = os.path.splitext(video_path)[0] + ".wav"
        clip.audio.write_audiofile(audio_path)
    return audio_path
36
 
37
 
38
  # Perform speech diarization
 
68
 
69
  return result["segments"]
70
 
 
71
  # Create per speaker voice clips for tts voice cloning
72
  def speaker_voice_clips(transcription, audio_path):
73
  # Create 3 uninterrupted per speaker timecodes
 
94
  subclips = []
95
  for snippet in speaker_snippets:
96
  start, end = snippet['start'], snippet['end']
97
+ subclip = original_audio.subclipped(start, end)
98
  subclips.append(subclip)
99
 
100
  concatenated_clip = mp.concatenate_audioclips(subclips)
 
203
  audio_clip = AudioArrayClip(audio_stereo, fps=44100)
204
 
205
  # Cut out possible glitch from AudioArrayClip end
206
+ audio_clip = audio_clip.subclipped(0, audio_clip.duration - 0.2)
207
  clips.append(audio_clip)
208
  print(f"Added speech: Start={speech_item['start']}, Final duration={audio_clip.duration}, Original duration={speech_item_duration}")
209
 
 
225
 
226
  def dub_video(video_path, translated_audio_track, target_language):
227
  video = mp.VideoFileClip(video_path)
228
+ video = video.subclipped(0, translated_audio_track.duration)
229
+ original_audio = video.audio.with_effects([afx.MultiplyVolume(0.15)])
230
+ dubbed_audio = mp.CompositeAudioClip([original_audio, translated_audio_track.with_start(0)])
231
+ video_with_dubbing = video.with_audio(dubbed_audio)
232
 
233
  video_with_dubbing_path = os.path.splitext(video_path)[0] + "_" + target_language + ".mp4"
234
  video_with_dubbing.write_videofile(video_with_dubbing_path)
 
238
 
239
  # Perform video translation
240
  def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
241
+
242
  original_audio_path = extract_audio(video_path)
243
 
244
  transcription = speech_diarization(original_audio_path, hf_token)
 
338
  if video_path is None:
339
  gr.Warning("Video input did not process well, try again")
340
  return translation_limit(), None
341
+
342
  if check_video_duration(video_path):
343
  gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
344
  return translation_limit(), None
345
+
346
  target_language_codes = language_codes[target_language]
347
  dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
348
  limit_info = translation_limit()