Spaces:

woak-oa
/

DeepDubber-V1

Running

App Files Files Community

none committed on Mar 25

Commit

64daaa2

1 Parent(s): 1b58092

init

Browse files

Files changed (2) hide show

app.py +18 -17
src/moviedubber/infer_with_mmlm_result.py +4 -4

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import sys
 import tempfile
@@ -8,7 +9,7 @@ import soundfile
 import torch
 import torch.nn.functional as F
 import torchaudio
-from huggingface_hub import hf_hub_download
 from moviepy import VideoFileClip
 from pydub import AudioSegment
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline
@@ -52,8 +53,8 @@ def load_asr_model(model_id="openai/whisper-large-v3-turbo"):
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-mmlm_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="mmlm")
 mmlm = InternVLChatModel.from_pretrained(
     mmlm_path,
     torch_dtype=torch.bfloat16,
@@ -67,7 +68,7 @@ tokenizer = AutoTokenizer.from_pretrained(mmlm_path, trust_remote_code=True, use
 generation_config = dict(max_new_tokens=1024, do_sample=False)
-ema_model, vocoder, ort_session = load_models(device=device)
 asr_pipe = load_asr_model()
 videofeature_extractor = VideoFeatureExtractor(device=device)
@@ -190,25 +191,25 @@ def deepdubber(video_path: str, subtitle_text: str, audio_path: str = None) -> s
 def process_video_dubbing(video_path: str, subtitle_text: str, audio_path: str = None) -> str:
-    # try:
-    print(f"Processing video: {video_path}")
-    if not os.path.exists(video_path):
-        raise ValueError("Video file does not exist")
-    if not subtitle_text.strip():
-        raise ValueError("Subtitle text cannot be empty")
-    if audio_path is None:
-        audio_path = "datasets/CoTMovieDubbing/GT.wav"
-    res, output_path = deepdubber(video_path, subtitle_text, audio_path)
-    return res, output_path
-    # except Exception as e:
-    #     print(f"Error in process_video_dubbing: {e}")
-    #     return None, None
 def create_ui():

 import os
+import os.path as osp
 import sys
 import tempfile
 import torch
 import torch.nn.functional as F
 import torchaudio
+from huggingface_hub import hf_hub_download, snapshot_download
 from moviepy import VideoFileClip
 from pydub import AudioSegment
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+repo_local_path = snapshot_download(repo_id="woak-oa/DeepDubber-V1")
+mmlm_path = osp.join(repo_local_path, "mmlm")
 mmlm = InternVLChatModel.from_pretrained(
     mmlm_path,
     torch_dtype=torch.bfloat16,
 generation_config = dict(max_new_tokens=1024, do_sample=False)
+ema_model, vocoder, ort_session = load_models(repo_local_path, device=device)
 asr_pipe = load_asr_model()
 videofeature_extractor = VideoFeatureExtractor(device=device)
 def process_video_dubbing(video_path: str, subtitle_text: str, audio_path: str = None) -> str:
+    try:
+        print(f"Processing video: {video_path}")
+        if not os.path.exists(video_path):
+            raise ValueError("Video file does not exist")
+        if not subtitle_text.strip():
+            raise ValueError("Subtitle text cannot be empty")
+        if audio_path is None:
+            audio_path = "datasets/CoTMovieDubbing/GT.wav"
+        res, output_path = deepdubber(video_path, subtitle_text, audio_path)
+        return res, output_path
+    except Exception as e:
+        print(f"Error in process_video_dubbing: {e}")
+        return None, None
 def create_ui():

src/moviedubber/infer_with_mmlm_result.py CHANGED Viewed

@@ -65,15 +65,15 @@ def get_spk_emb(audio_path, ort_session):
     return embedding
-def load_models(device):
     model_cfg = "src/moviedubber/configs/basemodel.yaml"
     vocoder_name = "bigvgan"
     vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)
-    ckpt_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="mmdubber.pt")
-    vocab_file = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="vocab.txt")
-    campplus_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="campplus.onnx")
     model_cls = DiT
     model_cfg = OmegaConf.load(model_cfg).model.arch

     return embedding
+def load_models(repo_local_path, device):
     model_cfg = "src/moviedubber/configs/basemodel.yaml"
     vocoder_name = "bigvgan"
     vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)
+    ckpt_path = os.path.join(repo_local_path, "mmdubber.pt")
+    vocab_file = os.path.join(repo_local_path, "vocab.txt")
+    campplus_path = os.path.join(repo_local_path, "campplus.onnx")
     model_cls = DiT
     model_cfg = OmegaConf.load(model_cfg).model.arch