none committed on
Commit 64daaa2 · 1 Parent(s): 1b58092
Files changed (2):
  1. app.py +18 -17
  2. src/moviedubber/infer_with_mmlm_result.py +4 -4
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import os.path as osp
 import sys
 import tempfile
 
@@ -8,7 +9,7 @@ import soundfile
 import torch
 import torch.nn.functional as F
 import torchaudio
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, snapshot_download
 from moviepy import VideoFileClip
 from pydub import AudioSegment
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline
@@ -52,8 +53,8 @@ def load_asr_model(model_id="openai/whisper-large-v3-turbo"):
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-mmlm_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="mmlm")
-
+repo_local_path = snapshot_download(repo_id="woak-oa/DeepDubber-V1")
+mmlm_path = osp.join(repo_local_path, "mmlm")
 mmlm = InternVLChatModel.from_pretrained(
     mmlm_path,
     torch_dtype=torch.bfloat16,
@@ -67,7 +68,7 @@ tokenizer = AutoTokenizer.from_pretrained(mmlm_path, trust_remote_code=True, use
 generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 
-ema_model, vocoder, ort_session = load_models(device=device)
+ema_model, vocoder, ort_session = load_models(repo_local_path, device=device)
 asr_pipe = load_asr_model()
 
 videofeature_extractor = VideoFeatureExtractor(device=device)
@@ -190,25 +191,25 @@ def deepdubber(video_path: str, subtitle_text: str, audio_path: str = None) -> s
 
 
 def process_video_dubbing(video_path: str, subtitle_text: str, audio_path: str = None) -> str:
-    # try:
-    print(f"Processing video: {video_path}")
-    if not os.path.exists(video_path):
-        raise ValueError("Video file does not exist")
+    try:
+        print(f"Processing video: {video_path}")
+        if not os.path.exists(video_path):
+            raise ValueError("Video file does not exist")
 
-    if not subtitle_text.strip():
-        raise ValueError("Subtitle text cannot be empty")
+        if not subtitle_text.strip():
+            raise ValueError("Subtitle text cannot be empty")
 
-    if audio_path is None:
-        audio_path = "datasets/CoTMovieDubbing/GT.wav"
+        if audio_path is None:
+            audio_path = "datasets/CoTMovieDubbing/GT.wav"
 
-    res, output_path = deepdubber(video_path, subtitle_text, audio_path)
+        res, output_path = deepdubber(video_path, subtitle_text, audio_path)
 
-    return res, output_path
+        return res, output_path
 
-    # except Exception as e:
-    #     print(f"Error in process_video_dubbing: {e}")
+    except Exception as e:
+        print(f"Error in process_video_dubbing: {e}")
 
-    #     return None, None
+        return None, None
 
 
 def create_ui():
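
Net effect of the app.py changes, as a minimal sketch (not the app's actual code): the model repo is fetched once with snapshot_download and individual assets are then resolved as local paths, instead of one hf_hub_download call per file. The repo id and filenames below come from the diff; the surrounding scaffolding is assumed for illustration.

# Minimal sketch of the loading pattern this commit switches to.
import os.path as osp

from huggingface_hub import snapshot_download

# One snapshot_download call downloads (and caches) the whole repo locally,
# so the paths below are plain filesystem paths with no further network calls.
repo_local_path = snapshot_download(repo_id="woak-oa/DeepDubber-V1")

mmlm_path = osp.join(repo_local_path, "mmlm")               # multimodal LM directory
ckpt_path = osp.join(repo_local_path, "mmdubber.pt")        # dubbing model checkpoint
vocab_file = osp.join(repo_local_path, "vocab.txt")
campplus_path = osp.join(repo_local_path, "campplus.onnx")  # speaker-embedding model
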
src/moviedubber/infer_with_mmlm_result.py CHANGED
@@ -65,15 +65,15 @@ def get_spk_emb(audio_path, ort_session):
     return embedding
 
 
-def load_models(device):
+def load_models(repo_local_path, device):
     model_cfg = "src/moviedubber/configs/basemodel.yaml"
     vocoder_name = "bigvgan"
 
     vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)
 
-    ckpt_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="mmdubber.pt")
-    vocab_file = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="vocab.txt")
-    campplus_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1", filename="campplus.onnx")
+    ckpt_path = os.path.join(repo_local_path, "mmdubber.pt")
+    vocab_file = os.path.join(repo_local_path, "vocab.txt")
+    campplus_path = os.path.join(repo_local_path, "campplus.onnx")
 
     model_cls = DiT
     model_cfg = OmegaConf.load(model_cfg).model.arch
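
A short usage sketch of the updated load_models signature, assuming the import path mirrors the file's location in the repo and that repo_local_path is the directory returned by snapshot_download in app.py:

import torch
from huggingface_hub import snapshot_download

# Assumed import path, based on the file's location in this repo.
from src.moviedubber.infer_with_mmlm_result import load_models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
repo_local_path = snapshot_download(repo_id="woak-oa/DeepDubber-V1")

# load_models now takes the snapshot directory first and joins the checkpoint,
# vocab, and campplus filenames onto it internally.
ema_model, vocoder, ort_session = load_models(repo_local_path, device=device)
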