# DeepDubber-V1/src/moviedubber/infer_with_mmlm_result.py
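"""Inference helpers for DeepDubber-V1.

Covers the pieces this script needs: probing video duration with OpenCV,
muxing generated audio back into a clip with ffmpeg, extracting speaker
embeddings with a CAM++ ONNX model, and loading the DiT/ControlNet dubbing
model plus the BigVGAN vocoder.
"""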
import os
import subprocess as sp

import cv2
import onnxruntime
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from omegaconf import OmegaConf

from src.moviedubber.infer.utils_infer import (
    load_model,
    load_vocoder,
)
from src.moviedubber.model import ControlNetDiT, DiT


def get_video_duration(video_path):
    """Return the duration of a video file in seconds."""
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    if fps == 0:
        raise ValueError(f"Could not read FPS from {video_path}")
    duration = total_frames / fps
    return duration


def merge_video_audio(video_path, audio_path, output_path, start_time, duration):
    """Cut [start_time, start_time + duration] out of the video and mux in the given audio track."""
    command = [
        "ffmpeg",
        "-y",
        "-ss",
        str(start_time),
        "-t",
        str(duration),
        "-i",
        video_path,
        "-i",
        audio_path,
        "-c:v",
        "copy",  # stream-copy the video, no re-encode
        "-c:a",
        "aac",  # re-encode audio to AAC
        "-map",
        "0:v:0",  # video stream from the first input
        "-map",
        "1:a:0",  # audio stream from the second input
        "-shortest",
        "-strict",
        "experimental",
        output_path,
    ]
    try:
        sp.run(command, check=True, stdout=sp.DEVNULL, stderr=sp.DEVNULL, stdin=sp.DEVNULL)
        print(f"Successfully merged audio and video into {output_path}")
        return output_path
    except sp.CalledProcessError as e:
        print(f"Error merging audio and video: {e}")
        return None
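
# Example call (hypothetical paths): trim the clip to the dub's time span
# and swap in the generated audio:
#
#   merge_video_audio("scene.mp4", "dub.wav", "out.mp4", start_time=10.0, duration=3.2)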


def get_spk_emb(audio_path, ort_session):
    """Extract a speaker embedding from an audio file with the CAM++ ONNX model."""
    audio, sample_rate = torchaudio.load(str(audio_path))
    if sample_rate != 16000:
        audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)

    # 80-dim log-mel filterbank features, mean-normalized over time.
    feat = kaldi.fbank(audio, num_mel_bins=80, dither=0, sample_frequency=16000)
    feat = feat - feat.mean(dim=0, keepdim=True)

    # Add a batch dimension and run the ONNX session; flatten to a plain list.
    embedding = (
        ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0]
        .flatten()
        .tolist()
    )
    return embedding


def load_models(repo_local_path, device):
    """Load the DiT/ControlNet dubbing model, the BigVGAN vocoder, and the CAM++ ONNX session."""
    model_cfg_path = "src/moviedubber/configs/basemodel.yaml"
    vocoder_name = "bigvgan"

    vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)

    ckpt_path = os.path.join(repo_local_path, "mmdubber.pt")
    vocab_file = os.path.join(repo_local_path, "vocab.txt")
    campplus_path = os.path.join(repo_local_path, "campplus.onnx")

    model_cls = DiT
    model_cfg = OmegaConf.load(model_cfg_path).model.arch
    controlnet = ControlNetDiT

    ema_model = load_model(
        model_cls,
        model_cfg,
        ckpt_path=ckpt_path,
        mel_spec_type=vocoder_name,
        vocab_file=vocab_file,
        controlnet=controlnet,
        device=device,
    )

    # CPU-only ONNX Runtime session for the CAM++ speaker-embedding model.
    option = onnxruntime.SessionOptions()
    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    option.intra_op_num_threads = 1
    providers = ["CPUExecutionProvider"]
    ort_session = onnxruntime.InferenceSession(
        campplus_path,
        sess_options=option,
        providers=providers,
    )

    return ema_model, vocoder, ort_session
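

# Minimal end-to-end sketch (an assumption, not part of the original file):
# the checkpoint directory, device string, and media paths below are
# placeholders; the real pipeline drives these helpers from MMLM results.
if __name__ == "__main__":
    device = "cuda:0"
    repo_local_path = "./ckpts"  # hypothetical local checkpoint directory

    ema_model, vocoder, ort_session = load_models(repo_local_path, device=device)

    # Speaker embedding from a reference recording (hypothetical path).
    spk_emb = get_spk_emb("reference.wav", ort_session)
    print(f"speaker embedding dim: {len(spk_emb)}")

    # Mux a generated dub back into the source clip (hypothetical paths).
    duration = get_video_duration("scene.mp4")
    merge_video_audio("scene.mp4", "dub.wav", "out.mp4", start_time=0.0, duration=duration)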