Spaces:
Running
Running
import os | |
import subprocess as sp | |
import cv2 | |
import onnxruntime | |
import torchaudio | |
import torchaudio.compliance.kaldi as kaldi | |
from omegaconf import OmegaConf | |
from src.moviedubber.infer.utils_infer import ( | |
load_model, | |
load_vocoder, | |
) | |
from src.moviedubber.model import ControlNetDiT, DiT | |
def get_video_duration(video_path):
    """Return the duration of a video in seconds (frame count divided by FPS).

    Args:
        video_path: Path of the video file passed to ``cv2.VideoCapture``.

    Returns:
        The frame count reported by the capture divided by its reported
        frames-per-second value.

    Raises:
        ZeroDivisionError: If the capture reports an FPS of 0.
            NOTE(review): presumably this is what an unreadable or missing
            file produces -- confirm against the caller's expectations.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the capture explicitly so the underlying file/decoder handle
        # is freed right away instead of whenever the object gets collected.
        cap.release()
    return total_frames / fps
def merge_video_audio(video_path, audio_path, output_path, start_time, duration):
    """Mux a segment of a video with an audio track using the ``ffmpeg`` CLI.

    The video stream is copied without re-encoding, the audio is encoded as
    AAC, and the output stops at the shorter of the two streams.

    Args:
        video_path: Path of the source video (first ffmpeg input).
        audio_path: Path of the audio file (second ffmpeg input).
        output_path: Path of the file to write; overwritten if present (``-y``).
        start_time: Seek position passed to ``-ss``.
        duration: Segment length passed to ``-t``.

    Returns:
        ``output_path`` on success, or ``None`` if ffmpeg exits with a
        non-zero status or cannot be launched at all.
    """
    command = [
        "ffmpeg",
        "-y",
        # "-ss"/"-t" come before the first "-i", so they apply to the video
        # input only; the audio input is read from its beginning.
        "-ss",
        str(start_time),
        "-t",
        str(duration),
        "-i",
        video_path,
        "-i",
        audio_path,
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-shortest",
        "-strict",
        "experimental",
        output_path,
    ]
    try:
        # stderr is captured (not discarded) so a failure can be diagnosed.
        sp.run(command, check=True, stdout=sp.DEVNULL, stderr=sp.PIPE, stdin=sp.DEVNULL)
    except sp.CalledProcessError as e:
        print(f"Error merging audio and video: {e}")
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if stderr_text:
            # Only the tail is shown: ffmpeg prints its banner and stream info
            # first, and the actual error message comes last.
            print("\n".join(stderr_text.splitlines()[-20:]))
        return None
    except OSError as e:
        # ffmpeg is missing or not executable: report it through the same
        # None-on-failure contract instead of crashing the caller.
        print(f"Error merging audio and video: {e}")
        return None

    print(f"Successfully merged audio and video into {output_path}")
    return output_path
def get_spk_emb(audio_path, ort_session):
    """Compute a speaker embedding for an audio file with an ONNX model.

    The audio is loaded, resampled to 16 kHz when needed, converted to 80-bin
    Kaldi filterbank features, mean-normalized along dim 0, and fed to the
    first input of ``ort_session``.

    Args:
        audio_path: Path of the audio file; converted with ``str()`` before loading.
        ort_session: Session object exposing ``get_inputs()`` and ``run()``.

    Returns:
        The first output of the session, flattened into a plain Python list.
    """
    target_rate = 16000

    waveform, source_rate = torchaudio.load(str(audio_path))
    if source_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    fbank = kaldi.fbank(waveform, num_mel_bins=80, dither=0, sample_frequency=target_rate)
    # Subtract the mean taken over dim 0 from every row.
    centered = fbank - fbank.mean(dim=0, keepdim=True)

    input_name = ort_session.get_inputs()[0].name
    # Prepend a dimension and hand the features over as a CPU numpy array.
    model_input = centered.unsqueeze(dim=0).cpu().numpy()
    outputs = ort_session.run(None, {input_name: model_input})
    return outputs[0].flatten().tolist()
def load_models(repo_local_path, device):
    """Load the dubbing model, the vocoder, and the ONNX speaker session.

    Args:
        repo_local_path: Directory containing ``mmdubber.pt``, ``vocab.txt``
            and ``campplus.onnx``.
        device: Device passed through to ``load_vocoder`` and ``load_model``.

    Returns:
        A tuple ``(ema_model, vocoder, ort_session)``: the object returned by
        ``load_model``, the object returned by ``load_vocoder``, and an
        ``onnxruntime.InferenceSession`` created with the CPU provider.
    """
    mel_spec_type = "bigvgan"
    bigvgan = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)

    # Everything else is resolved relative to the local repository directory.
    checkpoint_file, vocabulary_file, campplus_file = (
        os.path.join(repo_local_path, filename)
        for filename in ("mmdubber.pt", "vocab.txt", "campplus.onnx")
    )

    arch_cfg = OmegaConf.load("src/moviedubber/configs/basemodel.yaml").model.arch
    dubber = load_model(
        DiT,
        arch_cfg,
        ckpt_path=checkpoint_file,
        mel_spec_type=mel_spec_type,
        vocab_file=vocabulary_file,
        controlnet=ControlNetDiT,
        device=device,
    )

    # Speaker-embedding session: all graph optimizations enabled, a single
    # intra-op thread, CPU execution provider only.
    session_options = onnxruntime.SessionOptions()
    session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.intra_op_num_threads = 1
    speaker_session = onnxruntime.InferenceSession(
        campplus_file,
        sess_options=session_options,
        providers=["CPUExecutionProvider"],
    )

    return dubber, bigvgan, speaker_session