# Jarod Castillo | init | 88b57c0 | raw | history blame | 2.37 kB
# Standard Library Imports
# Third Party Imports
import torch
import onnxruntime as ort
# Local Imports
from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
from src.loader import Loader
# Global Variables
from src.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE, ONNX_MODEL_PATH
class KimVocal:
    """
    Chunked audio demixing through an ONNX Runtime session.

    TODO: Put something here for flexibility purposes (model types).
    """

    def __init__(self):
        pass

    def demix_vocals(self, music_tensor, sample_rate, model, streamlit_progressbar):
        """
        Demix a track with the ONNX model, one overlapping chunk at a time.

        The mix is zero-padded, cut into windows of ``model.chunk_size``
        samples that advance by ``chunk_size - 2 * overlap`` samples, and each
        window is passed through ``model.stft`` -> ONNX session ->
        ``model.stft.inverse``. The ``model.overlap`` samples at both ends of
        every result are discarded and the remaining pieces are concatenated.

        Args:
            music_tensor (torch.Tensor): Input mix, indexed as
                (channels, samples).
            sample_rate: Currently unused by this method; kept so existing
                callers do not break.
            model: Object exposing ``overlap``, ``chunk_size``, a callable
                ``stft`` and ``stft.inverse``.
            streamlit_progressbar: Object whose ``progress(fraction)`` method
                is called once per chunk with the fraction processed so far.

        Returns:
            torch.Tensor: CPU tensor holding the concatenated chunk outputs
            with the trailing padding removed.

        Raises:
            ValueError: If ``model.overlap`` is negative or
                ``model.chunk_size`` is not greater than ``2 * model.overlap``
                (the window would never advance).
        """
        overlap = model.overlap
        chunk_size = model.chunk_size
        # Only the centre of every chunk is kept, so this is also the hop
        # between consecutive chunks.
        gen_size = chunk_size - 2 * overlap
        if overlap < 0 or gen_size <= 0:
            raise ValueError(
                f"model.chunk_size ({chunk_size}) must be greater than twice "
                f"model.overlap ({overlap}), and overlap must not be negative"
            )

        channels, number_of_samples = music_tensor.shape[0], music_tensor.shape[1]
        # pad_size always lies in [1, gen_size], so the final `[:-pad_size]`
        # trim can never degenerate into `[:-0]`.
        pad_size = gen_size - number_of_samples % gen_size
        total_samples = number_of_samples + pad_size

        # Pad with the input's own channel count and device instead of a
        # hard-coded stereo CPU tensor.
        device = music_tensor.device
        mix_padded = torch.cat(
            [
                torch.zeros(channels, overlap, device=device),
                music_tensor,
                torch.zeros(channels, pad_size + overlap, device=device),
            ],
            1,
        )

        # Start running the session for the model
        ort_session = ort.InferenceSession(ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST)

        # TODO: any way to optimize against silence? I think that's what skips are for, gotta double check.
        # process one chunk at a time (batch_size=1)
        demixed_chunks = []
        # Inference only: keep autograd out of the STFT / inverse STFT as well.
        with torch.no_grad():
            for start in range(0, total_samples, gen_size):
                # Progress Bar
                streamlit_progressbar.progress(start / total_samples)

                # Computation
                chunk = mix_padded[:, start : start + chunk_size]
                spectrum = model.stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
                prediction = torch.tensor(
                    ort_session.run(None, {"input": spectrum.cpu().numpy()})[0]
                )
                audio = model.stft.inverse(prediction).squeeze(0)
                # Explicit end index: `overlap:-overlap` would yield an empty
                # tensor when overlap == 0.
                audio = audio[..., overlap : audio.shape[-1] - overlap]
                demixed_chunks.append(audio)

        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
        return vocals_output
if __name__ == "__main__":
    # KimVocal defines no main() method, so calling one here could only fail
    # with AttributeError. Demixing needs an audio tensor, a loaded model and
    # a progress bar supplied by the caller, so exit with guidance instead.
    raise SystemExit(
        "This module is not a standalone script: import KimVocal and call "
        "demix_vocals(music_tensor, sample_rate, model, streamlit_progressbar)."
    )