Intro
The Demucs model featured in the ICASSP 2024 Cadenza Challenge is a deep-learning-based music source separation technique that recovers clean source tracks (drums, bass, other, vocals) from an audio mixture. It relies on complex neural network structures, including encoder-decoder architectures and attention mechanisms, to improve the quality and accuracy of the separated audio. The model has demonstrated strong performance in a variety of areas, including post-processing for music production, audio analysis, and music information retrieval, bringing breakthroughs to the field of music technology.
Usage
import torch
import torchaudio
from typing import Callable
from functools import partial
from dataclasses import dataclass
from modelscope import snapshot_download
from torchaudio.models import hdemucs_high
@dataclass
class SourceSeparationBundle:
"""Dataclass that bundles components for performing source separation.
Example
>>> import torchaudio
>>> from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX
>>> import torch
>>>
>>> # Build the separation model.
>>> model = CONVTASNET_BASE_LIBRI2MIX.get_model()
>>> 100%|βββββββββββββββββββββββββββββββ|19.1M/19.1M [00:04<00:00, 4.93MB/s]
>>>
>>> # Instantiate the test set of Libri2Mix dataset.
>>> dataset = torchaudio.datasets.LibriMix("/home/datasets/", subset="test")
>>>
>>> # Apply source separation on mixture audio.
>>> for i, data in enumerate(dataset):
>>> sample_rate, mixture, clean_sources = data
>>> # Make sure the shape of input suits the model requirement.
>>> mixture = mixture.reshape(1, 1, -1)
>>> estimated_sources = model(mixture)
>>> score = si_snr_pit(estimated_sources, clean_sources) # for demonstration
>>> print(f"Si-SNR score is : {score}.)
>>> break
>>> Si-SNR score is : 16.24.
>>>
"""
_model_path: str
_model_factory_func: Callable[[], torch.nn.Module]
_sample_rate: int
@property
def sample_rate(self) -> int:
"""Sample rate of the audio that the model is trained on.
:type: int
"""
return self._sample_rate
def get_model(self) -> torch.nn.Module:
"""Construct the model and load the pretrained weight."""
model = self._model_factory_func()
path = torchaudio.utils.download_asset(self._model_path)
state_dict = torch.load(path)
model.load_state_dict(state_dict)
model.eval()
return model
model_dir = snapshot_download('monetjoe/hdemucs_high_musdbhq')
HDEMUCS_HIGH_MUSDB = SourceSeparationBundle(
    _model_path=f"{model_dir}/hdemucs_high_musdbhq_only.pt",
    _model_factory_func=partial(
        hdemucs_high, sources=["drums", "bass", "other", "vocals"]
    ),
    _sample_rate=44100,
)
HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained music source separation pipeline with
*Hybrid Demucs* :cite:`defossez2021hybrid` trained on the training set of MUSDB-HQ :cite:`MUSDB18HQ`.
The model is constructed by :func:`~torchaudio.models.hdemucs_high`.
Training was performed in the original HDemucs repository `here <https://github.com/facebookresearch/demucs/>`__.
Please refer to :class:`SourceSeparationBundle` for usage instructions.
"""
Maintenance
git clone [email protected]:monetjoe/hdemucs_high_musdbhq
cd hdemucs_high_musdbhq
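Note: if the checkpoint is tracked with Git LFS (typical for large .pt files on model hubs, though unverified here), install and initialize git-lfs before cloning so the weights download in full.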
Mirror
https://www.modelscope.cn/models/monetjoe/hdemucs_high_musdbhq