Spaces: Running on Zero
Upload 131 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +20 -0
- audio_separator/__init__.py +0 -0
- audio_separator/model-data.json +22 -0
- audio_separator/models-scores.json +0 -0
- audio_separator/models.json +216 -0
- audio_separator/separator/__init__.py +1 -0
- audio_separator/separator/architectures/__init__.py +0 -0
- audio_separator/separator/architectures/demucs_separator.py +195 -0
- audio_separator/separator/architectures/mdx_separator.py +451 -0
- audio_separator/separator/architectures/mdxc_separator.py +423 -0
- audio_separator/separator/architectures/vr_separator.py +357 -0
- audio_separator/separator/common_separator.py +403 -0
- audio_separator/separator/separator.py +959 -0
- audio_separator/separator/uvr_lib_v5/__init__.py +0 -0
- audio_separator/separator/uvr_lib_v5/demucs/__init__.py +5 -0
- audio_separator/separator/uvr_lib_v5/demucs/__main__.py +212 -0
- audio_separator/separator/uvr_lib_v5/demucs/apply.py +294 -0
- audio_separator/separator/uvr_lib_v5/demucs/demucs.py +453 -0
- audio_separator/separator/uvr_lib_v5/demucs/filtering.py +451 -0
- audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py +783 -0
- audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py +620 -0
- audio_separator/separator/uvr_lib_v5/demucs/model.py +204 -0
- audio_separator/separator/uvr_lib_v5/demucs/model_v2.py +222 -0
- audio_separator/separator/uvr_lib_v5/demucs/pretrained.py +181 -0
- audio_separator/separator/uvr_lib_v5/demucs/repo.py +146 -0
- audio_separator/separator/uvr_lib_v5/demucs/spec.py +38 -0
- audio_separator/separator/uvr_lib_v5/demucs/states.py +131 -0
- audio_separator/separator/uvr_lib_v5/demucs/tasnet.py +401 -0
- audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py +404 -0
- audio_separator/separator/uvr_lib_v5/demucs/transformer.py +675 -0
- audio_separator/separator/uvr_lib_v5/demucs/utils.py +496 -0
- audio_separator/separator/uvr_lib_v5/mdxnet.py +136 -0
- audio_separator/separator/uvr_lib_v5/mixer.ckpt +3 -0
- audio_separator/separator/uvr_lib_v5/modules.py +74 -0
- audio_separator/separator/uvr_lib_v5/playsound.py +241 -0
- audio_separator/separator/uvr_lib_v5/pyrb.py +92 -0
- audio_separator/separator/uvr_lib_v5/results.py +48 -0
- audio_separator/separator/uvr_lib_v5/roformer/attend.py +112 -0
- audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py +535 -0
- audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py +445 -0
- audio_separator/separator/uvr_lib_v5/spec_utils.py +1327 -0
- audio_separator/separator/uvr_lib_v5/stft.py +126 -0
- audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py +253 -0
- audio_separator/separator/uvr_lib_v5/vr_network/__init__.py +1 -0
- audio_separator/separator/uvr_lib_v5/vr_network/layers.py +294 -0
- audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py +149 -0
- audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py +71 -0
- audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json +19 -0
- audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json +19 -0
- audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json +19 -0
.gitattributes
CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tests/inputs/mardy20s.flac filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Bass)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Drum-Bass)_model_bs_roformer_ep_937_sdr_10_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Drums)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Guitar)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Instrumental)_2_HP-UVR_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Instrumental)_kuielab_b_vocals_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Instrumental)_MGM_MAIN_v4_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Instrumental)_model_bs_roformer_ep_317_sdr_12_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Instrumental)_UVR-MDX-NET-Inst_HQ_4_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(No[[:space:]]Drum-Bass)_model_bs_roformer_ep_937_sdr_10_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Other)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Piano)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_2_HP-UVR_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_htdemucs_6s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_kuielab_b_vocals_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_MGM_MAIN_v4_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_model_bs_roformer_ep_317_sdr_12_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_(Vocals)_UVR-MDX-NET-Inst_HQ_4_spectrogram.png filter=lfs diff=lfs merge=lfs -text
+tests/inputs/reference/expected_mardy20s_spectrogram.png filter=lfs diff=lfs merge=lfs -text
audio_separator/__init__.py
ADDED
File without changes
audio_separator/model-data.json
ADDED
@@ -0,0 +1,22 @@
+{
+    "vr_model_data": {
+        "97dc361a7a88b2c4542f68364b32c7f6": {
+            "vr_model_param": "4band_v4_ms_fullband",
+            "primary_stem": "Dry",
+            "nout": 32,
+            "nout_lstm": 128,
+            "is_karaoke": false,
+            "is_bv_model": false,
+            "is_bv_model_rebalanced": 0.0
+        }
+    },
+    "mdx_model_data": {
+        "cb790d0c913647ced70fc6b38f5bea1a": {
+            "compensate": 1.010,
+            "mdx_dim_f_set": 2560,
+            "mdx_dim_t_set": 8,
+            "mdx_n_fft_scale_set": 5120,
+            "primary_stem": "Instrumental"
+        }
+    }
+}
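For context on how this file is likely consumed: the two top-level sections group per-architecture parameters, keyed by what appear to be MD5-style hashes of the model files. A minimal lookup sketch, assuming whole-file MD5 hashing (the function name and the exact hashing scheme are illustrative assumptions, not code from this upload):

import hashlib
import json

def load_model_params(model_path, model_data_path="audio_separator/model-data.json"):
    # Hash the model file; the keys above look like MD5 hex digests, but
    # whole-file hashing is an assumption about the upstream scheme.
    with open(model_path, "rb") as f:
        model_hash = hashlib.md5(f.read()).hexdigest()
    with open(model_data_path, encoding="utf-8") as f:
        model_data = json.load(f)
    # Search each architecture section for the hash.
    for section in ("vr_model_data", "mdx_model_data"):
        if model_hash in model_data[section]:
            return model_data[section][model_hash]
    raise KeyError(f"No parameters found for model hash {model_hash}")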
audio_separator/models-scores.json
ADDED
The diff for this file is too large to render.
See raw diff
audio_separator/models.json
ADDED
@@ -0,0 +1,216 @@
+{
+    "vr_download_list": {
+        "VR Arch Single Model v4: UVR-De-Reverb by aufr33-jarredou": "UVR-De-Reverb-aufr33-jarredou.pth"
+    },
+    "mdx_download_list": {
+        "MDX-Net Model: UVR-MDX-NET Inst HQ 5": "UVR-MDX-NET-Inst_HQ_5.onnx"
+    },
+    "mdx23c_download_list": {
+        "MDX23C Model: MDX23C De-Reverb by aufr33-jarredou": {
+            "MDX23C-De-Reverb-aufr33-jarredou.ckpt": "config_dereverb_mdx23c.yaml"
+        },
+        "MDX23C Model: MDX23C DrumSep by aufr33-jarredou": {
+            "MDX23C-DrumSep-aufr33-jarredou.ckpt": "config_drumsep_mdx23c.yaml"
+        }
+    },
+    "roformer_download_list": {
+        "Roformer Model: Mel-Roformer-Karaoke-Aufr33-Viperx": {
+            "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt": "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Karaoke by Gabox": {
+            "mel_band_roformer_karaoke_gabox.ckpt": "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Karaoke by becruily": {
+            "mel_band_roformer_karaoke_becruily.ckpt": "config_mel_band_roformer_karaoke_becruily.yaml"
+        },
+        "Roformer Model: Mel-Roformer-Denoise-Aufr33": {
+            "denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt": "denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml"
+        },
+        "Roformer Model: Mel-Roformer-Denoise-Aufr33-Aggr": {
+            "denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt": "denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Denoise-Debleed by Gabox": {
+            "mel_band_roformer_denoise_debleed_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: Mel-Roformer-Crowd-Aufr33-Viperx": {
+            "mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt": "mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml"
+        },
+        "Roformer Model: BS-Roformer-De-Reverb": {
+            "deverb_bs_roformer_8_384dim_10depth.ckpt": "deverb_bs_roformer_8_384dim_10depth_config.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals by Kimberley Jensen": {
+            "vocals_mel_band_roformer.ckpt": "vocals_mel_band_roformer.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | FT by unwa": {
+            "mel_band_roformer_kim_ft_unwa.ckpt": "config_mel_band_roformer_kim_ft_unwa.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | FT 2 by unwa": {
+            "mel_band_roformer_kim_ft2_unwa.ckpt": "config_mel_band_roformer_kim_ft_unwa.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | FT 2 Bleedless by unwa": {
+            "mel_band_roformer_kim_ft2_bleedless_unwa.ckpt": "config_mel_band_roformer_kim_ft_unwa.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | FT 3 by unwa": {
+            "mel_band_roformer_kim_ft3_unwa.ckpt": "config_mel_band_roformer_kim_ft_unwa.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Inst V1 Plus by Unwa": {
+            "melband_roformer_inst_v1_plus.ckpt": "config_melbandroformer_inst.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Inst V1 (E) by Unwa": {
+            "melband_roformer_inst_v1e.ckpt": "config_melbandroformer_inst.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Inst V1 (E) Plus by Unwa": {
+            "melband_roformer_inst_v1e_plus.ckpt": "config_melbandroformer_inst.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals by becruily": {
+            "mel_band_roformer_vocals_becruily.ckpt": "config_mel_band_roformer_vocals_becruily.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental by becruily": {
+            "mel_band_roformer_instrumental_becruily.ckpt": "config_mel_band_roformer_instrumental_becruily.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals Fullness by Aname": {
+            "mel_band_roformer_vocal_fullness_aname.ckpt": "config_mel_band_roformer_vocal_fullness_aname.yaml"
+        },
+        "Roformer Model: BS Roformer | Vocals by Gabox": {
+            "bs_roformer_vocals_gabox.ckpt": "config_bs_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals by Gabox": {
+            "mel_band_roformer_vocals_gabox.ckpt": "config_mel_band_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals FV1 by Gabox": {
+            "mel_band_roformer_vocals_fv1_gabox.ckpt": "config_mel_band_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals FV2 by Gabox": {
+            "mel_band_roformer_vocals_fv2_gabox.ckpt": "config_mel_band_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals FV3 by Gabox": {
+            "mel_band_roformer_vocals_fv3_gabox.ckpt": "config_mel_band_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Vocals FV4 by Gabox": {
+            "mel_band_roformer_vocals_fv4_gabox.ckpt": "config_mel_band_roformer_vocals_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental by Gabox": {
+            "mel_band_roformer_instrumental_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental 2 by Gabox": {
+            "mel_band_roformer_instrumental_2_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental 3 by Gabox": {
+            "mel_band_roformer_instrumental_3_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Bleedless V1 by Gabox": {
+            "mel_band_roformer_instrumental_bleedless_v1_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Bleedless V2 by Gabox": {
+            "mel_band_roformer_instrumental_bleedless_v2_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Bleedless V3 by Gabox": {
+            "mel_band_roformer_instrumental_bleedless_v3_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Fullness V1 by Gabox": {
+            "mel_band_roformer_instrumental_fullness_v1_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Fullness V2 by Gabox": {
+            "mel_band_roformer_instrumental_fullness_v2_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Fullness V3 by Gabox": {
+            "mel_band_roformer_instrumental_fullness_v3_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Instrumental Fullness Noisy V4 by Gabox": {
+            "mel_band_roformer_instrumental_fullness_noise_v4_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV5 by Gabox": {
+            "mel_band_roformer_instrumental_instv5_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV5N by Gabox": {
+            "mel_band_roformer_instrumental_instv5n_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV6 by Gabox": {
+            "mel_band_roformer_instrumental_instv6_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV6N by Gabox": {
+            "mel_band_roformer_instrumental_instv6n_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV7 by Gabox": {
+            "mel_band_roformer_instrumental_instv7_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV7N by Gabox": {
+            "mel_band_roformer_instrumental_instv7n_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV8 by Gabox": {
+            "mel_band_roformer_instrumental_instv8_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | INSTV8N by Gabox": {
+            "mel_band_roformer_instrumental_instv8n_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | FVX by Gabox": {
+            "mel_band_roformer_instrumental_fvx_gabox.ckpt": "config_mel_band_roformer_instrumental_gabox.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb by anvuew": {
+            "dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt": "dereverb_mel_band_roformer_anvuew.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb Less Aggressive by anvuew": {
+            "dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt": "dereverb_mel_band_roformer_anvuew.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb Mono by anvuew": {
+            "dereverb_mel_band_roformer_mono_anvuew.ckpt": "dereverb_mel_band_roformer_anvuew.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb Big by Sucial": {
+            "dereverb_big_mbr_ep_362.ckpt": "config_dereverb_echo_mel_band_roformer_v2.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb Super Big by Sucial": {
+            "dereverb_super_big_mbr_ep_346.ckpt": "config_dereverb_echo_mel_band_roformer_v2.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb-Echo by Sucial": {
+            "dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt": "config_dereverb-echo_mel_band_roformer.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb-Echo V2 by Sucial": {
+            "dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt": "config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml"
+        },
+        "Roformer Model: MelBand Roformer | De-Reverb-Echo Fused by Sucial": {
+            "dereverb_echo_mbr_fused.ckpt": "config_dereverb_echo_mel_band_roformer_v2.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | SYHFT by SYH99999": {
+            "MelBandRoformerSYHFT.ckpt": "config_vocals_mel_band_roformer_ft.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | SYHFT V2 by SYH99999": {
+            "MelBandRoformerSYHFTV2.ckpt": "config_vocals_mel_band_roformer_ft.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | SYHFT V2.5 by SYH99999": {
+            "MelBandRoformerSYHFTV2.5.ckpt": "config_vocals_mel_band_roformer_ft.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | SYHFT V3 by SYH99999": {
+            "MelBandRoformerSYHFTV3Epsilon.ckpt": "config_vocals_mel_band_roformer_ft.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Big SYHFT V1 by SYH99999": {
+            "MelBandRoformerBigSYHFTV1.ckpt": "config_vocals_mel_band_roformer_big_v1_ft.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Big Beta 4 FT by unwa": {
+            "melband_roformer_big_beta4.ckpt": "config_melbandroformer_big_beta4.yaml"
+        },
+        "Roformer Model: MelBand Roformer Kim | Big Beta 5e FT by unwa": {
+            "melband_roformer_big_beta5e.ckpt": "config_melband_roformer_big_beta5e.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Big Beta 6 by unwa": {
+            "melband_roformer_big_beta6.ckpt": "config_melbandroformer_big_beta6.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Big Beta 6X by unwa": {
+            "melband_roformer_big_beta6x.ckpt": "config_melbandroformer_big_beta6x.yaml"
+        },
+        "Roformer Model: BS Roformer | Chorus Male-Female by Sucial": {
+            "model_chorus_bs_roformer_ep_267_sdr_24.1275.ckpt": "config_chorus_male_female_bs_roformer.yaml"
+        },
+        "Roformer Model: BS Roformer | Male-Female by aufr33": {
+            "bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt": "config_chorus_male_female_bs_roformer.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Aspiration by Sucial": {
+            "aspiration_mel_band_roformer_sdr_18.9845.ckpt": "config_aspiration_mel_band_roformer.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Aspiration Less Aggressive by Sucial": {
+            "aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt": "config_aspiration_mel_band_roformer.yaml"
+        },
+        "Roformer Model: MelBand Roformer | Bleed Suppressor V1 by unwa-97chris": {
+            "mel_band_roformer_bleed_suppressor_v1.ckpt": "config_mel_band_roformer_bleed_suppressor_v1.yaml"
+        }
+    }
+}
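The structure above maps friendly model names to downloadable files: VR and MDX entries map directly to a single weights file, while MDX23C and Roformer entries map to a dict pairing a checkpoint with its YAML config. A small sketch of flattening this into a name-to-files index (the helper name is illustrative, not part of the package):

import json

def build_model_index(models_json_path="audio_separator/models.json"):
    # Flatten every download list into {friendly name: [file names]}.
    with open(models_json_path, encoding="utf-8") as f:
        download_lists = json.load(f)
    index = {}
    for architecture, models in download_lists.items():
        for friendly_name, files in models.items():
            if isinstance(files, str):
                index[friendly_name] = [files]  # single weights file
            else:
                # checkpoint file(s) followed by paired YAML config(s)
                index[friendly_name] = [*files.keys(), *files.values()]
    return index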
audio_separator/separator/__init__.py
ADDED
@@ -0,0 +1 @@
+from .separator import Separator
audio_separator/separator/architectures/__init__.py
ADDED
File without changes
audio_separator/separator/architectures/demucs_separator.py
ADDED
@@ -0,0 +1,195 @@
+import os
+import sys
+from pathlib import Path
+import torch
+import numpy as np
+from audio_separator.separator.common_separator import CommonSeparator
+from audio_separator.separator.uvr_lib_v5.demucs.apply import apply_model, demucs_segments
+from audio_separator.separator.uvr_lib_v5.demucs.hdemucs import HDemucs
+from audio_separator.separator.uvr_lib_v5.demucs.pretrained import get_model as get_demucs_model
+from audio_separator.separator.uvr_lib_v5 import spec_utils
+
+DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"]
+
+DEMUCS_2_SOURCE_MAPPER = {CommonSeparator.INST_STEM: 0, CommonSeparator.VOCAL_STEM: 1}
+DEMUCS_4_SOURCE_MAPPER = {CommonSeparator.BASS_STEM: 0, CommonSeparator.DRUM_STEM: 1, CommonSeparator.OTHER_STEM: 2, CommonSeparator.VOCAL_STEM: 3}
+DEMUCS_6_SOURCE_MAPPER = {
+    CommonSeparator.BASS_STEM: 0,
+    CommonSeparator.DRUM_STEM: 1,
+    CommonSeparator.OTHER_STEM: 2,
+    CommonSeparator.VOCAL_STEM: 3,
+    CommonSeparator.GUITAR_STEM: 4,
+    CommonSeparator.PIANO_STEM: 5,
+}
+
+
+class DemucsSeparator(CommonSeparator):
+    """
+    DemucsSeparator is responsible for separating audio sources using Demucs models.
+    It initializes with configuration parameters and prepares the model for separation tasks.
+    """
+
+    def __init__(self, common_config, arch_config):
+        # Any configuration values which can be shared between architectures should be set already in CommonSeparator,
+        # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name)
+        super().__init__(config=common_config)
+
+        # Initializing user-configurable parameters, passed through from the CLI or Separator instance
+
+        # Adjust segments to manage RAM or V-RAM usage:
+        # - Smaller sizes consume less resources.
+        # - Bigger sizes consume more resources, but may provide better results.
+        # - "Default" picks the optimal size.
+        # DEMUCS_SEGMENTS = (DEF_OPT, '1', '5', '10', '15', '20',
+        #                    '25', '30', '35', '40', '45', '50',
+        #                    '55', '60', '65', '70', '75', '80',
+        #                    '85', '90', '95', '100')
+        self.segment_size = arch_config.get("segment_size", "Default")
+
+        # Performs multiple predictions with random shifts of the input and averages them.
+        # The higher the number of shifts, the longer the prediction will take.
+        # Not recommended unless you have a GPU.
+        # DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5,
+        #                  6, 7, 8, 9, 10, 11,
+        #                  12, 13, 14, 15, 16, 17,
+        #                  18, 19, 20)
+        self.shifts = arch_config.get("shifts", 2)
+
+        # This option controls the amount of overlap between prediction windows.
+        # - Higher values can provide better results, but will lead to longer processing times.
+        # - You can choose between 0.001-0.999
+        # DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99)
+        self.overlap = arch_config.get("overlap", 0.25)
+
+        # Enables "Segments". Deselecting this option is only recommended for those with powerful PCs.
+        self.segments_enabled = arch_config.get("segments_enabled", True)
+
+        self.logger.debug(f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}")
+        self.logger.debug(f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}")
+
+        self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER
+
+        self.audio_file_path = None
+        self.audio_file_base = None
+        self.demucs_model_instance = None
+
+        # Add uvr_lib_v5 folder to system path so pytorch serialization can find the demucs module
+        current_dir = os.path.dirname(__file__)
+        uvr_lib_v5_path = os.path.join(current_dir, "..", "uvr_lib_v5")
+        sys.path.insert(0, uvr_lib_v5_path)
+
+        self.logger.info("Demucs Separator initialisation complete")
+
+    def separate(self, audio_file_path, custom_output_names=None):
+        """
+        Separates the audio file into its component stems using the Demucs model.
+
+        Args:
+            audio_file_path (str): The path to the audio file to be processed.
+            custom_output_names (dict, optional): Custom names for the output files. Defaults to None.
+
+        Returns:
+            list: A list of paths to the output files generated by the separation process.
+        """
+        self.logger.debug("Starting separation process...")
+        source = None
+        stem_source = None
+        inst_source = {}
+
+        self.audio_file_path = audio_file_path
+        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
+
+        # Prepare the mix for processing
+        self.logger.debug("Preparing mix...")
+        mix = self.prepare_mix(self.audio_file_path)
+
+        self.logger.debug(f"Mix prepared for demixing. Shape: {mix.shape}")
+
+        self.logger.debug("Loading model for demixing...")
+
+        self.demucs_model_instance = HDemucs(sources=DEMUCS_4_SOURCE)
+        self.demucs_model_instance = get_demucs_model(name=os.path.splitext(os.path.basename(self.model_path))[0], repo=Path(os.path.dirname(self.model_path)))
+        self.demucs_model_instance = demucs_segments(self.segment_size, self.demucs_model_instance)
+        self.demucs_model_instance.to(self.torch_device)
+        self.demucs_model_instance.eval()
+
+        self.logger.debug("Model loaded and set to evaluation mode.")
+
+        source = self.demix_demucs(mix)
+
+        del self.demucs_model_instance
+        self.clear_gpu_cache()
+        self.logger.debug("Model and GPU cache cleared after demixing.")
+
+        output_files = []
+        self.logger.debug("Processing output files...")
+
+        if isinstance(inst_source, np.ndarray):
+            self.logger.debug("Processing instance source...")
+            source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]])
+            inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = source_reshape
+            source = inst_source
+
+        if isinstance(source, np.ndarray):
+            source_length = len(source)
+            self.logger.debug(f"Processing source array, source length is {source_length}")
+            match source_length:
+                case 2:
+                    self.logger.debug("Setting source map to 2-stem...")
+                    self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER
+                case 6:
+                    self.logger.debug("Setting source map to 6-stem...")
+                    self.demucs_source_map = DEMUCS_6_SOURCE_MAPPER
+                case _:
+                    self.logger.debug("Setting source map to 4-stem...")
+                    self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER
+
+        self.logger.debug("Processing for all stems...")
+        for stem_name, stem_value in self.demucs_source_map.items():
+            if self.output_single_stem is not None:
+                if stem_name.lower() != self.output_single_stem.lower():
+                    self.logger.debug(f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}...")
+                    continue
+
+            stem_path = self.get_stem_output_path(stem_name, custom_output_names)
+            stem_source = source[stem_value].T
+
+            self.final_process(stem_path, stem_source, stem_name)
+            output_files.append(stem_path)
+
+        return output_files
+
+    def demix_demucs(self, mix):
+        """
+        Demixes the input mix using the demucs model.
+        """
+        self.logger.debug("Starting demixing process in demix_demucs...")
+
+        processed = {}
+        mix = torch.tensor(mix, dtype=torch.float32)
+        ref = mix.mean(0)
+        mix = (mix - ref.mean()) / ref.std()
+        mix_infer = mix
+
+        with torch.no_grad():
+            self.logger.debug("Running model inference...")
+            sources = apply_model(
+                model=self.demucs_model_instance,
+                mix=mix_infer[None],
+                shifts=self.shifts,
+                split=self.segments_enabled,
+                overlap=self.overlap,
+                static_shifts=1 if self.shifts == 0 else self.shifts,
+                set_progress_bar=None,
+                device=self.torch_device,
+                progress=True,
+            )[0]
+
+        sources = (sources * ref.std() + ref.mean()).cpu().numpy()
+        sources[[0, 1]] = sources[[1, 0]]
+        processed[mix] = sources[:, :, 0:None].copy()
+        sources = list(processed.values())
+        sources = [s[:, :, 0:None] for s in sources]
+        sources = np.concatenate(sources, axis=-1)
+
+        return sources
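DemucsSeparator is not normally instantiated directly; per separator/__init__.py above, the package's public entry point is the Separator class, which constructs the appropriate architecture class for the loaded model. A hedged usage sketch follows; the model filename and loader signature are assumptions based on this snapshot's file list and test fixture names, not verified against the truncated separator.py:

from audio_separator.separator import Separator

# Minimal sketch: separate a file with the 6-stem HTDemucs model referenced
# by the test fixtures in .gitattributes above. load_model()'s exact
# signature is an assumption.
separator = Separator()
separator.load_model(model_filename="htdemucs_6s.yaml")
output_files = separator.separate("tests/inputs/mardy20s.flac")
print(output_files)  # one output path per stem (Vocals, Drums, Bass, ...)

Note the normalization trick in demix_demucs: the mix is standardized by the mono reference's mean and standard deviation before inference, then the model output is de-standardized with the same statistics, so stem levels remain consistent with the input.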
audio_separator/separator/architectures/mdx_separator.py
ADDED
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for separating audio sources using MDX architecture models."""
|
2 |
+
|
3 |
+
import os
|
4 |
+
import platform
|
5 |
+
import torch
|
6 |
+
import onnx
|
7 |
+
import onnxruntime as ort
|
8 |
+
import numpy as np
|
9 |
+
import onnx2torch
|
10 |
+
from tqdm import tqdm
|
11 |
+
from audio_separator.separator.uvr_lib_v5 import spec_utils
|
12 |
+
from audio_separator.separator.uvr_lib_v5.stft import STFT
|
13 |
+
from audio_separator.separator.common_separator import CommonSeparator
|
14 |
+
|
15 |
+
|
16 |
+
class MDXSeparator(CommonSeparator):
|
17 |
+
"""
|
18 |
+
MDXSeparator is responsible for separating audio sources using MDX models.
|
19 |
+
It initializes with configuration parameters and prepares the model for separation tasks.
|
20 |
+
"""
|
21 |
+
|
22 |
+
def __init__(self, common_config, arch_config):
|
23 |
+
# Any configuration values which can be shared between architectures should be set already in CommonSeparator,
|
24 |
+
# e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name)
|
25 |
+
super().__init__(config=common_config)
|
26 |
+
|
27 |
+
# Initializing user-configurable parameters, passed through with an mdx_from the CLI or Separator instance
|
28 |
+
|
29 |
+
# Pick a segment size to balance speed, resource use, and quality:
|
30 |
+
# - Smaller sizes consume less resources.
|
31 |
+
# - Bigger sizes consume more resources, but may provide better results.
|
32 |
+
# - Default size is 256. Quality can change based on your pick.
|
33 |
+
self.segment_size = arch_config.get("segment_size")
|
34 |
+
|
35 |
+
# This option controls the amount of overlap between prediction windows.
|
36 |
+
# - Higher values can provide better results, but will lead to longer processing times.
|
37 |
+
# - For Non-MDX23C models: You can choose between 0.001-0.999
|
38 |
+
self.overlap = arch_config.get("overlap")
|
39 |
+
|
40 |
+
# Number of batches to be processed at a time.
|
41 |
+
# - Higher values mean more RAM usage but slightly faster processing times.
|
42 |
+
# - Lower values mean less RAM usage but slightly longer processing times.
|
43 |
+
# - Batch size value has no effect on output quality.
|
44 |
+
# BATCH_SIZE = ('1', ''2', '3', '4', '5', '6', '7', '8', '9', '10')
|
45 |
+
self.batch_size = arch_config.get("batch_size", 1)
|
46 |
+
|
47 |
+
# hop_length is equivalent to the more commonly used term "stride" in convolutional neural networks
|
48 |
+
# In machine learning, particularly in the context of convolutional neural networks (CNNs),
|
49 |
+
# the term "stride" refers to the number of pixels by which we move the filter across the input image.
|
50 |
+
# Strides are a crucial component in the convolution operation, a fundamental building block of CNNs used primarily in the field of computer vision.
|
51 |
+
# Stride is a parameter that dictates the movement of the kernel, or filter, across the input data, such as an image.
|
52 |
+
# When performing a convolution operation, the stride determines how many units the filter shifts at each step.
|
53 |
+
# The choice of stride affects the model in several ways:
|
54 |
+
# Output Size: A larger stride will result in a smaller output spatial dimension.
|
55 |
+
# Computational Efficiency: Increasing the stride can decrease the computational load.
|
56 |
+
# Field of View: A higher stride means that each step of the filter takes into account a wider area of the input image.
|
57 |
+
# This can be beneficial when the model needs to capture more global features rather than focusing on finer details.
|
58 |
+
self.hop_length = arch_config.get("hop_length")
|
59 |
+
|
60 |
+
# If enabled, model will be run twice to reduce noise in output audio.
|
61 |
+
self.enable_denoise = arch_config.get("enable_denoise")
|
62 |
+
|
63 |
+
self.logger.debug(f"MDX arch params: batch_size={self.batch_size}, segment_size={self.segment_size}")
|
64 |
+
self.logger.debug(f"MDX arch params: overlap={self.overlap}, hop_length={self.hop_length}, enable_denoise={self.enable_denoise}")
|
65 |
+
|
66 |
+
# Initializing model-specific parameters from model_data JSON
|
67 |
+
self.compensate = self.model_data["compensate"]
|
68 |
+
self.dim_f = self.model_data["mdx_dim_f_set"]
|
69 |
+
self.dim_t = 2 ** self.model_data["mdx_dim_t_set"]
|
70 |
+
self.n_fft = self.model_data["mdx_n_fft_scale_set"]
|
71 |
+
self.config_yaml = self.model_data.get("config_yaml", None)
|
72 |
+
|
73 |
+
self.logger.debug(f"MDX arch params: compensate={self.compensate}, dim_f={self.dim_f}, dim_t={self.dim_t}, n_fft={self.n_fft}")
|
74 |
+
self.logger.debug(f"MDX arch params: config_yaml={self.config_yaml}")
|
75 |
+
|
76 |
+
# In UVR, these variables are set but either aren't useful or are better handled in audio-separator.
|
77 |
+
# Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator.
|
78 |
+
|
79 |
+
# "chunks" is not actually used for anything in UVR...
|
80 |
+
# self.chunks = 0
|
81 |
+
|
82 |
+
# "adjust" is hard-coded to 1 in UVR, and only used as a multiplier in run_model, so it does nothing.
|
83 |
+
# self.adjust = 1
|
84 |
+
|
85 |
+
# "hop" is hard-coded to 1024 in UVR. We have a "hop_length" parameter instead
|
86 |
+
# self.hop = 1024
|
87 |
+
|
88 |
+
# "margin" maps to sample rate and is set from the GUI in UVR (default: 44100). We have a "sample_rate" parameter instead.
|
89 |
+
# self.margin = 44100
|
90 |
+
|
91 |
+
# "dim_c" is hard-coded to 4 in UVR, seems to be a parameter for the number of channels, and is only used for checkpoint models.
|
92 |
+
# We haven't implemented support for the checkpoint models here, so we're not using it.
|
93 |
+
# self.dim_c = 4
|
94 |
+
|
95 |
+
self.load_model()
|
96 |
+
|
97 |
+
self.n_bins = 0
|
98 |
+
self.trim = 0
|
99 |
+
self.chunk_size = 0
|
100 |
+
self.gen_size = 0
|
101 |
+
self.stft = None
|
102 |
+
|
103 |
+
self.primary_source = None
|
104 |
+
self.secondary_source = None
|
105 |
+
self.audio_file_path = None
|
106 |
+
self.audio_file_base = None
|
107 |
+
|
108 |
+
def load_model(self):
|
109 |
+
"""
|
110 |
+
Load the model into memory from file on disk, initialize it with config from the model data,
|
111 |
+
and prepare for inferencing using hardware accelerated Torch device.
|
112 |
+
"""
|
113 |
+
self.logger.debug("Loading ONNX model for inference...")
|
114 |
+
|
115 |
+
if self.segment_size == self.dim_t:
|
116 |
+
ort_session_options = ort.SessionOptions()
|
117 |
+
if self.log_level > 10:
|
118 |
+
ort_session_options.log_severity_level = 3
|
119 |
+
else:
|
120 |
+
ort_session_options.log_severity_level = 0
|
121 |
+
|
122 |
+
ort_inference_session = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider, sess_options=ort_session_options)
|
123 |
+
self.model_run = lambda spek: ort_inference_session.run(None, {"input": spek.cpu().numpy()})[0]
|
124 |
+
self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.")
|
125 |
+
else:
|
126 |
+
if platform.system() == 'Windows':
|
127 |
+
onnx_model = onnx.load(self.model_path)
|
128 |
+
self.model_run = onnx2torch.convert(onnx_model)
|
129 |
+
else:
|
130 |
+
self.model_run = onnx2torch.convert(self.model_path)
|
131 |
+
|
132 |
+
self.model_run.to(self.torch_device).eval()
|
133 |
+
self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.")
|
134 |
+
|
135 |
+
def separate(self, audio_file_path, custom_output_names=None):
|
136 |
+
"""
|
137 |
+
Separates the audio file into primary and secondary sources based on the model's configuration.
|
138 |
+
It processes the mix, demixes it into sources, normalizes the sources, and saves the output files.
|
139 |
+
|
140 |
+
Args:
|
141 |
+
audio_file_path (str): The path to the audio file to be processed.
|
142 |
+
custom_output_names (dict, optional): Custom names for the output files. Defaults to None.
|
143 |
+
|
144 |
+
Returns:
|
145 |
+
list: A list of paths to the output files generated by the separation process.
|
146 |
+
"""
|
147 |
+
self.audio_file_path = audio_file_path
|
148 |
+
self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
|
149 |
+
|
150 |
+
# Prepare the mix for processing
|
151 |
+
self.logger.debug(f"Preparing mix for input audio file {self.audio_file_path}...")
|
152 |
+
mix = self.prepare_mix(self.audio_file_path)
|
153 |
+
|
154 |
+
self.logger.debug("Normalizing mix before demixing...")
|
155 |
+
mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
|
156 |
+
|
157 |
+
# Start the demixing process
|
158 |
+
source = self.demix(mix)
|
159 |
+
self.logger.debug("Demixing completed.")
|
160 |
+
|
161 |
+
# In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet
|
162 |
+
|
163 |
+
# Initialize the list for output files
|
164 |
+
output_files = []
|
165 |
+
self.logger.debug("Processing output files...")
|
166 |
+
|
167 |
+
# Normalize and transpose the primary source if it's not already an array
|
168 |
+
if not isinstance(self.primary_source, np.ndarray):
|
169 |
+
self.logger.debug("Normalizing primary source...")
|
170 |
+
self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold).T
|
171 |
+
|
172 |
+
# Process the secondary source if not already an array
|
173 |
+
if not isinstance(self.secondary_source, np.ndarray):
|
174 |
+
self.logger.debug("Producing secondary source: demixing in match_mix mode")
|
175 |
+
raw_mix = self.demix(mix, is_match_mix=True)
|
176 |
+
|
177 |
+
if self.invert_using_spec:
|
178 |
+
self.logger.debug("Inverting secondary stem using spectogram as invert_using_spec is set to True")
|
179 |
+
self.secondary_source = spec_utils.invert_stem(raw_mix, source)
|
180 |
+
else:
|
181 |
+
self.logger.debug("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix")
|
182 |
+
self.secondary_source = mix.T - source.T
|
183 |
+
|
184 |
+
# Save and process the secondary stem if needed
|
185 |
+
if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
|
186 |
+
self.secondary_stem_output_path = self.get_stem_output_path(self.secondary_stem_name, custom_output_names)
|
187 |
+
|
188 |
+
self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
|
189 |
+
self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
|
190 |
+
output_files.append(self.secondary_stem_output_path)
|
191 |
+
|
192 |
+
# Save and process the primary stem if needed
|
193 |
+
if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
|
194 |
+
self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)
|
195 |
+
|
196 |
+
if not isinstance(self.primary_source, np.ndarray):
|
197 |
+
self.primary_source = source.T
|
198 |
+
|
199 |
+
self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
|
200 |
+
self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
|
201 |
+
output_files.append(self.primary_stem_output_path)
|
202 |
+
|
203 |
+
# Not yet implemented from UVR features:
|
204 |
+
# self.process_vocal_split_chain(secondary_sources)
|
205 |
+
# self.logger.debug("Vocal split chain processed.")
|
206 |
+
|
207 |
+
return output_files
|
208 |
+
|
209 |
+
def initialize_model_settings(self):
|
210 |
+
"""
|
211 |
+
This function sets up the necessary parameters for the model, like the number of frequency bins (n_bins), the trimming size (trim),
|
212 |
+
the size of each audio chunk (chunk_size), and the window function for spectral transformations (window).
|
213 |
+
It ensures that the model is configured with the correct settings for processing the audio data.
|
214 |
+
"""
|
215 |
+
self.logger.debug("Initializing model settings...")
|
216 |
+
|
217 |
+
# n_bins is half the FFT size plus one (self.n_fft // 2 + 1).
|
218 |
+
self.n_bins = self.n_fft // 2 + 1
|
219 |
+
|
220 |
+
# trim is half the FFT size (self.n_fft // 2).
|
221 |
+
self.trim = self.n_fft // 2
|
222 |
+
|
223 |
+
# chunk_size is the hop_length size times the segment size minus one
|
224 |
+
self.chunk_size = self.hop_length * (self.segment_size - 1)
|
225 |
+
|
226 |
+
# gen_size is the chunk size minus twice the trim size
|
227 |
+
self.gen_size = self.chunk_size - 2 * self.trim
|
228 |
+
|
229 |
+
self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.torch_device)
|
230 |
+
|
231 |
+
self.logger.debug(f"Model input params: n_fft={self.n_fft} hop_length={self.hop_length} dim_f={self.dim_f}")
|
232 |
+
self.logger.debug(f"Model settings: n_bins={self.n_bins}, trim={self.trim}, chunk_size={self.chunk_size}, gen_size={self.gen_size}")
|
233 |
+
|
234 |
+
def initialize_mix(self, mix, is_ckpt=False):
|
235 |
+
"""
|
236 |
+
After prepare_mix segments the audio, initialize_mix further processes each segment.
|
237 |
+
It ensures each audio segment is in the correct format for the model, applies necessary padding,
|
238 |
+
and converts the segments into tensors for processing with the model.
|
239 |
+
This step is essential for preparing the audio data in a format that the neural network can process.
|
240 |
+
"""
|
241 |
+
# Log the initialization of the mix and whether checkpoint mode is used
|
242 |
+
self.logger.debug(f"Initializing mix with is_ckpt={is_ckpt}. Initial mix shape: {mix.shape}")
|
243 |
+
|
244 |
+
# Ensure the mix is a 2-channel (stereo) audio signal
|
245 |
+
if mix.shape[0] != 2:
|
246 |
+
error_message = f"Expected a 2-channel audio signal, but got {mix.shape[0]} channels"
|
247 |
+
self.logger.error(error_message)
|
248 |
+
raise ValueError(error_message)
|
249 |
+
|
250 |
+
# If in checkpoint mode, process the mix differently
|
251 |
+
if is_ckpt:
|
252 |
+
self.logger.debug("Processing in checkpoint mode...")
|
253 |
+
# Calculate padding based on the generation size and trim
|
254 |
+
pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size)
|
255 |
+
self.logger.debug(f"Padding calculated: {pad}")
|
256 |
+
# Add padding at the beginning and the end of the mix
|
257 |
+
mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1)
|
258 |
+
# Determine the number of chunks based on the mixture's length
|
259 |
+
num_chunks = mixture.shape[-1] // self.gen_size
|
260 |
+
self.logger.debug(f"Mixture shape after padding: {mixture.shape}, Number of chunks: {num_chunks}")
|
261 |
+
# Split the mixture into chunks
|
262 |
+
mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)]
|
263 |
+
else:
|
264 |
+
# If not in checkpoint mode, process normally
|
265 |
+
self.logger.debug("Processing in non-checkpoint mode...")
|
266 |
+
mix_waves = []
|
267 |
+
n_sample = mix.shape[1]
|
268 |
+
# Calculate necessary padding to make the total length divisible by the generation size
|
269 |
+
pad = self.gen_size - n_sample % self.gen_size
|
270 |
+
self.logger.debug(f"Number of samples: {n_sample}, Padding calculated: {pad}")
|
271 |
+
# Apply padding to the mix
|
272 |
+
mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1)
|
273 |
+
self.logger.debug(f"Shape of mix after padding: {mix_p.shape}")
|
274 |
+
|
275 |
+
# Process the mix in chunks
|
276 |
+
i = 0
|
277 |
+
while i < n_sample + pad:
|
278 |
+
waves = np.array(mix_p[:, i : i + self.chunk_size])
|
279 |
+
mix_waves.append(waves)
|
280 |
+
self.logger.debug(f"Processed chunk {len(mix_waves)}: Start {i}, End {i + self.chunk_size}")
|
281 |
+
i += self.gen_size
|
282 |
+
|
283 |
+
# Convert the list of wave chunks into a tensor for processing on the specified device
|
284 |
+
mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.torch_device)
|
285 |
+
self.logger.debug(f"Converted mix_waves to tensor. Tensor shape: {mix_waves_tensor.shape}")
|
286 |
+
|
287 |
+
return mix_waves_tensor, pad
|
288 |
+
|
289 |
+
def demix(self, mix, is_match_mix=False):
|
290 |
+
"""
|
291 |
+
Demixes the input mix into its constituent sources. If is_match_mix is True, the function adjusts the processing
|
292 |
+
to better match the mix, affecting chunk sizes and overlaps. The demixing process involves padding the mix,
|
293 |
+
processing it in chunks, applying windowing for overlaps, and accumulating the results to separate the sources.
|
294 |
+
"""
|
295 |
+
self.logger.debug(f"Starting demixing process with is_match_mix: {is_match_mix}...")
|
296 |
+
self.initialize_model_settings()
|
297 |
+
|
298 |
+
# Preserves the original mix for later use.
|
299 |
+
# In UVR, this is used for the pitch fix and VR denoise processes, which aren't yet implemented here.
|
300 |
+
org_mix = mix
|
301 |
+
self.logger.debug(f"Original mix stored. Shape: {org_mix.shape}")
|
302 |
+
|
303 |
+
# Initializes a list to store the separated waveforms.
|
304 |
+
tar_waves_ = []
|
305 |
+
|
306 |
+
# Handling different chunk sizes and overlaps based on the matching requirement.
|
307 |
+
if is_match_mix:
|
308 |
+
# Sets a smaller chunk size specifically for matching the mix.
|
309 |
+
chunk_size = self.hop_length * (self.segment_size - 1)
|
310 |
+
# Sets a small overlap for the chunks.
|
311 |
+
overlap = 0.02
|
312 |
+
self.logger.debug(f"Chunk size for matching mix: {chunk_size}, Overlap: {overlap}")
|
313 |
+
else:
|
314 |
+
# Uses the regular chunk size defined in model settings.
|
315 |
+
chunk_size = self.chunk_size
|
316 |
+
# Uses the overlap specified in the model settings.
|
317 |
+
overlap = self.overlap
|
318 |
+
self.logger.debug(f"Standard chunk size: {chunk_size}, Overlap: {overlap}")
|
319 |
+
|
320 |
+
# Calculates the generated size after subtracting the trim from both ends of the chunk.
|
321 |
+
gen_size = chunk_size - 2 * self.trim
|
322 |
+
self.logger.debug(f"Generated size calculated: {gen_size}")
|
323 |
+
|
324 |
+
# Calculates padding to make the mix length a multiple of the generated size.
|
325 |
+
pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size)
|
326 |
+
# Prepares the mixture with padding at the beginning and the end.
|
327 |
+
mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1)
|
328 |
+
self.logger.debug(f"Mixture prepared with padding. Mixture shape: {mixture.shape}")
|
329 |
+
|
330 |
+
# Calculates the step size for processing chunks based on the overlap.
|
331 |
+
step = int((1 - overlap) * chunk_size)
|
332 |
+
self.logger.debug(f"Step size for processing chunks: {step} as overlap is set to {overlap}.")
|
333 |
+
|
334 |
+
# Initializes arrays to store the results and to account for overlap.
|
335 |
+
result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32)
|
336 |
+
divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32)
|
337 |
+
|
338 |
+
# Initializes counters for processing chunks.
|
339 |
+
total = 0
|
340 |
+
total_chunks = (mixture.shape[-1] + step - 1) // step
|
341 |
+
self.logger.debug(f"Total chunks to process: {total_chunks}")
|
342 |
+
|
343 |
+
# Processes each chunk of the mixture.
|
344 |
+
for i in tqdm(range(0, mixture.shape[-1], step)):
|
345 |
+
total += 1
|
346 |
+
start = i
|
347 |
+
end = min(i + chunk_size, mixture.shape[-1])
|
348 |
+
self.logger.debug(f"Processing chunk {total}/{total_chunks}: Start {start}, End {end}")
|
349 |
+
|
350 |
+
# Handles windowing for overlapping chunks.
|
351 |
+
chunk_size_actual = end - start
|
352 |
+
window = None
|
353 |
+
if overlap != 0:
|
354 |
+
window = np.hanning(chunk_size_actual)
|
355 |
+
window = np.tile(window[None, None, :], (1, 2, 1))
|
356 |
+
self.logger.debug("Window applied to the chunk.")
|
357 |
+
|
358 |
+
# Zero-pad the chunk to prepare it for processing.
|
359 |
+
mix_part_ = mixture[:, start:end]
|
360 |
+
if end != i + chunk_size:
|
361 |
+
pad_size = (i + chunk_size) - end
|
362 |
+
mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1)
|
363 |
+
|
364 |
+
# Converts the chunk to a tensor for processing.
|
365 |
+
mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.torch_device)
|
366 |
+
# Splits the chunk into smaller batches if necessary.
|
367 |
+
mix_waves = mix_part.split(self.batch_size)
|
368 |
+
total_batches = len(mix_waves)
|
369 |
+
self.logger.debug(f"Mix part split into batches. Number of batches: {total_batches}")
|
370 |
+
|
371 |
+
with torch.no_grad():
|
372 |
+
# Processes each batch in the chunk.
|
373 |
+
batches_processed = 0
|
374 |
+
for mix_wave in mix_waves:
|
375 |
+
batches_processed += 1
|
376 |
+
self.logger.debug(f"Processing mix_wave batch {batches_processed}/{total_batches}")
|
377 |
+
|
378 |
+
# Runs the model to separate the sources.
|
379 |
+
tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix)
|
380 |
+
|
381 |
+
# Applies windowing if needed and accumulates the results.
|
382 |
+
if window is not None:
|
383 |
+
tar_waves[..., :chunk_size_actual] *= window
|
384 |
+
divider[..., start:end] += window
|
385 |
+
else:
|
386 |
+
divider[..., start:end] += 1
|
387 |
+
|
388 |
+
result[..., start:end] += tar_waves[..., : end - start]
|
389 |
+
|
390 |
+
# Normalizes the results by the divider to account for overlap.
|
391 |
+
self.logger.debug("Normalizing result by dividing result by divider.")
|
392 |
+
tar_waves = result / divider
|
393 |
+
tar_waves_.append(tar_waves)
|
394 |
+
|
395 |
+
# Reshapes the results to match the original dimensions.
|
396 |
+
tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim : -self.trim]
|
397 |
+
tar_waves = np.concatenate(tar_waves_, axis=-1)[:, : mix.shape[-1]]
|
398 |
+
|
399 |
+
# Extracts the source from the results.
|
400 |
+
source = tar_waves[:, 0:None]
|
401 |
+
self.logger.debug(f"Concatenated tar_waves. Shape: {tar_waves.shape}")
|
402 |
+
|
403 |
+
# TODO: In UVR, pitch changing happens here. Consider implementing this as a feature.
|
404 |
+
|
405 |
+
# Compensates the source if not matching the mix.
|
406 |
+
if not is_match_mix:
|
407 |
+
source *= self.compensate
|
408 |
+
self.logger.debug("Match mix mode; compensate multiplier applied.")
|
409 |
+
|
410 |
+
# TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature.
|
411 |
+
|
412 |
+
self.logger.debug("Demixing process completed.")
|
413 |
+
return source
|
414 |
+
|
415 |
+
    def run_model(self, mix, is_match_mix=False):
        """
        Processes the input mix through the model to separate the sources.
        Applies the STFT, handles spectrum modifications, and runs the model for source separation.
        """
        # Apply the STFT to the mix. The mix is moved to the specified device (e.g. GPU) before processing.
        # self.logger.debug(f"Running STFT on the mix. Mix shape before STFT: {mix.shape}")
        spek = self.stft(mix.to(self.torch_device))
        self.logger.debug(f"STFT applied on mix. Spectrum shape: {spek.shape}")

        # Zero out the first 3 bins of the spectrum. This is often done to reduce low-frequency noise.
        spek[:, :, :3, :] *= 0
        # self.logger.debug("First 3 bins of the spectrum zeroed out.")

        # Handle the case where the mix needs to be matched (is_match_mix = True)
        if is_match_mix:
            # self.logger.debug("Match mix mode is enabled. Converting spectrum to NumPy array.")
            spec_pred = spek.cpu().numpy()
            self.logger.debug("is_match_mix: spectrum prediction obtained directly from STFT output.")
        else:
            # If denoising is enabled, the model is run on both the negated and the original spectrum.
            if self.enable_denoise:
                spec_pred_neg = self.model_run(-spek)  # Prediction on the negated spectrum
                spec_pred_pos = self.model_run(spek)   # Prediction on the original spectrum
                # Averaging the two (with the sign of the negated prediction flipped back) cancels
                # additive model noise that does not change sign with the input.
                spec_pred = (spec_pred_neg * -0.5) + (spec_pred_pos * 0.5)
                self.logger.debug("Model run on both negative and positive spectrums for denoising.")
            else:
                spec_pred = self.model_run(spek)
                self.logger.debug("Model run on the spectrum without denoising.")

        # Apply the inverse STFT to convert the spectrum back to the time domain.
        result = self.stft.inverse(torch.tensor(spec_pred).to(self.torch_device)).cpu().detach().numpy()
        self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}")

        return result
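The enable_denoise branch above relies on a sign-flip trick: a separation network is roughly sign-equivariant for the signal it extracts, so averaging its prediction on the spectrum with the sign-flipped prediction on the negated spectrum keeps the signal while cancelling additive artifacts. A minimal standalone sketch of that combination, where model stands in for any hypothetical prediction callable:

    import numpy as np

    def denoised_prediction(model, spek: np.ndarray) -> np.ndarray:
        # Signal content flips sign with the input and survives the average;
        # additive artifacts that do not flip sign cancel out.
        pred_neg = model(-spek)
        pred_pos = model(spek)
        return (pred_neg * -0.5) + (pred_pos * 0.5)

    # With a perfectly sign-equivariant "model", the output equals the plain prediction:
    spek = np.random.randn(4, 4)
    assert np.allclose(denoised_prediction(lambda s: 0.5 * s, spek), 0.5 * spek)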
audio_separator/separator/architectures/mdxc_separator.py
ADDED
@@ -0,0 +1,423 @@
import os
import sys

import torch
import numpy as np
from tqdm import tqdm
from ml_collections import ConfigDict
from scipy import signal

from audio_separator.separator.common_separator import CommonSeparator
from audio_separator.separator.uvr_lib_v5 import spec_utils
from audio_separator.separator.uvr_lib_v5.tfc_tdf_v3 import TFC_TDF_net
from audio_separator.separator.uvr_lib_v5.roformer.mel_band_roformer import MelBandRoformer
from audio_separator.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer


class MDXCSeparator(CommonSeparator):
    """
    MDXCSeparator is responsible for separating audio sources using MDXC models.
    It initializes with configuration parameters and prepares the model for separation tasks.
    """

    def __init__(self, common_config, arch_config):
        # Any configuration values which can be shared between architectures should be set already in CommonSeparator,
        # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name)
        super().__init__(config=common_config)

        # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model.
        # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method.
        # The instance variable self.model_data is passed through from Separator and set in CommonSeparator.
        self.logger.debug(f"Model data: {self.model_data}")

        # Arch config contains the MDXC architecture-specific user configuration options, which should all be configurable
        # by the user, either through their Separator class instantiation or by passing in a CLI parameter.
        # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately
        # configured this way as they have architecture-specific default values.
        self.segment_size = arch_config.get("segment_size", 256)

        # Whether to use the configured segment size or the model's default.
        # The default segment size is taken from the chosen model's associated config file (yaml).
        self.override_model_segment_size = arch_config.get("override_model_segment_size", False)

        self.overlap = arch_config.get("overlap", 8)
        self.batch_size = arch_config.get("batch_size", 1)

        # Amount of pitch shift to apply during processing (this does NOT affect the pitch of the output audio):
        # - Whole numbers indicate semitones.
        # - Using higher pitches may cut the upper bandwidth, even in high-quality models.
        # - Upping the pitch can be better for tracks with deeper vocals.
        # - Dropping the pitch may take more processing time but works well for tracks with high-pitched vocals.
        self.pitch_shift = arch_config.get("pitch_shift", 0)

        self.process_all_stems = arch_config.get("process_all_stems", True)

        self.logger.debug(f"MDXC arch params: batch_size={self.batch_size}, segment_size={self.segment_size}, overlap={self.overlap}")
        self.logger.debug(f"MDXC arch params: override_model_segment_size={self.override_model_segment_size}, pitch_shift={self.pitch_shift}")
        self.logger.debug(f"MDXC multi-stem params: process_all_stems={self.process_all_stems}")

        self.is_roformer = "is_roformer" in self.model_data

        self.load_model()

        self.primary_source = None
        self.secondary_source = None
        self.audio_file_path = None
        self.audio_file_base = None

        self.is_primary_stem_main_target = False
        if self.model_data_cfgdict.training.target_instrument == "Vocals" or len(self.model_data_cfgdict.training.instruments) > 1:
            self.is_primary_stem_main_target = True

        self.logger.debug(f"is_primary_stem_main_target: {self.is_primary_stem_main_target}")

        self.logger.info("MDXC Separator initialisation complete")

    def load_model(self):
        """
        Load the model into memory from file on disk, initialize it with config from the model data,
        and prepare for inferencing using a hardware-accelerated Torch device.
        """
        self.logger.debug("Loading checkpoint model for inference...")

        self.model_data_cfgdict = ConfigDict(self.model_data)

        try:
            if self.is_roformer:
                self.logger.debug("Loading Roformer model...")

                # Determine the model type based on the configuration and instantiate it
                if "num_bands" in self.model_data_cfgdict.model:
                    self.logger.debug("Loading MelBandRoformer model...")
                    model = MelBandRoformer(**self.model_data_cfgdict.model)
                elif "freqs_per_bands" in self.model_data_cfgdict.model:
                    self.logger.debug("Loading BSRoformer model...")
                    model = BSRoformer(**self.model_data_cfgdict.model)
                else:
                    raise ValueError("Unknown Roformer model type in the configuration.")

                # Load the model checkpoint
                checkpoint = torch.load(self.model_path, map_location="cpu", weights_only=True)
                self.model_run = model if not isinstance(model, torch.nn.DataParallel) else model.module
                self.model_run.load_state_dict(checkpoint)
                self.model_run.to(self.torch_device).eval()

            else:
                self.logger.debug("Loading TFC_TDF_net model...")
                self.model_run = TFC_TDF_net(self.model_data_cfgdict, device=self.torch_device)
                self.logger.debug("Loading model onto CPU")
                # For some reason loading the state onto a hardware-accelerated device causes issues,
                # so we load it onto CPU first and then move it to the device
                self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
                self.model_run.to(self.torch_device).eval()

        except RuntimeError as e:
            self.logger.error(f"Error: {e}")
            self.logger.error("An error occurred while loading the model file. This often occurs when the model file is corrupt or incomplete.")
            self.logger.error(f"Please try deleting the model file from {self.model_path} and run audio-separator again to re-download it.")
            sys.exit(1)

    def separate(self, audio_file_path, custom_output_names=None):
        """
        Separates the audio file into primary and secondary sources based on the model's configuration.
        It processes the mix, demixes it into sources, normalizes the sources, and saves the output files.

        Args:
            audio_file_path (str): The path to the audio file to be processed.
            custom_output_names (dict, optional): Custom names for the output files. Defaults to None.

        Returns:
            list: A list of paths to the output files generated by the separation process.
        """
        self.primary_source = None
        self.secondary_source = None

        self.audio_file_path = audio_file_path
        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]

        self.logger.debug(f"Preparing mix for input audio file {self.audio_file_path}...")
        mix = self.prepare_mix(self.audio_file_path)

        self.logger.debug("Normalizing mix before demixing...")
        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)

        source = self.demix(mix=mix)
        self.logger.debug("Demixing completed.")

        output_files = []
        self.logger.debug("Processing output files...")

        if isinstance(source, dict):
            self.logger.debug("Source is a dict, processing each stem...")

            stem_list = []
            if self.model_data_cfgdict.training.target_instrument:
                stem_list = [self.model_data_cfgdict.training.target_instrument]
            else:
                stem_list = self.model_data_cfgdict.training.instruments

            self.logger.debug(f"Available stems: {stem_list}")

            is_multi_stem_model = len(stem_list) > 2
            should_process_all_stems = self.process_all_stems and is_multi_stem_model

            if should_process_all_stems:
                self.logger.debug("Processing all stems from multi-stem model...")
                for stem_name in stem_list:
                    stem_output_path = self.get_stem_output_path(stem_name, custom_output_names)
                    stem_source = spec_utils.normalize(
                        wave=source[stem_name],
                        max_peak=self.normalization_threshold,
                        min_peak=self.amplification_threshold
                    ).T

                    self.logger.info(f"Saving {stem_name} stem to {stem_output_path}...")
                    self.final_process(stem_output_path, stem_source, stem_name)
                    output_files.append(stem_output_path)
            else:
                # Standard processing for primary and secondary stems
                if not isinstance(self.primary_source, np.ndarray):
                    self.logger.debug(f"Normalizing primary source for primary stem {self.primary_stem_name}...")
                    self.primary_source = spec_utils.normalize(
                        wave=source[self.primary_stem_name],
                        max_peak=self.normalization_threshold,
                        min_peak=self.amplification_threshold
                    ).T

                if not isinstance(self.secondary_source, np.ndarray):
                    self.logger.debug(f"Normalizing secondary source for secondary stem {self.secondary_stem_name}...")
                    self.secondary_source = spec_utils.normalize(
                        wave=source[self.secondary_stem_name],
                        max_peak=self.normalization_threshold,
                        min_peak=self.amplification_threshold
                    ).T

                if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
                    self.secondary_stem_output_path = self.get_stem_output_path(self.secondary_stem_name, custom_output_names)

                    self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
                    self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
                    output_files.append(self.secondary_stem_output_path)

                if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
                    self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)

                    self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
                    self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
                    output_files.append(self.primary_stem_output_path)

        else:
            # Handle the case where source is not a dictionary (single-source model)
            if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
                self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)

                if not isinstance(self.primary_source, np.ndarray):
                    self.primary_source = source.T

                self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
                self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
                output_files.append(self.primary_stem_output_path)

        return output_files

    def pitch_fix(self, source, sr_pitched, orig_mix):
        """
        Change the pitch of the source audio by a number of semitones.

        Args:
            source (np.ndarray): The source audio to be pitch-shifted.
            sr_pitched (int): The sample rate of the pitch-shifted audio.
            orig_mix (np.ndarray): The original mix, used to match the shape of the pitch-shifted audio.

        Returns:
            np.ndarray: The pitch-shifted source audio.
        """
        source = spec_utils.change_pitch_semitones(source, sr_pitched, semitone_shift=self.pitch_shift)[0]
        source = spec_utils.match_array_shapes(source, orig_mix)
        return source

    def overlap_add(self, result, x, weights, start, length):
        """
        Adds the overlapping part of the chunk output x, scaled by the window weights, into the result tensor.
        """
        result[..., start : start + length] += x[..., :length] * weights[:length]
        return result

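    # Editor's note, not part of the original source: a minimal sketch of what
    # overlap_add does, with hypothetical sizes.
    #
    #   result = torch.zeros(1, 2, 12)
    #   chunk = torch.ones(1, 2, 4)
    #   weights = torch.tensor(signal.windows.hamming(4), dtype=torch.float32)
    #   result = overlap_add(result, chunk, weights, start=3, length=4)
    #
    # demix() below accumulates the same window values into a matching `counter`
    # tensor, so dividing result by counter restores unity gain wherever chunks
    # overlap.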
    def demix(self, mix: np.ndarray) -> dict:
        """
        Demixes the input mix into primary and secondary sources using the model and model data.

        Args:
            mix (np.ndarray): The mix to be demixed.

        Returns:
            dict: A dictionary containing the demixed sources.
        """
        orig_mix = mix

        if self.pitch_shift != 0:
            self.logger.debug(f"Shifting pitch by -{self.pitch_shift} semitones...")
            mix, sample_rate = spec_utils.change_pitch_semitones(mix, self.sample_rate, semitone_shift=-self.pitch_shift)

        if self.is_roformer:
            # Note: currently, for Roformer models, batch_size is not utilized due to negligible performance improvements.

            mix = torch.tensor(mix, dtype=torch.float32)

            if self.override_model_segment_size:
                mdx_segment_size = self.segment_size
                self.logger.debug(f"Using configured segment size: {mdx_segment_size}")
            else:
                mdx_segment_size = self.model_data_cfgdict.inference.dim_t
                self.logger.debug(f"Using model default segment size: {mdx_segment_size}")

            # num_stems aka "S" in UVR
            num_stems = 1 if self.model_data_cfgdict.training.target_instrument else len(self.model_data_cfgdict.training.instruments)
            self.logger.debug(f"Number of stems: {num_stems}")

            # chunk_size aka "C" in UVR
            chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1)
            self.logger.debug(f"Chunk size: {chunk_size}")

            step = int(self.overlap * self.model_data_cfgdict.audio.sample_rate)
            self.logger.debug(f"Step: {step}")

            # Create a weighting window and convert it to a PyTorch tensor
            window = torch.tensor(signal.windows.hamming(chunk_size), dtype=torch.float32)

            device = next(self.model_run.parameters()).device

            with torch.no_grad():
                req_shape = (len(self.model_data_cfgdict.training.instruments),) + tuple(mix.shape)
                result = torch.zeros(req_shape, dtype=torch.float32)
                counter = torch.zeros(req_shape, dtype=torch.float32)

                for i in tqdm(range(0, mix.shape[1], step)):
                    part = mix[:, i : i + chunk_size]
                    length = part.shape[-1]
                    if i + chunk_size > mix.shape[1]:
                        part = mix[:, -chunk_size:]
                        length = chunk_size
                    part = part.to(device)
                    x = self.model_run(part.unsqueeze(0))[0]
                    x = x.cpu()
                    # Perform overlap_add on CPU
                    if i + chunk_size > mix.shape[1]:
                        # Add the final chunk at the end of the tensor
                        result = self.overlap_add(result, x, window, result.shape[-1] - chunk_size, length)
                        counter[..., result.shape[-1] - chunk_size :] += window[:length]
                    else:
                        result = self.overlap_add(result, x, window, i, length)
                        counter[..., i : i + length] += window[:length]

                inferenced_outputs = result / counter.clamp(min=1e-10)

        else:
            mix = torch.tensor(mix, dtype=torch.float32)

            try:
                num_stems = self.model_run.num_target_instruments
            except AttributeError:
                num_stems = self.model_run.module.num_target_instruments
            self.logger.debug(f"Number of stems: {num_stems}")

            if self.override_model_segment_size:
                mdx_segment_size = self.segment_size
                self.logger.debug(f"Using configured segment size: {mdx_segment_size}")
            else:
                mdx_segment_size = self.model_data_cfgdict.inference.dim_t
                self.logger.debug(f"Using model default segment size: {mdx_segment_size}")

            chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1)
            self.logger.debug(f"Chunk size: {chunk_size}")

            hop_size = chunk_size // self.overlap
            self.logger.debug(f"Hop size: {hop_size}")

            mix_shape = mix.shape[1]
            pad_size = hop_size - (mix_shape - chunk_size) % hop_size
            self.logger.debug(f"Pad size: {pad_size}")

            mix = torch.cat([torch.zeros(2, chunk_size - hop_size), mix, torch.zeros(2, pad_size + chunk_size - hop_size)], 1)
            self.logger.debug(f"Mix shape: {mix.shape}")

            chunks = mix.unfold(1, chunk_size, hop_size).transpose(0, 1)
            self.logger.debug(f"Chunks length: {len(chunks)} and shape: {chunks.shape}")

            batches = [chunks[i : i + self.batch_size] for i in range(0, len(chunks), self.batch_size)]
            self.logger.debug(f"Batch size: {self.batch_size}, number of batches: {len(batches)}")

            # accumulated_outputs is used to accumulate the output from processing each batch of chunks through the model.
            # It starts as a tensor of zeros and is updated in place as the model processes each batch.
            # After post-processing, it holds the combined result of all batches: the separated audio sources.
            accumulated_outputs = torch.zeros(num_stems, *mix.shape) if num_stems > 1 else torch.zeros_like(mix)

            with torch.no_grad():
                count = 0
                for batch in tqdm(batches):
                    # Since the model processes the audio data in batches, single_batch_result temporarily holds the model's
                    # output for each batch before it is accumulated into accumulated_outputs.
                    single_batch_result = self.model_run(batch.to(self.torch_device))

                    # single_batch_result can contain multiple output tensors (one for each piece of audio in the batch),
                    # so individual_output iterates through them, accumulating each into accumulated_outputs.
                    for individual_output in single_batch_result:
                        individual_output_cpu = individual_output.cpu()
                        # Accumulate outputs on CPU
                        accumulated_outputs[..., count * hop_size : count * hop_size + chunk_size] += individual_output_cpu
                        count += 1

            self.logger.debug("Calculating inferenced outputs based on accumulated outputs and overlap")
            inferenced_outputs = accumulated_outputs[..., chunk_size - hop_size : -(pad_size + chunk_size - hop_size)] / self.overlap
            self.logger.debug("Deleting accumulated outputs to free up memory")
            del accumulated_outputs

        if num_stems > 1 or self.is_primary_stem_main_target:
            self.logger.debug("Number of stems is greater than 1 or the primary stem is the main target; detaching individual sources and correcting pitch if necessary...")

            sources = {}

            # Iterate over each instrument specified in the model's configuration and its corresponding separated audio source.
            # self.model_data_cfgdict.training.instruments provides the list of stems, and
            # inferenced_outputs.cpu().detach().numpy() converts the separated-sources tensor to a NumPy array.
            # Each iteration provides an instrument name ('key') and its separated audio ('value') for further processing.
            for key, value in zip(self.model_data_cfgdict.training.instruments, inferenced_outputs.cpu().detach().numpy()):
                self.logger.debug(f"Processing instrument: {key}")
                if self.pitch_shift != 0:
                    self.logger.debug(f"Applying pitch correction for {key}")
                    sources[key] = self.pitch_fix(value, sample_rate, orig_mix)
                else:
                    sources[key] = value

            if self.is_primary_stem_main_target:
                self.logger.debug(f"Primary stem: {self.primary_stem_name} is the main target; matching array shapes if necessary...")
                if sources[self.primary_stem_name].shape[1] != orig_mix.shape[1]:
                    sources[self.primary_stem_name] = spec_utils.match_array_shapes(sources[self.primary_stem_name], orig_mix)
                sources[self.secondary_stem_name] = orig_mix - sources[self.primary_stem_name]

            self.logger.debug("Deleting inferenced outputs to free up memory")
            del inferenced_outputs

            self.logger.debug("Returning separated sources")
            return sources
        else:
            self.logger.debug("Processing single source...")

            if self.is_roformer:
                sources = {k: v.cpu().detach().numpy() for k, v in zip([self.model_data_cfgdict.training.target_instrument], inferenced_outputs)}
                inferenced_output = sources[self.model_data_cfgdict.training.target_instrument]
            else:
                inferenced_output = inferenced_outputs.cpu().detach().numpy()

            self.logger.debug("Demix process completed for single source.")

            self.logger.debug("Deleting inferenced outputs to free up memory")
            del inferenced_outputs

            if self.pitch_shift != 0:
                self.logger.debug("Applying pitch correction for single instrument")
                return self.pitch_fix(inferenced_output, sample_rate, orig_mix)
            else:
                self.logger.debug("Returning inferenced output for single instrument")
                return inferenced_output
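The non-Roformer branch of demix divides the accumulated outputs by the fixed overlap count, while the Roformer branch weights every chunk with a Hamming window and divides by the accumulated window values, so overlapped regions come back at unity gain. A self-contained sketch of that windowed overlap-add scheme, using a stand-in identity "model" (the function name and sizes here are hypothetical):

    import torch
    from scipy import signal

    def windowed_overlap_add(mix: torch.Tensor, model, chunk_size: int, step: int) -> torch.Tensor:
        # Weight each chunk's output with a Hamming window, tracking the weights applied.
        window = torch.tensor(signal.windows.hamming(chunk_size), dtype=torch.float32)
        result = torch.zeros_like(mix)
        counter = torch.zeros_like(mix)
        for i in range(0, mix.shape[-1], step):
            start = min(i, mix.shape[-1] - chunk_size)  # clamp the final chunk to the end
            chunk = mix[..., start : start + chunk_size]
            result[..., start : start + chunk_size] += model(chunk) * window
            counter[..., start : start + chunk_size] += window
        return result / counter.clamp(min=1e-10)

    # With an identity "model", the input is reconstructed almost exactly:
    mix = torch.randn(2, 44100)
    restored = windowed_overlap_add(mix, lambda x: x, chunk_size=4096, step=1024)
    assert torch.allclose(restored, mix, atol=1e-5)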
audio_separator/separator/architectures/vr_separator.py
ADDED
@@ -0,0 +1,357 @@
"""Module for separating audio sources using VR architecture models."""

import os
import math

import torch
import librosa
import numpy as np
from tqdm import tqdm

# Check if we really need the rerun_mp3 function; remove if not
import audioread

from audio_separator.separator.common_separator import CommonSeparator
from audio_separator.separator.uvr_lib_v5 import spec_utils
from audio_separator.separator.uvr_lib_v5.vr_network import nets
from audio_separator.separator.uvr_lib_v5.vr_network import nets_new
from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import ModelParameters


class VRSeparator(CommonSeparator):
    """
    VRSeparator is responsible for separating audio sources using VR models.
    It initializes with configuration parameters and prepares the model for separation tasks.
    """

    def __init__(self, common_config, arch_config: dict):
        # Any configuration values which can be shared between architectures should be set already in CommonSeparator,
        # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name)
        super().__init__(config=common_config)

        # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model.
        # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method.
        # The instance variable self.model_data is passed through from Separator and set in CommonSeparator.
        self.logger.debug(f"Model data: {self.model_data}")

        # Most of the VR models use the same number of output channels, but the VR 5.1 models have specific values set in the model_data JSON
        self.model_capacity = 32, 128
        self.is_vr_51_model = False

        if "nout" in self.model_data.keys() and "nout_lstm" in self.model_data.keys():
            self.model_capacity = self.model_data["nout"], self.model_data["nout_lstm"]
            self.is_vr_51_model = True

        # Model params are additional technical parameter values from JSON files in separator/uvr_lib_v5/vr_network/modelparams/*.json,
        # with filenames referenced by the model_data["vr_model_param"] value
        package_root_filepath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        vr_params_json_dir = os.path.join(package_root_filepath, "uvr_lib_v5", "vr_network", "modelparams")
        vr_params_json_filename = f"{self.model_data['vr_model_param']}.json"
        vr_params_json_filepath = os.path.join(vr_params_json_dir, vr_params_json_filename)
        self.model_params = ModelParameters(vr_params_json_filepath)

        self.logger.debug(f"Model params: {self.model_params.param}")

        # Arch config contains the VR architecture-specific user configuration options, which should all be configurable
        # by the user, either through their Separator class instantiation or by passing in a CLI parameter.
        # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately
        # configured this way as they have architecture-specific default values.

        # This option performs Test-Time Augmentation to improve the separation quality.
        # Note: having this selected will increase the time it takes to complete a conversion
        self.enable_tta = arch_config.get("enable_tta", False)

        # This option can potentially identify leftover instrumental artifacts within the vocal outputs and may improve the separation of some songs.
        # Note: selecting this option can adversely affect the conversion process, depending on the track. Because of this, it is only recommended as a last resort.
        self.enable_post_process = arch_config.get("enable_post_process", False)

        # post_process_threshold values = ('0.1', '0.2', '0.3')
        self.post_process_threshold = arch_config.get("post_process_threshold", 0.2)

        # Number of batches to be processed at a time.
        # - Higher values mean more RAM usage but slightly faster processing times.
        # - Lower values mean less RAM usage but slightly longer processing times.
        # - The batch size value has no effect on output quality.

        # Andrew note: for some reason, lower batch sizes seem to cause broken output for the VR arch; need to investigate why
        self.batch_size = arch_config.get("batch_size", 1)

        # Select a window size to balance quality and speed:
        # - 1024 - Quick but lesser quality.
        # - 512 - Medium speed and quality.
        # - 320 - Takes longer but may offer better quality.
        self.window_size = arch_config.get("window_size", 512)

        # The application will mirror the missing frequency range of the output.
        self.high_end_process = arch_config.get("high_end_process", False)
        self.input_high_end_h = None
        self.input_high_end = None

        # Adjust the intensity of primary stem extraction:
        # - Ranges from -100 to 100.
        # - Bigger values mean deeper extractions.
        # - Typically, it's set to 5 for vocals & instrumentals.
        # - Values beyond 5 might muddy the sound for non-vocal models.
        self.aggression = float(int(arch_config.get("aggression", 5)) / 100)

        self.aggressiveness = {"value": self.aggression, "split_bin": self.model_params.param["band"][1]["crop_stop"], "aggr_correction": self.model_params.param.get("aggr_correction")}

        self.model_samplerate = self.model_params.param["sr"]

        self.logger.debug(f"VR arch params: enable_tta={self.enable_tta}, enable_post_process={self.enable_post_process}, post_process_threshold={self.post_process_threshold}")
        self.logger.debug(f"VR arch params: batch_size={self.batch_size}, window_size={self.window_size}")
        self.logger.debug(f"VR arch params: high_end_process={self.high_end_process}, aggression={self.aggression}")
        self.logger.debug(f"VR arch params: is_vr_51_model={self.is_vr_51_model}, model_samplerate={self.model_samplerate}, model_capacity={self.model_capacity}")

        self.model_run = lambda *args, **kwargs: self.logger.error("Model run method is not initialised yet.")

        # This should go away once we refactor to remove soundfile.write and replace it with pydub, like we did for the MDX rewrite
        self.wav_subtype = "PCM_16"

        self.logger.info("VR Separator initialisation complete")

    def separate(self, audio_file_path, custom_output_names=None):
        """
        Separates the audio file into primary and secondary sources based on the model's configuration.
        It processes the mix, demixes it into sources, normalizes the sources, and saves the output files.

        Args:
            audio_file_path (str): The path to the audio file to be processed.
            custom_output_names (dict, optional): Custom names for the output files. Defaults to None.

        Returns:
            list: A list of paths to the output files generated by the separation process.
        """
        self.primary_source = None
        self.secondary_source = None

        self.audio_file_path = audio_file_path
        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]

        self.logger.debug(f"Starting separation for input audio file {self.audio_file_path}...")

        nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
        vr_5_1_models = [56817, 218409]
        model_size = math.ceil(os.stat(self.model_path).st_size / 1024)
        nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size))
        self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}")

        if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
            self.logger.debug("Using CascadedNet for VR 5.1 model...")
            self.model_run = nets_new.CascadedNet(self.model_params.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
            self.is_vr_51_model = True
        else:
            self.logger.debug("Determining model capacity...")
            self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size)

        self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
        self.model_run.to(self.torch_device)
        self.logger.debug("Model loaded and moved to device.")

        y_spec, v_spec = self.inference_vr(self.loading_mix(), self.torch_device, self.aggressiveness)
        self.logger.debug("Inference completed.")

        # Sanitize y_spec and v_spec to replace NaN and infinite values
        y_spec = np.nan_to_num(y_spec, nan=0.0, posinf=0.0, neginf=0.0)
        v_spec = np.nan_to_num(v_spec, nan=0.0, posinf=0.0, neginf=0.0)

        self.logger.debug("Sanitization completed. Replaced NaN and infinite values in y_spec and v_spec.")

        # After the inference_vr call
        self.logger.debug(f"Inference VR completed. y_spec shape: {y_spec.shape}, v_spec shape: {v_spec.shape}")
        self.logger.debug(f"y_spec stats - min: {np.min(y_spec)}, max: {np.max(y_spec)}, isnan: {np.isnan(y_spec).any()}, isinf: {np.isinf(y_spec).any()}")
        self.logger.debug(f"v_spec stats - min: {np.min(v_spec)}, max: {np.max(v_spec)}, isnan: {np.isnan(v_spec).any()}, isinf: {np.isinf(v_spec).any()}")

        # Not yet implemented from UVR features:
        #
        # if not self.is_vocal_split_model:
        #     self.cache_source((y_spec, v_spec))

        # if self.is_secondary_model_activated and self.secondary_model:
        #     self.logger.debug("Processing secondary model...")
        #     self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(
        #         self.secondary_model, self.process_data, main_process_method=self.process_method, main_model_primary=self.primary_stem
        #     )

        # Initialize the list for output files
        output_files = []
        self.logger.debug("Processing output files...")

        # Note: logic similar to the following should probably be added to the other architectures.
        # Check if output_single_stem is set to a value that would result in no output files
        if self.output_single_stem and (self.output_single_stem.lower() != self.primary_stem_name.lower() and self.output_single_stem.lower() != self.secondary_stem_name.lower()):
            self.logger.warning(f"The output_single_stem setting '{self.output_single_stem}' does not match any of the output files: '{self.primary_stem_name}' and '{self.secondary_stem_name}'. For this model '{self.model_name}', the output_single_stem setting will be ignored and all output files will be saved.")
            # Reset output_single_stem to None so both stems are saved
            self.output_single_stem = None

        # Save and process the primary stem if needed
        if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
            self.logger.debug(f"Processing primary stem: {self.primary_stem_name}")
            if not isinstance(self.primary_source, np.ndarray):
                self.logger.debug(f"Preparing to convert spectrogram to waveform. Spec shape: {y_spec.shape}")

                self.primary_source = self.spec_to_wav(y_spec).T
                self.logger.debug("Converted primary source spectrogram to waveform.")
                if not self.model_samplerate == 44100:
                    self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                    self.logger.debug("Resampled primary source to 44100Hz.")

            self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)

            self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
            self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
            output_files.append(self.primary_stem_output_path)

        # Save and process the secondary stem if needed
        if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
            self.logger.debug(f"Processing secondary stem: {self.secondary_stem_name}")
            if not isinstance(self.secondary_source, np.ndarray):
                self.logger.debug(f"Preparing to convert spectrogram to waveform. Spec shape: {v_spec.shape}")

                self.secondary_source = self.spec_to_wav(v_spec).T
                self.logger.debug("Converted secondary source spectrogram to waveform.")
                if not self.model_samplerate == 44100:
                    self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                    self.logger.debug("Resampled secondary source to 44100Hz.")

            self.secondary_stem_output_path = self.get_stem_output_path(self.secondary_stem_name, custom_output_names)

            self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
            self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
            output_files.append(self.secondary_stem_output_path)

        # Not yet implemented from UVR features:
        # self.process_vocal_split_chain(secondary_sources)
        # self.logger.debug("Vocal split chain processed.")

        return output_files

    def loading_mix(self):
        X_wave, X_spec_s = {}, {}

        bands_n = len(self.model_params.param["band"])

        audio_file = spec_utils.write_array_to_mem(self.audio_file_path, subtype=self.wav_subtype)
        is_mp3 = audio_file.endswith(".mp3") if isinstance(audio_file, str) else False

        self.logger.debug(f"loading_mix iterating through {bands_n} bands")
        for d in tqdm(range(bands_n, 0, -1)):
            bp = self.model_params.param["band"][d]

            wav_resolution = bp["res_type"]

            if self.torch_device_mps is not None:
                wav_resolution = "polyphase"

            if d == bands_n:  # high-end band
                X_wave[d], _ = librosa.load(audio_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type=wav_resolution)
                X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model)

                if not np.any(X_wave[d]) and is_mp3:
                    X_wave[d] = rerun_mp3(audio_file, bp["sr"])

                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=self.model_params.param["band"][d + 1]["sr"], target_sr=bp["sr"], res_type=wav_resolution)
                X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model)

            if d == bands_n and self.high_end_process:
                self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (self.model_params.param["pre_filter_stop"] - self.model_params.param["pre_filter_start"])
                self.input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :]

        X_spec = spec_utils.combine_spectrograms(X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model)

        del X_wave, X_spec_s, audio_file

        return X_spec

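    # Editor's note, not part of the original source: loading_mix builds a
    # multi-band cascade, resampling each lower band from the band above it.
    # Roughly, with hypothetical band sample rates:
    #
    #   wave[3], _ = librosa.load(path, sr=44100, mono=False)               # top band
    #   wave[2] = librosa.resample(wave[3], orig_sr=44100, target_sr=32000)
    #   wave[1] = librosa.resample(wave[2], orig_sr=32000, target_sr=16000)
    #
    # Each band is converted to a spectrogram, and the per-band spectrograms
    # are then combined into the single model input X_spec.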
    def inference_vr(self, X_spec, device, aggressiveness):
        def _execute(X_mag_pad, roi_size):
            X_dataset = []
            patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size

            self.logger.debug(f"inference_vr appending to X_dataset for each of {patches} patches")
            for i in tqdm(range(patches)):
                start = i * roi_size
                X_mag_window = X_mag_pad[:, :, start : start + self.window_size]
                X_dataset.append(X_mag_window)

            total_iterations = patches // self.batch_size if not self.enable_tta else (patches // self.batch_size) * 2
            self.logger.debug(f"inference_vr iterating through {total_iterations} batches, batch_size = {self.batch_size}")

            X_dataset = np.asarray(X_dataset)
            self.model_run.eval()
            with torch.no_grad():
                mask = []

                for i in tqdm(range(0, patches, self.batch_size)):
                    X_batch = X_dataset[i : i + self.batch_size]
                    X_batch = torch.from_numpy(X_batch).to(device)
                    pred = self.model_run.predict_mask(X_batch)
                    if not pred.size()[3] > 0:
                        raise ValueError("Window size error: h1_shape[3] must be greater than h2_shape[3]")
                    pred = pred.detach().cpu().numpy()
                    pred = np.concatenate(pred, axis=2)
                    mask.append(pred)
                if len(mask) == 0:
                    raise ValueError("Window size error: h1_shape[3] must be greater than h2_shape[3]")

                mask = np.concatenate(mask, axis=2)
            return mask

        def postprocess(mask, X_mag, X_phase):
            is_non_accom_stem = False
            for stem in CommonSeparator.NON_ACCOM_STEMS:
                if stem == self.primary_stem_name:
                    is_non_accom_stem = True

            mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness)

            if self.enable_post_process:
                mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold)

            y_spec = mask * X_mag * np.exp(1.0j * X_phase)
            v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase)

            return y_spec, v_spec

        X_mag, X_phase = spec_utils.preprocess(X_spec)
        n_frame = X_mag.shape[2]
        pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset)
        X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
        X_mag_pad /= X_mag_pad.max()
        mask = _execute(X_mag_pad, roi_size)

        if self.enable_tta:
            pad_l += roi_size // 2
            pad_r += roi_size // 2
            X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
            X_mag_pad /= X_mag_pad.max()
            mask_tta = _execute(X_mag_pad, roi_size)
            mask_tta = mask_tta[:, :, roi_size // 2 :]
            mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5
        else:
            mask = mask[:, :, :n_frame]

        y_spec, v_spec = postprocess(mask, X_mag, X_phase)

        return y_spec, v_spec

    def spec_to_wav(self, spec):
        if self.high_end_process and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h:
            input_high_end_ = spec_utils.mirroring("mirroring", spec, self.input_high_end, self.model_params)
            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model)
        else:
            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, is_v51_model=self.is_vr_51_model)

        return wav


# Check if we really need the rerun_mp3 function; refactor or remove if not
def rerun_mp3(audio_file, sample_rate=44100):
    with audioread.audio_open(audio_file) as f:
        track_length = int(f.duration)

    return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0]
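inference_vr runs the network over fixed-size patches of the padded magnitude spectrogram; with enable_tta, a second pass is made with the patch grid shifted by half a patch, and the two masks are averaged, which smooths artifacts tied to patch boundaries. A minimal sketch of that shifted-grid averaging, assuming a hypothetical mask_fn that maps a padded magnitude spectrogram to a mask over the same frames:

    import numpy as np

    def tta_mask(X_mag: np.ndarray, mask_fn, pad_l: int, pad_r: int, roi_size: int) -> np.ndarray:
        n_frame = X_mag.shape[2]

        # First pass: standard padding.
        mask = mask_fn(np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant"))

        # Second pass: shift the patch grid by half a patch, then realign the mask.
        shifted = np.pad(X_mag, ((0, 0), (0, 0), (pad_l + roi_size // 2, pad_r + roi_size // 2)), mode="constant")
        mask_tta = mask_fn(shifted)[:, :, roi_size // 2 :]

        # Average the aligned masks over the original frame range.
        return (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5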
audio_separator/separator/common_separator.py
ADDED
@@ -0,0 +1,403 @@
"""This file contains the CommonSeparator class, common to all architecture-specific Separator classes."""

from logging import Logger
import os
import re
import gc
import numpy as np
import librosa
import torch
from pydub import AudioSegment
import soundfile as sf
from audio_separator.separator.uvr_lib_v5 import spec_utils


class CommonSeparator:
    """
    This class contains the methods and attributes common to all architecture-specific Separator classes.
    """

    ALL_STEMS = "All Stems"
    VOCAL_STEM = "Vocals"
    INST_STEM = "Instrumental"
    OTHER_STEM = "Other"
    BASS_STEM = "Bass"
    DRUM_STEM = "Drums"
    GUITAR_STEM = "Guitar"
    PIANO_STEM = "Piano"
    SYNTH_STEM = "Synthesizer"
    STRINGS_STEM = "Strings"
    WOODWINDS_STEM = "Woodwinds"
    BRASS_STEM = "Brass"
    WIND_INST_STEM = "Wind Inst"
    NO_OTHER_STEM = "No Other"
    NO_BASS_STEM = "No Bass"
    NO_DRUM_STEM = "No Drums"
    NO_GUITAR_STEM = "No Guitar"
    NO_PIANO_STEM = "No Piano"
    NO_SYNTH_STEM = "No Synthesizer"
    NO_STRINGS_STEM = "No Strings"
    NO_WOODWINDS_STEM = "No Woodwinds"
    NO_WIND_INST_STEM = "No Wind Inst"
    NO_BRASS_STEM = "No Brass"
    PRIMARY_STEM = "Primary Stem"
    SECONDARY_STEM = "Secondary Stem"
    LEAD_VOCAL_STEM = "lead_only"
    BV_VOCAL_STEM = "backing_only"
    LEAD_VOCAL_STEM_I = "with_lead_vocals"
    BV_VOCAL_STEM_I = "with_backing_vocals"
    LEAD_VOCAL_STEM_LABEL = "Lead Vocals"
    BV_VOCAL_STEM_LABEL = "Backing Vocals"
    NO_STEM = "No "

    STEM_PAIR_MAPPER = {VOCAL_STEM: INST_STEM, INST_STEM: VOCAL_STEM, LEAD_VOCAL_STEM: BV_VOCAL_STEM, BV_VOCAL_STEM: LEAD_VOCAL_STEM, PRIMARY_STEM: SECONDARY_STEM}

    NON_ACCOM_STEMS = (VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM, SYNTH_STEM, STRINGS_STEM, WOODWINDS_STEM, BRASS_STEM, WIND_INST_STEM)

    def __init__(self, config):

        self.logger: Logger = config.get("logger")
        self.log_level: int = config.get("log_level")

        # Inferencing device / acceleration config
        self.torch_device = config.get("torch_device")
        self.torch_device_cpu = config.get("torch_device_cpu")
        self.torch_device_mps = config.get("torch_device_mps")
        self.onnx_execution_provider = config.get("onnx_execution_provider")

        # Model data
        self.model_name = config.get("model_name")
        self.model_path = config.get("model_path")
        self.model_data = config.get("model_data")

        # Output directory and format
        self.output_dir = config.get("output_dir")
        self.output_format = config.get("output_format")
        self.output_bitrate = config.get("output_bitrate")

        # Functional options which are applicable to all architectures and which the user may tweak to affect the output
        self.normalization_threshold = config.get("normalization_threshold")
        self.amplification_threshold = config.get("amplification_threshold")
        self.enable_denoise = config.get("enable_denoise")
        self.output_single_stem = config.get("output_single_stem")
        self.invert_using_spec = config.get("invert_using_spec")
        self.sample_rate = config.get("sample_rate")
        self.use_soundfile = config.get("use_soundfile")

        # Model-specific properties

        # Check if model_data has a "training" key with an "instruments" list
        self.primary_stem_name = None
        self.secondary_stem_name = None

        if "training" in self.model_data and "instruments" in self.model_data["training"]:
            instruments = self.model_data["training"]["instruments"]
            if instruments:
                self.primary_stem_name = instruments[0]
                self.secondary_stem_name = instruments[1] if len(instruments) > 1 else self.secondary_stem(self.primary_stem_name)

        if self.primary_stem_name is None:
            self.primary_stem_name = self.model_data.get("primary_stem", "Vocals")
            self.secondary_stem_name = self.secondary_stem(self.primary_stem_name)

        self.is_karaoke = self.model_data.get("is_karaoke", False)
        self.is_bv_model = self.model_data.get("is_bv_model", False)
        self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0)

        self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}")
        self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}")
        self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}, amplification_threshold={self.amplification_threshold}")
        self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}")
        self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}")

        self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}")
        self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}")

        # File-specific variables which need to be cleared between processing different audio inputs
        self.audio_file_path = None
        self.audio_file_base = None

        self.primary_source = None
        self.secondary_source = None

        self.primary_stem_output_path = None
        self.secondary_stem_output_path = None

        self.cached_sources_map = {}

    def secondary_stem(self, primary_stem: str):
        """Determines the secondary stem name based on the primary stem name."""
        primary_stem = primary_stem if primary_stem else self.NO_STEM

        if primary_stem in self.STEM_PAIR_MAPPER:
            secondary_stem = self.STEM_PAIR_MAPPER[primary_stem]
        else:
            secondary_stem = primary_stem.replace(self.NO_STEM, "") if self.NO_STEM in primary_stem else f"{self.NO_STEM}{primary_stem}"

        return secondary_stem

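    # Editor's note, not part of the original source: examples of how
    # secondary_stem resolves names.
    #
    #   secondary_stem("Vocals")   -> "Instrumental"   (via STEM_PAIR_MAPPER)
    #   secondary_stem("Drums")    -> "No Drums"       ("No " prefixing fallback)
    #   secondary_stem("No Bass")  -> "Bass"           ("No " prefix-stripping fallback)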
    def separate(self, audio_file_path):
        """
        Placeholder method for separating audio sources. Should be overridden by subclasses.
        """
        raise NotImplementedError("This method should be overridden by subclasses.")

    def final_process(self, stem_path, source, stem_name):
        """
        Finalizes the processing of a stem by writing the audio to a file and returning the processed source.
        """
        self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...")
        self.write_audio(stem_path, source)

        return {stem_name: source}

    def cached_sources_clear(self):
        """
        Clears the cache dictionaries for VR, MDX, and Demucs models.

        This function is essential for ensuring that the cache does not hold outdated or irrelevant data
        between different processing sessions or when a new batch of audio files is processed.
        It helps in managing memory efficiently and prevents potential errors due to stale data.
        """
        self.cached_sources_map = {}

    def cached_source_callback(self, model_architecture, model_name=None):
        """
        Retrieves the model and sources from the cache based on the processing method and model name.

        Args:
            model_architecture: The architecture type (VR, MDX, or Demucs) being used for processing.
            model_name: The specific model name within the architecture type, if applicable.

        Returns:
            A tuple containing the model and its sources if found in the cache; otherwise, None.

        This function is crucial for optimizing performance by avoiding redundant processing.
        If the requested model and its sources are already in the cache, they can be reused directly,
        saving time and computational resources.
        """
        model, sources = None, None

        mapper = self.cached_sources_map[model_architecture]

        for key, value in mapper.items():
            if model_name in key:
                model = key
                sources = value

        return model, sources

    def cached_model_source_holder(self, model_architecture, sources, model_name=None):
        """
        Update the dictionary for the given model_architecture with the new model name and its sources.
        Use the model_architecture as a key to access the corresponding cache source mapper dictionary.
        """
        self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}}

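For illustration (not part of this file): the shape of the cache these three methods manage, with made-up architecture keys and model names. Note that cached_source_callback indexes self.cached_sources_map[model_architecture] directly, so it assumes cached_model_source_holder has already stored an entry for that architecture.

# Minimal sketch of the cache structure, using a hypothetical model name
cached_sources_map = {}

def hold(arch, sources, model_name):
    cached_sources_map[arch] = {**cached_sources_map.get(arch, {}), model_name: sources}

hold("MDX", {"Vocals": "ndarray..."}, "UVR-MDX-NET-Inst_HQ_3")

# Lookup mirrors cached_source_callback: substring match on the model name
for key, value in cached_sources_map["MDX"].items():
    if "Inst_HQ_3" in key:
        print(key, "->", list(value.keys()))  # UVR-MDX-NET-Inst_HQ_3 -> ['Vocals']
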
    def prepare_mix(self, mix):
        """
        Prepares the mix for processing. This includes loading the audio from a file if necessary,
        ensuring the mix is in the correct format, and converting mono to stereo if needed.
        """
        # Store the original path or the mix itself for later checks
        audio_path = mix

        # Check if the input is a file path (string) and needs to be loaded
        if not isinstance(mix, np.ndarray):
            self.logger.debug(f"Loading audio from file: {mix}")
            mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate)
            self.logger.debug(f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}")
        else:
            # Transpose the mix if it's already an ndarray (expected shape: [channels, samples])
            self.logger.debug("Transposing the provided mix array.")
            mix = mix.T
            self.logger.debug(f"Transposed mix shape: {mix.shape}")

        # If the original input was a filepath, check if the loaded mix is empty
        if isinstance(audio_path, str):
            if not np.any(mix):
                error_msg = f"Audio file {audio_path} is empty or not valid"
                self.logger.error(error_msg)
                raise ValueError(error_msg)
            else:
                self.logger.debug("Audio file is valid and contains data.")

        # Ensure the mix is in stereo format
        if mix.ndim == 1:
            self.logger.debug("Mix is mono. Converting to stereo.")
            mix = np.asfortranarray([mix, mix])
            self.logger.debug("Converted to stereo mix.")

        # Final log indicating successful preparation of the mix
        self.logger.debug("Mix preparation completed.")
        return mix

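For illustration (not part of this file): the mono-to-stereo conversion above duplicates the single channel into a (2, samples) Fortran-ordered array, which is the layout the downstream separation code expects. A small self-contained sketch:

import numpy as np

mono = np.sin(np.linspace(0, 3.14, 44100, dtype=np.float32))  # one second of mono audio
stereo = np.asfortranarray([mono, mono])  # duplicate the channel, shape (2, 44100)
print(stereo.shape, stereo.flags["F_CONTIGUOUS"])  # (2, 44100) True
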
    def write_audio(self, stem_path: str, stem_source):
        """
        Writes the separated audio source to a file using pydub or soundfile.
        Pydub supports a much wider range of audio formats and produces better encoded lossy files for some formats.
        Soundfile is used for very large files (longer than 1 hour), as pydub has memory issues with large files:
        https://github.com/jiaaro/pydub/issues/135
        """
        # Get the duration of the input audio file
        duration_seconds = librosa.get_duration(filename=self.audio_file_path)
        duration_hours = duration_seconds / 3600
        self.logger.info(f"Audio duration is {duration_hours:.2f} hours ({duration_seconds:.2f} seconds).")

        if self.use_soundfile:
            self.logger.warning("Using soundfile for writing.")
            self.write_audio_soundfile(stem_path, stem_source)
        else:
            self.logger.info("Using pydub for writing.")
            self.write_audio_pydub(stem_path, stem_source)

    def write_audio_pydub(self, stem_path: str, stem_source):
        """
        Writes the separated audio source to a file using pydub (ffmpeg).
        """
        self.logger.debug(f"Entering write_audio_pydub with stem_path: {stem_path}")

        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)

        # Check if the numpy array is empty or contains very low values
        if np.max(np.abs(stem_source)) < 1e-6:
            self.logger.warning("Warning: stem_source array is near-silent or empty.")
            return

        # If output_dir is specified, create it and join it with stem_path
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)
            stem_path = os.path.join(self.output_dir, stem_path)

        self.logger.debug(f"Audio data shape before processing: {stem_source.shape}")
        self.logger.debug(f"Data type before conversion: {stem_source.dtype}")

        # Ensure the audio data is in the correct format (e.g., int16)
        if stem_source.dtype != np.int16:
            stem_source = (stem_source * 32767).astype(np.int16)
            self.logger.debug("Converted stem_source to int16.")

        # Correctly interleave stereo channels
        stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16)
        stem_source_interleaved[0::2] = stem_source[:, 0]  # Left channel
        stem_source_interleaved[1::2] = stem_source[:, 1]  # Right channel

        self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}")

        # Create a pydub AudioSegment
        try:
            audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2)
            self.logger.debug("Created AudioSegment successfully.")
        except (IOError, ValueError) as e:
            self.logger.error(f"Specific error creating AudioSegment: {e}")
            return

        # Determine file format based on the file extension
        file_format = stem_path.lower().split(".")[-1]

        # For m4a files, specify mp4 as the container format as the extension doesn't match the format name
        if file_format == "m4a":
            file_format = "mp4"
        elif file_format == "mka":
            file_format = "matroska"

        # Set the bitrate to 320k for mp3 files if output_bitrate is not specified
        bitrate = "320k" if file_format == "mp3" and self.output_bitrate is None else self.output_bitrate

        # Export using the determined format
        try:
            audio_segment.export(stem_path, format=file_format, bitrate=bitrate)
            self.logger.debug(f"Exported audio file successfully to {stem_path}")
        except (IOError, ValueError) as e:
            self.logger.error(f"Error exporting audio file: {e}")

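For illustration (not part of this file): a sketch of the int16 conversion and channel interleaving performed above, run on synthetic data rather than a real separated stem.

import numpy as np

stem = np.random.uniform(-1, 1, size=(8, 2)).astype(np.float32)  # (samples, channels) float audio
pcm16 = (stem * 32767).astype(np.int16)

interleaved = np.empty((2 * pcm16.shape[0],), dtype=np.int16)
interleaved[0::2] = pcm16[:, 0]  # left channel samples at even indices
interleaved[1::2] = pcm16[:, 1]  # right channel samples at odd indices
# interleaved.tobytes() is the raw PCM that AudioSegment consumes
# (frame_rate=sample_rate, sample_width=2, channels=2)
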
    def write_audio_soundfile(self, stem_path: str, stem_source):
        """
        Writes the separated audio source to a file using the soundfile library.
        """
        self.logger.debug(f"Entering write_audio_soundfile with stem_path: {stem_path}")

        # Correctly interleave stereo channels if needed
        if stem_source.shape[1] == 2:
            # If the audio is already interleaved, ensure it's in the correct order
            # Check if the array is Fortran contiguous (column-major)
            if stem_source.flags["F_CONTIGUOUS"]:
                # Convert to C contiguous (row-major)
                stem_source = np.ascontiguousarray(stem_source)
            # Otherwise, perform interleaving
            else:
                stereo_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16)
                # Left channel
                stereo_interleaved[0::2] = stem_source[:, 0]
                # Right channel
                stereo_interleaved[1::2] = stem_source[:, 1]
                stem_source = stereo_interleaved

        self.logger.debug(f"Interleaved audio data shape: {stem_source.shape}")

        # Write audio using soundfile (for formats other than M4A)
        try:
            # Specify the subtype to define the sample width
            sf.write(stem_path, stem_source, self.sample_rate)
            self.logger.debug(f"Exported audio file successfully to {stem_path}")
        except Exception as e:
            self.logger.error(f"Error exporting audio file: {e}")

    def clear_gpu_cache(self):
        """
        This method clears the GPU cache to free up memory.
        """
        self.logger.debug("Running garbage collection...")
        gc.collect()
        if self.torch_device == torch.device("mps"):
            self.logger.debug("Clearing MPS cache...")
            torch.mps.empty_cache()
        if self.torch_device == torch.device("cuda"):
            self.logger.debug("Clearing CUDA cache...")
            torch.cuda.empty_cache()

    def clear_file_specific_paths(self):
        """
        Clears the file-specific variables which need to be cleared between processing different audio inputs.
        """
        self.logger.info("Clearing input audio file paths, sources and stems...")

        self.audio_file_path = None
        self.audio_file_base = None

        self.primary_source = None
        self.secondary_source = None

        self.primary_stem_output_path = None
        self.secondary_stem_output_path = None

    def sanitize_filename(self, filename):
        """
        Cleans the filename by replacing invalid characters with underscores.
        """
        sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
        sanitized = re.sub(r'_+', '_', sanitized)
        sanitized = sanitized.strip('_. ')
        return sanitized

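For illustration (not part of this file): the sanitisation above applied to a made-up filename.

import re

def sanitize_filename(filename):
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
    sanitized = re.sub(r'_+', '_', sanitized)
    return sanitized.strip('_. ')

print(sanitize_filename('Artist: "Song" (Live)/Take?1'))  # Artist_ _Song_ (Live)_Take_1
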
    def get_stem_output_path(self, stem_name, custom_output_names):
        """
        Gets the output path for a stem based on the stem name and custom output names.
        """
        # Convert custom_output_names keys to lowercase for case-insensitive comparison
        if custom_output_names:
            custom_output_names_lower = {k.lower(): v for k, v in custom_output_names.items()}
            stem_name_lower = stem_name.lower()
            if stem_name_lower in custom_output_names_lower:
                sanitized_custom_name = self.sanitize_filename(custom_output_names_lower[stem_name_lower])
                return os.path.join(f"{sanitized_custom_name}.{self.output_format.lower()}")

        sanitized_audio_base = self.sanitize_filename(self.audio_file_base)
        sanitized_stem_name = self.sanitize_filename(stem_name)
        sanitized_model_name = self.sanitize_filename(self.model_name)

        filename = f"{sanitized_audio_base}_({sanitized_stem_name})_{sanitized_model_name}.{self.output_format.lower()}"
        return os.path.join(filename)
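For illustration (not part of this file): with made-up values for the instance attributes, the default naming scheme above produces the following.

audio_file_base, stem_name, model_name, output_format = "MyTrack", "Vocals", "UVR-MDX-NET-Inst_HQ_3", "flac"
print(f"{audio_file_base}_({stem_name})_{model_name}.{output_format}")
# MyTrack_(Vocals)_UVR-MDX-NET-Inst_HQ_3.flac

If custom_output_names contains a (case-insensitive) match for the stem name, that name wins instead; e.g. {"vocals": "lead vocals"} yields "lead vocals.flac".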
audio_separator/separator/separator.py
ADDED
@@ -0,0 +1,959 @@
""" This file contains the Separator class, to facilitate the separation of stems from audio. """

from importlib import metadata, resources
import os
import sys
import platform
import subprocess
import time
import logging
import warnings
import importlib
import io
from typing import Optional

import hashlib
import json
import yaml
import requests
import torch
import torch.amp.autocast_mode as autocast_mode
import onnxruntime as ort
from tqdm import tqdm


class Separator:
    """
    The Separator class is designed to facilitate the separation of audio sources from a given audio file.
    It supports various separation architectures and models, including MDX, VR, and Demucs. The class provides
    functionalities to configure separation parameters, load models, and perform audio source separation.
    It also handles logging, normalization, and output formatting of the separated audio stems.

    The actual separation task is handled by one of the architecture-specific classes in the `architectures` module;
    this class is responsible for initialising logging, configuring hardware acceleration, loading the model,
    initiating the separation process and passing outputs back to the caller.

    Common Attributes:
        log_level (int): The logging level.
        log_formatter (logging.Formatter): The logging formatter.
        model_file_dir (str): The directory where model files are stored.
        output_dir (str): The directory where output files will be saved.
        output_format (str): The format of the output audio file.
        output_bitrate (str): The bitrate of the output audio file.
        amplification_threshold (float): The threshold for audio amplification.
        normalization_threshold (float): The threshold for audio normalization.
        output_single_stem (str): Option to output a single stem.
        invert_using_spec (bool): Flag to invert using spectrogram.
        sample_rate (int): The sample rate of the audio.
        use_soundfile (bool): Use soundfile for audio writing, can solve OOM issues.
        use_autocast (bool): Flag to use PyTorch autocast for faster inference.

    MDX Architecture Specific Attributes:
        hop_length (int): The hop length for STFT.
        segment_size (int): The segment size for processing.
        overlap (float): The overlap between segments.
        batch_size (int): The batch size for processing.
        enable_denoise (bool): Flag to enable or disable denoising.

    VR Architecture Specific Attributes & Defaults:
        batch_size: 16
        window_size: 512
        aggression: 5
        enable_tta: False
        enable_post_process: False
        post_process_threshold: 0.2
        high_end_process: False

    Demucs Architecture Specific Attributes & Defaults:
        segment_size: "Default"
        shifts: 2
        overlap: 0.25
        segments_enabled: True

    MDXC Architecture Specific Attributes & Defaults:
        segment_size: 256
        override_model_segment_size: False
        batch_size: 1
        overlap: 8
        pitch_shift: 0
    """

    def __init__(
        self,
        log_level=logging.INFO,
        log_formatter=None,
        model_file_dir="/tmp/audio-separator-models/",
        output_dir=None,
        output_format="WAV",
        output_bitrate=None,
        normalization_threshold=0.9,
        amplification_threshold=0.0,
        output_single_stem=None,
        invert_using_spec=False,
        sample_rate=44100,
        use_soundfile=False,
        use_autocast=False,
        use_directml=False,
        mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
        vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
        demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
        mdxc_params={"segment_size": 256, "override_model_segment_size": False, "batch_size": 1, "overlap": 8, "pitch_shift": 0},
        info_only=False,
    ):
        """Initialize the separator."""
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_level)
        self.log_level = log_level
        self.log_formatter = log_formatter

        self.log_handler = logging.StreamHandler()

        if self.log_formatter is None:
            self.log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(module)s - %(message)s")

        self.log_handler.setFormatter(self.log_formatter)

        if not self.logger.hasHandlers():
            self.logger.addHandler(self.log_handler)

        # Filter out noisy warnings from PyTorch for users who don't care about them
        if log_level > logging.DEBUG:
            warnings.filterwarnings("ignore")

        # Skip initialization logs if info_only is True
        if not info_only:
            package_version = self.get_package_distribution("audio-separator").version
            self.logger.info(f"Separator version {package_version} instantiating with output_dir: {output_dir}, output_format: {output_format}")

        if output_dir is None:
            output_dir = os.getcwd()
            if not info_only:
                self.logger.info("Output directory not specified. Using current working directory.")

        self.output_dir = output_dir

        # Check for environment variable to override model_file_dir
        env_model_dir = os.environ.get("AUDIO_SEPARATOR_MODEL_DIR")
        if env_model_dir:
            self.model_file_dir = env_model_dir
            self.logger.info(f"Using model directory from AUDIO_SEPARATOR_MODEL_DIR env var: {self.model_file_dir}")
            if not os.path.exists(self.model_file_dir):
                raise FileNotFoundError(f"The specified model directory does not exist: {self.model_file_dir}")
        else:
            self.logger.info(f"Using model directory from model_file_dir parameter: {model_file_dir}")
            self.model_file_dir = model_file_dir

        # Create the model and output directories if they do not exist
        os.makedirs(self.model_file_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        self.output_format = output_format
        self.output_bitrate = output_bitrate

        if self.output_format is None:
            self.output_format = "WAV"

        self.normalization_threshold = normalization_threshold
        if normalization_threshold <= 0 or normalization_threshold > 1:
            raise ValueError("The normalization_threshold must be greater than 0 and less than or equal to 1.")

        self.amplification_threshold = amplification_threshold
        if amplification_threshold < 0 or amplification_threshold > 1:
            raise ValueError("The amplification_threshold must be greater than or equal to 0 and less than or equal to 1.")

        self.output_single_stem = output_single_stem
        if output_single_stem is not None:
            self.logger.debug(f"Single stem output requested, so only one output file ({output_single_stem}) will be written")

        self.invert_using_spec = invert_using_spec
        if self.invert_using_spec:
            self.logger.debug("Secondary stem will be inverted using spectrogram rather than waveform. This may improve quality but is slightly slower.")

        try:
            self.sample_rate = int(sample_rate)
            if self.sample_rate <= 0:
                raise ValueError(f"The sample rate setting is {self.sample_rate} but it must be a non-zero whole number.")
            if self.sample_rate > 12800000:
                raise ValueError(f"The sample rate setting is {self.sample_rate}. Enter something less ambitious.")
        except ValueError:
            raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")

        self.use_soundfile = use_soundfile
        self.use_autocast = use_autocast
        self.use_directml = use_directml

        # These are parameters which users may want to configure, so we expose them to the top-level Separator class,
        # even though they are specific to a single model architecture
        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params, "MDXC": mdxc_params}

        self.torch_device = None
        self.torch_device_cpu = None
        self.torch_device_mps = None

        self.onnx_execution_provider = None
        self.model_instance = None

        self.model_is_uvr_vip = False
        self.model_friendly_name = None

        if not info_only:
            self.setup_accelerated_inferencing_device()

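For illustration (not part of this file): a typical construction of this class, assuming the package is installed and exports Separator from audio_separator.separator; parameter values are examples, not recommendations.

import logging
from audio_separator.separator import Separator

separator = Separator(
    log_level=logging.DEBUG,
    output_dir="/tmp/stems",
    output_format="FLAC",
    normalization_threshold=0.9,
    mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
)
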
    def setup_accelerated_inferencing_device(self):
        """
        This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available.
        """
        system_info = self.get_system_info()
        self.check_ffmpeg_installed()
        self.log_onnxruntime_packages()
        self.setup_torch_device(system_info)

    def get_system_info(self):
        """
        This method logs the system information, including the operating system, CPU architecture and Python version.
        """
        os_name = platform.system()
        os_version = platform.version()
        self.logger.info(f"Operating System: {os_name} {os_version}")

        system_info = platform.uname()
        self.logger.info(f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}")

        python_version = platform.python_version()
        self.logger.info(f"Python Version: {python_version}")

        pytorch_version = torch.__version__
        self.logger.info(f"PyTorch Version: {pytorch_version}")
        return system_info

    def check_ffmpeg_installed(self):
        """
        This method checks if ffmpeg is installed and logs its version.
        """
        try:
            ffmpeg_version_output = subprocess.check_output(["ffmpeg", "-version"], text=True)
            first_line = ffmpeg_version_output.splitlines()[0]
            self.logger.info(f"FFmpeg installed: {first_line}")
        except FileNotFoundError:
            self.logger.error("FFmpeg is not installed. Please install FFmpeg to use this package.")
            # Raise an exception if this is being run by a user, as ffmpeg is required for pydub to write audio,
            # but if we're just running unit tests in CI there's no reason to throw
            if "PYTEST_CURRENT_TEST" not in os.environ:
                raise

    def log_onnxruntime_packages(self):
        """
        This method logs the ONNX Runtime package versions, including the GPU and Silicon packages if available.
        """
        onnxruntime_gpu_package = self.get_package_distribution("onnxruntime-gpu")
        onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon")
        onnxruntime_cpu_package = self.get_package_distribution("onnxruntime")
        onnxruntime_dml_package = self.get_package_distribution("onnxruntime-directml")

        if onnxruntime_gpu_package is not None:
            self.logger.info(f"ONNX Runtime GPU package installed with version: {onnxruntime_gpu_package.version}")
        if onnxruntime_silicon_package is not None:
            self.logger.info(f"ONNX Runtime Silicon package installed with version: {onnxruntime_silicon_package.version}")
        if onnxruntime_cpu_package is not None:
            self.logger.info(f"ONNX Runtime CPU package installed with version: {onnxruntime_cpu_package.version}")
        if onnxruntime_dml_package is not None:
            self.logger.info(f"ONNX Runtime DirectML package installed with version: {onnxruntime_dml_package.version}")

    def setup_torch_device(self, system_info):
        """
        This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available.
        """
        hardware_acceleration_enabled = False
        ort_providers = ort.get_available_providers()
        has_torch_dml_installed = self.get_package_distribution("torch_directml")

        self.torch_device_cpu = torch.device("cpu")

        if torch.cuda.is_available():
            self.configure_cuda(ort_providers)
            hardware_acceleration_enabled = True
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and system_info.processor == "arm":
            self.configure_mps(ort_providers)
            hardware_acceleration_enabled = True
        elif self.use_directml and has_torch_dml_installed:
            import torch_directml

            if torch_directml.is_available():
                self.configure_dml(ort_providers)
                hardware_acceleration_enabled = True

        if not hardware_acceleration_enabled:
            self.logger.info("No hardware acceleration could be configured, running in CPU mode")
            self.torch_device = self.torch_device_cpu
            self.onnx_execution_provider = ["CPUExecutionProvider"]

    def configure_cuda(self, ort_providers):
        """
        This method configures the CUDA device for PyTorch and ONNX Runtime, if available.
        """
        self.logger.info("CUDA is available in Torch, setting Torch device to CUDA")
        self.torch_device = torch.device("cuda")
        if "CUDAExecutionProvider" in ort_providers:
            self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration")
            self.onnx_execution_provider = ["CUDAExecutionProvider"]
        else:
            self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")

    def configure_mps(self, ort_providers):
        """
        This method configures the Apple Silicon MPS/CoreML device for PyTorch and ONNX Runtime, if available.
        """
        self.logger.info("Apple Silicon MPS/CoreML is available in Torch and processor is ARM, setting Torch device to MPS")
        self.torch_device_mps = torch.device("mps")

        self.torch_device = self.torch_device_mps

        if "CoreMLExecutionProvider" in ort_providers:
            self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration")
            self.onnx_execution_provider = ["CoreMLExecutionProvider"]
        else:
            self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")

    def configure_dml(self, ort_providers):
        """
        This method configures the DirectML device for PyTorch and ONNX Runtime, if available.
        """
        import torch_directml

        self.logger.info("DirectML is available in Torch, setting Torch device to DirectML")
        self.torch_device_dml = torch_directml.device()
        self.torch_device = self.torch_device_dml

        if "DmlExecutionProvider" in ort_providers:
            self.logger.info("ONNXruntime has DmlExecutionProvider available, enabling acceleration")
            self.onnx_execution_provider = ["DmlExecutionProvider"]
        else:
            self.logger.warning("DmlExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")

    def get_package_distribution(self, package_name):
        """
        This method returns the package distribution for a given package name if installed, or None otherwise.
        """
        try:
            return metadata.distribution(package_name)
        except metadata.PackageNotFoundError:
            self.logger.debug(f"Python package: {package_name} not installed")
            return None

    def get_model_hash(self, model_path):
        """
        This method returns the MD5 hash of a given model file.
        """
        self.logger.debug(f"Calculating hash of model file {model_path}")
        # Use the specific byte count from the original logic
        BYTES_TO_HASH = 10000 * 1024  # 10,240,000 bytes

        try:
            file_size = os.path.getsize(model_path)

            with open(model_path, "rb") as f:
                if file_size < BYTES_TO_HASH:
                    # Hash the entire file if smaller than the target byte count
                    self.logger.debug(f"File size {file_size} < {BYTES_TO_HASH}, hashing entire file.")
                    hash_value = hashlib.md5(f.read()).hexdigest()
                else:
                    # Seek to the specific position before the end (from the beginning) and hash
                    seek_pos = file_size - BYTES_TO_HASH
                    self.logger.debug(f"File size {file_size} >= {BYTES_TO_HASH}, seeking to {seek_pos} and hashing remaining bytes.")
                    f.seek(seek_pos, io.SEEK_SET)
                    hash_value = hashlib.md5(f.read()).hexdigest()

            # Log the calculated hash
            self.logger.info(f"Hash of model file {model_path} is {hash_value}")
            return hash_value

        except FileNotFoundError:
            self.logger.error(f"Model file not found at {model_path}")
            raise  # Re-raise the specific error
        except Exception as e:
            # Catch other potential errors (e.g., permissions, other IOErrors)
            self.logger.error(f"Error calculating hash for {model_path}: {e}")
            raise  # Re-raise other errors

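For illustration (not part of this file): a standalone version of the partial-hash scheme above. Files of at least 10,240,000 bytes are identified by the MD5 of their final 10,240,000 bytes; smaller files by the MD5 of their full contents.

import hashlib
import io
import os

def tail_md5(path, bytes_to_hash=10000 * 1024):
    size = os.path.getsize(path)
    with open(path, "rb") as f:
        if size >= bytes_to_hash:
            f.seek(size - bytes_to_hash, io.SEEK_SET)  # hash only the tail of large files
        return hashlib.md5(f.read()).hexdigest()
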
    def download_file_if_not_exists(self, url, output_path):
        """
        This method downloads a file from a given URL to a given output path, if the file does not already exist.
        """

        if os.path.isfile(output_path):
            self.logger.debug(f"File already exists at {output_path}, skipping download")
            return

        self.logger.debug(f"Downloading file from {url} to {output_path} with timeout 300s")
        response = requests.get(url, stream=True, timeout=300)

        if response.status_code == 200:
            total_size_in_bytes = int(response.headers.get("content-length", 0))
            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    progress_bar.update(len(chunk))
                    f.write(chunk)
            progress_bar.close()
        else:
            raise RuntimeError(f"Failed to download file from {url}, response code: {response.status_code}")

    def list_supported_model_files(self):
        """
        This method lists the supported model files for audio-separator, by fetching the same file UVR uses to list these.
        Also includes model performance scores where available.

        Example response object:

        {
            "MDX": {
                "MDX-Net Model VIP: UVR-MDX-NET-Inst_full_292": {
                    "filename": "UVR-MDX-NET-Inst_full_292.onnx",
                    "scores": {
                        "vocals": {"SDR": 10.6497, "SIR": 20.3786, "SAR": 10.692, "ISR": 14.848},
                        "instrumental": {"SDR": 15.2149, "SIR": 25.6075, "SAR": 17.1363, "ISR": 17.7893}
                    },
                    "download_files": [
                        "UVR-MDX-NET-Inst_full_292.onnx"
                    ]
                }
            },
            "Demucs": {
                "Demucs v4: htdemucs_ft": {
                    "filename": "htdemucs_ft.yaml",
                    "scores": {
                        "vocals": {"SDR": 11.2685, "SIR": 21.257, "SAR": 11.0359, "ISR": 19.3753},
                        "drums": {"SDR": 13.235, "SIR": 23.3053, "SAR": 13.0313, "ISR": 17.2889},
                        "bass": {"SDR": 9.72743, "SIR": 19.5435, "SAR": 9.20801, "ISR": 13.5037}
                    },
                    "download_files": [
                        "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th",
                        "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th",
                        "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th",
                        "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th",
                        "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml"
                    ]
                }
            },
            "MDXC": {
                "MDX23C Model: MDX23C-InstVoc HQ": {
                    "filename": "MDX23C-8KFFT-InstVoc_HQ.ckpt",
                    "scores": {
                        "vocals": {"SDR": 11.9504, "SIR": 23.1166, "SAR": 12.093, "ISR": 15.4782},
                        "instrumental": {"SDR": 16.3035, "SIR": 26.6161, "SAR": 18.5167, "ISR": 18.3939}
                    },
                    "download_files": [
                        "MDX23C-8KFFT-InstVoc_HQ.ckpt",
                        "model_2_stem_full_band_8k.yaml"
                    ]
                }
            }
        }
        """
        download_checks_path = os.path.join(self.model_file_dir, "download_checks.json")

        self.download_file_if_not_exists("https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json", download_checks_path)

        model_downloads_list = json.load(open(download_checks_path, encoding="utf-8"))
        self.logger.debug("UVR model download list loaded")

        # Load the model scores with error handling
        model_scores = {}
        try:
            with resources.open_text("audio_separator", "models-scores.json") as f:
                model_scores = json.load(f)
            self.logger.debug("Model scores loaded")
        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to load model scores: {str(e)}")
            self.logger.warning("Continuing without model scores")

        # Only show Demucs v4 models as we've only implemented support for v4
        filtered_demucs_v4 = {key: value for key, value in model_downloads_list["demucs_download_list"].items() if key.startswith("Demucs v4")}

        # Modified Demucs handling to use YAML files as identifiers and include download files
        demucs_models = {}
        for name, files in filtered_demucs_v4.items():
            # Find the YAML file in the model files
            yaml_file = next((filename for filename in files.keys() if filename.endswith(".yaml")), None)
            if yaml_file:
                model_score_data = model_scores.get(yaml_file, {})
                demucs_models[name] = {
                    "filename": yaml_file,
                    "scores": model_score_data.get("median_scores", {}),
                    "stems": model_score_data.get("stems", []),
                    "target_stem": model_score_data.get("target_stem"),
                    "download_files": list(files.values()),  # List of all download URLs/filenames
                }

        # Load the JSON file using importlib.resources
        with resources.open_text("audio_separator", "models.json") as f:
            audio_separator_models_list = json.load(f)
        self.logger.debug("Audio-Separator model list loaded")

        # Return object with list of model names
        model_files_grouped_by_type = {
            "VR": {
                name: {
                    "filename": filename,
                    "scores": model_scores.get(filename, {}).get("median_scores", {}),
                    "stems": model_scores.get(filename, {}).get("stems", []),
                    "target_stem": model_scores.get(filename, {}).get("target_stem"),
                    "download_files": [filename],
                }  # Just the filename for VR models
                for name, filename in {**model_downloads_list["vr_download_list"], **audio_separator_models_list["vr_download_list"]}.items()
            },
            "MDX": {
                name: {
                    "filename": filename,
                    "scores": model_scores.get(filename, {}).get("median_scores", {}),
                    "stems": model_scores.get(filename, {}).get("stems", []),
                    "target_stem": model_scores.get(filename, {}).get("target_stem"),
                    "download_files": [filename],
                }  # Just the filename for MDX models
                for name, filename in {**model_downloads_list["mdx_download_list"], **model_downloads_list["mdx_download_vip_list"], **audio_separator_models_list["mdx_download_list"]}.items()
            },
            "Demucs": demucs_models,
            "MDXC": {
                name: {
                    "filename": next(iter(files.keys())),
                    "scores": model_scores.get(next(iter(files.keys())), {}).get("median_scores", {}),
                    "stems": model_scores.get(next(iter(files.keys())), {}).get("stems", []),
                    "target_stem": model_scores.get(next(iter(files.keys())), {}).get("target_stem"),
                    "download_files": list(files.keys()) + list(files.values()),  # List of both model filenames and config filenames
                }
                for name, files in {
                    **model_downloads_list["mdx23c_download_list"],
                    **model_downloads_list["mdx23c_download_vip_list"],
                    **model_downloads_list["roformer_download_list"],
                    **audio_separator_models_list["mdx23c_download_list"],
                    **audio_separator_models_list["roformer_download_list"],
                }.items()
            },
        }

        return model_files_grouped_by_type

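For illustration (not part of this file): printing the supported models grouped by architecture. This needs network access to fetch the UVR download list; info_only=True skips device setup.

separator = Separator(info_only=True)
for arch, models in separator.list_supported_model_files().items():
    for friendly_name, info in models.items():
        print(arch, "|", friendly_name, "|", info["filename"])
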
    def print_uvr_vip_message(self):
        """
        This method prints a message to the user if they have downloaded a VIP model, reminding them to support Anjok07 on Patreon.
        """
        if self.model_is_uvr_vip:
            self.logger.warning(f"The model: '{self.model_friendly_name}' is a VIP model, intended by Anjok07 for access by paying subscribers only.")
            self.logger.warning("If you are not already subscribed, please consider supporting the developer of UVR, Anjok07, by subscribing here: https://patreon.com/uvr")

    def download_model_files(self, model_filename):
        """
        This method downloads the model files for a given model filename, if they are not already present.
        Returns a tuple of (model_filename, model_type, model_friendly_name, model_path, yaml_config_filename).
        """
        model_path = os.path.join(self.model_file_dir, f"{model_filename}")

        supported_model_files_grouped = self.list_supported_model_files()
        public_model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models"
        vip_model_repo_url_prefix = "https://github.com/Anjok0109/ai_magic/releases/download/v5"
        audio_separator_models_repo_url_prefix = "https://github.com/nomadkaraoke/python-audio-separator/releases/download/model-configs"

        yaml_config_filename = None

        self.logger.debug(f"Searching for model_filename {model_filename} in supported_model_files_grouped")

        # Iterate through model types (VR, MDX, Demucs, MDXC)
        for model_type, models in supported_model_files_grouped.items():
            # Iterate through each model in this type
            for model_friendly_name, model_info in models.items():
                self.model_is_uvr_vip = "VIP" in model_friendly_name
                model_repo_url_prefix = vip_model_repo_url_prefix if self.model_is_uvr_vip else public_model_repo_url_prefix

                # Check if this model matches our target filename
                if model_info["filename"] == model_filename or model_filename in model_info["download_files"]:
                    self.logger.debug(f"Found matching model: {model_friendly_name}")
                    self.model_friendly_name = model_friendly_name
                    self.print_uvr_vip_message()

                    # Download each required file for this model
                    for file_to_download in model_info["download_files"]:
                        # For URLs, extract just the filename portion
                        if file_to_download.startswith("http"):
                            filename = file_to_download.split("/")[-1]
                            download_path = os.path.join(self.model_file_dir, filename)
                            self.download_file_if_not_exists(file_to_download, download_path)
                            continue

                        download_path = os.path.join(self.model_file_dir, file_to_download)

                        # For MDXC models, handle YAML config files specially
                        if model_type == "MDXC" and file_to_download.endswith(".yaml"):
                            yaml_config_filename = file_to_download
                            try:
                                yaml_url = f"{model_repo_url_prefix}/mdx_model_data/mdx_c_configs/{file_to_download}"
                                self.download_file_if_not_exists(yaml_url, download_path)
                            except RuntimeError:
                                self.logger.debug("YAML config not found in UVR repo, trying audio-separator models repo...")
                                yaml_url = f"{audio_separator_models_repo_url_prefix}/{file_to_download}"
                                self.download_file_if_not_exists(yaml_url, download_path)
                            continue

                        # For regular model files, try the UVR repo first, then the audio-separator repo
                        try:
                            download_url = f"{model_repo_url_prefix}/{file_to_download}"
                            self.download_file_if_not_exists(download_url, download_path)
                        except RuntimeError:
                            self.logger.debug("Model not found in UVR repo, trying audio-separator models repo...")
                            download_url = f"{audio_separator_models_repo_url_prefix}/{file_to_download}"
                            self.download_file_if_not_exists(download_url, download_path)

                    return model_filename, model_type, model_friendly_name, model_path, yaml_config_filename

        raise ValueError(f"Model file {model_filename} not found in supported model files")

    def load_model_data_from_yaml(self, yaml_config_filename):
        """
        This method loads model-specific parameters from the YAML file for that model.
        The parameters in the YAML are critical to inferencing, as they need to match whatever was used during training.
        """
        # Verify if the YAML filename includes a full path or just the filename
        if not os.path.exists(yaml_config_filename):
            model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename)
        else:
            model_data_yaml_filepath = yaml_config_filename

        self.logger.debug(f"Loading model data from YAML at path {model_data_yaml_filepath}")

        model_data = yaml.load(open(model_data_yaml_filepath, encoding="utf-8"), Loader=yaml.FullLoader)
        self.logger.debug(f"Model data loaded from YAML file: {model_data}")

        if "roformer" in model_data_yaml_filepath:
            model_data["is_roformer"] = True

        return model_data

    def load_model_data_using_hash(self, model_path):
        """
        This method loads model-specific parameters from UVR model data files.
        These parameters are critical to inferencing using a given model, as they need to match whatever was used during training.
        The correct parameters are identified by calculating the hash of the model file and looking up the hash in the UVR data files.
        """
        # Model data and configuration sources from UVR
        model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main"

        vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json"
        mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json"

        # Calculate hash for the downloaded model
        self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...")
        model_hash = self.get_model_hash(model_path)
        self.logger.debug(f"Model {model_path} has hash {model_hash}")

        # Setting up the path for model data and checking its existence
        vr_model_data_path = os.path.join(self.model_file_dir, "vr_model_data.json")
        self.logger.debug(f"VR model data path set to {vr_model_data_path}")
        self.download_file_if_not_exists(vr_model_data_url, vr_model_data_path)

        mdx_model_data_path = os.path.join(self.model_file_dir, "mdx_model_data.json")
        self.logger.debug(f"MDX model data path set to {mdx_model_data_path}")
        self.download_file_if_not_exists(mdx_model_data_url, mdx_model_data_path)

        # Loading model data from UVR
        self.logger.debug("Loading MDX and VR model parameters from UVR model data files...")
        vr_model_data_object = json.load(open(vr_model_data_path, encoding="utf-8"))
        mdx_model_data_object = json.load(open(mdx_model_data_path, encoding="utf-8"))

        # Load additional model data from audio-separator
        self.logger.debug("Loading additional model parameters from audio-separator model data file...")
        with resources.open_text("audio_separator", "model-data.json") as f:
            audio_separator_model_data = json.load(f)

        # Merge the model data objects, with audio-separator data taking precedence
        vr_model_data_object = {**vr_model_data_object, **audio_separator_model_data.get("vr_model_data", {})}
        mdx_model_data_object = {**mdx_model_data_object, **audio_separator_model_data.get("mdx_model_data", {})}

        if model_hash in mdx_model_data_object:
            model_data = mdx_model_data_object[model_hash]
        elif model_hash in vr_model_data_object:
            model_data = vr_model_data_object[model_hash]
        else:
            raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in the UVR model data files for the MDX or VR arch.")

        self.logger.debug(f"Model data loaded using hash {model_hash}: {model_data}")

        return model_data

    def load_model(self, model_filename="model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt"):
        """
        This method instantiates the architecture-specific separation class,
        loading the separation model into memory, downloading it first if necessary.
        """
        self.logger.info(f"Loading model {model_filename}...")

        load_model_start_time = time.perf_counter()

        # Setting up the model path
        model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename)
        model_name = model_filename.split(".")[0]
        self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}, model_path: {model_path}")

        if model_path.lower().endswith(".yaml"):
            yaml_config_filename = model_path

        if yaml_config_filename is not None:
            model_data = self.load_model_data_from_yaml(yaml_config_filename)
        else:
            model_data = self.load_model_data_using_hash(model_path)

        common_params = {
            "logger": self.logger,
            "log_level": self.log_level,
            "torch_device": self.torch_device,
            "torch_device_cpu": self.torch_device_cpu,
            "torch_device_mps": self.torch_device_mps,
            "onnx_execution_provider": self.onnx_execution_provider,
            "model_name": model_name,
            "model_path": model_path,
            "model_data": model_data,
            "output_format": self.output_format,
            "output_bitrate": self.output_bitrate,
            "output_dir": self.output_dir,
            "normalization_threshold": self.normalization_threshold,
            "amplification_threshold": self.amplification_threshold,
            "output_single_stem": self.output_single_stem,
            "invert_using_spec": self.invert_using_spec,
            "sample_rate": self.sample_rate,
            "use_soundfile": self.use_soundfile,
        }

        # Instantiate the appropriate separator class depending on the model type
        separator_classes = {"MDX": "mdx_separator.MDXSeparator", "VR": "vr_separator.VRSeparator", "Demucs": "demucs_separator.DemucsSeparator", "MDXC": "mdxc_separator.MDXCSeparator"}

        if model_type not in self.arch_specific_params or model_type not in separator_classes:
            raise ValueError(f"Model type not supported (yet): {model_type}")

        if model_type == "Demucs" and sys.version_info < (3, 10):
            raise Exception("Demucs models require Python version 3.10 or newer.")

        self.logger.debug(f"Importing module for model type {model_type}: {separator_classes[model_type]}")

        module_name, class_name = separator_classes[model_type].split(".")
        module = importlib.import_module(f"audio_separator.separator.architectures.{module_name}")
        separator_class = getattr(module, class_name)

        self.logger.debug(f"Instantiating separator class for model type {model_type}: {separator_class}")
        self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type])

        # Log the completion of the model load process
        self.logger.debug("Loading model completed.")
        self.logger.info(f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - load_model_start_time)))}')

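For illustration (not part of this file): an end-to-end sketch of loading a model by filename and separating a track. The model and audio filenames are examples, not defaults you must use.

separator = Separator(output_dir="/tmp/stems", output_format="WAV")
separator.load_model("UVR-MDX-NET-Inst_HQ_3.onnx")
output_files = separator.separate("/path/to/song.mp3")
print(output_files)  # e.g. ['song_(Vocals)_UVR-MDX-NET-Inst_HQ_3.wav', 'song_(Instrumental)_UVR-MDX-NET-Inst_HQ_3.wav']
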
def separate(self, audio_file_path, custom_output_names=None):
|
781 |
+
"""
|
782 |
+
Separates the audio file(s) into different stems (e.g., vocals, instruments) using the loaded model.
|
783 |
+
|
784 |
+
This method takes the path to an audio file or a directory containing audio files, processes them through
|
785 |
+
the loaded separation model, and returns the paths to the output files containing the separated audio stems.
|
786 |
+
It handles the entire flow from loading the audio, running the separation, clearing up resources, and logging the process.
|
787 |
+
|
788 |
+
Parameters:
|
789 |
+
- audio_file_path (str or list): The path to the audio file or directory, or a list of paths.
|
790 |
+
- custom_output_names (dict, optional): Custom names for the output files. Defaults to None.
|
791 |
+
|
792 |
+
Returns:
|
793 |
+
- output_files (list of str): A list containing the paths to the separated audio stem files.
|
794 |
+
"""
|
795 |
+
# Check if the model and device are properly initialized
|
796 |
+
if not (self.torch_device and self.model_instance):
|
797 |
+
raise ValueError("Initialization failed or model not loaded. Please load a model before attempting to separate.")
|
798 |
+
|
799 |
+
# If audio_file_path is a string, convert it to a list for uniform processing
|
800 |
+
if isinstance(audio_file_path, str):
|
801 |
+
audio_file_path = [audio_file_path]
|
802 |
+
|
803 |
+
# Initialize a list to store paths of all output files
|
804 |
+
output_files = []
|
805 |
+
|
806 |
+
# Process each path in the list
|
807 |
+
for path in audio_file_path:
|
808 |
+
if os.path.isdir(path):
|
809 |
+
# If the path is a directory, recursively search for all audio files
|
810 |
+
for root, dirs, files in os.walk(path):
|
811 |
+
for file in files:
|
812 |
+
# Check the file extension to ensure it's an audio file
|
813 |
+
if file.endswith((".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aiff", ".ac3")): # Add other formats if needed
|
814 |
+
full_path = os.path.join(root, file)
|
815 |
+
self.logger.info(f"Processing file: {full_path}")
|
816 |
+
try:
|
817 |
+
# Perform separation for each file
|
818 |
+
files_output = self._separate_file(full_path, custom_output_names)
|
819 |
+
output_files.extend(files_output)
|
820 |
+
except Exception as e:
|
821 |
+
self.logger.error(f"Failed to process file {full_path}: {e}")
|
822 |
+
else:
|
823 |
+
# If the path is a file, process it directly
|
824 |
+
self.logger.info(f"Processing file: {path}")
|
825 |
+
try:
|
826 |
+
files_output = self._separate_file(path, custom_output_names)
|
827 |
+
output_files.extend(files_output)
|
828 |
+
except Exception as e:
|
829 |
+
self.logger.error(f"Failed to process file {path}: {e}")
|
830 |
+
|
831 |
+
return output_files
|
832 |
+
|
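    # Illustrative usage (a sketch, not part of the class): assuming a
    # default-constructed Separator and the model-loading flow above, a typical
    # call sequence looks like this (the model filename is a hypothetical example):
    #
    #     separator = Separator()
    #     separator.load_model(model_filename="UVR-MDX-NET-Inst_HQ_3.onnx")
    #     output_files = separator.separate("song.mp3")  # one output path per stem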
    def _separate_file(self, audio_file_path, custom_output_names=None):
        """
        Internal method to handle separation for a single audio file.

        This method performs the actual separation process for a single audio file. It logs the start and end of the process,
        handles autocast if enabled, and ensures the GPU cache is cleared after processing.

        Parameters:
        - audio_file_path (str): The path to the audio file.
        - custom_output_names (dict, optional): Custom names for the output files. Defaults to None.

        Returns:
        - output_files (list of str): A list containing the paths to the separated audio stem files.
        """
        # Log the start of the separation process
        self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}")
        separate_start_time = time.perf_counter()

        # Log normalization and amplification thresholds
        self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will be lowered to this max amplitude to avoid clipping.")
        self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will be scaled up to this max amplitude if below it.")

        # Run separation method for the loaded model with autocast enabled if supported by the device
        output_files = None
        if self.use_autocast and autocast_mode.is_autocast_available(self.torch_device.type):
            self.logger.debug("Autocast available.")
            with autocast_mode.autocast(self.torch_device.type):
                output_files = self.model_instance.separate(audio_file_path, custom_output_names)
        else:
            self.logger.debug("Autocast unavailable.")
            output_files = self.model_instance.separate(audio_file_path, custom_output_names)

        # Clear GPU cache to free up memory
        self.model_instance.clear_gpu_cache()

        # Unset separation parameters to prevent accidentally re-using the wrong source files or output paths
        self.model_instance.clear_file_specific_paths()

        # Remind the user one more time if they used a VIP model, so the message doesn't get lost in the logs
        self.print_uvr_vip_message()

        # Log the completion of the separation process
        self.logger.debug("Separation process completed.")
        self.logger.info(f'Separation duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - separate_start_time)))}')

        return output_files

    def download_model_and_data(self, model_filename):
        """
        Downloads the model file without loading it into memory.
        """
        self.logger.info(f"Downloading model {model_filename}...")

        model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename)

        if model_path.lower().endswith(".yaml"):
            yaml_config_filename = model_path

        if yaml_config_filename is not None:
            model_data = self.load_model_data_from_yaml(yaml_config_filename)
        else:
            model_data = self.load_model_data_using_hash(model_path)

        model_data_dict_size = len(model_data)

        self.logger.info(f"Model downloaded, type: {model_type}, friendly name: {model_friendly_name}, model_path: {model_path}, model_data: {model_data_dict_size} items")

    def get_simplified_model_list(self, filter_sort_by: Optional[str] = None):
        """
        Returns a simplified, user-friendly list of models with their key metrics.
        Optionally filters and sorts the list based on the specified criteria.

        :param filter_sort_by: Criteria to filter and sort by. Can be "name", "filename", or any stem name.
        """
        model_files = self.list_supported_model_files()
        simplified_list = {}

        for model_type, models in model_files.items():
            for name, data in models.items():
                filename = data["filename"]
                scores = data.get("scores") or {}
                stems = data.get("stems") or []
                target_stem = data.get("target_stem")

                # Format stems with their SDR scores where available
                stems_with_scores = []
                stem_sdr_dict = {}

                # Process each stem from the model's stem list
                for stem in stems:
                    stem_scores = scores.get(stem, {})
                    # Add an asterisk if this is the target stem
                    stem_display = f"{stem}*" if stem == target_stem else stem

                    if isinstance(stem_scores, dict) and "SDR" in stem_scores:
                        sdr = round(stem_scores["SDR"], 1)
                        stems_with_scores.append(f"{stem_display} ({sdr})")
                        stem_sdr_dict[stem.lower()] = sdr
                    else:
                        # Include stem without an SDR score
                        stems_with_scores.append(stem_display)
                        stem_sdr_dict[stem.lower()] = None

                # If no stems are listed, mark the model as Unknown
                if not stems_with_scores:
                    stems_with_scores = ["Unknown"]
                    stem_sdr_dict["unknown"] = None

                simplified_list[filename] = {"Name": name, "Type": model_type, "Stems": stems_with_scores, "SDR": stem_sdr_dict}

        # Filter and sort the list if a filter_sort_by parameter is provided
        if filter_sort_by:
            if filter_sort_by == "name":
                return dict(sorted(simplified_list.items(), key=lambda x: x[1]["Name"]))
            elif filter_sort_by == "filename":
                return dict(sorted(simplified_list.items()))
            else:
                # Convert filter_sort_by to lowercase for case-insensitive comparison
                sort_by_lower = filter_sort_by.lower()
                # Filter out models that don't have the specified stem
                filtered_list = {k: v for k, v in simplified_list.items() if sort_by_lower in v["SDR"]}

                # Sort by SDR score if available, putting None values last
                def sort_key(item):
                    sdr = item[1]["SDR"][sort_by_lower]
                    return (0 if sdr is None else 1, sdr if sdr is not None else float("-inf"))

                return dict(sorted(filtered_list.items(), key=sort_key, reverse=True))

        return simplified_list
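As a quick illustration of the stem filtering above: passing a stem name ranks models by that stem's SDR, with unscored models last. A minimal sketch, assuming a default-constructed Separator as defined in this file:

# List models that can produce a "vocals" stem, best SDR first (sketch)
separator = Separator()
models = separator.get_simplified_model_list(filter_sort_by="vocals")
for filename, info in models.items():
    print(filename, info["Stems"], info["SDR"]["vocals"])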
audio_separator/separator/uvr_lib_v5/__init__.py
ADDED
File without changes
audio_separator/separator/uvr_lib_v5/demucs/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
audio_separator/separator/uvr_lib_v5/demucs/__main__.py
ADDED
@@ -0,0 +1,212 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
import sys
import time
from dataclasses import dataclass, field
from fractions import Fraction

import torch as th
from torch import distributed, nn
from torch.nn.parallel.distributed import DistributedDataParallel

from .augment import FlipChannels, FlipSign, Remix, Shift
from .compressed import StemsSet, build_musdb_metadata, get_musdb_tracks
from .model import Demucs
from .parser import get_name, get_parser
from .raw import Rawset
from .tasnet import ConvTasNet
from .test import evaluate
from .train import train_model, validate_model
from .utils import human_seconds, load_model, save_model, sizeof_fmt


@dataclass
class SavedState:
    metrics: list = field(default_factory=list)
    last_state: dict = None
    best_state: dict = None
    optimizer: dict = None


def main():
    parser = get_parser()
    args = parser.parse_args()
    name = get_name(parser, args)
    print(f"Experiment {name}")

    if args.musdb is None and args.rank == 0:
        print("You must provide the path to the MusDB dataset with the --musdb flag. To download the MusDB dataset, see https://sigsep.github.io/datasets/musdb.html.", file=sys.stderr)
        sys.exit(1)

    eval_folder = args.evals / name
    eval_folder.mkdir(exist_ok=True, parents=True)
    args.logs.mkdir(exist_ok=True)
    metrics_path = args.logs / f"{name}.json"
    eval_folder.mkdir(exist_ok=True, parents=True)
    args.checkpoints.mkdir(exist_ok=True, parents=True)
    args.models.mkdir(exist_ok=True, parents=True)

    if args.device is None:
        device = "cpu"
        if th.cuda.is_available():
            device = "cuda"
    else:
        device = args.device

    th.manual_seed(args.seed)
    # Prevents too many threads from being started when running `museval`,
    # as it can be quite inefficient on NUMA architectures.
    os.environ["OMP_NUM_THREADS"] = "1"

    if args.world_size > 1:
        if device != "cuda" and args.rank == 0:
            print("Error: distributed training is only available with cuda device", file=sys.stderr)
            sys.exit(1)
        th.cuda.set_device(args.rank % th.cuda.device_count())
        distributed.init_process_group(backend="nccl", init_method="tcp://" + args.master, rank=args.rank, world_size=args.world_size)

    checkpoint = args.checkpoints / f"{name}.th"
    checkpoint_tmp = args.checkpoints / f"{name}.th.tmp"
    if args.restart and checkpoint.exists():
        checkpoint.unlink()

    if args.test:
        args.epochs = 1
        args.repeat = 0
        model = load_model(args.models / args.test)
    elif args.tasnet:
        model = ConvTasNet(audio_channels=args.audio_channels, samplerate=args.samplerate, X=args.X)
    else:
        model = Demucs(
            audio_channels=args.audio_channels,
            channels=args.channels,
            context=args.context,
            depth=args.depth,
            glu=args.glu,
            growth=args.growth,
            kernel_size=args.kernel_size,
            lstm_layers=args.lstm_layers,
            rescale=args.rescale,
            rewrite=args.rewrite,
            sources=4,
            stride=args.conv_stride,
            upsample=args.upsample,
            samplerate=args.samplerate,
        )
    model.to(device)
    if args.show:
        print(model)
        size = sizeof_fmt(4 * sum(p.numel() for p in model.parameters()))
        print(f"Model size {size}")
        return

    optimizer = th.optim.Adam(model.parameters(), lr=args.lr)

    try:
        saved = th.load(checkpoint, map_location="cpu")
    except IOError:
        saved = SavedState()
    else:
        model.load_state_dict(saved.last_state)
        optimizer.load_state_dict(saved.optimizer)

    if args.save_model:
        if args.rank == 0:
            model.to("cpu")
            model.load_state_dict(saved.best_state)
            save_model(model, args.models / f"{name}.th")
        return

    if args.rank == 0:
        done = args.logs / f"{name}.done"
        if done.exists():
            done.unlink()

    if args.augment:
        augment = nn.Sequential(FlipSign(), FlipChannels(), Shift(args.data_stride), Remix(group_size=args.remix_group_size)).to(device)
    else:
        augment = Shift(args.data_stride)

    if args.mse:
        criterion = nn.MSELoss()
    else:
        criterion = nn.L1Loss()

    # Setting number of samples so that all convolution windows are full.
    # Prevents hard to debug mistake with the prediction being shifted compared
    # to the input mixture.
    samples = model.valid_length(args.samples)
    print(f"Number of training samples adjusted to {samples}")

    if args.raw:
        train_set = Rawset(args.raw / "train", samples=samples + args.data_stride, channels=args.audio_channels, streams=[0, 1, 2, 3, 4], stride=args.data_stride)

        valid_set = Rawset(args.raw / "valid", channels=args.audio_channels)
    else:
        if not args.metadata.is_file() and args.rank == 0:
            build_musdb_metadata(args.metadata, args.musdb, args.workers)
        if args.world_size > 1:
            distributed.barrier()
        metadata = json.load(open(args.metadata))
        duration = Fraction(samples + args.data_stride, args.samplerate)
        stride = Fraction(args.data_stride, args.samplerate)
        train_set = StemsSet(get_musdb_tracks(args.musdb, subsets=["train"], split="train"), metadata, duration=duration, stride=stride, samplerate=args.samplerate, channels=args.audio_channels)
        valid_set = StemsSet(get_musdb_tracks(args.musdb, subsets=["train"], split="valid"), metadata, samplerate=args.samplerate, channels=args.audio_channels)

    best_loss = float("inf")
    for epoch, metrics in enumerate(saved.metrics):
        print(f"Epoch {epoch:03d}: train={metrics['train']:.8f} valid={metrics['valid']:.8f} best={metrics['best']:.4f} duration={human_seconds(metrics['duration'])}")
        best_loss = metrics["best"]

    if args.world_size > 1:
        dmodel = DistributedDataParallel(model, device_ids=[th.cuda.current_device()], output_device=th.cuda.current_device())
    else:
        dmodel = model

    for epoch in range(len(saved.metrics), args.epochs):
        begin = time.time()
        model.train()
        train_loss = train_model(
            epoch, train_set, dmodel, criterion, optimizer, augment, batch_size=args.batch_size, device=device, repeat=args.repeat, seed=args.seed, workers=args.workers, world_size=args.world_size
        )
        model.eval()
        valid_loss = validate_model(epoch, valid_set, model, criterion, device=device, rank=args.rank, split=args.split_valid, world_size=args.world_size)

        duration = time.time() - begin
        if valid_loss < best_loss:
            best_loss = valid_loss
            saved.best_state = {key: value.to("cpu").clone() for key, value in model.state_dict().items()}
        saved.metrics.append({"train": train_loss, "valid": valid_loss, "best": best_loss, "duration": duration})
        if args.rank == 0:
            json.dump(saved.metrics, open(metrics_path, "w"))

        saved.last_state = model.state_dict()
        saved.optimizer = optimizer.state_dict()
        if args.rank == 0 and not args.test:
            th.save(saved, checkpoint_tmp)
            checkpoint_tmp.rename(checkpoint)

        print(f"Epoch {epoch:03d}: train={train_loss:.8f} valid={valid_loss:.8f} best={best_loss:.4f} duration={human_seconds(duration)}")

    del dmodel
    model.load_state_dict(saved.best_state)
    if args.eval_cpu:
        device = "cpu"
        model.to(device)
    model.eval()
    evaluate(model, args.musdb, eval_folder, rank=args.rank, world_size=args.world_size, device=device, save=args.save, split=args.split_valid, shifts=args.shifts, workers=args.eval_workers)
    model.to("cpu")
    save_model(model, args.models / f"{name}.th")
    if args.rank == 0:
        print("done")
        done.write_text("done")


if __name__ == "__main__":
    main()
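For orientation: this is the original Demucs v1/v2 training entrypoint, vendored into this repo. Only the --musdb flag is referenced directly above (the rest of the flags come from the .parser module, which is not shown here), so a minimal launch would look roughly like the sketch below; the module path is an assumption based on this repo's layout:

# Sketch: train on a local MusDB copy (all other flags take parser defaults)
python -m audio_separator.separator.uvr_lib_v5.demucs --musdb /path/to/musdb18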
audio_separator/separator/uvr_lib_v5/demucs/apply.py
ADDED
@@ -0,0 +1,294 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Code to apply a model to a mix. It will handle chunking with overlaps and
interpolation between chunks, as well as the "shift trick".
"""
from concurrent.futures import ThreadPoolExecutor
import random
import typing as tp

import torch as th
from torch import nn
from torch.nn import functional as F
import tqdm

from .demucs import Demucs
from .hdemucs import HDemucs
from .utils import center_trim, DummyPoolExecutor

Model = tp.Union[Demucs, HDemucs]

progress_bar_num = 0


class BagOfModels(nn.Module):
    def __init__(self, models: tp.List[Model], weights: tp.Optional[tp.List[tp.List[float]]] = None, segment: tp.Optional[float] = None):
        """
        Represents a bag of models with specific weights.
        You should call `apply_model` rather than calling directly the forward here for
        optimal performance.

        Args:
            models (list[nn.Module]): list of Demucs/HDemucs models.
            weights (list[list[float]]): list of weights. If None, assumed to
                be all ones, otherwise it should be a list of N lists (N number of models),
                each containing S floats (S number of sources).
            segment (None or float): overrides the `segment` attribute of each model
                (this is performed inplace, be careful if you reuse the models passed).
        """

        super().__init__()
        assert len(models) > 0
        first = models[0]
        for other in models:
            assert other.sources == first.sources
            assert other.samplerate == first.samplerate
            assert other.audio_channels == first.audio_channels
            if segment is not None:
                other.segment = segment

        self.audio_channels = first.audio_channels
        self.samplerate = first.samplerate
        self.sources = first.sources
        self.models = nn.ModuleList(models)

        if weights is None:
            weights = [[1.0 for _ in first.sources] for _ in models]
        else:
            assert len(weights) == len(models)
            for weight in weights:
                assert len(weight) == len(first.sources)
        self.weights = weights

    def forward(self, x):
        raise NotImplementedError("Call `apply_model` on this.")


class TensorChunk:
    def __init__(self, tensor, offset=0, length=None):
        total_length = tensor.shape[-1]
        assert offset >= 0
        assert offset < total_length

        if length is None:
            length = total_length - offset
        else:
            length = min(total_length - offset, length)

        if isinstance(tensor, TensorChunk):
            self.tensor = tensor.tensor
            self.offset = offset + tensor.offset
        else:
            self.tensor = tensor
            self.offset = offset
        self.length = length
        self.device = tensor.device

    @property
    def shape(self):
        shape = list(self.tensor.shape)
        shape[-1] = self.length
        return shape

    def padded(self, target_length):
        delta = target_length - self.length
        total_length = self.tensor.shape[-1]
        assert delta >= 0

        start = self.offset - delta // 2
        end = start + target_length

        correct_start = max(0, start)
        correct_end = min(total_length, end)

        pad_left = correct_start - start
        pad_right = end - correct_end

        out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right))
        assert out.shape[-1] == target_length
        return out


def tensor_chunk(tensor_or_chunk):
    if isinstance(tensor_or_chunk, TensorChunk):
        return tensor_or_chunk
    else:
        assert isinstance(tensor_or_chunk, th.Tensor)
        return TensorChunk(tensor_or_chunk)


def apply_model(model, mix, shifts=1, split=True, overlap=0.25, transition_power=1.0, static_shifts=1, set_progress_bar=None, device=None, progress=False, num_workers=0, pool=None):
    """
    Apply model to a given mixture.

    Args:
        shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec
            and apply the opposite shift to the output. This is repeated `shifts` times and
            all predictions are averaged. This effectively makes the model time equivariant
            and improves SDR by up to 0.2 points.
        split (bool): if True, the input will be broken down into 8 second extracts
            and predictions will be performed individually on each and concatenated.
            Useful for models with a large memory footprint like Tasnet.
        progress (bool): if True, show a progress bar (requires split=True)
        device (torch.device, str, or None): if provided, device on which to
            execute the computation, otherwise `mix.device` is assumed.
            When `device` is different from `mix.device`, only local computations will
            be on `device`, while the entire tracks will be stored on `mix.device`.
    """

    global fut_length
    global bag_num
    global prog_bar

    if device is None:
        device = mix.device
    else:
        device = th.device(device)
    if pool is None:
        if num_workers > 0 and device.type == "cpu":
            pool = ThreadPoolExecutor(num_workers)
        else:
            pool = DummyPoolExecutor()

    kwargs = {
        "shifts": shifts,
        "split": split,
        "overlap": overlap,
        "transition_power": transition_power,
        "progress": progress,
        "device": device,
        "pool": pool,
        "set_progress_bar": set_progress_bar,
        "static_shifts": static_shifts,
    }

    if isinstance(model, BagOfModels):
        # Special treatment for a bag of models.
        # We explicitly apply `apply_model` multiple times so that the random shifts
        # are different for each model.

        estimates = 0
        totals = [0] * len(model.sources)
        bag_num = len(model.models)
        fut_length = 0
        prog_bar = 0
        current_model = 0  # (bag_num + 1)
        for sub_model, weight in zip(model.models, model.weights):
            original_model_device = next(iter(sub_model.parameters())).device
            sub_model.to(device)
            fut_length += fut_length
            current_model += 1
            out = apply_model(sub_model, mix, **kwargs)
            sub_model.to(original_model_device)
            for k, inst_weight in enumerate(weight):
                out[:, k, :, :] *= inst_weight
                totals[k] += inst_weight
            estimates += out
            del out

        for k in range(estimates.shape[1]):
            estimates[:, k, :, :] /= totals[k]
        return estimates

    model.to(device)
    model.eval()
    assert transition_power >= 1, "transition_power < 1 leads to weird behavior."
    batch, channels, length = mix.shape

    if shifts:
        kwargs["shifts"] = 0
        max_shift = int(0.5 * model.samplerate)
        mix = tensor_chunk(mix)
        padded_mix = mix.padded(length + 2 * max_shift)
        out = 0
        for _ in range(shifts):
            offset = random.randint(0, max_shift)
            shifted = TensorChunk(padded_mix, offset, length + max_shift - offset)
            shifted_out = apply_model(model, shifted, **kwargs)
            out += shifted_out[..., max_shift - offset :]
        out /= shifts
        return out
    elif split:
        kwargs["split"] = False
        out = th.zeros(batch, len(model.sources), channels, length, device=mix.device)
        sum_weight = th.zeros(length, device=mix.device)
        segment = int(model.samplerate * model.segment)
        stride = int((1 - overlap) * segment)
        offsets = range(0, length, stride)
        scale = float(format(stride / model.samplerate, ".2f"))
        # We start from a triangle shaped weight, with maximal weight in the middle
        # of the segment. Then we normalize and take to the power `transition_power`.
        # Large values of transition power will lead to sharper transitions.
        weight = th.cat([th.arange(1, segment // 2 + 1, device=device), th.arange(segment - segment // 2, 0, -1, device=device)])
        assert len(weight) == segment
        # If the overlap < 50%, this will translate to linear transition when
        # transition_power is 1.
        weight = (weight / weight.max()) ** transition_power
        futures = []
        for offset in offsets:
            chunk = TensorChunk(mix, offset, segment)
            future = pool.submit(apply_model, model, chunk, **kwargs)
            futures.append((future, offset))
            offset += segment
        if progress:
            futures = tqdm.tqdm(futures)
        for future, offset in futures:
            if set_progress_bar:
                fut_length = len(futures) * bag_num * static_shifts
                prog_bar += 1
                set_progress_bar(0.1, (0.8 / fut_length * prog_bar))
            chunk_out = future.result()
            chunk_length = chunk_out.shape[-1]
            out[..., offset : offset + segment] += (weight[:chunk_length] * chunk_out).to(mix.device)
            sum_weight[offset : offset + segment] += weight[:chunk_length].to(mix.device)
        assert sum_weight.min() > 0
        out /= sum_weight
        return out
    else:
        if hasattr(model, "valid_length"):
            valid_length = model.valid_length(length)
        else:
            valid_length = length
        mix = tensor_chunk(mix)
        padded_mix = mix.padded(valid_length).to(device)
        with th.no_grad():
            out = model(padded_mix)
        return center_trim(out, length)


def demucs_segments(demucs_segment, demucs_model):

    if demucs_segment == "Default":
        segment = None
        if isinstance(demucs_model, BagOfModels):
            if segment is not None:
                for sub in demucs_model.models:
                    sub.segment = segment
        else:
            # single model: apply directly (the original referenced an undefined `sub` here)
            if segment is not None:
                demucs_model.segment = segment
    else:
        try:
            segment = int(demucs_segment)
            if isinstance(demucs_model, BagOfModels):
                if segment is not None:
                    for sub in demucs_model.models:
                        sub.segment = segment
            else:
                if segment is not None:
                    demucs_model.segment = segment
        except:
            segment = None
            if isinstance(demucs_model, BagOfModels):
                if segment is not None:
                    for sub in demucs_model.models:
                        sub.segment = segment
            else:
                if segment is not None:
                    demucs_model.segment = segment

    return demucs_model
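The chunked path in apply_model above blends overlapping windows with a triangular weight and then divides by the accumulated weight per sample. The self-contained sketch below reproduces that arithmetic on a toy signal to show the normalization reconstructs a constant input exactly (a minimal sketch for the transition_power=1 case; segment, stride, and length values are arbitrary):

import torch

segment, overlap, length = 16, 0.25, 64
stride = int((1 - overlap) * segment)

# Triangular weight, as built in apply_model above
weight = torch.cat([torch.arange(1, segment // 2 + 1), torch.arange(segment - segment // 2, 0, -1)]).float()
weight = weight / weight.max()

x = torch.ones(length)            # toy "separated" chunks: all ones
out = torch.zeros(length)
sum_weight = torch.zeros(length)
for offset in range(0, length, stride):
    chunk = x[offset : offset + segment]
    out[offset : offset + segment] += weight[: len(chunk)] * chunk
    sum_weight[offset : offset + segment] += weight[: len(chunk)]
out /= sum_weight                 # every sample was covered with positive weight
assert torch.allclose(out, x)     # overlap-add with normalization is exact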
audio_separator/separator/uvr_lib_v5/demucs/demucs.py
ADDED
@@ -0,0 +1,453 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
import typing as tp

import julius
import torch
from torch import nn
from torch.nn import functional as F

from .states import capture_init
from .utils import center_trim, unfold


class BLSTM(nn.Module):
    """
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be split into overlapping
    chunks and the LSTM applied separately on each chunk.
    """

    def __init__(self, dim, layers=1, max_steps=None, skip=False):
        super().__init__()
        assert max_steps is None or max_steps % 4 == 0
        self.max_steps = max_steps
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)
        self.skip = skip

    def forward(self, x):
        B, C, T = x.shape
        y = x
        framed = False
        if self.max_steps is not None and T > self.max_steps:
            width = self.max_steps
            stride = width // 2
            frames = unfold(x, width, stride)
            nframes = frames.shape[2]
            framed = True
            x = frames.permute(0, 2, 1, 3).reshape(-1, C, width)

        x = x.permute(2, 0, 1)

        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        if framed:
            out = []
            frames = x.reshape(B, -1, C, width)
            limit = stride // 2
            for k in range(nframes):
                if k == 0:
                    out.append(frames[:, k, :, :-limit])
                elif k == nframes - 1:
                    out.append(frames[:, k, :, limit:])
                else:
                    out.append(frames[:, k, :, limit:-limit])
            out = torch.cat(out, -1)
            out = out[..., :T]
            x = out
        if self.skip:
            x = x + y
        return x


def rescale_conv(conv, reference):
    """Rescale initial weight scale. It is unclear why it helps but it certainly does."""
    std = conv.weight.std().detach()
    scale = (std / reference) ** 0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)):
            rescale_conv(sub, reference)


class LayerScale(nn.Module):
    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally the residual outputs, close to 0 initially, then learnt.
    """

    def __init__(self, channels: int, init: float = 0):
        super().__init__()
        self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True))
        self.scale.data[:] = init

    def forward(self, x):
        return self.scale[:, None] * x


class DConv(nn.Module):
    """
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.
    """

    def __init__(self, channels: int, compress: float = 4, depth: int = 2, init: float = 1e-4, norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True, kernel=3, dilate=True):
        """
        Args:
            channels: input/output channels for residual branch.
            compress: amount of channel compression inside the branch.
            depth: number of layers in the residual branch. Each layer has its own
                projection, and potentially LSTM and attention.
            init: initial scale for LayerNorm.
            norm: use GroupNorm.
            attn: use LocalAttention.
            heads: number of heads for the LocalAttention.
            ndecay: number of decay controls in the LocalAttention.
            lstm: use LSTM.
            gelu: Use GELU activation.
            kernel: kernel size for the (dilated) convolutions.
            dilate: if true, use dilation, increasing with the depth.
        """

        super().__init__()
        assert kernel % 2 == 1
        self.channels = channels
        self.compress = compress
        self.depth = abs(depth)
        dilate = depth > 0

        norm_fn: tp.Callable[[int], nn.Module]
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(1, d)  # noqa

        hidden = int(channels / compress)

        act: tp.Type[nn.Module]
        if gelu:
            act = nn.GELU
        else:
            act = nn.ReLU

        self.layers = nn.ModuleList([])
        for d in range(self.depth):
            dilation = 2**d if dilate else 1
            padding = dilation * (kernel // 2)
            mods = [
                nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding),
                norm_fn(hidden),
                act(),
                nn.Conv1d(hidden, 2 * channels, 1),
                norm_fn(2 * channels),
                nn.GLU(1),
                LayerScale(channels, init),
            ]
            if attn:
                mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay))
            if lstm:
                mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True))
            layer = nn.Sequential(*mods)
            self.layers.append(layer)

    def forward(self, x):
        for layer in self.layers:
            x = x + layer(x)
        return x


class LocalState(nn.Module):
    """Local state allows attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).

    Also a failed experiment with trying to provide some frequency based attention.
    """

    def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4):
        super().__init__()
        assert channels % heads == 0, (channels, heads)
        self.heads = heads
        self.nfreqs = nfreqs
        self.ndecay = ndecay
        self.content = nn.Conv1d(channels, channels, 1)
        self.query = nn.Conv1d(channels, channels, 1)
        self.key = nn.Conv1d(channels, channels, 1)
        if nfreqs:
            self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1)
        if ndecay:
            self.query_decay = nn.Conv1d(channels, heads * ndecay, 1)
            # Initialize decay close to zero (there is a sigmoid), for maximum initial window.
            self.query_decay.weight.data *= 0.01
            assert self.query_decay.bias is not None  # stupid type checker
            self.query_decay.bias.data[:] = -2
        self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1)

    def forward(self, x):
        B, C, T = x.shape
        heads = self.heads
        indexes = torch.arange(T, device=x.device, dtype=x.dtype)
        # left index are keys, right index are queries
        delta = indexes[:, None] - indexes[None, :]

        queries = self.query(x).view(B, heads, -1, T)
        keys = self.key(x).view(B, heads, -1, T)
        # t are keys, s are queries
        dots = torch.einsum("bhct,bhcs->bhts", keys, queries)
        dots /= keys.shape[2] ** 0.5
        if self.nfreqs:
            periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype)
            freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1))
            freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs**0.5
            dots += torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q)
        if self.ndecay:
            decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
            decay_q = self.query_decay(x).view(B, heads, -1, T)
            decay_q = torch.sigmoid(decay_q) / 2
            decay_kernel = -decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5
            dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)

        # Kill self reference.
        dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100)
        weights = torch.softmax(dots, dim=2)

        content = self.content(x).view(B, heads, -1, T)
        result = torch.einsum("bhts,bhct->bhcs", weights, content)
        if self.nfreqs:
            time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel)
            result = torch.cat([result, time_sig], 2)
        result = result.reshape(B, -1, T)
        return x + self.proj(result)


class Demucs(nn.Module):
    @capture_init
    def __init__(
        self,
        sources,
        # Channels
        audio_channels=2,
        channels=64,
        growth=2.0,
        # Main structure
        depth=6,
        rewrite=True,
        lstm_layers=0,
        # Convolutions
        kernel_size=8,
        stride=4,
        context=1,
        # Activations
        gelu=True,
        glu=True,
        # Normalization
        norm_starts=4,
        norm_groups=4,
        # DConv residual branch
        dconv_mode=1,
        dconv_depth=2,
        dconv_comp=4,
        dconv_attn=4,
        dconv_lstm=4,
        dconv_init=1e-4,
        # Pre/post processing
        normalize=True,
        resample=True,
        # Weight init
        rescale=0.1,
        # Metadata
        samplerate=44100,
        segment=4 * 10,
    ):
        """
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of layers in the encoder and in the decoder.
            growth (float): multiply (resp divide) number of channels by that
                for each layer of the encoder (resp decoder)
            rewrite (bool): add 1x1 convolution to each layer.
            lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
                by default, as this is now replaced by the smaller and faster small LSTMs
                in the DConv branches.
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time steps.
            gelu: use GELU activation function.
            glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
            norm_starts: layer at which group norm starts being used.
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of residual DConv branch.
            dconv_comp: compression of DConv branch.
            dconv_attn: adds attention layers in DConv branch starting at this layer.
            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            normalize (bool): normalizes the input audio on the fly, and scales back
                the output by the same amount.
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (int): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment (float): duration of the chunks of audio to ideally evaluate the model on.
                This is used by `demucs.apply.apply_model`.
        """

        super().__init__()
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.resample = resample
        self.channels = channels
        self.normalize = normalize
        self.samplerate = samplerate
        self.segment = segment
        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()
        self.skip_scales = nn.ModuleList()

        if glu:
            activation = nn.GLU(dim=1)
            ch_scale = 2
        else:
            activation = nn.ReLU()
            ch_scale = 1
        if gelu:
            act2 = nn.GELU
        else:
            act2 = nn.ReLU

        in_channels = audio_channels
        padding = 0
        for index in range(depth):
            norm_fn = lambda d: nn.Identity()  # noqa
            if index >= norm_starts:
                norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa

            encode = []
            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), norm_fn(channels), act2()]
            attn = index >= dconv_attn
            lstm = index >= dconv_lstm
            if dconv_mode & 1:
                encode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)]
            if rewrite:
                encode += [nn.Conv1d(channels, ch_scale * channels, 1), norm_fn(ch_scale * channels), activation]
            self.encoder.append(nn.Sequential(*encode))

            decode = []
            if index > 0:
                out_channels = in_channels
            else:
                out_channels = len(self.sources) * audio_channels
            if rewrite:
                decode += [nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context), norm_fn(ch_scale * channels), activation]
            if dconv_mode & 2:
                decode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)]
            decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride, padding=padding)]
            if index > 0:
                decode += [norm_fn(out_channels), act2()]
            self.decoder.insert(0, nn.Sequential(*decode))
            in_channels = channels
            channels = int(growth * channels)

        channels = in_channels
        if lstm_layers:
            self.lstm = BLSTM(channels, lstm_layers)
        else:
            self.lstm = None

        if rescale:
            rescale_module(self, reference=rescale)

    def valid_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        no time steps are left over in a convolution, e.g. for all
        layers, size of the input - kernel_size % stride = 0.

        Note that inputs are automatically padded if necessary to ensure that the output
        has the same length as the input.
        """
        if self.resample:
            length *= 2

        for _ in range(self.depth):
            length = math.ceil((length - self.kernel_size) / self.stride) + 1
            length = max(1, length)

        for idx in range(self.depth):
            length = (length - 1) * self.stride + self.kernel_size

        if self.resample:
            length = math.ceil(length / 2)
        return int(length)

    def forward(self, mix):
        x = mix
        length = x.shape[-1]

        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
            x = (x - mean) / (1e-5 + std)
        else:
            mean = 0
            std = 1

        delta = self.valid_length(length) - length
        x = F.pad(x, (delta // 2, delta - delta // 2))

        if self.resample:
            x = julius.resample_frac(x, 1, 2)

        saved = []
        for encode in self.encoder:
            x = encode(x)
            saved.append(x)

        if self.lstm:
            x = self.lstm(x)

        for decode in self.decoder:
            skip = saved.pop(-1)
            skip = center_trim(skip, x)
            x = decode(x + skip)

        if self.resample:
            x = julius.resample_frac(x, 2, 1)
        x = x * std + mean
        x = center_trim(x, length)
        x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
        return x

    def load_state_dict(self, state, strict=True):
        # fix a mismatch with previous generation Demucs models.
        for idx in range(self.depth):
            for a in ["encoder", "decoder"]:
                for b in ["bias", "weight"]:
                    new = f"{a}.{idx}.3.{b}"
                    old = f"{a}.{idx}.2.{b}"
                    if old in state and new not in state:
                        state[new] = state.pop(old)
        super().load_state_dict(state, strict=strict)
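Demucs.valid_length above walks the encoder length arithmetic down and the decoder arithmetic back up so that every convolution window is full. A standalone re-derivation of the same round trip, using the constructor defaults from this file (kernel_size=8, stride=4, depth=6, resample=True); a sketch for illustration only:

import math

def valid_length(length, depth=6, kernel_size=8, stride=4, resample=True):
    # Mirror of Demucs.valid_length: encoder shrinks, decoder expands.
    if resample:
        length *= 2
    for _ in range(depth):
        length = math.ceil((length - kernel_size) / stride) + 1
        length = max(1, length)
    for _ in range(depth):
        length = (length - 1) * stride + kernel_size
    if resample:
        length = math.ceil(length / 2)
    return int(length)

print(valid_length(44100))  # nearest padded length for 1 s of 44.1 kHz audio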
audio_separator/separator/uvr_lib_v5/demucs/filtering.py
ADDED
@@ -0,0 +1,451 @@
from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
from torch.utils.data import DataLoader


def atan2(y, x):
    r"""Element-wise arctangent function of y/x.
    Returns a new tensor with signed angles in radians.
    It is an alternative implementation of torch.atan2

    Args:
        y (Tensor): First input tensor
        x (Tensor): Second input tensor [shape=y.shape]

    Returns:
        Tensor: [shape=y.shape].
    """
    pi = 2 * torch.asin(torch.tensor(1.0))
    # Note: this modifies `x` in place to avoid a division by zero when x == y == 0.
    x += ((x == 0) & (y == 0)) * 1.0
    out = torch.atan(y / x)
    out += ((y >= 0) & (x < 0)) * pi
    out -= ((y < 0) & (x < 0)) * pi
    out *= 1 - ((y > 0) & (x == 0)) * 1.0
    out += ((y > 0) & (x == 0)) * (pi / 2)
    out *= 1 - ((y < 0) & (x == 0)) * 1.0
    out += ((y < 0) & (x == 0)) * (-pi / 2)
    return out


# Define basic complex operations on torch.Tensor objects whose last dimension
# consists in the concatenation of the real and imaginary parts.


def _norm(x: torch.Tensor) -> torch.Tensor:
    r"""Computes the norm value of a torch Tensor, assuming that it
    comes as real and imaginary part in its last dimension.

    Args:
        x (Tensor): Input Tensor of shape [shape=(..., 2)]

    Returns:
        Tensor: shape as x excluding the last dimension.
    """
    return torch.abs(x[..., 0]) ** 2 + torch.abs(x[..., 1]) ** 2


def _mul_add(a: torch.Tensor, b: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Element-wise multiplication of two complex Tensors described
    through their real and imaginary parts.
    The result is added to the `out` tensor"""

    # check `out` and allocate it if needed
    target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)])
    if out is None or out.shape != target_shape:
        out = torch.zeros(target_shape, dtype=a.dtype, device=a.device)
    if out is a:
        real_a = a[..., 0]
        out[..., 0] = out[..., 0] + (real_a * b[..., 0] - a[..., 1] * b[..., 1])
        out[..., 1] = out[..., 1] + (real_a * b[..., 1] + a[..., 1] * b[..., 0])
    else:
        out[..., 0] = out[..., 0] + (a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1])
        out[..., 1] = out[..., 1] + (a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0])
    return out


def _mul(a: torch.Tensor, b: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Element-wise multiplication of two complex Tensors described
    through their real and imaginary parts;
    can work in place in case out is a only"""
    target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)])
    if out is None or out.shape != target_shape:
        out = torch.zeros(target_shape, dtype=a.dtype, device=a.device)
    if out is a:
        real_a = a[..., 0]
        out[..., 0] = real_a * b[..., 0] - a[..., 1] * b[..., 1]
        out[..., 1] = real_a * b[..., 1] + a[..., 1] * b[..., 0]
    else:
        out[..., 0] = a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1]
        out[..., 1] = a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0]
    return out

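# Sanity check (illustrative, not part of the library): the real/imaginary
# last-dimension layout used by these helpers agrees with torch's native
# complex arithmetic, e.g. for _mul:
#
#     a, b = torch.randn(4, 4, 2), torch.randn(4, 4, 2)
#     native = torch.view_as_real(torch.view_as_complex(a) * torch.view_as_complex(b))
#     assert torch.allclose(_mul(a, b), native, atol=1e-6)
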
85 |
+
def _inv(z: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
|
86 |
+
"""Element-wise multiplicative inverse of a Tensor with complex
|
87 |
+
entries described through their real and imaginary parts.
|
88 |
+
can work in place in case out is z"""
|
89 |
+
ez = _norm(z)
|
90 |
+
if out is None or out.shape != z.shape:
|
91 |
+
out = torch.zeros_like(z)
|
92 |
+
out[..., 0] = z[..., 0] / ez
|
93 |
+
out[..., 1] = -z[..., 1] / ez
|
94 |
+
return out
|
95 |
+
|
96 |
+
|
97 |
+
def _conj(z, out: Optional[torch.Tensor] = None) -> torch.Tensor:
|
98 |
+
"""Element-wise complex conjugate of a Tensor with complex entries
|
99 |
+
described through their real and imaginary parts.
|
100 |
+
can work in place in case out is z"""
|
101 |
+
if out is None or out.shape != z.shape:
|
102 |
+
out = torch.zeros_like(z)
|
103 |
+
out[..., 0] = z[..., 0]
|
104 |
+
out[..., 1] = -z[..., 1]
|
105 |
+
return out
|
106 |
+
|
107 |
+
|
108 |
+
def _invert(M: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
|
109 |
+
"""
|
110 |
+
Invert 1x1 or 2x2 matrices
|
111 |
+
|
112 |
+
Will generate errors if the matrices are singular: user must handle this
|
113 |
+
through his own regularization schemes.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
M (Tensor): [shape=(..., nb_channels, nb_channels, 2)]
|
117 |
+
matrices to invert: must be square along dimensions -3 and -2
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
invM (Tensor): [shape=M.shape]
|
121 |
+
inverses of M
|
122 |
+
"""
|
123 |
+
nb_channels = M.shape[-2]
|
124 |
+
|
125 |
+
if out is None or out.shape != M.shape:
|
126 |
+
out = torch.empty_like(M)
|
127 |
+
|
128 |
+
if nb_channels == 1:
|
129 |
+
# scalar case
|
130 |
+
out = _inv(M, out)
|
131 |
+
elif nb_channels == 2:
|
132 |
+
# two channels case: analytical expression
|
133 |
+
|
134 |
+
# first compute the determinent
|
135 |
+
det = _mul(M[..., 0, 0, :], M[..., 1, 1, :])
|
136 |
+
det = det - _mul(M[..., 0, 1, :], M[..., 1, 0, :])
|
137 |
+
# invert it
|
138 |
+
invDet = _inv(det)
|
139 |
+
|
140 |
+
# then fill out the matrix with the inverse
|
141 |
+
out[..., 0, 0, :] = _mul(invDet, M[..., 1, 1, :], out[..., 0, 0, :])
|
142 |
+
out[..., 1, 0, :] = _mul(-invDet, M[..., 1, 0, :], out[..., 1, 0, :])
|
143 |
+
out[..., 0, 1, :] = _mul(-invDet, M[..., 0, 1, :], out[..., 0, 1, :])
|
144 |
+
out[..., 1, 1, :] = _mul(invDet, M[..., 0, 0, :], out[..., 1, 1, :])
|
145 |
+
else:
|
146 |
+
raise Exception("Only 2 channels are supported for the torch version.")
|
147 |
+
return out
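
A minimal sanity-check sketch (not part of the uploaded file): these real/imag-last-dim helpers should agree with PyTorch's native complex arithmetic. It assumes `_mul` and `_invert` above are in scope.

import torch

# element-wise complex product against torch.view_as_complex
a = torch.randn(4, 3, 2)  # last dim holds (real, imag)
b = torch.randn(4, 3, 2)
prod = _mul(a, b)
ref = torch.view_as_real(torch.view_as_complex(a) * torch.view_as_complex(b))
assert torch.allclose(prod, ref, atol=1e-6)

# 2x2 inversion: M @ inv(M) should be the identity for well-conditioned M
M = torch.randn(5, 2, 2, 2, dtype=torch.float64)
M[:, 0, 0, 0] += 4.0  # make diagonally dominant so inversion is stable
M[:, 1, 1, 0] += 4.0
invM = _invert(M)
prod = torch.view_as_complex(M) @ torch.view_as_complex(invM)
ident = torch.eye(2, dtype=prod.dtype)
assert torch.allclose(prod, ident.expand_as(prod), atol=1e-8)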


# Now define the signal-processing low-level functions used by the Separator


def expectation_maximization(y: torch.Tensor, x: torch.Tensor, iterations: int = 2, eps: float = 1e-10, batch_size: int = 200):
    r"""Expectation maximization algorithm, for refining source separation
    estimates.

    This algorithm improves source separation results by enforcing
    multichannel consistency for the estimates. This usually means
    a better perceptual quality in terms of spatial artifacts.

    The implementation follows the details presented in [1]_, taking
    inspiration from the original EM algorithm proposed in [2]_ and its
    weighted refinement proposed in [3]_, [4]_.
    It works by iteratively:

    * Re-estimating source parameters (power spectral densities and spatial
      covariance matrices) through :func:`get_local_gaussian_model`.

    * Separating the mixture again with the new parameters, by first computing
      the new modelled mixture covariance matrices with :func:`get_mix_model`,
      preparing the Wiener filters through :func:`wiener_gain` and applying them
      with :func:`apply_filter`.

    References
    ----------
    .. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and
        N. Takahashi and Y. Mitsufuji, "Improving music source separation based
        on deep neural networks through data augmentation and network
        blending." 2017 IEEE International Conference on Acoustics, Speech
        and Signal Processing (ICASSP). IEEE, 2017.

    .. [2] N.Q. Duong and E. Vincent and R. Gribonval. "Under-determined
        reverberant audio source separation using a full-rank spatial
        covariance model." IEEE Transactions on Audio, Speech, and Language
        Processing 18.7 (2010): 1830-1840.

    .. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source
        separation with deep neural networks." IEEE/ACM Transactions on Audio,
        Speech, and Language Processing 24.9 (2016): 1652-1664.

    .. [4] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music
        separation with deep neural networks." 2016 24th European Signal
        Processing Conference (EUSIPCO). IEEE, 2016.

    .. [5] A. Liutkus and R. Badeau and G. Richard. "Kernel additive models for
        source separation." IEEE Transactions on Signal Processing
        62.16 (2014): 4298-4310.

    Args:
        y (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2, nb_sources)]
            initial estimates for the sources
        x (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2)]
            complex STFT of the mixture signal
        iterations (int): [scalar]
            number of iterations for the EM algorithm.
        eps (float or None): [scalar]
            The epsilon value to use for regularization and filters.
        batch_size (int): [scalar]
            size of the batches of frames processed at once, to limit
            memory usage.

    Returns:
        y (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2, nb_sources)]
            estimated sources after iterations
        v (Tensor): [shape=(nb_frames, nb_bins, nb_sources)]
            estimated power spectral densities
        R (Tensor): [shape=(nb_bins, nb_channels, nb_channels, 2, nb_sources)]
            estimated spatial covariance matrices

    Notes:
        * You need an initial estimate for the sources to apply this
          algorithm. This is precisely what the :func:`wiener` function does.
        * This algorithm *is not* an implementation of the "exact" EM
          proposed in [1]_. In particular, it does not compute the posterior
          covariance matrices the same (exact) way. Instead, it uses the
          simplified approximate scheme initially proposed in [5]_ and further
          refined in [3]_, [4]_, which boils down to just taking the empirical
          covariance of the recent source estimates, followed by a weighted
          average for the update of the spatial covariance matrix. It has been
          empirically demonstrated that this simplified algorithm is more
          robust for music separation.

    Warning:
        It is *very* important to make sure `x.dtype` is `torch.float64`
        if you want double precision, because this function will **not**
        do such a conversion for you from `torch.float32`, in case you want the
        smaller RAM usage on purpose.

        It is usually better in terms of quality to use double
        precision, by e.g. calling :func:`expectation_maximization`
        with ``x.to(torch.float64)``.
    """
    # dimensions
    (nb_frames, nb_bins, nb_channels) = x.shape[:-1]
    nb_sources = y.shape[-1]

    regularization = torch.cat((torch.eye(nb_channels, dtype=x.dtype, device=x.device)[..., None], torch.zeros((nb_channels, nb_channels, 1), dtype=x.dtype, device=x.device)), dim=2)
    regularization = torch.sqrt(torch.as_tensor(eps)) * (regularization[None, None, ...].expand((-1, nb_bins, -1, -1, -1)))

    # allocate the spatial covariance matrices
    R = [torch.zeros((nb_bins, nb_channels, nb_channels, 2), dtype=x.dtype, device=x.device) for j in range(nb_sources)]
    weight: torch.Tensor = torch.zeros((nb_bins,), dtype=x.dtype, device=x.device)

    v: torch.Tensor = torch.zeros((nb_frames, nb_bins, nb_sources), dtype=x.dtype, device=x.device)
    for it in range(iterations):
        # constructing the mixture covariance matrix. Doing it with a loop
        # to avoid ever storing the whole 6D tensor in RAM

        # update the PSD as the average spectrogram over channels
        v = torch.mean(torch.abs(y[..., 0, :]) ** 2 + torch.abs(y[..., 1, :]) ** 2, dim=-2)

        # update spatial covariance matrices (weighted update)
        for j in range(nb_sources):
            R[j] = torch.tensor(0.0, device=x.device)
            weight = torch.tensor(eps, device=x.device)
            pos: int = 0
            batch_size = batch_size if batch_size else nb_frames
            while pos < nb_frames:
                t = torch.arange(pos, min(nb_frames, pos + batch_size))
                pos = int(t[-1]) + 1

                R[j] = R[j] + torch.sum(_covariance(y[t, ..., j]), dim=0)
                weight = weight + torch.sum(v[t, ..., j], dim=0)
            R[j] = R[j] / weight[..., None, None, None]
            weight = torch.zeros_like(weight)

        # cloning y if we track gradient, because we're going to update it
        if y.requires_grad:
            y = y.clone()

        pos = 0
        while pos < nb_frames:
            t = torch.arange(pos, min(nb_frames, pos + batch_size))
            pos = int(t[-1]) + 1

            y[t, ...] = torch.tensor(0.0, device=x.device, dtype=x.dtype)

            # compute mix covariance matrix
            Cxx = regularization
            for j in range(nb_sources):
                Cxx = Cxx + (v[t, ..., j, None, None, None] * R[j][None, ...].clone())

            # invert it
            inv_Cxx = _invert(Cxx)

            # separate the sources
            for j in range(nb_sources):

                # create a wiener gain for this source
                gain = torch.zeros_like(inv_Cxx)

                # computes multichannel Wiener gain as v_j R_j inv_Cxx
                indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels), torch.arange(nb_channels))
                for index in indices:
                    gain[:, :, index[0], index[1], :] = _mul_add(R[j][None, :, index[0], index[2], :].clone(), inv_Cxx[:, :, index[2], index[1], :], gain[:, :, index[0], index[1], :])
                gain = gain * v[t, ..., None, None, None, j]

                # apply it to the mixture
                for i in range(nb_channels):
                    y[t, ..., j] = _mul_add(gain[..., i, :], x[t, ..., i, None, :], y[t, ..., j])

    return y, v, R
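
An illustrative sketch of driving `expectation_maximization` on dummy data (the shapes follow the docstring; each iteration computes the multichannel Wiener gain `v_j R_j Cxx^{-1}` per the comment above). The sizes below are placeholders, not values from the diff.

import torch

nb_frames, nb_bins, nb_channels, nb_sources = 10, 513, 2, 4
x = torch.randn(nb_frames, nb_bins, nb_channels, 2, dtype=torch.float64)
# crude initial estimates: the mixture split evenly across sources
y = x[..., None].repeat(1, 1, 1, 1, nb_sources) / nb_sources

y_refined, v, R = expectation_maximization(y, x, iterations=2)
print(y_refined.shape)  # torch.Size([10, 513, 2, 2, 4])
print(v.shape)          # torch.Size([10, 513, 4])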


def wiener(targets_spectrograms: torch.Tensor, mix_stft: torch.Tensor, iterations: int = 1, softmask: bool = False, residual: bool = False, scale_factor: float = 10.0, eps: float = 1e-10):
    """Wiener-based separation for multichannel audio.

    The method uses the (possibly multichannel) spectrograms of the
    sources to separate the (complex) Short Term Fourier Transform of the
    mix. Separation is done in a sequential way by:

    * Getting an initial estimate. This can be done in two ways: either by
      directly using the spectrograms with the mixture phase, or
      by using a softmasking strategy. This initial phase is controlled
      by the `softmask` flag.

    * If required, adding an additional residual target as the mix minus
      all targets.

    * Refining these initial estimates through a call to
      :func:`expectation_maximization` if the number of iterations is nonzero.

    This implementation also allows specifying the epsilon value used for
    regularization. It is based on [1]_, [2]_, [3]_, [4]_.

    References
    ----------
    .. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and
        N. Takahashi and Y. Mitsufuji, "Improving music source separation based
        on deep neural networks through data augmentation and network
        blending." 2017 IEEE International Conference on Acoustics, Speech
        and Signal Processing (ICASSP). IEEE, 2017.

    .. [2] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source
        separation with deep neural networks." IEEE/ACM Transactions on Audio,
        Speech, and Language Processing 24.9 (2016): 1652-1664.

    .. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music
        separation with deep neural networks." 2016 24th European Signal
        Processing Conference (EUSIPCO). IEEE, 2016.

    .. [4] A. Liutkus and R. Badeau and G. Richard. "Kernel additive models for
        source separation." IEEE Transactions on Signal Processing
        62.16 (2014): 4298-4310.

    Args:
        targets_spectrograms (Tensor): spectrograms of the sources
            [shape=(nb_frames, nb_bins, nb_channels, nb_sources)].
            This is a nonnegative tensor that is
            usually the output of the actual separation method of the user. The
            spectrograms may be mono, but they need to be 4-dimensional in all
            cases.
        mix_stft (Tensor): [shape=(nb_frames, nb_bins, nb_channels, complex=2)]
            STFT of the mixture signal.
        iterations (int): [scalar]
            number of iterations for the EM algorithm
        softmask (bool): Describes how the initial estimates are obtained.
            * if `False`, then the mixture phase will directly be used with the
              spectrogram as initial estimates.
            * if `True`, initial estimates are obtained by multiplying the
              complex mix element-wise with the ratio of each target spectrogram
              with the sum of them all. This strategy is better if the models are
              not really good, and worse otherwise.
        residual (bool): if `True`, an additional target is created, which is
            equal to the mixture minus the other targets, before application of
            expectation maximization
        scale_factor (float): [scalar]
            the estimates are scaled down by this factor before refinement,
            for numerical stability.
        eps (float): Epsilon value to use for computing the separations.
            This is used whenever division with a model energy is
            performed, i.e. when softmasking and when iterating the EM.
            It can be understood as the energy of the additional white noise
            that is taken out when separating.

    Returns:
        Tensor: shape=(nb_frames, nb_bins, nb_channels, complex=2, nb_sources)
            STFT of estimated sources

    Notes:
        * Be careful that you need *magnitude spectrogram estimates* for the
          case `softmask==False`.
        * `softmask=False` is recommended
        * The epsilon value will have a huge impact on performance. If it's
          large, only the parts of the signal with a significant energy will
          be kept in the sources. This epsilon then directly controls the
          energy of the reconstruction error.

    Warning:
        As in :func:`expectation_maximization`, we recommend converting the
        mixture `x` to double precision `torch.float64` *before* calling
        :func:`wiener`.
    """
    if softmask:
        # if we use softmask, we compute the ratio mask for all targets and
        # multiply by the mix stft
        y = mix_stft[..., None] * (targets_spectrograms / (eps + torch.sum(targets_spectrograms, dim=-1, keepdim=True).to(mix_stft.dtype)))[..., None, :]
    else:
        # otherwise, we just multiply the targets spectrograms with mix phase
        # we tacitly assume that we have magnitude estimates.
        angle = atan2(mix_stft[..., 1], mix_stft[..., 0])[..., None]
        nb_sources = targets_spectrograms.shape[-1]
        y = torch.zeros(mix_stft.shape + (nb_sources,), dtype=mix_stft.dtype, device=mix_stft.device)
        y[..., 0, :] = targets_spectrograms * torch.cos(angle)
        y[..., 1, :] = targets_spectrograms * torch.sin(angle)

    if residual:
        # if required, adding an additional target as the mix minus
        # available targets
        y = torch.cat([y, mix_stft[..., None] - y.sum(dim=-1, keepdim=True)], dim=-1)

    if iterations == 0:
        return y

    # we need to refine the estimates. Scale down the estimates for
    # numerical stability
    max_abs = torch.max(torch.as_tensor(1.0, dtype=mix_stft.dtype, device=mix_stft.device), torch.sqrt(_norm(mix_stft)).max() / scale_factor)

    mix_stft = mix_stft / max_abs
    y = y / max_abs

    # call expectation maximization
    y = expectation_maximization(y, mix_stft, iterations, eps=eps)[0]

    # scale estimates up again
    y = y * max_abs
    return y
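
A short, illustrative usage sketch for `wiener`, separating a 2-channel mixture STFT into 3 dummy targets. The magnitude estimates are random here; in practice they come from a spectrogram model.

import torch

nb_frames, nb_bins, nb_channels, nb_sources = 50, 257, 2, 3
mix_stft = torch.randn(nb_frames, nb_bins, nb_channels, 2, dtype=torch.float64)
mags = torch.rand(nb_frames, nb_bins, nb_channels, nb_sources, dtype=torch.float64)

y = wiener(mags, mix_stft, iterations=1, residual=False)
print(y.shape)  # torch.Size([50, 257, 2, 2, 3])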


def _covariance(y_j):
    """
    Compute the empirical covariance for a source.

    Args:
        y_j (Tensor): complex stft of the source.
            [shape=(nb_frames, nb_bins, nb_channels, 2)].

    Returns:
        Cj (Tensor): [shape=(nb_frames, nb_bins, nb_channels, nb_channels, 2)]
            just y_j * conj(y_j.T): empirical covariance for each TF bin.
    """
    (nb_frames, nb_bins, nb_channels) = y_j.shape[:-1]
    Cj = torch.zeros((nb_frames, nb_bins, nb_channels, nb_channels, 2), dtype=y_j.dtype, device=y_j.device)
    indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels))
    for index in indices:
        Cj[:, :, index[0], index[1], :] = _mul_add(y_j[:, :, index[0], :], _conj(y_j[:, :, index[1], :]), Cj[:, :, index[0], index[1], :])
    return Cj
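
A small illustrative check: `_covariance` should match an einsum over native complex tensors, `y_c[..., i] * conj(y_c[..., j])`. It assumes the helpers above are in scope.

import torch

y_j = torch.randn(6, 9, 2, 2)  # (frames, bins, channels, real/imag)
Cj = _covariance(y_j)
y_c = torch.view_as_complex(y_j)
ref = torch.einsum("fbi,fbj->fbij", y_c, torch.conj(y_c))
assert torch.allclose(torch.view_as_complex(Cj), ref, atol=1e-6)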
audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py
ADDED
@@ -0,0 +1,783 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
This code contains the spectrogram and Hybrid version of Demucs.
"""
from copy import deepcopy
import math
import typing as tp
import torch
from torch import nn
from torch.nn import functional as F
from .filtering import wiener
from .demucs import DConv, rescale_module
from .states import capture_init
from .spec import spectro, ispectro


def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = "constant", value: float = 0.0):
    """Tiny wrapper around F.pad, just to allow for reflect padding on small inputs.
    If this is the case, we insert extra 0 padding to the right before the reflection happens."""
    x0 = x
    length = x.shape[-1]
    padding_left, padding_right = paddings
    if mode == "reflect":
        max_pad = max(padding_left, padding_right)
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            extra_pad_right = min(padding_right, extra_pad)
            extra_pad_left = extra_pad - extra_pad_right
            paddings = (padding_left - extra_pad_left, padding_right - extra_pad_right)
            x = F.pad(x, (extra_pad_left, extra_pad_right))
    out = F.pad(x, paddings, mode, value)
    assert out.shape[-1] == length + padding_left + padding_right
    assert (out[..., padding_left : padding_left + length] == x0).all()
    return out

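A quick illustration (not part of the uploaded file): `F.pad` with `mode="reflect"` fails when the padding is at least as large as the input, which is the case `pad1d` works around.

import torch

x = torch.arange(3.0).view(1, 1, 3)        # length-3 signal
padded = pad1d(x, (4, 4), mode="reflect")  # plain F.pad would raise here
print(padded.shape)  # torch.Size([1, 1, 11]) == 3 + 4 + 4
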
class ScaledEmbedding(nn.Module):
    """
    Boost learning rate for embeddings (with `scale`).
    Also, can make embeddings continuous with `smooth`.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, scale: float = 10.0, smooth=False):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        if smooth:
            weight = torch.cumsum(self.embedding.weight.data, dim=0)
            # when summing gaussians, the overall scale rises as sqrt(n), so we normalize by that.
            weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None]
            self.embedding.weight.data[:] = weight
        self.embedding.weight.data /= scale
        self.scale = scale

    @property
    def weight(self):
        return self.embedding.weight * self.scale

    def forward(self, x):
        out = self.embedding(x) * self.scale
        return out

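An illustrative sketch of why this boosts the learning rate: the stored weights are divided by `scale` and the output is multiplied back, so the forward pass is unchanged at init, but gradients with respect to the stored parameter are `scale` times larger, acting like a per-module LR multiplier.

import torch

emb = ScaledEmbedding(num_embeddings=8, embedding_dim=4, scale=10.0)
idx = torch.tensor([0, 3, 7])
emb(idx).sum().backward()
# gradient on the stored (downscaled) parameter is amplified by `scale`
print(emb.embedding.weight.grad.abs().max())  # 10.0
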
class HEncLayer(nn.Module):
    def __init__(self, chin, chout, kernel_size=8, stride=4, norm_groups=1, empty=False, freq=True, dconv=True, norm=True, context=0, dconv_kw={}, pad=True, rewrite=True):
        """Encoder layer. This is used by both the time and the frequency branches.

        Args:
            chin: number of input channels.
            chout: number of output channels.
            norm_groups: number of groups for group norm.
            empty: used to make a layer with just the first conv. this is used
                before merging the time and freq. branches.
            freq: this is acting on frequencies.
            dconv: insert DConv residual branches.
            norm: use GroupNorm.
            context: context size for the 1x1 conv.
            dconv_kw: dict of kwargs for the DConv class.
            pad: pad the input. Padding is done so that the output size is
                always the input size / stride.
            rewrite: add 1x1 conv at the end of the layer.
        """
        super().__init__()
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        if pad:
            pad = kernel_size // 4
        else:
            pad = 0
        klass = nn.Conv1d
        self.freq = freq
        self.kernel_size = kernel_size
        self.stride = stride
        self.empty = empty
        self.norm = norm
        self.pad = pad
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            pad = [pad, 0]
            klass = nn.Conv2d
        self.conv = klass(chin, chout, kernel_size, stride, pad)
        if self.empty:
            return
        self.norm1 = norm_fn(chout)
        self.rewrite = None
        if rewrite:
            self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context)
            self.norm2 = norm_fn(2 * chout)

        self.dconv = None
        if dconv:
            self.dconv = DConv(chout, **dconv_kw)

    def forward(self, x, inject=None):
        """
        `inject` is used to inject the result from the time branch into the frequency branch,
        when both have the same stride.
        """
        if not self.freq and x.dim() == 4:
            B, C, Fr, T = x.shape
            x = x.view(B, -1, T)

        if not self.freq:
            le = x.shape[-1]
            if not le % self.stride == 0:
                x = F.pad(x, (0, self.stride - (le % self.stride)))
        y = self.conv(x)
        if self.empty:
            return y
        if inject is not None:
            assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape)
            if inject.dim() == 3 and y.dim() == 4:
                inject = inject[:, :, None]
            y = y + inject
        y = F.gelu(self.norm1(y))
        if self.dconv:
            if self.freq:
                B, C, Fr, T = y.shape
                y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
            y = self.dconv(y)
            if self.freq:
                y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
        if self.rewrite:
            z = self.norm2(self.rewrite(y))
            z = F.glu(z, dim=1)
        else:
            z = y
        return z


class MultiWrap(nn.Module):
    """
    Takes one layer and replicates it N times. Each replica will act
    on a frequency band. All is done so that if the N replicas have the same weights,
    then this is exactly equivalent to applying the original module on all frequencies.

    This is a bit over-engineered to avoid edge artifacts when splitting
    the frequency bands, but it is possible the naive implementation would work as well...
    """

    def __init__(self, layer, split_ratios):
        """
        Args:
            layer: module to clone, must be either HEncLayer or HDecLayer.
            split_ratios: list of float indicating which ratio to keep for each band.
        """
        super().__init__()
        self.split_ratios = split_ratios
        self.layers = nn.ModuleList()
        self.conv = isinstance(layer, HEncLayer)
        assert not layer.norm
        assert layer.freq
        assert layer.pad
        if not self.conv:
            assert not layer.context_freq
        for k in range(len(split_ratios) + 1):
            lay = deepcopy(layer)
            if self.conv:
                lay.conv.padding = (0, 0)
            else:
                lay.pad = False
            for m in lay.modules():
                if hasattr(m, "reset_parameters"):
                    m.reset_parameters()
            self.layers.append(lay)

    def forward(self, x, skip=None, length=None):
        B, C, Fr, T = x.shape

        ratios = list(self.split_ratios) + [1]
        start = 0
        outs = []
        for ratio, layer in zip(ratios, self.layers):
            if self.conv:
                pad = layer.kernel_size // 4
                if ratio == 1:
                    limit = Fr
                    frames = -1
                else:
                    limit = int(round(Fr * ratio))
                    le = limit - start
                    if start == 0:
                        le += pad
                    frames = round((le - layer.kernel_size) / layer.stride + 1)
                    limit = start + (frames - 1) * layer.stride + layer.kernel_size
                    if start == 0:
                        limit -= pad
                assert limit - start > 0, (limit, start)
                assert limit <= Fr, (limit, Fr)
                y = x[:, :, start:limit, :]
                if start == 0:
                    y = F.pad(y, (0, 0, pad, 0))
                if ratio == 1:
                    y = F.pad(y, (0, 0, 0, pad))
                outs.append(layer(y))
                start = limit - layer.kernel_size + layer.stride
            else:
                if ratio == 1:
                    limit = Fr
                else:
                    limit = int(round(Fr * ratio))
                last = layer.last
                layer.last = True

                y = x[:, :, start:limit]
                s = skip[:, :, start:limit]
                out, _ = layer(y, s, None)
                if outs:
                    outs[-1][:, :, -layer.stride :] += out[:, :, : layer.stride] - layer.conv_tr.bias.view(1, -1, 1, 1)
                    out = out[:, :, layer.stride :]
                if ratio == 1:
                    out = out[:, :, : -layer.stride // 2, :]
                if start == 0:
                    out = out[:, :, layer.stride // 2 :, :]
                outs.append(out)
                layer.last = last
                start = limit
        out = torch.cat(outs, dim=2)
        if not self.conv and not last:
            out = F.gelu(out)
        if self.conv:
            return out
        else:
            return out, None


class HDecLayer(nn.Module):
    def __init__(
        self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False, freq=True, dconv=True, norm=True, context=1, dconv_kw={}, pad=True, context_freq=True, rewrite=True
    ):
        """
        Same as HEncLayer but for decoder. See `HEncLayer` for documentation.
        """
        super().__init__()
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        if pad:
            pad = kernel_size // 4
        else:
            pad = 0
        self.pad = pad
        self.last = last
        self.freq = freq
        self.chin = chin
        self.empty = empty
        self.stride = stride
        self.kernel_size = kernel_size
        self.norm = norm
        self.context_freq = context_freq
        klass = nn.Conv1d
        klass_tr = nn.ConvTranspose1d
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            klass = nn.Conv2d
            klass_tr = nn.ConvTranspose2d
        self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
        self.norm2 = norm_fn(chout)
        if self.empty:
            return
        self.rewrite = None
        if rewrite:
            if context_freq:
                self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
            else:
                self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1, [0, context])
            self.norm1 = norm_fn(2 * chin)

        self.dconv = None
        if dconv:
            self.dconv = DConv(chin, **dconv_kw)

    def forward(self, x, skip, length):
        if self.freq and x.dim() == 3:
            B, C, T = x.shape
            x = x.view(B, self.chin, -1, T)

        if not self.empty:
            x = x + skip

            if self.rewrite:
                y = F.glu(self.norm1(self.rewrite(x)), dim=1)
            else:
                y = x
            if self.dconv:
                if self.freq:
                    B, C, Fr, T = y.shape
                    y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
                y = self.dconv(y)
                if self.freq:
                    y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
        else:
            y = x
            assert skip is None
        z = self.norm2(self.conv_tr(y))
        if self.freq:
            if self.pad:
                z = z[..., self.pad : -self.pad, :]
        else:
            z = z[..., self.pad : self.pad + length]
            assert z.shape[-1] == length, (z.shape[-1], length)
        if not self.last:
            z = F.gelu(z)
        return z, y


class HDemucs(nn.Module):
    """
    Spectrogram and hybrid Demucs model.
    The spectrogram model has the same structure as Demucs, except the first few layers are over the
    frequency axis, until there is only 1 frequency, and then it moves to time convolutions.
    Frequency layers can still access information across time steps thanks to the DConv residual.

    Hybrid models have a parallel time branch. At some layer, the time branch has the same stride
    as the frequency branch and then the two are combined. The opposite happens in the decoder.

    Models can either use naive iSTFT from masking, Wiener filtering ([Uhlich et al. 2017]),
    or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on
    the Open Unmix implementation [Stoter et al. 2019].

    The loss is always in the temporal domain, by backpropagating through the above
    output methods and iSTFT. This makes it possible to define hybrid models nicely. However, this
    somewhat breaks Wiener filtering, as doing more iterations at test time will change the spectrogram
    contribution, without changing the one from the waveform, which will lead to worse performance.
    I tried using the residual option in the OpenUnmix Wiener implementation, but it didn't improve.
    CaC on the other hand provides similar performance for hybrid, and works naturally with
    hybrid models.

    This model also uses frequency embeddings to improve efficiency on convolutions
    over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf).

    Unlike classic Demucs, there is no resampling here, and normalization is always applied.
    """

    @capture_init
    def __init__(
        self,
        sources,
        # Channels
        audio_channels=2,
        channels=48,
        channels_time=None,
        growth=2,
        # STFT
        nfft=4096,
        wiener_iters=0,
        end_iters=0,
        wiener_residual=False,
        cac=True,
        # Main structure
        depth=6,
        rewrite=True,
        hybrid=True,
        hybrid_old=False,
        # Frequency branch
        multi_freqs=None,
        multi_freqs_depth=2,
        freq_emb=0.2,
        emb_scale=10,
        emb_smooth=True,
        # Convolutions
        kernel_size=8,
        time_stride=2,
        stride=4,
        context=1,
        context_enc=0,
        # Normalization
        norm_starts=4,
        norm_groups=4,
        # DConv residual branch
        dconv_mode=1,
        dconv_depth=2,
        dconv_comp=4,
        dconv_attn=4,
        dconv_lstm=4,
        dconv_init=1e-4,
        # Weight init
        rescale=0.1,
        # Metadata
        samplerate=44100,
        segment=4 * 10,
    ):
        """
        Args:
            sources (list[str]): list of source names.
            audio_channels (int): input/output audio channels.
            channels (int): initial number of hidden channels.
            channels_time: if not None, use a different `channels` value for the time branch.
            growth: increase the number of hidden channels by this factor at each layer.
            nfft: number of fft bins. Note that changing this requires careful computation of
                various shape parameters and will not work out of the box for hybrid models.
            wiener_iters: when using Wiener filtering, number of iterations at test time.
            end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
            wiener_residual: add residual source before wiener filtering.
            cac: uses complex as channels, i.e. complex numbers are 2 channels each
                in input and output. no further processing is done before ISTFT.
            depth (int): number of layers in the encoder and in the decoder.
            rewrite (bool): add 1x1 convolution to each layer.
            hybrid (bool): make a hybrid time/frequency domain model, otherwise frequency only.
            hybrid_old: some models trained for MDX had a padding bug. This replicates
                this bug to avoid retraining them.
            multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`.
            multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost
                layers will be wrapped.
            freq_emb: add frequency embedding after the first frequency layer if > 0,
                the actual value controls the weight of the embedding.
            emb_scale: equivalent to scaling the embedding learning rate
            emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
            kernel_size: kernel_size for encoder and decoder layers.
            stride: stride for encoder and decoder layers.
            time_stride: stride for the final time layer, after the merge.
            context: context for 1x1 conv in the decoder.
            context_enc: context for 1x1 conv in the encoder.
            norm_starts: layer at which group norm starts being used.
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of residual DConv branch.
            dconv_comp: compression of DConv branch.
            dconv_attn: adds attention layers in DConv branch starting at this layer.
            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            rescale: weight rescaling trick

        """
        super().__init__()

        self.cac = cac
        self.wiener_residual = wiener_residual
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.channels = channels
        self.samplerate = samplerate
        self.segment = segment

        self.nfft = nfft
        self.hop_length = nfft // 4
        self.wiener_iters = wiener_iters
        self.end_iters = end_iters
        self.freq_emb = None
        self.hybrid = hybrid
        self.hybrid_old = hybrid_old
        if hybrid_old:
            assert hybrid, "hybrid_old must come with hybrid=True"
        if hybrid:
            assert wiener_iters == end_iters

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        if hybrid:
            self.tencoder = nn.ModuleList()
            self.tdecoder = nn.ModuleList()

        chin = audio_channels
        chin_z = chin  # number of channels for the freq branch
        if self.cac:
            chin_z *= 2
        chout = channels_time or channels
        chout_z = channels
        freqs = nfft // 2

        for index in range(depth):
            lstm = index >= dconv_lstm
            attn = index >= dconv_attn
            norm = index >= norm_starts
            freq = freqs > 1
            stri = stride
            ker = kernel_size
            if not freq:
                assert freqs == 1
                ker = time_stride * 2
                stri = time_stride

            pad = True
            last_freq = False
            if freq and freqs <= kernel_size:
                ker = freqs
                pad = False
                last_freq = True

            kw = {
                "kernel_size": ker,
                "stride": stri,
                "freq": freq,
                "pad": pad,
                "norm": norm,
                "rewrite": rewrite,
                "norm_groups": norm_groups,
                "dconv_kw": {"lstm": lstm, "attn": attn, "depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True},
            }
            kwt = dict(kw)
            kwt["freq"] = 0
            kwt["kernel_size"] = kernel_size
            kwt["stride"] = stride
            kwt["pad"] = True
            kw_dec = dict(kw)
            multi = False
            if multi_freqs and index < multi_freqs_depth:
                multi = True
                kw_dec["context_freq"] = False

            if last_freq:
                chout_z = max(chout, chout_z)
                chout = chout_z

            enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw)
            if hybrid and freq:
                tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt)
                self.tencoder.append(tenc)

            if multi:
                enc = MultiWrap(enc, multi_freqs)
            self.encoder.append(enc)
            if index == 0:
                chin = self.audio_channels * len(self.sources)
                chin_z = chin
                if self.cac:
                    chin_z *= 2
            dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec)
            if multi:
                dec = MultiWrap(dec, multi_freqs)
            if hybrid and freq:
                tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, last=index == 0, context=context, **kwt)
                self.tdecoder.insert(0, tdec)
            self.decoder.insert(0, dec)

            chin = chout
            chin_z = chout_z
            chout = int(growth * chout)
            chout_z = int(growth * chout_z)
            if freq:
                if freqs <= kernel_size:
                    freqs = 1
                else:
                    freqs //= stride
            if index == 0 and freq_emb:
                self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale)
                self.freq_emb_scale = freq_emb

        if rescale:
            rescale_module(self, reference=rescale)

    def _spec(self, x):
        hl = self.hop_length
        nfft = self.nfft
        x0 = x  # noqa

        if self.hybrid:
            # We re-pad the signal in order to keep the property
            # that the size of the output is exactly the size of the input
            # divided by the stride (here hop_length), when divisible.
            # This is achieved by padding by 1/4th of the kernel size (here nfft),
            # which is not supported by torch.stft.
            # Having all convolution operations follow this convention makes it easy
            # to align the time and frequency branches later on.
            assert hl == nfft // 4
            le = int(math.ceil(x.shape[-1] / hl))
            pad = hl // 2 * 3
            if not self.hybrid_old:
                x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect")
            else:
                x = pad1d(x, (pad, pad + le * hl - x.shape[-1]))

        z = spectro(x, nfft, hl)[..., :-1, :]
        if self.hybrid:
            assert z.shape[-1] == le + 4, (z.shape, x.shape, le)
            z = z[..., 2 : 2 + le]
        return z

    def _ispec(self, z, length=None, scale=0):
        hl = self.hop_length // (4**scale)
        z = F.pad(z, (0, 0, 0, 1))
        if self.hybrid:
            z = F.pad(z, (2, 2))
            pad = hl // 2 * 3
            if not self.hybrid_old:
                le = hl * int(math.ceil(length / hl)) + 2 * pad
            else:
                le = hl * int(math.ceil(length / hl))
            x = ispectro(z, hl, length=le)
            if not self.hybrid_old:
                x = x[..., pad : pad + length]
            else:
                x = x[..., :length]
        else:
            x = ispectro(z, hl, length)
        return x

    def _magnitude(self, z):
        # return the magnitude of the spectrogram, except when cac is True,
        # in which case we just move the complex dimension to the channel one.
        if self.cac:
            B, C, Fr, T = z.shape
            m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
            m = m.reshape(B, C * 2, Fr, T)
        else:
            m = z.abs()
        return m

    def _mask(self, z, m):
        # Apply masking given the mixture spectrogram `z` and the estimated mask `m`.
        # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored.
        niters = self.wiener_iters
        if self.cac:
            B, S, C, Fr, T = m.shape
            out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
            out = torch.view_as_complex(out.contiguous())
            return out
        if self.training:
            niters = self.end_iters
        if niters < 0:
            z = z[:, None]
            return z / (1e-8 + z.abs()) * m
        else:
            return self._wiener(m, z, niters)

    def _wiener(self, mag_out, mix_stft, niters):
        # apply wiener filtering from OpenUnmix.
        init = mix_stft.dtype
        wiener_win_len = 300
        residual = self.wiener_residual

        B, S, C, Fq, T = mag_out.shape
        mag_out = mag_out.permute(0, 4, 3, 2, 1)
        mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1))

        outs = []
        for sample in range(B):
            pos = 0
            out = []
            for pos in range(0, T, wiener_win_len):
                frame = slice(pos, pos + wiener_win_len)
                z_out = wiener(mag_out[sample, frame], mix_stft[sample, frame], niters, residual=residual)
                out.append(z_out.transpose(-1, -2))
            outs.append(torch.cat(out, dim=0))
        out = torch.view_as_complex(torch.stack(outs, 0))
        out = out.permute(0, 4, 3, 2, 1).contiguous()
        if residual:
            out = out[:, :-1]
        assert list(out.shape) == [B, S, C, Fq, T]
        return out.to(init)

    def forward(self, mix):
        x = mix
        length = x.shape[-1]

        z = self._spec(mix)
        mag = self._magnitude(z).to(mix.device)
        x = mag

        B, C, Fq, T = x.shape

        # unlike previous Demucs, we always normalize because it is easier.
        mean = x.mean(dim=(1, 2, 3), keepdim=True)
        std = x.std(dim=(1, 2, 3), keepdim=True)
        x = (x - mean) / (1e-5 + std)
        # x will be the freq. branch input.

        if self.hybrid:
            # Prepare the time branch input.
            xt = mix
            meant = xt.mean(dim=(1, 2), keepdim=True)
            stdt = xt.std(dim=(1, 2), keepdim=True)
            xt = (xt - meant) / (1e-5 + stdt)

        # okay, this is a giant mess I know...
        saved = []  # skip connections, freq.
        saved_t = []  # skip connections, time.
        lengths = []  # saved lengths to properly remove padding, freq branch.
        lengths_t = []  # saved lengths for time branch.
        for idx, encode in enumerate(self.encoder):
            lengths.append(x.shape[-1])
            inject = None
            if self.hybrid and idx < len(self.tencoder):
                # we have not yet merged branches.
                lengths_t.append(xt.shape[-1])
                tenc = self.tencoder[idx]
                xt = tenc(xt)
                if not tenc.empty:
                    # save for skip connection
                    saved_t.append(xt)
                else:
                    # tenc contains just the first conv., so that now time and freq.
                    # branches have the same shape and can be merged.
                    inject = xt
            x = encode(x, inject)
            if idx == 0 and self.freq_emb is not None:
                # add frequency embedding to allow for non equivariant convolutions
                # over the frequency axis.
                frs = torch.arange(x.shape[-2], device=x.device)
                emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
                x = x + self.freq_emb_scale * emb

            saved.append(x)

        x = torch.zeros_like(x)
        if self.hybrid:
            xt = torch.zeros_like(x)
        # initialize everything to zero (signal will go through u-net skips).

        for idx, decode in enumerate(self.decoder):
            skip = saved.pop(-1)
            x, pre = decode(x, skip, lengths.pop(-1))
            # `pre` contains the output just before final transposed convolution,
            # which is used when the freq. and time branch separate.

            if self.hybrid:
                offset = self.depth - len(self.tdecoder)
            if self.hybrid and idx >= offset:
                tdec = self.tdecoder[idx - offset]
                length_t = lengths_t.pop(-1)
                if tdec.empty:
                    assert pre.shape[2] == 1, pre.shape
                    pre = pre[:, :, 0]
                    xt, _ = tdec(pre, None, length_t)
                else:
                    skip = saved_t.pop(-1)
                    xt, _ = tdec(xt, skip, length_t)

        # Let's make sure we used all stored skip connections.
        assert len(saved) == 0
        assert len(lengths_t) == 0
        assert len(saved_t) == 0

        S = len(self.sources)
        x = x.view(B, S, -1, Fq, T)
        x = x * std[:, None] + mean[:, None]

        # move to CPU as non-CUDA GPUs don't support complex numbers
        # demucs issues #435 and #432
        # NOTE: in this case z already is on cpu
        # TODO: remove this when mps supports complex numbers

        device_type = x.device.type
        device_load = f"{device_type}:{x.device.index}" if not device_type == "mps" else device_type
        x_is_other_gpu = not device_type in ["cuda", "cpu"]

        if x_is_other_gpu:
            x = x.cpu()

        zout = self._mask(z, x)
        x = self._ispec(zout, length)

        # back to other device
        if x_is_other_gpu:
            x = x.to(device_load)

        if self.hybrid:
            xt = xt.view(B, S, -1, length)
            xt = xt * stdt[:, None] + meant[:, None]
            x = xt + x
        return x
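
A hedged usage sketch for the class above: instantiating `HDemucs` with default settings and running a short stereo clip through it. The source names and 1-second length are placeholders; the actual checkpoints loaded by this repo define their own configuration.

import torch

model = HDemucs(sources=["drums", "bass", "other", "vocals"])
model.eval()
wav = torch.randn(1, 2, 44100)  # (batch, audio_channels, time)
with torch.no_grad():
    out = model(wav)
print(out.shape)  # torch.Size([1, 4, 2, 44100]): one stereo stem per source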
audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py
ADDED
@@ -0,0 +1,620 @@
# Copyright (c) Meta, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# First author is Simon Rouard.
"""
This code contains the spectrogram and Hybrid version of Demucs.
"""
import math

from .filtering import wiener
import torch
from torch import nn
from torch.nn import functional as F
from fractions import Fraction
from einops import rearrange

from .transformer import CrossTransformerEncoder

from .demucs import rescale_module
from .states import capture_init
from .spec import spectro, ispectro
from .hdemucs import pad1d, ScaledEmbedding, HEncLayer, MultiWrap, HDecLayer


class HTDemucs(nn.Module):
    """
    Spectrogram and hybrid Demucs model.
    The spectrogram model has the same structure as Demucs, except the first few layers are over the
    frequency axis, until there is only 1 frequency, and then it moves to time convolutions.
    Frequency layers can still access information across time steps thanks to the DConv residual.

    Hybrid models have a parallel time branch. At some layer, the time branch has the same stride
    as the frequency branch and then the two are combined. The opposite happens in the decoder.

    Models can either use naive iSTFT from masking, Wiener filtering ([Uhlich et al. 2017]),
    or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on
    the Open Unmix implementation [Stoter et al. 2019].

    The loss is always in the temporal domain, by backpropagating through the above
    output methods and iSTFT. This makes it possible to define hybrid models nicely. However, this
    somewhat breaks Wiener filtering, as doing more iterations at test time will change the spectrogram
    contribution, without changing the one from the waveform, which will lead to worse performance.
    I tried using the residual option in the OpenUnmix Wiener implementation, but it didn't improve.
    CaC on the other hand provides similar performance for hybrid, and works naturally with
    hybrid models.

    This model also uses frequency embeddings to improve efficiency on convolutions
    over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf).

    Unlike classic Demucs, there is no resampling here, and normalization is always applied.
    """

    @capture_init
    def __init__(
        self,
        sources,
        # Channels
        audio_channels=2,
        channels=48,
        channels_time=None,
        growth=2,
        # STFT
        nfft=4096,
        wiener_iters=0,
        end_iters=0,
        wiener_residual=False,
        cac=True,
        # Main structure
        depth=4,
        rewrite=True,
        # Frequency branch
        multi_freqs=None,
        multi_freqs_depth=3,
        freq_emb=0.2,
        emb_scale=10,
        emb_smooth=True,
        # Convolutions
        kernel_size=8,
        time_stride=2,
        stride=4,
        context=1,
        context_enc=0,
        # Normalization
        norm_starts=4,
        norm_groups=4,
        # DConv residual branch
        dconv_mode=1,
        dconv_depth=2,
        dconv_comp=8,
        dconv_init=1e-3,
        # Before the Transformer
        bottom_channels=0,
        # Transformer
        t_layers=5,
        t_emb="sin",
        t_hidden_scale=4.0,
        t_heads=8,
        t_dropout=0.0,
        t_max_positions=10000,
        t_norm_in=True,
        t_norm_in_group=False,
        t_group_norm=False,
        t_norm_first=True,
        t_norm_out=True,
        t_max_period=10000.0,
        t_weight_decay=0.0,
        t_lr=None,
        t_layer_scale=True,
        t_gelu=True,
        t_weight_pos_embed=1.0,
        t_sin_random_shift=0,
        t_cape_mean_normalize=True,
        t_cape_augment=True,
        t_cape_glob_loc_scale=[5000.0, 1.0, 1.4],
        t_sparse_self_attn=False,
        t_sparse_cross_attn=False,
        t_mask_type="diag",
        t_mask_random_seed=42,
        t_sparse_attn_window=500,
        t_global_window=100,
        t_sparsity=0.95,
        t_auto_sparsity=False,
        # ------ Particular parameters
        t_cross_first=False,
        # Weight init
        rescale=0.1,
        # Metadata
        samplerate=44100,
        segment=10,
        use_train_segment=True,
    ):
        """
        Args:
            sources (list[str]): list of source names.
            audio_channels (int): input/output audio channels.
            channels (int): initial number of hidden channels.
            channels_time: if not None, use a different `channels` value for the time branch.
            growth: increase the number of hidden channels by this factor at each layer.
            nfft: number of fft bins. Note that changing this requires careful computation of
                various shape parameters and will not work out of the box for hybrid models.
            wiener_iters: when using Wiener filtering, number of iterations at test time.
            end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
            wiener_residual: add residual source before wiener filtering.
            cac: uses complex as channels, i.e. complex numbers are 2 channels each
                in input and output. no further processing is done before ISTFT.
            depth (int): number of layers in the encoder and in the decoder.
            rewrite (bool): add 1x1 convolution to each layer.
            multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`.
            multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost
                layers will be wrapped.
            freq_emb: add frequency embedding after the first frequency layer if > 0,
                the actual value controls the weight of the embedding.
            emb_scale: equivalent to scaling the embedding learning rate
            emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
            kernel_size: kernel_size for encoder and decoder layers.
            stride: stride for encoder and decoder layers.
            time_stride: stride for the final time layer, after the merge.
            context: context for 1x1 conv in the decoder.
            context_enc: context for 1x1 conv in the encoder.
            norm_starts: layer at which group norm starts being used.
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of residual DConv branch.
            dconv_comp: compression of DConv branch.
            dconv_attn: adds attention layers in DConv branch starting at this layer.
            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            bottom_channels: if >0 it adds a linear layer (1x1 Conv) before and after the
                transformer in order to change the number of channels
            t_layers: number of layers in each branch (waveform and spec) of the transformer
            t_emb: "sin", "cape" or "scaled"
            t_hidden_scale: the hidden scale of the Feedforward parts of the transformer,
                for instance if C = 384 (the number of channels in the transformer) and
                t_hidden_scale = 4.0 then the intermediate layer of the FFN has dimension
                384 * 4 = 1536
            t_heads: number of heads for the transformer
            t_dropout: dropout in the transformer
            t_max_positions: max_positions for the "scaled" positional embedding, only
                useful if t_emb="scaled"
            t_norm_in: (bool) norm before adding the positional embedding and getting into the
                transformer layers
            t_norm_in_group: (bool) if True while t_norm_in=True, the norm is over all the
                timesteps (GroupNorm with group=1)
            t_group_norm: (bool) if True, the norms of the Encoder Layers are over all the
                timesteps (GroupNorm with group=1)
            t_norm_first: (bool) if True the norm is before the attention and before the FFN
            t_norm_out: (bool) if True, there is a GroupNorm (group=1) at the end of each layer
            t_max_period: (float) denominator in the sinusoidal embedding expression
            t_weight_decay: (float) weight decay for the transformer
            t_lr: (float) specific learning rate for the transformer
|
194 |
+
t_layer_scale: (bool) Layer Scale for the transformer
|
195 |
+
t_gelu: (bool) activations of the transformer are GeLU if True, ReLU else
|
196 |
+
t_weight_pos_embed: (float) weighting of the positional embedding
|
197 |
+
t_cape_mean_normalize: (bool) if t_emb="cape", normalisation of positional embeddings
|
198 |
+
see: https://arxiv.org/abs/2106.03143
|
199 |
+
t_cape_augment: (bool) if t_emb="cape", must be True during training and False
|
200 |
+
during the inference, see: https://arxiv.org/abs/2106.03143
|
201 |
+
t_cape_glob_loc_scale: (list of 3 floats) if t_emb="cape", CAPE parameters
|
202 |
+
see: https://arxiv.org/abs/2106.03143
|
203 |
+
t_sparse_self_attn: (bool) if True, the self attentions are sparse
|
204 |
+
t_sparse_cross_attn: (bool) if True, the cross-attentions are sparse (don't use it
|
205 |
+
unless you designed really specific masks)
|
206 |
+
t_mask_type: (str) can be "diag", "jmask", "random", "global" or any combination
|
207 |
+
with '_' between: i.e. "diag_jmask_random" (note that this is permutation
|
208 |
+
invariant i.e. "diag_jmask_random" is equivalent to "jmask_random_diag")
|
209 |
+
t_mask_random_seed: (int) if "random" is in t_mask_type, controls the seed
|
210 |
+
that generated the random part of the mask
|
211 |
+
t_sparse_attn_window: (int) if "diag" is in t_mask_type, for a query (i), and
|
212 |
+
a key (j), the mask is True id |i-j|<=t_sparse_attn_window
|
213 |
+
t_global_window: (int) if "global" is in t_mask_type, mask[:t_global_window, :]
|
214 |
+
and mask[:, :t_global_window] will be True
|
215 |
+
t_sparsity: (float) if "random" is in t_mask_type, t_sparsity is the sparsity
|
216 |
+
level of the random part of the mask.
|
217 |
+
t_cross_first: (bool) if True cross attention is the first layer of the
|
218 |
+
transformer (False seems to be better)
|
219 |
+
rescale: weight rescaling trick
|
220 |
+
use_train_segment: (bool) if True, the actual size that is used during the
|
221 |
+
training is used during inference.
|
222 |
+
"""
|
223 |
+
        super().__init__()
        self.cac = cac
        self.wiener_residual = wiener_residual
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.bottom_channels = bottom_channels
        self.channels = channels
        self.samplerate = samplerate
        self.segment = segment
        self.use_train_segment = use_train_segment
        self.nfft = nfft
        self.hop_length = nfft // 4
        self.wiener_iters = wiener_iters
        self.end_iters = end_iters
        self.freq_emb = None
        assert wiener_iters == end_iters

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        self.tencoder = nn.ModuleList()
        self.tdecoder = nn.ModuleList()

        chin = audio_channels
        chin_z = chin  # number of channels for the freq branch
        if self.cac:
            chin_z *= 2
        chout = channels_time or channels
        chout_z = channels
        freqs = nfft // 2

        for index in range(depth):
            norm = index >= norm_starts
            freq = freqs > 1
            stri = stride
            ker = kernel_size
            if not freq:
                assert freqs == 1
                ker = time_stride * 2
                stri = time_stride

            pad = True
            last_freq = False
            if freq and freqs <= kernel_size:
                ker = freqs
                pad = False
                last_freq = True

            kw = {
                "kernel_size": ker,
                "stride": stri,
                "freq": freq,
                "pad": pad,
                "norm": norm,
                "rewrite": rewrite,
                "norm_groups": norm_groups,
                "dconv_kw": {"depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True},
            }
            kwt = dict(kw)
            kwt["freq"] = 0
            kwt["kernel_size"] = kernel_size
            kwt["stride"] = stride
            kwt["pad"] = True
            kw_dec = dict(kw)
            multi = False
            if multi_freqs and index < multi_freqs_depth:
                multi = True
                kw_dec["context_freq"] = False

            if last_freq:
                chout_z = max(chout, chout_z)
                chout = chout_z

            enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw)
            if freq:
                tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt)
                self.tencoder.append(tenc)

            if multi:
                enc = MultiWrap(enc, multi_freqs)
            self.encoder.append(enc)
            if index == 0:
                chin = self.audio_channels * len(self.sources)
                chin_z = chin
                if self.cac:
                    chin_z *= 2
            dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec)
            if multi:
                dec = MultiWrap(dec, multi_freqs)
            if freq:
                tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, last=index == 0, context=context, **kwt)
                self.tdecoder.insert(0, tdec)
            self.decoder.insert(0, dec)

            chin = chout
            chin_z = chout_z
            chout = int(growth * chout)
            chout_z = int(growth * chout_z)
            if freq:
                if freqs <= kernel_size:
                    freqs = 1
                else:
                    freqs //= stride
            if index == 0 and freq_emb:
                self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale)
                self.freq_emb_scale = freq_emb

        if rescale:
            rescale_module(self, reference=rescale)

        transformer_channels = channels * growth ** (depth - 1)
        if bottom_channels:
            self.channel_upsampler = nn.Conv1d(transformer_channels, bottom_channels, 1)
            self.channel_downsampler = nn.Conv1d(bottom_channels, transformer_channels, 1)
            self.channel_upsampler_t = nn.Conv1d(transformer_channels, bottom_channels, 1)
            self.channel_downsampler_t = nn.Conv1d(bottom_channels, transformer_channels, 1)

            transformer_channels = bottom_channels

        if t_layers > 0:
            self.crosstransformer = CrossTransformerEncoder(
                dim=transformer_channels,
                emb=t_emb,
                hidden_scale=t_hidden_scale,
                num_heads=t_heads,
                num_layers=t_layers,
                cross_first=t_cross_first,
                dropout=t_dropout,
                max_positions=t_max_positions,
                norm_in=t_norm_in,
                norm_in_group=t_norm_in_group,
                group_norm=t_group_norm,
                norm_first=t_norm_first,
                norm_out=t_norm_out,
                max_period=t_max_period,
                weight_decay=t_weight_decay,
                lr=t_lr,
                layer_scale=t_layer_scale,
                gelu=t_gelu,
                sin_random_shift=t_sin_random_shift,
                weight_pos_embed=t_weight_pos_embed,
                cape_mean_normalize=t_cape_mean_normalize,
                cape_augment=t_cape_augment,
                cape_glob_loc_scale=t_cape_glob_loc_scale,
                sparse_self_attn=t_sparse_self_attn,
                sparse_cross_attn=t_sparse_cross_attn,
                mask_type=t_mask_type,
                mask_random_seed=t_mask_random_seed,
                sparse_attn_window=t_sparse_attn_window,
                global_window=t_global_window,
                sparsity=t_sparsity,
                auto_sparsity=t_auto_sparsity,
            )
        else:
            self.crosstransformer = None

    def _spec(self, x):
        hl = self.hop_length
        nfft = self.nfft
        x0 = x  # noqa

        # We re-pad the signal in order to keep the property
        # that the size of the output is exactly the size of the input
        # divided by the stride (here hop_length), when divisible.
        # This is achieved by padding by 1/4th of the kernel size (here nfft),
        # which is not supported natively by torch.stft.
        # Having all convolution operations follow this convention makes it
        # easy to align the time and frequency branches later on.
        assert hl == nfft // 4
        le = int(math.ceil(x.shape[-1] / hl))
        pad = hl // 2 * 3
        x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect")

        z = spectro(x, nfft, hl)[..., :-1, :]
        assert z.shape[-1] == le + 4, (z.shape, x.shape, le)
        z = z[..., 2 : 2 + le]
        return z

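    # Worked example for `_spec` (added comment, not in the original file):
    # with nfft=4096 and hop hl=1024, an input of 44100 samples gives
    # le = ceil(44100 / 1024) = 44 and pad = 3 * 1024 // 2 = 1536, so the padded
    # length is 44 * 1024 + 2 * 1536 = 48128 samples. A centered STFT then
    # yields 48128 // 1024 + 1 = 48 = le + 4 frames, and dropping the first and
    # last 2 frames leaves exactly le = ceil(length / hop) frames.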
    def _ispec(self, z, length=None, scale=0):
        hl = self.hop_length // (4**scale)
        z = F.pad(z, (0, 0, 0, 1))
        z = F.pad(z, (2, 2))
        pad = hl // 2 * 3
        le = hl * int(math.ceil(length / hl)) + 2 * pad
        x = ispectro(z, hl, length=le)
        x = x[..., pad : pad + length]
        return x

    def _magnitude(self, z):
        # return the magnitude of the spectrogram, except when cac is True,
        # in which case we just move the complex dimension to the channel one.
        if self.cac:
            B, C, Fr, T = z.shape
            m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
            m = m.reshape(B, C * 2, Fr, T)
        else:
            m = z.abs()
        return m

    def _mask(self, z, m):
        # Apply masking given the mixture spectrogram `z` and the estimated mask `m`.
        # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored.
        niters = self.wiener_iters
        if self.cac:
            B, S, C, Fr, T = m.shape
            out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
            out = torch.view_as_complex(out.contiguous())
            return out
        if self.training:
            niters = self.end_iters
        if niters < 0:
            z = z[:, None]
            return z / (1e-8 + z.abs()) * m
        else:
            return self._wiener(m, z, niters)

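    # Note (added comment, not in the original file): `_wiener` below applies
    # the OpenUnmix expectation-maximization Wiener filter in chunks of
    # `wiener_win_len` = 300 STFT frames at a time, which bounds the memory
    # used by the EM iterations on long inputs; the chunks are concatenated
    # back along the time axis afterwards.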
    def _wiener(self, mag_out, mix_stft, niters):
        # apply wiener filtering from OpenUnmix.
        init = mix_stft.dtype
        wiener_win_len = 300
        residual = self.wiener_residual

        B, S, C, Fq, T = mag_out.shape
        mag_out = mag_out.permute(0, 4, 3, 2, 1)
        mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1))

        outs = []
        for sample in range(B):
            pos = 0
            out = []
            for pos in range(0, T, wiener_win_len):
                frame = slice(pos, pos + wiener_win_len)
                z_out = wiener(mag_out[sample, frame], mix_stft[sample, frame], niters, residual=residual)
                out.append(z_out.transpose(-1, -2))
            outs.append(torch.cat(out, dim=0))
        out = torch.view_as_complex(torch.stack(outs, 0))
        out = out.permute(0, 4, 3, 2, 1).contiguous()
        if residual:
            out = out[:, :-1]
        assert list(out.shape) == [B, S, C, Fq, T]
        return out.to(init)

    def valid_length(self, length: int):
        """
        Return a length that is appropriate for evaluation.
        In our case, always return the training length, unless
        it is smaller than the given length, in which case this
        raises an error.
        """
        if not self.use_train_segment:
            return length
        training_length = int(self.segment * self.samplerate)
        if training_length < length:
            raise ValueError(f"Given length {length} is longer than training length {training_length}")
        return training_length

    def forward(self, mix):
        length = mix.shape[-1]
        length_pre_pad = None
        if self.use_train_segment:
            if self.training:
                self.segment = Fraction(mix.shape[-1], self.samplerate)
            else:
                training_length = int(self.segment * self.samplerate)
                if mix.shape[-1] < training_length:
                    length_pre_pad = mix.shape[-1]
                    mix = F.pad(mix, (0, training_length - length_pre_pad))
        z = self._spec(mix)
        mag = self._magnitude(z).to(mix.device)
        x = mag

        B, C, Fq, T = x.shape

        # unlike previous Demucs, we always normalize because it is easier.
        mean = x.mean(dim=(1, 2, 3), keepdim=True)
        std = x.std(dim=(1, 2, 3), keepdim=True)
        x = (x - mean) / (1e-5 + std)
        # x will be the freq. branch input.

        # Prepare the time branch input.
        xt = mix
        meant = xt.mean(dim=(1, 2), keepdim=True)
        stdt = xt.std(dim=(1, 2), keepdim=True)
        xt = (xt - meant) / (1e-5 + stdt)

        # okay, this is a giant mess I know...
        saved = []  # skip connections, freq.
        saved_t = []  # skip connections, time.
        lengths = []  # saved lengths to properly remove padding, freq branch.
        lengths_t = []  # saved lengths for time branch.
        for idx, encode in enumerate(self.encoder):
            lengths.append(x.shape[-1])
            inject = None
            if idx < len(self.tencoder):
                # we have not yet merged branches.
                lengths_t.append(xt.shape[-1])
                tenc = self.tencoder[idx]
                xt = tenc(xt)
                if not tenc.empty:
                    # save for skip connection
                    saved_t.append(xt)
                else:
                    # tenc contains just the first conv., so that now time and freq.
                    # branches have the same shape and can be merged.
                    inject = xt
            x = encode(x, inject)
            if idx == 0 and self.freq_emb is not None:
                # add frequency embedding to allow for non equivariant convolutions
                # over the frequency axis.
                frs = torch.arange(x.shape[-2], device=x.device)
                emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
                x = x + self.freq_emb_scale * emb

            saved.append(x)
        if self.crosstransformer:
            if self.bottom_channels:
                b, c, f, t = x.shape
                x = rearrange(x, "b c f t-> b c (f t)")
                x = self.channel_upsampler(x)
                x = rearrange(x, "b c (f t)-> b c f t", f=f)
                xt = self.channel_upsampler_t(xt)

            x, xt = self.crosstransformer(x, xt)

            if self.bottom_channels:
                x = rearrange(x, "b c f t-> b c (f t)")
                x = self.channel_downsampler(x)
                x = rearrange(x, "b c (f t)-> b c f t", f=f)
                xt = self.channel_downsampler_t(xt)

        for idx, decode in enumerate(self.decoder):
            skip = saved.pop(-1)
            x, pre = decode(x, skip, lengths.pop(-1))
            # `pre` contains the output just before final transposed convolution,
            # which is used when the freq. and time branch separate.

            offset = self.depth - len(self.tdecoder)
            if idx >= offset:
                tdec = self.tdecoder[idx - offset]
                length_t = lengths_t.pop(-1)
                if tdec.empty:
                    assert pre.shape[2] == 1, pre.shape
                    pre = pre[:, :, 0]
                    xt, _ = tdec(pre, None, length_t)
                else:
                    skip = saved_t.pop(-1)
                    xt, _ = tdec(xt, skip, length_t)

        # Let's make sure we used all stored skip connections.
        assert len(saved) == 0
        assert len(lengths_t) == 0
        assert len(saved_t) == 0

        S = len(self.sources)
        x = x.view(B, S, -1, Fq, T)
        x = x * std[:, None] + mean[:, None]

        # Move to CPU, as non-CUDA GPUs don't support complex numbers yet
        # (demucs issues #435 and #432).
        # NOTE: in this case z already is on cpu.
        # TODO: remove this when mps supports complex numbers.

        device_type = x.device.type
        device_load = f"{device_type}:{x.device.index}" if device_type != "mps" else device_type
        x_is_other_gpu = device_type not in ["cuda", "cpu"]

        if x_is_other_gpu:
            x = x.cpu()

        zout = self._mask(z, x)
        if self.use_train_segment:
            if self.training:
                x = self._ispec(zout, length)
            else:
                x = self._ispec(zout, training_length)
        else:
            x = self._ispec(zout, length)

        # back to other device
        if x_is_other_gpu:
            x = x.to(device_load)

        if self.use_train_segment:
            if self.training:
                xt = xt.view(B, S, -1, length)
            else:
                xt = xt.view(B, S, -1, training_length)
        else:
            xt = xt.view(B, S, -1, length)
        xt = xt * stdt[:, None] + meant[:, None]
        x = xt + x
        if length_pre_pad:
            x = x[..., :length_pre_pad]
        return x
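

# Illustrative usage sketch (added comment, not part of the original file),
# assuming the `HTDemucs` class defined above and a randomly initialized
# model; real weights come from the pretrained/repo modules below. In eval
# mode, inputs shorter than the training segment are padded internally and
# the output is trimmed back to the input length:
#
#     model = HTDemucs(sources=["drums", "bass", "other", "vocals"])
#     model.eval()
#     wav = torch.randn(1, 2, 44100 * 5)   # 5 seconds of stereo audio
#     with torch.no_grad():
#         out = model(wav)                  # -> (1, 4, 2, 44100 * 5)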
audio_separator/separator/uvr_lib_v5/demucs/model.py
ADDED
@@ -0,0 +1,204 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch as th
from torch import nn

from .utils import capture_init, center_trim


class BLSTM(nn.Module):
    def __init__(self, dim, layers=1):
        super().__init__()
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        return x

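# Note on the rescaling trick below (added comment, not in the original file):
# dividing the weights by scale = sqrt(std / reference) moves their standard
# deviation to sqrt(std * reference), i.e. halfway (in log scale) between the
# current value and `reference`. For example std=0.4, reference=0.1 gives
# scale=2 and a new std of 0.2.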
def rescale_conv(conv, reference):
    std = conv.weight.std().detach()
    scale = (std / reference) ** 0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
            rescale_conv(sub, reference)


def upsample(x, stride):
    """
    Linear upsampling, the output will be `stride` times longer.
    """
    batch, channels, time = x.size()
    weight = th.arange(stride, device=x.device, dtype=th.float) / stride
    x = x.view(batch, channels, time, 1)
    out = x[..., :-1, :] * (1 - weight) + x[..., 1:, :] * weight
    return out.reshape(batch, channels, -1)


def downsample(x, stride):
    """
    Downsample x by decimation.
    """
    return x[:, :, ::stride]


class Demucs(nn.Module):
    @capture_init
    def __init__(
        self, sources=4, audio_channels=2, channels=64, depth=6, rewrite=True, glu=True, upsample=False, rescale=0.1, kernel_size=8, stride=4, growth=2.0, lstm_layers=2, context=3, samplerate=44100
    ):
        """
        Args:
            sources (int): number of sources to separate
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            rewrite (bool): add 1x1 convolution to each encoder layer
                and a convolution to each decoder layer.
                For the decoder layer, `context` gives the kernel size.
            glu (bool): use glu instead of ReLU
            upsample (bool): use linear upsampling with convolutions
                Wave-U-Net style, instead of transposed convolutions
            rescale (int): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            growth (float): multiply (resp divide) number of channels by that
                for each layer of the encoder (resp decoder)
            lstm_layers (int): number of lstm layers, 0 = no lstm
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time
                steps.
        """

        super().__init__()
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.upsample = upsample
        self.channels = channels
        self.samplerate = samplerate

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        self.final = None
        if upsample:
            self.final = nn.Conv1d(channels + audio_channels, sources * audio_channels, 1)
            stride = 1

        if glu:
            activation = nn.GLU(dim=1)
            ch_scale = 2
        else:
            activation = nn.ReLU()
            ch_scale = 1
        in_channels = audio_channels
        for index in range(depth):
            encode = []
            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
            if rewrite:
                encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
            self.encoder.append(nn.Sequential(*encode))

            decode = []
            if index > 0:
                out_channels = in_channels
            else:
                if upsample:
                    out_channels = channels
                else:
                    out_channels = sources * audio_channels
            if rewrite:
                decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
            if upsample:
                decode += [nn.Conv1d(channels, out_channels, kernel_size, stride=1)]
            else:
                decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
            if index > 0:
                decode.append(nn.ReLU())
            self.decoder.insert(0, nn.Sequential(*decode))
            in_channels = channels
            channels = int(growth * channels)

        channels = in_channels

        if lstm_layers:
            self.lstm = BLSTM(channels, lstm_layers)
        else:
            self.lstm = None

        if rescale:
            rescale_module(self, reference=rescale)

    def valid_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        there are no time steps left over in the convolutions, i.e. for all
        layers, (size of the input - kernel_size) % stride == 0.

        If the mixture has a valid length, the estimated sources
        will have exactly the same length when context = 1. If context > 1,
        the two signals can be center trimmed to match.

        For training, extracts should have a valid length. For evaluation
        on full tracks we recommend passing `pad = True` to :method:`forward`.
        """
        for _ in range(self.depth):
            if self.upsample:
                length = math.ceil(length / self.stride) + self.kernel_size - 1
            else:
                length = math.ceil((length - self.kernel_size) / self.stride) + 1
            length = max(1, length)
            length += self.context - 1
        for _ in range(self.depth):
            if self.upsample:
                length = length * self.stride + self.kernel_size - 1
            else:
                length = (length - 1) * self.stride + self.kernel_size

        return int(length)

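    # Worked example for `valid_length` (added comment, not in the original
    # file): with the defaults depth=6, kernel_size=8, stride=4, context=3 and
    # upsample=False, an input of 44100 samples shrinks to 13 frames through
    # the encoder loop (which also adds context - 1 = 2 per layer) and expands
    # back through the decoder pass, so valid_length(44100) == 58708.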
    def forward(self, mix):
        x = mix
        saved = [x]
        for encode in self.encoder:
            x = encode(x)
            saved.append(x)
            if self.upsample:
                x = downsample(x, self.stride)
        if self.lstm:
            x = self.lstm(x)
        for decode in self.decoder:
            if self.upsample:
                x = upsample(x, stride=self.stride)
            skip = center_trim(saved.pop(-1), x)
            x = x + skip
            x = decode(x)
        if self.final:
            skip = center_trim(saved.pop(-1), x)
            x = th.cat([x, skip], dim=1)
            x = self.final(x)

        x = x.view(x.size(0), self.sources, self.audio_channels, x.size(-1))
        return x

audio_separator/separator/uvr_lib_v5/demucs/model_v2.py
ADDED
@@ -0,0 +1,222 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import julius
from torch import nn
from .tasnet_v2 import ConvTasNet

from .utils import capture_init, center_trim


class BLSTM(nn.Module):
    def __init__(self, dim, layers=1):
        super().__init__()
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        return x


def rescale_conv(conv, reference):
    std = conv.weight.std().detach()
    scale = (std / reference) ** 0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
            rescale_conv(sub, reference)


def auto_load_demucs_model_v2(sources, demucs_model_name):

    if "48" in demucs_model_name:
        channels = 48
    elif "unittest" in demucs_model_name:
        channels = 4
    else:
        channels = 64

    if "tasnet" in demucs_model_name:
        init_demucs_model = ConvTasNet(sources, X=10)
    else:
        init_demucs_model = Demucs(sources, channels=channels)

    return init_demucs_model


class Demucs(nn.Module):
    @capture_init
    def __init__(
        self,
        sources,
        audio_channels=2,
        channels=64,
        depth=6,
        rewrite=True,
        glu=True,
        rescale=0.1,
        resample=True,
        kernel_size=8,
        stride=4,
        growth=2.0,
        lstm_layers=2,
        context=3,
        normalize=False,
        samplerate=44100,
        segment_length=4 * 10 * 44100,
    ):
        """
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            rewrite (bool): add 1x1 convolution to each encoder layer
                and a convolution to each decoder layer.
                For the decoder layer, `context` gives the kernel size.
            glu (bool): use glu instead of ReLU
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (int): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            growth (float): multiply (resp divide) number of channels by that
                for each layer of the encoder (resp decoder)
            lstm_layers (int): number of lstm layers, 0 = no lstm
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time
                steps.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment_length (int): stored as meta information for easing
                future evaluations of the model. Length of the segments on which
                the model was trained.
        """

        super().__init__()
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.resample = resample
        self.channels = channels
        self.normalize = normalize
        self.samplerate = samplerate
        self.segment_length = segment_length

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        if glu:
            activation = nn.GLU(dim=1)
            ch_scale = 2
        else:
            activation = nn.ReLU()
            ch_scale = 1
        in_channels = audio_channels
        for index in range(depth):
            encode = []
            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
            if rewrite:
                encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
            self.encoder.append(nn.Sequential(*encode))

            decode = []
            if index > 0:
                out_channels = in_channels
            else:
                out_channels = len(self.sources) * audio_channels
            if rewrite:
                decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
            decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
            if index > 0:
                decode.append(nn.ReLU())
            self.decoder.insert(0, nn.Sequential(*decode))
            in_channels = channels
            channels = int(growth * channels)

        channels = in_channels

        if lstm_layers:
            self.lstm = BLSTM(channels, lstm_layers)
        else:
            self.lstm = None

        if rescale:
            rescale_module(self, reference=rescale)

    def valid_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        there are no time steps left over in the convolutions, i.e. for all
        layers, (size of the input - kernel_size) % stride == 0.

        If the mixture has a valid length, the estimated sources
        will have exactly the same length when context = 1. If context > 1,
        the two signals can be center trimmed to match.

        For training, extracts should have a valid length. For evaluation
        on full tracks we recommend passing `pad = True` to :method:`forward`.
        """
        if self.resample:
            length *= 2
        for _ in range(self.depth):
            length = math.ceil((length - self.kernel_size) / self.stride) + 1
            length = max(1, length)
            length += self.context - 1
        for _ in range(self.depth):
            length = (length - 1) * self.stride + self.kernel_size

        if self.resample:
            length = math.ceil(length / 2)
        return int(length)

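    # Note on `forward` below (added comment, not in the original file): when
    # `normalize` is True, the input is standardized with mono statistics and
    # the estimates are de-normalized at the end; when `resample` is True, the
    # waveform is upsampled x2 with julius before the network and downsampled
    # back afterwards, mirroring the classic Demucs design (the hybrid
    # HTDemucs model above drops resampling entirely).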
    def forward(self, mix):
        x = mix

        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
        else:
            mean = 0
            std = 1

        x = (x - mean) / (1e-5 + std)

        if self.resample:
            x = julius.resample_frac(x, 1, 2)

        saved = []
        for encode in self.encoder:
            x = encode(x)
            saved.append(x)
        if self.lstm:
            x = self.lstm(x)
        for decode in self.decoder:
            skip = center_trim(saved.pop(-1), x)
            x = x + skip
            x = decode(x)

        if self.resample:
            x = julius.resample_frac(x, 2, 1)
        x = x * std + mean
        x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
        return x

audio_separator/separator/uvr_lib_v5/demucs/pretrained.py
ADDED
@@ -0,0 +1,181 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Loading pretrained models.
"""

import logging
from pathlib import Path
import typing as tp

# from dora.log import fatal

from diffq import DiffQuantizer
import torch.hub

from .model import Demucs
from .tasnet_v2 import ConvTasNet
from .utils import set_state

from .hdemucs import HDemucs
from .repo import RemoteRepo, LocalRepo, ModelOnlyRepo, BagOnlyRepo, AnyModelRepo, ModelLoadingError  # noqa

logger = logging.getLogger(__name__)
ROOT_URL = "https://dl.fbaipublicfiles.com/demucs/mdx_final/"
REMOTE_ROOT = Path(__file__).parent / "remote"

SOURCES = ["drums", "bass", "other", "vocals"]


def fatal(msg):
    # Minimal stand-in for `dora.log.fatal` (import commented out above),
    # so that `get_model` below does not raise a NameError.
    raise SystemExit(msg)


def demucs_unittest():
    model = HDemucs(channels=4, sources=SOURCES)
    return model


def add_model_flags(parser):
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument("-s", "--sig", help="Locally trained XP signature.")
    group.add_argument("-n", "--name", default="mdx_extra_q", help="Pretrained model name or signature. Default is mdx_extra_q.")
    parser.add_argument("--repo", type=Path, help="Folder containing all pre-trained models for use with -n.")


def _parse_remote_files(remote_file_list) -> tp.Dict[str, str]:
    root: str = ""
    models: tp.Dict[str, str] = {}
    for line in remote_file_list.read_text().split("\n"):
        line = line.strip()
        if line.startswith("#"):
            continue
        elif line.startswith("root:"):
            root = line.split(":", 1)[1].strip()
        else:
            sig = line.split("-", 1)[0]
            assert sig not in models
            models[sig] = ROOT_URL + root + line
    return models

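# Expected format of the remote file list parsed above (added comment, not in
# the original file; the names below are hypothetical):
#
#     # comments start with '#'
#     root: v4/
#     0d19c1c6-0f06f20e.th
#
# which would map signature "0d19c1c6" to ROOT_URL + "v4/0d19c1c6-0f06f20e.th".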
def get_model(name: str, repo: tp.Optional[Path] = None):
    """`name` must be a bag of models name or a pretrained signature
    from the remote AWS model repo, or from the specified local repo if `repo` is not None.
    """
    if name == "demucs_unittest":
        return demucs_unittest()
    model_repo: ModelOnlyRepo
    if repo is None:
        models = _parse_remote_files(REMOTE_ROOT / "files.txt")
        model_repo = RemoteRepo(models)
        bag_repo = BagOnlyRepo(REMOTE_ROOT, model_repo)
    else:
        if not repo.is_dir():
            fatal(f"{repo} must exist and be a directory.")
        model_repo = LocalRepo(repo)
        bag_repo = BagOnlyRepo(repo, model_repo)
    any_repo = AnyModelRepo(model_repo, bag_repo)
    model = any_repo.get_model(name)
    model.eval()
    return model


def get_model_from_args(args):
    """
    Load local model package or pre-trained model.
    """
    return get_model(name=args.name, repo=args.repo)


ROOT = "https://dl.fbaipublicfiles.com/demucs/v3.0/"

PRETRAINED_MODELS = {
    "demucs": "e07c671f",
    "demucs48_hq": "28a1282c",
    "demucs_extra": "3646af93",
    "demucs_quantized": "07afea75",
    "tasnet": "beb46fac",
    "tasnet_extra": "df3777b2",
    "demucs_unittest": "09ebc15f",
}


def get_url(name):
    sig = PRETRAINED_MODELS[name]
    return ROOT + name + "-" + sig[:8] + ".th"

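# Example (added comment, not in the original file):
# get_url("demucs_extra") returns
# "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th",
# i.e. ROOT + name + "-" + first 8 chars of the signature + ".th".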
def is_pretrained(name):
    return name in PRETRAINED_MODELS


def load_pretrained(name):
    if name == "demucs":
        return demucs(pretrained=True)
    elif name == "demucs48_hq":
        return demucs(pretrained=True, hq=True, channels=48)
    elif name == "demucs_extra":
        return demucs(pretrained=True, extra=True)
    elif name == "demucs_quantized":
        return demucs(pretrained=True, quantized=True)
    elif name == "demucs_unittest":
        return demucs_unittest(pretrained=True)
    elif name == "tasnet":
        return tasnet(pretrained=True)
    elif name == "tasnet_extra":
        return tasnet(pretrained=True, extra=True)
    else:
        raise ValueError(f"Invalid pretrained name {name}")


def _load_state(name, model, quantizer=None):
    url = get_url(name)
    state = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=True)
    set_state(model, quantizer, state)
    if quantizer:
        quantizer.detach()


def demucs_unittest(pretrained=True):
    model = Demucs(channels=4, sources=SOURCES)
    if pretrained:
        _load_state("demucs_unittest", model)
    return model


def demucs(pretrained=True, extra=False, quantized=False, hq=False, channels=64):
    if not pretrained and (extra or quantized or hq):
        raise ValueError("if extra or quantized is True, pretrained must be True.")
    model = Demucs(sources=SOURCES, channels=channels)
    if pretrained:
        name = "demucs"
        if channels != 64:
            name += str(channels)
        quantizer = None
        if sum([extra, quantized, hq]) > 1:
            raise ValueError("Only one of extra, quantized, hq, can be True.")
        if quantized:
            quantizer = DiffQuantizer(model, group_size=8, min_size=1)
            name += "_quantized"
        if extra:
            name += "_extra"
        if hq:
            name += "_hq"
        _load_state(name, model, quantizer)
    return model


def tasnet(pretrained=True, extra=False):
    if not pretrained and extra:
        raise ValueError("if extra is True, pretrained must be True.")
    model = ConvTasNet(X=10, sources=SOURCES)
    if pretrained:
        name = "tasnet"
        if extra:
            name = "tasnet_extra"
        _load_state(name, model)
    return model

audio_separator/separator/uvr_lib_v5/demucs/repo.py
ADDED
@@ -0,0 +1,146 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Represents a model repository, including pre-trained models and bags of models.
A repo can either be the main remote repository stored in AWS, or a local repository
with your own models.
"""

from hashlib import sha256
from pathlib import Path
import typing as tp

import torch
import yaml

from .apply import BagOfModels, Model
from .states import load_model


AnyModel = tp.Union[Model, BagOfModels]


class ModelLoadingError(RuntimeError):
    pass


def check_checksum(path: Path, checksum: str):
    sha = sha256()
    with open(path, "rb") as file:
        while True:
            buf = file.read(2**20)
            if not buf:
                break
            sha.update(buf)
    actual_checksum = sha.hexdigest()[: len(checksum)]
    if actual_checksum != checksum:
        raise ModelLoadingError(f"Invalid checksum for file {path}, expected {checksum} but got {actual_checksum}")

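# Note (added comment, not in the original file): `check_checksum` compares
# only a prefix of the sha256 digest, truncated to the length of the expected
# checksum. Local model files named "<signature>-<checksum>.th" (see
# LocalRepo.scan below) typically embed an 8-character prefix, which is
# enough to catch corrupted downloads.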
class ModelOnlyRepo:
    """Base class for all model only repos."""

    def has_model(self, sig: str) -> bool:
        raise NotImplementedError()

    def get_model(self, sig: str) -> Model:
        raise NotImplementedError()


class RemoteRepo(ModelOnlyRepo):
    def __init__(self, models: tp.Dict[str, str]):
        self._models = models

    def has_model(self, sig: str) -> bool:
        return sig in self._models

    def get_model(self, sig: str) -> Model:
        try:
            url = self._models[sig]
        except KeyError:
            raise ModelLoadingError(f"Could not find a pre-trained model with signature {sig}.")
        pkg = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=True)
        return load_model(pkg)


class LocalRepo(ModelOnlyRepo):
    def __init__(self, root: Path):
        self.root = root
        self.scan()

    def scan(self):
        self._models = {}
        self._checksums = {}
        for file in self.root.iterdir():
            if file.suffix == ".th":
                if "-" in file.stem:
                    xp_sig, checksum = file.stem.split("-")
                    self._checksums[xp_sig] = checksum
                else:
                    xp_sig = file.stem
                if xp_sig in self._models:
                    raise ModelLoadingError(f"Duplicate pre-trained model exists for signature {xp_sig}. Please delete all but one.")
                self._models[xp_sig] = file

    def has_model(self, sig: str) -> bool:
        return sig in self._models

    def get_model(self, sig: str) -> Model:
        try:
            file = self._models[sig]
        except KeyError:
            raise ModelLoadingError(f"Could not find pre-trained model with signature {sig}.")
        if sig in self._checksums:
            check_checksum(file, self._checksums[sig])
        return load_model(file)


class BagOnlyRepo:
    """Handles only YAML files containing bag of models, leaving the actual
    model loading to some Repo.
    """

    def __init__(self, root: Path, model_repo: ModelOnlyRepo):
        self.root = root
        self.model_repo = model_repo
        self.scan()

    def scan(self):
        self._bags = {}
        for file in self.root.iterdir():
            if file.suffix == ".yaml":
                self._bags[file.stem] = file

    def has_model(self, name: str) -> bool:
        return name in self._bags

    def get_model(self, name: str) -> BagOfModels:
        try:
            yaml_file = self._bags[name]
        except KeyError:
            raise ModelLoadingError(f"{name} is neither a single pre-trained model nor a bag of models.")
        bag = yaml.safe_load(open(yaml_file))
        signatures = bag["models"]
        models = [self.model_repo.get_model(sig) for sig in signatures]
        weights = bag.get("weights")
        segment = bag.get("segment")
        return BagOfModels(models, weights, segment)


class AnyModelRepo:
    def __init__(self, model_repo: ModelOnlyRepo, bag_repo: BagOnlyRepo):
        self.model_repo = model_repo
        self.bag_repo = bag_repo

    def has_model(self, name_or_sig: str) -> bool:
        return self.model_repo.has_model(name_or_sig) or self.bag_repo.has_model(name_or_sig)

    def get_model(self, name_or_sig: str) -> AnyModel:
        if self.model_repo.has_model(name_or_sig):
            return self.model_repo.get_model(name_or_sig)
        else:
            return self.bag_repo.get_model(name_or_sig)

audio_separator/separator/uvr_lib_v5/demucs/spec.py
ADDED
@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Convenience wrapper to perform STFT and iSTFT"""

import torch as th


def spectro(x, n_fft=512, hop_length=None, pad=0):
    *other, length = x.shape
    x = x.reshape(-1, length)

    device_type = x.device.type
    is_other_gpu = device_type not in ["cuda", "cpu"]

    if is_other_gpu:
        x = x.cpu()
    z = th.stft(x, n_fft * (1 + pad), hop_length or n_fft // 4, window=th.hann_window(n_fft).to(x), win_length=n_fft, normalized=True, center=True, return_complex=True, pad_mode="reflect")
    _, freqs, frame = z.shape
    return z.view(*other, freqs, frame)

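# Shape example for `spectro` (added comment, not in the original file):
# x of shape (2, 44100) with n_fft=512, hop_length=128 and center=True gives
# z of shape (2, 257, 345): 512 // 2 + 1 = 257 frequency bins and
# 44100 // 128 + 1 = 345 frames.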
def ispectro(z, hop_length=None, length=None, pad=0):
    *other, freqs, frames = z.shape
    n_fft = 2 * freqs - 2
    z = z.view(-1, freqs, frames)
    win_length = n_fft // (1 + pad)

    device_type = z.device.type
    is_other_gpu = device_type not in ["cuda", "cpu"]

    if is_other_gpu:
        z = z.cpu()
    x = th.istft(z, n_fft, hop_length, window=th.hann_window(win_length).to(z.real), win_length=win_length, normalized=True, length=length, center=True)
    _, length = x.shape
    return x.view(*other, length)

audio_separator/separator/uvr_lib_v5/demucs/states.py
ADDED
@@ -0,0 +1,131 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Utilities to save and load models.
"""
from contextlib import contextmanager

import functools
import hashlib
import inspect
import io
from pathlib import Path
import warnings

from diffq import DiffQuantizer, UniformQuantizer, restore_quantized_state
import torch


def get_quantizer(model, args, optimizer=None):
    """Return the quantizer given the XP quantization args."""
    quantizer = None
    if args.diffq:
        quantizer = DiffQuantizer(model, min_size=args.min_size, group_size=args.group_size)
        if optimizer is not None:
            quantizer.setup_optimizer(optimizer)
    elif args.qat:
        quantizer = UniformQuantizer(model, bits=args.qat, min_size=args.min_size)
    return quantizer

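# Note (added comment, not in the original file): the serialized "package"
# loaded below is a dict with keys "klass" (the model class), "args" and
# "kwargs" (the constructor arguments, as captured by `capture_init`) and
# "state" (the weights, possibly quantized).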
def load_model(path_or_package, strict=False):
    """Load a model from the given serialized model, either given as a dict (already loaded)
    or a path to a file on disk."""
    if isinstance(path_or_package, dict):
        package = path_or_package
    elif isinstance(path_or_package, (str, Path)):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            path = path_or_package
            package = torch.load(path, "cpu", weights_only=False)
    else:
        raise ValueError(f"Invalid type for {path_or_package}.")

    klass = package["klass"]
    args = package["args"]
    kwargs = package["kwargs"]

    if strict:
        model = klass(*args, **kwargs)
    else:
        sig = inspect.signature(klass)
        for key in list(kwargs):
            if key not in sig.parameters:
                warnings.warn("Dropping nonexistent parameter " + key)
                del kwargs[key]
        model = klass(*args, **kwargs)

    state = package["state"]

    set_state(model, state)
    return model


def get_state(model, quantizer, half=False):
    """Get the state from a model, potentially with quantization applied.
    If `half` is True, the model is stored in half precision, which shouldn't impact
    performance but halves the state size."""
    if quantizer is None:
        dtype = torch.half if half else None
        state = {k: p.data.to(device="cpu", dtype=dtype) for k, p in model.state_dict().items()}
    else:
        state = quantizer.get_quantized_state()
        state["__quantized"] = True
    return state


def set_state(model, state, quantizer=None):
    """Set the state on a given model."""
    if state.get("__quantized"):
        if quantizer is not None:
            quantizer.restore_quantized_state(model, state["quantized"])
        else:
            restore_quantized_state(model, state)
    else:
        model.load_state_dict(state)
    return state


def save_with_checksum(content, path):
    """Save the given value on disk, along with a sha256 hash.
    Should be used with the output of either `serialize_model` or `get_state`."""
    buf = io.BytesIO()
    torch.save(content, buf)
    sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8]

    path = path.parent / (path.stem + "-" + sig + path.suffix)
    path.write_bytes(buf.getvalue())


def copy_state(state):
    return {k: v.cpu().clone() for k, v in state.items()}


@contextmanager
def swap_state(model, state):
    """
    Context manager that swaps the state of a model, e.g:

        # model is in old state
        with swap_state(model, new_state):
            # model in new state
        # model back to old state
    """
    old_state = copy_state(model.state_dict())
    model.load_state_dict(state, strict=False)
    try:
        yield
    finally:
        model.load_state_dict(old_state)

|
126 |
+
@functools.wraps(init)
|
127 |
+
def __init__(self, *args, **kwargs):
|
128 |
+
self._init_args_kwargs = (args, kwargs)
|
129 |
+
init(self, *args, **kwargs)
|
130 |
+
|
131 |
+
return __init__
|
audio_separator/separator/uvr_lib_v5/demucs/tasnet.py
ADDED
@@ -0,0 +1,401 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Created on 2018/12
# Author: Kaituo XU
# Modified on 2019/11 by Alexandre Defossez, added support for multiple output channels
# Here is the original license:
# The MIT License (MIT)
#
# Copyright (c) 2018 Kaituo XU
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import capture_init

EPS = 1e-8


def overlap_and_add(signal, frame_step):
    outer_dimensions = signal.size()[:-2]
    frames, frame_length = signal.size()[-2:]

    subframe_length = math.gcd(frame_length, frame_step)  # gcd = greatest common divisor
    subframe_step = frame_step // subframe_length
    subframes_per_frame = frame_length // subframe_length
    output_size = frame_step * (frames - 1) + frame_length
    output_subframes = output_size // subframe_length

    subframe_signal = signal.view(*outer_dimensions, -1, subframe_length)

    frame = torch.arange(0, output_subframes, device=signal.device).unfold(0, subframes_per_frame, subframe_step)
    frame = frame.long()  # the signal may live on GPU or CPU
    frame = frame.contiguous().view(-1)

    result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
    result.index_add_(-2, frame, subframe_signal)
    result = result.view(*outer_dimensions, -1)
    return result

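# An illustrative worked example (values chosen for this sketch, not from the
# original file): three ones-filled frames of length 4, hopped by frame_step=2,
# reconstruct output_size = 2 * (3 - 1) + 4 = 8 samples:
#     overlap_and_add(torch.ones(1, 3, 4), 2)
#     -> tensor([[1., 1., 2., 2., 2., 2., 1., 1.]])
# Each interior sample is covered by exactly two overlapping frames, so it sums to 2.
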
class ConvTasNet(nn.Module):
    @capture_init
    def __init__(self, N=256, L=20, B=256, H=512, P=3, X=8, R=4, C=4, audio_channels=1, samplerate=44100, norm_type="gLN", causal=False, mask_nonlinear="relu"):
        """
        Args:
            N: Number of filters in autoencoder
            L: Length of the filters (in samples)
            B: Number of channels in bottleneck 1 × 1-conv block
            H: Number of channels in convolutional blocks
            P: Kernel size in convolutional blocks
            X: Number of convolutional blocks in each repeat
            R: Number of repeats
            C: Number of speakers
            norm_type: BN, gLN, cLN
            causal: causal or non-causal
            mask_nonlinear: which non-linearity to use to generate the mask
        """
        super(ConvTasNet, self).__init__()
        # Hyper-parameters
        self.N, self.L, self.B, self.H, self.P, self.X, self.R, self.C = N, L, B, H, P, X, R, C
        self.norm_type = norm_type
        self.causal = causal
        self.mask_nonlinear = mask_nonlinear
        self.audio_channels = audio_channels
        self.samplerate = samplerate
        # Components
        self.encoder = Encoder(L, N, audio_channels)
        self.separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type, causal, mask_nonlinear)
        self.decoder = Decoder(N, L, audio_channels)
        # init
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_normal_(p)

    def valid_length(self, length):
        return length

    def forward(self, mixture):
        """
        Args:
            mixture: [M, audio_channels, T], M is batch size, T is #samples
        Returns:
            est_source: [M, C, audio_channels, T]
        """
        mixture_w = self.encoder(mixture)
        est_mask = self.separator(mixture_w)
        est_source = self.decoder(mixture_w, est_mask)

        # T changed after conv1d in the encoder, fix it here
        T_origin = mixture.size(-1)
        T_conv = est_source.size(-1)
        est_source = F.pad(est_source, (0, T_origin - T_conv))
        return est_source


class Encoder(nn.Module):
    """Estimation of the nonnegative mixture weight by a 1-D conv layer."""

    def __init__(self, L, N, audio_channels):
        super(Encoder, self).__init__()
        # Hyper-parameters
        self.L, self.N = L, N
        # Components
        # 50% overlap
        self.conv1d_U = nn.Conv1d(audio_channels, N, kernel_size=L, stride=L // 2, bias=False)

    def forward(self, mixture):
        """
        Args:
            mixture: [M, audio_channels, T], M is batch size, T is #samples
        Returns:
            mixture_w: [M, N, K], where K = (T-L)/(L/2)+1 = 2T/L-1
        """
        mixture_w = F.relu(self.conv1d_U(mixture))  # [M, N, K]
        return mixture_w


class Decoder(nn.Module):
    def __init__(self, N, L, audio_channels):
        super(Decoder, self).__init__()
        # Hyper-parameters
        self.N, self.L = N, L
        self.audio_channels = audio_channels
        # Components
        self.basis_signals = nn.Linear(N, audio_channels * L, bias=False)

    def forward(self, mixture_w, est_mask):
        """
        Args:
            mixture_w: [M, N, K]
            est_mask: [M, C, N, K]
        Returns:
            est_source: [M, C, audio_channels, T]
        """
        # D = W * M
        source_w = torch.unsqueeze(mixture_w, 1) * est_mask  # [M, C, N, K]
        source_w = torch.transpose(source_w, 2, 3)  # [M, C, K, N]
        # S = DV
        est_source = self.basis_signals(source_w)  # [M, C, K, ac * L]
        m, c, k, _ = est_source.size()
        est_source = est_source.view(m, c, k, self.audio_channels, -1).transpose(2, 3).contiguous()
        est_source = overlap_and_add(est_source, self.L // 2)  # M x C x ac x T
        return est_source


class TemporalConvNet(nn.Module):
    def __init__(self, N, B, H, P, X, R, C, norm_type="gLN", causal=False, mask_nonlinear="relu"):
        """
        Args:
            N: Number of filters in autoencoder
            B: Number of channels in bottleneck 1 × 1-conv block
            H: Number of channels in convolutional blocks
            P: Kernel size in convolutional blocks
            X: Number of convolutional blocks in each repeat
            R: Number of repeats
            C: Number of speakers
            norm_type: BN, gLN, cLN
            causal: causal or non-causal
            mask_nonlinear: which non-linearity to use to generate the mask
        """
        super(TemporalConvNet, self).__init__()
        # Hyper-parameters
        self.C = C
        self.mask_nonlinear = mask_nonlinear
        # Components
        # [M, N, K] -> [M, N, K]
        layer_norm = ChannelwiseLayerNorm(N)
        # [M, N, K] -> [M, B, K]
        bottleneck_conv1x1 = nn.Conv1d(N, B, 1, bias=False)
        # [M, B, K] -> [M, B, K]
        repeats = []
        for r in range(R):
            blocks = []
            for x in range(X):
                dilation = 2**x
                padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2
                blocks += [TemporalBlock(B, H, P, stride=1, padding=padding, dilation=dilation, norm_type=norm_type, causal=causal)]
            repeats += [nn.Sequential(*blocks)]
        temporal_conv_net = nn.Sequential(*repeats)
        # [M, B, K] -> [M, C*N, K]
        mask_conv1x1 = nn.Conv1d(B, C * N, 1, bias=False)
        # Put together
        self.network = nn.Sequential(layer_norm, bottleneck_conv1x1, temporal_conv_net, mask_conv1x1)

    def forward(self, mixture_w):
        """
        Keep this API the same as TasNet
        Args:
            mixture_w: [M, N, K], M is batch size
        Returns:
            est_mask: [M, C, N, K]
        """
        M, N, K = mixture_w.size()
        score = self.network(mixture_w)  # [M, N, K] -> [M, C*N, K]
        score = score.view(M, self.C, N, K)  # [M, C*N, K] -> [M, C, N, K]
        if self.mask_nonlinear == "softmax":
            est_mask = F.softmax(score, dim=1)
        elif self.mask_nonlinear == "relu":
            est_mask = F.relu(score)
        else:
            raise ValueError("Unsupported mask non-linear function")
        return est_mask


class TemporalBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, norm_type="gLN", causal=False):
        super(TemporalBlock, self).__init__()
        # [M, B, K] -> [M, H, K]
        conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False)
        prelu = nn.PReLU()
        norm = chose_norm(norm_type, out_channels)
        # [M, H, K] -> [M, B, K]
        dsconv = DepthwiseSeparableConv(out_channels, in_channels, kernel_size, stride, padding, dilation, norm_type, causal)
        # Put together
        self.net = nn.Sequential(conv1x1, prelu, norm, dsconv)

    def forward(self, x):
        """
        Args:
            x: [M, B, K]
        Returns:
            [M, B, K]
        """
        residual = x
        out = self.net(x)
        # TODO: this works fine when P = 3, but when P = 2 we may need to pad?
        return out + residual  # omitting F.relu here seems to work better than keeping it
        # return F.relu(out + residual)


class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, norm_type="gLN", causal=False):
        super(DepthwiseSeparableConv, self).__init__()
        # Use the `groups` option to implement a depthwise convolution
        # [M, H, K] -> [M, H, K]
        depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=in_channels, bias=False)
        if causal:
            chomp = Chomp1d(padding)
        prelu = nn.PReLU()
        norm = chose_norm(norm_type, in_channels)
        # [M, H, K] -> [M, B, K]
        pointwise_conv = nn.Conv1d(in_channels, out_channels, 1, bias=False)
        # Put together
        if causal:
            self.net = nn.Sequential(depthwise_conv, chomp, prelu, norm, pointwise_conv)
        else:
            self.net = nn.Sequential(depthwise_conv, prelu, norm, pointwise_conv)

    def forward(self, x):
        """
        Args:
            x: [M, H, K]
        Returns:
            result: [M, B, K]
        """
        return self.net(x)


class Chomp1d(nn.Module):
    """Ensures the output length is the same as the input's."""

    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """
        Args:
            x: [M, H, Kpad]
        Returns:
            [M, H, K]
        """
        return x[:, :, : -self.chomp_size].contiguous()


def chose_norm(norm_type, channel_size):
    """The input of normalization will be (M, C, K), where M is batch size,
    C is channel size and K is sequence length.
    """
    if norm_type == "gLN":
        return GlobalLayerNorm(channel_size)
    elif norm_type == "cLN":
        return ChannelwiseLayerNorm(channel_size)
    elif norm_type == "id":
        return nn.Identity()
    else:  # norm_type == "BN"
        # Given input (M, C, K), nn.BatchNorm1d(C) accumulates statistics
        # along M and K, so this BN usage is correct.
        return nn.BatchNorm1d(channel_size)


# TODO: use nn.LayerNorm to implement cLN and speed it up
class ChannelwiseLayerNorm(nn.Module):
    """Channel-wise Layer Normalization (cLN)"""

    def __init__(self, channel_size):
        super(ChannelwiseLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.reset_parameters()

    def reset_parameters(self):
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        """
        Args:
            y: [M, N, K], M is batch size, N is channel size, K is length
        Returns:
            cLN_y: [M, N, K]
        """
        mean = torch.mean(y, dim=1, keepdim=True)  # [M, 1, K]
        var = torch.var(y, dim=1, keepdim=True, unbiased=False)  # [M, 1, K]
        cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
        return cLN_y


class GlobalLayerNorm(nn.Module):
    """Global Layer Normalization (gLN)"""

    def __init__(self, channel_size):
        super(GlobalLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.reset_parameters()

    def reset_parameters(self):
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        """
        Args:
            y: [M, N, K], M is batch size, N is channel size, K is length
        Returns:
            gLN_y: [M, N, K]
        """
        # TODO: since torch 1.0, torch.mean() supports a list of dims
        mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
        var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
        gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
        return gLN_y


if __name__ == "__main__":
    torch.manual_seed(123)
    M, N, L, T = 2, 3, 4, 12
    K = 2 * T // L - 1
    B, H, P, X, R, C, norm_type, causal = 2, 3, 3, 3, 2, 2, "gLN", False
    # conv1d expects a float input of shape [M, audio_channels, T]
    mixture = torch.randint(3, (M, 1, T)).float()
    # test Encoder
    encoder = Encoder(L, N, audio_channels=1)
    encoder.conv1d_U.weight.data = torch.randint(2, encoder.conv1d_U.weight.size()).float()
    mixture_w = encoder(mixture)
    print("mixture", mixture)
    print("U", encoder.conv1d_U.weight)
    print("mixture_w", mixture_w)
    print("mixture_w size", mixture_w.size())

    # test TemporalConvNet
    separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type=norm_type, causal=causal)
    est_mask = separator(mixture_w)
    print("est_mask", est_mask)

    # test Decoder
    decoder = Decoder(N, L, audio_channels=1)
    est_mask = torch.randint(2, (M, C, N, K)).float()  # [M, C, N, K]
    est_source = decoder(mixture_w, est_mask)
    print("est_source", est_source)

    # test Conv-TasNet
    conv_tasnet = ConvTasNet(N, L, B, H, P, X, R, C, audio_channels=1, norm_type=norm_type)
    est_source = conv_tasnet(mixture)
    print("est_source", est_source)
    print("est_source size", est_source.size())
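
Assuming the module above is importable, a forward pass wires the three stages together. The hyper-parameters below are toy values chosen for this sketch, not the library defaults:

    import torch

    model = ConvTasNet(N=64, L=20, B=64, H=128, P=3, X=4, R=2, C=2, audio_channels=1)
    mix = torch.randn(1, 1, 44100)  # [M, audio_channels, T]: one second at 44.1 kHz
    out = model(mix)
    print(out.shape)  # torch.Size([1, 2, 1, 44100]): C=2 estimated sources, padded back to T

With L=20 the encoder produces K = (44100 - 20) / 10 + 1 = 4409 frames, and the decoder's overlap-and-add maps them back to exactly 44100 samples, so the final `F.pad` is a no-op here.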
audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py
ADDED
@@ -0,0 +1,404 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Created on 2018/12
# Author: Kaituo XU
# Modified on 2019/11 by Alexandre Defossez, added support for multiple output channels
# Here is the original license:
# The MIT License (MIT)
#
# Copyright (c) 2018 Kaituo XU
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import capture_init

EPS = 1e-8


def overlap_and_add(signal, frame_step):
    outer_dimensions = signal.size()[:-2]
    frames, frame_length = signal.size()[-2:]

    subframe_length = math.gcd(frame_length, frame_step)  # gcd = greatest common divisor
    subframe_step = frame_step // subframe_length
    subframes_per_frame = frame_length // subframe_length
    output_size = frame_step * (frames - 1) + frame_length
    output_subframes = output_size // subframe_length

    subframe_signal = signal.view(*outer_dimensions, -1, subframe_length)

    frame = torch.arange(0, output_subframes, device=signal.device).unfold(0, subframes_per_frame, subframe_step)
    frame = frame.long()  # the signal may live on GPU or CPU
    frame = frame.contiguous().view(-1)

    result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
    result.index_add_(-2, frame, subframe_signal)
    result = result.view(*outer_dimensions, -1)
    return result


class ConvTasNet(nn.Module):
    @capture_init
    def __init__(self, sources, N=256, L=20, B=256, H=512, P=3, X=8, R=4, audio_channels=2, norm_type="gLN", causal=False, mask_nonlinear="relu", samplerate=44100, segment_length=44100 * 2 * 4):
        """
        Args:
            sources: list of sources
            N: Number of filters in autoencoder
            L: Length of the filters (in samples)
            B: Number of channels in bottleneck 1 × 1-conv block
            H: Number of channels in convolutional blocks
            P: Kernel size in convolutional blocks
            X: Number of convolutional blocks in each repeat
            R: Number of repeats
            norm_type: BN, gLN, cLN
            causal: causal or non-causal
            mask_nonlinear: which non-linearity to use to generate the mask
        """
        super(ConvTasNet, self).__init__()
        # Hyper-parameters
        self.sources = sources
        self.C = len(sources)
        self.N, self.L, self.B, self.H, self.P, self.X, self.R = N, L, B, H, P, X, R
        self.norm_type = norm_type
        self.causal = causal
        self.mask_nonlinear = mask_nonlinear
        self.audio_channels = audio_channels
        self.samplerate = samplerate
        self.segment_length = segment_length
        # Components
        self.encoder = Encoder(L, N, audio_channels)
        self.separator = TemporalConvNet(N, B, H, P, X, R, self.C, norm_type, causal, mask_nonlinear)
        self.decoder = Decoder(N, L, audio_channels)
        # init
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_normal_(p)

    def valid_length(self, length):
        return length

    def forward(self, mixture):
        """
        Args:
            mixture: [M, audio_channels, T], M is batch size, T is #samples
        Returns:
            est_source: [M, C, audio_channels, T]
        """
        mixture_w = self.encoder(mixture)
        est_mask = self.separator(mixture_w)
        est_source = self.decoder(mixture_w, est_mask)

        # T changed after conv1d in the encoder, fix it here
        T_origin = mixture.size(-1)
        T_conv = est_source.size(-1)
        est_source = F.pad(est_source, (0, T_origin - T_conv))
        return est_source


class Encoder(nn.Module):
    """Estimation of the nonnegative mixture weight by a 1-D conv layer."""

    def __init__(self, L, N, audio_channels):
        super(Encoder, self).__init__()
        # Hyper-parameters
        self.L, self.N = L, N
        # Components
        # 50% overlap
        self.conv1d_U = nn.Conv1d(audio_channels, N, kernel_size=L, stride=L // 2, bias=False)

    def forward(self, mixture):
        """
        Args:
            mixture: [M, audio_channels, T], M is batch size, T is #samples
        Returns:
            mixture_w: [M, N, K], where K = (T-L)/(L/2)+1 = 2T/L-1
        """
        mixture_w = F.relu(self.conv1d_U(mixture))  # [M, N, K]
        return mixture_w


class Decoder(nn.Module):
    def __init__(self, N, L, audio_channels):
        super(Decoder, self).__init__()
        # Hyper-parameters
        self.N, self.L = N, L
        self.audio_channels = audio_channels
        # Components
        self.basis_signals = nn.Linear(N, audio_channels * L, bias=False)

    def forward(self, mixture_w, est_mask):
        """
        Args:
            mixture_w: [M, N, K]
            est_mask: [M, C, N, K]
        Returns:
            est_source: [M, C, audio_channels, T]
        """
        # D = W * M
        source_w = torch.unsqueeze(mixture_w, 1) * est_mask  # [M, C, N, K]
        source_w = torch.transpose(source_w, 2, 3)  # [M, C, K, N]
        # S = DV
        est_source = self.basis_signals(source_w)  # [M, C, K, ac * L]
        m, c, k, _ = est_source.size()
        est_source = est_source.view(m, c, k, self.audio_channels, -1).transpose(2, 3).contiguous()
        est_source = overlap_and_add(est_source, self.L // 2)  # M x C x ac x T
        return est_source


class TemporalConvNet(nn.Module):
    def __init__(self, N, B, H, P, X, R, C, norm_type="gLN", causal=False, mask_nonlinear="relu"):
        """
        Args:
            N: Number of filters in autoencoder
            B: Number of channels in bottleneck 1 × 1-conv block
            H: Number of channels in convolutional blocks
            P: Kernel size in convolutional blocks
            X: Number of convolutional blocks in each repeat
            R: Number of repeats
            C: Number of speakers
            norm_type: BN, gLN, cLN
            causal: causal or non-causal
            mask_nonlinear: which non-linearity to use to generate the mask
        """
        super(TemporalConvNet, self).__init__()
        # Hyper-parameters
        self.C = C
        self.mask_nonlinear = mask_nonlinear
        # Components
        # [M, N, K] -> [M, N, K]
        layer_norm = ChannelwiseLayerNorm(N)
        # [M, N, K] -> [M, B, K]
        bottleneck_conv1x1 = nn.Conv1d(N, B, 1, bias=False)
        # [M, B, K] -> [M, B, K]
        repeats = []
        for r in range(R):
            blocks = []
            for x in range(X):
                dilation = 2**x
                padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2
                blocks += [TemporalBlock(B, H, P, stride=1, padding=padding, dilation=dilation, norm_type=norm_type, causal=causal)]
            repeats += [nn.Sequential(*blocks)]
        temporal_conv_net = nn.Sequential(*repeats)
        # [M, B, K] -> [M, C*N, K]
        mask_conv1x1 = nn.Conv1d(B, C * N, 1, bias=False)
        # Put together
        self.network = nn.Sequential(layer_norm, bottleneck_conv1x1, temporal_conv_net, mask_conv1x1)

    def forward(self, mixture_w):
        """
        Keep this API the same as TasNet
        Args:
            mixture_w: [M, N, K], M is batch size
        Returns:
            est_mask: [M, C, N, K]
        """
        M, N, K = mixture_w.size()
        score = self.network(mixture_w)  # [M, N, K] -> [M, C*N, K]
        score = score.view(M, self.C, N, K)  # [M, C*N, K] -> [M, C, N, K]
        if self.mask_nonlinear == "softmax":
            est_mask = F.softmax(score, dim=1)
        elif self.mask_nonlinear == "relu":
            est_mask = F.relu(score)
        else:
            raise ValueError("Unsupported mask non-linear function")
        return est_mask


class TemporalBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, norm_type="gLN", causal=False):
        super(TemporalBlock, self).__init__()
        # [M, B, K] -> [M, H, K]
        conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False)
        prelu = nn.PReLU()
        norm = chose_norm(norm_type, out_channels)
        # [M, H, K] -> [M, B, K]
        dsconv = DepthwiseSeparableConv(out_channels, in_channels, kernel_size, stride, padding, dilation, norm_type, causal)
        # Put together
        self.net = nn.Sequential(conv1x1, prelu, norm, dsconv)

    def forward(self, x):
        """
        Args:
            x: [M, B, K]
        Returns:
            [M, B, K]
        """
        residual = x
        out = self.net(x)
        # TODO: this works fine when P = 3, but when P = 2 we may need to pad?
        return out + residual  # omitting F.relu here seems to work better than keeping it
        # return F.relu(out + residual)


class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, norm_type="gLN", causal=False):
        super(DepthwiseSeparableConv, self).__init__()
        # Use the `groups` option to implement a depthwise convolution
        # [M, H, K] -> [M, H, K]
        depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=in_channels, bias=False)
        if causal:
            chomp = Chomp1d(padding)
        prelu = nn.PReLU()
        norm = chose_norm(norm_type, in_channels)
        # [M, H, K] -> [M, B, K]
        pointwise_conv = nn.Conv1d(in_channels, out_channels, 1, bias=False)
        # Put together
        if causal:
            self.net = nn.Sequential(depthwise_conv, chomp, prelu, norm, pointwise_conv)
        else:
            self.net = nn.Sequential(depthwise_conv, prelu, norm, pointwise_conv)

    def forward(self, x):
        """
        Args:
            x: [M, H, K]
        Returns:
            result: [M, B, K]
        """
        return self.net(x)


class Chomp1d(nn.Module):
    """Ensures the output length is the same as the input's."""

    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """
        Args:
            x: [M, H, Kpad]
        Returns:
            [M, H, K]
        """
        return x[:, :, : -self.chomp_size].contiguous()


def chose_norm(norm_type, channel_size):
    """The input of normalization will be (M, C, K), where M is batch size,
    C is channel size and K is sequence length.
    """
    if norm_type == "gLN":
        return GlobalLayerNorm(channel_size)
    elif norm_type == "cLN":
        return ChannelwiseLayerNorm(channel_size)
    elif norm_type == "id":
        return nn.Identity()
    else:  # norm_type == "BN"
        # Given input (M, C, K), nn.BatchNorm1d(C) accumulates statistics
        # along M and K, so this BN usage is correct.
        return nn.BatchNorm1d(channel_size)


# TODO: use nn.LayerNorm to implement cLN and speed it up
class ChannelwiseLayerNorm(nn.Module):
    """Channel-wise Layer Normalization (cLN)"""

    def __init__(self, channel_size):
        super(ChannelwiseLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.reset_parameters()

    def reset_parameters(self):
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        """
        Args:
            y: [M, N, K], M is batch size, N is channel size, K is length
        Returns:
            cLN_y: [M, N, K]
        """
        mean = torch.mean(y, dim=1, keepdim=True)  # [M, 1, K]
        var = torch.var(y, dim=1, keepdim=True, unbiased=False)  # [M, 1, K]
        cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
        return cLN_y


class GlobalLayerNorm(nn.Module):
    """Global Layer Normalization (gLN)"""

    def __init__(self, channel_size):
        super(GlobalLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.reset_parameters()

    def reset_parameters(self):
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        """
        Args:
            y: [M, N, K], M is batch size, N is channel size, K is length
        Returns:
            gLN_y: [M, N, K]
        """
        # TODO: since torch 1.0, torch.mean() supports a list of dims
        mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
        var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
        gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
        return gLN_y


if __name__ == "__main__":
    torch.manual_seed(123)
    M, N, L, T = 2, 3, 4, 12
    K = 2 * T // L - 1
    B, H, P, X, R, C, norm_type, causal = 2, 3, 3, 3, 2, 2, "gLN", False
    # conv1d expects a float input of shape [M, audio_channels, T]
    mixture = torch.randint(3, (M, 1, T)).float()
    # test Encoder
    encoder = Encoder(L, N, audio_channels=1)
    encoder.conv1d_U.weight.data = torch.randint(2, encoder.conv1d_U.weight.size()).float()
    mixture_w = encoder(mixture)
    print("mixture", mixture)
    print("U", encoder.conv1d_U.weight)
    print("mixture_w", mixture_w)
    print("mixture_w size", mixture_w.size())

    # test TemporalConvNet
    separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type=norm_type, causal=causal)
    est_mask = separator(mixture_w)
    print("est_mask", est_mask)

    # test Decoder
    decoder = Decoder(N, L, audio_channels=1)
    est_mask = torch.randint(2, (M, C, N, K)).float()  # [M, C, N, K]
    est_source = decoder(mixture_w, est_mask)
    print("est_source", est_source)

    # test Conv-TasNet (this version derives C from the list of sources)
    conv_tasnet = ConvTasNet(["source%d" % i for i in range(C)], N=N, L=L, B=B, H=H, P=P, X=X, R=R, audio_channels=1, norm_type=norm_type)
    est_source = conv_tasnet(mixture)
    print("est_source", est_source)
    print("est_source size", est_source.size())
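
Relative to `tasnet.py` above, this constructor takes an explicit list of `sources` and derives `C = len(sources)`, and it records `samplerate` and `segment_length` metadata on the module. A minimal sketch (the stem names are illustrative):

    model = ConvTasNet(sources=["drums", "bass", "other", "vocals"], audio_channels=2)
    assert model.C == 4  # one estimated stem per named source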
audio_separator/separator/uvr_lib_v5/demucs/transformer.py
ADDED
@@ -0,0 +1,675 @@
# Copyright (c) 2019-present, Meta, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# First author is Simon Rouard.

import random
import typing as tp

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from einops import rearrange


def create_sin_embedding(length: int, dim: int, shift: int = 0, device="cpu", max_period=10000):
    # We aim for TBC format
    assert dim % 2 == 0
    pos = shift + torch.arange(length, device=device).view(-1, 1, 1)
    half_dim = dim // 2
    adim = torch.arange(dim // 2, device=device).view(1, 1, -1)
    phase = pos / (max_period ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)

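# An illustrative sketch (values chosen here): create_sin_embedding(length=100, dim=512)
# returns a tensor of shape (100, 1, 512) in TBC layout; the singleton batch axis
# broadcasts over B when the table is added to (T, B, C) activations.
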
def create_2d_sin_embedding(d_model, height, width, device="cpu", max_period=10000):
    """
    :param d_model: dimension of the model
    :param height: height of the positions
    :param width: width of the positions
    :return: d_model*height*width position matrix
    """
    if d_model % 4 != 0:
        raise ValueError("Cannot use sin/cos positional encoding with odd dimension (got dim={:d})".format(d_model))
    pe = torch.zeros(d_model, height, width)
    # Each dimension uses half of d_model
    d_model = int(d_model / 2)
    div_term = torch.exp(torch.arange(0.0, d_model, 2) * -(math.log(max_period) / d_model))
    pos_w = torch.arange(0.0, width).unsqueeze(1)
    pos_h = torch.arange(0.0, height).unsqueeze(1)
    pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
    pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
    pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
    pe[d_model + 1 :: 2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)

    return pe[None, :].to(device)


def create_sin_embedding_cape(
    length: int,
    dim: int,
    batch_size: int,
    mean_normalize: bool,
    augment: bool,  # True during training
    max_global_shift: float = 0.0,  # delta max
    max_local_shift: float = 0.0,  # epsilon max
    max_scale: float = 1.0,
    device: str = "cpu",
    max_period: float = 10000.0,
):
    # We aim for TBC format
    assert dim % 2 == 0
    pos = 1.0 * torch.arange(length).view(-1, 1, 1)  # (length, 1, 1)
    pos = pos.repeat(1, batch_size, 1)  # (length, batch_size, 1)
    if mean_normalize:
        pos -= torch.nanmean(pos, dim=0, keepdim=True)

    if augment:
        delta = np.random.uniform(-max_global_shift, +max_global_shift, size=[1, batch_size, 1])
        delta_local = np.random.uniform(-max_local_shift, +max_local_shift, size=[length, batch_size, 1])
        log_lambdas = np.random.uniform(-np.log(max_scale), +np.log(max_scale), size=[1, batch_size, 1])
        pos = (pos + delta + delta_local) * np.exp(log_lambdas)

    pos = pos.to(device)

    half_dim = dim // 2
    adim = torch.arange(dim // 2, device=device).view(1, 1, -1)
    phase = pos / (max_period ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1).float()


def get_causal_mask(length):
    pos = torch.arange(length)
    return pos > pos[:, None]

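# An illustrative sketch: get_causal_mask(4) marks future positions with True
# (i.e. masked), giving
#     tensor([[False,  True,  True,  True],
#             [False, False,  True,  True],
#             [False, False, False,  True],
#             [False, False, False, False]])
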
def get_elementary_mask(T1, T2, mask_type, sparse_attn_window, global_window, mask_random_seed, sparsity, device):
    """
    When the input of the decoder has length T1 and the output length T2,
    the mask matrix has shape (T2, T1).
    """
    assert mask_type in ["diag", "jmask", "random", "global"]

    if mask_type == "global":
        mask = torch.zeros(T2, T1, dtype=torch.bool)
        mask[:, :global_window] = True
        line_window = int(global_window * T2 / T1)
        mask[:line_window, :] = True

    if mask_type == "diag":
        mask = torch.zeros(T2, T1, dtype=torch.bool)
        rows = torch.arange(T2)[:, None]
        cols = (T1 / T2 * rows + torch.arange(-sparse_attn_window, sparse_attn_window + 1)).long().clamp(0, T1 - 1)
        mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols))

    elif mask_type == "jmask":
        mask = torch.zeros(T2 + 2, T1 + 2, dtype=torch.bool)
        rows = torch.arange(T2 + 2)[:, None]
        t = torch.arange(0, int((2 * T1) ** 0.5 + 1))
        t = (t * (t + 1) / 2).int()
        t = torch.cat([-t.flip(0)[:-1], t])
        cols = (T1 / T2 * rows + t).long().clamp(0, T1 + 1)
        mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols))
        mask = mask[1:-1, 1:-1]

    elif mask_type == "random":
        gene = torch.Generator(device=device)
        gene.manual_seed(mask_random_seed)
        mask = torch.rand(T1 * T2, generator=gene, device=device).reshape(T2, T1) > sparsity

    mask = mask.to(device)
    return mask


def get_mask(T1, T2, mask_type, sparse_attn_window, global_window, mask_random_seed, sparsity, device):
    """
    Return a SparseCSRTensor mask that is a combination of elementary masks.
    mask_type can be a combination of multiple masks: for instance "diag_jmask_random"
    """
    from xformers.sparse import SparseCSRTensor

    # create a list
    mask_types = mask_type.split("_")

    all_masks = [get_elementary_mask(T1, T2, mask, sparse_attn_window, global_window, mask_random_seed, sparsity, device) for mask in mask_types]

    final_mask = torch.stack(all_masks).sum(axis=0) > 0

    return SparseCSRTensor.from_dense(final_mask[None])


class ScaledEmbedding(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, scale: float = 1.0, boost: float = 3.0):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight.data *= scale / boost
        self.boost = boost

    @property
    def weight(self):
        return self.embedding.weight * self.boost

    def forward(self, x):
        return self.embedding(x) * self.boost


class LayerScale(nn.Module):
    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales the residual outputs diagonally, close to 0 initially, then learnt.
    """

    def __init__(self, channels: int, init: float = 0, channel_last=False):
        """
        channel_last = False corresponds to (B, C, T) tensors
        channel_last = True corresponds to (T, B, C) tensors
        """
        super().__init__()
        self.channel_last = channel_last
        self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True))
        self.scale.data[:] = init

    def forward(self, x):
        if self.channel_last:
            return self.scale * x
        else:
            return self.scale[:, None] * x


class MyGroupNorm(nn.GroupNorm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        """
        x: (B, T, C)
        if num_groups=1: normalisation over all T and C together for each B
        """
        x = x.transpose(1, 2)
        return super().forward(x).transpose(1, 2)


class MyTransformerEncoderLayer(nn.TransformerEncoderLayer):
    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation=F.relu,
        group_norm=0,
        norm_first=False,
        norm_out=False,
        layer_norm_eps=1e-5,
        layer_scale=False,
        init_values=1e-4,
        device=None,
        dtype=None,
        sparse=False,
        mask_type="diag",
        mask_random_seed=42,
        sparse_attn_window=500,
        global_window=50,
        auto_sparsity=False,
        sparsity=0.95,
        batch_first=False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            layer_norm_eps=layer_norm_eps,
            batch_first=batch_first,
            norm_first=norm_first,
            device=device,
            dtype=dtype,
        )
        self.sparse = sparse
        self.auto_sparsity = auto_sparsity
        if sparse:
            if not auto_sparsity:
                self.mask_type = mask_type
                self.sparse_attn_window = sparse_attn_window
                self.global_window = global_window
            self.sparsity = sparsity
        if group_norm:
            self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)

        self.norm_out = None
        if self.norm_first & norm_out:
            self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model)
        self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
        self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()

        if sparse:
            self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, auto_sparsity=sparsity if auto_sparsity else 0)
            self.__setattr__("src_mask", torch.zeros(1, 1))
            self.mask_random_seed = mask_random_seed

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """
        if batch_first = False, src shape is (T, B, C);
        the case where batch_first = True is not covered
        """
        device = src.device
        x = src
        T, B, C = x.shape
        if self.sparse and not self.auto_sparsity:
            assert src_mask is None
            src_mask = self.src_mask
            if src_mask.shape[-1] != T:
                src_mask = get_mask(T, T, self.mask_type, self.sparse_attn_window, self.global_window, self.mask_random_seed, self.sparsity, device)
                self.__setattr__("src_mask", src_mask)

        if self.norm_first:
            x = x + self.gamma_1(self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
            x = x + self.gamma_2(self._ff_block(self.norm2(x)))

            if self.norm_out:
                x = self.norm_out(x)
        else:
            x = self.norm1(x + self.gamma_1(self._sa_block(x, src_mask, src_key_padding_mask)))
            x = self.norm2(x + self.gamma_2(self._ff_block(x)))

        return x


class CrossTransformerEncoderLayer(nn.Module):
    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation=F.relu,
        layer_norm_eps: float = 1e-5,
        layer_scale: bool = False,
        init_values: float = 1e-4,
        norm_first: bool = False,
        group_norm: bool = False,
        norm_out: bool = False,
        sparse=False,
        mask_type="diag",
        mask_random_seed=42,
        sparse_attn_window=500,
        global_window=50,
        sparsity=0.95,
        auto_sparsity=None,
        device=None,
        dtype=None,
        batch_first=False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        self.sparse = sparse
        self.auto_sparsity = auto_sparsity
        if sparse:
            if not auto_sparsity:
                self.mask_type = mask_type
                self.sparse_attn_window = sparse_attn_window
                self.global_window = global_window
            self.sparsity = sparsity

        self.cross_attn: nn.Module
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)
        # Implementation of the feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1: nn.Module
        self.norm2: nn.Module
        self.norm3: nn.Module
        if group_norm:
            self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm3 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs)
        else:
            self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
            self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)

        self.norm_out = None
        if self.norm_first & norm_out:
            self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model)

        self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()
        self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity()

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Legacy string support for the activation function.
        if isinstance(activation, str):
            self.activation = self._get_activation_fn(activation)
        else:
            self.activation = activation

        if sparse:
            self.cross_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, auto_sparsity=sparsity if auto_sparsity else 0)
            if not auto_sparsity:
                self.__setattr__("mask", torch.zeros(1, 1))
                self.mask_random_seed = mask_random_seed

    def forward(self, q, k, mask=None):
        """
        Args:
            q: tensor of shape (T, B, C)
            k: tensor of shape (S, B, C)
            mask: tensor of shape (T, S)
        """
        device = q.device
        T, B, C = q.shape
        S, B, C = k.shape
        if self.sparse and not self.auto_sparsity:
            assert mask is None
            mask = self.mask
            if mask.shape[-1] != S or mask.shape[-2] != T:
                mask = get_mask(S, T, self.mask_type, self.sparse_attn_window, self.global_window, self.mask_random_seed, self.sparsity, device)
                self.__setattr__("mask", mask)

        if self.norm_first:
            x = q + self.gamma_1(self._ca_block(self.norm1(q), self.norm2(k), mask))
            x = x + self.gamma_2(self._ff_block(self.norm3(x)))
            if self.norm_out:
                x = self.norm_out(x)
        else:
            x = self.norm1(q + self.gamma_1(self._ca_block(q, k, mask)))
            x = self.norm2(x + self.gamma_2(self._ff_block(x)))

        return x

    # cross-attention block
    def _ca_block(self, q, k, attn_mask=None):
        x = self.cross_attn(q, k, k, attn_mask=attn_mask, need_weights=False)[0]
        return self.dropout1(x)

    # feed-forward block
    def _ff_block(self, x):
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)

    def _get_activation_fn(self, activation):
        if activation == "relu":
            return F.relu
        elif activation == "gelu":
            return F.gelu

        raise RuntimeError("activation should be relu/gelu, not {}".format(activation))


# ----------------- MULTI-BLOCKS MODELS: -----------------------


class CrossTransformerEncoder(nn.Module):
    def __init__(
        self,
        dim: int,
        emb: str = "sin",
        hidden_scale: float = 4.0,
        num_heads: int = 8,
        num_layers: int = 6,
        cross_first: bool = False,
        dropout: float = 0.0,
        max_positions: int = 1000,
        norm_in: bool = True,
        norm_in_group: bool = False,
        group_norm: int = False,
        norm_first: bool = False,
        norm_out: bool = False,
        max_period: float = 10000.0,
        weight_decay: float = 0.0,
        lr: tp.Optional[float] = None,
        layer_scale: bool = False,
        gelu: bool = True,
        sin_random_shift: int = 0,
        weight_pos_embed: float = 1.0,
        cape_mean_normalize: bool = True,
        cape_augment: bool = True,
        cape_glob_loc_scale: list = [5000.0, 1.0, 1.4],
        sparse_self_attn: bool = False,
        sparse_cross_attn: bool = False,
        mask_type: str = "diag",
        mask_random_seed: int = 42,
        sparse_attn_window: int = 500,
        global_window: int = 50,
        auto_sparsity: bool = False,
        sparsity: float = 0.95,
    ):
        super().__init__()
        assert dim % num_heads == 0

        hidden_dim = int(dim * hidden_scale)

        self.num_layers = num_layers
        # classic_parity = 1 means that if idx % 2 == 1 there is a
        # classical encoder, else there is a cross encoder
        self.classic_parity = 1 if cross_first else 0
        self.emb = emb
        self.max_period = max_period
        self.weight_decay = weight_decay
        self.weight_pos_embed = weight_pos_embed
        self.sin_random_shift = sin_random_shift
        if emb == "cape":
            self.cape_mean_normalize = cape_mean_normalize
            self.cape_augment = cape_augment
            self.cape_glob_loc_scale = cape_glob_loc_scale
        if emb == "scaled":
            self.position_embeddings = ScaledEmbedding(max_positions, dim, scale=0.2)

        self.lr = lr

        activation: tp.Any = F.gelu if gelu else F.relu

        self.norm_in: nn.Module
        self.norm_in_t: nn.Module
        if norm_in:
            self.norm_in = nn.LayerNorm(dim)
            self.norm_in_t = nn.LayerNorm(dim)
        elif norm_in_group:
            self.norm_in = MyGroupNorm(int(norm_in_group), dim)
            self.norm_in_t = MyGroupNorm(int(norm_in_group), dim)
        else:
            self.norm_in = nn.Identity()
            self.norm_in_t = nn.Identity()

        # spectrogram layers
        self.layers = nn.ModuleList()
        # temporal layers
        self.layers_t = nn.ModuleList()

        kwargs_common = {
            "d_model": dim,
            "nhead": num_heads,
            "dim_feedforward": hidden_dim,
            "dropout": dropout,
            "activation": activation,
            "group_norm": group_norm,
            "norm_first": norm_first,
            "norm_out": norm_out,
            "layer_scale": layer_scale,
            "mask_type": mask_type,
            "mask_random_seed": mask_random_seed,
            "sparse_attn_window": sparse_attn_window,
            "global_window": global_window,
            "sparsity": sparsity,
            "auto_sparsity": auto_sparsity,
            "batch_first": True,
        }

        kwargs_classic_encoder = dict(kwargs_common)
        kwargs_classic_encoder.update({"sparse": sparse_self_attn})
        kwargs_cross_encoder = dict(kwargs_common)
        kwargs_cross_encoder.update({"sparse": sparse_cross_attn})
|
517 |
+
|
518 |
+
for idx in range(num_layers):
|
519 |
+
if idx % 2 == self.classic_parity:
|
520 |
+
|
521 |
+
self.layers.append(MyTransformerEncoderLayer(**kwargs_classic_encoder))
|
522 |
+
self.layers_t.append(MyTransformerEncoderLayer(**kwargs_classic_encoder))
|
523 |
+
|
524 |
+
else:
|
525 |
+
self.layers.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder))
|
526 |
+
|
527 |
+
self.layers_t.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder))
|
528 |
+
|
529 |
+
def forward(self, x, xt):
|
530 |
+
B, C, Fr, T1 = x.shape
|
531 |
+
pos_emb_2d = create_2d_sin_embedding(C, Fr, T1, x.device, self.max_period) # (1, C, Fr, T1)
|
532 |
+
pos_emb_2d = rearrange(pos_emb_2d, "b c fr t1 -> b (t1 fr) c")
|
533 |
+
x = rearrange(x, "b c fr t1 -> b (t1 fr) c")
|
534 |
+
x = self.norm_in(x)
|
535 |
+
x = x + self.weight_pos_embed * pos_emb_2d
|
536 |
+
|
537 |
+
B, C, T2 = xt.shape
|
538 |
+
xt = rearrange(xt, "b c t2 -> b t2 c") # now T2, B, C
|
539 |
+
pos_emb = self._get_pos_embedding(T2, B, C, x.device)
|
540 |
+
pos_emb = rearrange(pos_emb, "t2 b c -> b t2 c")
|
541 |
+
xt = self.norm_in_t(xt)
|
542 |
+
xt = xt + self.weight_pos_embed * pos_emb
|
543 |
+
|
544 |
+
for idx in range(self.num_layers):
|
545 |
+
if idx % 2 == self.classic_parity:
|
546 |
+
x = self.layers[idx](x)
|
547 |
+
xt = self.layers_t[idx](xt)
|
548 |
+
else:
|
549 |
+
old_x = x
|
550 |
+
x = self.layers[idx](x, xt)
|
551 |
+
xt = self.layers_t[idx](xt, old_x)
|
552 |
+
|
553 |
+
x = rearrange(x, "b (t1 fr) c -> b c fr t1", t1=T1)
|
554 |
+
xt = rearrange(xt, "b t2 c -> b c t2")
|
555 |
+
return x, xt
|
556 |
+
|
557 |
+
def _get_pos_embedding(self, T, B, C, device):
|
558 |
+
if self.emb == "sin":
|
559 |
+
shift = random.randrange(self.sin_random_shift + 1)
|
560 |
+
pos_emb = create_sin_embedding(T, C, shift=shift, device=device, max_period=self.max_period)
|
561 |
+
elif self.emb == "cape":
|
562 |
+
if self.training:
|
563 |
+
pos_emb = create_sin_embedding_cape(
|
564 |
+
T,
|
565 |
+
C,
|
566 |
+
B,
|
567 |
+
device=device,
|
568 |
+
max_period=self.max_period,
|
569 |
+
mean_normalize=self.cape_mean_normalize,
|
570 |
+
augment=self.cape_augment,
|
571 |
+
max_global_shift=self.cape_glob_loc_scale[0],
|
572 |
+
max_local_shift=self.cape_glob_loc_scale[1],
|
573 |
+
max_scale=self.cape_glob_loc_scale[2],
|
574 |
+
)
|
575 |
+
else:
|
576 |
+
pos_emb = create_sin_embedding_cape(T, C, B, device=device, max_period=self.max_period, mean_normalize=self.cape_mean_normalize, augment=False)
|
577 |
+
|
578 |
+
elif self.emb == "scaled":
|
579 |
+
pos = torch.arange(T, device=device)
|
580 |
+
pos_emb = self.position_embeddings(pos)[:, None]
|
581 |
+
|
582 |
+
return pos_emb
|
583 |
+
|
584 |
+
def make_optim_group(self):
|
585 |
+
group = {"params": list(self.parameters()), "weight_decay": self.weight_decay}
|
586 |
+
if self.lr is not None:
|
587 |
+
group["lr"] = self.lr
|
588 |
+
return group
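

def _cross_transformer_shape_demo():
    # Minimal usage sketch, not part of the upstream module and never called
    # here: the hyperparameters are arbitrary, chosen only to exercise the
    # shape contract of `CrossTransformerEncoder.forward`. The spectral
    # branch takes (B, C, Fr, T1), the temporal branch (B, C, T2), and both
    # come back with their input shapes preserved.
    enc = CrossTransformerEncoder(dim=64, num_heads=4, num_layers=2)
    z = torch.randn(1, 64, 8, 10)   # spectrogram features (B, C, Fr, T1)
    zt = torch.randn(1, 64, 80)     # waveform features (B, C, T2)
    z_out, zt_out = enc(z, zt)
    assert z_out.shape == z.shape and zt_out.shape == zt.shape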


# Attention Modules


class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, batch_first=False, auto_sparsity=None):
        super().__init__()
        assert auto_sparsity is not None, "sanity check"
        self.num_heads = num_heads
        self.q = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
        self.k = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v = torch.nn.Linear(embed_dim, embed_dim, bias=bias)
        self.attn_drop = torch.nn.Dropout(dropout)
        self.proj = torch.nn.Linear(embed_dim, embed_dim, bias)
        self.proj_drop = torch.nn.Dropout(dropout)
        self.batch_first = batch_first
        self.auto_sparsity = auto_sparsity

    def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None, average_attn_weights=True):

        if not self.batch_first:  # N, B, C
            query = query.permute(1, 0, 2)  # B, N_q, C
            key = key.permute(1, 0, 2)  # B, N_k, C
            value = value.permute(1, 0, 2)  # B, N_k, C
        B, N_q, C = query.shape
        B, N_k, C = key.shape

        q = self.q(query).reshape(B, N_q, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        q = q.flatten(0, 1)
        k = self.k(key).reshape(B, N_k, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        k = k.flatten(0, 1)
        v = self.v(value).reshape(B, N_k, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        v = v.flatten(0, 1)

        if self.auto_sparsity:
            assert attn_mask is None
            x = dynamic_sparse_attention(q, k, v, sparsity=self.auto_sparsity)
        else:
            x = scaled_dot_product_attention(q, k, v, attn_mask, dropout=self.attn_drop)
        x = x.reshape(B, self.num_heads, N_q, C // self.num_heads)

        x = x.transpose(1, 2).reshape(B, N_q, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        if not self.batch_first:
            x = x.permute(1, 0, 2)
        return x, None


def scaled_query_key_softmax(q, k, att_mask):
    from xformers.ops import masked_matmul

    q = q / (k.size(-1)) ** 0.5
    att = masked_matmul(q, k.transpose(-2, -1), att_mask)
    att = torch.nn.functional.softmax(att, -1)
    return att


def scaled_dot_product_attention(q, k, v, att_mask, dropout):
    att = scaled_query_key_softmax(q, k, att_mask=att_mask)
    att = dropout(att)
    y = att @ v
    return y
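

def _dense_attention_reference(q, k, v):
    # Torch-only reference sketch, not part of the upstream module and never
    # called here: what `scaled_dot_product_attention` above computes when
    # `att_mask` is None and dropout is the identity, without the xformers
    # `masked_matmul` dependency.
    att = torch.softmax((q / k.size(-1) ** 0.5) @ k.transpose(-2, -1), dim=-1)
    return att @ v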


def _compute_buckets(x, R):
    qq = torch.einsum("btf,bfhi->bhti", x, R)
    qq = torch.cat([qq, -qq], dim=-1)
    buckets = qq.argmax(dim=-1)

    return buckets.permute(0, 2, 1).byte().contiguous()


def dynamic_sparse_attention(query, key, value, sparsity, infer_sparsity=True, attn_bias=None):
    # assert False, "The code for the custom sparse kernel is not ready for release yet."
    from xformers.ops import find_locations, sparse_memory_efficient_attention

    n_hashes = 32
    proj_size = 4
    query, key, value = [x.contiguous() for x in [query, key, value]]
    with torch.no_grad():
        R = torch.randn(1, query.shape[-1], n_hashes, proj_size // 2, device=query.device)
        bucket_query = _compute_buckets(query, R)
        bucket_key = _compute_buckets(key, R)
    row_offsets, column_indices = find_locations(bucket_query, bucket_key, sparsity, infer_sparsity)
    return sparse_memory_efficient_attention(query, key, value, row_offsets, column_indices, attn_bias)
audio_separator/separator/uvr_lib_v5/demucs/utils.py
ADDED
@@ -0,0 +1,496 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from collections import defaultdict
from contextlib import contextmanager
import math
import os
import tempfile
import typing as tp

import errno
import functools
import hashlib
import inspect
import io
import os
import random
import socket
import tempfile
import warnings
import zlib

from diffq import UniformQuantizer, DiffQuantizer
import torch as th
import tqdm
from torch import distributed
from torch.nn import functional as F

import torch


def unfold(a, kernel_size, stride):
    """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.

    This will pad the input so that `F = ceil(T / stride)`.

    see https://github.com/pytorch/pytorch/issues/60466
    """
    *shape, length = a.shape
    n_frames = math.ceil(length / stride)
    tgt_length = (n_frames - 1) * stride + kernel_size
    a = F.pad(a, (0, tgt_length - length))
    strides = list(a.stride())
    assert strides[-1] == 1, "data should be contiguous"
    strides = strides[:-1] + [stride, 1]
    return a.as_strided([*shape, n_frames, kernel_size], strides)
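

def _unfold_demo():
    # Quick sanity sketch, not part of the upstream module and never called
    # here: framing a length-10 signal with kernel_size=4 and stride=2 yields
    # ceil(10 / 2) = 5 frames of 4 samples, zero-padded on the right.
    a = torch.arange(10.0)
    frames = unfold(a, kernel_size=4, stride=2)
    assert list(frames.shape) == [5, 4]
    assert frames[0].tolist() == [0.0, 1.0, 2.0, 3.0]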


def center_trim(tensor: torch.Tensor, reference: tp.Union[torch.Tensor, int]):
    """
    Center trim `tensor` with respect to `reference`, along the last dimension.
    `reference` can also be a number, representing the length to trim to.
    If the size difference != 0 mod 2, the extra sample is removed on the right side.
    """
    ref_size: int
    if isinstance(reference, torch.Tensor):
        ref_size = reference.size(-1)
    else:
        ref_size = reference
    delta = tensor.size(-1) - ref_size
    if delta < 0:
        raise ValueError("tensor must be larger than reference. " f"Delta is {delta}.")
    if delta:
        tensor = tensor[..., delta // 2 : -(delta - delta // 2)]
    return tensor


def pull_metric(history: tp.List[dict], name: str):
    out = []
    for metrics in history:
        metric = metrics
        for part in name.split("."):
            metric = metric[part]
        out.append(metric)
    return out


def EMA(beta: float = 1):
    """
    Exponential Moving Average callback.
    Returns a single function that can be called to repeatedly update the EMA
    with a dict of metrics. The callback will return
    the new averaged dict of metrics.

    Note that for `beta=1`, this is just plain averaging.
    """
    fix: tp.Dict[str, float] = defaultdict(float)
    total: tp.Dict[str, float] = defaultdict(float)

    def _update(metrics: dict, weight: float = 1) -> dict:
        nonlocal total, fix
        for key, value in metrics.items():
            total[key] = total[key] * beta + weight * float(value)
            fix[key] = fix[key] * beta + weight
        return {key: tot / fix[key] for key, tot in total.items()}

    return _update
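

def _ema_demo():
    # Usage sketch, not part of the upstream module and never called here:
    # with the default beta=1 the callback is a plain running average of
    # every metrics dict it is fed.
    update = EMA()
    update({"loss": 2.0})
    averaged = update({"loss": 4.0})
    assert averaged["loss"] == 3.0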


def sizeof_fmt(num: float, suffix: str = "B"):
    """
    Given `num` bytes, return human readable size.
    Taken from https://stackoverflow.com/a/1094933
    """
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "Yi", suffix)


@contextmanager
def temp_filenames(count: int, delete=True):
    names = []
    try:
        for _ in range(count):
            names.append(tempfile.NamedTemporaryFile(delete=False).name)
        yield names
    finally:
        if delete:
            for name in names:
                os.unlink(name)


def average_metric(metric, count=1.0):
    """
    Average `metric` which should be a float across all hosts. `count` should be
    the weight for this particular host (i.e. number of examples).
    """
    metric = th.tensor([count, count * metric], dtype=th.float32, device="cuda")
    distributed.all_reduce(metric, op=distributed.ReduceOp.SUM)
    return metric[1].item() / metric[0].item()


def free_port(host="", low=20000, high=40000):
    """
    Return a port number that is most likely free.
    This could suffer from a race condition although
    it should be quite rare.
    """
    sock = socket.socket()
    while True:
        port = random.randint(low, high)
        try:
            sock.bind((host, port))
        except OSError as error:
            if error.errno == errno.EADDRINUSE:
                continue
            raise
        return port


def sizeof_fmt(num, suffix="B"):
    """
    Given `num` bytes, return human readable size.
    Taken from https://stackoverflow.com/a/1094933
    """
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "Yi", suffix)


def human_seconds(seconds, display=".2f"):
    """
    Given `seconds` seconds, return human readable duration.
    """
    value = seconds * 1e6
    ratios = [1e3, 1e3, 60, 60, 24]
    names = ["us", "ms", "s", "min", "hrs", "days"]
    last = names.pop(0)
    for name, ratio in zip(names, ratios):
        if value / ratio < 0.3:
            break
        value /= ratio
        last = name
    return f"{format(value, display)} {last}"


class TensorChunk:
    def __init__(self, tensor, offset=0, length=None):
        total_length = tensor.shape[-1]
        assert offset >= 0
        assert offset < total_length

        if length is None:
            length = total_length - offset
        else:
            length = min(total_length - offset, length)

        self.tensor = tensor
        self.offset = offset
        self.length = length
        self.device = tensor.device

    @property
    def shape(self):
        shape = list(self.tensor.shape)
        shape[-1] = self.length
        return shape

    def padded(self, target_length):
        delta = target_length - self.length
        total_length = self.tensor.shape[-1]
        assert delta >= 0

        start = self.offset - delta // 2
        end = start + target_length

        correct_start = max(0, start)
        correct_end = min(total_length, end)

        pad_left = correct_start - start
        pad_right = end - correct_end

        out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right))
        assert out.shape[-1] == target_length
        return out


def tensor_chunk(tensor_or_chunk):
    if isinstance(tensor_or_chunk, TensorChunk):
        return tensor_or_chunk
    else:
        assert isinstance(tensor_or_chunk, th.Tensor)
        return TensorChunk(tensor_or_chunk)
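

def _tensor_chunk_demo():
    # Shape sketch, not part of the upstream module and never called here:
    # `padded` centers the chunk inside the requested length and zero-pads
    # whenever the underlying tensor runs out of samples on either side.
    chunk = TensorChunk(th.ones(2, 8), offset=0, length=4)
    padded = chunk.padded(6)        # needs one sample left of offset 0
    assert padded.shape[-1] == 6
    assert padded[:, 0].sum() == 0  # left edge was zero-padded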


def apply_model_v1(model, mix, shifts=None, split=False, progress=False, set_progress_bar=None):
    """
    Apply model to a given mixture.

    Args:
        shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec
            and apply the opposite shift to the output. This is repeated `shifts` times and
            all predictions are averaged. This effectively makes the model time equivariant
            and improves SDR by up to 0.2 points.
        split (bool): if True, the input will be broken down into 8 second extracts
            and predictions will be performed individually on each and concatenated.
            Useful for models with a large memory footprint like Tasnet.
        progress (bool): if True, show a progress bar (requires split=True)
    """

    channels, length = mix.size()
    device = mix.device
    progress_value = 0

    if split:
        out = th.zeros(4, channels, length, device=device)
        shift = model.samplerate * 10
        offsets = range(0, length, shift)
        scale = 10
        if progress:
            offsets = tqdm.tqdm(offsets, unit_scale=scale, ncols=120, unit="seconds")
        for offset in offsets:
            chunk = mix[..., offset : offset + shift]
            if set_progress_bar:
                progress_value += 1
                set_progress_bar(0.1, (0.8 / len(offsets) * progress_value))
                chunk_out = apply_model_v1(model, chunk, shifts=shifts, set_progress_bar=set_progress_bar)
            else:
                chunk_out = apply_model_v1(model, chunk, shifts=shifts)
            out[..., offset : offset + shift] = chunk_out
            offset += shift
        return out
    elif shifts:
        max_shift = int(model.samplerate / 2)
        mix = F.pad(mix, (max_shift, max_shift))
        offsets = list(range(max_shift))
        random.shuffle(offsets)
        out = 0
        for offset in offsets[:shifts]:
            shifted = mix[..., offset : offset + length + max_shift]
            if set_progress_bar:
                shifted_out = apply_model_v1(model, shifted, set_progress_bar=set_progress_bar)
            else:
                shifted_out = apply_model_v1(model, shifted)
            out += shifted_out[..., max_shift - offset : max_shift - offset + length]
        out /= shifts
        return out
    else:
        valid_length = model.valid_length(length)
        delta = valid_length - length
        padded = F.pad(mix, (delta // 2, delta - delta // 2))
        with th.no_grad():
            out = model(padded.unsqueeze(0))[0]
        return center_trim(out, mix)


def apply_model_v2(model, mix, shifts=None, split=False, overlap=0.25, transition_power=1.0, progress=False, set_progress_bar=None):
    """
    Apply model to a given mixture.

    Args:
        shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec
            and apply the opposite shift to the output. This is repeated `shifts` times and
            all predictions are averaged. This effectively makes the model time equivariant
            and improves SDR by up to 0.2 points.
        split (bool): if True, the input will be broken down into 8 second extracts
            and predictions will be performed individually on each and concatenated.
            Useful for models with a large memory footprint like Tasnet.
        progress (bool): if True, show a progress bar (requires split=True)
    """

    assert transition_power >= 1, "transition_power < 1 leads to weird behavior."
    device = mix.device
    channels, length = mix.shape
    progress_value = 0

    if split:
        out = th.zeros(len(model.sources), channels, length, device=device)
        sum_weight = th.zeros(length, device=device)
        segment = model.segment_length
        stride = int((1 - overlap) * segment)
        offsets = range(0, length, stride)
        scale = stride / model.samplerate
        if progress:
            offsets = tqdm.tqdm(offsets, unit_scale=scale, ncols=120, unit="seconds")
        # We start from a triangle shaped weight, with maximal weight in the middle
        # of the segment. Then we normalize and take to the power `transition_power`.
        # Large values of transition power will lead to sharper transitions.
        weight = th.cat([th.arange(1, segment // 2 + 1), th.arange(segment - segment // 2, 0, -1)]).to(device)
        assert len(weight) == segment
        # If the overlap < 50%, this will translate to linear transition when
        # transition_power is 1.
        weight = (weight / weight.max()) ** transition_power
        for offset in offsets:
            chunk = TensorChunk(mix, offset, segment)
            if set_progress_bar:
                progress_value += 1
                set_progress_bar(0.1, (0.8 / len(offsets) * progress_value))
                chunk_out = apply_model_v2(model, chunk, shifts=shifts, set_progress_bar=set_progress_bar)
            else:
                chunk_out = apply_model_v2(model, chunk, shifts=shifts)
            chunk_length = chunk_out.shape[-1]
            out[..., offset : offset + segment] += weight[:chunk_length] * chunk_out
            sum_weight[offset : offset + segment] += weight[:chunk_length]
            offset += segment
        assert sum_weight.min() > 0
        out /= sum_weight
        return out
    elif shifts:
        max_shift = int(0.5 * model.samplerate)
        mix = tensor_chunk(mix)
        padded_mix = mix.padded(length + 2 * max_shift)
        out = 0
        for _ in range(shifts):
            offset = random.randint(0, max_shift)
            shifted = TensorChunk(padded_mix, offset, length + max_shift - offset)

            if set_progress_bar:
                progress_value += 1
                shifted_out = apply_model_v2(model, shifted, set_progress_bar=set_progress_bar)
            else:
                shifted_out = apply_model_v2(model, shifted)
            out += shifted_out[..., max_shift - offset :]
        out /= shifts
        return out
    else:
        valid_length = model.valid_length(length)
        mix = tensor_chunk(mix)
        padded_mix = mix.padded(valid_length)
        with th.no_grad():
            out = model(padded_mix.unsqueeze(0))[0]
        return center_trim(out, length)
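

def _transition_weight(segment, transition_power=1.0):
    # Sketch of the cross-fade weight used in the `split` branch above, not
    # part of the upstream module and never called here: a triangle over the
    # segment, normalized, then raised to `transition_power`; power 1 keeps
    # the linear cross-fade.
    weight = th.cat([th.arange(1, segment // 2 + 1), th.arange(segment - segment // 2, 0, -1)])
    return (weight / weight.max()) ** transition_power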


@contextmanager
def temp_filenames(count, delete=True):
    names = []
    try:
        for _ in range(count):
            names.append(tempfile.NamedTemporaryFile(delete=False).name)
        yield names
    finally:
        if delete:
            for name in names:
                os.unlink(name)


def get_quantizer(model, args, optimizer=None):
    quantizer = None
    if args.diffq:
        quantizer = DiffQuantizer(model, min_size=args.q_min_size, group_size=8)
        if optimizer is not None:
            quantizer.setup_optimizer(optimizer)
    elif args.qat:
        quantizer = UniformQuantizer(model, bits=args.qat, min_size=args.q_min_size)
    return quantizer


def load_model(path, strict=False):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        load_from = path
        package = th.load(load_from, "cpu")

    klass = package["klass"]
    args = package["args"]
    kwargs = package["kwargs"]

    if strict:
        model = klass(*args, **kwargs)
    else:
        sig = inspect.signature(klass)
        for key in list(kwargs):
            if key not in sig.parameters:
                warnings.warn("Dropping nonexistent parameter " + key)
                del kwargs[key]
        model = klass(*args, **kwargs)

    state = package["state"]
    training_args = package["training_args"]
    quantizer = get_quantizer(model, training_args)

    set_state(model, quantizer, state)
    return model


def get_state(model, quantizer):
    if quantizer is None:
        state = {k: p.data.to("cpu") for k, p in model.state_dict().items()}
    else:
        state = quantizer.get_quantized_state()
        buf = io.BytesIO()
        th.save(state, buf)
        state = {"compressed": zlib.compress(buf.getvalue())}
    return state


def set_state(model, quantizer, state):
    if quantizer is None:
        model.load_state_dict(state)
    else:
        buf = io.BytesIO(zlib.decompress(state["compressed"]))
        state = th.load(buf, "cpu")
        quantizer.restore_quantized_state(state)

    return state


def save_state(state, path):
    buf = io.BytesIO()
    th.save(state, buf)
    sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8]

    path = path.parent / (path.stem + "-" + sig + path.suffix)
    path.write_bytes(buf.getvalue())


def save_model(model, quantizer, training_args, path):
    args, kwargs = model._init_args_kwargs
    klass = model.__class__

    state = get_state(model, quantizer)

    save_to = path
    package = {"klass": klass, "args": args, "kwargs": kwargs, "state": state, "training_args": training_args}
    th.save(package, save_to)


def capture_init(init):
    @functools.wraps(init)
    def __init__(self, *args, **kwargs):
        self._init_args_kwargs = (args, kwargs)
        init(self, *args, **kwargs)

    return __init__


class DummyPoolExecutor:
    class DummyResult:
        def __init__(self, func, *args, **kwargs):
            self.func = func
            self.args = args
            self.kwargs = kwargs

        def result(self):
            return self.func(*self.args, **self.kwargs)

    def __init__(self, workers=0):
        pass

    def submit(self, func, *args, **kwargs):
        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        return
audio_separator/separator/uvr_lib_v5/mdxnet.py
ADDED
@@ -0,0 +1,136 @@
import torch
import torch.nn as nn
from .modules import TFC_TDF
from pytorch_lightning import LightningModule

dim_s = 4


class AbstractMDXNet(LightningModule):
    def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap):
        super().__init__()
        self.target_name = target_name
        self.lr = lr
        self.optimizer = optimizer
        self.dim_c = dim_c
        self.dim_f = dim_f
        self.dim_t = dim_t
        self.n_fft = n_fft
        self.n_bins = n_fft // 2 + 1
        self.hop_length = hop_length
        self.window = nn.Parameter(torch.hann_window(window_length=self.n_fft, periodic=True), requires_grad=False)
        self.freq_pad = nn.Parameter(torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), requires_grad=False)

    def get_optimizer(self):
        if self.optimizer == 'rmsprop':
            return torch.optim.RMSprop(self.parameters(), self.lr)

        if self.optimizer == 'adamw':
            return torch.optim.AdamW(self.parameters(), self.lr)


class ConvTDFNet(AbstractMDXNet):
    def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length,
                 num_blocks, l, g, k, bn, bias, overlap):

        super(ConvTDFNet, self).__init__(
            target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap)
        # self.save_hyperparameters()

        self.num_blocks = num_blocks
        self.l = l
        self.g = g
        self.k = k
        self.bn = bn
        self.bias = bias

        if optimizer == 'rmsprop':
            norm = nn.BatchNorm2d

        if optimizer == 'adamw':
            norm = lambda input: nn.GroupNorm(2, input)

        self.n = num_blocks // 2
        scale = (2, 2)

        self.first_conv = nn.Sequential(
            nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)),
            norm(g),
            nn.ReLU(),
        )

        f = self.dim_f
        c = g
        self.encoding_blocks = nn.ModuleList()
        self.ds = nn.ModuleList()
        for i in range(self.n):
            self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))
            self.ds.append(
                nn.Sequential(
                    nn.Conv2d(in_channels=c, out_channels=c + g, kernel_size=scale, stride=scale),
                    norm(c + g),
                    nn.ReLU()
                )
            )
            f = f // 2
            c += g

        self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)

        self.decoding_blocks = nn.ModuleList()
        self.us = nn.ModuleList()
        for i in range(self.n):
            self.us.append(
                nn.Sequential(
                    nn.ConvTranspose2d(in_channels=c, out_channels=c - g, kernel_size=scale, stride=scale),
                    norm(c - g),
                    nn.ReLU()
                )
            )
            f = f * 2
            c -= g

            self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))

        self.final_conv = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)),
        )

    def forward(self, x):

        x = self.first_conv(x)

        x = x.transpose(-1, -2)

        ds_outputs = []
        for i in range(self.n):
            x = self.encoding_blocks[i](x)
            ds_outputs.append(x)
            x = self.ds[i](x)

        x = self.bottleneck_block(x)

        for i in range(self.n):
            x = self.us[i](x)
            x *= ds_outputs[-i - 1]
            x = self.decoding_blocks[i](x)

        x = x.transpose(-1, -2)

        x = self.final_conv(x)

        return x
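

def _conv_tdf_shape_demo():
    # Shape sketch, not part of the upstream module and never called here.
    # Every hyperparameter below is arbitrary, chosen so the two (2, 2)-strided
    # encoder levels divide evenly: the U-Net keeps the (batch, dim_c, dim_f,
    # dim_t) layout of its input.
    net = ConvTDFNet(target_name='demo', lr=1e-3, optimizer='adamw',
                     dim_c=4, dim_f=16, dim_t=8, n_fft=30, hop_length=256,
                     num_blocks=4, l=2, g=8, k=3, bn=4, bias=False, overlap=0)
    x = torch.randn(1, 4, 16, 8)
    assert net(x).shape == x.shape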


class Mixer(nn.Module):
    def __init__(self, device, mixer_path):

        super(Mixer, self).__init__()

        self.linear = nn.Linear((dim_s + 1) * 2, dim_s * 2, bias=False)

        self.load_state_dict(
            torch.load(mixer_path, map_location=device)
        )

    def forward(self, x):
        x = x.reshape(1, (dim_s + 1) * 2, -1).transpose(-1, -2)
        x = self.linear(x)
        return x.transpose(-1, -2).reshape(dim_s, 2, -1)
audio_separator/separator/uvr_lib_v5/mixer.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea781bd52c6a523b825fa6cdbb6189f52e318edd8b17e6fe404f76f7af8caa9c
size 1208
audio_separator/separator/uvr_lib_v5/modules.py
ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn


class TFC(nn.Module):
    def __init__(self, c, l, k, norm):
        super(TFC, self).__init__()

        self.H = nn.ModuleList()
        for i in range(l):
            self.H.append(
                nn.Sequential(
                    nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
                    norm(c),
                    nn.ReLU(),
                )
            )

    def forward(self, x):
        for h in self.H:
            x = h(x)
        return x


class DenseTFC(nn.Module):
    def __init__(self, c, l, k, norm):
        super(DenseTFC, self).__init__()

        self.conv = nn.ModuleList()
        for i in range(l):
            self.conv.append(
                nn.Sequential(
                    nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
                    norm(c),
                    nn.ReLU(),
                )
            )

    def forward(self, x):
        for layer in self.conv[:-1]:
            x = torch.cat([layer(x), x], 1)
        return self.conv[-1](x)


class TFC_TDF(nn.Module):
    def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d):

        super(TFC_TDF, self).__init__()

        self.use_tdf = bn is not None

        self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm)

        if self.use_tdf:
            if bn == 0:
                self.tdf = nn.Sequential(
                    nn.Linear(f, f, bias=bias),
                    norm(c),
                    nn.ReLU()
                )
            else:
                self.tdf = nn.Sequential(
                    nn.Linear(f, f // bn, bias=bias),
                    norm(c),
                    nn.ReLU(),
                    nn.Linear(f // bn, f, bias=bias),
                    norm(c),
                    nn.ReLU()
                )

    def forward(self, x):
        x = self.tfc(x)
        return x + self.tdf(x) if self.use_tdf else x
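

def _tfc_tdf_demo():
    # Shape sketch, not part of the upstream module and never called here:
    # TFC_TDF preserves (batch, c, t, f) shapes, and with bn=4 the TDF branch
    # squeezes the f=16 frequency axis through a 16 -> 4 -> 16 bottleneck
    # before the residual sum.
    block = TFC_TDF(c=8, l=2, f=16, k=3, bn=4, norm=lambda c: nn.GroupNorm(2, c))
    x = torch.randn(1, 8, 10, 16)
    assert block(x).shape == x.shape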
audio_separator/separator/uvr_lib_v5/playsound.py
ADDED
@@ -0,0 +1,241 @@
import logging
logger = logging.getLogger(__name__)


class PlaysoundException(Exception):
    pass


def _canonicalizePath(path):
    """
    Support passing in a pathlib.Path-like object by converting to str.
    """
    import sys
    if sys.version_info[0] >= 3:
        return str(path)
    else:
        # On earlier Python versions, str is a byte string, so attempting to
        # convert a unicode string to str will fail. Leave it alone in this case.
        return path


def _playsoundWin(sound, block=True):
    '''
    Utilizes windll.winmm. Tested and known to work with MP3 and WAVE on
    Windows 7 with Python 2.7. Probably works with more file formats.
    Probably works on Windows XP thru Windows 10. Probably works with all
    versions of Python.

    Inspired by (but not copied from) Michael Gundlach <[email protected]>'s mp3play:
    https://github.com/michaelgundlach/mp3play

    I never would have tried using windll.winmm without seeing his code.
    '''
    sound = '"' + _canonicalizePath(sound) + '"'

    from ctypes import create_unicode_buffer, windll, wintypes
    windll.winmm.mciSendStringW.argtypes = [wintypes.LPCWSTR, wintypes.LPWSTR, wintypes.UINT, wintypes.HANDLE]
    windll.winmm.mciGetErrorStringW.argtypes = [wintypes.DWORD, wintypes.LPWSTR, wintypes.UINT]

    def winCommand(*command):
        bufLen = 600
        buf = create_unicode_buffer(bufLen)
        command = ' '.join(command)
        errorCode = int(windll.winmm.mciSendStringW(command, buf, bufLen - 1, 0))  # use widestring version of the function
        if errorCode:
            errorBuffer = create_unicode_buffer(bufLen)
            windll.winmm.mciGetErrorStringW(errorCode, errorBuffer, bufLen - 1)  # use widestring version of the function
            exceptionMessage = ('\n    Error ' + str(errorCode) + ' for command:'
                                '\n        ' + command +
                                '\n    ' + errorBuffer.value)
            logger.error(exceptionMessage)
            raise PlaysoundException(exceptionMessage)
        return buf.value

    try:
        logger.debug('Starting')
        winCommand(u'open {}'.format(sound))
        winCommand(u'play {}{}'.format(sound, ' wait' if block else ''))
        logger.debug('Returning')
    finally:
        try:
            winCommand(u'close {}'.format(sound))
        except PlaysoundException:
            logger.warning(u'Failed to close the file: {}'.format(sound))
            # If it fails, there's nothing more that can be done...
            pass


def _handlePathOSX(sound):
    sound = _canonicalizePath(sound)

    if '://' not in sound:
        if not sound.startswith('/'):
            from os import getcwd
            sound = getcwd() + '/' + sound
        sound = 'file://' + sound

    try:
        # Don't double-encode it.
        sound.encode('ascii')
        return sound.replace(' ', '%20')
    except UnicodeEncodeError:
        try:
            from urllib.parse import quote  # Try the Python 3 import first...
        except ImportError:
            from urllib import quote  # Try using the Python 2 import before giving up entirely...

        parts = sound.split('://', 1)
        return parts[0] + '://' + quote(parts[1].encode('utf-8')).replace(' ', '%20')


def _playsoundOSX(sound, block=True):
    '''
    Utilizes AppKit.NSSound. Tested and known to work with MP3 and WAVE on
    OS X 10.11 with Python 2.7. Probably works with anything QuickTime supports.
    Probably works on OS X 10.5 and newer. Probably works with all versions of
    Python.

    Inspired by (but not copied from) Aaron's Stack Overflow answer here:
    http://stackoverflow.com/a/34568298/901641

    I never would have tried using AppKit.NSSound without seeing his code.
    '''
    try:
        from AppKit import NSSound
    except ImportError:
        logger.warning("playsound could not find a copy of AppKit - falling back to using macOS's system copy.")
        sys.path.append('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
        from AppKit import NSSound

    from Foundation import NSURL
    from time import sleep

    sound = _handlePathOSX(sound)
    url = NSURL.URLWithString_(sound)
    if not url:
        raise PlaysoundException('Cannot find a sound with filename: ' + sound)

    for i in range(5):
        nssound = NSSound.alloc().initWithContentsOfURL_byReference_(url, True)
        if nssound:
            break
        else:
            logger.debug('Failed to load sound, although url was good... ' + sound)
    else:
        raise PlaysoundException('Could not load sound with filename, although URL was good... ' + sound)
    nssound.play()

    if block:
        sleep(nssound.duration())


def _playsoundNix(sound, block=True):
    """Play a sound using GStreamer.

    Inspired by this:
    https://gstreamer.freedesktop.org/documentation/tutorials/playback/playbin-usage.html
    """
    sound = _canonicalizePath(sound)

    # pathname2url escapes non-URL-safe characters
    from os.path import abspath, exists
    try:
        from urllib.request import pathname2url
    except ImportError:
        # python 2
        from urllib import pathname2url

    import gi
    gi.require_version('Gst', '1.0')
    from gi.repository import Gst

    Gst.init(None)

    playbin = Gst.ElementFactory.make('playbin', 'playbin')
    if sound.startswith(('http://', 'https://')):
        playbin.props.uri = sound
    else:
        path = abspath(sound)
        if not exists(path):
            raise PlaysoundException(u'File not found: {}'.format(path))
        playbin.props.uri = 'file://' + pathname2url(path)

    set_result = playbin.set_state(Gst.State.PLAYING)
    if set_result != Gst.StateChangeReturn.ASYNC:
        raise PlaysoundException(
            "playbin.set_state returned " + repr(set_result))

    # FIXME: use some other bus method than poll() with block=False
    # https://lazka.github.io/pgi-docs/#Gst-1.0/classes/Bus.html
    logger.debug('Starting play')
    if block:
        bus = playbin.get_bus()
        try:
            bus.poll(Gst.MessageType.EOS, Gst.CLOCK_TIME_NONE)
        finally:
            playbin.set_state(Gst.State.NULL)

    logger.debug('Finishing play')


def _playsoundAnotherPython(otherPython, sound, block=True, macOS=False):
    '''
    Mostly written so that when this is run on python3 on macOS, it can invoke
    python2 on macOS... but maybe this idea could be useful on linux, too.
    '''
    from inspect import getsourcefile
    from os.path import abspath, exists
    from subprocess import check_call
    from threading import Thread

    sound = _canonicalizePath(sound)

    class PropogatingThread(Thread):
        def run(self):
            self.exc = None
            try:
                self.ret = self._target(*self._args, **self._kwargs)
            except BaseException as e:
                self.exc = e

        def join(self, timeout=None):
            super().join(timeout)
            if self.exc:
                raise self.exc
            return self.ret

    # Check if the file exists...
    if not exists(abspath(sound)):
        raise PlaysoundException('Cannot find a sound with filename: ' + sound)

    playsoundPath = abspath(getsourcefile(lambda: 0))
    t = PropogatingThread(target=lambda: check_call([otherPython, playsoundPath, _handlePathOSX(sound) if macOS else sound]))
    t.start()
    if block:
        t.join()


from platform import system
system = system()

if system == 'Windows':
    playsound_func = _playsoundWin
elif system == 'Darwin':
    playsound_func = _playsoundOSX
    import sys
    if sys.version_info[0] > 2:
        try:
            from AppKit import NSSound
        except ImportError:
            logger.warning("playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently.")
            playsound_func = lambda sound, block=True: _playsoundAnotherPython('/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python', sound, block, macOS=True)
else:
    playsound_func = _playsoundNix
    if __name__ != '__main__':  # Ensure we don't infinitely recurse trying to get another python instance.
        try:
            import gi
            gi.require_version('Gst', '1.0')
            from gi.repository import Gst
        except:
            logger.warning("playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.")
            playsound_func = lambda sound, block=True: _playsoundAnotherPython('/usr/bin/python3', sound, block, macOS=False)

del system


def play(audio_filepath):
    playsound_func(audio_filepath)
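

def _play_demo(path="/path/to/notification.wav"):
    # Usage sketch, not part of the upstream module and never called here;
    # the path is a hypothetical placeholder. `play` blocks until the
    # platform-specific backend selected above finishes playback.
    play(path)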
audio_separator/separator/uvr_lib_v5/pyrb.py
ADDED
@@ -0,0 +1,92 @@
import os
import subprocess
import tempfile
import six
import numpy as np
import soundfile as sf
import sys

if getattr(sys, 'frozen', False):
    BASE_PATH_RUB = sys._MEIPASS
else:
    BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__))

__all__ = ['time_stretch', 'pitch_shift']

__RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, 'rubberband')

if six.PY2:
    DEVNULL = open(os.devnull, 'w')
else:
    DEVNULL = subprocess.DEVNULL


def __rubberband(y, sr, **kwargs):

    assert sr > 0

    # Get the input and output tempfile
    fd, infile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    fd, outfile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)

    # dump the audio
    sf.write(infile, y, sr)

    try:
        # Execute rubberband
        arguments = [__RUBBERBAND_UTIL, '-q']

        for key, value in six.iteritems(kwargs):
            arguments.append(str(key))
            arguments.append(str(value))

        arguments.extend([infile, outfile])

        subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL)

        # Load the processed audio.
        y_out, _ = sf.read(outfile, always_2d=True)

        # make sure that output dimensions matches input
        if y.ndim == 1:
            y_out = np.squeeze(y_out)

    except OSError as exc:
        six.raise_from(RuntimeError('Failed to execute rubberband. '
                                    'Please verify that rubberband-cli '
                                    'is installed.'),
                       exc)

    finally:
        # Remove temp files
        os.unlink(infile)
        os.unlink(outfile)

    return y_out


def time_stretch(y, sr, rate, rbargs=None):
    if rate <= 0:
        raise ValueError('rate must be strictly positive')

    if rate == 1.0:
        return y

    if rbargs is None:
        rbargs = dict()

    rbargs.setdefault('--tempo', rate)

    return __rubberband(y, sr, **rbargs)


def pitch_shift(y, sr, n_steps, rbargs=None):

    if n_steps == 0:
        return y

    if rbargs is None:
        rbargs = dict()

    rbargs.setdefault('--pitch', n_steps)

    return __rubberband(y, sr, **rbargs)
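

def _pyrb_demo(y, sr):
    # Usage sketch, not part of the upstream module and never called here;
    # running it requires the bundled `rubberband` binary to be present next
    # to this module. `rate` multiplies the tempo and `n_steps` is in
    # semitones.
    slower = time_stretch(y, sr, rate=0.5)   # half tempo: double duration
    higher = pitch_shift(y, sr, n_steps=2)   # two semitones up
    return slower, higher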
audio_separator/separator/uvr_lib_v5/results.py
ADDED
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-

"""
Matchering - Audio Matching and Mastering Python Library
Copyright (C) 2016-2022 Sergree

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

import os
import soundfile as sf


class Result:
    def __init__(
        self, file: str, subtype: str, use_limiter: bool = True, normalize: bool = True
    ):
        _, file_ext = os.path.splitext(file)
        file_ext = file_ext[1:].upper()
        if not sf.check_format(file_ext):
            raise TypeError(f"{file_ext} format is not supported")
        if not sf.check_format(file_ext, subtype):
            raise TypeError(f"{file_ext} format does not have {subtype} subtype")
        self.file = file
        self.subtype = subtype
        self.use_limiter = use_limiter
        self.normalize = normalize


def pcm16(file: str) -> Result:
    return Result(file, "PCM_16")


def pcm24(file: str) -> Result:
    return Result(file, "FLOAT")


def save_audiofile(file: str, wav_set="PCM_16") -> Result:
    return Result(file, wav_set)
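

def _result_demo():
    # Usage sketch, not part of the upstream module and never called here;
    # the filename is a placeholder. The factory helpers above just
    # pre-select a libsndfile subtype for the given output file.
    return save_audiofile("output.wav", wav_set="PCM_16")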
audio_separator/separator/uvr_lib_v5/roformer/attend.py
ADDED
@@ -0,0 +1,112 @@
from functools import wraps
from packaging import version
from collections import namedtuple

import torch
from torch import nn, einsum
import torch.nn.functional as F

from einops import rearrange, reduce

# constants

FlashAttentionConfig = namedtuple("FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"])

# helpers


def exists(val):
    return val is not None


def once(fn):
    called = False

    @wraps(fn)
    def inner(x):
        nonlocal called
        if called:
            return
        called = True
        return fn(x)

    return inner


print_once = once(print)

# main class


class Attend(nn.Module):
    def __init__(self, dropout=0.0, flash=False, scale=None):
        super().__init__()
        self.dropout = dropout
        # scale: optional fixed softmax scale; accepted here because LinearAttention
        # in bs_roformer.py constructs Attend(scale=...). When None, the usual
        # dim_head ** -0.5 scaling applies.
        self.scale = scale
        self.attn_dropout = nn.Dropout(dropout)

        self.flash = flash
        assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), "in order to use flash attention, you must be using pytorch 2.0 or above"

        # determine efficient attention configs for cuda and cpu

        self.cpu_config = FlashAttentionConfig(True, True, True)
        self.cuda_config = None

        if not torch.cuda.is_available() or not flash:
            return

        device_properties = torch.cuda.get_device_properties(torch.device("cuda"))

        if device_properties.major == 8 and device_properties.minor == 0:
            print_once("A100 GPU detected, using flash attention if input tensor is on cuda")
            self.cuda_config = FlashAttentionConfig(True, False, False)
        else:
            self.cuda_config = FlashAttentionConfig(False, True, True)

    def flash_attn(self, q, k, v):
        _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device

        if exists(self.scale):
            # rescale queries so scaled_dot_product_attention's default scaling realizes self.scale
            default_scale = q.shape[-1] ** -0.5
            q = q * (self.scale / default_scale)

        # check if there is a compatible device for flash attention

        config = self.cuda_config if is_cuda else self.cpu_config

        # sdpa_flash kernel only supports float16 on sm80+ architecture gpu
        if is_cuda and q.dtype != torch.float16:
            config = FlashAttentionConfig(False, True, True)

        # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale
        with torch.backends.cuda.sdp_kernel(**config._asdict()):
            out = F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0)

        return out

    def forward(self, q, k, v):
        """
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        """

        q_len, k_len, device = q.shape[-2], k.shape[-2], q.device

        scale = self.scale if exists(self.scale) else q.shape[-1] ** -0.5

        if self.flash:
            return self.flash_attn(q, k, v)

        # similarity

        sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale

        # attention

        attn = sim.softmax(dim=-1)
        attn = self.attn_dropout(attn)

        # aggregate values

        out = einsum("b h i j, b h j d -> b h i d", attn, v)

        return out
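
A minimal smoke test for the Attend module above (illustrative shapes; flash disabled so the plain einsum path runs on any device):

# sketch: non-flash attention on random tensors
import torch
from audio_separator.separator.uvr_lib_v5.roformer.attend import Attend

attend = Attend(dropout=0.0, flash=False)
q = torch.randn(1, 8, 64, 32)  # (batch, heads, seq_len, dim_head)
k = torch.randn(1, 8, 64, 32)
v = torch.randn(1, 8, 64, 32)
out = attend(q, k, v)          # softmax(q k^T / sqrt(d)) v
print(out.shape)               # -> torch.Size([1, 8, 64, 32])
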
audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py
ADDED
@@ -0,0 +1,535 @@
from functools import partial

import torch
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from .attend import Attend

from beartype.typing import Tuple, Optional, List, Callable
from beartype import beartype

from rotary_embedding_torch import RotaryEmbedding

from einops import rearrange, pack, unpack
from einops.layers.torch import Rearrange

# helper functions


def exists(val):
    return val is not None


def default(v, d):
    return v if exists(v) else d


def pack_one(t, pattern):
    return pack([t], pattern)


def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]


# norm


def l2norm(t):
    return F.normalize(t, dim=-1, p=2)


class RMSNorm(Module):
    def __init__(self, dim):
        super().__init__()
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        x = x.to(self.gamma.device)
        return F.normalize(x, dim=-1) * self.scale * self.gamma


# attention


class FeedForward(Module):
    def __init__(self, dim, mult=4, dropout=0.0):
        super().__init__()
        dim_inner = int(dim * mult)
        self.net = nn.Sequential(RMSNorm(dim), nn.Linear(dim, dim_inner), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)


class Attention(Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True):
        super().__init__()
        self.heads = heads
        self.scale = dim_head**-0.5
        dim_inner = heads * dim_head

        self.rotary_embed = rotary_embed

        self.attend = Attend(flash=flash, dropout=dropout)

        self.norm = RMSNorm(dim)
        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)

        self.to_gates = nn.Linear(dim, heads)

        self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout))

    def forward(self, x):
        x = self.norm(x)

        q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)

        if exists(self.rotary_embed):
            q = self.rotary_embed.rotate_queries_or_keys(q)
            k = self.rotary_embed.rotate_queries_or_keys(k)

        out = self.attend(q, k, v)

        gates = self.to_gates(x)
        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()

        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)


class LinearAttention(Module):
    """
    this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
    """

    @beartype
    def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0):
        super().__init__()
        dim_inner = dim_head * heads
        self.norm = RMSNorm(dim)

        self.to_qkv = nn.Sequential(nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads))

        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))

        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)

        self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False))

    def forward(self, x):
        x = self.norm(x)

        q, k, v = self.to_qkv(x)

        q, k = map(l2norm, (q, k))
        q = q * self.temperature.exp()

        out = self.attend(q, k, v)

        return self.to_out(out)


class Transformer(Module):
    def __init__(self, *, dim, depth, dim_head=64, heads=8, attn_dropout=0.0, ff_dropout=0.0, ff_mult=4, norm_output=True, rotary_embed=None, flash_attn=True, linear_attn=False):
        super().__init__()
        self.layers = ModuleList([])

        for _ in range(depth):
            if linear_attn:
                attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn)
            else:
                attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, rotary_embed=rotary_embed, flash=flash_attn)

            self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]))

        self.norm = RMSNorm(dim) if norm_output else nn.Identity()

    def forward(self, x):

        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return self.norm(x)


# bandsplit module


class BandSplit(Module):
    @beartype
    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = ModuleList([])

        for dim_in in dim_inputs:
            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))

            self.to_features.append(net)

    def forward(self, x):
        x = x.split(self.dim_inputs, dim=-1)

        outs = []
        for split_input, to_feature in zip(x, self.to_features):
            split_output = to_feature(split_input)
            outs.append(split_output)

        return torch.stack(outs, dim=-2)


def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
    dim_hidden = default(dim_hidden, dim_in)

    net = []
    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)

    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
        is_last = ind == (len(dims) - 2)

        net.append(nn.Linear(layer_dim_in, layer_dim_out))

        if is_last:
            continue

        net.append(activation())

    return nn.Sequential(*net)


class MaskEstimator(Module):
    @beartype
    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_freqs = ModuleList([])
        dim_hidden = dim * mlp_expansion_factor

        for dim_in in dim_inputs:
            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))

            self.to_freqs.append(mlp)

    def forward(self, x):
        x = x.unbind(dim=-2)

        outs = []

        for band_features, mlp in zip(x, self.to_freqs):
            freq_out = mlp(band_features)
            outs.append(freq_out)

        return torch.cat(outs, dim=-1)


# main class

# 62 bands: 24 x 2, 12 x 4, 8 x 12, 8 x 24, 8 x 48, then 128 and 129
# (sums to 1025, the number of stft bins for n_fft = 2048)
DEFAULT_FREQS_PER_BANDS = (
    *((2,) * 24),
    *((4,) * 12),
    *((12,) * 8),
    *((24,) * 8),
    *((48,) * 8),
    128,
    129,
)


class BSRoformer(Module):

    @beartype
    def __init__(
        self,
        dim,
        *,
        depth,
        stereo=False,
        num_stems=1,
        time_transformer_depth=2,
        freq_transformer_depth=2,
        linear_transformer_depth=0,
        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
        # in the paper, they divide into ~60 bands, test with 1 for starters
        dim_head=64,
        heads=8,
        attn_dropout=0.0,
        ff_dropout=0.0,
        flash_attn=True,
        dim_freqs_in=1025,
        stft_n_fft=2048,
        stft_hop_length=512,
        # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
        stft_win_length=2048,
        stft_normalized=False,
        stft_window_fn: Optional[Callable] = None,
        mask_estimator_depth=2,
        multi_stft_resolution_loss_weight=1.0,
        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
        multi_stft_hop_size=147,
        multi_stft_normalized=False,
        multi_stft_window_fn: Callable = torch.hann_window,
    ):
        super().__init__()

        self.stereo = stereo
        self.audio_channels = 2 if stereo else 1
        self.num_stems = num_stems

        self.layers = ModuleList([])

        transformer_kwargs = dict(dim=dim, heads=heads, dim_head=dim_head, attn_dropout=attn_dropout, ff_dropout=ff_dropout, flash_attn=flash_attn, norm_output=False)

        time_rotary_embed = RotaryEmbedding(dim=dim_head)
        freq_rotary_embed = RotaryEmbedding(dim=dim_head)

        for _ in range(depth):
            tran_modules = []
            if linear_transformer_depth > 0:
                tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs))
            tran_modules.append(Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs))
            tran_modules.append(Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs))
            self.layers.append(nn.ModuleList(tran_modules))

        self.final_norm = RMSNorm(dim)

        self.stft_kwargs = dict(n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized)

        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)

        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1]

        assert len(freqs_per_bands) > 1
        assert sum(freqs_per_bands) == freqs, f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"

        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands)

        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)

        self.mask_estimators = nn.ModuleList([])

        for _ in range(num_stems):
            mask_estimator = MaskEstimator(dim=dim, dim_inputs=freqs_per_bands_with_complex, depth=mask_estimator_depth)

            self.mask_estimators.append(mask_estimator)

        # for the multi-resolution stft loss

        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
        self.multi_stft_n_fft = stft_n_fft
        self.multi_stft_window_fn = multi_stft_window_fn

        self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized)

    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
        """
        einops

        b - batch
        f - freq
        t - time
        s - audio channel (1 for mono, 2 for stereo)
        n - number of 'stems'
        c - complex (2)
        d - feature dimension
        """

        original_device = raw_audio.device
        x_is_mps = original_device.type == "mps"

        # if x_is_mps:
        #     raw_audio = raw_audio.cpu()

        device = raw_audio.device

        if raw_audio.ndim == 2:
            raw_audio = rearrange(raw_audio, "b t -> b 1 t")

        channels = raw_audio.shape[1]
        assert (not self.stereo and channels == 1) or (
            self.stereo and channels == 2
        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"

        # to stft

        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")

        stft_window = self.stft_window_fn().to(device)

        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
        stft_repr = torch.view_as_real(stft_repr)

        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting

        x = rearrange(stft_repr, "b f t c -> b t (f c)")

        x = self.band_split(x)

        # axial / hierarchical attention

        for transformer_block in self.layers:

            if len(transformer_block) == 3:
                linear_transformer, time_transformer, freq_transformer = transformer_block

                x, ft_ps = pack([x], "b * d")
                x = linear_transformer(x)
                (x,) = unpack(x, ft_ps, "b * d")
            else:
                time_transformer, freq_transformer = transformer_block

            x = rearrange(x, "b t f d -> b f t d")
            x, ps = pack([x], "* t d")

            x = time_transformer(x)

            (x,) = unpack(x, ps, "* t d")
            x = rearrange(x, "b f t d -> b t f d")
            x, ps = pack([x], "* f d")

            x = freq_transformer(x)

            (x,) = unpack(x, ps, "* f d")

        x = self.final_norm(x)

        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)

        # if x_is_mps:
        #     mask = mask.to('cpu')

        # modulate frequency representation

        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")

        # complex number multiplication

        stft_repr = torch.view_as_complex(stft_repr)
        mask = torch.view_as_complex(mask)

        stft_repr = stft_repr * mask

        # istft

        stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels)

        recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False).to(device)

        recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=self.num_stems)

        if self.num_stems == 1:
            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")

        # if a target is passed in, calculate loss for learning

        if not exists(target):
            return recon_audio

        if self.num_stems > 1:
            assert target.ndim == 4 and target.shape[1] == self.num_stems

        if target.ndim == 2:
            target = rearrange(target, "... t -> ... 1 t")

        target = target[..., : recon_audio.shape[-1]]

        loss = F.l1_loss(recon_audio, target)

        multi_stft_resolution_loss = 0.0

        for window_size in self.multi_stft_resolutions_window_sizes:
            res_stft_kwargs = dict(
                n_fft=max(window_size, self.multi_stft_n_fft), win_length=window_size, return_complex=True, window=self.multi_stft_window_fn(window_size, device=device), **self.multi_stft_kwargs
            )

            recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs)
            target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs)

            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)

        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight

        total_loss = loss + weighted_multi_resolution_loss

        if not return_loss_breakdown:
            # if x_is_mps:
            #     total_loss = total_loss.to(original_device)
            return total_loss

        # if x_is_mps:
        #     loss = loss.to(original_device)
        #     multi_stft_resolution_loss = multi_stft_resolution_loss.to(original_device)

        return total_loss, (loss, multi_stft_resolution_loss)
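
A minimal inference sketch for BSRoformer above (toy hyperparameters chosen for speed, not quality; assumes rotary-embedding-torch, beartype and einops are installed; flash_attn disabled for portability):

# sketch: run a tiny BSRoformer on one second of random mono audio
import torch
from audio_separator.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer

model = BSRoformer(dim=32, depth=1, flash_attn=False)
audio = torch.randn(1, 44100)   # (batch, samples); mono because stereo=False
with torch.no_grad():
    stem = model(audio)         # no target passed -> returns the reconstructed audio
print(stem.shape)               # -> (1, 1, ~44100); the istft returns (frames - 1) * hop samples
# passing a target of the same shape instead returns the training loss:
# loss = model(audio, target=torch.randn_like(stem))
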
audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py
ADDED
@@ -0,0 +1,445 @@
from functools import partial

import torch
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from .attend import Attend

from beartype.typing import Tuple, Optional, List, Callable
from beartype import beartype

from rotary_embedding_torch import RotaryEmbedding

from einops import rearrange, pack, unpack, reduce, repeat

from librosa import filters


def exists(val):
    return val is not None


def default(v, d):
    return v if exists(v) else d


def pack_one(t, pattern):
    return pack([t], pattern)


def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]


def pad_at_dim(t, pad, dim=-1, value=0.0):
    dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1)
    zeros = (0, 0) * dims_from_right
    return F.pad(t, (*zeros, *pad), value=value)


class RMSNorm(Module):
    def __init__(self, dim):
        super().__init__()
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        x = x.to(self.gamma.device)
        return F.normalize(x, dim=-1) * self.scale * self.gamma


class FeedForward(Module):
    def __init__(self, dim, mult=4, dropout=0.0):
        super().__init__()
        dim_inner = int(dim * mult)
        self.net = nn.Sequential(RMSNorm(dim), nn.Linear(dim, dim_inner), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)


class Attention(Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True):
        super().__init__()
        self.heads = heads
        self.scale = dim_head**-0.5
        dim_inner = heads * dim_head

        self.rotary_embed = rotary_embed

        self.attend = Attend(flash=flash, dropout=dropout)

        self.norm = RMSNorm(dim)
        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)

        self.to_gates = nn.Linear(dim, heads)

        self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout))

    def forward(self, x):
        x = self.norm(x)

        q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)

        if exists(self.rotary_embed):
            q = self.rotary_embed.rotate_queries_or_keys(q)
            k = self.rotary_embed.rotate_queries_or_keys(k)

        out = self.attend(q, k, v)

        gates = self.to_gates(x)
        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()

        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)


class Transformer(Module):
    def __init__(self, *, dim, depth, dim_head=64, heads=8, attn_dropout=0.0, ff_dropout=0.0, ff_mult=4, norm_output=True, rotary_embed=None, flash_attn=True):
        super().__init__()
        self.layers = ModuleList([])

        for _ in range(depth):
            self.layers.append(
                ModuleList(
                    [Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, rotary_embed=rotary_embed, flash=flash_attn), FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
                )
            )

        self.norm = RMSNorm(dim) if norm_output else nn.Identity()

    def forward(self, x):

        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return self.norm(x)


class BandSplit(Module):
    @beartype
    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = ModuleList([])

        for dim_in in dim_inputs:
            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))

            self.to_features.append(net)

    def forward(self, x):
        x = x.split(self.dim_inputs, dim=-1)

        outs = []
        for split_input, to_feature in zip(x, self.to_features):
            split_output = to_feature(split_input)
            outs.append(split_output)

        return torch.stack(outs, dim=-2)


def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
    dim_hidden = default(dim_hidden, dim_in)

    net = []
    # note: uses depth hidden layers here, where bs_roformer.py's MLP uses depth - 1
    dims = (dim_in, *((dim_hidden,) * depth), dim_out)

    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
        is_last = ind == (len(dims) - 2)

        net.append(nn.Linear(layer_dim_in, layer_dim_out))

        if is_last:
            continue

        net.append(activation())

    return nn.Sequential(*net)


class MaskEstimator(Module):
    @beartype
    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_freqs = ModuleList([])
        dim_hidden = dim * mlp_expansion_factor

        for dim_in in dim_inputs:
            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))

            self.to_freqs.append(mlp)

    def forward(self, x):
        x = x.unbind(dim=-2)

        outs = []

        for band_features, mlp in zip(x, self.to_freqs):
            freq_out = mlp(band_features)
            outs.append(freq_out)

        return torch.cat(outs, dim=-1)


class MelBandRoformer(Module):

    @beartype
    def __init__(
        self,
        dim,
        *,
        depth,
        stereo=False,
        num_stems=1,
        time_transformer_depth=2,
        freq_transformer_depth=2,
        num_bands=60,
        dim_head=64,
        heads=8,
        attn_dropout=0.1,
        ff_dropout=0.1,
        flash_attn=True,
        dim_freqs_in=1025,
        sample_rate=44100,
        stft_n_fft=2048,
        stft_hop_length=512,
        stft_win_length=2048,
        stft_normalized=False,
        stft_window_fn: Optional[Callable] = None,
        mask_estimator_depth=1,
        multi_stft_resolution_loss_weight=1.0,
        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
        multi_stft_hop_size=147,
        multi_stft_normalized=False,
        multi_stft_window_fn: Callable = torch.hann_window,
        match_input_audio_length=False,
    ):
        super().__init__()

        self.stereo = stereo
        self.audio_channels = 2 if stereo else 1
        self.num_stems = num_stems

        self.layers = ModuleList([])

        transformer_kwargs = dict(dim=dim, heads=heads, dim_head=dim_head, attn_dropout=attn_dropout, ff_dropout=ff_dropout, flash_attn=flash_attn)

        time_rotary_embed = RotaryEmbedding(dim=dim_head)
        freq_rotary_embed = RotaryEmbedding(dim=dim_head)

        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs),
                        Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs),
                    ]
                )
            )

        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)

        self.stft_kwargs = dict(n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized)

        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1]

        # build a mel filter bank and derive which stft bins belong to each mel band

        mel_filter_bank_numpy = filters.mel(sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands)

        mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)

        # the lowest and highest filters are zero at the spectrum edges; force them on
        # so that every stft bin is covered by at least one band
        mel_filter_bank[0][0] = 1.0

        mel_filter_bank[-1, -1] = 1.0

        freqs_per_band = mel_filter_bank > 0
        assert freqs_per_band.any(dim=0).all(), "all frequencies need to be covered by all bands for now"

        repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands)
        freq_indices = repeated_freq_indices[freqs_per_band]

        if stereo:
            freq_indices = repeat(freq_indices, "f -> f s", s=2)
            freq_indices = freq_indices * 2 + torch.arange(2)
            freq_indices = rearrange(freq_indices, "f s -> (f s)")

        self.register_buffer("freq_indices", freq_indices, persistent=False)
        self.register_buffer("freqs_per_band", freqs_per_band, persistent=False)

        num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum")
        num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum")

        self.register_buffer("num_freqs_per_band", num_freqs_per_band, persistent=False)
        self.register_buffer("num_bands_per_freq", num_bands_per_freq, persistent=False)

        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist())

        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)

        self.mask_estimators = nn.ModuleList([])

        for _ in range(num_stems):
            mask_estimator = MaskEstimator(dim=dim, dim_inputs=freqs_per_bands_with_complex, depth=mask_estimator_depth)

            self.mask_estimators.append(mask_estimator)

        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
        self.multi_stft_n_fft = stft_n_fft
        self.multi_stft_window_fn = multi_stft_window_fn

        self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized)

        self.match_input_audio_length = match_input_audio_length

    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
        """
        einops

        b - batch
        f - freq
        t - time
        s - audio channel (1 for mono, 2 for stereo)
        n - number of 'stems'
        c - complex (2)
        d - feature dimension
        """

        original_device = raw_audio.device
        x_is_mps = original_device.type == "mps"

        if x_is_mps:
            raw_audio = raw_audio.cpu()

        device = raw_audio.device

        if raw_audio.ndim == 2:
            raw_audio = rearrange(raw_audio, "b t -> b 1 t")

        batch, channels, raw_audio_length = raw_audio.shape

        istft_length = raw_audio_length if self.match_input_audio_length else None

        assert (not self.stereo and channels == 1) or (
            self.stereo and channels == 2
        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"

        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")

        stft_window = self.stft_window_fn().to(device)

        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
        stft_repr = torch.view_as_real(stft_repr)

        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting

        batch_arange = torch.arange(batch, device=device)[..., None]

        x = stft_repr[batch_arange, self.freq_indices.cpu()] if x_is_mps else stft_repr[batch_arange, self.freq_indices]

        x = rearrange(x, "b f t c -> b t (f c)")

        x = self.band_split(x)

        for time_transformer, freq_transformer in self.layers:
            x = rearrange(x, "b t f d -> b f t d")
            x, ps = pack([x], "* t d")

            x = time_transformer(x)

            (x,) = unpack(x, ps, "* t d")
            x = rearrange(x, "b f t d -> b t f d")
            x, ps = pack([x], "* f d")

            x = freq_transformer(x)

            (x,) = unpack(x, ps, "* f d")

        masks = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
        masks = rearrange(masks, "b n t (f c) -> b n f t c", c=2)

        if x_is_mps:
            masks = masks.cpu()

        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")

        stft_repr = torch.view_as_complex(stft_repr)
        masks = torch.view_as_complex(masks)

        masks = masks.type(stft_repr.dtype)

        # scatter the per-band masks back onto the stft bins; bins shared by several
        # mel bands are summed here and averaged below
        if x_is_mps:
            scatter_indices = repeat(self.freq_indices.cpu(), "f -> b n f t", b=batch, n=self.num_stems, t=stft_repr.shape[-1])
        else:
            scatter_indices = repeat(self.freq_indices, "f -> b n f t", b=batch, n=self.num_stems, t=stft_repr.shape[-1])

        stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=self.num_stems)
        masks_summed = (
            torch.zeros_like(stft_repr_expanded_stems.cpu() if x_is_mps else stft_repr_expanded_stems)
            .scatter_add_(2, scatter_indices.cpu() if x_is_mps else scatter_indices, masks.cpu() if x_is_mps else masks)
            .to(device)
        )

        denom = repeat(self.num_bands_per_freq, "f -> (f r) 1", r=channels)

        if x_is_mps:
            denom = denom.cpu()

        masks_averaged = masks_summed / denom.clamp(min=1e-8)

        stft_repr = stft_repr * masks_averaged

        stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels)

        recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False, length=istft_length)

        recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", b=batch, s=self.audio_channels, n=self.num_stems)

        if self.num_stems == 1:
            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")

        if not exists(target):
            return recon_audio

        if self.num_stems > 1:
            assert target.ndim == 4 and target.shape[1] == self.num_stems

        if target.ndim == 2:
            target = rearrange(target, "... t -> ... 1 t")

        target = target[..., : recon_audio.shape[-1]]

        loss = F.l1_loss(recon_audio, target)

        multi_stft_resolution_loss = 0.0

        for window_size in self.multi_stft_resolutions_window_sizes:
            res_stft_kwargs = dict(
                n_fft=max(window_size, self.multi_stft_n_fft), win_length=window_size, return_complex=True, window=self.multi_stft_window_fn(window_size, device=device), **self.multi_stft_kwargs
            )

            recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs)
            target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs)

            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)

        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight

        total_loss = loss + weighted_multi_resolution_loss

        # Move the total loss back to the original device if necessary
        if x_is_mps:
            total_loss = total_loss.to(original_device)

        if not return_loss_breakdown:
            return total_loss

        # If a detailed loss breakdown is requested, ensure all components are on the original device
        return total_loss, (loss.to(original_device) if x_is_mps else loss, multi_stft_resolution_loss.to(original_device) if x_is_mps else multi_stft_resolution_loss)
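
The Mel-band variant can be exercised the same way (toy sizes; same dependency assumptions as the BSRoformer sketch; match_input_audio_length makes the istft return exactly the input length):

# sketch: run a tiny MelBandRoformer on random mono audio
import torch
from audio_separator.separator.uvr_lib_v5.roformer.mel_band_roformer import MelBandRoformer

model = MelBandRoformer(dim=32, depth=1, flash_attn=False, match_input_audio_length=True)
audio = torch.randn(1, 44100)
with torch.no_grad():
    stem = model(audio)
print(stem.shape)  # -> torch.Size([1, 1, 44100])
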
audio_separator/separator/uvr_lib_v5/spec_utils.py
ADDED
@@ -0,0 +1,1327 @@
import audioread
import librosa
import numpy as np
import soundfile as sf
import math
import platform
import traceback
from audio_separator.separator.uvr_lib_v5 import pyrb
from scipy.signal import correlate, hilbert
import io

OPERATING_SYSTEM = platform.system()
SYSTEM_ARCH = platform.platform()
SYSTEM_PROC = platform.processor()
ARM = "arm"

AUTO_PHASE = "Automatic"
POSITIVE_PHASE = "Positive Phase"
NEGATIVE_PHASE = "Negative Phase"
NONE_P = ("None",)
LOW_P = ("Shifts: Low",)
MED_P = ("Shifts: Medium",)
HIGH_P = ("Shifts: High",)
VHIGH_P = "Shifts: Very High"
MAXIMUM_P = "Shifts: Maximum"

progress_value = 0
last_update_time = 0
is_macos = False


if OPERATING_SYSTEM == "Darwin":
    wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest"
    wav_resolution_float_resampling = "kaiser_best" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else wav_resolution
    is_macos = True
else:
    wav_resolution = "sinc_fastest"
    wav_resolution_float_resampling = wav_resolution

MAX_SPEC = "Max Spec"
MIN_SPEC = "Min Spec"
LIN_ENSE = "Linear Ensemble"

MAX_WAV = MAX_SPEC
MIN_WAV = MIN_SPEC

AVERAGE = "Average"


def crop_center(h1, h2):
    """
    This function crops the center of the first input tensor to match the size of the second input tensor.
    It is used to ensure that the two tensors have the same size in the time dimension.
    """
    h1_shape = h1.size()
    h2_shape = h2.size()

    # If the time dimensions are already equal, return the first tensor as is
    if h1_shape[3] == h2_shape[3]:
        return h1
    # If the time dimension of the first tensor is smaller, raise an error
    elif h1_shape[3] < h2_shape[3]:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # Calculate the start and end indices for cropping
    s_time = (h1_shape[3] - h2_shape[3]) // 2
    e_time = s_time + h2_shape[3]
    # Crop the first tensor
    h1 = h1[:, :, :, s_time:e_time]

    return h1


def preprocess(X_spec):
    """
    This function preprocesses a spectrogram by separating it into magnitude and phase components.
    This is a common preprocessing step in audio processing tasks.
    """
    X_mag = np.abs(X_spec)
    X_phase = np.angle(X_spec)

    return X_mag, X_phase


def make_padding(width, cropsize, offset):
    """
    This function calculates the padding needed to make the width of an image divisible by the crop size.
    It is used in the process of splitting an image into smaller patches.
    """
    left = offset
    roi_size = cropsize - offset * 2
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left

    return left, right, roi_size


def normalize(wave, max_peak=1.0, min_peak=None):
    """Normalize (or amplify) an audio waveform to a specified peak value.

    Args:
        wave (array-like): Audio waveform.
        max_peak (float): Maximum peak value for normalization.
        min_peak (float, optional): If set, waveforms quieter than this peak are amplified up to it.

    Returns:
        array-like: Normalized or original waveform.
    """
    maxv = np.abs(wave).max()
    if maxv > max_peak:
        wave *= max_peak / maxv
    elif min_peak is not None and maxv < min_peak:
        wave *= min_peak / maxv

    return wave
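
A quick numeric check of normalize above (illustrative arrays; note the function scales the array in place and also returns it):

# sketch: peaks above max_peak are scaled down; quiet signals can be pulled up via min_peak
import numpy as np
from audio_separator.separator.uvr_lib_v5.spec_utils import normalize

wave = np.array([0.0, 2.0, -1.0])
print(normalize(wave.copy(), max_peak=1.0))                 # peak 2.0 > 1.0 -> [0.0, 1.0, -0.5]
quiet = np.array([0.0, 0.1, -0.05])
print(normalize(quiet.copy(), max_peak=1.0, min_peak=0.5))  # peak 0.1 < 0.5 -> [0.0, 0.5, -0.25]
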

def auto_transpose(audio_array: np.ndarray):
    """
    Ensure that the audio array is in the (channels, samples) format.

    Parameters:
        audio_array (ndarray): Input audio array.

    Returns:
        ndarray: Transposed audio array if necessary.
    """

    # If the second dimension is 2 (indicating stereo channels), transpose the array
    if audio_array.shape[1] == 2:
        return audio_array.T
    return audio_array


def write_array_to_mem(audio_data, subtype):
    if isinstance(audio_data, np.ndarray):
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, audio_data, 44100, subtype=subtype, format="WAV")
        audio_buffer.seek(0)
        return audio_buffer
    else:
        return audio_data


def spectrogram_to_image(spec, mode="magnitude"):
    if mode == "magnitude":
        if np.iscomplexobj(spec):
            y = np.abs(spec)
        else:
            y = spec
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            y = spec

    y -= y.min()
    y *= 255 / y.max()
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img


def reduce_vocal_aggressively(X, y, softmask):
    v = X - y
    y_mag_tmp = np.abs(y)
    v_mag_tmp = np.abs(v)

    v_mask = v_mag_tmp > y_mag_tmp
    y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)

    return y_mag * np.exp(1.0j * np.angle(y))


def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32):
    mask = y_mask

    try:
        if min_range < fade_size * 2:
            raise ValueError("min_range must be >= fade_size * 2")

        idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0]
        start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
        end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
        artifact_idx = np.where(end_idx - start_idx > min_range)[0]
        weight = np.zeros_like(y_mask)
        if len(artifact_idx) > 0:
            start_idx = start_idx[artifact_idx]
            end_idx = end_idx[artifact_idx]
            old_e = None
            for s, e in zip(start_idx, end_idx):
                if old_e is not None and s - old_e < fade_size:
                    s = old_e - fade_size * 2

                if s != 0:
                    weight[:, :, s : s + fade_size] = np.linspace(0, 1, fade_size)
                else:
                    s -= fade_size

                if e != y_mask.shape[2]:
                    weight[:, :, e - fade_size : e] = np.linspace(1, 0, fade_size)
                else:
                    e += fade_size

                weight[:, :, s + fade_size : e - fade_size] = 1
                old_e = e

        v_mask = 1 - y_mask
        y_mask += weight * v_mask

        mask = y_mask
    except Exception as e:
        error_name = f"{type(e).__name__}"
        traceback_text = "".join(traceback.format_tb(e.__traceback__))
        message = f'{error_name}: "{e}"\n{traceback_text}"'
        print("Post Process Failed: ", message)

    return mask


def align_wave_head_and_tail(a, b):
    l = min([a[0].size, b[0].size])

    return a[:l, :l], b[:l, :l]


def convert_channels(spec, mp, band):
    cc = mp.param["band"][band].get("convert_channels")

    if "mid_side_c" == cc:
        spec_left = np.add(spec[0], spec[1] * 0.25)
        spec_right = np.subtract(spec[1], spec[0] * 0.25)
    elif "mid_side" == cc:
        spec_left = np.add(spec[0], spec[1]) / 2
        spec_right = np.subtract(spec[0], spec[1])
    elif "stereo_n" == cc:
        spec_left = np.add(spec[0], spec[1] * 0.25) / 0.9375
        spec_right = np.add(spec[1], spec[0] * 0.25) / 0.9375
    else:
        return spec

    return np.asfortranarray([spec_left, spec_right])
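
For intuition, the "mid_side" branch above encodes stereo (L, R) as mid M = (L + R) / 2 and side S = L - R; the matching branch in spectrogram_to_wave further down decodes with (M + S / 2, M - S / 2). A quick check with plain numbers:

# sketch: verify the mid_side encode/decode pair used in this module
import math

L, R = 0.8, 0.2
M, S = (L + R) / 2, L - R          # encode ("mid_side")
assert math.isclose(M + S / 2, L)  # decode left
assert math.isclose(M - S / 2, R)  # decode right
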
248 |
+
|
249 |
+
|
250 |
+
def combine_spectrograms(specs, mp, is_v51_model=False):
|
251 |
+
l = min([specs[i].shape[2] for i in specs])
|
252 |
+
spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
|
253 |
+
offset = 0
|
254 |
+
bands_n = len(mp.param["band"])
|
255 |
+
|
256 |
+
for d in range(1, bands_n + 1):
|
257 |
+
h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
|
258 |
+
spec_c[:, offset : offset + h, :l] = specs[d][:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l]
|
259 |
+
offset += h
|
260 |
+
|
261 |
+
if offset > mp.param["bins"]:
|
262 |
+
raise ValueError("Too much bins")
|
263 |
+
|
264 |
+
# lowpass fiter
|
265 |
+
|
266 |
+
if mp.param["pre_filter_start"] > 0:
|
267 |
+
if is_v51_model:
|
268 |
+
spec_c *= get_lp_filter_mask(spec_c.shape[1], mp.param["pre_filter_start"], mp.param["pre_filter_stop"])
|
269 |
+
else:
|
270 |
+
if bands_n == 1:
|
271 |
+
spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"])
|
272 |
+
else:
|
273 |
+
gp = 1
|
274 |
+
for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]):
|
275 |
+
g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0)
|
276 |
+
gp = g
|
277 |
+
spec_c[:, b, :] *= g
|
278 |
+
|
279 |
+
return np.asfortranarray(spec_c)
|
280 |
+
|
281 |
+
|
282 |
+
def wave_to_spectrogram(wave, hop_length, n_fft, mp, band, is_v51_model=False):
    if wave.ndim == 1:
        wave = np.asfortranarray([wave, wave])

    if not is_v51_model:
        if mp.param["reverse"]:
            wave_left = np.flip(np.asfortranarray(wave[0]))
            wave_right = np.flip(np.asfortranarray(wave[1]))
        elif mp.param["mid_side"]:
            wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
            wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
        elif mp.param["mid_side_b2"]:
            wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
            wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
        else:
            wave_left = np.asfortranarray(wave[0])
            wave_right = np.asfortranarray(wave[1])
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    spec = np.asfortranarray([spec_left, spec_right])

    if is_v51_model:
        spec = convert_channels(spec, mp, band)

    return spec


def spectrogram_to_wave(spec, hop_length=1024, mp={}, band=0, is_v51_model=True):
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hop_length)
    wave_right = librosa.istft(spec_right, hop_length=hop_length)

    if is_v51_model:
        cc = mp.param["band"][band].get("convert_channels")
        if "mid_side_c" == cc:
            return np.asfortranarray([np.subtract(wave_left / 1.0625, wave_right / 4.25), np.add(wave_right / 1.0625, wave_left / 4.25)])
        elif "mid_side" == cc:
            return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
        elif "stereo_n" == cc:
            return np.asfortranarray([np.subtract(wave_left, wave_right * 0.25), np.subtract(wave_right, wave_left * 0.25)])
    else:
        if mp.param["reverse"]:
            return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
        elif mp.param["mid_side"]:
            return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
        elif mp.param["mid_side_b2"]:
            return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)])

    return np.asfortranarray([wave_left, wave_right])

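# A quick numeric check (a sketch, not part of spec_utils.py) of the mid/side
# round trip used above: wave_to_spectrogram encodes mid = (L + R) / 2 and
# side = L - R, and spectrogram_to_wave decodes L = mid + side / 2 and
# R = mid - side / 2.
def _mid_side_round_trip_sketch():
    import numpy as np

    left, right = np.array([1.0, 0.5]), np.array([0.2, -0.4])
    mid, side = (left + right) / 2, left - right  # encode
    l2, r2 = mid + side / 2, mid - side / 2  # decode
    assert np.allclose(left, l2) and np.allclose(right, r2)
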
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None, is_v51_model=False):
    bands_n = len(mp.param["band"])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = mp.param["band"][d]
        spec_s = np.zeros(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex)
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :]
            if bp["hpf_start"] > 0:
                if is_v51_model:
                    spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1)
                else:
                    spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
                wave = spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)
            else:
                wave = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model))
        else:
            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lower
                if is_v51_model:
                    spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"])
                else:
                    spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])

                try:
                    wave = librosa.resample(spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model), orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution)
                except ValueError as e:
                    print(f"Error during resampling: {e}")
                    print(f"Spec_s shape: {spec_s.shape}, SR: {sr}, Res type: {wav_resolution}")

            else:  # mid
                if is_v51_model:
                    spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1)
                    spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"])
                else:
                    spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
                    spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])

                wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model))

                try:
                    wave = librosa.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution)
                except ValueError as e:
                    print(f"Error during resampling: {e}")
                    print(f"Spec_s shape: {spec_s.shape}, SR: {sr}, Res type: {wav_resolution}")

    return wave

def get_lp_filter_mask(n_bins, bin_start, bin_stop):
    mask = np.concatenate([np.ones((bin_start - 1, 1)), np.linspace(1, 0, bin_stop - bin_start + 1)[:, None], np.zeros((n_bins - bin_stop, 1))], axis=0)

    return mask


def get_hp_filter_mask(n_bins, bin_start, bin_stop):
    mask = np.concatenate([np.zeros((bin_stop + 1, 1)), np.linspace(0, 1, 1 + bin_start - bin_stop)[:, None], np.ones((n_bins - bin_start - 2, 1))], axis=0)

    return mask


def fft_lp_filter(spec, bin_start, bin_stop):
    g = 1.0
    for b in range(bin_start, bin_stop):
        g -= 1 / (bin_stop - bin_start)
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, bin_stop:, :] *= 0

    return spec


def fft_hp_filter(spec, bin_start, bin_stop):
    g = 1.0
    for b in range(bin_start, bin_stop, -1):
        g -= 1 / (bin_start - bin_stop)
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, 0 : bin_stop + 1, :] *= 0

    return spec

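# Sketch (hypothetical sizes, not part of spec_utils.py): the lowpass mask is
# an (n_bins, 1) column - passband ones, a linear roll-off, stopband zeros -
# so multiplying it against a (2, n_bins, n_frames) spectrogram broadcasts
# across both channels and all time frames.
def _lp_mask_sketch():
    import numpy as np

    n_bins, bin_start, bin_stop = 16, 6, 10
    mask = np.concatenate(
        [
            np.ones((bin_start - 1, 1)),  # passband
            np.linspace(1, 0, bin_stop - bin_start + 1)[:, None],  # roll-off
            np.zeros((n_bins - bin_stop, 1)),  # stopband
        ],
        axis=0,
    )
    assert mask.shape == (n_bins, 1)
    return mask
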
def spectrogram_to_wave_old(spec, hop_length=1024):
    if spec.ndim == 2:
        wave = librosa.istft(spec, hop_length=hop_length)
    elif spec.ndim == 3:
        spec_left = np.asfortranarray(spec[0])
        spec_right = np.asfortranarray(spec[1])

        wave_left = librosa.istft(spec_left, hop_length=hop_length)
        wave_right = librosa.istft(spec_right, hop_length=hop_length)
        wave = np.asfortranarray([wave_left, wave_right])

    return wave


def wave_to_spectrogram_old(wave, hop_length, n_fft):
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])

    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    spec = np.asfortranarray([spec_left, spec_right])

    return spec

def mirroring(a, spec_m, input_high_end, mp):
    if "mirroring" == a:
        mirror = np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1)
        mirror = mirror * np.exp(1.0j * np.angle(input_high_end))

        return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)

    if "mirroring2" == a:
        mirror = np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1)
        mi = np.multiply(mirror, input_high_end * 1.7)

        return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)

def adjust_aggr(mask, is_non_accom_stem, aggressiveness):
    aggr = aggressiveness["value"] * 2

    if aggr != 0:
        if is_non_accom_stem:
            aggr = 1 - aggr

        if np.any(aggr > 10) or np.any(aggr < -10):
            print(f"Warning: Extreme aggressiveness values detected: {aggr}")

        aggr = [aggr, aggr]

        if aggressiveness["aggr_correction"] is not None:
            aggr[0] += aggressiveness["aggr_correction"]["left"]
            aggr[1] += aggressiveness["aggr_correction"]["right"]

        for ch in range(2):
            mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3)
            mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch])

    return mask

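# Sketch (not part of spec_utils.py): adjust_aggr sharpens a soft mask by
# raising it to a power. Values in (0, 1) shrink as the exponent grows, so
# low-confidence bins are suppressed hardest while bins near 1 barely move.
def _aggr_power_sketch():
    import numpy as np

    mask = np.array([0.1, 0.5, 0.9])
    aggr = 2  # hypothetical aggressiveness
    sharpened = np.power(mask, 1 + aggr)
    # -> [0.001, 0.125, 0.729]
    return sharpened
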
def stft(wave, nfft, hl):
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
    spec = np.asfortranarray([spec_left, spec_right])

    return spec


def istft(spec, hl):
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])
    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])

    return wave

def spec_effects(wave, algorithm="Default", value=None):
    if np.isnan(wave).any() or np.isinf(wave).any():
        print(f"Warning: Detected NaN or infinite values in wave input. Shape: {wave.shape}")

    spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)]
    if algorithm == "Min_Mag":
        v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0])
        wave = istft(v_spec_m, 1024)
    elif algorithm == "Max_Mag":
        v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0])
        wave = istft(v_spec_m, 1024)
    elif algorithm == "Default":
        wave = (wave[1] * value) + (wave[0] * (1 - value))
    elif algorithm == "Invert_p":
        X_mag = np.abs(spec[0])
        y_mag = np.abs(spec[1])
        max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
        v_spec = spec[1] - max_mag * np.exp(1.0j * np.angle(spec[0]))
        wave = istft(v_spec, 1024)

    return wave

def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024):
    wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length)

    if wave.ndim == 1:
        wave = np.asfortranarray([wave, wave])

    return wave


def wave_to_spectrogram_no_mp(wave):
    spec = librosa.stft(wave, n_fft=2048, hop_length=1024)

    if spec.ndim == 1:
        spec = np.asfortranarray([spec, spec])

    return spec

def invert_audio(specs, invert_p=True):
    ln = min([specs[0].shape[2], specs[1].shape[2]])
    specs[0] = specs[0][:, :, :ln]
    specs[1] = specs[1][:, :, :ln]

    if invert_p:
        X_mag = np.abs(specs[0])
        y_mag = np.abs(specs[1])
        max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
        v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
    else:
        specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
        v_spec = specs[0] - specs[1]

    return v_spec


def invert_stem(mixture, stem):
    mixture = wave_to_spectrogram_no_mp(mixture)
    stem = wave_to_spectrogram_no_mp(stem)
    output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem]))

    return -output.T

def ensembling(a, inputs, is_wavs=False):
    for i in range(1, len(inputs)):
        if i == 1:
            input = inputs[0]

        if is_wavs:
            ln = min([input.shape[1], inputs[i].shape[1]])
            input = input[:, :ln]
            inputs[i] = inputs[i][:, :ln]
        else:
            ln = min([input.shape[2], inputs[i].shape[2]])
            input = input[:, :, :ln]
            inputs[i] = inputs[i][:, :, :ln]

        if MIN_SPEC == a:
            input = np.where(np.abs(inputs[i]) <= np.abs(input), inputs[i], input)
        if MAX_SPEC == a:
            input = np.where(np.abs(inputs[i]) >= np.abs(input), inputs[i], input)

    return input

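# Sketch (not part of spec_utils.py) of the MIN_SPEC rule above: per bin,
# keep the complex value with the smaller magnitude, which suppresses
# residues that only one of the inputs contains.
def _min_mag_sketch():
    import numpy as np

    a = np.array([1.0 + 1.0j, 0.1 + 0.0j])
    b = np.array([0.5 + 0.0j, 2.0 + 2.0j])
    min_spec = np.where(np.abs(b) <= np.abs(a), b, a)
    # -> [0.5+0j, 0.1+0j]
    return min_spec
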
def ensemble_for_align(waves):
    specs = []

    for wav in waves:
        spec = wave_to_spectrogram_no_mp(wav.T)
        specs.append(spec)

    wav_aligned = spectrogram_to_wave_no_mp(ensembling(MIN_SPEC, specs)).T
    wav_aligned = match_array_shapes(wav_aligned, waves[1], is_swap=True)

    return wav_aligned

def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path, is_wave=False, is_array=False):
    wavs_ = []

    if algorithm == AVERAGE:
        output = average_audio(audio_input)
        samplerate = 44100
    else:
        specs = []

        for i in range(len(audio_input)):
            wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100)
            wavs_.append(wave)
            spec = wave if is_wave else wave_to_spectrogram_no_mp(wave)
            specs.append(spec)

        wave_shapes = [w.shape[1] for w in wavs_]
        target_shape = wavs_[wave_shapes.index(max(wave_shapes))]

        if is_wave:
            output = ensembling(algorithm, specs, is_wavs=True)
        else:
            output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs))

        output = to_shape(output, target_shape.shape)

    sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set)

def to_shape(x, target_shape):
    padding_list = []
    for x_dim, target_dim in zip(x.shape, target_shape):
        pad_value = target_dim - x_dim
        pad_tuple = (0, pad_value)
        padding_list.append(pad_tuple)

    return np.pad(x, tuple(padding_list), mode="constant")


def to_shape_minimize(x: np.ndarray, target_shape):
    # Currently identical to to_shape; kept as a separate name for its callers.
    padding_list = []
    for x_dim, target_dim in zip(x.shape, target_shape):
        pad_value = target_dim - x_dim
        pad_tuple = (0, pad_value)
        padding_list.append(pad_tuple)

    return np.pad(x, tuple(padding_list), mode="constant")

def detect_leading_silence(audio, sr, silence_threshold=0.007, frame_length=1024):
    """
    Detect silence at the beginning of an audio signal.

    :param audio: np.array, audio signal
    :param sr: int, sample rate
    :param silence_threshold: float, magnitude threshold below which is considered silence
    :param frame_length: int, the number of samples to consider for each check

    :return: float, duration of the leading silence in milliseconds
    """

    if len(audio.shape) == 2:
        # If stereo, pick the channel with more energy to determine the silence
        channel = np.argmax(np.sum(np.abs(audio), axis=1))
        audio = audio[channel]

    for i in range(0, len(audio), frame_length):
        if np.max(np.abs(audio[i : i + frame_length])) > silence_threshold:
            return (i / sr) * 1000

    return (len(audio) / sr) * 1000

def adjust_leading_silence(target_audio, reference_audio, silence_threshold=0.01, frame_length=1024):
    """
    Adjust the leading silence of the target_audio to match the leading silence of the reference_audio.

    :param target_audio: np.array, audio signal that will have its silence adjusted
    :param reference_audio: np.array, audio signal used as a reference
    :param silence_threshold: float, magnitude threshold below which is considered silence
    :param frame_length: int, the number of samples to consider for each check

    :return: np.array, target_audio adjusted to have the same leading silence as reference_audio
    """

    def find_silence_end(audio):
        if len(audio.shape) == 2:
            # If stereo, pick the channel with more energy to determine the silence
            channel = np.argmax(np.sum(np.abs(audio), axis=1))
            audio_mono = audio[channel]
        else:
            audio_mono = audio

        for i in range(0, len(audio_mono), frame_length):
            if np.max(np.abs(audio_mono[i : i + frame_length])) > silence_threshold:
                return i
        return len(audio_mono)

    ref_silence_end = find_silence_end(reference_audio)
    target_silence_end = find_silence_end(target_audio)
    silence_difference = ref_silence_end - target_silence_end

    try:
        ref_silence_end_p = (ref_silence_end / 44100) * 1000
        target_silence_end_p = (target_silence_end / 44100) * 1000
        silence_difference_p = ref_silence_end_p - target_silence_end_p
        print("silence_difference: ", silence_difference_p)
    except Exception:
        pass

    if silence_difference > 0:  # Add silence to target_audio
        if len(target_audio.shape) == 2:  # stereo
            silence_to_add = np.zeros((target_audio.shape[0], silence_difference))
        else:  # mono
            silence_to_add = np.zeros(silence_difference)
        return np.hstack((silence_to_add, target_audio))
    elif silence_difference < 0:  # Remove silence from target_audio
        if len(target_audio.shape) == 2:  # stereo
            return target_audio[:, -silence_difference:]
        else:  # mono
            return target_audio[-silence_difference:]
    else:  # No adjustment needed
        return target_audio

def match_array_shapes(array_1: np.ndarray, array_2: np.ndarray, is_swap=False):
    if is_swap:
        array_1, array_2 = array_1.T, array_2.T

    if array_1.shape[1] > array_2.shape[1]:
        array_1 = array_1[:, : array_2.shape[1]]
    elif array_1.shape[1] < array_2.shape[1]:
        padding = array_2.shape[1] - array_1.shape[1]
        array_1 = np.pad(array_1, ((0, 0), (0, padding)), "constant", constant_values=0)

    if is_swap:
        array_1, array_2 = array_1.T, array_2.T

    return array_1


def match_mono_array_shapes(array_1: np.ndarray, array_2: np.ndarray):
    if len(array_1) > len(array_2):
        array_1 = array_1[: len(array_2)]
    elif len(array_1) < len(array_2):
        padding = len(array_2) - len(array_1)
        array_1 = np.pad(array_1, (0, padding), "constant", constant_values=0)

    return array_1

def change_pitch_semitones(y, sr, semitone_shift):
    factor = 2 ** (semitone_shift / 12)  # convert semitone shift to a resampling factor
    y_pitch_tuned = []
    for y_channel in y:
        y_pitch_tuned.append(librosa.resample(y_channel, orig_sr=sr, target_sr=sr * factor, res_type=wav_resolution_float_resampling))
    y_pitch_tuned = np.array(y_pitch_tuned)
    new_sr = sr * factor
    return y_pitch_tuned, new_sr

def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False, is_time_correction=True):
    wav, sr = librosa.load(audio_file, sr=44100, mono=False)

    if wav.ndim == 1:
        wav = np.asfortranarray([wav, wav])

    if not is_time_correction:
        wav_mix = change_pitch_semitones(wav, 44100, semitone_shift=-rate)[0]
    else:
        if is_pitch:
            wav_1 = pyrb.pitch_shift(wav[0], sr, rate, rbargs=None)
            wav_2 = pyrb.pitch_shift(wav[1], sr, rate, rbargs=None)
        else:
            wav_1 = pyrb.time_stretch(wav[0], sr, rate, rbargs=None)
            wav_2 = pyrb.time_stretch(wav[1], sr, rate, rbargs=None)

        if wav_1.shape > wav_2.shape:
            wav_2 = to_shape(wav_2, wav_1.shape)
        if wav_1.shape < wav_2.shape:
            wav_1 = to_shape(wav_1, wav_2.shape)

        wav_mix = np.asfortranarray([wav_1, wav_2])

    sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set)
    save_format(export_path)

def average_audio(audio):
    waves = []
    wave_shapes = []
    final_waves = []

    for i in range(len(audio)):
        wave = librosa.load(audio[i], sr=44100, mono=False)
        waves.append(wave[0])
        wave_shapes.append(wave[0].shape[1])

    wave_shapes_index = wave_shapes.index(max(wave_shapes))
    target_shape = waves[wave_shapes_index]
    waves.pop(wave_shapes_index)
    final_waves.append(target_shape)

    for n_array in waves:
        wav_target = to_shape(n_array, target_shape.shape)
        final_waves.append(wav_target)

    waves = sum(final_waves)
    waves = waves / len(audio)

    return waves

def average_dual_sources(wav_1, wav_2, value):
    if wav_1.shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1.shape)
    if wav_1.shape < wav_2.shape:
        wav_1 = to_shape(wav_1, wav_2.shape)

    wave = (wav_1 * value) + (wav_2 * (1 - value))

    return wave


def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray):
    if wav_1.shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1.shape)
    if wav_1.shape < wav_2.shape:
        ln = min([wav_1.shape[1], wav_2.shape[1]])
        wav_2 = wav_2[:, :ln]

    ln = min([wav_1.shape[1], wav_2.shape[1]])
    wav_1 = wav_1[:, :ln]
    wav_2 = wav_2[:, :ln]

    return wav_2


def reshape_sources_ref(wav_1_shape, wav_2: np.ndarray):
    if wav_1_shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1_shape)

    return wav_2

def combine_arrarys(audio_sources, is_swap=False):
    source = np.zeros_like(max(audio_sources, key=np.size))

    for v in audio_sources:
        v = match_array_shapes(v, source, is_swap=is_swap)
        source += v

    return source


def combine_audio(paths: list, audio_file_base=None, wav_type_set="FLOAT", save_format=None):
    source = combine_arrarys([load_audio(i) for i in paths])
    save_path = f"{audio_file_base}_combined.wav"
    sf.write(save_path, source.T, 44100, subtype=wav_type_set)
    save_format(save_path)


def reduce_mix_bv(inst_source, voc_source, reduction_rate=0.9):
    # Reduce the instrumental volume before recombining with the vocals
    inst_source = inst_source * (1 - reduction_rate)

    mix_reduced = combine_arrarys([inst_source, voc_source], is_swap=True)

    return mix_reduced

def organize_inputs(inputs):
    input_list = {"target": None, "reference": None, "reverb": None, "inst": None}

    for i in inputs:
        if i.endswith("_(Vocals).wav"):
            input_list["reference"] = i
        elif "_RVC_" in i:
            input_list["target"] = i
        elif i.endswith("reverbed_stem.wav"):
            input_list["reverb"] = i
        elif i.endswith("_(Instrumental).wav"):
            input_list["inst"] = i

    return input_list

def check_if_phase_inverted(wav1, wav2, is_mono=False):
    # Collapse to mono so a single correlation covers both channels
    if not is_mono:
        wav1 = np.mean(wav1, axis=0)
        wav2 = np.mean(wav2, axis=0)

    # Compute the correlation over the first 1000 samples
    correlation = np.corrcoef(wav1[:1000], wav2[:1000])

    return correlation[0, 1] < 0

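# Sketch (not part of spec_utils.py): a polarity-flipped copy correlates
# negatively with the original, which is all the check above looks for in
# the first 1000 samples.
def _phase_inversion_sketch():
    import numpy as np

    t = np.linspace(0, 1, 1000)
    wav = np.sin(2 * np.pi * 5 * t)
    assert np.corrcoef(wav[:1000], (-wav)[:1000])[0, 1] < 0
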
def align_audio(
    file1,
    file2,
    file2_aligned,
    file_subtracted,
    wav_type_set,
    is_save_aligned,
    command_Text,
    save_format,
    align_window: list,
    align_intro_val: list,
    db_analysis: tuple,
    set_progress_bar,
    phase_option,
    phase_shifts,
    is_match_silence,
    is_spec_match,
):
    global progress_value
    progress_value = 0
    is_mono = False

    def get_diff(a, b):
        corr = np.correlate(a, b, "full")
        diff = corr.argmax() - (b.shape[0] - 1)

        return diff

    def progress_bar(length):
        global progress_value
        progress_value += 1

        if (0.90 / length * progress_value) >= 0.9:
            length = progress_value + 1

        set_progress_bar(0.1, (0.9 / length * progress_value))

    # read tracks
    if file1.endswith(".mp3") and is_macos:
        length1 = rerun_mp3(file1)
        wav1, sr1 = librosa.load(file1, duration=length1, sr=44100, mono=False)
    else:
        wav1, sr1 = librosa.load(file1, sr=44100, mono=False)

    if file2.endswith(".mp3") and is_macos:
        length2 = rerun_mp3(file2)
        wav2, sr2 = librosa.load(file2, duration=length2, sr=44100, mono=False)
    else:
        wav2, sr2 = librosa.load(file2, sr=44100, mono=False)

    if wav1.ndim == 1 and wav2.ndim == 1:
        is_mono = True
    elif wav1.ndim == 1:
        wav1 = np.asfortranarray([wav1, wav1])
    elif wav2.ndim == 1:
        wav2 = np.asfortranarray([wav2, wav2])

    # Check if phase is inverted
    if phase_option == AUTO_PHASE:
        if check_if_phase_inverted(wav1, wav2, is_mono=is_mono):
            wav2 = -wav2
    elif phase_option == POSITIVE_PHASE:
        wav2 = +wav2
    elif phase_option == NEGATIVE_PHASE:
        wav2 = -wav2

    if is_match_silence:
        wav2 = adjust_leading_silence(wav2, wav1)

    wav1_length = int(librosa.get_duration(y=wav1, sr=44100))
    wav2_length = int(librosa.get_duration(y=wav2, sr=44100))

    if not is_mono:
        wav1 = wav1.transpose()
        wav2 = wav2.transpose()

    wav2_org = wav2.copy()

    command_Text("Processing files... \n")
    seconds_length = min(wav1_length, wav2_length)

    wav2_aligned_sources = []

    for sec_len in align_intro_val:
        # pick a probe position and estimate the offset there
        sec_seg = 1 if sec_len == 1 else int(seconds_length // sec_len)
        index = sr1 * sec_seg  # assuming sr1 = sr2 = 44100

        if is_mono:
            samp1, samp2 = wav1[index : index + sr1], wav2[index : index + sr1]
            diff = get_diff(samp1, samp2)
        else:
            samp1, samp2 = wav1[index : index + sr1, 0], wav2[index : index + sr1, 0]
            samp1_r, samp2_r = wav1[index : index + sr1, 1], wav2[index : index + sr1, 1]
            diff, diff_r = get_diff(samp1, samp2), get_diff(samp1_r, samp2_r)

        # make aligned track 2
        if diff > 0:
            zeros_to_append = np.zeros(diff) if is_mono else np.zeros((diff, 2))
            wav2_aligned = np.append(zeros_to_append, wav2_org, axis=0)
        elif diff < 0:
            wav2_aligned = wav2_org[-diff:]
        else:
            wav2_aligned = wav2_org

        if not any(np.array_equal(wav2_aligned, source) for source in wav2_aligned_sources):
            wav2_aligned_sources.append(wav2_aligned)

    unique_sources = len(wav2_aligned_sources)

    sub_mapper_big_mapper = {}

    for s in wav2_aligned_sources:
        wav2_aligned = match_mono_array_shapes(s, wav1) if is_mono else match_array_shapes(s, wav1, is_swap=True)

        if align_window:
            wav_sub = time_correction(
                wav1, wav2_aligned, seconds_length, align_window=align_window, db_analysis=db_analysis, progress_bar=progress_bar, unique_sources=unique_sources, phase_shifts=phase_shifts
            )
            wav_sub_size = np.abs(wav_sub).mean()
            sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}}
        else:
            wav2_aligned = wav2_aligned * np.power(10, db_analysis[0] / 20)
            db_range = db_analysis[1]

            for db_adjustment in db_range:
                # Adjust the dB of track 2
                s_adjusted = wav2_aligned * (10 ** (db_adjustment / 20))
                wav_sub = wav1 - s_adjusted
                wav_sub_size = np.abs(wav_sub).mean()
                sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}}

    sub_mapper_value_list = list(sub_mapper_big_mapper.values())

    if is_spec_match and len(sub_mapper_value_list) >= 2:
        wav_sub = ensemble_for_align(list(sub_mapper_big_mapper.values()))
    else:
        wav_sub = ensemble_wav(list(sub_mapper_big_mapper.values()))

    wav_sub = np.clip(wav_sub, -1, +1)

    command_Text("Saving inverted track... ")

    if is_save_aligned or is_spec_match:
        wav1 = match_mono_array_shapes(wav1, wav_sub) if is_mono else match_array_shapes(wav1, wav_sub, is_swap=True)
        wav2_aligned = wav1 - wav_sub

        if is_spec_match:
            if wav1.ndim == 1 and wav2.ndim == 1:
                wav2_aligned = np.asfortranarray([wav2_aligned, wav2_aligned]).T
                wav1 = np.asfortranarray([wav1, wav1]).T

            wav2_aligned = ensemble_for_align([wav2_aligned, wav1])
            wav_sub = wav1 - wav2_aligned

        if is_save_aligned:
            sf.write(file2_aligned, wav2_aligned, sr1, subtype=wav_type_set)
            save_format(file2_aligned)

    sf.write(file_subtracted, wav_sub, sr1, subtype=wav_type_set)
    save_format(file_subtracted)

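# Sketch (not part of spec_utils.py) of get_diff above: the full
# cross-correlation peaks at the lag between two copies of a signal,
# here a known shift of 3 samples.
def _cross_correlation_delay_sketch():
    import numpy as np

    a = np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0])
    b = np.array([1.0, 2.0, 3.0, 0.0, 0.0, 0.0])
    corr = np.correlate(a, b, "full")
    diff = corr.argmax() - (b.shape[0] - 1)
    assert diff == 3  # b must be delayed by 3 samples to line up with a
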
def phase_shift_hilbert(signal, degree):
    analytic_signal = hilbert(signal)
    return np.cos(np.radians(degree)) * analytic_signal.real - np.sin(np.radians(degree)) * analytic_signal.imag


def get_phase_shifted_tracks(track, phase_shift):
    if phase_shift == 180:
        return [track, -track]

    step = phase_shift
    end = 180 - (180 % step) if 180 % step == 0 else 181
    phase_range = range(step, end, step)

    flipped_list = [track, -track]
    for i in phase_range:
        flipped_list.extend([phase_shift_hilbert(track, i), phase_shift_hilbert(track, -i)])

    return flipped_list

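# Sketch (not part of spec_utils.py): phase_shift_hilbert at 90 degrees turns
# a sine into a cosine, since the analytic signal of sin(wt) is
# sin(wt) - 1j * cos(wt). Using whole periods keeps the FFT-based Hilbert
# transform essentially exact.
def _hilbert_shift_sketch():
    import numpy as np
    from scipy.signal import hilbert

    t = np.linspace(0, 1, 4096, endpoint=False)
    sig = np.sin(2 * np.pi * 8 * t)
    analytic = hilbert(sig)
    shifted = np.cos(np.radians(90)) * analytic.real - np.sin(np.radians(90)) * analytic.imag
    assert np.allclose(shifted, np.cos(2 * np.pi * 8 * t), atol=1e-6)
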
def time_correction(mix: np.ndarray, instrumental: np.ndarray, seconds_length, align_window, db_analysis, sr=44100, progress_bar=None, unique_sources=None, phase_shifts=NONE_P):
    # Align two tracks using windowed cross-correlation

    def align_tracks(track1, track2):
        # A dictionary to store each version of track2_shifted and its mean absolute value
        shifted_tracks = {}

        # Apply the base dB adjustment to track2
        track2 = track2 * np.power(10, db_analysis[0] / 20)
        db_range = db_analysis[1]

        if phase_shifts == 190:  # 190 is the "no phase shifts" sentinel (NONE_P)
            track2_flipped = [track2]
        else:
            track2_flipped = get_phase_shifted_tracks(track2, phase_shifts)

        for db_adjustment in db_range:
            for t in track2_flipped:
                # Adjust the dB of track2
                track2_adjusted = t * (10 ** (db_adjustment / 20))
                corr = correlate(track1, track2_adjusted)
                delay = np.argmax(np.abs(corr)) - (len(track1) - 1)
                track2_shifted = np.roll(track2_adjusted, shift=delay)

                # Compute the mean absolute value of the residual
                track2_shifted_sub = track1 - track2_shifted
                mean_abs_value = np.abs(track2_shifted_sub).mean()

                # Store track2_shifted keyed by its residual size
                shifted_tracks[mean_abs_value] = track2_shifted

        # Return the version of track2_shifted with the smallest residual
        return shifted_tracks[min(shifted_tracks.keys())]

    # Make sure the audio files have the same shape
    assert mix.shape == instrumental.shape, f"Audio files must have the same shape - Mix: {mix.shape}, Inst: {instrumental.shape}"

    seconds_length = seconds_length // 2

    sub_mapper = {}

    progress_update_interval = 120
    total_iterations = 0

    if len(align_window) > 2:
        progress_update_interval = 320

    for secs in align_window:
        step = secs / 2
        window_size = int(sr * secs)
        step_size = int(sr * step)

        if len(mix.shape) == 1:
            total_mono = (len(range(0, len(mix) - window_size, step_size)) // progress_update_interval) * unique_sources
            total_iterations += total_mono
        else:
            total_stereo_ = len(range(0, len(mix[:, 0]) - window_size, step_size)) * 2
            total_stereo = (total_stereo_ // progress_update_interval) * unique_sources
            total_iterations += total_stereo

    for secs in align_window:
        sub = np.zeros_like(mix)
        divider = np.zeros_like(mix)
        step = secs / 2
        window_size = int(sr * secs)
        step_size = int(sr * step)
        window = np.hanning(window_size)

        if len(mix.shape) == 1:
            # The files are mono
            counter = 0
            for i in range(0, len(mix) - window_size, step_size):
                counter += 1
                if counter % progress_update_interval == 0:
                    progress_bar(total_iterations)
                window_mix = mix[i : i + window_size] * window
                window_instrumental = instrumental[i : i + window_size] * window
                window_instrumental_aligned = align_tracks(window_mix, window_instrumental)
                sub[i : i + window_size] += window_mix - window_instrumental_aligned
                divider[i : i + window_size] += window
        else:
            # The files are stereo
            counter = 0
            for ch in range(mix.shape[1]):
                for i in range(0, len(mix[:, ch]) - window_size, step_size):
                    counter += 1
                    if counter % progress_update_interval == 0:
                        progress_bar(total_iterations)
                    window_mix = mix[i : i + window_size, ch] * window
                    window_instrumental = instrumental[i : i + window_size, ch] * window
                    window_instrumental_aligned = align_tracks(window_mix, window_instrumental)
                    sub[i : i + window_size, ch] += window_mix - window_instrumental_aligned
                    divider[i : i + window_size, ch] += window

        # Normalize the result by the overlap count
        sub = np.where(divider > 1e-6, sub / divider, sub)
        sub_size = np.abs(sub).mean()
        sub_mapper = {**sub_mapper, **{sub_size: sub}}

    sub = ensemble_wav(list(sub_mapper.values()), split_size=12)

    return sub

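# Sketch (not part of spec_utils.py) of the Hann overlap-add bookkeeping in
# time_correction: window weights are accumulated into `divider`, so dividing
# at the end restores unity gain wherever at least one window contributed.
def _overlap_add_sketch():
    import numpy as np

    n, window = 1024, np.hanning(256)
    sig = np.random.randn(n)
    out, divider = np.zeros(n), np.zeros(n)
    for i in range(0, n - 256, 128):  # 50% hop, as above
        out[i : i + 256] += sig[i : i + 256] * window
        divider[i : i + 256] += window
    mid = slice(256, 768)  # ignore the partially covered edges
    assert np.allclose(out[mid] / divider[mid], sig[mid])
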
def ensemble_wav(waveforms, split_size=240):
    # Split each waveform into `split_size` chunks, keyed by waveform index
    waveform_thirds = {i: np.array_split(waveform, split_size) for i, waveform in enumerate(waveforms)}

    # Initialize the final waveform
    final_waveform = []

    # For each chunk position
    for third_idx in range(split_size):
        # Compute the mean absolute value of this chunk from each waveform
        means = [np.abs(waveform_thirds[i][third_idx]).mean() for i in range(len(waveforms))]

        # Find the waveform with the lowest mean absolute value for this chunk
        min_index = np.argmin(means)

        # Add the quietest (least noisy) chunk to the final waveform
        final_waveform.append(waveform_thirds[min_index][third_idx])

    # Concatenate all chunks to create the final waveform
    final_waveform = np.concatenate(final_waveform)

    return final_waveform


def ensemble_wav_min(waveforms):
    for i in range(1, len(waveforms)):
        if i == 1:
            wave = waveforms[0]

        ln = min(len(wave), len(waveforms[i]))
        wave = wave[:ln]
        waveforms[i] = waveforms[i][:ln]

        wave = np.where(np.abs(waveforms[i]) <= np.abs(wave), waveforms[i], wave)

    return wave

def align_audio_test(wav1, wav2, sr1=44100):
    def get_diff(a, b):
        corr = np.correlate(a, b, "full")
        diff = corr.argmax() - (b.shape[0] - 1)
        return diff

    # read tracks
    wav1 = wav1.transpose()
    wav2 = wav2.transpose()

    wav2_org = wav2.copy()

    # pick a position at 1 second in and get diff
    index = sr1  # 1 second in, assuming sr1 = sr2 = 44100
    samp1 = wav1[index : index + sr1, 0]  # currently use left channel
    samp2 = wav2[index : index + sr1, 0]
    diff = get_diff(samp1, samp2)

    # make aligned track 2
    if diff > 0:
        wav2_aligned = np.append(np.zeros((diff, 1)), wav2_org, axis=0)
    elif diff < 0:
        wav2_aligned = wav2_org[-diff:]
    else:
        wav2_aligned = wav2_org

    return wav2_aligned


def load_audio(audio_file):
    wav, sr = librosa.load(audio_file, sr=44100, mono=False)

    if wav.ndim == 1:
        wav = np.asfortranarray([wav, wav])

    return wav


def rerun_mp3(audio_file):
    with audioread.audio_open(audio_file) as f:
        track_length = int(f.duration)

    return track_length

audio_separator/separator/uvr_lib_v5/stft.py
ADDED
@@ -0,0 +1,126 @@
import torch


class STFT:
    """
    This class performs the Short-Time Fourier Transform (STFT) and its inverse (ISTFT).
    These functions are essential for converting the audio between the time domain and the frequency domain,
    which is a crucial aspect of audio processing in neural networks.
    """

    def __init__(self, logger, n_fft, hop_length, dim_f, device):
        self.logger = logger
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.dim_f = dim_f
        self.device = device
        # Create a Hann window tensor for use in the STFT.
        self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True)

    def __call__(self, input_tensor):
        # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
        is_non_standard_device = input_tensor.device.type not in ("cuda", "cpu")

        # If on a non-standard device, temporarily move the tensor to CPU for processing.
        if is_non_standard_device:
            input_tensor = input_tensor.cpu()

        # Transfer the pre-defined window tensor to the same device as the input tensor.
        stft_window = self.hann_window.to(input_tensor.device)

        # Extract batch dimensions (all dimensions except the last two, which are channel and time).
        batch_dimensions = input_tensor.shape[:-2]

        # Extract channel and time dimensions (last two dimensions of the tensor).
        channel_dim, time_dim = input_tensor.shape[-2:]

        # Reshape the tensor to merge batch and channel dimensions for STFT processing.
        reshaped_tensor = input_tensor.reshape([-1, time_dim])

        # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor.
        stft_output = torch.stft(reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False)

        # Rearrange the dimensions of the STFT output to bring the real/imaginary dimension forward.
        permuted_stft_output = stft_output.permute([0, 3, 1, 2])

        # Reshape the output to restore the original batch and channel dimensions, folding real/imaginary parts into channels.
        final_output = permuted_stft_output.reshape([*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]).reshape(
            [*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]]
        )

        # If the original tensor was on a non-standard device, move the processed tensor back to that device.
        if is_non_standard_device:
            final_output = final_output.to(self.device)

        # Return the transformed tensor, sliced to retain only the required frequency dimension (`dim_f`).
        return final_output[..., : self.dim_f, :]

    def pad_frequency_dimension(self, input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins):
        """
        Adds zero padding to the frequency dimension of the input tensor.
        """
        # Create a padding tensor for the frequency dimension.
        freq_padding = torch.zeros([*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]).to(input_tensor.device)

        # Concatenate the padding to the input tensor along the frequency dimension.
        padded_tensor = torch.cat([input_tensor, freq_padding], -2)

        return padded_tensor

    def calculate_inverse_dimensions(self, input_tensor):
        # Extract batch dimensions and channel-frequency-time dimensions.
        batch_dimensions = input_tensor.shape[:-3]
        channel_dim, freq_dim, time_dim = input_tensor.shape[-3:]

        # Calculate the number of frequency bins for the inverse STFT.
        num_freq_bins = self.n_fft // 2 + 1

        return batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins

    def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim):
        """
        Prepares the tensor for Inverse Short-Time Fourier Transform (ISTFT) by reshaping
        and creating a complex tensor from the real and imaginary parts.
        """
        # Reshape the tensor to separate real and imaginary parts and prepare for ISTFT.
        reshaped_tensor = padded_tensor.reshape([*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim])

        # Flatten batch dimensions and rearrange for ISTFT.
        flattened_tensor = reshaped_tensor.reshape([-1, 2, num_freq_bins, time_dim])

        # Rearrange the dimensions of the tensor to move the real/imaginary dimension last.
        permuted_tensor = flattened_tensor.permute([0, 2, 3, 1])

        # Combine real and imaginary parts into a complex tensor.
        complex_tensor = permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j

        return complex_tensor

    def inverse(self, input_tensor):
        # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
        is_non_standard_device = input_tensor.device.type not in ("cuda", "cpu")

        # If on a non-standard device, temporarily move the tensor to CPU for processing.
        if is_non_standard_device:
            input_tensor = input_tensor.cpu()

        # Transfer the pre-defined Hann window tensor to the same device as the input tensor.
        stft_window = self.hann_window.to(input_tensor.device)

        batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = self.calculate_inverse_dimensions(input_tensor)

        padded_tensor = self.pad_frequency_dimension(input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins)

        complex_tensor = self.prepare_for_istft(padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim)

        # Perform the Inverse Short-Time Fourier Transform (ISTFT).
        istft_result = torch.istft(complex_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True)

        # Reshape the ISTFT result to restore the original batch and channel dimensions.
        final_output = istft_result.reshape([*batch_dimensions, 2, -1])

        # If the original tensor was on a non-standard device, move the processed tensor back to that device.
        if is_non_standard_device:
            final_output = final_output.to(self.device)

        return final_output
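A usage sketch for the class above (the parameter values here are assumptions, not taken from the upload, and a torch version that still accepts return_complex=False is assumed): the forward call packs real/imaginary parts into channels, and inverse() undoes it. Because dim_f trims the top frequency bins, the inverse is only exact for content below that cutoff.

    import torch

    stft = STFT(logger=None, n_fft=2048, hop_length=512, dim_f=1024, device="cpu")
    audio = torch.randn(1, 2, 44100)  # (batch, channels, samples)
    spec = stft(audio)  # -> (1, 4, 1024, frames): 2 channels x (real, imag)
    wave = stft.inverse(spec)  # -> (1, 2, samples), with the trimmed bins zeroed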
audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py
ADDED
@@ -0,0 +1,253 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from functools import partial
|
4 |
+
|
5 |
+
class STFT:
|
6 |
+
def __init__(self, n_fft, hop_length, dim_f, device):
|
7 |
+
self.n_fft = n_fft
|
8 |
+
self.hop_length = hop_length
|
9 |
+
self.window = torch.hann_window(window_length=self.n_fft, periodic=True)
|
10 |
+
self.dim_f = dim_f
|
11 |
+
self.device = device
|
12 |
+
|
13 |
+
def __call__(self, x):
|
14 |
+
|
15 |
+
x_is_mps = not x.device.type in ["cuda", "cpu"]
|
16 |
+
if x_is_mps:
|
17 |
+
x = x.cpu()
|
18 |
+
|
19 |
+
window = self.window.to(x.device)
|
20 |
+
batch_dims = x.shape[:-2]
|
21 |
+
c, t = x.shape[-2:]
|
22 |
+
x = x.reshape([-1, t])
|
23 |
+
x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True,return_complex=False)
|
24 |
+
x = x.permute([0, 3, 1, 2])
|
25 |
+
x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape([*batch_dims, c * 2, -1, x.shape[-1]])
|
26 |
+
|
27 |
+
if x_is_mps:
|
28 |
+
x = x.to(self.device)
|
29 |
+
|
30 |
+
return x[..., :self.dim_f, :]
|
31 |
+
|
32 |
+
def inverse(self, x):
|
33 |
+
|
34 |
+
x_is_mps = not x.device.type in ["cuda", "cpu"]
|
35 |
+
if x_is_mps:
|
36 |
+
x = x.cpu()
|
37 |
+
|
38 |
+
window = self.window.to(x.device)
|
39 |
+
batch_dims = x.shape[:-3]
|
40 |
+
c, f, t = x.shape[-3:]
|
41 |
+
n = self.n_fft // 2 + 1
|
42 |
+
f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device)
|
43 |
+
x = torch.cat([x, f_pad], -2)
|
44 |
+
x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t])
|
45 |
+
x = x.permute([0, 2, 3, 1])
|
46 |
+
x = x[..., 0] + x[..., 1] * 1.j
|
47 |
+
x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True)
|
48 |
+
x = x.reshape([*batch_dims, 2, -1])
|
49 |
+
|
50 |
+
if x_is_mps:
|
51 |
+
x = x.to(self.device)
|
52 |
+
|
53 |
+
return x
|
54 |
+
|
55 |
+
def get_norm(norm_type):
|
56 |
+
def norm(c, norm_type):
|
57 |
+
if norm_type == 'BatchNorm':
|
58 |
+
return nn.BatchNorm2d(c)
|
59 |
+
elif norm_type == 'InstanceNorm':
|
60 |
+
return nn.InstanceNorm2d(c, affine=True)
|
61 |
+
elif 'GroupNorm' in norm_type:
|
62 |
+
g = int(norm_type.replace('GroupNorm', ''))
|
63 |
+
return nn.GroupNorm(num_groups=g, num_channels=c)
|
64 |
+
else:
|
65 |
+
return nn.Identity()
|
66 |
+
|
67 |
+
return partial(norm, norm_type=norm_type)
|
68 |
+
|
69 |
+
|
70 |
+
def get_act(act_type):
|
71 |
+
if act_type == 'gelu':
|
72 |
+
return nn.GELU()
|
73 |
+
elif act_type == 'relu':
|
74 |
+
return nn.ReLU()
|
75 |
+
elif act_type[:3] == 'elu':
|
76 |
+
alpha = float(act_type.replace('elu', ''))
|
77 |
+
return nn.ELU(alpha)
|
78 |
+
else:
|
79 |
+
raise Exception
|
80 |
+
|
81 |
+
|
82 |
+
class Upscale(nn.Module):
|
83 |
+
def __init__(self, in_c, out_c, scale, norm, act):
|
84 |
+
super().__init__()
|
85 |
+
self.conv = nn.Sequential(
|
86 |
+
norm(in_c),
|
87 |
+
act,
|
88 |
+
nn.ConvTranspose2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False)
|
89 |
+
)
|
90 |
+
|
91 |
+
def forward(self, x):
|
92 |
+
return self.conv(x)
|
93 |
+
|
94 |
+
|
95 |
+
class Downscale(nn.Module):
|
96 |
+
def __init__(self, in_c, out_c, scale, norm, act):
|
97 |
+
super().__init__()
|
98 |
+
self.conv = nn.Sequential(
|
99 |
+
norm(in_c),
|
100 |
+
act,
|
101 |
+
nn.Conv2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False)
|
102 |
+
)
|
103 |
+
|
104 |
+
def forward(self, x):
|
105 |
+
return self.conv(x)
|
106 |
+
|
107 |
+
|
108 |
+
class TFC_TDF(nn.Module):
|
109 |
+
def __init__(self, in_c, c, l, f, bn, norm, act):
|
110 |
+
super().__init__()
|
111 |
+
|
112 |
+
self.blocks = nn.ModuleList()
|
113 |
+
for i in range(l):
|
114 |
+
            block = nn.Module()

            block.tfc1 = nn.Sequential(
                norm(in_c),
                act,
                nn.Conv2d(in_c, c, 3, 1, 1, bias=False),
            )
            block.tdf = nn.Sequential(
                norm(c),
                act,
                nn.Linear(f, f // bn, bias=False),
                norm(c),
                act,
                nn.Linear(f // bn, f, bias=False),
            )
            block.tfc2 = nn.Sequential(
                norm(c),
                act,
                nn.Conv2d(c, c, 3, 1, 1, bias=False),
            )
            block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False)

            self.blocks.append(block)
            in_c = c

    def forward(self, x):
        for block in self.blocks:
            s = block.shortcut(x)
            x = block.tfc1(x)
            x = x + block.tdf(x)
            x = block.tfc2(x)
            x = x + s
        return x


class TFC_TDF_net(nn.Module):
    def __init__(self, config, device):
        super().__init__()
        self.config = config
        self.device = device

        norm = get_norm(norm_type=config.model.norm)
        act = get_act(act_type=config.model.act)

        self.num_target_instruments = 1 if config.training.target_instrument else len(config.training.instruments)
        self.num_subbands = config.model.num_subbands

        dim_c = self.num_subbands * config.audio.num_channels * 2
        n = config.model.num_scales
        scale = config.model.scale
        l = config.model.num_blocks_per_scale
        c = config.model.num_channels
        g = config.model.growth
        bn = config.model.bottleneck_factor
        f = config.audio.dim_f // self.num_subbands

        self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False)

        self.encoder_blocks = nn.ModuleList()
        for i in range(n):
            block = nn.Module()
            block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act)
            block.downscale = Downscale(c, c + g, scale, norm, act)
            f = f // scale[1]
            c += g
            self.encoder_blocks.append(block)

        self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act)

        self.decoder_blocks = nn.ModuleList()
        for i in range(n):
            block = nn.Module()
            block.upscale = Upscale(c, c - g, scale, norm, act)
            f = f * scale[1]
            c -= g
            block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act)
            self.decoder_blocks.append(block)

        self.final_conv = nn.Sequential(
            nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False),
            act,
            nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False)
        )

        self.stft = STFT(config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device)

    def cac2cws(self, x):
        k = self.num_subbands
        b, c, f, t = x.shape
        x = x.reshape(b, c, k, f // k, t)
        x = x.reshape(b, c * k, f // k, t)
        return x

    def cws2cac(self, x):
        k = self.num_subbands
        b, c, f, t = x.shape
        x = x.reshape(b, c // k, k, f, t)
        x = x.reshape(b, c // k, f * k, t)
        return x

    def forward(self, x):

        x = self.stft(x)

        mix = x = self.cac2cws(x)

        first_conv_out = x = self.first_conv(x)

        x = x.transpose(-1, -2)

        encoder_outputs = []
        for block in self.encoder_blocks:
            x = block.tfc_tdf(x)
            encoder_outputs.append(x)
            x = block.downscale(x)

        x = self.bottleneck_block(x)

        for block in self.decoder_blocks:
            x = block.upscale(x)
            x = torch.cat([x, encoder_outputs.pop()], 1)
            x = block.tfc_tdf(x)

        x = x.transpose(-1, -2)

        x = x * first_conv_out  # reduce artifacts

        x = self.final_conv(torch.cat([mix, x], 1))

        x = self.cws2cac(x)

        if self.num_target_instruments > 1:
            b, c, f, t = x.shape
            x = x.reshape(b, self.num_target_instruments, -1, f, t)

        x = self.stft.inverse(x)

        return x
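The cac2cws/cws2cac pair above simply folds frequency subbands into the channel axis and unfolds them again. A minimal standalone sketch of that roundtrip, restated outside the class to show it is lossless; the subband count and tensor sizes here are illustrative assumptions, not values from any shipped config:

import torch

k = 4                                   # assumed num_subbands
x = torch.randn(2, 4, 256, 100)         # (batch, channels, freq, time); freq must be divisible by k

# cac2cws: fold k frequency subbands into the channel dimension
b, c, f, t = x.shape
cws = x.reshape(b, c, k, f // k, t).reshape(b, c * k, f // k, t)

# cws2cac: unfold the subbands back into the frequency dimension
b2, c2, f2, t2 = cws.shape
cac = cws.reshape(b2, c2 // k, k, f2, t2).reshape(b2, c2 // k, f2 * k, t2)

print(torch.equal(x, cac))              # True -- the roundtrip is exact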
audio_separator/separator/uvr_lib_v5/vr_network/__init__.py
ADDED
@@ -0,0 +1 @@
# VR init.
audio_separator/separator/uvr_lib_v5/vr_network/layers.py
ADDED
@@ -0,0 +1,294 @@
import torch
from torch import nn
import torch.nn.functional as F

from audio_separator.separator.uvr_lib_v5 import spec_utils


class Conv2DBNActiv(nn.Module):
    """
    This class implements a convolutional layer followed by batch normalization and an activation function.
    It is a common pattern in deep learning for processing images or feature maps. The convolutional layer
    applies a set of learnable filters to the input. Batch normalization then normalizes the output of the
    convolution, and finally, an activation function introduces non-linearity to the model, allowing it to
    learn more complex patterns.

    Attributes:
        conv (nn.Sequential): A sequential container of Conv2d, BatchNorm2d, and an activation layer.

    Args:
        nin (int): Number of input channels.
        nout (int): Number of output channels.
        ksize (int, optional): Size of the kernel. Defaults to 3.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        pad (int, optional): Padding added to all sides of the input. Defaults to 1.
        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
        activ (callable, optional): The activation function to use. Defaults to nn.ReLU.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()

        # The nn.Sequential container allows us to stack the Conv2d, BatchNorm2d, and activation layers
        # into a single module, simplifying the forward pass.
        self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ())

    def __call__(self, input_tensor):
        # Defines the computation performed at every call.
        # Simply passes the input through the sequential container.
        return self.conv(input_tensor)


class SeperableConv2DBNActiv(nn.Module):
    """
    This class implements a separable convolutional layer followed by batch normalization and an activation function.
    Separable convolutions are a type of convolution that splits the convolution operation into two simpler operations:
    a depthwise convolution and a pointwise convolution. This can reduce the number of parameters and computational cost,
    making the network more efficient while maintaining similar performance.

    The depthwise convolution applies a single filter per input channel (input depth). The pointwise convolution,
    which follows, applies a 1x1 convolution to combine the outputs of the depthwise convolution across channels.
    Batch normalization is then applied to stabilize learning and reduce internal covariate shift. Finally,
    an activation function introduces non-linearity, allowing the network to learn complex patterns.

    Attributes:
        conv (nn.Sequential): A sequential container of depthwise Conv2d, pointwise Conv2d, BatchNorm2d, and an activation layer.

    Args:
        nin (int): Number of input channels.
        nout (int): Number of output channels.
        ksize (int, optional): Size of the kernel for the depthwise convolution. Defaults to 3.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        pad (int, optional): Padding added to all sides of the input for the depthwise convolution. Defaults to 1.
        dilation (int, optional): Spacing between kernel elements for the depthwise convolution. Defaults to 1.
        activ (callable, optional): The activation function to use. Defaults to nn.ReLU.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()

        # Initialize the sequential container with the depthwise convolution.
        # The number of groups in the depthwise convolution is set to nin, which means each input channel is treated separately.
        # The pointwise convolution then combines these separate channels into nout channels.
        # Batch normalization is applied to the output of the pointwise convolution.
        # Finally, the activation function is applied to introduce non-linearity.
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,  # For depthwise convolution, in_channels = out_channels = nin
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,  # This makes it a depthwise convolution
                bias=False,  # Bias is not used because it will be handled by BatchNorm2d
            ),
            nn.Conv2d(
                nin,
                nout,  # Pointwise convolution to combine channels
                kernel_size=1,  # Kernel size of 1 for pointwise convolution
                bias=False,  # Bias is not used because it will be handled by BatchNorm2d
            ),
            nn.BatchNorm2d(nout),  # Normalize the output of the pointwise convolution
            activ(),  # Apply the activation function
        )

    def __call__(self, input_tensor):
        # Pass the input through the sequential container.
        # This performs the depthwise convolution, followed by the pointwise convolution,
        # batch normalization, and finally applies the activation function.
        return self.conv(input_tensor)


class Encoder(nn.Module):
    """
    The Encoder class is a part of the neural network architecture that is responsible for processing the input data.
    It consists of two convolutional layers, each followed by batch normalization and an activation function.
    The purpose of the Encoder is to transform the input data into a higher-level, abstract representation.
    This is achieved by applying filters (through convolutions) that can capture patterns or features in the data.
    The Encoder can be thought of as a feature extractor that prepares the data for further processing by the network.

    Attributes:
        conv1 (Conv2DBNActiv): The first convolutional layer in the encoder.
        conv2 (Conv2DBNActiv): The second convolutional layer in the encoder.

    Args:
        nin (int): Number of input channels for the first convolutional layer.
        nout (int): Number of output channels for the convolutional layers.
        ksize (int): Kernel size for the convolutional layers.
        stride (int): Stride for the second convolution; the first convolution always uses a stride of 1.
        pad (int): Padding added to all sides of the input for the convolutional layers.
        activ (callable): The activation function to use after each convolutional layer.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()

        # The first convolutional layer takes the input and applies a convolution,
        # followed by batch normalization and an activation function specified by `activ`.
        # This layer is responsible for capturing the initial set of features from the input data.
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)

        # The second convolutional layer further processes the output from the first layer,
        # applying another set of convolution, batch normalization, and activation.
        # This layer helps in capturing more complex patterns in the data by building upon the initial features extracted by conv1.
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, input_tensor):
        # The input data `input_tensor` is passed through the first convolutional layer.
        # The output of this layer serves as a 'skip connection' that can be used later in the network to preserve spatial information.
        skip = self.conv1(input_tensor)

        # The output from the first layer is then passed through the second convolutional layer.
        # This processed data `hidden` is the final output of the Encoder, representing the abstracted features of the input.
        hidden = self.conv2(skip)

        # The Encoder returns two outputs: `hidden`, the abstracted feature representation, and `skip`, the intermediate representation from conv1.
        return hidden, skip


class Decoder(nn.Module):
    """
    The Decoder class is part of the neural network architecture, specifically designed to perform the inverse operation of an encoder.
    Its main role is to reconstruct or generate data from encoded representations, which is crucial in tasks like image segmentation or audio processing.
    This class uses upsampling, convolution, optional dropout for regularization, and concatenation of skip connections to achieve its goal.

    Attributes:
        conv (Conv2DBNActiv): A convolutional layer with batch normalization and activation function.
        dropout (nn.Dropout2d): An optional dropout layer for regularization to prevent overfitting.

    Args:
        nin (int): Number of input channels for the convolutional layer.
        nout (int): Number of output channels for the convolutional layer.
        ksize (int): Kernel size for the convolutional layer.
        stride (int): Stride for the convolutional operations.
        pad (int): Padding added to all sides of the input for the convolutional layer.
        activ (callable): The activation function to use after the convolutional layer.
        dropout (bool): Whether to include a dropout layer for regularization.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()

        # Initialize the convolutional layer with specified parameters.
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)

        # Initialize the dropout layer if dropout is set to True.
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, input_tensor, skip=None):
        # Upsample the input tensor to a higher resolution using bilinear interpolation.
        input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True)
        # If a skip connection is provided, crop it to match the size of input_tensor and concatenate them along the channel dimension.
        if skip is not None:
            skip = spec_utils.crop_center(skip, input_tensor)  # Crop the skip connection to match input_tensor's dimensions.
            input_tensor = torch.cat([input_tensor, skip], dim=1)  # Concatenate input_tensor and the skip connection along the channel dimension.

        # Pass the concatenated tensor (or just input_tensor if no skip connection is provided) through the convolutional layer.
        output_tensor = self.conv(input_tensor)

        # If dropout is enabled, apply it to the output of the convolutional layer.
        if self.dropout is not None:
            output_tensor = self.dropout(output_tensor)

        # Return the final output tensor.
        return output_tensor


class ASPPModule(nn.Module):
    """
    Atrous Spatial Pyramid Pooling (ASPP) Module is designed for capturing multi-scale context by applying
    atrous convolution at multiple rates. This is particularly useful in segmentation tasks where capturing
    objects at various scales is beneficial. The module applies several parallel dilated convolutions with
    different dilation rates to the input feature map, allowing it to efficiently capture information at
    multiple scales.

    Attributes:
        conv1 (nn.Sequential): Applies adaptive average pooling followed by a 1x1 convolution.
        nn_architecture (int): Identifier for the neural network architecture being used.
        six_layer (list): List containing architecture identifiers that require six layers.
        seven_layer (list): List containing architecture identifiers that require seven layers.
        conv2-conv7 (nn.Module): Convolutional layers with varying dilation rates for multi-scale feature extraction.
        bottleneck (nn.Sequential): A 1x1 convolutional layer that combines all features followed by dropout for regularization.
    """

    def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        """
        Initializes the ASPP module with specified parameters.

        Args:
            nn_architecture (int): Identifier for the neural network architecture.
            nin (int): Number of input channels.
            nout (int): Number of output channels.
            dilations (tuple): Tuple of dilation rates for the atrous convolutions.
            activ (callable): Activation function to use after convolutional layers.
        """
        super(ASPPModule, self).__init__()

        # Adaptive average pooling collapses the frequency axis to size 1 while keeping the time axis, focusing on global context,
        # followed by a 1x1 convolution to project back to the desired channel dimension.
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ))

        self.nn_architecture = nn_architecture
        # Architecture identifiers for models requiring additional layers.
        self.six_layer = [129605]
        self.seven_layer = [537238, 537227, 33966]

        # Extra convolutional layer used for the six- and seven-layer configurations.
        # Note that the same module instance is assigned to conv6 (and conv7), so those branches share weights.
        extra_conv = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        # Standard 1x1 convolution for channel reduction.
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)

        # Separable convolutions with different dilation rates for multi-scale feature extraction.
        self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        # Depending on the architecture, include the extra convolutional layers.
        if self.nn_architecture in self.six_layer:
            self.conv6 = extra_conv
            nin_x = 6
        elif self.nn_architecture in self.seven_layer:
            self.conv6 = extra_conv
            self.conv7 = extra_conv
            nin_x = 7
        else:
            nin_x = 5

        # Bottleneck layer combines all the multi-scale features into the desired number of output channels.
        self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))

    def forward(self, input_tensor):
        """
        Forward pass of the ASPP module.

        Args:
            input_tensor (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after applying ASPP.
        """
        _, _, h, w = input_tensor.size()

        # Apply the first convolutional sequence and upsample to the original resolution.
        feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True)

        # Apply the remaining convolutions directly on the input.
        feat2 = self.conv2(input_tensor)
        feat3 = self.conv3(input_tensor)
        feat4 = self.conv4(input_tensor)
        feat5 = self.conv5(input_tensor)

        # Concatenate features from all layers. Depending on the architecture, include the extra features.
        if self.nn_architecture in self.six_layer:
            feat6 = self.conv6(input_tensor)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1)
        elif self.nn_architecture in self.seven_layer:
            feat6 = self.conv6(input_tensor)
            feat7 = self.conv7(input_tensor)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        else:
            out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)

        # Apply the bottleneck layer to combine and reduce the channel dimensions.
        bottleneck_output = self.bottleneck(out)
        return bottleneck_output
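A minimal shape-check sketch of the Encoder/Decoder skip-connection flow defined in layers.py above, assuming torch and the audio-separator package are importable; the channel counts and tensor sizes are illustrative, not taken from any model config:

import torch
from audio_separator.separator.uvr_lib_v5.vr_network.layers import Encoder, Decoder

x = torch.randn(1, 2, 64, 128)           # (batch, channels, freq, time)
enc = Encoder(2, 16, ksize=3, stride=2, pad=1)
hidden, skip = enc(x)                     # conv2's stride downsamples hidden; skip keeps full resolution
print(hidden.shape, skip.shape)           # torch.Size([1, 16, 32, 64]) torch.Size([1, 16, 64, 128])

dec = Decoder(16 + 16, 8)                 # nin = hidden channels + skip channels after concatenation
y = dec(hidden, skip)                     # upsamples 2x, crops the skip to match, then convolves
print(y.shape)                            # torch.Size([1, 8, 64, 128])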
audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py
ADDED
@@ -0,0 +1,149 @@
import torch
from torch import nn
import torch.nn.functional as F

from audio_separator.separator.uvr_lib_v5 import spec_utils


class Conv2DBNActiv(nn.Module):
    """
    Conv2DBNActiv Class:
    This class implements a convolutional layer followed by batch normalization and an activation function.
    It is a fundamental building block for constructing neural networks, especially useful in image and audio processing tasks.
    The class encapsulates the pattern of applying a convolution, normalizing the output, and then applying a non-linear activation.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()

        # Sequential model combining Conv2D, BatchNorm, and activation function into a single module
        self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ())

    def __call__(self, input_tensor):
        # Forward pass through the sequential model
        return self.conv(input_tensor)


class Encoder(nn.Module):
    """
    Encoder Class:
    This class defines an encoder module typically used in autoencoder architectures.
    It consists of two convolutional layers, each followed by batch normalization and an activation function.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()

        # First convolutional layer of the encoder
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        # Second convolutional layer of the encoder
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

    def __call__(self, input_tensor):
        # Applying the first and then the second convolutional layers
        hidden = self.conv1(input_tensor)
        hidden = self.conv2(hidden)

        return hidden


class Decoder(nn.Module):
    """
    Decoder Class:
    This class defines a decoder module, which is the counterpart of the Encoder class in autoencoder architectures.
    It applies a convolutional layer followed by batch normalization and an activation function, with an optional dropout layer for regularization.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        # Convolutional layer with optional dropout for regularization
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, input_tensor, skip=None):
        # Forward pass through the convolutional layer and optional dropout
        input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True)

        if skip is not None:
            skip = spec_utils.crop_center(skip, input_tensor)
            input_tensor = torch.cat([input_tensor, skip], dim=1)

        hidden = self.conv1(input_tensor)
        # hidden = self.conv2(hidden)

        if self.dropout is not None:
            hidden = self.dropout(hidden)

        return hidden


class ASPPModule(nn.Module):
    """
    ASPPModule Class:
    This class implements the Atrous Spatial Pyramid Pooling (ASPP) module, which is useful for semantic image segmentation tasks.
    It captures multi-scale contextual information by applying convolutions at multiple dilation rates.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super(ASPPModule, self).__init__()

        # Global context convolution captures the overall context
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ))
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, input_tensor):
        _, _, h, w = input_tensor.size()

        # Upsample global context to match input size and combine with local and multi-scale features
        feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True)
        feat2 = self.conv2(input_tensor)
        feat3 = self.conv3(input_tensor)
        feat4 = self.conv4(input_tensor)
        feat5 = self.conv5(input_tensor)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        out = self.bottleneck(out)

        if self.dropout is not None:
            out = self.dropout(out)

        return out


class LSTMModule(nn.Module):
    """
    LSTMModule Class:
    This class defines a module that combines convolutional feature extraction with a bidirectional LSTM for sequence modeling.
    It is useful for tasks that require understanding temporal dynamics in data, such as speech and audio processing.
    """

    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super(LSTMModule, self).__init__()
        # Convolutional layer for initial feature extraction
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)

        # Bidirectional LSTM for capturing temporal dynamics
        self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)

        # Dense layer for output dimensionality matching
        self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())

    def forward(self, input_tensor):
        N, _, nbins, nframes = input_tensor.size()

        # Extract features and prepare for LSTM
        hidden = self.conv(input_tensor)[:, 0]  # N, nbins, nframes
        hidden = hidden.permute(2, 0, 1)  # nframes, N, nbins
        hidden, _ = self.lstm(hidden)

        # Apply dense layer and reshape to match expected output format
        hidden = self.dense(hidden.reshape(-1, hidden.size()[-1]))  # nframes * N, nbins
        hidden = hidden.reshape(nframes, N, 1, nbins)
        hidden = hidden.permute(1, 2, 3, 0)

        return hidden
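A minimal sketch of the LSTMModule's shape flow, assuming torch and the audio-separator package are importable; the sizes below are illustrative. Note that nin_lstm must equal the number of frequency bins, since the LSTM runs over time frames with the bins as its feature dimension:

import torch
from audio_separator.separator.uvr_lib_v5.vr_network.layers_new import LSTMModule

nbins, nframes = 32, 16
x = torch.randn(2, 4, nbins, nframes)     # (batch, channels, freq bins, time frames)
lstm = LSTMModule(nin_conv=4, nin_lstm=nbins, nout_lstm=64)
out = lstm(x)                              # conv -> (N, nbins, nframes), BiLSTM over frames, dense back to nbins
print(out.shape)                           # torch.Size([2, 1, 32, 16])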
audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py
ADDED
@@ -0,0 +1,71 @@
import json

default_param = {}
default_param["bins"] = -1
default_param["unstable_bins"] = -1  # training only
default_param["stable_bins"] = -1  # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = -1
default_param["pre_filter_stop"] = -1
default_param["band"] = {}

N_BINS = "n_bins"


def int_keys(d):
    """
    Converts string keys that represent integers into actual integer keys.

    This function is particularly useful when dealing with JSON data that may represent
    integer keys as strings due to the nature of JSON encoding. By converting these keys
    back to integers, it ensures that the data can be used in a manner consistent with
    its original representation, especially in contexts where the distinction between
    string and integer keys is important.

    Args:
        d (list of tuples): A list of (key, value) pairs where keys are strings
                            that may represent integers.

    Returns:
        dict: A dictionary with keys converted to integers where applicable.
    """
    # Initialize an empty dictionary to hold the converted key-value pairs.
    result_dict = {}
    # Iterate through each key-value pair in the input list.
    for key, value in d:
        # Check if the key is a digit (i.e., represents an integer).
        if key.isdigit():
            # Convert the key from a string to an integer.
            key = int(key)
        result_dict[key] = value
    return result_dict


class ModelParameters(object):
    """
    A class to manage model parameters, including loading from a configuration file.

    Attributes:
        param (dict): Dictionary holding all parameters for the model.
    """

    def __init__(self, config_path=""):
        """
        Initializes the ModelParameters object by loading parameters from a JSON configuration file.

        Args:
            config_path (str): Path to the JSON configuration file.
        """

        # Load parameters from the given configuration file path.
        with open(config_path, "r") as f:
            self.param = json.loads(f.read(), object_pairs_hook=int_keys)

        # Ensure certain parameters are set to False if not specified in the configuration.
        for k in ["mid_side", "mid_side_b", "mid_side_b2", "stereo_w", "stereo_n", "reverse"]:
            if k not in self.param:
                self.param[k] = False

        # If 'n_bins' is specified in the parameters, it's used as the value for 'bins'.
        if N_BINS in self.param:
            self.param["bins"] = self.param[N_BINS]
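A small sketch of how int_keys behaves when passed as json.loads' object_pairs_hook, assuming the package is importable; the JSON string is an illustrative fragment, not a real config:

import json
from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import int_keys

raw = '{"bins": 1024, "band": {"1": {"sr": 16000}}}'
parsed = json.loads(raw, object_pairs_hook=int_keys)

print(parsed["band"][1]["sr"])   # 16000 -- the "1" band key was converted to the integer 1
print("bins" in parsed)          # True  -- non-numeric keys are left as strings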
audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json
ADDED
@@ -0,0 +1,19 @@
{
  "bins": 1024,
  "unstable_bins": 0,
  "reduction_bins": 0,
  "band": {
    "1": {
      "sr": 16000,
      "hl": 512,
      "n_fft": 2048,
      "crop_start": 0,
      "crop_stop": 1024,
      "hpf_start": -1,
      "res_type": "sinc_best"
    }
  },
  "sr": 16000,
  "pre_filter_start": 1023,
  "pre_filter_stop": 1024
}
audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json
ADDED
@@ -0,0 +1,19 @@
{
  "bins": 1024,
  "unstable_bins": 0,
  "reduction_bins": 0,
  "band": {
    "1": {
      "sr": 32000,
      "hl": 512,
      "n_fft": 2048,
      "crop_start": 0,
      "crop_stop": 1024,
      "hpf_start": -1,
      "res_type": "kaiser_fast"
    }
  },
  "sr": 32000,
  "pre_filter_start": 1000,
  "pre_filter_stop": 1021
}
audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json
ADDED
@@ -0,0 +1,19 @@
{
  "bins": 1024,
  "unstable_bins": 0,
  "reduction_bins": 0,
  "band": {
    "1": {
      "sr": 33075,
      "hl": 384,
      "n_fft": 2048,
      "crop_start": 0,
      "crop_stop": 1024,
      "hpf_start": -1,
      "res_type": "sinc_best"
    }
  },
  "sr": 33075,
  "pre_filter_start": 1000,
  "pre_filter_stop": 1021
}
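A short sketch of loading one of these parameter files through ModelParameters, assuming the package is importable and the script runs from the repository root (the relative path is a guess at the checkout layout):

from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import ModelParameters

mp = ModelParameters("audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json")

print(mp.param["sr"])             # 16000
print(mp.param["band"][1]["hl"])  # 512 -- band keys come back as integers via int_keys
print(mp.param["mid_side"])       # False -- defaulted because the config does not set it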