# coding: utf-8
__author__ = 'Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/'
import argparse
import numpy as np
import torch
import torch.nn as nn
import yaml
import os
import soundfile as sf
from ml_collections import ConfigDict
from omegaconf import OmegaConf
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Any, Union
import loralib as lora
import gc # For garbage collection
import logging  # for error tracking
import time  # used by demix() for timing
# Logging configuration
logging.basicConfig(level=logging.INFO, filename='utils.log', format='%(asctime)s - %(message)s')
def load_config(model_type: str, config_path: str) -> Union[ConfigDict, OmegaConf]:
try:
with open(config_path, 'r') as f:
if model_type == 'htdemucs':
config = OmegaConf.load(config_path)
else:
config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))
return config
except FileNotFoundError:
raise FileNotFoundError(f"Configuration file not found at {config_path}")
except Exception as e:
raise ValueError(f"Error loading configuration: {e}")
def get_model_from_config(model_type: str, config_path: str) -> Tuple:
"""
Load the model specified by the model type and configuration file.
Parameters:
----------
model_type : str
The type of model to load (e.g., 'mdx23c', 'htdemucs', 'scnet', etc.).
config_path : str
The path to the configuration file (YAML or OmegaConf format).
Returns:
-------
    model : nn.Module
        The model initialized for the given `model_type`.
config : Any
The configuration used to initialize the model. This could be in different formats
depending on the model type (e.g., OmegaConf, ConfigDict).
Raises:
------
ValueError:
If the `model_type` is unknown or an error occurs during model initialization.
"""
config = load_config(model_type, config_path)
if model_type == 'mdx23c':
from models.mdx23c_tfc_tdf_v3 import TFC_TDF_net
model = TFC_TDF_net(config)
elif model_type == 'htdemucs':
from models.demucs4ht import get_model
model = get_model(config)
elif model_type == 'segm_models':
from models.segm_models import Segm_Models_Net
model = Segm_Models_Net(config)
elif model_type == 'torchseg':
from models.torchseg_models import Torchseg_Net
model = Torchseg_Net(config)
elif model_type == 'mel_band_roformer':
from models.bs_roformer import MelBandRoformer
model = MelBandRoformer(**dict(config.model))
elif model_type == 'bs_roformer':
from models.bs_roformer import BSRoformer
model = BSRoformer(**dict(config.model))
elif model_type == 'swin_upernet':
from models.upernet_swin_transformers import Swin_UperNet_Model
model = Swin_UperNet_Model(config)
elif model_type == 'bandit':
from models.bandit.core.model import MultiMaskMultiSourceBandSplitRNNSimple
model = MultiMaskMultiSourceBandSplitRNNSimple(**config.model)
elif model_type == 'bandit_v2':
from models.bandit_v2.bandit import Bandit
model = Bandit(**config.kwargs)
elif model_type == 'scnet_unofficial':
from models.scnet_unofficial import SCNet
model = SCNet(**config.model)
elif model_type == 'scnet':
from models.scnet import SCNet
model = SCNet(**config.model)
elif model_type == 'apollo':
from models.look2hear.models import BaseModel
model = BaseModel.apollo(**config.model)
elif model_type == 'bs_mamba2':
from models.ts_bs_mamba2 import Separator
model = Separator(**config.model)
elif model_type == 'experimental_mdx23c_stht':
from models.mdx23c_tfc_tdf_v3_with_STHT import TFC_TDF_net
model = TFC_TDF_net(config)
else:
raise ValueError(f"Unknown model type: {model_type}")
return model, config
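# Minimal usage sketch (illustrative only; the config path below is a placeholder
# and should point at a real YAML file from this repository's configs folder):
#
#   model, config = get_model_from_config('mdx23c', 'configs/config_mdx23c.yaml')
#   model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
#   model.eval()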
def read_audio_transposed(path: str, instr: str = None, skip_err: bool = False) -> Tuple[np.ndarray, int]:
    """Read an audio file and return it as a (channels, samples) array plus the sample rate.

    When `skip_err` is True, a missing or unreadable stem is skipped and (None, None) is returned.
    """
    try:
mix, sr = sf.read(path)
if len(mix.shape) == 1: # Mono audio
mix = np.expand_dims(mix, axis=-1)
return mix.T, sr
except Exception as e:
if skip_err:
print(f"No stem {instr}: skip!")
return None, None
raise RuntimeError(f"Error reading the file at {path}: {e}")
def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
mono = audio.mean(0)
mean, std = mono.mean(), mono.std()
return (audio - mean) / (std + 1e-8), {"mean": mean, "std": std}
def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
return audio * norm_params["std"] + norm_params["mean"]
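# Round-trip sketch: normalization statistics come from the mono mix and must be
# reused to undo the scaling after separation (arrays are shaped (channels, samples);
# `mix` and `separated_stem` are hypothetical names):
#
#   normalized_mix, norm_params = normalize_audio(mix)
#   # ... run separation on `normalized_mix` ...
#   restored = denormalize_audio(separated_stem, norm_params)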
def apply_tta(
    config,
    model: nn.Module,
    mix: np.ndarray,
    waveforms_orig: Dict[str, np.ndarray],
    device: str,
    model_type: str,
    progress=None  # Gradio progress object
) -> Dict[str, np.ndarray]:
    """Refine `waveforms_orig` with test-time augmentation: separate a channel-swapped
    and a polarity-inverted copy of the mix, then average all three passes.
    """
    track_proc_list = [mix[::-1].copy(), -mix.copy()]
total_steps = len(track_proc_list)
processed_steps = 0
for i, augmented_mix in enumerate(track_proc_list):
        # Progress update for this TTA step
        processed_steps += 1
        progress_value = round((processed_steps / total_steps) * 50)  # TTA uses the 0-50% range
if progress is not None and callable(getattr(progress, '__call__', None)):
progress(progress_value / 100, desc=f"Applying TTA step {processed_steps}/{total_steps}")
update_progress_html(f"Applying TTA step {processed_steps}/{total_steps}", progress_value)
waveforms = demix(config, model, augmented_mix, device, model_type=model_type, pbar=False, progress=progress)
for el in waveforms:
if i == 0:
                waveforms_orig[el] += waveforms[el][::-1].copy()
else:
waveforms_orig[el] -= waveforms[el]
del waveforms, augmented_mix
gc.collect()
if device.startswith('cuda'):
torch.cuda.empty_cache()
for el in waveforms_orig:
waveforms_orig[el] /= (len(track_proc_list) + 1)
    # TTA finished
if progress is not None and callable(getattr(progress, '__call__', None)):
progress(0.5, desc="TTA completed")
update_progress_html("TTA completed", 50)
return waveforms_orig
def _getWindowingArray(window_size: int, fade_size: int) -> torch.Tensor:
    """Build a rectangular window with linear fade-in/fade-out edges of `fade_size` samples."""
    fadein = torch.linspace(0, 1, fade_size)
fadeout = torch.linspace(1, 0, fade_size)
window = torch.ones(window_size)
window[-fade_size:] = fadeout
window[:fade_size] = fadein
return window
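# For example, _getWindowingArray(8, 2) produces
#   [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]
# so overlapping chunks are cross-faded at their edges instead of hard-concatenated.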
def demix(
config: ConfigDict,
model: nn.Module,
    mix: np.ndarray,
device: str,
model_type: str,
pbar: bool = False,
    progress=None  # Gradio progress object
) -> Dict[str, np.ndarray]:
    """Separate `mix` into stems by running `model` over overlapping chunks and
    blending the chunk outputs (windowed cross-fade for generic models, plain
    averaging for htdemucs). Returns a dict mapping stem name to waveform array.
    """
logging.info(f"Starting demix for model_type: {model_type}, chunk_size: {config.audio.chunk_size}")
    # Start in FP16 on the CPU
mix = torch.tensor(mix, dtype=torch.float16, device='cpu')
mode = 'demucs' if model_type == 'htdemucs' else 'generic'
    # Processing parameters
if mode == 'demucs':
chunk_size = config.training.samplerate * config.training.segment
num_instruments = len(config.training.instruments)
num_overlap = config.inference.num_overlap
step = chunk_size // num_overlap
else:
chunk_size = config.audio.chunk_size
num_instruments = len(prefer_target_instrument(config))
num_overlap = config.inference.num_overlap
fade_size = chunk_size // 10
step = chunk_size // num_overlap
border = chunk_size - step
length_init = mix.shape[-1]
windowing_array = _getWindowingArray(chunk_size, fade_size).to('cpu', dtype=torch.float16)
if length_init > 2 * border and border > 0:
mix = nn.functional.pad(mix, (border, border), mode="reflect")
    batch_size = getattr(config.inference, 'batch_size', 1)  # default 1 for low-memory setups
    # Move the model to the target device (cuda:0 on ZeroGPU)
model = model.to(device)
model.eval()
    # Compute the total number of chunks
total_chunks = (mix.shape[1] + step - 1) // step
processed_chunks = 0
    with torch.no_grad():  # no gradients needed for inference
with torch.cuda.amp.autocast(enabled=device.startswith('cuda'), dtype=torch.float16):
req_shape = (num_instruments,) + mix.shape
result = torch.zeros(req_shape, dtype=torch.float16, device='cpu')
counter = torch.zeros(req_shape, dtype=torch.float16, device='cpu')
i = 0
batch_data = []
batch_locations = []
start_time = time.time()
while i < mix.shape[1]:
part = mix[:, i:i + chunk_size]
chunk_len = part.shape[-1]
pad_mode = "reflect" if mode == "generic" and chunk_len > chunk_size // 2 else "constant"
part = nn.functional.pad(part, (0, chunk_size - chunk_len), mode=pad_mode, value=0)
batch_data.append(part)
batch_locations.append((i, chunk_len))
i += step
if len(batch_data) >= batch_size or i >= mix.shape[1]:
                    # Move the batch to the GPU
                    arr = torch.stack(batch_data, dim=0).to(device, non_blocking=True)
                    x = model(arr)  # run inference on the GPU
                    # Move the results back to the CPU immediately to free GPU memory
x = x.cpu()
if mode == "generic":
window = windowing_array.clone()
if i - step == 0:
window[:fade_size] = 1
elif i >= mix.shape[1]:
window[-fade_size:] = 1
for j, (start, seg_len) in enumerate(batch_locations):
if mode == "generic":
result[..., start:start + seg_len] += (x[j, ..., :seg_len] * window[..., :seg_len])
counter[..., start:start + seg_len] += window[..., :seg_len]
else:
result[..., start:start + seg_len] += x[j, ..., :seg_len]
counter[..., start:start + seg_len] += 1.0
                    # Progress update
                    processed_chunks += len(batch_data)
                    progress_value = min(round((processed_chunks / total_chunks) * 100), 100)  # whole-percent granularity
if progress is not None and callable(getattr(progress, '__call__', None)):
progress(progress_value / 100, desc=f"Processing chunk {processed_chunks}/{total_chunks}")
update_progress_html(f"Processing chunk {processed_chunks}/{total_chunks}", progress_value)
del arr, x
batch_data.clear()
batch_locations.clear()
gc.collect()
if device.startswith('cuda'):
torch.cuda.empty_cache()
logging.info("Cleared CUDA cache")
elapsed_time = time.time() - start_time
logging.info(f"Demix completed in {elapsed_time:.2f} seconds")
estimated_sources = result / (counter + 1e-8)
estimated_sources = estimated_sources.numpy().astype(np.float32)
np.nan_to_num(estimated_sources, copy=False, nan=0.0)
if mode == "generic" and length_init > 2 * border and border > 0:
estimated_sources = estimated_sources[..., border:-border]
instruments = config.training.instruments if mode == "demucs" else prefer_target_instrument(config)
ret_data = {k: v for k, v in zip(instruments, estimated_sources)}
logging.info("Demix completed successfully")
    # Final progress update
if progress is not None and callable(getattr(progress, '__call__', None)):
progress(1.0, desc="Demix completed")
update_progress_html("Demix completed", 100)
return ret_data
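# End-to-end sketch (paths, model type and the 'vocals' key are placeholders; the
# stem keys actually returned depend on the instruments listed in the config):
#
#   model, config = get_model_from_config('mdx23c', 'configs/config_mdx23c.yaml')
#   mix, sr = read_audio_transposed('input.wav')                    # (channels, samples)
#   stems = demix(config, model, mix, 'cuda', model_type='mdx23c')
#   stems = apply_tta(config, model, mix, stems, 'cuda', 'mdx23c')  # optional TTA refinement
#   sf.write('vocals.wav', stems['vocals'].T, sr)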
def prefer_target_instrument(config: ConfigDict) -> List[str]:
return [config.training.target_instrument] if getattr(config.training, 'target_instrument', None) else config.training.instruments
def load_not_compatible_weights(model: nn.Module, weights: str, verbose: bool = False) -> None:
new_model = model.state_dict()
old_model = torch.load(weights, map_location='cpu')
if 'state' in old_model:
old_model = old_model['state']
if 'state_dict' in old_model:
old_model = old_model['state_dict']
for el in new_model:
if el in old_model and new_model[el].shape == old_model[el].shape:
new_model[el] = old_model[el]
model.load_state_dict(new_model)
def load_lora_weights(model: nn.Module, lora_path: str, device: str = 'cpu') -> None:
lora_state_dict = torch.load(lora_path, map_location=device)
model.load_state_dict(lora_state_dict, strict=False)
def load_start_checkpoint(args: argparse.Namespace, model: nn.Module, type_='train') -> None:
print(f'Start from checkpoint: {args.start_check_point}')
device = 'cpu'
state_dict = torch.load(args.start_check_point, map_location=device, weights_only=True)
if args.model_type in ['htdemucs', 'apollo'] and isinstance(state_dict, dict):
state_dict = state_dict.get('state', state_dict.get('state_dict', state_dict))
model.load_state_dict(state_dict)
if args.lora_checkpoint:
print(f"Loading LoRA weights from: {args.lora_checkpoint}")
load_lora_weights(model, args.lora_checkpoint, device)
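# Sketch of a call site (the checkpoint paths are placeholders; in this repository
# `args` normally comes from the training/inference argument parser):
#
#   args = argparse.Namespace(start_check_point='model.ckpt', model_type='mdx23c',
#                             lora_checkpoint='')
#   load_start_checkpoint(args, model, type_='inference')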
def bind_lora_to_model(config: Dict[str, Any], model: nn.Module) -> nn.Module:
if 'lora' not in config:
raise ValueError("Configuration must contain the 'lora' key with parameters for LoRA.")
replaced_layers = 0
for name, module in model.named_modules():
hierarchy = name.split('.')
layer_name = hierarchy[-1]
if isinstance(module, nn.Linear):
try:
parent_module = model
for submodule_name in hierarchy[:-1]:
parent_module = getattr(parent_module, submodule_name)
setattr(
parent_module,
layer_name,
lora.MergedLinear(
in_features=module.in_features,
out_features=module.out_features,
bias=module.bias is not None,
**config['lora']
)
)
replaced_layers += 1
except Exception as e:
print(f"Error replacing layer {name}: {e}")
print(f"Number of layers replaced with LoRA: {replaced_layers}")
return model
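# Sketch of the expected `lora` section in the config; the keyword names are those
# accepted by loralib.MergedLinear (values below are illustrative only):
#
#   config['lora'] = {'r': 8, 'lora_alpha': 16, 'enable_lora': [True]}
#   model = bind_lora_to_model(config, model)
#   lora.mark_only_lora_as_trainable(model)  # typical follow-up when fine-tuning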
def draw_spectrogram(waveform, sample_rate, length, output_file):
    import librosa.display
    import matplotlib.pyplot as plt  # imported lazily, matching the librosa import above
x = waveform[:int(length * sample_rate), :]
X = librosa.stft(x.mean(axis=-1))
Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)
fig, ax = plt.subplots()
img = librosa.display.specshow(
Xdb, cmap='plasma', sr=sample_rate, x_axis='time', y_axis='linear', ax=ax
)
ax.set(title='File: ' + os.path.basename(output_file))
fig.colorbar(img, ax=ax, format="%+2.f dB")
if output_file:
plt.savefig(output_file)
plt.close()
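# Usage sketch (the file name is a placeholder; a stereo file is assumed because the
# function indexes the waveform as (samples, channels)):
#
#   audio, sr = sf.read('vocals.wav')
#   draw_spectrogram(audio, sr, length=30, output_file='vocals_spec.png')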