Spaces:
Runtime error
Runtime error
Update utils/decode.py
Browse files - utils/decode.py +9 -1
utils/decode.py
CHANGED
|
@@ -188,6 +188,9 @@ def decode_one_audio_frcrn_se_16k(model, device, inputs, args):
|
|
| 188 |
# If no segmentation is required, process the entire input
|
| 189 |
outputs = model.inference(inputs).detach().cpu().numpy() # Inference on full input
|
| 190 |
|
|
|
|
|
|
|
|
|
|
| 191 |
return outputs # Return the decoded audio output
|
| 192 |
|
| 193 |
def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
|
|
@@ -439,7 +442,12 @@ def decode_one_audio_mossformer2_se_48k(model, device, inputs, args):
|
|
| 439 |
# Reconstruct audio from the masked spectrogram
|
| 440 |
outputs = istft(masked_spec_complex, args, len(audio))
|
| 441 |
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
|
| 444 |
def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
|
| 445 |
"""Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.
|
|
|
|
| 188 |
# If no segmentation is required, process the entire input
|
| 189 |
outputs = model.inference(inputs).detach().cpu().numpy() # Inference on full input
|
| 190 |
|
| 191 |
+
#normalize outputs
|
| 192 |
+
max_abs = max(abs(outputs), 1e-6)
|
| 193 |
+
outputs = outputs / max_abs
|
| 194 |
return outputs # Return the decoded audio output
|
| 195 |
|
| 196 |
def decode_one_audio_mossformergan_se_16k(model, device, inputs, args):
|
|
|
|
| 442 |
# Reconstruct audio from the masked spectrogram
|
| 443 |
outputs = istft(masked_spec_complex, args, len(audio))
|
| 444 |
|
| 445 |
+
outpus = outputs.numpy() / MAX_WAV_VALUE # Return the output normalized to [-1, 1]
|
| 446 |
+
#normalize outputs
|
| 447 |
+
max_abs = max(abs(outputs), 1e-6)
|
| 448 |
+
outputs = outputs / max_abs
|
| 449 |
+
|
| 450 |
+
return outputs
|
| 451 |
|
| 452 |
def decode_one_audio_AV_MossFormer2_TSE_16K(model, inputs, args):
|
| 453 |
"""Processes video inputs through the AV mossformer2 model with Target speaker extraction (TSE) for decoding at 16kHz.
|