clip audio between [-1,1]
app.py CHANGED
@@ -293,8 +293,10 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
        vc_wave = bigvgan_fn(vc_target.float())[0]
        if processed_frames == 0:
            if is_last_chunk:
+               # output_wave = torch.clip(vc_wave[0], -0.999, 0.999).cpu().numpy()
                output_wave = vc_wave[0].cpu().numpy()
                generated_wave_chunks.append(output_wave)
+               output_wave = np.clip(output_wave, -0.999, 0.999)
                output_wave = (output_wave * 32768.0).astype(np.int16)
                mp3_bytes = AudioSegment(
                    output_wave.tobytes(), frame_rate=sr,
@@ -306,6 +308,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len
+           output_wave = np.clip(output_wave, -0.999, 0.999)
            output_wave = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_wave.tobytes(), frame_rate=sr,
@@ -316,6 +319,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
            output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
            generated_wave_chunks.append(output_wave)
            processed_frames += vc_target.size(2) - overlap_frame_len
+           output_wave = np.clip(output_wave, -0.999, 0.999)
            output_wave = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_wave.tobytes(), frame_rate=sr,
@@ -328,6 +332,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len
+           output_wave = np.clip(output_wave, -0.999, 0.999)
            output_wave = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_wave.tobytes(), frame_rate=sr,