"""Audio helpers: base64 encoding, silence trimming, sox effects, resampling
and amplitude normalization."""

import base64
import io
import math
import tempfile

import librosa
import numpy as np
import sox
import torch
import torchaudio


def encode_wav(wav, sr, rep_format="wav"):
    """Serialize a waveform with torchaudio and return it as base64 text.

    Args:
        wav: Waveform handed unchanged to ``torchaudio.save``.
        sr: Sample rate handed to ``torchaudio.save``.
        rep_format: Value for the ``format`` argument of ``torchaudio.save``.

    Returns:
        str: ASCII base64 encoding of the bytes torchaudio wrote.
    """
    with io.BytesIO() as wavio:
        torchaudio.save(wavio, wav, sr, format=rep_format)
        audio_bytes = wavio.getvalue()
    return base64.b64encode(audio_bytes).decode("ascii")


def trim_silence(audio, sr, keep_left_time=0.05, keep_right_time=0.22, hop_size=240):
    """Trim leading/trailing silence, keeping a fixed margin on each side.

    The non-silent interval is located with ``librosa.effects.trim``
    (``top_db=20``). The result starts ``keep_left_time`` seconds before that
    interval and has a total length of the interval rounded up to a whole
    number of ``hop_size`` samples plus ``keep_left_time + keep_right_time``
    seconds. Zero padding is added wherever the input is too short.

    Args:
        audio: Input samples. Assumed to be a 1-D numpy array (``len`` and a
            single-axis ``np.pad`` are applied to it) -- TODO confirm.
        sr: Sample rate in Hz, used to convert the margins to samples.
        keep_left_time: Seconds retained before the detected start.
        keep_right_time: Seconds retained after the detected content.
        hop_size: Frame hop in samples used to round the content length up.

    Returns:
        The trimmed (and possibly zero-padded) samples.
    """
    _, index = librosa.effects.trim(audio, top_db=20, frame_length=512, hop_length=128)
    # Content length expressed in whole hops, rounded up.
    num_frames = int(math.ceil((index[1] - index[0]) / hop_size))
    left_sil_samples = int(keep_left_time * sr)

    start_idx = index[0] - left_sil_samples
    if start_idx > 0:
        trim_wav = audio[start_idx:]
    else:
        # Not enough audio before the detected start: pad the front with zeros.
        trim_wav = np.pad(
            audio, (abs(start_idx), 0), mode="constant", constant_values=0.0
        )

    wav_len = len(trim_wav)
    out_len = int(num_frames * hop_size + (keep_left_time + keep_right_time) * sr)
    if out_len < wav_len:
        trim_wav = trim_wav[:out_len]
    else:
        # Not enough audio after the content: pad the tail with zeros.
        trim_wav = np.pad(
            trim_wav, (0, out_len - wav_len), mode="constant", constant_values=0.0
        )
    return trim_wav


def _apply_sox_effect(audio16bit_torch, sr, configure):
    """Round-trip a tensor through sox, applying the effects set by ``configure``.

    The tensor is written to a temporary WAV file, transformed by a
    ``sox.Transformer`` that ``configure`` has set up, and the output file is
    loaded back with torchaudio. Both temporary files are removed on exit.
    """
    with tempfile.NamedTemporaryFile(
        suffix=".wav", delete=True
    ) as temp_in, tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_out:
        # Save the input audio to the temporary input file.
        torchaudio.save(temp_in.name, audio16bit_torch, sr)
        # Build the sox transformer and let the caller add its effect.
        tfm = sox.Transformer()
        configure(tfm)
        tfm.build_file(temp_in.name, temp_out.name)
        # Read the processed audio back; the sample rate sox reports is ignored.
        audio_changed, _ = torchaudio.load(temp_out.name)
    return audio_changed


def volumn_adjust(audio16bit_torch, sr, volumn_ratio):
    """Adjust audio volume using sox.

    Args:
        audio16bit_torch (Tensor): Input audio tensor [1, samples].
        sr (int): Sample rate used when writing the temporary WAV file.
        volumn_ratio (float): Volume ratio; >1 raises the volume, <1 lowers it.

    Returns:
        Tensor: The audio tensor after the volume change.
    """
    return _apply_sox_effect(
        audio16bit_torch, sr, lambda tfm: tfm.vol(volumn_ratio)
    )


def speech_adjust(audio16bit_torch, sr, speed_ratio):
    """Change audio speed using sox's ``tempo`` effect.

    Args:
        audio16bit_torch (Tensor): Input audio tensor [1, samples].
        sr (int): Sample rate used when writing the temporary WAV file.
        speed_ratio (float): Speed ratio; >1 speeds up, <1 slows down.

    Returns:
        Tensor: The audio tensor after the speed change.
    """
    return _apply_sox_effect(
        audio16bit_torch, sr, lambda tfm: tfm.tempo(speed_ratio)
    )


def audio_resample(audio16bit_torch, result_sr, target_sample_rate):
    """Resample a tensor from ``result_sr`` to ``target_sample_rate``.

    Returns:
        tuple: ``(resampled_tensor, target_sample_rate)``.
    """
    resampler = torchaudio.transforms.Resample(
        orig_freq=result_sr, new_freq=target_sample_rate
    )
    return resampler(audio16bit_torch), target_sample_rate


def norm_audio(audio16bit_torch):
    """Peak-normalize a tensor to the int16 range and return an int16 tensor.

    The samples are divided by their maximum absolute value and scaled by
    32767. Empty or all-zero input is returned as int16 without scaling, since
    dividing by a zero peak would produce NaNs.

    Args:
        audio16bit_torch (Tensor): Input tensor; converted with ``.numpy()``.

    Returns:
        Tensor: int16 tensor with the same shape as the input.
    """
    samples = audio16bit_torch.numpy()
    if samples.size == 0:
        return torch.from_numpy(samples.astype(np.int16))

    peak = np.abs(samples).max()
    if peak > 0:
        samples = samples / peak * 32767
    return torch.from_numpy(samples.astype(np.int16))


def resample_audio(wav, original_sample_rate, target_sample_rate):
    """Downsample ``wav`` to ``target_sample_rate`` when the rates differ.

    Args:
        wav: Waveform passed to ``torchaudio.transforms.Resample``.
        original_sample_rate: Current sample rate of ``wav``.
        target_sample_rate: Desired sample rate.

    Returns:
        The resampled waveform, or ``wav`` unchanged when the rates are equal.

    Raises:
        AssertionError: If the rates differ and the original rate is not
            greater than the target rate (upsampling is rejected).
    """
    if original_sample_rate == target_sample_rate:
        return wav

    # Kept as an assert so callers observe the same exception type as before.
    assert (
        original_sample_rate > target_sample_rate
    ), "wav sample rate {} must be greater than {}".format(
        original_sample_rate, target_sample_rate
    )
    resampler = torchaudio.transforms.Resample(
        orig_freq=original_sample_rate, new_freq=target_sample_rate
    )
    return resampler(wav)


def energy_norm_fn(wav):
    """Scale ``wav`` so its peak magnitude becomes 0.999.

    The peak used for scaling is floored at 0.01, so near-silent input is
    amplified by at most a factor of 99.9 and silent input does not divide
    by zero.

    Args:
        wav: A numpy array or a torch tensor.

    Returns:
        The scaled array or tensor.
    """
    if isinstance(wav, np.ndarray):
        peak = np.max(np.abs(wav))
    else:
        peak = torch.max(torch.abs(wav))
    return wav / max(peak, 0.01) * 0.999


def get_audio_tokens(audio_tokens: str) -> list[int]:
    """Parse a string of concatenated audio tokens into integer ids.

    Each numeric index found in the string is offset by 65536.

    NOTE(review): the body of this function was garbled in the source this
    was recovered from. The ``<audio_N>`` token format below is a
    reconstruction -- confirm it against the producer of these strings.

    Args:
        audio_tokens: Text such as ``"<audio_1><audio_20>"``.

    Returns:
        list[int]: One id per token, each equal to ``N + 65536``. An empty
        input yields an empty list.

    Raises:
        ValueError: If a piece of the string does not contain an integer.
    """
    if not audio_tokens:
        return []
    pieces = audio_tokens.split("><audio_")
    return [
        int(piece.replace("<audio_", "").replace(">", "")) + 65536
        for piece in pieces
    ]


def load_audio(audio_path: str):
    """Load an audio file and average its channels into one.

    Args:
        audio_path: Path handed to ``torchaudio.load``.

    Returns:
        tuple: ``(waveform, sample_rate)`` where the waveform keeps its leading
        dimension with size 1 after the mean over dim 0.
    """
    audio_wav, sr = torchaudio.load(audio_path)
    audio_wav = audio_wav.mean(dim=0, keepdim=True)
    return audio_wav, sr