Spaces:
Running
on
Zero
Running
on
Zero
File size: 8,689 Bytes
82bc972 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
import argparse
import ast
import logging
import os
import pickle
import random
import time

import numpy as np
import torch
import torchaudio
import tqdm

from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text,
)
# this script only works for the musicgen architecture
def get_args(argv=None):
    """Parse command-line options for musicgen-architecture evaluation/inference.

    Args:
        argv: optional list of argument strings. Defaults to None, in which
            case argparse reads sys.argv[1:] as before; passing a list makes
            the parser usable programmatically and in tests.

    Returns:
        argparse.Namespace holding all options below.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--manifest_fn", type=str, default="path/to/eval_metadata_file")
    parser.add_argument("--audio_root", type=str, default="path/to/audio_folder")
    parser.add_argument("--exp_dir", type=str, default="path/to/model_folder")
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--codec_audio_sr", type=int, default=16000, help='the sample rate of audio that the codec is trained for')
    parser.add_argument("--codec_sr", type=int, default=50, help='the sample rate of the codec codes')
    parser.add_argument("--top_k", type=int, default=0, help="sampling param")
    parser.add_argument("--top_p", type=float, default=0.8, help="sampling param")
    # min_p added for consistency with the other sampling params; inference
    # reads decode_config['min_p'] — 0.0 disables min-p filtering by convention
    parser.add_argument("--min_p", type=float, default=0.0, help="sampling param")
    parser.add_argument("--temperature", type=float, default=1.0, help="sampling param")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--signature", type=str, default=None, help="path to the encodec model")
    parser.add_argument("--crop_concat", type=int, default=0)
    parser.add_argument("--stop_repetition", type=int, default=-1, help="used for inference, when the number of consecutive repetition of a token is bigger than this, stop it")
    parser.add_argument("--kvcache", type=int, default=1, help='if true, use kv cache, which is 4-8x faster than without')
    parser.add_argument("--sample_batch_size", type=int, default=1, help="batch size for sampling, NOTE that it's not running inference for several samples, but duplicate one input sample batch_size times, and during inference, we only return the shortest generation")
    parser.add_argument("--silence_tokens", type=str, default="[1388,1898,131]", help="note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default")
    return parser.parse_args(argv)
@torch.no_grad()
def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_text, device, decode_config, prompt_end_frame, target_generation_length, delay_pattern_increment, prefix_transcript=None, quiet=False, repeat_prompt=0, multi_trial=None):
    """Generate speech for one sample by prompting the model with an audio prefix.

    Encodes the audio prompt into codec frames, optionally repeats the prompt
    (and its transcript) to lengthen conditioning, phonemizes the target text,
    runs ``model.inference_tts``, and decodes both the concatenated
    (prompt + generation) frames and the generated frames back to waveforms.

    Args:
        model: TTS model exposing ``inference_tts`` (project type).
        model_args: model hyperparameter namespace; reads ``n_codebooks``,
            ``audio_max_length`` and the optional attributes ``x_sep_token``,
            ``y_sep_token``, ``add_eos_to_text``, ``add_bos_to_text``,
            ``parallel_pattern``.
        phn2num: dict mapping phoneme -> token id; phonemes absent from the
            map are silently dropped.
        text_tokenizer: project text tokenizer passed to ``tokenize_text``.
        audio_tokenizer: project audio tokenizer used for encode and decode.
        audio_fn: path of the prompt audio file.
        target_text: text to synthesize, or an already-phonemized list.
        device: torch device the tensors are moved to.
        decode_config: dict of decoding params — ``codec_sr``, ``top_k``,
            ``top_p``, ``min_p``, ``temperature``, ``stop_repetition``,
            ``kvcache``, ``silence_tokens``, ``sample_batch_size``.
        prompt_end_frame: number of frames to read from the prompt audio.
        target_generation_length: desired generation length in seconds.
        delay_pattern_increment: extra frame count for the delayed codebook
            pattern (already includes the eos token).
        prefix_transcript: transcript of the audio prompt (text or phoneme
            list); required when ``model_args.x_sep_token`` is set.
        quiet: if True, suppress progress logging.
        repeat_prompt: int number of extra prompt repetitions, or the string
            "max" to repeat as often as the model's max length allows.
        multi_trial: unsupported; must be None or an empty list.

    Returns:
        Tuple ``(concat_sample, gen_sample)``: decoded waveforms for
        prompt+generation and for the generation alone.
    """
    # avoid the mutable-default-argument pitfall; behavior is unchanged
    # because multi_trial is only asserted to be empty below
    if multi_trial is None:
        multi_trial = []
    # encode the audio prompt into codec codes; assumed [1, K, T] given the
    # dim=2 concatenation below — TODO confirm against tokenize_audio
    encoded_frames = tokenize_audio(audio_tokenizer, audio_fn, offset=0, num_frames=prompt_end_frame)
    single_encoded_frames = encoded_frames
    if isinstance(repeat_prompt, int) and repeat_prompt > 0:
        # repeat the audio prompt a fixed number of extra times along time;
        # use a scratch counter so repeat_prompt is still available for the
        # transcript repetition further down
        cur_repeat_prompt = repeat_prompt
        while cur_repeat_prompt > 0:
            encoded_frames = torch.cat([encoded_frames, single_encoded_frames], dim=2)
            cur_repeat_prompt -= 1
    elif isinstance(repeat_prompt, str) and repeat_prompt.lower() == "max":
        # repeat as long as prompt + expected generation still fits within the
        # model's maximum audio length; record the count so the transcript is
        # repeated the same number of times
        repeat_prompt = 0
        while encoded_frames.shape[2] + decode_config['codec_sr'] * target_generation_length + delay_pattern_increment + single_encoded_frames.shape[2] < model_args.audio_max_length * decode_config['codec_sr']:
            encoded_frames = torch.cat([encoded_frames, single_encoded_frames], dim=2)
            repeat_prompt += 1

    if getattr(model_args, "y_sep_token", None) is not None:
        # append one frame holding the audio separator token in every codebook
        encoded_frames = torch.cat([encoded_frames, torch.LongTensor([model_args.y_sep_token]*model_args.n_codebooks).unsqueeze(0).unsqueeze(2).to(encoded_frames.device)], dim=2)
    original_audio = encoded_frames.transpose(2,1) # [1,T,K]
    assert original_audio.ndim==3 and original_audio.shape[0] == 1 and original_audio.shape[2] == model_args.n_codebooks, original_audio.shape

    # phonemize the target text (or use it directly if already a phoneme list)
    if isinstance(target_text, list):
        text_tokens = [phn2num[phn] for phn in target_text if phn in phn2num]
    else:
        text_tokens = [phn2num[phn] for phn in
                tokenize_text(
                    text_tokenizer, text=target_text.strip()
                ) if phn in phn2num
            ]
    if getattr(model_args, "x_sep_token", None) is not None:
        assert prefix_transcript is not None, "prefix_transcript must be provided if x_sep_token is not None"
    if prefix_transcript is not None:
        if isinstance(prefix_transcript, list):
            prefix_tokens = [phn2num[phn] for phn in prefix_transcript if phn in phn2num]
        else:
            prefix_tokens = [phn2num[phn] for phn in
                    tokenize_text(
                        text_tokenizer, text=prefix_transcript.strip()
                    ) if phn in phn2num
                ]
        # repeat the transcript to match the repeated audio prompt
        single_prefix_tokens = prefix_tokens
        while repeat_prompt > 0:
            prefix_tokens = prefix_tokens + single_prefix_tokens
            repeat_prompt -= 1
        if getattr(model_args, "x_sep_token", None) is not None:
            text_tokens = prefix_tokens + [getattr(model_args, "x_sep_token", None)] + text_tokens
        else:
            text_tokens = prefix_tokens + text_tokens
    if getattr(model_args, "add_eos_to_text", 0) != 0:
        text_tokens.append(model_args.add_eos_to_text)
    if getattr(model_args, "add_bos_to_text", 0) != 0:
        text_tokens = [model_args.add_bos_to_text] + text_tokens
    text_tokens = torch.LongTensor(text_tokens).unsqueeze(0)
    text_tokens_lens = torch.LongTensor([text_tokens.shape[-1]])

    if not quiet:
        logging.info(f"original audio length: {original_audio.shape[1]} codec frames, which is {original_audio.shape[1]/decode_config['codec_sr']:.2f} sec.")

    if getattr(model_args, "parallel_pattern", 0) != 0:
        # parallel pattern: only sos and eos are added (2 extra tokens)
        tgt_y_lens = torch.LongTensor([int(original_audio.shape[1] + decode_config['codec_sr'] * target_generation_length + 2)])
    else:
        # delayed pattern: delay_pattern_increment already accounts for the added eos
        tgt_y_lens = torch.LongTensor([int(original_audio.shape[1] + decode_config['codec_sr'] * target_generation_length + delay_pattern_increment)])

    # forward
    assert decode_config['sample_batch_size'] <= 1
    stime = time.time()
    assert multi_trial == []
    if not quiet:
        logging.info("running inference with batch size 1")
    # NOTE(review): the original used eval() here; ast.literal_eval parses the
    # same list literal (e.g. "[1388,1898,131]") without executing arbitrary code
    silence_tokens = decode_config['silence_tokens']
    if isinstance(silence_tokens, str):
        silence_tokens = ast.literal_eval(silence_tokens)
    concat_frames, gen_frames = model.inference_tts(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        original_audio[...,:model_args.n_codebooks].to(device), # [1,T,8]
        tgt_y_lens = tgt_y_lens.to(device),
        top_k=decode_config['top_k'],
        top_p=decode_config['top_p'],
        min_p=decode_config['min_p'],
        temperature=decode_config['temperature'],
        stop_repetition=decode_config['stop_repetition'],
        kvcache=decode_config['kvcache'],
        silence_tokens=silence_tokens
    ) # output is [1,K,T]
    if not quiet:
        logging.info(f"inference on one sample take: {time.time() - stime:.4f} sec.")
        logging.info(f"generated encoded_frames.shape: {gen_frames.shape}, which is {gen_frames.shape[-1]/decode_config['codec_sr']} sec.")

    if getattr(model_args, "y_sep_token", None) is not None:
        # drop the injected y_sep_token frame before decoding
        concat_frames = torch.cat([concat_frames[:, :, :original_audio.shape[1]-1], concat_frames[:, :, original_audio.shape[1]:]], dim=2)
    concat_sample = audio_tokenizer.decode(
        concat_frames # [1,8,T]
    )
    gen_sample = audio_tokenizer.decode(
        gen_frames
    )
    # free cached GPU memory between runs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return concat_sample, gen_sample