File size: 8,689 Bytes
82bc972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import argparse, pickle
import logging
import os, random
import numpy as np
import torch
import torchaudio

from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text
)
import argparse, time, tqdm


# this script only works for the musicgen architecture
def get_args():
    """Parse this script's command-line options from ``sys.argv``.

    The options are declared in a single table (flag -> ``add_argument``
    keyword arguments) and registered in table order, so the generated
    ``--help`` output lists them in that order. Defaults are shown in the help
    text via ``argparse.ArgumentDefaultsHelpFormatter``.

    Returns:
        argparse.Namespace: one attribute per option declared below.
    """
    option_table = (
        # data / model locations
        ("--manifest_fn", dict(type=str, default="path/to/eval_metadata_file")),
        ("--audio_root", dict(type=str, default="path/to/audio_folder")),
        ("--exp_dir", dict(type=str, default="path/to/model_folder")),
        ("--seed", dict(type=int, default=1)),
        # codec rates
        ("--codec_audio_sr", dict(type=int, default=16000, help='the sample rate of audio that the codec is trained for')),
        ("--codec_sr", dict(type=int, default=50, help='the sample rate of the codec codes')),
        # sampling parameters
        ("--top_k", dict(type=int, default=0, help="sampling param")),
        ("--top_p", dict(type=float, default=0.8, help="sampling param")),
        ("--temperature", dict(type=float, default=1.0, help="sampling param")),
        # runtime / output
        ("--output_dir", dict(type=str, default=None)),
        ("--device", dict(type=str, default="cuda")),
        ("--signature", dict(type=str, default=None, help="path to the encodec model")),
        ("--crop_concat", dict(type=int, default=0)),
        # decoding behaviour
        ("--stop_repetition", dict(type=int, default=-1, help="used for inference, when the number of consecutive repetition of a token is bigger than this, stop it")),
        ("--kvcache", dict(type=int, default=1, help='if true, use kv cache, which is 4-8x faster than without')),
        ("--sample_batch_size", dict(type=int, default=1, help="batch size for sampling, NOTE that it's not running inference for several samples, but duplicate one input sample batch_size times, and during inference, we only return the shortest generation")),
        ("--silence_tokens", dict(type=str, default="[1388,1898,131]", help="note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default")),
    )

    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for flag, add_kwargs in option_table:
        cli.add_argument(flag, **add_kwargs)
    return cli.parse_args()


def _phonemes_to_ids(text, phn2num, text_tokenizer):
    """Convert ``text`` to phoneme ids, silently dropping phonemes absent from ``phn2num``.

    A ``list`` is treated as an already-phonemized sequence; any other value is
    stripped and phonemized with ``tokenize_text`` first.
    """
    if isinstance(text, list):
        phonemes = text
    else:
        phonemes = tokenize_text(text_tokenizer, text=text.strip())
    return [phn2num[phn] for phn in phonemes if phn in phn2num]


@torch.no_grad()
def inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_text, device, decode_config, prompt_end_frame, target_generation_length, delay_pattern_increment, prefix_transcript=None, quiet=False, repeat_prompt=0, multi_trial=None):
    """Generate speech for ``target_text`` conditioned on an audio prompt.

    The prompt audio is encoded to codec frames (optionally repeated), the text
    is converted to phoneme ids (optionally prefixed with the prompt
    transcript), ``model.inference_tts`` is run once, and both the
    prompt+generation and the generation alone are decoded with
    ``audio_tokenizer.decode``.

    Args:
        model: object exposing ``inference_tts(...)``.
        model_args: namespace read for ``n_codebooks``, ``audio_max_length``
            (only when ``repeat_prompt == "max"``) and the optional attributes
            ``y_sep_token``, ``x_sep_token``, ``add_eos_to_text``,
            ``add_bos_to_text`` and ``parallel_pattern``.
        phn2num: mapping from phoneme to integer id; unknown phonemes are dropped.
        text_tokenizer: passed to ``tokenize_text`` for non-list text.
        audio_tokenizer: passed to ``tokenize_audio``; its ``decode`` is called
            on the resulting frames.
        audio_fn: prompt audio, forwarded to ``tokenize_audio``.
        target_text: text to synthesize, either a string or a list of phonemes.
        device: device the model inputs are moved to.
        decode_config: mapping that must contain ``codec_sr``,
            ``sample_batch_size``, ``top_k``, ``top_p``, ``min_p``,
            ``temperature``, ``stop_repetition``, ``kvcache`` and
            ``silence_tokens``. NOTE: ``min_p`` is not among the options
            declared by ``get_args`` in this file, so callers must supply it.
        prompt_end_frame: forwarded to ``tokenize_audio`` as ``num_frames``.
        target_generation_length: requested generation length; multiplied by
            ``codec_sr`` to obtain a number of codec frames.
        delay_pattern_increment: extra frames added to the target length when
            ``parallel_pattern`` is not set.
        prefix_transcript: transcript of the prompt (string or phoneme list).
            Required when ``model_args.x_sep_token`` is set.
        quiet: when true, suppress the informational log messages.
        repeat_prompt: a positive int repeats the prompt (audio and prefix
            transcript) that many additional times; the string ``"max"``
            (case-insensitive) repeats it as often as fits under
            ``model_args.audio_max_length``. Any other value means no
            repetition.
        multi_trial: unsupported; must be ``None`` or an empty list.

    Returns:
        tuple: ``(concat_sample, gen_sample)`` as returned by
        ``audio_tokenizer.decode`` for the prompt+generation frames and for the
        generated frames only.

    Raises:
        AssertionError: if the encoded prompt does not have shape
            ``[1, T, n_codebooks]`` after transposition, if ``x_sep_token`` is
            set without ``prefix_transcript``, if ``sample_batch_size > 1``, or
            if ``multi_trial`` is non-empty.
    """
    codec_sr = decode_config['codec_sr']

    # encode audio
    single_encoded_frames = tokenize_audio(audio_tokenizer, audio_fn, offset=0, num_frames=prompt_end_frame)
    single_len = single_encoded_frames.shape[2]

    # Work out how many *additional* copies of the prompt to append. The same
    # count is used below for the prefix transcript so audio and text stay aligned.
    n_repeats = 0
    if isinstance(repeat_prompt, int) and repeat_prompt > 0:
        n_repeats = int(repeat_prompt)
    elif isinstance(repeat_prompt, str) and repeat_prompt.lower() == "max":
        max_frames = model_args.audio_max_length * codec_sr
        cur_len = single_len
        # Keep adding copies while prompt + generation budget + one more copy
        # still fits. The single_len > 0 guard prevents an endless loop when the
        # encoded prompt is empty.
        while single_len > 0 and cur_len + codec_sr * target_generation_length + delay_pattern_increment + single_len < max_frames:
            cur_len += single_len
            n_repeats += 1

    encoded_frames = single_encoded_frames
    if n_repeats > 0:
        encoded_frames = torch.cat([single_encoded_frames] * (n_repeats + 1), dim=2)

    y_sep_token = getattr(model_args, "y_sep_token", None)
    if y_sep_token is not None:
        # append one separator frame (same token in every codebook) after the prompt
        sep_frame = torch.LongTensor([y_sep_token] * model_args.n_codebooks).unsqueeze(0).unsqueeze(2).to(encoded_frames.device)
        encoded_frames = torch.cat([encoded_frames, sep_frame], dim=2)

    original_audio = encoded_frames.transpose(2, 1)  # [1,T,K]
    assert original_audio.ndim == 3 and original_audio.shape[0] == 1 and original_audio.shape[2] == model_args.n_codebooks, original_audio.shape

    # phonemize
    text_tokens = _phonemes_to_ids(target_text, phn2num, text_tokenizer)
    x_sep_token = getattr(model_args, "x_sep_token", None)
    if x_sep_token is not None:
        assert prefix_transcript is not None, "prefix_transcript must be provided if x_sep_token is not None"
    if prefix_transcript is not None:
        # repeat the transcript as many times as the audio prompt was repeated
        prefix_tokens = _phonemes_to_ids(prefix_transcript, phn2num, text_tokenizer) * (n_repeats + 1)
        if x_sep_token is not None:
            text_tokens = prefix_tokens + [x_sep_token] + text_tokens
        else:
            text_tokens = prefix_tokens + text_tokens
    # add_eos_to_text / add_bos_to_text hold the token id itself; 0 (or absent) disables them
    if getattr(model_args, "add_eos_to_text", 0) != 0:
        text_tokens.append(model_args.add_eos_to_text)
    if getattr(model_args, "add_bos_to_text", 0) != 0:
        text_tokens = [model_args.add_bos_to_text] + text_tokens
    text_tokens = torch.LongTensor(text_tokens).unsqueeze(0)
    text_tokens_lens = torch.LongTensor([text_tokens.shape[-1]])

    if not quiet:
        logging.info(f"original audio length: {original_audio.shape[1]} codec frames, which is {original_audio.shape[1]/codec_sr:.2f} sec.")

    if getattr(model_args, "parallel_pattern", 0) != 0:
        # parallel pattern: only the empty (sos) token and eos are added, i.e. 2 more tokens
        extra_frames = 2
    else:
        # delay pattern increment has accounted for the added eos
        extra_frames = delay_pattern_increment
    tgt_y_lens = torch.LongTensor([int(original_audio.shape[1] + codec_sr * target_generation_length + extra_frames)])

    # forward
    assert decode_config['sample_batch_size'] <= 1
    assert multi_trial is None or multi_trial == []

    silence_tokens = decode_config['silence_tokens']
    if isinstance(silence_tokens, str):
        # NOTE(security): eval() executes arbitrary Python. This is only
        # acceptable while the string comes from a trusted source (e.g. the
        # operator's own command line); never feed it untrusted input.
        silence_tokens = eval(silence_tokens)

    stime = time.time()
    if not quiet:
        logging.info("running inference with batch size 1")
    concat_frames, gen_frames = model.inference_tts(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        original_audio[..., :model_args.n_codebooks].to(device),  # [1,T,K]
        tgt_y_lens=tgt_y_lens.to(device),
        top_k=decode_config['top_k'],
        top_p=decode_config['top_p'],
        min_p=decode_config['min_p'],
        temperature=decode_config['temperature'],
        stop_repetition=decode_config['stop_repetition'],
        kvcache=decode_config['kvcache'],
        silence_tokens=silence_tokens,
    )  # output is [1,K,T]
    if not quiet:
        logging.info(f"inference on one sample take: {time.time() - stime:.4f} sec.")
        logging.info(f"generated encoded_frames.shape: {gen_frames.shape}, which is {gen_frames.shape[-1]/codec_sr} sec.")

    if y_sep_token is not None:
        # drop the separator frame (last frame of the prompt) before decoding
        sep_index = original_audio.shape[1] - 1
        concat_frames = torch.cat([concat_frames[:, :, :sep_index], concat_frames[:, :, sep_index + 1:]], dim=2)

    # decode (both prompt+generated and generated only)
    concat_sample = audio_tokenizer.decode(
        concat_frames  # [1,K,T]
    )
    gen_sample = audio_tokenizer.decode(
        gen_frames
    )
    # Empty cuda cache between runs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return concat_sample, gen_sample