from __future__ import annotations

import jieba
import torch
from pypinyin import Style, lazy_pinyin
from torch.nn.utils.rnn import pad_sequence


def exists(v):
    return v is not None


def default(v, d):
    return v if exists(v) else d


# tensor helpers


def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    # build a boolean padding mask of shape (b, n) from a batch of sequence lengths
    if not exists(length):
        length = t.amax()

    seq = torch.arange(length, device=t.device)
    return seq[None, :] < t[:, None]


# simple utf-8 tokenizer, since paper went character based
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    list_tensors = [torch.tensor([*bytes(t, "UTF-8")]) for t in text]  # ByT5 style
    text = pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
    return text


# char tokenizer, based on custom dataset's extracted .txt file
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    list_idx_tensors = [torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text]  # pinyin or char style
    text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
    return text


# Get tokenizer


def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                - "char" for char-wise tokenizer, need .txt vocab_file
                - "byte" for utf-8 tokenizer
                - "custom" if you're directly passing in a path to the vocab.txt you want to use
    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                - if use "char", derived from unfiltered character & symbol counts of custom dataset
                - if use "byte", set to 256 (unicode byte range)
    """
    if tokenizer in ["pinyin", "char"]:
        # tokenizer_path = os.path.join(files("f5_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        tokenizer_path = "/ailab-train/speech/zhengjunjie/huggingface/models/F5-TTS/F5TTS_Base/vocab.txt"
        print(f"Loading {tokenizer} tokenizer from {tokenizer_path}")
        with open(tokenizer_path, "r", encoding="utf-8") as f:
            vocab_char_map = {}
            for i, char in enumerate(f):
                vocab_char_map[char[:-1]] = i  # one token per line, strip trailing newline
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        with open(dataset_name, "r", encoding="utf-8") as f:
            vocab_char_map = {}
            for i, char in enumerate(f):
                vocab_char_map[char[:-1]] = i
        vocab_size = len(vocab_char_map)

    return vocab_char_map, vocab_size


# convert char to pinyin

jieba.initialize()
print("Word segmentation module jieba initialized.\n")


def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []

    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # add custom trans here, to address oov

    def is_chinese(c):
        return (
            "\u3100" <= c <= "\u9fff"  # common chinese characters
        )

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list
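

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module). It runs
# the "byte" tokenizer end to end because that path needs no external
# vocab.txt; the dataset name and sample sentence below are hypothetical
# placeholders.
if __name__ == "__main__":
    _, vocab_size = get_tokenizer("example_dataset", "byte")  # byte mode returns (None, 256)
    token_lists = convert_char_to_pinyin(["Hello world. 你好世界。"])  # per-sample lists of chars / pinyin tokens
    joined = ["".join(tokens) for tokens in token_lists]  # back to plain strings for the byte tokenizer
    text_ids = list_str_to_tensor(joined)  # (b, nt) int tensor, padded with -1
    mask = lens_to_mask(torch.tensor([text_ids.shape[1]]))  # (b, n) boolean mask, all True here
    print(vocab_size, text_ids.shape, mask.shape)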