import os
import pandas as pd
import sentencepiece as spm
import tokenizers
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace
from tokenizers.normalizers import NFKC
from transformers import PreTrainedTokenizerFast

from config import PROJECT_ROOT


def check_dir_exits(dir: str) -> None:
    '''
    Check whether the directory exists; create it if it does not.
    '''
    if not os.path.exists(dir):
        os.makedirs(dir)
def train_my_huggingface_wiki_tokenizer(cropus_file: str, max_train_line: int=None, vocab_size: int=40960, token_type: str='char') -> None:
    '''
    Train a tokenizer with huggingface tokenizers. Requires at least 32 GB of RAM and takes roughly half an hour to run.
    '''
    tokenizer_slow_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer_slow/hf_bpe_tokenizer.json'
    tokenizer_fast_save_path = PROJECT_ROOT + '/model_save/hf_tokenizer'

    check_dir_exits(PROJECT_ROOT + '/model_save/hf_tokenizer_slow')
    check_dir_exits(tokenizer_fast_save_path)

    def get_training_corpus(buffer_size: int=1000, chunk_len: int=2048) -> list:
        '''
        Yield the corpus in text chunks of about 2048 characters each.
        '''
        line_cnt = 0
        buffer = []
        with open(cropus_file, 'r', encoding='utf-8') as f_read:
            cur_chunk_txt, txt_len = [], 0
            for line in f_read:
                cur_chunk_txt.append(line)
                txt_len += len(line)
                line_cnt += 1

                if txt_len >= chunk_len:
                    buffer.append(
                        ''.join(cur_chunk_txt)
                    )
                    cur_chunk_txt, txt_len = [], 0

                if len(buffer) >= buffer_size:
                    yield buffer
                    buffer = []

                if isinstance(max_train_line, int) and line_cnt > max_train_line:
                    break

            # yield the last, possibly incomplete, buffer
            if len(buffer) > 0:
                yield buffer
    special_tokens = ["[PAD]", "[EOS]", "[SEP]", "[BOS]", "[CLS]", "[MASK]", "[UNK]"]

    if token_type == 'char':
        model = BPE(unk_token="[UNK]")
        tokenizer = Tokenizer(model)

        # NFKC normalization maps compatibility-equivalent code points to a canonical form,
        # e.g. the full-width 'A' becomes the half-width 'A'.
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-tokenize on punctuation, individual digits, and Metaspace
        # (without Metaspace, decoded text would come back without spaces).
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    elif token_type == 'byte':
        # Byte-level BPE does not need an unk_token.
        model = BPE()
        tokenizer = Tokenizer(model)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)
    else:
        raise Exception(f'token type must be `char` or `byte`, but got {token_type}')

    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=100, show_progress=True, special_tokens=special_tokens)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # make sure '\t' and '\n' are in the vocabulary
    if '\t' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\t'])
    if '\n' not in tokenizer.get_vocab():
        tokenizer.add_tokens(['\n'])

    tokenizer.save(tokenizer_slow_save_path)

    # Convert the trained tokenizer to a PreTrainedTokenizerFast and save it.
    # The conversion makes it easy to load with `AutoTokenizer` and pass to other `huggingface` components.
    # The special tokens (`pad_token`, `eos_token`, etc.) must be specified explicitly here,
    # because the conversion cannot infer which tokens of the original tokenizer play these roles.
    slow_tokenizer = tokenizer
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=slow_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        bos_token='[BOS]',
        eos_token='[EOS]',
    )
    fast_tokenizer.save_pretrained(tokenizer_fast_save_path)

    print(f'slow tokenizer saved in path: {tokenizer_slow_save_path}')
    print(f'fast tokenizer saved in path: {tokenizer_fast_save_path}')

    print(f"\ntrain tokenizer finished. you can use `AutoTokenizer.from_pretrained('{tokenizer_fast_save_path}')` to load and test your tokenizer.")
def train_my_BPE_tokenizer() -> None:
    '''
    Train a BPE tokenizer with sentencepiece. Drawback: it can only load about 3 million lines; a 16 GB machine will OOM.
    '''
    txt_corpus_file = PROJECT_ROOT + '/data/my_corpus.txt'
    special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

    tokenizer = spm.SentencePieceTrainer.train(
        input=txt_corpus_file,
        model_prefix='my_tokenizer',
        vocab_size=40960,
        user_defined_symbols=special_tokens,
        max_sentence_length=1024,
        shuffle_input_sentence=True,
        # character_coverage=1.0,
        model_type='bpe',
    )

    # the model files are saved under the `my_tokenizer` prefix
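

# Hedged usage sketch (illustrative): load the trained sentencepiece model and
# encode/decode a sample sentence. The file name follows model_prefix='my_tokenizer'
# used above; adjust the path if the model was written elsewhere.
def demo_load_sentencepiece_tokenizer(model_file: str = 'my_tokenizer.model') -> None:
    sp = spm.SentencePieceProcessor(model_file=model_file)

    text = '这是一段测试文本。'
    ids = sp.encode(text)
    print(ids)
    print(sp.decode(ids))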
if __name__ == '__main__':
    cropus_file = PROJECT_ROOT + '/data/wiki.simple.txt'

    train_my_huggingface_wiki_tokenizer(cropus_file=cropus_file, token_type='char')  # token_type must be 'char' or 'byte'