#! /usr/bin/python3
#pip3 install transformers accelerate deepspeed triton esupar
tgt="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
import os
# shallow-clone the Kanripo texts unless the directory is already present
os.system("""
if test -d Kanripo-data
then :
else git clone --depth=1 https://github.com/DHSinology/Kanripo-data
fi""")
if not os.path.isfile("train.txt"):
  import glob,subprocess
  from esupar.tradify import tradify
  with open("train.txt","w",encoding="utf-8") as w:
    # nawk program: strip XML tags and ASCII from every Kanripo file
    s='''
{
  s=$0;
  while(s>""){
    if(f==1){
      i=index(s,"");
      if(i>0){
        h=h substr(s,1,i-1)"\\n";
        s=substr(s,i+5);
        f=0;
      }
      else{
        h=h s;
        s="";
      }
    }
    else{
      i=index(s,"");
      if(i>0){
        t=substr(s,1,i-1);
        gsub(/<[^>]+>/,"",t);
        gsub(/([ -~]| |\\t)/,"",t);
        printf("%s",t);
        s=substr(s,i+4);
        f=1;
      }
      else{
        gsub(/<[^>]+>/,"",s);
        gsub(/([ -~]| |\\t)/,"",s);
        printf("%s",s);
        s="";
      }
    }
  }
  printf("\\n");
}
END{
  gsub(/<[^>]+>/,"",h);
  gsub(/([ -~]| |\\t)/,"",h);
  printf("%s\\n",h);
}'''
    t=subprocess.run(["nawk",s]+glob.glob("Kanripo-data/data/*/*.xml"),stdout=subprocess.PIPE,encoding="utf-8").stdout
    # convert to traditional characters via esupar's tradify table and
    # pack the text into lines of just under 8200 characters
    u=""
    for s in t.split("\n"):
      s=s.strip()
      if len(s)+len(u)>=8200:
        print(u,file=w)
        u=""
      for c in s:
        u+=tradify[c] if c in tradify else c
    print(u,file=w)
from transformers import BertTokenizerFast
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex
# character-level vocab.txt: five special tokens followed by every non-space character in train.txt
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
if not os.path.isfile("vocab.txt"):
  with open("train.txt","r",encoding="utf-8") as r:
    v=set(c for c in r.read() if not c.isspace())
  with open("vocab.txt","w",encoding="utf-8") as w:
    print("\n".join(s+sorted(v)),file=w)
# character-by-character tokenizer: split on whitespace, then isolate each character
tkz=BertTokenizerFast(vocab_file="vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
# write train.py (run under deepspeed): trains ModernBertForMaskedLM from scratch on train.txt,
# borrowing the configuration of modernbert-small-japanese-char
with open("train.py","w",encoding="utf-8") as w:
  print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import BertTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=BertTokenizerFast.from_pretrained(tgt)
# copy vocabulary size and special-token ids from the tokenizer into the model config
c={"trust_remote_code":True,"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
  c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-small-japanese-char",**c)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
# simple line-by-line dataset over train.txt
class ReadLineDS(object):
  def __init__(self,file,tokenizer):
    self.tokenizer=tokenizer
    with open(file,"r",encoding="utf-8") as r:
      self.lines=[s.strip() for s in r if s.strip()>""]
  __len__=lambda self:len(self.lines)
  __getitem__=lambda self,i:self.tokenizer(self.lines[i],truncation=True,add_special_tokens=True,max_length=8190)
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=ReadLineDS("train.txt",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
os.system("chmod 755 train.py ; ./train.py")
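# Minimal sanity-check sketch, assuming ./train.py above has completed and left the
# trained model plus tokenizer under the tgt directory: query the masked-LM with
# transformers' fill-mask pipeline.  The sentence "孟子[MASK]梁惠王" and top_k=5 are
# arbitrary illustrative choices, not taken from the recipe above.
from transformers import pipeline
fmp=pipeline("fill-mask",model=tgt)
print(fmp("孟子[MASK]梁惠王",top_k=5))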