#! /usr/bin/python3
#pip3 install transformers accelerate deepspeed triton esupar
tgt="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
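# Pretraining pipeline (all stages are in this one script):
#   1. fetch the Kanripo XML corpus,
#   2. extract plain text, convert it to traditional characters, and write train.txt,
#   3. build a character-level vocab.txt / BertTokenizerFast and save it under tgt,
#   4. generate train.py and run it under DeepSpeed to pretrain ModernBertForMaskedLM.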
import os
os.system("""
if test -d Kanripo-data
then :
else git clone --depth=1 https://github.com/DHSinology/Kanripo-data
fi""")
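# Stage 2: build train.txt (one long chunk of plain text per line) from the Kanripo XML files.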
if not os.path.isfile("train.txt"):
  import glob,subprocess
  from esupar.tradify import tradify
  with open("train.txt","w",encoding="utf-8") as w:
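    # nawk program: for each XML line, drop markup (<...>) and ASCII/whitespace and print
    # the remaining text; text inside <hi>...</hi> is collected into h and emitted at END.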
    s='''
{
s=$0;
while(s>""){
if(f==1){
i=index(s,"</hi>");
if(i>0){
h=h substr(s,1,i-1)"\\n";
s=substr(s,i+5);
f=0;
}
else{
h=h s;
s="";
}
}
else{
i=index(s,"<hi>");
if(i>0){
t=substr(s,1,i-1);
gsub(/<[^>]+>/,"",t);
gsub(/([ -~]| |\\t)/,"",t);
printf("%s",t);
s=substr(s,i+4);
f=1;
}
else{
gsub(/<[^>]+>/,"",s);
gsub(/([ -~]| |\\t)/,"",s);
printf("%s",s);
s="";
}
}
}
printf("\\n");
}
END{
gsub(/<[^>]+>/,"",h);
gsub(/([ -~]| |\\t)/,"",h);
printf("%s\\n",h);
}'''
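    # Run nawk over every Kanripo XML file, map each character to its traditional form
    # via esupar's tradify table, and pack the text into output lines of roughly 8200
    # characters at most.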
    t=subprocess.run(["nawk",s]+glob.glob("Kanripo-data/data/*/*.xml"),stdout=subprocess.PIPE,encoding="utf-8").stdout
    u=""
    for s in t.split("\n"):
      s=s.strip()
      if len(s)+len(u)>=8200:
        print(u,file=w)
        u=""
      for c in s:
        u+=tradify[c] if c in tradify else c
    print(u,file=w)
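# Stage 3: character-level tokenizer.  vocab.txt lists the special tokens plus every
# non-space character seen in train.txt; the pre-tokenizer isolates single characters
# and the continuing-subword/decoder prefixes are cleared so tokens are bare characters,
# e.g. tkz.tokenize("子曰學而時習之") should give one token per character (assuming each
# character occurs in the training text).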
from transformers import BertTokenizerFast
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
if not os.path.isfile("vocab.txt"):
  with open("train.txt","r",encoding="utf-8") as r:
    v=set(c for c in r.read() if not c.isspace())
  with open("vocab.txt","w",encoding="utf-8") as w:
    print("\n".join(s+sorted(v)),file=w)
tkz=BertTokenizerFast(vocab_file="vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
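# Stage 4: write train.py and run it under DeepSpeed.  The model configuration is copied
# from KoichiYasuoka/modernbert-small-japanese-char, with vocab_size, tokenizer_class and
# the special-token ids overridden to match the tokenizer built above.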
with open("train.py","w",encoding="utf-8") as w:
  print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import BertTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=BertTokenizerFast.from_pretrained(tgt)
c={"trust_remote_code":True,"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
  c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-small-japanese-char",**c)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
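# One pretraining example per non-empty line of train.txt, tokenized on the fly and
# truncated to 8190 tokens.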
class ReadLineDS(object):
  def __init__(self,file,tokenizer):
    self.tokenizer=tokenizer
    with open(file,"r",encoding="utf-8") as r:
      self.lines=[s.strip() for s in r if s.strip()>""]
  __len__=lambda self:len(self.lines)
  __getitem__=lambda self,i:self.tokenizer(self.lines[i],truncation=True,add_special_tokens=True,max_length=8190)
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=ReadLineDS("train.txt",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
os.system("chmod 755 train.py ; ./train.py")
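# Optional sanity check -- an illustrative addition, not part of the original release:
# once training has produced a model under tgt, a fill-mask pipeline should restore a
# plausible character for [MASK].  The test sentence is an arbitrary example chosen here.
if os.path.isfile(os.path.join(tgt,"config.json")):
  from transformers import pipeline
  fmp=pipeline("fill-mask",model=tgt)
  print(fmp("孟子見梁惠[MASK]"))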