#! /usr/bin/python3
#pip3 install transformers accelerate deepspeed triton esupar
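# Pretrain a character-level ModernBERT masked language model on the Kanripo
# classical-Chinese corpus, normalized to traditional characters.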
tgt="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
import os
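# Fetch the Kanripo corpus on first run; the clone is skipped if Kanripo-data already exists.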
os.system("""
if test -d Kanripo-data
then :
else git clone --depth=1 https://github.com/DHSinology/Kanripo-data
fi""")
if not os.path.isfile("train.txt"):
  import glob,subprocess
  from esupar.tradify import tradify
  with open("train.txt","w",encoding="utf-8") as w:
    s='''
{
  s=$0;
  while(s>""){
    if(f==1){
      i=index(s,"</hi>");
      if(i>0){
        h=h substr(s,1,i-1)"\\n";
        s=substr(s,i+5);
        f=0;
      }
      else{
        h=h s;
        s="";
      }
    }
    else{
      i=index(s,"<hi>");
      if(i>0){
        t=substr(s,1,i-1);
        gsub(/<[^>]+>/,"",t);
        gsub(/([ -~]|　|\\t)/,"",t);
        printf("%s",t);
        s=substr(s,i+4);
        f=1;
      }
      else{
        gsub(/<[^>]+>/,"",s);
        gsub(/([ -~]|　|\\t)/,"",s);
        printf("%s",s);
        s="";
      }
    }
  }
  printf("\\n");
}
END{
  gsub(/<[^>]+>/,"",h);
  gsub(/([ -~]|　|\\t)/,"",h);
  printf("%s\\n",h);
}'''
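    # Run nawk over every Kanripo XML file and capture the extracted text.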
    t=subprocess.run(["nawk",s]+glob.glob("Kanripo-data/data/*/*.xml"),stdout=subprocess.PIPE,encoding="utf-8").stdout
    u=""
    for s in t.split("\n"):
      s=s.strip()
      if len(s)+len(u)>=8200:
        print(u,file=w)
        u=""
      for c in s:
        u+=tradify[c] if c in tradify else c
    print(u,file=w)

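# Build a character-level vocabulary from train.txt and wrap it in a
# BertTokenizerFast that emits exactly one token per character.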
from transformers import BertTokenizerFast
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
if not os.path.isfile("vocab.txt"):
  with open("train.txt","r",encoding="utf-8") as r:
    v=set(c for c in r.read() if not c.isspace())
  with open("vocab.txt","w",encoding="utf-8") as w:
    print("\n".join(s+sorted(v)),file=w)
tkz=BertTokenizerFast(vocab_file="vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)

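# Write train.py (to be run under deepspeed): pretrain ModernBertForMaskedLM from
# scratch, reusing the modernbert-small-japanese-char configuration resized to the
# new character vocabulary.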
with open("train.py","w",encoding="utf-8") as w:
  print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import BertTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=BertTokenizerFast.from_pretrained(tgt)
c={"trust_remote_code":True,"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
  c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-small-japanese-char",**c)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
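# One training example per line of train.txt, tokenized on the fly and truncated
# to the model's maximum length.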
class ReadLineDS(object):
  def __init__(self,file,tokenizer):
    self.tokenizer=tokenizer
    with open(file,"r",encoding="utf-8") as r:
      self.lines=[s.strip() for s in r if s.strip()>""]
  __len__=lambda self:len(self.lines)
  __getitem__=lambda self,i:self.tokenizer(self.lines[i],truncation=True,add_special_tokens=True,max_length=8190)
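# Masked-language-model pretraining with DataCollatorForLanguageModeling's default masking.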
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=ReadLineDS("train.txt",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
os.system("chmod 755 train.py ; ./train.py")