tgt="KoichiYasuoka/modernbert-small-classical-chinese-traditional"
import os
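# Fetch the Kanripo classical-Chinese corpus (shallow clone) unless a local copy already exists.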
os.system("""
if test -d Kanripo-data
then :
else git clone --depth=1 https://github.com/DHSinology/Kanripo-data
fi""")
if not os.path.isfile("train.txt"): |
|
import glob,subprocess |
|
from esupar.tradify import tradify |
|
with open("train.txt","w",encoding="utf-8") as w: |
|
s=''' |
|
{ |
|
s=$0; |
|
while(s>""){ |
|
if(f==1){ |
|
i=index(s,"</hi>"); |
|
if(i>0){ |
|
h=h substr(s,1,i-1)"\\n"; |
|
s=substr(s,i+5); |
|
f=0; |
|
} |
|
else{ |
|
h=h s; |
|
s=""; |
|
} |
|
} |
|
else{ |
|
i=index(s,"<hi>"); |
|
if(i>0){ |
|
t=substr(s,1,i-1); |
|
gsub(/<[^>]+>/,"",t); |
|
gsub(/([ -~]| |\\t)/,"",t); |
|
printf("%s",t); |
|
s=substr(s,i+4); |
|
f=1; |
|
} |
|
else{ |
|
gsub(/<[^>]+>/,"",s); |
|
gsub(/([ -~]| |\\t)/,"",s); |
|
printf("%s",s); |
|
s=""; |
|
} |
|
} |
|
} |
|
printf("\\n"); |
|
} |
|
END{ |
|
gsub(/<[^>]+>/,"",h); |
|
gsub(/([ -~]| |\\t)/,"",h); |
|
printf("%s\\n",h); |
|
}''' |
|
t=subprocess.run(["nawk",s]+glob.glob("Kanripo-data/data/*/*.xml"),stdout=subprocess.PIPE,encoding="utf-8").stdout |
|
u="" |
|
for s in t.split("\n"): |
|
s=s.strip() |
|
if len(s)+len(u)>=8200: |
|
print(u,file=w) |
|
u="" |
|
for c in s: |
|
u+=tradify[c] if c in tradify else c |
|
print(u,file=w) |
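# Build a single-character vocabulary from train.txt and wrap it in a
# BertTokenizerFast whose pre-tokenizer splits every input into isolated
# characters, so each Han character becomes one token.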
from transformers import BertTokenizerFast
from tokenizers.pre_tokenizers import Sequence,Whitespace,Split
from tokenizers import Regex
s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
if not os.path.isfile("vocab.txt"):
  with open("train.txt","r",encoding="utf-8") as r:
    v=set(c for c in r.read() if not c.isspace())
  with open("vocab.txt","w",encoding="utf-8") as w:
    print("\n".join(s+sorted(v)),file=w)
tkz=BertTokenizerFast(vocab_file="vocab.txt",never_split=s,do_lower_case=False,strip_accents=False,tokenize_chinese_chars=True)
tkz.backend_tokenizer.pre_tokenizer=Sequence([Whitespace(),Split(Regex("."),"isolated")])
tkz.backend_tokenizer.decoder.prefix=tkz.backend_tokenizer.model.continuing_subword_prefix=""
tkz.save_pretrained(tgt)
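# Write train.py, a DeepSpeed-launchable script that pretrains a ModernBERT
# masked-language model from scratch on train.txt, borrowing its configuration
# from modernbert-small-japanese-char but with the vocabulary built above.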
with open("train.py","w",encoding="utf-8") as w: |
|
print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+''' |
|
from transformers import BertTokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer |
|
tkz=BertTokenizerFast.from_pretrained(tgt) |
|
c={"trust_remote_code":True,"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__} |
|
for k,v in tkz.special_tokens_map.items(): |
|
c[k+"_id"]=tkz.convert_tokens_to_ids(v) |
|
cfg=AutoConfig.from_pretrained("KoichiYasuoka/modernbert-small-japanese-char",**c) |
|
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False) |
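# ReadLineDS feeds train.txt to the Trainer line by line, tokenizing each
# line on the fly and truncating it to at most 8190 tokens.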
class ReadLineDS(object):
  def __init__(self,file,tokenizer):
    self.tokenizer=tokenizer
    with open(file,"r",encoding="utf-8") as r:
      self.lines=[s.strip() for s in r if s.strip()>""]
  __len__=lambda self:len(self.lines)
  __getitem__=lambda self,i:self.tokenizer(self.lines[i],truncation=True,add_special_tokens=True,max_length=8190)
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=ReadLineDS("train.txt",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
os.system("chmod 755 train.py ; ./train.py") |