The main code below is based on [akpe12/JP-KR-ocr-translator-for-travel](https://github.com/akpe12/JP-KR-ocr-translator-for-travel).

## Import

In [1]:
import csv
from typing import Dict, List

import datasets
import torch
import wandb
from datasets import load_dataset
from transformers import (
    PreTrainedTokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    BertJapaneseTokenizer,
    Trainer
)
from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel

# Pretrained checkpoint names; these are passed to `from_pretrained(...)` for the tokenizers
# and to `EncoderDecoderModel.from_encoder_decoder_pretrained(...)` for the model.
# Alternative encoder that was tried (kept for reference):
# encoder_model_name = "xlm-roberta-base"
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"  # source-side (Japanese) encoder
decoder_model_name = "skt/kogpt2-base-v2"  # target-side decoder checkpoint

In [2]:
# Use the first CUDA GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# To force CPU execution, uncomment:
# device = torch.device("cpu")

# Display the selected device together with the number of visible GPUs.
device, torch.cuda.device_count()

(device(type='cpu'), 0)

In [3]:
class GPT2Tokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer whose special-token builder appends the EOS token id."""

    def build_inputs_with_special_tokens(self, token_ids: List[int]) -> List[int]:
        """Return a new list: ``token_ids`` followed by ``self.eos_token_id``.

        Args:
            token_ids: Token ids of a single already-tokenized sequence.

        Returns:
            The concatenation of ``token_ids`` and a one-element list holding the EOS id.
        """
        eos_suffix = [self.eos_token_id]
        return token_ids + eos_suffix

# Source-side tokenizer, loaded from the encoder checkpoint.
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)

# Target-side tokenizer, loaded from the decoder checkpoint.
# BOS and EOS are deliberately given the same '</s>' string.
trg_tokenizer = GPT2Tokenizer.from_pretrained(
    decoder_model_name,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

## Data

In [4]:
# Load the `sappho192/Tatoeba-Challenge-jpn-kor` dataset by its identifier.
dataset = load_dataset("sappho192/Tatoeba-Challenge-jpn-kor")
# Alternative: load the same dataset from a local checkout instead.
# dataset = load_dataset("D:\\REPO\\Tatoeba-Challenge-jpn-kor")

# Raw (untokenized) splits; rows are later read via their 'sourceString' / 'targetString' fields.
train_dataset = dataset['train']
test_dataset = dataset['test']

# First row of each split, kept for quick manual inspection.
train_first_row = train_dataset[0]
test_first_row = test_dataset[0]

In [5]:
class PairedDataset:
    """Map-style dataset of (source, target) sentence pairs, tokenized on access.

    The pairs come from exactly one of two places:

    * ``file_path``: a CSV file with a header row followed by two-column rows
      ``(source, target)``; or
    * ``dataset_raw``: an already loaded ``datasets.Dataset`` whose rows expose
      ``'sourceString'`` and ``'targetString'`` fields.

    Args:
        source_tokenizer: Tokenizer applied to the source sentence.
        target_tokenizer: Tokenizer applied to the target sentence; its
            ``build_inputs_with_special_tokens`` is applied to the target ids.
        file_path: Path to the CSV file. Takes precedence over ``dataset_raw``.
        dataset_raw: Pre-loaded dataset used when ``file_path`` is ``None``.
        encoding: Text encoding used to read ``file_path``. Defaults to UTF-8 so
            that Japanese/Korean text is read the same way on every platform
            instead of depending on the locale's default encoding.

    Raises:
        ValueError: If neither ``file_path`` nor ``dataset_raw`` is given.
    """

    def __init__(self, 
        source_tokenizer: PreTrainedTokenizerFast, target_tokenizer: PreTrainedTokenizerFast,
        file_path: str = None,
        dataset_raw: datasets.Dataset = None,
        encoding: str = 'utf-8'
    ):
        self.src_tokenizer = source_tokenizer
        self.trg_tokenizer = target_tokenizer

        if file_path is not None:
            # newline='' is what the csv module requires for correct handling of
            # quoted fields that contain line breaks.
            with open(file_path, 'r', encoding=encoding, newline='') as fd:
                reader = csv.reader(fd)
                # Skip the header row; the default avoids StopIteration on an empty file.
                next(reader, None)
                # csv.reader yields [] for blank lines; drop them so they cannot
                # fail later when a row is unpacked into (src, trg).
                self.data = [row for row in reader if row]
        elif dataset_raw is not None:
            self.data = dataset_raw
        else:
            raise ValueError('file_path or dataset_raw must be specified')

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        """Tokenize and return the pair stored at ``index``.

        Returns:
            The source tokenizer's output (``'input_ids'`` without attention mask
            or token type ids) with an added ``'labels'`` entry holding the target
            token ids after ``build_inputs_with_special_tokens``.

        Raises:
            ValueError: If a CSV row does not contain exactly two columns.
        """
        # Fetch the row once; indexing the backing store twice repeats the lookup.
        row = self.data[index]
        if isinstance(self.data, datasets.Dataset):
            src, trg = row['sourceString'], row['targetString']
        else:
            src, trg = row

        embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)
        target_ids = self.trg_tokenizer(trg, return_attention_mask=False)['input_ids']
        embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(target_ids)

        return embeddings

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

In [6]:
# Locations of optional local CSV datasets. In the visible notebook they are only
# referenced by the commented-out alternatives below.
DATA_ROOT = './output'  # directory that holds the CSV files
FILE_FFAC_FULL = 'ffac_full.csv'
FILE_FFAC_TEST = 'ffac_test.csv'
FILE_JA_KO_TRAIN = 'ja_ko_train.csv'
FILE_JA_KO_TEST = 'ja_ko_test.csv'

# Alternative 1: train/evaluate on the FFAC CSV files.
# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_FULL}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_TEST}') 

# Alternative 2: train/evaluate on the ja_ko CSV files.
# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TEST}')

In [7]:
# Wrap the raw splits so that items are tokenized on access.
# The splits are taken from `dataset` directly rather than from the `train_dataset`
# name this cell rebinds: that keeps the cell idempotent, whereas wrapping
# `train_dataset` would nest a PairedDataset inside a PairedDataset on a second run.
train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=dataset['train'])
eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=dataset['test'])

# Show one tokenized evaluation example as a sanity check.
eval_dataset[0]

{'input_ids': [2, 33, 2181, 1402, 893, 15200, 893, 13507, 881, 933, 882, 829, 3], 'labels': [9085, 10936, 10993, 23363, 9134, 18368, 8006, 389, 1]}

In [8]:
# If you encounter "ValueError: too many values to unpack (expected 2)" at `src, trg = self.data[index]`,
# check the column count of every row in the CSV dataset.
# A likely cause is building the file with `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv`:
# the last row of the first CSV and the first row of the second CSV get merged into one line
# (e.g. when the first file lacks a trailing newline), which creates a third column and raises the ValueError.
# debug_data = train_dataset.data


## Model

In [9]:
# Build the encoder-decoder model from the two pretrained checkpoints
# (`encoder_model_name` for the encoder, `decoder_model_name` for the decoder).
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model_name,
    decoder_model_name,
    # NOTE(review): the target tokenizer's BOS id (the '</s>' token, which is also its EOS)
    # is passed as the pad token id instead of the id of '<pad>' — presumably intentional; confirm.
    pad_token_id=trg_tokenizer.bos_token_id,
)
# The decoder start token is set to the target tokenizer's BOS token.
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transfo

In [11]:
# Trainer setup: batch collation, experiment tracking, hyperparameters and the Trainer itself.
# `wandb` is imported with the other dependencies in the notebook's import cell.

# Collator that batches the variable-length examples produced by PairedDataset.
collate_fn = DataCollatorForSeq2Seq(src_tokenizer, model)

# Start the Weights & Biases run that the Trainer reports to (see `report_to` below).
wandb.init(project="fftr-poc1", name='jbert+kogpt2')

arguments = Seq2SeqTrainingArguments(
    output_dir='dump',
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    # num_train_epochs=25,
    per_device_train_batch_size=1,
    # per_device_train_batch_size=30, # takes 40GB
    # per_device_train_batch_size=64,
    per_device_eval_batch_size=1,
    # per_device_eval_batch_size=30,
    # per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    # Accumulate gradients over 4 steps before each optimizer update.
    gradient_accumulation_steps=4,
    save_total_limit=5,
    dataloader_num_workers=1,
    # fp16=True, # ENABLE if CUDA is enabled
    load_best_model_at_end=True,
    report_to='wandb'
)

trainer = Trainer(
    model,
    arguments,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.10972568578553615, max=1.…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888884685, max=1.0…

## Training

In [None]:
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base",  "skt/kogpt2-base-v2")

In [12]:
# Run the training loop configured on `trainer`.
trainer.train()

# Persist the model and both tokenizers side by side under dump/best_model.
model.save_pretrained("dump/best_model")
src_tokenizer.save_pretrained("dump/best_model/src_tokenizer")
trg_tokenizer.save_pretrained("dump/best_model/trg_tokenizer")

  0%|          | 0/9671328 [00:00<?, ?it/s]

In [2]:
# import wandb
# wandb.finish()