from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# Config
model_name = "google/flan-t5-small"
data_path = "data/final_coding_dataset.jsonl"
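# Each JSONL line is expected to hold "prompt" and "completion" fields
# (see format_example below).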
# Load dataset
dataset = load_dataset("json", data_files=data_path, split="train")
# Format data for T5
def format_example(example):
    return {
        "input_text": f"Question: {example['prompt']}",
        "target_text": example["completion"],
    }
dataset = dataset.map(format_example)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize(batch):
    input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)
    # Replace pad token ids in the labels with -100 so the loss ignores
    # padded positions instead of training the model to emit pad tokens
    input_enc["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in target_enc["input_ids"]
    ]
    return input_enc
dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
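# Sanity check (added sketch, not part of the original script): decode one
# tokenized example to confirm the prompt/label pairing before training.
# Masked label positions (-100) are filtered out, as they are not valid token ids.
sample = dataset[0]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print(tokenizer.decode(sample["labels"][sample["labels"] != -100], skip_special_tokens=True))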
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Training args
training_args = TrainingArguments(
    output_dir="model/codementor-flan",
    num_train_epochs=6,  # train by epochs rather than a fixed max_steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # effective batch size of 4
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",  # disable W&B and other external loggers
    fp16=False,
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
# Train
trainer.train()
# Save final model
model.save_pretrained("model/codementor-flan")
tokenizer.save_pretrained("model/codementor-flan")
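
# Usage sketch (assumption, not part of the original script): reload the saved
# model and answer a sample question. The prompt must use the same
# "Question: ..." template applied during fine-tuning; the question text below
# is a hypothetical example.
inf_tokenizer = AutoTokenizer.from_pretrained("model/codementor-flan")
inf_model = AutoModelForSeq2SeqLM.from_pretrained("model/codementor-flan")
inputs = inf_tokenizer("Question: How do I reverse a list in Python?", return_tensors="pt")
outputs = inf_model.generate(**inputs, max_new_tokens=128)
print(inf_tokenizer.decode(outputs[0], skip_special_tokens=True))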