# CodeMentor-AI: train/train_model.py
# Fine-tunes google/flan-t5-small on data/final_coding_dataset.jsonl and saves the result to model/codementor-flan.
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# Config
model_name = "google/flan-t5-small"
data_path = "data/final_coding_dataset.jsonl"
# Load dataset
dataset = load_dataset("json", data_files=data_path, split="train")
# Format data for T5: prefix each prompt with "Question:" and use the completion as the target
def format_example(example):
    return {
        "input_text": f"Question: {example['prompt']}",
        "target_text": example["completion"],
    }

dataset = dataset.map(format_example)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)
    # Replace padding token ids in the labels with -100 so the loss ignores padded positions
    input_enc["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in target_enc["input_ids"]
    ]
    return input_enc

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Training args
training_args = TrainingArguments(
    output_dir="model/codementor-flan",
    num_train_epochs=6,              # train for a fixed number of epochs rather than a step budget
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,   # effective batch size of 2 * 2 = 4
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    fp16=False,
)
# Trainer
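# Note: on recent transformers releases, Trainer's `tokenizer=` argument is deprecated
# in favor of `processing_class=`; `tokenizer=` still works but may emit a warning.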
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
# Train
trainer.train()
# Save final model
model.save_pretrained("model/codementor-flan")
tokenizer.save_pretrained("model/codementor-flan")
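# The fine-tuned model can later be reloaded for inference, e.g.:
#   model = AutoModelForSeq2SeqLM.from_pretrained("model/codementor-flan")
#   tokenizer = AutoTokenizer.from_pretrained("model/codementor-flan")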