from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# Config
model_name = "google/flan-t5-small"
data_path = "data/final_coding_dataset.jsonl"
# Load dataset
dataset = load_dataset("json", data_files=data_path, split="train")
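# Each JSONL record is assumed (from the keys used in format_example below) to
# carry a "prompt" and a "completion" field, e.g.:
#   {"prompt": "Write a function that reverses a string.", "completion": "def reverse(s):\n    return s[::-1]"}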
# Format data for T5
def format_example(example):
    return {
        "input_text": f"Question: {example['prompt']}",
        "target_text": example["completion"],
    }

dataset = dataset.map(format_example)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)
    # Mask padding token ids in the labels with -100 so they are ignored by the loss
    input_enc["labels"] = [
        [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in seq]
        for seq in target_enc["input_ids"]
    ]
    return input_enc

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
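# Quick sanity check (a minimal sketch, assuming the steps above ran as-is):
# decode the first tokenized input to confirm the "Question: ..." framing survived.
print(tokenizer.decode(dataset[0]["input_ids"], skip_special_tokens=True))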
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Training args
training_args = TrainingArguments(
    output_dir="model/codementor-flan",
    num_train_epochs=6,  # train for a fixed number of epochs rather than a step budget
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    fp16=False,  # keep full precision; safe default on CPU or older GPUs
)
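# Note: with these settings the effective batch size per optimizer step is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 2 = 4.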
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
# Train
trainer.train()

# Save final model
model.save_pretrained("model/codementor-flan")
tokenizer.save_pretrained("model/codementor-flan")
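# A minimal inference sketch (assumed usage, not part of the original script):
# reload the saved checkpoint and generate a completion for a new prompt.
# The prompt below is a hypothetical example, framed the same way as training inputs.
loaded_tokenizer = AutoTokenizer.from_pretrained("model/codementor-flan")
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("model/codementor-flan")

prompt = "Question: Write a Python function that checks if a number is prime."
inputs = loaded_tokenizer(prompt, return_tensors="pt")
output_ids = loaded_model.generate(**inputs, max_new_tokens=128)
print(loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True))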