# This file was our attempt at training the model, which ultimately failed.

!pip install datasets peft transformers
# Authenticate with Hugging Face using a token stored as a Colab secret.
from google.colab import userdata
my_secret_key = userdata.get('Cli2')
from huggingface_hub import login
login(my_secret_key)

# Name of the fine-tuned model and its output folder.
model_output = "./BudgetAdvisor"

# Dataset loading and manipulation.
from datasets import load_dataset
dataset = load_dataset("gbharti/finance-alpaca") # features: ['text', 'instruction', 'input', 'output']
# Drop the unused 'text' and 'input' columns.
dataset = dataset.remove_columns(["text", "input"])
# Split the dataset into train and test sets: 90% for training, 10% for testing.
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
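
# Quick sanity check on the split sizes (roughly 90/10 of the original data).
print(f"Training examples: {len(train_dataset)}, evaluation examples: {len(eval_dataset)}")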

# Tokenizer and model settings.
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# The tokenizer may not define a pad token; reuse EOS so sequences can be padded to equal length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# For memory efficiency.
model.gradient_checkpointing_enable()

# Parameter-Efficient Fine-Tuning
from peft import LoraConfig, get_peft_model
# Define a PEFT configuration for LoRA
lora_config = LoraConfig(
    r=8,  # Reduced rank for faster training
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with LoRA adapters and move it to the GPU if one is available.
import torch
model = get_peft_model(model, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
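
# Optional check: PEFT's print_trainable_parameters() reports how small the
# LoRA-trainable parameter count is compared to the frozen base model.
model.print_trainable_parameters()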

# Preprocessing: build the prompt from the instruction (the 'input' column was
# dropped above) and keep the answer as the target text.
def preprocess_data(examples):
    inputs = [f"Instruction: {instr}\n" for instr in examples['instruction']]
    targets = list(examples['output'])
    return {'input_text': inputs, 'target_text': targets}

train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)

# Tokenization: train the causal LM on the full prompt + answer text and mask
# padding positions with -100 so they are ignored by the loss.
def tokenize_data(examples):
    full_texts = [
        prompt + target + tokenizer.eos_token
        for prompt, target in zip(examples['input_text'], examples['target_text'])
    ]
    model_inputs = tokenizer(
        full_texts,
        max_length=128,  # Reduced max_length for faster processing
        truncation=True,
        padding="max_length"
    )
    # Use input_ids as labels, with padding masked out of the loss.
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_data, batched=True, remove_columns=eval_dataset.column_names)

# Set the format for PyTorch tensors
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")
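
# Optional sanity check: each example should now be a dict of 128-token tensors.
sample = train_dataset[0]
print({name: tuple(tensor.shape) for name, tensor in sample.items()})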

# Training arguments and trainer.
training_args = TrainingArguments(
    output_dir=model_output, # "./BudgetAdvisor"
    per_device_train_batch_size=8,  # Increase if GPU memory allows
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=3,  # Increased epochs for better training
    learning_rate=5e-5,
    fp16=True,  # Enable mixed precision for faster training
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",  # Disable reporting to third-party services
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Message for testing.
print("Trainer is set up!")

# Train the model and save it together with the tokenizer.
trainer.train()
print("Model trained!")
trainer.save_model(model_output)  # "./BudgetAdvisor"
tokenizer.save_pretrained(model_output)  # "./BudgetAdvisor"

!zip -r BudgetAdvisor.zip ./BudgetAdvisor
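
# Optional: a minimal inference sketch (assumes the adapter saved above and the same
# base checkpoint; the prompt text is only an illustrative example).
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
finetuned = PeftModel.from_pretrained(base_model, model_output).to(device)
finetuned.eval()
prompt = "Instruction: How should I split my monthly budget between savings and expenses?\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = finetuned.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))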