# Install the libraries needed for data loading, LoRA fine-tuning, and training
# (accelerate is required by the Hugging Face Trainer).
!pip install datasets peft transformers accelerate

# Authenticate with the Hugging Face Hub using a token stored as a Colab secret
# (saved here under the name 'Cli2'). Llama 3.2 is a gated model, so the account
# behind the token must already have been granted access.
from google.colab import userdata
from huggingface_hub import login

my_secret_key = userdata.get('Cli2')
login(my_secret_key)

# Directory where training checkpoints and the final adapter will be saved.
model_output = "./BudgetAdvisor"

# Load the finance instruction-tuning dataset from the Hub.
from datasets import load_dataset

dataset = load_dataset("gbharti/finance-alpaca")

# Keep only the 'instruction' and 'output' columns; 'text' and 'input' go unused.
dataset = dataset.remove_columns(["text", "input"])

# Hold out 10% of the examples for evaluation.
dataset = dataset["train"].train_test_split(test_size=0.1)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]
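
# Optional sanity check: confirm the split sizes and inspect one raw record.
print(len(train_dataset), len(eval_dataset))
print(train_dataset[0])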

# Load the base model and its tokenizer.
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
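
# Optional: confirm the model size (roughly 1.2B parameters for Llama-3.2-1B).
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")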

# Llama has no padding token by default; reuse the EOS token for padding.
# No new token is added this way, so the embedding matrix needs no resizing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Trade compute for memory: recompute activations during the backward pass.
model.gradient_checkpointing_enable()
# With gradient checkpointing the KV cache must be disabled during training, and
# the inputs must require grads so gradients can flow to the LoRA adapters
# through the frozen base layers.
model.config.use_cache = False
model.enable_input_require_grads()

# Configure LoRA: train small low-rank adapters instead of the full model.
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,                                  # rank of the low-rank update matrices
    lora_alpha=16,                        # scaling factor for the adapter output
    target_modules=["q_proj", "v_proj"],  # adapt the attention query/value projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapters and move it to the GPU if one is available.
import torch

model = get_peft_model(model, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
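
# Only the adapter weights should be trainable now; this prints their count
# (a small fraction of the base model's parameters).
model.print_trainable_parameters()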

# Build prompt/target pairs. The dataset's 'input' column was dropped above, so
# the prompt is built from the instruction alone, ending with an "Answer:" cue
# that the model learns to complete with the reference output.
def preprocess_data(examples):
    inputs = [f"Instruction: {instr}\nAnswer: " for instr in examples['instruction']]
    targets = examples['output']
    return {'input_text': inputs, 'target_text': targets}

train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)
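
# Quick check that the template rendered as intended.
print(train_dataset[0]['input_text'])
print(train_dataset[0]['target_text'])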

# Tokenize for causal-LM training: concatenate prompt and answer into a single
# sequence and reuse its token ids as the labels, masking padding positions with
# -100 so they are ignored by the loss. Tokenizing prompts and targets separately
# would misalign the labels for a decoder-only model.
def tokenize_data(examples):
    texts = [inp + tgt + tokenizer.eos_token
             for inp, tgt in zip(examples['input_text'], examples['target_text'])]
    model_inputs = tokenizer(
        texts,
        max_length=256,  # prompt and answer share the window, so allow extra room
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Tokenize both splits and drop the raw text columns.
train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_data, batched=True, remove_columns=eval_dataset.column_names)

# Return PyTorch tensors when indexing the datasets.
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")
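
# Optional check: each example now carries input_ids, attention_mask, and labels.
print(train_dataset.column_names)
print(train_dataset[0]["input_ids"].shape)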

# Training configuration: evaluate and checkpoint every 500 steps, keep only the
# two most recent checkpoints, and reload the best one (by eval loss) at the end.
# fp16 assumes a CUDA GPU; disable it when running on CPU. Note that newer
# transformers releases rename evaluation_strategy to eval_strategy.
training_args = TrainingArguments(
    output_dir=model_output,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

print("Trainer is set up!")
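
# Optional: record a baseline eval loss before fine-tuning for comparison.
print(trainer.evaluate())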

# Fine-tune, then save the LoRA adapter and tokenizer.
trainer.train()
print("Model trained!")

trainer.save_model(model_output)
tokenizer.save_pretrained(model_output)
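
# Illustrative inference sketch: reload the saved adapter on top of the base
# model and generate an answer. The prompt is a made-up example and should
# follow the same "Instruction: ...\nAnswer: " template used in training.
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
tuned = PeftModel.from_pretrained(base, model_output).to(device)
tuned.eval()

prompt = "Instruction: How do I start a monthly budget?\nAnswer: "
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = tuned.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))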

# Zip the saved model directory so it can be downloaded from Colab.
!zip -r BudgetAdvisor.zip ./BudgetAdvisor