Spaces:
Build error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "Unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)

# Handle NaN values: drop rows where the label is NaN
dataset = dataset.dropna(subset=['label'])

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)
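# Optional sanity check (a minimal sketch; it assumes the CSV exposes the
# 'puzzle', 'truth', 'text' and 'label' columns used below):
print(dataset['label'].value_counts())  # class distribution after mapping
assert {'puzzle', 'truth', 'text'} <= set(dataset.columns)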
# Format puzzle, truth, text into rows
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)
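# Note (assumption): if any of 'puzzle', 'truth' or 'text' can be missing, the string
# concatenation above yields NaN for that row. Filling those columns before building
# combined_text avoids silently broken rows at tokenization time:
# dataset[['puzzle', 'truth', 'text']] = dataset[['puzzle', 'truth', 'text']].fillna("")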
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
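# Two optional tweaks (assumptions, not required for the script to run):
# - stratify the split so every class appears in both sets:
#     train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['label'])
# - pass preserve_index=False to Dataset.from_pandas to avoid carrying the
#   '__index_level_0__' column into the datasets.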
# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
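# Two common adjustments for a large decoder-only checkpoint like Gemma (a sketch,
# assuming your hardware and transformers version support them):
# - load in bf16 to reduce memory, e.g.
#     AutoModelForSequenceClassification.from_pretrained(
#         model_name, num_labels=5, torch_dtype=torch.bfloat16)  # requires `import torch`
# - make sure the padding id is set for the classification head; Gemma's config
#   usually already defines it, so this is just defensive:
model.config.pad_token_id = tokenizer.pad_token_id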
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
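# Note: max_length=128 may truncate long puzzle/truth/text combinations; a larger
# value keeps more context at the cost of memory and speed.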
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)
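# Assumption: targeting only q_proj/v_proj is a conservative choice. Gemma's attention
# and MLP blocks also expose k_proj, o_proj, gate_proj, up_proj and down_proj, which are
# often added for more adapter capacity, e.g.:
# target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]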
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
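# When gradient_checkpointing=True is combined with a PEFT-wrapped model, the inputs
# usually need to require gradients, otherwise training can fail with
# "element 0 of tensors does not require grad". A common workaround (assumption:
# your transformers/peft versions expose this helper):
model.enable_input_require_grads()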
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
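# Version/runtime notes (assumptions, adjust to your environment):
# - recent transformers releases renamed `evaluation_strategy` to `eval_strategy`;
#   if the build fails on that argument, switch the name.
# - Gemma-2 is generally more stable in bf16 than fp16; on Ampere or newer GPUs,
#   prefer bf16=True over fp16=True.
# - report_to="wandb" requires the wandb package and a WANDB_API_KEY in the
#   environment; use report_to="none" if you do not want experiment tracking.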
def compute_metrics(eval_pred):
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
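# Minor design note: evaluate.load() rebuilds each metric every time compute_metrics
# runs; loading the four metrics once at module level avoids repeating that work on
# every evaluation epoch.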
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
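# Optionally pass the tokenizer to the Trainer so it is saved alongside checkpoints
# (the argument is `processing_class` in recent transformers versions, `tokenizer`
# in older ones); it is not strictly needed here because the data is already padded
# to max_length.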
# Train the model
trainer.train()

# Save the model
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')
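# Note: with PEFT, save_pretrained() writes only the LoRA adapter weights, not the
# full model. To export a standalone classifier you could merge the adapter first
# (assumption: peft's merge_and_unload is available for this model type):
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained('trained_gemma_model_merged')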
# Evaluate the model and surface the final metrics in the logs
eval_results = trainer.evaluate()
print(eval_results)