Spaces:
Build error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "Unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)

# Handle NaN values: drop rows where the label is NaN
dataset = dataset.dropna(subset=['label'])

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)
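# Optional sanity check (a minimal sketch; it assumes the CSV exposes the
# 'puzzle', 'truth', 'text' and 'label' columns used below):
print(dataset['label'].value_counts())  # class distribution after mapping
assert {'puzzle', 'truth', 'text'} <= set(dataset.columns)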
# Format puzzle, truth, text into rows
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)
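# Note (assumption): if any of 'puzzle', 'truth' or 'text' can be missing, the string
# concatenation above yields NaN for that row. Filling those columns before building
# combined_text avoids silently broken rows at tokenization time:
# dataset[['puzzle', 'truth', 'text']] = dataset[['puzzle', 'truth', 'text']].fillna("")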
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
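# Two optional tweaks (assumptions, not required for the script to run):
# - stratify the split so every class appears in both sets:
#     train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['label'])
# - pass preserve_index=False to Dataset.from_pandas to avoid carrying the
#   '__index_level_0__' column into the datasets.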
# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
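# Two common adjustments for a large decoder-only checkpoint like Gemma (a sketch,
# assuming your hardware and transformers version support them):
# - load in bf16 to reduce memory, e.g.
#     AutoModelForSequenceClassification.from_pretrained(
#         model_name, num_labels=5, torch_dtype=torch.bfloat16)  # requires `import torch`
# - make sure the padding id is set for the classification head; Gemma's config
#   usually already defines it, so this is just defensive:
model.config.pad_token_id = tokenizer.pad_token_id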
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
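# Note: max_length=128 may truncate long puzzle/truth/text combinations; a larger
# value keeps more context at the cost of memory and speed.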
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)
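# Assumption: targeting only q_proj/v_proj is a conservative choice. Gemma's attention
# and MLP blocks also expose k_proj, o_proj, gate_proj, up_proj and down_proj, which are
# often added for more adapter capacity, e.g.:
# target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]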
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
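# When gradient_checkpointing=True is combined with a PEFT-wrapped model, the inputs
# usually need to require gradients, otherwise training can fail with
# "element 0 of tensors does not require grad". A common workaround (assumption:
# your transformers/peft versions expose this helper):
model.enable_input_require_grads()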
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
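# Version/runtime notes (assumptions, adjust to your environment):
# - recent transformers releases renamed `evaluation_strategy` to `eval_strategy`;
#   if the build fails on that argument, switch the name.
# - Gemma-2 is generally more stable in bf16 than fp16; on Ampere or newer GPUs,
#   prefer bf16=True over fp16=True.
# - report_to="wandb" requires the wandb package and a WANDB_API_KEY in the
#   environment; use report_to="none" if you do not want experiment tracking.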
def compute_metrics(eval_pred):
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
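# Minor design note: evaluate.load() rebuilds each metric every time compute_metrics
# runs; loading the four metrics once at module level avoids repeating that work on
# every evaluation epoch.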
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
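# Optionally pass the tokenizer to the Trainer so it is saved alongside checkpoints
# (the argument is `processing_class` in recent transformers versions, `tokenizer`
# in older ones); it is not strictly needed here because the data is already padded
# to max_length.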
# Train the model
trainer.train()

# Save the model
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')
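# Note: with PEFT, save_pretrained() writes only the LoRA adapter weights, not the
# full model. To export a standalone classifier you could merge the adapter first
# (assumption: peft's merge_and_unload is available for this model type):
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained('trained_gemma_model_merged')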
# Evaluate the model and surface the final metrics in the logs
eval_results = trainer.evaluate()
print(eval_results)