import os
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

# Set paths
RAW_CSV = "data.csv"
CACHE_DIR = "./cached_deberta_dataset"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Load or process dataset
if os.path.exists(CACHE_DIR):
    print("📦 Loading cached dataset...")
    dataset = load_from_disk(CACHE_DIR)
    train_ds, val_ds = dataset["train"], dataset["test"]
else:
    print("🔧 Processing and caching dataset...")
    df = pd.read_csv(RAW_CSV)
    df = df[["text", "organic"]]
    df["organic"] = df["organic"].astype(int)

    data = {
        "text": df["text"].tolist(),
        "label": df["organic"].tolist()
    }
    full_dataset = Dataset.from_dict(data)
    dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

    def tokenize(batch):
        tokenized = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
        tokenized["label"] = batch["label"]
        return tokenized

    dataset = dataset.map(tokenize, batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    dataset.save_to_disk(CACHE_DIR)
    train_ds, val_ds = dataset["train"], dataset["test"]

# Calculate class weights from training labels
train_labels = np.array(train_ds["label"])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small", num_labels=2
)

# Custom Trainer with weighted loss
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./ai-small-weighted",
    evaluation_strategy="steps",
    eval_steps=5000,
    save_strategy="steps",
    save_steps=5000,
    save_total_limit=20,
    logging_steps=10,
    per_device_train_batch_size=48,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=1e-6,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
)

# Trainer
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Train and save
trainer.train()
trainer.save_model("./ai-small-weighted/final_model")
tokenizer.save_pretrained("./ai-small-weighted/final_model")
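
# --- Quick inference sanity check ---
# A minimal sketch, assuming the training run above completed and the fine-tuned
# model was saved to ./ai-small-weighted/final_model. The predict() helper and the
# example sentence below are illustrative additions, not part of the training script.
def predict(texts, model_dir="./ai-small-weighted/final_model"):
    tok = AutoTokenizer.from_pretrained(model_dir)
    clf = AutoModelForSequenceClassification.from_pretrained(model_dir)
    clf.eval()
    enc = tok(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        probs = torch.softmax(clf(**enc).logits, dim=-1)
    # Column 1 corresponds to label 1, i.e. the "organic" class used during training
    return probs[:, 1].tolist()

# Example usage (hypothetical input text):
# print(predict(["This post reads like it was written by a person."]))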