dejanseo committed
Commit
31def30
·
verified ·
1 Parent(s): aa8dfa4

Upload train2.py

Files changed (1)
  1. train2.py +124 -0
train2.py ADDED
@@ -0,0 +1,124 @@
import os
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

# Set paths
RAW_CSV = "data.csv"
CACHE_DIR = "./cached_deberta_dataset"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Load or process dataset
if os.path.exists(CACHE_DIR):
    print("📦 Loading cached dataset...")
    dataset = load_from_disk(CACHE_DIR)
    train_ds, val_ds = dataset["train"], dataset["test"]
else:
    print("🔧 Processing and caching dataset...")
    df = pd.read_csv(RAW_CSV)
    df = df[["text", "organic"]]
    df["organic"] = df["organic"].astype(int)

    data = {
        "text": df["text"].tolist(),
        "label": df["organic"].tolist()
    }

    full_dataset = Dataset.from_dict(data)
    dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

    def tokenize(batch):
        tokenized = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
        tokenized["label"] = batch["label"]
        return tokenized

    dataset = dataset.map(tokenize, batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    dataset.save_to_disk(CACHE_DIR)
    train_ds, val_ds = dataset["train"], dataset["test"]

# Calculate class weights from training labels
train_labels = np.array(train_ds["label"])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-small", num_labels=2)

# Custom Trainer with weighted loss
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./ai-small-weighted",
    evaluation_strategy="steps",
    eval_steps=5000,
    save_strategy="steps",
    save_steps=5000,
    save_total_limit=20,
    logging_steps=10,
    per_device_train_batch_size=48,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=1e-6,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
)

# Trainer
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Train and save
trainer.train()
trainer.save_model("./ai-small-weighted/final_model")
tokenizer.save_pretrained("./ai-small-weighted/final_model")
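
A minimal inference sketch for the saved checkpoint, assuming the run above completed and the model exists at ./ai-small-weighted/final_model (path taken from the script); the sample text and probability labels are illustrative only:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned classifier and its tokenizer from the directory saved by the script
model_dir = "./ai-small-weighted/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

# Classify one text; in training, label 1 came from organic == 1
text = "Example passage to classify."  # illustrative input
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]
print({"organic": probs[1].item(), "not_organic": probs[0].item()})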