AvocadoMuffin committed · verified
Commit 53f26f3 · 1 Parent(s): 3368d9b

Update train.py

Files changed (1): train.py (+67 -10)
train.py CHANGED
@@ -12,9 +12,9 @@ import torch, numpy as np
 from datasets import load_dataset, Dataset, disable_caching
 from transformers import (
     AutoTokenizer, AutoModelForQuestionAnswering,
-    TrainingArguments, default_data_collator
+    TrainingArguments, default_data_collator, Trainer
 )
-from transformers import QuestionAnsweringTrainer, EvalPrediction
+# FIXED: Use regular Trainer instead of QuestionAnsweringTrainer
 from peft import LoraConfig, get_peft_model, TaskType
 import evaluate
 from huggingface_hub import login
@@ -80,7 +80,8 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
     )
     return predictions
 
-def compute_metrics(eval_pred: EvalPrediction):
+def compute_metrics(eval_pred):
+    """FIXED: Use regular eval_pred structure instead of EvalPrediction"""
     predictions = postprocess_qa(raw_val, val_feats, eval_pred.predictions, tok)
     references = [
         {"id": ex["id"], "answers": ex["answers"]} for ex in raw_val
@@ -92,7 +93,7 @@ def compute_metrics(eval_pred: EvalPrediction):
 def main():
     set_seed(SEED)
 
-    # ο£Ώ model name to store on Hub
+    # model name to store on Hub
     model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v2")
 
     if (tokn := os.getenv("roberta_token")):
@@ -124,7 +125,7 @@ def main():
 
     # ── preprocess ─────────────────────────────────────────────────────
     def preprocess(examples):
-        return tok(
+        tokenized = tok(
             examples["question"],
             examples["context"],
             truncation="only_second",
@@ -133,7 +134,62 @@ def main():
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
             padding="max_length",
-        ) | { "example_id": examples["id"] }
+        )
+
+        # FIXED: Add proper answer position computation for QA
+        sample_mapping = tokenized.pop("overflow_to_sample_mapping")
+        offset_mapping = tokenized.pop("offset_mapping")
+
+        start_positions = []
+        end_positions = []
+
+        for i, offsets in enumerate(offset_mapping):
+            input_ids = tokenized["input_ids"][i]
+            cls_index = input_ids.index(tok.cls_token_id)
+
+            sequence_ids = tokenized.sequence_ids(i)
+            sample_index = sample_mapping[i]
+            answers = examples["answers"][sample_index]
+
+            # If no answers are given, set the cls_index as answer
+            if len(answers["answer_start"]) == 0:
+                start_positions.append(cls_index)
+                end_positions.append(cls_index)
+            else:
+                # Start/end character index of the answer in the text
+                start_char = answers["answer_start"][0]
+                end_char = start_char + len(answers["text"][0])
+
+                # Start token index of the current span in the text
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+                # End token index of the current span in the text
+                token_end_index = len(input_ids) - 1
+                while sequence_ids[token_end_index] != 1:
+                    token_end_index -= 1
+
+                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index)
+                if not (offsets[token_start_index][0] <= start_char and
+                        offsets[token_end_index][1] >= end_char):
+                    start_positions.append(cls_index)
+                    end_positions.append(cls_index)
+                else:
+                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer
+                    # Note: we could go after the last offset if the answer is the last word (edge case)
+                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+                        token_start_index += 1
+                    start_positions.append(token_start_index - 1)
+
+                    while offsets[token_end_index][1] >= end_char:
+                        token_end_index -= 1
+                    end_positions.append(token_end_index + 1)
+
+        tokenized["start_positions"] = start_positions
+        tokenized["end_positions"] = end_positions
+        tokenized["example_id"] = [examples["id"][sample_mapping[i]] for i in range(len(tokenized["input_ids"]))]
+        return tokenized
 
     train_feats = train_raw.map(
         preprocess, batched=True, remove_columns=train_raw.column_names,
@@ -144,7 +200,7 @@ def main():
         num_proc=4, desc="tokenise-val"
     )
 
-    global raw_val  # for metric fn
+    global raw_val, val_feats  # for metric fn
    raw_val = val_raw
 
     # ── training args ──────────────────────────────────────────────────
@@ -156,7 +212,7 @@ def main():
         per_device_eval_batch_size=8,
         gradient_accumulation_steps=4,   # eff. BS 32
         fp16=False, bf16=True,           # L4 = bf16
-        evaluation_strategy="steps",
+        eval_strategy="steps",
         eval_steps=250,
         save_steps=500,
         save_total_limit=2,
@@ -170,7 +226,8 @@ def main():
         report_to="none",
     )
 
-    trainer = QuestionAnsweringTrainer(
+    # FIXED: Use regular Trainer instead of QuestionAnsweringTrainer
+    trainer = Trainer(
         model=model,
         args=args,
         train_dataset=train_feats,
@@ -194,4 +251,4 @@ def main():
     print("🚀 Pushed to:", f"https://huggingface.co/{model_repo}")
 
 if __name__ == "__main__":
-    main()
+    main()
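The core of the fix above is that `AutoModelForQuestionAnswering` computes the span-extraction loss on its own whenever `start_positions`/`end_positions` are present in the batch, so the generic `Trainer` is enough once `preprocess()` adds those columns. A minimal sketch of that behavior (the checkpoint and token positions here are illustrative assumptions, not taken from this repo):

```python
# Minimal sketch: a span-extraction head returns its own loss when gold
# start/end token positions are supplied, which is why the stock Trainer
# can drive training once the dataset carries those columns.
# NOTE: checkpoint and positions below are illustrative assumptions.
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tok = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")

enc = tok("Who signed?", "The contract was signed by Acme.", return_tensors="pt")
out = model(**enc, start_positions=torch.tensor([7]), end_positions=torch.tensor([8]))
print(out.loss)  # cross-entropy over start/end logits
```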
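The answer-position loop added in `preprocess()` can be sanity-checked in isolation: tokenize one question/context pair with offsets, find the answer's tokens with the same character-offset comparisons the diff uses, and slice the context back out. A toy round trip (strings invented; assumes a fast tokenizer, which `roberta-base` provides):

```python
# Toy round trip for the offset-mapping logic the commit adds.
# Assumptions: fast tokenizer (offsets available); example strings invented.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base")
question = "Who are the parties?"
context = "This Agreement is entered into by Acme Corp and Beta LLC."
answer = "Acme Corp"
start_char = context.index(answer)
end_char = start_char + len(answer)

enc = tok(question, context, return_offsets_mapping=True)
seq_ids = enc.sequence_ids()
offsets = enc["offset_mapping"]

# context tokens carry sequence id 1; special tokens map to (0, 0) and are skipped
start_tok = min(i for i, (s, e) in enumerate(offsets)
                if seq_ids[i] == 1 and s <= start_char < e)
end_tok = max(i for i, (s, e) in enumerate(offsets)
              if seq_ids[i] == 1 and s < end_char <= e)

print(context[offsets[start_tok][0]:offsets[end_tok][1]])  # -> Acme Corp
```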
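Finally, the `evaluation_strategy` → `eval_strategy` rename tracks `transformers` itself, which deprecated the old spelling (around v4.41) in favor of the new one. If the script must run across versions, a small shim can select whichever keyword the installed release defines; this is a convenience sketch, not part of the commit:

```python
# Version-agnostic shim (hedged): use whichever keyword the installed
# transformers TrainingArguments actually defines.
import inspect
from transformers import TrainingArguments

params = inspect.signature(TrainingArguments.__init__).parameters
strategy_kw = "eval_strategy" if "eval_strategy" in params else "evaluation_strategy"

args = TrainingArguments(output_dir="out", **{strategy_kw: "steps"}, eval_steps=250)
print(strategy_kw)
```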