Update train.py
train.py
CHANGED
@@ -1,9 +1,7 @@
 #!/usr/bin/env python
-# train_cuad_lora_efficient.py
+# train_cuad_lora_efficient.py - FIXED VERSION
 """
-CUAD fine-tune with LoRA -
-Fixes bottlenecks and uses proper batching instead of chunking.
-GUARANTEED FIX for offset_mapping error AND metric computation issues.
+CUAD fine-tune with LoRA - Fixed for realistic training times
 """
 
 import os, json, random, gc, time
@@ -27,14 +25,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # ─────────────────────────────────────────────────────────────── config ──
 
-MAX_LEN =
-DOC_STRIDE =
+MAX_LEN = 512  # Slightly longer context
+DOC_STRIDE = 256  # Larger stride = fewer chunks = faster training
 SEED = 42
 BATCH_SIZE = 1000  # Process in larger, more efficient batches
 
-#
-USE_SUBSET = True
-SUBSET_SIZE =
+# Back to reasonable subset size since you've trained 5k before
+USE_SUBSET = True
+SUBSET_SIZE = 7000  # Good middle ground - more than your 5k success
 
 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
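For context on the new window settings: with return_overflowing_tokens=True the tokenizer splits each contract into overlapping MAX_LEN-token windows, and in the fast tokenizers `stride` is the number of tokens shared between consecutive windows. A rough, minimal sketch of the resulting feature count per contract, assuming a hypothetical 10,000-token contract (the length is illustrative, not a figure from the script):

import math

MAX_LEN = 512
DOC_STRIDE = 256

def estimated_windows(context_tokens: int, max_len: int = MAX_LEN, stride: int = DOC_STRIDE) -> int:
    # Each window after the first adds max_len - stride fresh context tokens
    # (this ignores the question tokens that also share each window).
    if context_tokens <= max_len:
        return 1
    return 1 + math.ceil((context_tokens - max_len) / (max_len - stride))

print(estimated_windows(10_000))  # ~39 windows for the assumed 10k-token contract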
@@ -48,17 +46,29 @@ def balance_has_answer(dataset, ratio=2.0, max_samples=None):
 
     print(f"Original: {len(has)} has-answer, {len(no)} no-answer")
 
+    # FIXED: Apply max_samples FIRST, then balance
+    if max_samples:
+        total_available = len(has) + len(no)
+        if total_available > max_samples:
+            # Sample proportionally from original distribution
+            has_ratio = len(has) / total_available
+            target_has = int(max_samples * has_ratio)
+            target_no = max_samples - target_has
+
+            has = random.sample(has, min(target_has, len(has)))
+            no = random.sample(no, min(target_no, len(no)))
+            print(f"Pre-balance subset: {len(has)} has-answer, {len(no)} no-answer")
+
+    # Now balance within the subset
     k = int(len(has) * ratio)
-
+    if len(no) > k:
+        no = random.sample(no, k)
 
     balanced = has + no
+    random.shuffle(balanced)  # Shuffle the final dataset
 
-
-
-        balanced = random.sample(balanced, max_samples)
-        print(f"Reduced to {max_samples} samples for faster training")
-
-    print(f"Balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+    print(f"Final balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+    print(f"Total examples: {len(balanced)}")
 
     return Dataset.from_list(balanced)
 
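To make the new subset-then-balance order concrete, here is a small worked example with hypothetical pool sizes (4,000 has-answer / 8,000 no-answer are assumptions for illustration, not CUAD's actual distribution):

import random

random.seed(0)
has = [{"answers": {"text": ["x"]}}] * 4_000   # hypothetical has-answer pool
no = [{"answers": {"text": []}}] * 8_000       # hypothetical no-answer pool
max_samples, ratio = 7_000, 1.5

# Step 1: proportional subsample down to max_samples, as in the new code path
total_available = len(has) + len(no)
target_has = int(max_samples * len(has) / total_available)   # 2333
target_no = max_samples - target_has                         # 4667
has = random.sample(has, min(target_has, len(has)))
no = random.sample(no, min(target_no, len(no)))

# Step 2: cap no-answer examples at ratio * has-answer
k = int(len(has) * ratio)                                    # 3499
if len(no) > k:
    no = random.sample(no, k)

print(len(has), len(no), len(has) + len(no))                 # 2333 3499 5832

Because the 1.5 ratio is stricter than the assumed 2:1 pool, the final set lands below max_samples, so the extra "Total examples" print is a useful check.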
@@ -105,13 +115,10 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
 # ───────────────────────────────────────────────────────────── preprocessing ──
 
 def preprocess_training_batch(examples, tokenizer):
-    """
-    Training preprocessing - NO offset_mapping included
-    """
+    """Training preprocessing - NO offset_mapping included"""
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -119,37 +126,30 @@ def preprocess_training_batch(examples, tokenizer):
         max_length=MAX_LEN,
         stride=DOC_STRIDE,
         return_overflowing_tokens=True,
-        return_offsets_mapping=True,
+        return_offsets_mapping=True,
         padding="max_length",
     )
 
-    # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenized_examples.pop("offset_mapping")
 
-    # Initialize output
     start_positions = []
     end_positions = []
 
     for i, offsets in enumerate(offset_mapping):
-        cls_index = 0
-
-        # Get the original example for this tokenized chunk
+        cls_index = 0
         sample_index = sample_mapping[i]
         answers = examples["answers"][sample_index]
 
-        # Handle cases with no answer
         if not answers["text"] or not answers["text"][0]:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue
 
-        # Find answer span in tokens
         answer_start_char = answers["answer_start"][0]
         answer_text = answers["text"][0]
         answer_end_char = answer_start_char + len(answer_text)
 
-        # Find token positions
         token_start_index = cls_index
         token_end_index = cls_index
 
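For reference, the character-to-token alignment this preprocessing performs with offset_mapping can be sketched in isolation; the actual search loop sits mostly between these hunks, so this is a generic illustration rather than the script's exact code:

from typing import List, Optional, Tuple

def char_span_to_token_span(offsets: List[Tuple[int, int]],
                            start_char: int,
                            end_char: int) -> Optional[Tuple[int, int]]:
    # Skip special tokens, whose offsets are (0, 0) with fast tokenizers;
    # filtering question tokens via sequence_ids() is omitted in this sketch.
    token_start = token_end = None
    for idx, (tok_start, tok_end) in enumerate(offsets):
        if tok_start == tok_end == 0:
            continue
        if token_start is None and tok_start <= start_char < tok_end:
            token_start = idx
        if tok_start < end_char <= tok_end:
            token_end = idx
            break
    if token_start is None or token_end is None:
        return None  # answer not contained in this window; caller falls back to CLS
    return token_start, token_end

# Hypothetical offsets: index 0 is <s>, indices 1-3 cover "The fee is"
print(char_span_to_token_span([(0, 0), (0, 3), (4, 7), (8, 10)], 4, 7))  # (2, 2)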
@@ -160,7 +160,6 @@ def preprocess_training_batch(examples, tokenizer):
                 token_end_index = token_index
                 break
 
-        # Validate positions
         if token_start_index <= token_end_index and token_start_index > 0:
             start_positions.append(token_start_index)
             end_positions.append(token_end_index)
@@ -171,17 +170,13 @@ def preprocess_training_batch(examples, tokenizer):
     tokenized_examples["start_positions"] = start_positions
     tokenized_examples["end_positions"] = end_positions
 
-    # NO offset_mapping or example_id for training
     return tokenized_examples
 
 def preprocess_validation_batch(examples, tokenizer):
-    """
-    Validation preprocessing - INCLUDES offset_mapping and example_id for post-processing
-    """
+    """Validation preprocessing - INCLUDES offset_mapping and example_id"""
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -193,21 +188,16 @@ def preprocess_validation_batch(examples, tokenizer):
         padding="max_length",
     )
 
-    # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
 
-    # Add example IDs for evaluation
     tokenized_examples["example_id"] = [
         examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
     ]
 
-    # Keep offset_mapping for post-processing
     return tokenized_examples
 
 def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
-    """
-    Process dataset in batches using HuggingFace's map function with batching.
-    """
+    """Process dataset in batches using HuggingFace's map function with batching."""
     print(f"{desc} dataset with batch processing...")
 
     if is_training:
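The validation features keep offset_mapping and example_id precisely so that postprocess_qa (defined earlier in the file, not shown in this diff) can turn start/end logits back into a text span of the original contract. A minimal sketch of that final decoding step, with made-up names and no n-best search:

import numpy as np

def decode_span(context, offsets, start_logits, end_logits, max_answer_len=100):
    # Greedy version: take the argmax start/end tokens and slice the raw text.
    start_idx = int(np.argmax(start_logits))
    end_idx = int(np.argmax(end_logits))
    if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_len:
        return ""  # treated as "no answer"
    start_char = offsets[start_idx][0]
    end_char = offsets[end_idx][1]
    return context[start_char:end_char]

Real post-processing typically scores the top-k start/end pairs per feature and aggregates all features sharing the same example_id before picking the best span.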
@@ -232,8 +222,7 @@ def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_train
 def main():
     set_seed(SEED)
 
-
-    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")
+    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v4")
 
     if (tokn := os.getenv("roberta_token")):
         try:
@@ -253,70 +242,70 @@ def main():
 
     cuad = cuad.shuffle(seed=SEED)
 
-    # Apply subset reduction
+    # FIXED: Apply subset reduction more aggressively
     subset_size = SUBSET_SIZE if USE_SUBSET else None
-    cuad = balance_has_answer(cuad, ratio=
+    cuad = balance_has_answer(cuad, ratio=1.5, max_samples=subset_size)  # Reduced ratio too
     print(f"Final dataset size: {len(cuad)} examples")
 
-    #
+    # Estimate features after preprocessing
+    avg_features_per_example = 2.5  # Conservative estimate with stride
+    estimated_features = len(cuad) * avg_features_per_example
+    print(f"Estimated training features: ~{int(estimated_features)}")
+
     ds = cuad.train_test_split(test_size=0.1, seed=SEED)
     train_raw, val_raw = ds["train"], ds["test"]
 
-    # ── tokeniser & model
+    # ── tokeniser & model ──
     base_ckpt = "deepset/roberta-base-squad2"
     tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
     model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
 
-    #
+    # FIXED: Lighter LoRA config for faster training
     lora = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
-        r=
-
+        r=16,  # Reduced from 32
+        lora_alpha=32,  # Reduced from 64
+        lora_dropout=0.1,
+        target_modules=["query", "value"],  # Fewer modules
     )
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()
 
-    # ──
-    print("Starting
+    # ── preprocessing ─────────────────────────────────────────
+    print("Starting preprocessing...")
 
-    # Process training data (NO offset_mapping)
     train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
-
-    # Process validation data (WITH offset_mapping)
     val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
     print(f"Preprocessing completed!")
     print(f"   Training features: {len(train_feats)}")
     print(f"   Validation features: {len(val_feats)}")
-    print(f"   Training columns: {train_feats.column_names}")
-    print(f"   Validation columns: {val_feats.column_names}")
-
-    # ── No custom compute_metrics - just use loss for monitoring ──
 
-    # ── training args
-    batch_size = 16
+    # ── training args - FIXED for reasonable training time ──
+    batch_size = 16  # Good balance
     gradient_accumulation_steps = 2
     effective_batch_size = batch_size * gradient_accumulation_steps
 
-    num_epochs =
+    num_epochs = 3  # Keep it reasonable
     steps_per_epoch = len(train_feats) // effective_batch_size
     total_steps = steps_per_epoch * num_epochs
 
-    eval_steps = max(
-    save_steps = eval_steps *
+    eval_steps = max(25, steps_per_epoch // 8)  # More frequent eval
+    save_steps = eval_steps * 3
 
     print(f"Training configuration:")
+    print(f"   Effective batch size: {effective_batch_size}")
     print(f"   Steps per epoch: {steps_per_epoch}")
     print(f"   Total steps: {total_steps}")
-    print(f"
-    print(f"
+    print(f"   Estimated time: ~{total_steps/2.4/60:.1f} minutes")
+    print(f"   Eval every: {eval_steps} steps")
 
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
-        learning_rate=
+        learning_rate=3e-5,  # Slightly lower LR
         num_train_epochs=num_epochs,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=
+        per_device_eval_batch_size=8,
         gradient_accumulation_steps=gradient_accumulation_steps,
         fp16=False, bf16=True,
         eval_strategy="steps",
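To sanity-check the schedule that the new code prints, the same arithmetic can be run in isolation; the 13,000 training features below are a hypothetical stand-in for len(train_feats), not a number produced by the script:

train_features = 13_000          # assumption standing in for len(train_feats)
batch_size = 16
gradient_accumulation_steps = 2
num_epochs = 3

effective_batch_size = batch_size * gradient_accumulation_steps   # 32
steps_per_epoch = train_features // effective_batch_size          # 406
total_steps = steps_per_epoch * num_epochs                        # 1218
eval_steps = max(25, steps_per_epoch // 8)                        # 50
save_steps = eval_steps * 3                                       # 150
print(effective_batch_size, steps_per_epoch, total_steps, eval_steps, save_steps)

Plugging this into the script's own time estimate (total_steps / 2.4 / 60) gives roughly 8.5 minutes for this hypothetical feature count.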
@@ -326,10 +315,8 @@ def main():
         weight_decay=0.01,
         lr_scheduler_type="cosine",
         warmup_ratio=0.1,
-        load_best_model_at_end=False,
-
-        # greater_is_better=False,  # Disabled
-        logging_steps=25,
+        load_best_model_at_end=False,
+        logging_steps=10,  # More frequent logging
         report_to="none",
         dataloader_num_workers=2,
         dataloader_pin_memory=True,
@@ -343,7 +330,7 @@ def main():
         eval_dataset=val_feats,
         tokenizer=tok,
         data_collator=default_data_collator,
-        compute_metrics=None,
+        compute_metrics=None,
     )
 
     print("Training…")
@@ -364,7 +351,7 @@ def main():
     trainer.save_model("./cuad_lora_out")
     tok.save_pretrained("./cuad_lora_out")
 
-    # Push to hub
+    # Push to hub
     if tokn:
         for attempt in range(3):
             try:
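The tail of the script (largely unchanged, so the diff only shows its first lines) retries the Hub upload a few times. A minimal sketch of such a retry loop using huggingface_hub.HfApi.upload_folder; the backoff time is a placeholder and the script's actual retry body is not shown in this diff:

import time
from huggingface_hub import HfApi

def push_with_retries(folder: str, repo_id: str, token: str, attempts: int = 3) -> bool:
    # Upload a local folder to the Hub, retrying on transient failures. Sketch only.
    api = HfApi(token=token)
    for attempt in range(attempts):
        try:
            api.upload_folder(folder_path=folder, repo_id=repo_id, repo_type="model")
            return True
        except Exception as err:                     # e.g. network hiccups, 5xx responses
            print(f"Upload attempt {attempt + 1} failed: {err}")
            time.sleep(30 * (attempt + 1))           # simple backoff
    return False

# Example call mirroring the script's paths and repo name:
# push_with_retries("./cuad_lora_out", "AvocadoMuffin/roberta-cuad-qa-v4", tokn)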