alexhotti committed (verified)
Commit 4b105b2 · 1 Parent(s): 9f725fd

Upload folder using huggingface_hub
README.md CHANGED
@@ -3,16 +3,16 @@
  This is a fine-tuned version of Qwen2.5-7B-Instruct optimized for agent tasks.

  ## Dataset Information
- - Train Dataset Size: 387 examples
- - Test Dataset Size: 96 examples
+ - Train Dataset Size: 380 examples
+ - Test Dataset Size: 94 examples

  ## Model Performance
- - Test Accuracy: 0.7983
- - Train Accuracy: 0.9606
+ - Test Accuracy: 0.8811
+ - Train Accuracy: 0.9912

  ## Training Configuration
  - Base Model: Qwen/Qwen2.5-VL-7B-Instruct
- - Checkpoint: checkpoints_27feb/run_20250228_004641/checkpoint-2425
+ - Checkpoint: checkpoints_27feb/run_20250302_224224/checkpoint-5035
  - Dataset: AgentEvalDatapointDataset
  - Training Script: [train_transformer.py](train_transformer.py)
  - DeepSpeed Config: [deepspeed_config.json](deepspeed_config.json)
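The card's headline change is test accuracy moving from 0.7983 to 0.8811, measured on a slightly smaller split (380/94 vs. 387/96 examples). For reference, loading the checkpoint should look roughly like the sketch below; the repo id is a placeholder, and `Qwen2_5_VLForConditionalGeneration` assumes a transformers release with Qwen2.5-VL support:

```python
# Rough loading sketch (not part of this commit). The repo id is a placeholder;
# Qwen2_5_VLForConditionalGeneration ships in recent transformers releases.
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "alexhotti/agent-qwen2.5-vl-7b",  # placeholder repo id
    torch_dtype="auto",
    device_map="auto",  # requires accelerate
)
processor = AutoProcessor.from_pretrained("alexhotti/agent-qwen2.5-vl-7b")
```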
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:912a6e8beb806d1e74da4525dba6232b70ca5699d6fcd6e749191ac4b9350f61
+ oid sha256:7e66be497d125879d760904220a2e7e9f170d93a532bd96120dead07347bb114
  size 4968243304
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d49fa25a0488877719f473f2a7e6952db490ee0076949367251e0b0bd1e8810e
+ oid sha256:2fcb74ba81b4109788c6135e4ce8ba8044856585e9a0527d8ce857ac66b00859
  size 4991495816
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7b1a7d618ae2a83635f0a56196c916e0e602cb68bae41468b7348614641195d8
+ oid sha256:cbd6e723e4e3fb285f040dfe214dc318924853da918061fea20b6e8631b19aee
  size 4932751040
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6d032ca82caa25a287aed4c0214ba117526479c3e54f27a753244575b4dd8577
+ oid sha256:f191a1a83a9b7b616d0a85e5073b264fe89088fc00d5e442d47c0ba835d32c62
  size 1691924384
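All four shards keep their byte sizes; only the LFS oids change, so the weights were re-uploaded in the same sharding layout. To check a downloaded shard against its new pointer, a small hashlib sketch (local filename assumed):

```python
# Hash a local shard and compare against the sha256 from its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "7e66be497d125879d760904220a2e7e9f170d93a532bd96120dead07347bb114"
print(sha256_of("model-00001-of-00004.safetensors") == expected)
```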
train_transformer.py CHANGED
@@ -1,5 +1,7 @@
  import torch
  import gc
+ import numpy as np
+ import json

  torch.cuda.empty_cache()
  import torch.distributed
@@ -221,6 +223,21 @@ class CustomTrainingCallback(TrainerCallback):
      def __init__(self, trainer, eval_epoch_interval=2):
          self.trainer = trainer
          self.eval_epoch_interval = eval_epoch_interval
+         self.best_test_accuracy = 0.0
+         self.best_test_epoch = 0
+         self.best_metrics = {
+             'test_accuracy': 0.0,
+             'train_accuracy': 0.0,
+             'epoch': 0,
+             'global_step': 0
+         }
+
+     def save_best_metrics(self, output_dir):
+         """Save best metrics to a file in the checkpoint directory"""
+         metrics_file = os.path.join(output_dir, 'best_metrics.json')
+         with open(metrics_file, 'w') as f:
+             json.dump(self.best_metrics, f, indent=4)
+         print(f"Saved best metrics to {metrics_file}")

      def on_log(self, args, state, control, logs=None, **kwargs):
          """Log metrics at each logging step"""
@@ -257,10 +274,31 @@ class CustomTrainingCallback(TrainerCallback):
          self.trainer.model.eval()

          if (state.epoch + 1) % self.eval_epoch_interval == 0 and state.epoch > 4:
-             self.trainer.evaluate_step(dataset=self.trainer.eval_dataset, split="test")
-             self.trainer.evaluate_step(
-                 dataset=self.trainer.train_dataset_eval, split="train"
-             )
+             # Get test accuracy
+             test_accuracy = self.trainer.evaluate_step(dataset=self.trainer.eval_dataset, split="test")
+             train_accuracy = self.trainer.evaluate_step(dataset=self.trainer.train_dataset_eval, split="train")
+
+             print(f"Test accuracy: {test_accuracy:.4f}, Train accuracy: {train_accuracy:.4f}")
+
+             # Update best test accuracy if current is better
+             if test_accuracy > self.best_test_accuracy:
+                 self.best_test_accuracy = test_accuracy
+                 self.best_test_epoch = state.epoch + 1
+
+                 # Update best metrics dictionary
+                 self.best_metrics.update({
+                     'best_test_accuracy': float(test_accuracy),
+                     'train_accuracy': float(train_accuracy),
+                     'epoch': int(state.epoch + 1),
+                     'global_step': int(state.global_step)
+                 })
+
+                 # Save best metrics to file
+                 self.save_best_metrics(args.output_dir)
+
+                 # Log to wandb
+
+                 print(f"\nNew best test accuracy: {self.best_test_accuracy:.4f} at epoch {self.best_test_epoch}")

          if was_training:
              for_training(self.trainer.model)
@@ -327,7 +365,7 @@ class CustomSFTTrainer(SFTTrainer):
                  }
              )

-             # Don't finish wandb here to avoid conflicts with the training process
+             return accuracy  # Return the accuracy value

          except Exception as e:
              logger.error(f"Error evaluating: {e}")
@@ -430,7 +468,7 @@ def load_model(MODEL_ID: str, USE_QLORA: bool, training_args):
      return model, processor


- def main(args):
+ def train(args):
      # Set CUDA device explicitly based on local_rank
      if args.local_rank != -1:
          torch.cuda.set_device(args.local_rank)
@@ -539,10 +577,10 @@ def main(args):
      # Pass training args to load_model function
      model, processor = load_model(args.model_id, args.use_qlora, training_args)
      # Train dataset
-     train_dataset = AgentDatapointDataset(split="train")
+     train_dataset = AgentDatapointDataset(split="train", num_samples=args.train_size)
      # Eval datasets
-     test_dataset = AgentEvalDatapointDataset(split="test")
-     train_dataset_eval = AgentEvalDatapointDataset(split="train")
+     test_dataset = AgentEvalDatapointDataset(split="test", num_samples=args.test_size)
+     train_dataset_eval = AgentEvalDatapointDataset(split="train", num_samples=args.train_size)
      for_training(model)

      trainer = CustomSFTTrainer(
@@ -615,5 +653,11 @@ if __name__ == "__main__":
      parser.add_argument(
          "--local_rank", type=int, default=-1, help="Local rank for distributed training"
      )
+     parser.add_argument(
+         "--train_size", type=int, default=10000000, help="Number of training samples"
+     )
+     parser.add_argument(
+         "--test_size", type=int, default=10000000, help="Number of test samples"
+     )
      args = parser.parse_args()
-     main(args)
+     train(args)
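The net effect of the script changes: `evaluate_step` now returns its accuracy, the callback tracks the best test accuracy across epochs and persists it to `best_metrics.json` under `args.output_dir`, and `--train_size`/`--test_size` cap the dataset sizes (the huge defaults effectively mean "use everything"). A small sketch for reading the metrics file back; the path is illustrative, and note that the dict is seeded with a `test_accuracy` key but updated under `best_test_accuracy`, so the lookup falls back between the two:

```python
# Read back the file written by CustomTrainingCallback.save_best_metrics().
# The directory below is illustrative; the file lands in the run's output_dir.
import json

with open("checkpoints_27feb/run_20250302_224224/best_metrics.json") as f:
    best = json.load(f)

# Seeded key is 'test_accuracy'; updates write 'best_test_accuracy'.
acc = best.get("best_test_accuracy", best.get("test_accuracy"))
print(f"best test accuracy {acc:.4f} at epoch {best['epoch']} (step {best['global_step']})")
```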