Upload folder using huggingface_hub
- README.md +5 -5
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- train_transformer.py +54 -10
README.md CHANGED
@@ -3,16 +3,16 @@
 This is a fine-tuned version of Qwen2.5-7B-Instruct optimized for agent tasks.
 
 ## Dataset Information
-- Train Dataset Size:
-- Test Dataset Size:
+- Train Dataset Size: 380 examples
+- Test Dataset Size: 94 examples
 
 ## Model Performance
-- Test Accuracy: 0.
-- Train Accuracy: 0.
+- Test Accuracy: 0.8811
+- Train Accuracy: 0.9912
 
 ## Training Configuration
 - Base Model: Qwen/Qwen2.5-VL-7B-Instruct
-- Checkpoint: checkpoints_27feb/
+- Checkpoint: checkpoints_27feb/run_20250302_224224/checkpoint-5035
 - Dataset: AgentEvalDatapointDataset
 - Training Script: [train_transformer.py](train_transformer.py)
 - DeepSpeed Config: [deepspeed_config.json](deepspeed_config.json)
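The Training Configuration above maps onto the standard transformers loading path. A minimal loading sketch, assuming a transformers version recent enough to ship the Qwen2.5-VL classes (>= 4.49) and a hypothetical repo id in place of this repository's actual id:

```python
# Loading sketch -- "your-org/qwen2_5-vl-7b-agent" is a placeholder,
# not this repository's actual id.
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "your-org/qwen2_5-vl-7b-agent"  # hypothetical

# device_map="auto" spreads the four shards across available devices;
# torch_dtype="auto" keeps the dtype stored in the safetensors shards.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
```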
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7e66be497d125879d760904220a2e7e9f170d93a532bd96120dead07347bb114
 size 4968243304
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2fcb74ba81b4109788c6135e4ce8ba8044856585e9a0527d8ce857ac66b00859
 size 4991495816
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cbd6e723e4e3fb285f040dfe214dc318924853da918061fea20b6e8631b19aee
 size 4932751040
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f191a1a83a9b7b616d0a85e5073b264fe89088fc00d5e442d47c0ba835d32c62
 size 1691924384
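Each of the four safetensors entries above is a git-lfs pointer file (spec v1): only the sha256 oid changes in this commit while the byte sizes stay identical, i.e. the shard contents were retrained but the sharding layout is unchanged. A sketch for checking a downloaded shard against its pointer, assuming the shard sits in the current directory:

```python
import hashlib
import os

# Expected values taken from the model-00001 pointer file shown above.
EXPECTED_OID = "7e66be497d125879d760904220a2e7e9f170d93a532bd96120dead07347bb114"
EXPECTED_SIZE = 4968243304

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through sha256 so the ~5 GB shard never sits in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

path = "model-00001-of-00004.safetensors"
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
assert sha256_of(path) == EXPECTED_OID, "sha256 mismatch"
print("shard matches its LFS pointer")
```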
train_transformer.py CHANGED
@@ -1,5 +1,7 @@
 import torch
 import gc
+import numpy as np
+import json
 
 torch.cuda.empty_cache()
 import torch.distributed
@@ -221,6 +223,21 @@ class CustomTrainingCallback(TrainerCallback):
     def __init__(self, trainer, eval_epoch_interval=2):
         self.trainer = trainer
         self.eval_epoch_interval = eval_epoch_interval
+        self.best_test_accuracy = 0.0
+        self.best_test_epoch = 0
+        self.best_metrics = {
+            'test_accuracy': 0.0,
+            'train_accuracy': 0.0,
+            'epoch': 0,
+            'global_step': 0
+        }
+
+    def save_best_metrics(self, output_dir):
+        """Save best metrics to a file in the checkpoint directory"""
+        metrics_file = os.path.join(output_dir, 'best_metrics.json')
+        with open(metrics_file, 'w') as f:
+            json.dump(self.best_metrics, f, indent=4)
+        print(f"Saved best metrics to {metrics_file}")
 
     def on_log(self, args, state, control, logs=None, **kwargs):
         """Log metrics at each logging step"""
@@ -257,10 +274,31 @@
         self.trainer.model.eval()
 
         if (state.epoch + 1) % self.eval_epoch_interval == 0 and state.epoch > 4:
-
-            self.trainer.evaluate_step(
-
-
+            # Get test accuracy
+            test_accuracy = self.trainer.evaluate_step(dataset=self.trainer.eval_dataset, split="test")
+            train_accuracy = self.trainer.evaluate_step(dataset=self.trainer.train_dataset_eval, split="train")
+
+            print(f"Test accuracy: {test_accuracy:.4f}, Train accuracy: {train_accuracy:.4f}")
+
+            # Update best test accuracy if current is better
+            if test_accuracy > self.best_test_accuracy:
+                self.best_test_accuracy = test_accuracy
+                self.best_test_epoch = state.epoch + 1
+
+                # Update best metrics dictionary
+                self.best_metrics.update({
+                    'best_test_accuracy': float(test_accuracy),
+                    'train_accuracy': float(train_accuracy),
+                    'epoch': int(state.epoch + 1),
+                    'global_step': int(state.global_step)
+                })
+
+                # Save best metrics to file
+                self.save_best_metrics(args.output_dir)
+
+                # Log to wandb
+
+                print(f"\nNew best test accuracy: {self.best_test_accuracy:.4f} at epoch {self.best_test_epoch}")
 
         if was_training:
             for_training(self.trainer.model)
@@ -327,7 +365,7 @@ class CustomSFTTrainer(SFTTrainer):
                 }
             )
 
-            #
+            return accuracy  # Return the accuracy value
 
         except Exception as e:
             logger.error(f"Error evaluating: {e}")
@@ -430,7 +468,7 @@ def load_model(MODEL_ID: str, USE_QLORA: bool, training_args):
     return model, processor
 
 
-def main(args):
+def train(args):
     # Set CUDA device explicitly based on local_rank
     if args.local_rank != -1:
         torch.cuda.set_device(args.local_rank)
@@ -539,10 +577,10 @@ def main(args):
     # Pass training args to load_model function
     model, processor = load_model(args.model_id, args.use_qlora, training_args)
     # Train dataset
-    train_dataset = AgentDatapointDataset(split="train")
+    train_dataset = AgentDatapointDataset(split="train", num_samples=args.train_size)
     # Eval datasets
-    test_dataset = AgentEvalDatapointDataset(split="test")
-    train_dataset_eval = AgentEvalDatapointDataset(split="train")
+    test_dataset = AgentEvalDatapointDataset(split="test", num_samples=args.test_size)
+    train_dataset_eval = AgentEvalDatapointDataset(split="train", num_samples=args.train_size)
     for_training(model)
 
     trainer = CustomSFTTrainer(
@@ -615,5 +653,11 @@ if __name__ == "__main__":
     parser.add_argument(
         "--local_rank", type=int, default=-1, help="Local rank for distributed training"
     )
+    parser.add_argument(
+        "--train_size", type=int, default=10000000, help="Number of training samples"
+    )
+    parser.add_argument(
+        "--test_size", type=int, default=10000000, help="Number of test samples"
+    )
     args = parser.parse_args()
-    main(args)
+    train(args)
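Taken together, the script changes rename the training entry point to train(), make evaluate_step() return its accuracy, track the best test accuracy in CustomTrainingCallback, persist it as best_metrics.json in the output directory, and add --train_size/--test_size caps on dataset size (the defaults of 10000000 effectively mean no cap). One subtlety: __init__ seeds the key 'test_accuracy' while the update writes 'best_test_accuracy', so the seed key survives untouched in runs that never improve. A hedged sketch of consuming the metrics file, with a hypothetical output path:

```python
# Example launch (the flags are the ones added above; use whatever
# launcher the run normally uses):
#   python train_transformer.py --train_size 380 --test_size 94
import json

# Hypothetical path -- best_metrics.json lands in the run's output_dir.
metrics_path = "checkpoints_27feb/run_20250302_224224/best_metrics.json"
with open(metrics_path) as f:
    best = json.load(f)

print(f"best test accuracy {best['best_test_accuracy']:.4f} "
      f"at epoch {best['epoch']} (step {best['global_step']})")
```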