Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	coerce all numeric config values to safe values
Browse files
    	
        scripts/trackio_tonic/deploy_trackio_space.py
    CHANGED
    
    | @@ -411,8 +411,8 @@ class TrackioSpaceDeployer: | |
| 411 |  | 
| 412 | 
             
                        # Wait a bit for the space to build
         | 
| 413 | 
             
                        import time
         | 
| 414 | 
            -
                        print("Waiting  | 
| 415 | 
            -
                        time.sleep( | 
| 416 |  | 
| 417 | 
             
                        # Try to access the space
         | 
| 418 | 
             
                        response = requests.get(self.space_url, timeout=30)
         | 
|  | |
| 411 |  | 
| 412 | 
             
                        # Wait a bit for the space to build
         | 
| 413 | 
             
                        import time
         | 
| 414 | 
            +
                        print("Waiting 120 seconds for Space to build...")
         | 
| 415 | 
            +
                        time.sleep(120)
         | 
| 416 |  | 
| 417 | 
             
                        # Try to access the space
         | 
| 418 | 
             
                        response = requests.get(self.space_url, timeout=30)
         | 
    	
        scripts/training/train_gpt_oss.py
    CHANGED
    
    | @@ -345,38 +345,60 @@ def create_sft_config(config, output_dir): | |
| 345 |  | 
| 346 | 
             
                print("Creating enhanced SFT configuration...")
         | 
| 347 |  | 
| 348 | 
            -
                #  | 
| 349 | 
            -
                 | 
| 350 | 
            -
             | 
| 351 | 
            -
             | 
| 352 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 353 |  | 
| 354 | 
             
                # Learning rate configuration
         | 
| 355 | 
            -
                learning_rate = config | 
| 356 | 
             
                lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
         | 
| 357 |  | 
| 358 | 
             
                # Batch configuration
         | 
| 359 | 
            -
                per_device_train_batch_size = config | 
| 360 | 
            -
                per_device_eval_batch_size = getattr(config, 'eval_batch_size',  | 
| 361 | 
            -
                gradient_accumulation_steps = config | 
| 362 |  | 
| 363 | 
             
                # Evaluation and logging
         | 
| 364 | 
             
                eval_strategy = getattr(config, 'eval_strategy', 'steps')
         | 
| 365 | 
            -
                eval_steps = getattr(config, 'eval_steps', 100)
         | 
| 366 | 
            -
                 | 
|  | |
| 367 |  | 
| 368 | 
             
                # Saving configuration
         | 
| 369 | 
             
                save_strategy = getattr(config, 'save_strategy', 'steps')
         | 
| 370 | 
            -
                save_steps = getattr(config, 'save_steps', 500)
         | 
| 371 | 
            -
                save_total_limit = getattr(config, 'save_total_limit', 3)
         | 
| 372 |  | 
| 373 | 
             
                # Mixed precision
         | 
| 374 | 
            -
                fp16 = getattr(config, 'fp16', False)
         | 
| 375 | 
            -
                bf16 = getattr(config, 'bf16', True)
         | 
|  | |
| 376 |  | 
| 377 | 
             
                # Regularization
         | 
| 378 | 
            -
                weight_decay = getattr(config, 'weight_decay', 0.01)
         | 
| 379 | 
            -
                max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
         | 
| 380 |  | 
| 381 | 
             
                # HuggingFace Hub integration
         | 
| 382 | 
             
                push_to_hub = getattr(config, 'push_to_hub', False)
         | 
| @@ -406,12 +428,15 @@ def create_sft_config(config, output_dir): | |
| 406 | 
             
                    # Mixed precision
         | 
| 407 | 
             
                    "fp16": fp16,
         | 
| 408 | 
             
                    "bf16": bf16,
         | 
|  | |
|  | |
| 409 | 
             
                    # Regularization
         | 
| 410 | 
             
                    "weight_decay": weight_decay,
         | 
| 411 | 
             
                    "max_grad_norm": max_grad_norm,
         | 
| 412 | 
             
                    # Evaluation (name may vary across versions)
         | 
| 413 | 
             
                    "evaluation_strategy": eval_strategy,
         | 
| 414 | 
             
                    "eval_steps": eval_steps,
         | 
|  | |
| 415 | 
             
                    # Logging
         | 
| 416 | 
             
                    "logging_steps": logging_steps,
         | 
| 417 | 
             
                    # Saving
         | 
| @@ -421,8 +446,10 @@ def create_sft_config(config, output_dir): | |
| 421 | 
             
                    # Output
         | 
| 422 | 
             
                    "output_dir": output_dir,
         | 
| 423 | 
             
                    # Data loading
         | 
| 424 | 
            -
                    "dataloader_num_workers": getattr(config, 'dataloader_num_workers', 4),
         | 
| 425 | 
             
                    "dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
         | 
|  | |
|  | |
| 426 | 
             
                    # Performance
         | 
| 427 | 
             
                    "group_by_length": getattr(config, 'group_by_length', True),
         | 
| 428 | 
             
                    "remove_unused_columns": getattr(config, 'remove_unused_columns', True),
         | 
| @@ -432,6 +459,9 @@ def create_sft_config(config, output_dir): | |
| 432 | 
             
                    "report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
         | 
| 433 | 
             
                }
         | 
| 434 |  | 
|  | |
|  | |
|  | |
| 435 | 
             
                # Adapt to transformers versions where 'evaluation_strategy' was renamed
         | 
| 436 | 
             
                try:
         | 
| 437 | 
             
                    ta_sig = inspect.signature(TrainingArguments.__init__)
         | 
|  | |
| 345 |  | 
| 346 | 
             
                print("Creating enhanced SFT configuration...")
         | 
| 347 |  | 
| 348 | 
            +
                # Helper coercion utilities to guarantee numeric types
         | 
| 349 | 
            +
                def _as_int(value, default):
         | 
| 350 | 
            +
                    if value is None:
         | 
| 351 | 
            +
                        return int(default)
         | 
| 352 | 
            +
                    try:
         | 
| 353 | 
            +
                        return int(value)
         | 
| 354 | 
            +
                    except Exception:
         | 
| 355 | 
            +
                        return int(default)
         | 
| 356 | 
            +
             | 
| 357 | 
            +
                def _as_float(value, default):
         | 
| 358 | 
            +
                    if value is None:
         | 
| 359 | 
            +
                        return float(default)
         | 
| 360 | 
            +
                    try:
         | 
| 361 | 
            +
                        return float(value)
         | 
| 362 | 
            +
                    except Exception:
         | 
| 363 | 
            +
                        return float(default)
         | 
| 364 | 
            +
             | 
| 365 | 
            +
                # Extract training parameters from config with enhanced defaults and coercion
         | 
| 366 | 
            +
                num_train_epochs = _as_float(getattr(config, 'num_train_epochs', 1.0), 1.0)
         | 
| 367 | 
            +
                # Transformers expects max_steps default -1 (disabled). Some code compares > 0
         | 
| 368 | 
            +
                raw_max_steps = getattr(config, 'max_steps', None)
         | 
| 369 | 
            +
                max_steps = _as_int(raw_max_steps if raw_max_steps is not None else -1, -1)
         | 
| 370 | 
            +
                warmup_ratio = _as_float(getattr(config, 'warmup_ratio', 0.03), 0.03)
         | 
| 371 | 
            +
                # Ensure warmup_steps is an int; default 0 to avoid None comparisons in schedulers
         | 
| 372 | 
            +
                warmup_steps = _as_int(getattr(config, 'warmup_steps', 0), 0)
         | 
| 373 |  | 
| 374 | 
             
                # Learning rate configuration
         | 
| 375 | 
            +
                learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
         | 
| 376 | 
             
                lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
         | 
| 377 |  | 
| 378 | 
             
                # Batch configuration
         | 
| 379 | 
            +
                per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
         | 
| 380 | 
            +
                per_device_eval_batch_size = _as_int(getattr(config, 'eval_batch_size', per_device_train_batch_size), per_device_train_batch_size)
         | 
| 381 | 
            +
                gradient_accumulation_steps = _as_int(getattr(config, 'gradient_accumulation_steps', 1), 1)
         | 
| 382 |  | 
| 383 | 
             
                # Evaluation and logging
         | 
| 384 | 
             
                eval_strategy = getattr(config, 'eval_strategy', 'steps')
         | 
| 385 | 
            +
                eval_steps = _as_int(getattr(config, 'eval_steps', 100), 100)
         | 
| 386 | 
            +
                eval_accumulation_steps = _as_int(getattr(config, 'eval_accumulation_steps', 1), 1)
         | 
| 387 | 
            +
                logging_steps = _as_int(getattr(config, 'logging_steps', 10), 10)
         | 
| 388 |  | 
| 389 | 
             
                # Saving configuration
         | 
| 390 | 
             
                save_strategy = getattr(config, 'save_strategy', 'steps')
         | 
| 391 | 
            +
                save_steps = _as_int(getattr(config, 'save_steps', 500), 500)
         | 
| 392 | 
            +
                save_total_limit = _as_int(getattr(config, 'save_total_limit', 3), 3)
         | 
| 393 |  | 
| 394 | 
             
                # Mixed precision
         | 
| 395 | 
            +
                fp16 = bool(getattr(config, 'fp16', False))
         | 
| 396 | 
            +
                bf16 = bool(getattr(config, 'bf16', True))
         | 
| 397 | 
            +
                tf32 = bool(getattr(config, 'tf32', False))
         | 
| 398 |  | 
| 399 | 
             
                # Regularization
         | 
| 400 | 
            +
                weight_decay = _as_float(getattr(config, 'weight_decay', 0.01), 0.01)
         | 
| 401 | 
            +
                max_grad_norm = _as_float(getattr(config, 'max_grad_norm', 1.0), 1.0)
         | 
| 402 |  | 
| 403 | 
             
                # HuggingFace Hub integration
         | 
| 404 | 
             
                push_to_hub = getattr(config, 'push_to_hub', False)
         | 
|  | |
| 428 | 
             
                    # Mixed precision
         | 
| 429 | 
             
                    "fp16": fp16,
         | 
| 430 | 
             
                    "bf16": bf16,
         | 
| 431 | 
            +
                    # Some versions support tf32
         | 
| 432 | 
            +
                    "tf32": tf32 if 'tf32' in TrainingArguments.__init__.__code__.co_varnames else None,
         | 
| 433 | 
             
                    # Regularization
         | 
| 434 | 
             
                    "weight_decay": weight_decay,
         | 
| 435 | 
             
                    "max_grad_norm": max_grad_norm,
         | 
| 436 | 
             
                    # Evaluation (name may vary across versions)
         | 
| 437 | 
             
                    "evaluation_strategy": eval_strategy,
         | 
| 438 | 
             
                    "eval_steps": eval_steps,
         | 
| 439 | 
            +
                    "eval_accumulation_steps": eval_accumulation_steps,
         | 
| 440 | 
             
                    # Logging
         | 
| 441 | 
             
                    "logging_steps": logging_steps,
         | 
| 442 | 
             
                    # Saving
         | 
|  | |
| 446 | 
             
                    # Output
         | 
| 447 | 
             
                    "output_dir": output_dir,
         | 
| 448 | 
             
                    # Data loading
         | 
| 449 | 
            +
                    "dataloader_num_workers": _as_int(getattr(config, 'dataloader_num_workers', 4), 4),
         | 
| 450 | 
             
                    "dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
         | 
| 451 | 
            +
                    # Optional in some versions
         | 
| 452 | 
            +
                    "dataloader_prefetch_factor": _as_int(getattr(config, 'dataloader_prefetch_factor', 2), 2),
         | 
| 453 | 
             
                    # Performance
         | 
| 454 | 
             
                    "group_by_length": getattr(config, 'group_by_length', True),
         | 
| 455 | 
             
                    "remove_unused_columns": getattr(config, 'remove_unused_columns', True),
         | 
|  | |
| 459 | 
             
                    "report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
         | 
| 460 | 
             
                }
         | 
| 461 |  | 
| 462 | 
            +
                # Drop any None-valued kwargs
         | 
| 463 | 
            +
                ta_kwargs = {k: v for k, v in ta_kwargs.items() if v is not None}
         | 
| 464 | 
            +
             | 
| 465 | 
             
                # Adapt to transformers versions where 'evaluation_strategy' was renamed
         | 
| 466 | 
             
                try:
         | 
| 467 | 
             
                    ta_sig = inspect.signature(TrainingArguments.__init__)
         | 
