George-API committed
Commit 23c5657 · verified · 1 Parent(s): 493e679

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +24 -18
run_cloud_training.py CHANGED
@@ -2,7 +2,7 @@
 
 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for L40S GPU with pre-tokenized datasets
+- Optimized for A100 GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -21,9 +21,9 @@ from peft import LoraConfig, get_peft_model
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, upload_folder
 
-# Basic environment setup for L40S
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
-os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+# Basic environment setup for A100
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
+os.environ["NCCL_P2P_DISABLE"] = "1"  # Can help with A100 multi-GPU setups
 
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
@@ -469,13 +469,17 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         use_4bit = False
         logger.warning("Using CPU mode without quantization")
 
+    # Determine compute dtype based on hardware config
+    compute_dtype = torch.bfloat16 if hardware_config.get("bf16", False) else torch.float16
+    logger.info(f"Using compute dtype: {compute_dtype}")
+
     # For pre-quantized models, always use device_map="auto"
     if is_pre_quantized and is_gpu_available():
         logger.info("Loading pre-quantized model with GPU support")
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="auto",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            torch_dtype=compute_dtype,
             trust_remote_code=True,
             use_cache=model_config.get("use_cache", False)
         )
@@ -484,9 +488,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logger.info(f"Loading model with 4-bit quantization")
 
         # Create quantization config for GPU
+        bnb_compute_dtype = torch.bfloat16 if quant_config.get("bnb_4bit_compute_dtype", "float16") == "bfloat16" else torch.float16
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_compute_dtype=bnb_compute_dtype,
             bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
             bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
         )
@@ -496,10 +501,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             model_name,
             quantization_config=bnb_config,
             device_map="auto",
-            torch_dtype=torch.float16,
+            torch_dtype=compute_dtype,
             trust_remote_code=True,
             use_cache=model_config.get("use_cache", False),
-            attn_implementation=hardware_config.get("attn_implementation", "eager")
+            attn_implementation=hardware_config.get("attn_implementation", "flash_attention_2")
         )
     else:
         # CPU fallback (or non-quantized GPU) mode
@@ -571,14 +576,14 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         gpu_info = torch.cuda.get_device_properties(0)
         logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
 
-        # Check if it's an L40S or high-memory GPU
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Detected L40S GPU - optimizing for high-memory GPU")
-            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 2)
+        # Check if it's an A100 or high-memory GPU
+        if "A100" in gpu_info.name or "A10G" in gpu_info.name or gpu_info.total_memory > 40e9:
+            logger.info("Detected A100 GPU - optimizing for A100")
+            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 3)
         else:
             # Use a smaller batch size for other GPUs
             per_device_train_batch_size = 2
-            logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
+            logger.info(f"Using conservative batch size for non-A100 GPU: {per_device_train_batch_size}")
     else:
         # Use minimal batch size for CPU
         per_device_train_batch_size = 1
@@ -587,9 +592,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Use full training parameters for pre-quantized models or GPU mode
     if is_pre_quantized or can_use_4bit or not is_running_in_space():
         num_train_epochs = training_config.get("num_train_epochs", 3)
-        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 4)
-        fp16 = torch.cuda.is_available() and hardware_config.get("fp16", True)
-        bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
+        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 2)
+        fp16 = torch.cuda.is_available() and hardware_config.get("fp16", False)
+        bf16 = torch.cuda.is_available() and hardware_config.get("bf16", True)
         # Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
         gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
@@ -633,14 +638,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        eval_strategy=eval_strategy,  # Updated from evaluation_strategy
+        eval_strategy=eval_strategy,
         load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
         gradient_checkpointing=gradient_checkpointing,
-        dataloader_num_workers=dataloader_workers
+        dataloader_num_workers=dataloader_workers,
+        group_by_length=training_config.get("group_by_length", True)
     )
 
     # Create trainer with pre-tokenized collator
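
The values changed above are read from a training config file through hardware_config, quant_config, training_config, and model_config lookups; that file is not part of this commit view. A minimal sketch of a shape that would satisfy the .get() calls in the diff, with this commit's defaults filled in (only the key names come from the diff; the section names and file layout are assumptions):

# Hypothetical config shape; key names and defaults mirror the .get() calls in
# run_cloud_training.py above, while the "hardware"/"quantization"/"training"/"model"
# section names are illustrative assumptions.
config = {
    "hardware": {
        "fp16": False,
        "bf16": True,                                # selects torch.bfloat16 as compute_dtype
        "attn_implementation": "flash_attention_2",  # forwarded to from_pretrained
        "gradient_checkpointing": True,
    },
    "quantization": {
        "bnb_4bit_compute_dtype": "bfloat16",
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    },
    "training": {
        "per_device_train_batch_size": 3,   # A100 default in this commit
        "gradient_accumulation_steps": 2,
        "num_train_epochs": 3,
        "dataloader_num_workers": 4,
        "group_by_length": True,
    },
    "model": {
        "use_cache": False,
    },
}
hardware_config = config["hardware"]
quant_config = config["quantization"]
training_config = config["training"]
model_config = config["model"]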
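The bf16=True and flash_attention_2 defaults assume an Ampere-or-newer GPU such as the A100; on older hardware the script would still need fp16 and a different attention implementation. A minimal runtime guard, illustrative only and not part of this commit:

import torch

# Hypothetical guard: fall back to float16 when the detected GPU lacks bfloat16 support
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16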