Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+24 -18)
@@ -2,7 +2,7 @@
 
 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for
+- Optimized for A100 GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -21,9 +21,9 @@ from peft import LoraConfig, get_peft_model
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, upload_folder
 
-# Basic environment setup for
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:
-os.environ["
+# Basic environment setup for A100
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
+os.environ["NCCL_P2P_DISABLE"] = "1"  # Can help with A100 multi-GPU setups
 
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
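Note: PYTORCH_CUDA_ALLOC_CONF is read when PyTorch's caching allocator initializes, so these assignments only take effect if they run before anything touches the GPU. A minimal sketch of the required ordering (the final check is just illustrative):

import os

# Allocator tuning must be in the environment before CUDA is initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"
os.environ["NCCL_P2P_DISABLE"] = "1"  # assumption: single-node Space, peer-to-peer not needed

import torch  # imported only after the allocator settings are in place

print(torch.cuda.is_available())  # first CUDA touch sees the tuned allocator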
@@ -469,13 +469,17 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         use_4bit = False
         logger.warning("Using CPU mode without quantization")
 
+    # Determine compute dtype based on hardware config
+    compute_dtype = torch.bfloat16 if hardware_config.get("bf16", False) else torch.float16
+    logger.info(f"Using compute dtype: {compute_dtype}")
+
     # For pre-quantized models, always use device_map="auto"
     if is_pre_quantized and is_gpu_available():
         logger.info("Loading pre-quantized model with GPU support")
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="auto",
-            torch_dtype=
+            torch_dtype=compute_dtype,
             trust_remote_code=True,
             use_cache=model_config.get("use_cache", False)
         )
@@ -484,9 +488,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logger.info(f"Loading model with 4-bit quantization")
 
         # Create quantization config for GPU
+        bnb_compute_dtype = torch.bfloat16 if quant_config.get("bnb_4bit_compute_dtype", "float16") == "bfloat16" else torch.float16
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
-            bnb_4bit_compute_dtype=
+            bnb_4bit_compute_dtype=bnb_compute_dtype,
             bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
             bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
         )
@@ -496,10 +501,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             model_name,
             quantization_config=bnb_config,
             device_map="auto",
-            torch_dtype=
+            torch_dtype=compute_dtype,
             trust_remote_code=True,
             use_cache=model_config.get("use_cache", False),
-            attn_implementation=hardware_config.get("attn_implementation", "
+            attn_implementation=hardware_config.get("attn_implementation", "flash_attention_2")
         )
     else:
         # CPU fallback (or non-quantized GPU) mode
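The three dtype changes above follow one pattern: resolve a torch dtype from the config once, then pass it both as torch_dtype and as the 4-bit compute dtype, so the quantized matmuls and the unquantized layers agree. A self-contained sketch of that pattern; the config dicts and model id are stand-ins for what the script loads from its YAML, and flash_attention_2 assumes the flash-attn package plus an Ampere-or-newer GPU:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Stand-ins for the script's YAML-derived config sections.
hardware_config = {"bf16": True, "attn_implementation": "flash_attention_2"}
quant_config = {"bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True,
                "bnb_4bit_compute_dtype": "bfloat16"}

# One dtype decision, reused everywhere.
compute_dtype = torch.bfloat16 if hardware_config.get("bf16", False) else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=(torch.bfloat16
                            if quant_config.get("bnb_4bit_compute_dtype", "float16") == "bfloat16"
                            else torch.float16),
    bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
    bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True),
)

model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",  # illustrative; the script uses its configured model_name
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation=hardware_config.get("attn_implementation", "flash_attention_2"),
)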
@@ -571,14 +576,14 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         gpu_info = torch.cuda.get_device_properties(0)
         logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
 
-        # Check if it's an
-        if "
-            logger.info("Detected
-            per_device_train_batch_size = training_config.get("per_device_train_batch_size",
+        # Check if it's an A100 or high-memory GPU
+        if "A100" in gpu_info.name or "A10G" in gpu_info.name or gpu_info.total_memory > 40e9:
+            logger.info("Detected A100 GPU - optimizing for A100")
+            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 3)
         else:
             # Use a smaller batch size for other GPUs
             per_device_train_batch_size = 2
-            logger.info(f"Using conservative batch size for non-
+            logger.info(f"Using conservative batch size for non-A100 GPU: {per_device_train_batch_size}")
     else:
         # Use minimal batch size for CPU
         per_device_train_batch_size = 1
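With this commit's defaults, the A100 branch trains at an effective batch size of per_device_train_batch_size (3) x gradient_accumulation_steps (2) = 6 sequences per optimizer step, versus 4 on other GPUs. The detection logic is simple enough to lift into a hypothetical helper (the names are ours; the heuristic is the commit's):

import torch

def pick_batch_size(configured: int = 3) -> int:
    # Mirror of the heuristic above: name match or >40 GB VRAM means A100-class.
    if not torch.cuda.is_available():
        return 1  # minimal batch size for CPU
    props = torch.cuda.get_device_properties(0)
    if "A100" in props.name or "A10G" in props.name or props.total_memory > 40e9:
        return configured
    return 2  # conservative default for smaller GPUs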
@@ -587,9 +592,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Use full training parameters for pre-quantized models or GPU mode
     if is_pre_quantized or can_use_4bit or not is_running_in_space():
         num_train_epochs = training_config.get("num_train_epochs", 3)
-        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps",
-        fp16 = torch.cuda.is_available() and hardware_config.get("fp16",
-        bf16 = torch.cuda.is_available() and hardware_config.get("bf16",
+        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 2)
+        fp16 = torch.cuda.is_available() and hardware_config.get("fp16", False)
+        bf16 = torch.cuda.is_available() and hardware_config.get("bf16", True)
         # Disable gradient checkpointing for pre-quantized models as it can cause gradient issues
         gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True) and not is_pre_quantized
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
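The new defaults prefer bf16 (True) over fp16 (False), which fits the A100 target since bf16 needs compute capability 8.0 or higher. TrainingArguments rejects configurations where both flags end up True, so a config that enables both should be resolved before the arguments are built; a hedged guard sketch, with hardware_config standing in for the script's YAML values:

import torch

hardware_config = {"fp16": False, "bf16": True}  # stand-in

fp16 = torch.cuda.is_available() and hardware_config.get("fp16", False)
bf16 = torch.cuda.is_available() and hardware_config.get("bf16", True)
if fp16 and bf16:
    # Prefer bf16 where the GPU supports it; fall back to fp16 otherwise.
    fp16 = not torch.cuda.is_bf16_supported()
    bf16 = not fp16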
@@ -633,14 +638,15 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        eval_strategy=eval_strategy,
+        eval_strategy=eval_strategy,
         load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
         gradient_checkpointing=gradient_checkpointing,
-        dataloader_num_workers=dataloader_workers
+        dataloader_num_workers=dataloader_workers,
+        group_by_length=training_config.get("group_by_length", True)
     )
 
     # Create trainer with pre-tokenized collator
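Two notes on the TrainingArguments block. First, eval_strategy is the current parameter name in transformers (the older evaluation_strategy spelling was deprecated around v4.41), so this code assumes a reasonably recent release. Second, group_by_length buckets samples of similar length to cut padding waste, which pairs well with the pre-tokenized datasets this script targets. For reference, a hypothetical config fragment covering the keys this commit reads, populated with the commit's own fallback values (the bfloat16 compute dtype is our assumption, chosen to match bf16=True):

# Hypothetical config fragment; keys and fallback values come from this commit.
training_config = {
    "per_device_train_batch_size": 3,  # A100 path
    "gradient_accumulation_steps": 2,
    "num_train_epochs": 3,
    "dataloader_num_workers": 4,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "logging_first_step": True,
    "disable_tqdm": False,
    "group_by_length": True,
}
hardware_config = {
    "fp16": False,
    "bf16": True,
    "gradient_checkpointing": True,  # skipped for pre-quantized models anyway
    "attn_implementation": "flash_attention_2",
}
quant_config = {
    "bnb_4bit_compute_dtype": "bfloat16",  # assumption; the code falls back to float16
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
}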