ivxxdegen committed
Commit 969e90e
1 Parent(s): 37078d1

requirements added

Files changed (1)
  1. app.py +19 -13
app.py CHANGED
@@ -3,7 +3,7 @@ import shutil
 import pandas as pd
 from datasets import Dataset
 
-# Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
+# Disable hf_transfer and set CUDA allocation configuration
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 
@@ -14,7 +14,6 @@ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
 class Phi3Config(PretrainedConfig):
     model_type = "phi3"
 
-# Register our dummy config class for "phi3"
 CONFIG_MAPPING["phi3"] = Phi3Config
 
 # --- Standard imports ---
@@ -44,7 +43,7 @@ if os.path.exists(json_cache_dir):
     shutil.rmtree(json_cache_dir)
 
 # --- Define paths ---
-dataset_path = 'datasets/finetune_dataset_ready.jsonl'  # Path to your merged JSONL file
+dataset_path = 'datasets/finetune_dataset_ready.jsonl'  # Your merged dataset file
 model_name = "microsoft/phi-4"
 HF_REPO = "ivxxdegen/mibera-v1-merged"
 
@@ -73,17 +72,24 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",  # Automatically map layers between GPU and CPU
     max_memory=max_memory,
     offload_folder=offload_folder,
-    low_cpu_mem_usage=True
+    low_cpu_mem_usage=True,
+    offload_state_dict=True  # Offload state dict from meta
 )
 torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 
+# --- Force materialize all parameters by re-loading state dict ---
+print("Materializing model parameters...")
+state = model.state_dict()
+model.load_state_dict(state)
+print("Model parameters are fully materialized.")
+
 # --- Integrate PEFT (LoRA) ---
-# Inspect your model's modules (run a snippet if needed) to determine the correct target modules.
-# Based on your inspection, it seems that "qkv_proj" is available. Update if necessary.
+# Inspect your model's modules to determine the right target modules.
+# Based on your previous inspection, use "qkv_proj" if that's the correct layer.
 lora_config = LoraConfig(
-    r=16,  # LoRA rank
-    lora_alpha=32,  # Scaling factor
+    r=16,
+    lora_alpha=32,
     target_modules=["qkv_proj"],  # Update this list based on your model inspection
     lora_dropout=0.1,
     bias="none"
@@ -113,18 +119,18 @@ def add_labels(batch):
 print("🛠 Adding labels to train dataset...")
 tokenized_train = tokenized_train.map(add_labels, batched=True)
 print("🛠 Adding labels to eval dataset...")
-tokenized_eval = tokenized_eval.map(add_labels, batched=True)
+tokenized_eval = eval_dataset.map(add_labels, batched=True)
 
 # --- Set training arguments with memory-saving parameters ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
+    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future)
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,
-    per_device_train_batch_size=1,  # Very low batch size to reduce memory usage
-    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger effective batch size
-    fp16=True,  # Enable mixed precision training
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=8,
+    fp16=True,
 )
 
 # --- Initialize Trainer ---
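
The from_pretrained hunk above relies on max_memory and offload_folder, which are defined earlier in app.py and not shown in this diff. A minimal sketch of what that setup typically looks like; the budget values and folder name here are assumptions, not the file's actual figures:

import os

# Assumed per-device memory budgets for device_map="auto" placement;
# keys are GPU indices or "cpu", values are human-readable size strings.
max_memory = {0: "14GiB", "cpu": "48GiB"}

# Assumed scratch directory for weights offloaded to CPU/disk.
offload_folder = "offload"
os.makedirs(offload_folder, exist_ok=True)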
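The LoRA comments in the diff ask you to inspect the model's modules before settling on target_modules. One way to list the candidate linear-layer names, assuming model is the already-loaded AutoModelForCausalLM from above:

import torch.nn as nn

# Collect the leaf names of all nn.Linear modules,
# e.g. "qkv_proj" as referenced in the LoRA config above.
linear_names = set()
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        linear_names.add(name.split(".")[-1])
print(sorted(linear_names))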
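The diff stops at the LoraConfig fields, so the adapter wiring itself is not visible here. If app.py follows the usual PEFT pattern, it looks roughly like this (a sketch, not the file's actual code):

from peft import get_peft_model

# Wrap the offloaded base model with the LoRA adapter described by lora_config.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # sanity check: only the LoRA weights should be trainable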
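The last hunk's header references def add_labels(batch) without showing its body. For causal-LM fine-tuning this helper usually just mirrors input_ids into labels; a hedged sketch under that assumption:

def add_labels(batch):
    # For causal language modeling, the labels are the input tokens themselves;
    # the Trainer shifts them internally when computing the loss.
    batch["labels"] = [list(ids) for ids in batch["input_ids"]]
    return batch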
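The diff ends at the # --- Initialize Trainer --- marker. A sketch of the initialization that plausibly follows, reusing the variables from the hunks above (the dataset variable names are assumptions):

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,  # needed because evaluation_strategy="epoch"
)
trainer.train()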