ivxxdegen committed on
Commit 826e9d0 · 1 Parent(s): 860c901

requirements added

Files changed (1)
  1. app.py +28 -24
app.py CHANGED
@@ -14,6 +14,7 @@ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
 class Phi3Config(PretrainedConfig):
     model_type = "phi3"
 
+# Register our dummy config class for "phi3"
 CONFIG_MAPPING["phi3"] = Phi3Config
 
 # --- Standard imports ---
@@ -24,7 +25,7 @@ import torch
 # Import PEFT for parameter-efficient fine-tuning
 from peft import LoraConfig, get_peft_model
 
-# --- Setup directories ---
+# --- Setup local directories for cache, output, and offload ---
 cache_dir = "./cache"
 os.makedirs(cache_dir, exist_ok=True)
 output_dir = "./output/mibera-v1-merged"
@@ -32,16 +33,18 @@ os.makedirs(output_dir, exist_ok=True)
 offload_folder = "./offload"
 os.makedirs(offload_folder, exist_ok=True)
 
+# Set environment variables for caching to local, writable directories
 os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
 os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
 
+# Clear any existing JSON cache to force a fresh load
 json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
 if os.path.exists(json_cache_dir):
     shutil.rmtree(json_cache_dir)
 
 # --- Define paths ---
-dataset_path = 'datasets/finetune_dataset_ready.jsonl'
+dataset_path = 'datasets/finetune_dataset_ready.jsonl'  # Make sure this is the correct path to your merged JSONL file
 model_name = "microsoft/phi-4"
 HF_REPO = "ivxxdegen/mibera-v1-merged"
 
@@ -49,51 +52,48 @@ if not os.path.exists(dataset_path):
     print(f"Dataset file {dataset_path} not found. Please upload it!")
     exit(1)
 
+# --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
 
+# --- Split the dataset into train and evaluation subsets ---
 split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
 train_dataset = split_dataset["train"]
 eval_dataset = split_dataset["test"]
 
+# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
 print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-max_memory = {0: "10GiB"}
+max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     trust_remote_code=True,
-    device_map="auto",
+    device_map="auto",  # Automatically map layers between GPU and CPU
     max_memory=max_memory,
     offload_folder=offload_folder,
     low_cpu_mem_usage=True,
-    offload_state_dict=True
+    offload_state_dict=True  # Offload state dict from meta
 )
 torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 
-# --- Inspect model modules to determine correct target_modules for LoRA ---
-print("Inspecting model modules (filtering by 'attn', 'query', or 'value'):")
-for name, module in model.named_modules():
-    if "attn" in name or "query" in name or "value" in name:
-        print(name)
-# After inspecting the output, update target_modules below accordingly
-
-# --- Configure PEFT (LoRA) ---
-# Replace the target_modules list with the correct module names from the inspection step.
+# --- Integrate PEFT (LoRA) ---
+# Based on inspection, the model uses "qkv_proj" for query, key, and value projections.
 lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    target_modules=["q_proj", "v_proj"],  # <-- UPDATE THESE NAMES based on your model inspection
+    r=16,  # LoRA rank
+    lora_alpha=32,  # Scaling factor
+    target_modules=["qkv_proj"],  # Use "qkv_proj" based on model inspection
    lora_dropout=0.1,
    bias="none"
 )
 model = get_peft_model(model, lora_config)
 model.print_trainable_parameters()
 
-# --- Preprocess dataset ---
+# --- Preprocess the dataset ---
 def preprocess_function(examples):
+    # In batched mode, each field is a list.
     tweets = examples.get("tweet", [])
     lores = examples.get("lore", [])
     combined_texts = []
@@ -107,6 +107,7 @@ tokenized_train = train_dataset.map(preprocess_function, batched=True)
 print("🛠 Preprocessing eval dataset...")
 tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
 
+# --- Add labels to tokenized data ---
 def add_labels(batch):
     batch["labels"] = batch["input_ids"].copy()
     return batch
@@ -116,18 +117,19 @@ tokenized_train = tokenized_train.map(add_labels, batched=True)
 print("🛠 Adding labels to eval dataset...")
 tokenized_eval = tokenized_eval.map(add_labels, batched=True)
 
+# --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",
+    evaluation_strategy="epoch",  # Future: use eval_strategy
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,
-    per_device_train_batch_size=1,
-    gradient_accumulation_steps=8,
-    fp16=True,
+    per_device_train_batch_size=1,  # Low batch size to minimize memory usage
+    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger effective batch size
+    fp16=True,  # Mixed precision training
 )
 
-# Initialize Trainer
+# --- Initialize Trainer ---
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -136,7 +138,7 @@ trainer = Trainer(
     tokenizer=tokenizer,
 )
 
-# --- Optional: Clear existing model repo on HF Hub ---
+# --- Clear the existing model repository on Hugging Face (optional) ---
 api = HfApi()
 print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
 try:
@@ -144,9 +146,11 @@ try:
 except Exception as e:
     print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")
 
+# --- Start training ---
 print("🎓 Starting training...")
 trainer.train()
 
+# --- Save the fine-tuned model and tokenizer ---
 print("💾 Saving model and tokenizer...")
 model.save_pretrained(output_dir)
 tokenizer.save_pretrained(output_dir)
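
Side note on the LoRA change above: this commit drops the module-inspection loop from app.py and hard-codes target_modules=["qkv_proj"] based on a prior inspection. If those names ever need to be re-checked (for example after swapping the base model), a minimal standalone sketch along the lines of the removed code could look like the following; the model name and the module names mentioned in the comments are assumptions to verify against your own checkpoint, not guarantees of this commit.

    # Sketch only: re-derive the LoRA target_modules the way the removed inspection code did.
    # Assumes the same base model as app.py ("microsoft/phi-4"); adjust as needed.
    from transformers import AutoModelForCausalLM
    from peft import LoraConfig, get_peft_model

    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-4",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Collect the leaf names of every module that looks like a projection layer.
    # On Phi-3-style checkpoints this typically includes "qkv_proj", "o_proj",
    # "gate_up_proj", and "down_proj" (confirm on your own model).
    leaf_names = sorted({name.split(".")[-1]
                         for name, _ in model.named_modules()
                         if "proj" in name.split(".")[-1]})
    print("Projection modules found:", leaf_names)

    # target_modules must match these leaf names exactly; with a fused attention
    # projection, a single "qkv_proj" entry covers query, key, and value.
    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["qkv_proj"],
                             lora_dropout=0.1, bias="none")
    get_peft_model(model, lora_config).print_trainable_parameters()

This is also why the diff can replace the earlier ["q_proj", "v_proj"] guess with a single "qkv_proj" entry: the comment added in the commit states that query, key, and value share one fused projection in this model.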