Spaces:
requirements added
app.py
CHANGED
@@ -14,6 +14,7 @@ from transformers.models.auto.configuration_auto import CONFIG_MAPPING
 class Phi3Config(PretrainedConfig):
     model_type = "phi3"
 
+# Register our dummy config class for "phi3"
 CONFIG_MAPPING["phi3"] = Phi3Config
 
 # --- Standard imports ---
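The hunk above registers a minimal placeholder config so that `CONFIG_MAPPING` can resolve the "phi3" model type. A hedged sketch of the same idea, using the documented `AutoConfig.register` API and guarding against the case where the running transformers version already ships a native Phi-3 config (the guard and the API choice are assumptions of this sketch, not the script's code):

```python
from transformers import AutoConfig, PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING


class Phi3Config(PretrainedConfig):
    model_type = "phi3"  # Placeholder config carrying only the model_type tag


# Register only if "phi3" is not already known to this transformers version.
if "phi3" not in CONFIG_MAPPING:
    AutoConfig.register("phi3", Phi3Config)
```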
@@ -24,7 +25,7 @@ import torch
 # Import PEFT for parameter-efficient fine-tuning
 from peft import LoraConfig, get_peft_model
 
-# --- Setup directories ---
+# --- Setup local directories for cache, output, and offload ---
 cache_dir = "./cache"
 os.makedirs(cache_dir, exist_ok=True)
 output_dir = "./output/mibera-v1-merged"
@@ -32,16 +33,18 @@ os.makedirs(output_dir, exist_ok=True)
 offload_folder = "./offload"
 os.makedirs(offload_folder, exist_ok=True)
 
+# Set environment variables for caching to local, writable directories
 os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
 os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
 
+# Clear any existing JSON cache to force a fresh load
 json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
 if os.path.exists(json_cache_dir):
     shutil.rmtree(json_cache_dir)
 
 # --- Define paths ---
-dataset_path = 'datasets/finetune_dataset_ready.jsonl'
+dataset_path = 'datasets/finetune_dataset_ready.jsonl'  # Make sure this is the correct path to your merged JSONL file
 model_name = "microsoft/phi-4"
 HF_REPO = "ivxxdegen/mibera-v1-merged"
 
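Since the new comment stresses getting `dataset_path` right, a small pre-flight check can confirm the JSONL file exists and exposes the expected fields before any heavy model loading starts. This is not part of the script; the field names are taken from the preprocessing code further down:

```python
import json
import os

dataset_path = "datasets/finetune_dataset_ready.jsonl"

if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file {dataset_path} not found. Please upload it!")

# Peek at the first record; the training script later reads the "tweet" and "lore" fields.
with open(dataset_path, "r", encoding="utf-8") as f:
    first_record = json.loads(f.readline())
print("First record keys:", sorted(first_record.keys()))
```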
@@ -49,51 +52,48 @@ if not os.path.exists(dataset_path):
     print(f"Dataset file {dataset_path} not found. Please upload it!")
     exit(1)
 
+# --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
 
+# --- Split the dataset into train and evaluation subsets ---
 split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
 train_dataset = split_dataset["train"]
 eval_dataset = split_dataset["test"]
 
+# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
 print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-max_memory = {0: "10GiB"}
+max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     trust_remote_code=True,
-    device_map="auto",
+    device_map="auto",  # Automatically map layers between GPU and CPU
     max_memory=max_memory,
     offload_folder=offload_folder,
     low_cpu_mem_usage=True,
-    offload_state_dict=True
+    offload_state_dict=True  # Offload state dict from meta
 )
 torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 
-# ---
-
-for name, module in model.named_modules():
-    if "attn" in name or "query" in name or "value" in name:
-        print(name)
-# After inspecting the output, update target_modules below accordingly
-
-# --- Configure PEFT (LoRA) ---
-# Replace the target_modules list with the correct module names from the inspection step.
+# --- Integrate PEFT (LoRA) ---
+# Based on inspection, the model uses "qkv_proj" for query, key, and value projections.
 lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    target_modules=["
+    r=16,  # LoRA rank
+    lora_alpha=32,  # Scaling factor
+    target_modules=["qkv_proj"],  # Use "qkv_proj" based on model inspection
     lora_dropout=0.1,
     bias="none"
 )
 model = get_peft_model(model, lora_config)
 model.print_trainable_parameters()
 
-# --- Preprocess dataset ---
+# --- Preprocess the dataset ---
 def preprocess_function(examples):
+    # In batched mode, each field is a list.
     tweets = examples.get("tweet", [])
     lores = examples.get("lore", [])
     combined_texts = []
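The removed inspection loop is what justified the new `target_modules=["qkv_proj"]`. A self-contained sketch of that inspection, built on the "meta" device so no weights have to be downloaded (the name-filtering heuristic is an assumption; adjust it if the module names differ):

```python
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModelForCausalLM

# Build the architecture without materializing weights, then list the linear
# layers inside the attention blocks to confirm the LoRA target module names.
config = AutoConfig.from_pretrained("microsoft/phi-4", trust_remote_code=True)
with torch.device("meta"):
    skeleton = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

attention_linears = sorted({
    name.split(".")[-1]
    for name, module in skeleton.named_modules()
    if isinstance(module, nn.Linear) and "attn" in name
})
print(attention_linears)  # Phi-3-style checkpoints typically report ["o_proj", "qkv_proj"]
```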
@@ -107,6 +107,7 @@ tokenized_train = train_dataset.map(preprocess_function, batched=True)
 print("🔄 Preprocessing eval dataset...")
 tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
 
+# --- Add labels to tokenized data ---
 def add_labels(batch):
     batch["labels"] = batch["input_ids"].copy()
     return batch
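Copying `input_ids` into `labels` makes the Trainer compute a causal-LM loss over every token, padding included if any is present. An alternative worth noting (not what this script does) is to let a collator build the labels and mask padding with -100; this sketch assumes the `tokenizer` loaded earlier:

```python
from transformers import DataCollatorForLanguageModeling

# If the tokenizer has no pad token, set one first:
# tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pass it to the Trainer so padded positions get label -100 and are ignored by the loss:
# trainer = Trainer(..., data_collator=data_collator)
```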
@@ -116,18 +117,19 @@ tokenized_train = tokenized_train.map(add_labels, batched=True)
 print("🔄 Adding labels to eval dataset...")
 tokenized_eval = tokenized_eval.map(add_labels, batched=True)
 
+# --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",
+    evaluation_strategy="epoch",  # Future: use eval_strategy
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,
-    per_device_train_batch_size=1,
-    gradient_accumulation_steps=8,
-    fp16=True,
+    per_device_train_batch_size=1,  # Low batch size to minimize memory usage
+    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger effective batch size
+    fp16=True,  # Mixed precision training
 )
 
-# Initialize Trainer
+# --- Initialize Trainer ---
 trainer = Trainer(
     model=model,
     args=training_args,
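With `per_device_train_batch_size=1` and `gradient_accumulation_steps=8`, each optimizer step still sees 8 samples per device. A quick check of that arithmetic:

```python
per_device_train_batch_size = 1
gradient_accumulation_steps = 8

# Gradients are accumulated over 8 micro-batches before each optimizer update.
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch_size)  # -> 8
```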
@@ -136,7 +138,7 @@ trainer = Trainer(
     tokenizer=tokenizer,
 )
 
-# ---
+# --- Clear the existing model repository on Hugging Face (optional) ---
 api = HfApi()
 print(f"🗑️ Deleting previous version from Hugging Face: {HF_REPO}...")
 try:
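The actual deletion call sits on a line outside this hunk's context, so its exact arguments are not shown. A hedged sketch of what the try block presumably wraps, using the `huggingface_hub` client (the repo type and the re-create step are assumptions of this sketch):

```python
from huggingface_hub import HfApi, create_repo

HF_REPO = "ivxxdegen/mibera-v1-merged"
api = HfApi()

try:
    # Assumed call; the original line is not visible in the diff.
    api.delete_repo(repo_id=HF_REPO, repo_type="model")
except Exception as e:
    print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")

# Re-create the repository so later pushes have a target.
create_repo(HF_REPO, repo_type="model", exist_ok=True)
```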
@@ -144,9 +146,11 @@ try:
 except Exception as e:
     print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")
 
+# --- Start training ---
 print("🚀 Starting training...")
 trainer.train()
 
+# --- Save the fine-tuned model and tokenizer ---
 print("💾 Saving model and tokenizer...")
 model.save_pretrained(output_dir)
 tokenizer.save_pretrained(output_dir)
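As written, `model.save_pretrained(output_dir)` on the PEFT-wrapped model stores only the LoRA adapter weights, yet the target repository is named `mibera-v1-merged`. If a fully merged checkpoint is the goal, a hedged sketch (assuming the trained PEFT model can be fully materialized in memory once offloading is no longer needed):

```python
# Fold the LoRA deltas back into the base weights, then save the full model.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Optional upload of the merged checkpoint:
# merged_model.push_to_hub(HF_REPO)
# tokenizer.push_to_hub(HF_REPO)
```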