ivxxdegen committed
Commit 860c901 · 1 Parent(s): 06204fd

requirements added

Files changed (1):
  app.py +150 -1
app.py CHANGED
@@ -1,3 +1,152 @@
import os
import shutil
import pandas as pd
from datasets import Dataset

# Disable hf_transfer and set the CUDA allocator configuration
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"

CONFIG_MAPPING["phi3"] = Phi3Config

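# A sketch of an alternative, assuming a reasonably recent transformers
# release: the public AutoConfig.register API achieves the same mapping
# without writing into the private CONFIG_MAPPING dict.
#
#   from transformers import AutoConfig
#   AutoConfig.register("phi3", Phi3Config)
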
# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Setup directories ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# Cache locations. Note that transformers and datasets read these variables
# when they are first imported, so ideally they would be set before the
# imports at the top of the file.
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

# Drop any stale JSON dataset cache so the JSONL file is re-processed.
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    raise SystemExit(1)

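# The pipeline below assumes one JSON object per line with string fields
# "tweet" and "lore" (the columns preprocess_function reads). A hypothetical
# example line:
#
#   {"tweet": "<tweet text>", "lore": "<background lore>"}
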
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)

# 90/10 train/eval split, seeded for reproducibility.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Causal-LM tokenizers often ship without a pad token; padding=True below
    # requires one, so fall back to EOS.
    tokenizer.pad_token = tokenizer.eos_token

# Cap GPU 0 at 10GiB and let accelerate offload the remainder.
max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing
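
# Sketch (assumption): the "10GiB" budget above is hard-coded. To derive it
# from the actual device instead, torch exposes free/total byte counts:
#
#   free_bytes, total_bytes = torch.cuda.mem_get_info(0)
#   max_memory = {0: f"{int(free_bytes * 0.9) // 2**30}GiB", "cpu": "32GiB"}
#
# Leaving ~10% headroom reduces fragmentation-driven OOMs while loading.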

# --- Inspect model modules to determine correct target_modules for LoRA ---
print("Inspecting model modules (filtering by 'attn', 'query', or 'value'):")
for name, module in model.named_modules():
    if "attn" in name or "query" in name or "value" in name:
        print(name)
# After inspecting the output, update target_modules below accordingly
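# A tighter inspection sketch: LoRA attaches to nn.Linear modules, so listing
# the distinct Linear layer names is usually more informative than substring
# matching:
#
#   import torch.nn as nn
#   linear_names = {n.split(".")[-1] for n, m in model.named_modules()
#                   if isinstance(m, nn.Linear)}
#   print(sorted(linear_names))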

# --- Configure PEFT (LoRA) ---
# Replace the target_modules list with the correct module names from the
# inspection step. For Phi-3-family checkpoints the attention projections are
# typically fused into a single "qkv_proj" rather than separate q_proj/v_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # <-- UPDATE THESE NAMES based on your model inspection
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Preprocess dataset ---
def preprocess_function(examples):
    tweets = examples.get("tweet", [])
    lores = examples.get("lore", [])
    combined_texts = []
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)

print("🛠 Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("🛠 Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# Causal-LM objective: the labels are the input_ids themselves.
def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch
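# Refinement sketch: copying input_ids verbatim also copies pad tokens into
# the labels, so padding contributes to the loss. The usual fix is to mask
# pads with -100, which the loss function ignores:
#
#   def add_labels(batch):
#       pad_id = tokenizer.pad_token_id
#       batch["labels"] = [
#           [tok if tok != pad_id else -100 for tok in ids]
#           for ids in batch["input_ids"]
#       ]
#       return batch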

print("🛠 Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("🛠 Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per device
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
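
# Alternative sketch: transformers ships a collator that pads and builds
# causal-LM labels (with pads already masked to -100) at batch time, which
# would replace both padding=True above and the add_labels pass:
#
#   from transformers import DataCollatorForLanguageModeling
#   trainer = Trainer(
#       model=model,
#       args=training_args,
#       train_dataset=tokenized_train,
#       eval_dataset=tokenized_eval,
#       tokenizer=tokenizer,
#       data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
#   )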

# --- Optional: Clear existing model repo on HF Hub ---
api = HfApi()
print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
try:
    api.delete_repo(HF_REPO, repo_type="model")
except Exception as e:
    print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")

print("🎓 Starting training...")
trainer.train()

# Note: save_pretrained on a PEFT-wrapped model writes only the LoRA adapter
# weights, not a full merged checkpoint, despite the "merged" directory name.
print("💾 Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
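
# The script deletes the Hub repo above but never re-uploads, so the result
# lives only in ./output. A sketch of the missing upload step, assuming a
# write token is configured (e.g. via HF_TOKEN):
#
#   api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
#   model.push_to_hub(HF_REPO)
#   tokenizer.push_to_hub(HF_REPO)
#
# For a genuinely merged checkpoint, merge the adapter into the base weights
# first with model.merge_and_unload() and save the returned model.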