import os
import shutil

# Disable hf_transfer and set the CUDA allocator configuration to help with fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# --- Set up local directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# Point the Hugging Face caches at local, writable directories.
# These must be set before importing datasets/transformers, which read them at import time.
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from huggingface_hub import HfApi

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model


# --- Register a fallback config class for the custom model type "phi3" ---
# Older transformers releases do not know the "phi3" model type; register a minimal
# placeholder config only when it is missing (CONFIG_MAPPING does not support item assignment).
class Phi3Config(PretrainedConfig):
    model_type = "phi3"


if "phi3" not in CONFIG_MAPPING:
    CONFIG_MAPPING.register("phi3", Phi3Config)

# Clear any existing JSON cache to force a fresh load
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = "datasets/finetune_dataset_ready.jsonl"  # Ensure this is the correct path
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

# Verify that the dataset file exists
if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)

# --- Load the dataset using pandas ---
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)

# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# --- Load the tokenizer and base model with trust_remote_code=True and offloading ---
print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Some tokenizers ship without a pad token; fall back to EOS so padding below works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",        # Automatically map layers between GPU and CPU
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True,  # Offload the state dict instead of materializing it in RAM
)
torch.cuda.empty_cache()

# --- Integrate PEFT (LoRA) ---
# Configure LoRA settings; adjust target_modules as appropriate for your model.
lora_config = LoraConfig(
    r=16,                                   # LoRA rank
    lora_alpha=32,                          # Scaling factor
    target_modules=["qkv_proj", "o_proj"],  # phi-4 uses the fused Phi-3 attention projections;
                                            # the usual q_proj/v_proj module names are not present
    lora_dropout=0.1,
    bias="none",
)

# Wrap the model with PEFT
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Optionally enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()
# Needed so gradient checkpointing works with a frozen base model:
# it forces the embedding outputs to require gradients
model.enable_input_require_grads()


# --- Preprocess the dataset ---
def preprocess_function(examples):
    tweets = examples.get("tweet", [])
    lores = examples.get("lore", [])
    combined_texts = []
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)


print("🛠 Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("🛠 Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)


# --- Add labels to the tokenized data ---
def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch


print("🛠 Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("🛠 Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)

# --- Set training arguments with memory-saving parameters ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",    # (Deprecated: use eval_strategy in newer versions)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Very low batch size to minimize memory usage
    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger batch size
    fp16=True,                      # Enable mixed-precision training
)

# --- Initialize the Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

# --- (Optional) Clear the existing model repository on Hugging Face ---
api = HfApi()
print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
try:
    api.delete_repo(HF_REPO, repo_type="model")
except Exception as e:
    print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")

# --- Start training ---
print("🎓 Starting training...")
trainer.train()

# --- Save the fine-tuned model and tokenizer ---
# Note: save_pretrained on a PEFT-wrapped model writes only the LoRA adapter weights
print("💾 Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
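
# --- (Optional) Merge the LoRA adapter and upload to the Hub ---
# The repo deletion above implies a "clean upload", but nothing is pushed and the PEFT
# save above writes only the adapter. This is a minimal sketch of one way to produce the
# merged checkpoint the output/repo names suggest; it assumes the merged model fits in
# memory and that HF_REPO is the intended destination. Adjust or remove as needed.
print("🔀 Merging LoRA weights and uploading...")
merged_model = model.merge_and_unload()   # Fold the LoRA weights into the base model
merged_model.save_pretrained(output_dir)  # Overwrite the adapter-only checkpoint
api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
merged_model.push_to_hub(HF_REPO)
tokenizer.push_to_hub(HF_REPO)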