import os
import shutil

import pandas as pd
from datasets import Dataset

# Disable hf_transfer and set the CUDA allocator configuration to help with fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING


class Phi3Config(PretrainedConfig):
    model_type = "phi3"


# Register our dummy config class for "phi3". Newer transformers releases already
# ship a native Phi-3 config, so only register the fallback when it is missing.
if "phi3" not in CONFIG_MAPPING:
    CONFIG_MAPPING.register("phi3", Phi3Config)

# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Set up directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)

# --- Load the dataset using pandas ---
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)

# Flatten nested JSON columns: extract "content" from tweet and "response" from lore.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))

# Optionally drop the original nested columns:
df = df.drop(columns=["tweet", "lore"])

# Now convert the flattened DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)
# Expected columns are now: ['tweet_text', 'lore_text'] plus any others

# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    # padding=True below requires a pad token; fall back to EOS if none is set
    tokenizer.pad_token = tokenizer.eos_token

max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True,
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
# With gradient checkpointing and a frozen base model (LoRA), the embedding
# outputs must require grads or the backward pass produces no gradients.
model.enable_input_require_grads()

# --- Integrate PEFT (LoRA) ---
# Based on your inspection, we target "qkv_proj". Update if necessary.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# --- Preprocess the dataset ---
def preprocess_function(examples):
    combined_texts = []
    # Use the new flattened columns: "tweet_text" and "lore_text"
    tweets = examples.get("tweet_text", [])
    lores = examples.get("lore_text", [])
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)


print("🛠 Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("🛠 Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)


def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch


print("🛠 Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("🛠 Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)

# --- Set training arguments ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

print("🎓 Starting training...")
trainer.train()

print("💾 Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
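
# --- Optional: upload the fine-tuned adapter to the Hugging Face Hub ---
# HfApi and HF_REPO are imported/defined above but never used. The sketch below is
# one possible upload step, not part of the original training flow: it assumes you
# are already logged in (e.g. via `huggingface-cli login`) and have write access
# to HF_REPO. Flip the flag to enable it, or remove this block entirely.
PUSH_TO_HUB = False

if PUSH_TO_HUB:
    api = HfApi()
    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
    api.upload_folder(folder_path=output_dir, repo_id=HF_REPO, repo_type="model")
    print(f"🚀 Uploaded {output_dir} to https://huggingface.co/{HF_REPO}")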