import os
import shutil
import pandas as pd
from datasets import Dataset
# Disable hf_transfer and set CUDA allocation configuration
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
class Phi3Config(PretrainedConfig):
    model_type = "phi3"
CONFIG_MAPPING["phi3"] = Phi3Config
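# Note (sketch, not part of the original flow): a less invasive alternative to
# mutating CONFIG_MAPPING is the public registration API, assuming a transformers
# release that does not already ship a native "phi3" config (registering an
# existing model type raises a ValueError):
#
#     from transformers import AutoConfig
#     AutoConfig.register("phi3", Phi3Config)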
# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch
# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model
# --- Setup directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)
# Set environment variables for caching to local directories
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)
# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl' # Your merged dataset file
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"
if not os.path.exists(dataset_path):
print(f"Dataset file {dataset_path} not found. Please upload it!")
exit(1)
# --- Load the dataset using pandas ---
print("π₯ Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)
# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("π₯ Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
max_memory = {0: "10GiB"} # Limit GPU 0 usage to 10GiB; adjust as needed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",        # Automatically map layers between GPU and CPU
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True   # Offload state dict from meta
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
# --- Force materialize all parameters by re-loading state dict ---
print("Materializing model parameters...")
state = model.state_dict()
model.load_state_dict(state)
print("Model parameters are fully materialized.")
# --- Integrate PEFT (LoRA) ---
# Inspect your model's modules to determine the right target modules.
# Based on your previous inspection, use "qkv_proj" if that's the correct layer.
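# A quick verification sketch (commented out so it does not clutter the run):
# print the names of sub-modules containing "proj" to confirm whether this
# checkpoint exposes a fused "qkv_proj" or separate q_proj/k_proj/v_proj layers.
#
#     for name, _ in model.named_modules():
#         if "proj" in name:
#             print(name)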
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],  # Update this list based on your model inspection
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# --- Preprocess the dataset ---
def preprocess_function(examples):
    tweets = examples.get("tweet", [])
    lores = examples.get("lore", [])
    combined_texts = []
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)
print("π Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("π Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
def add_labels(batch):
batch["labels"] = batch["input_ids"].copy()
return batch
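# Alternative sketch (an assumption, not part of the original pipeline): instead
# of copying input_ids into labels by hand, a causal-LM data collator can build
# the labels at batch time and mask padding tokens out of the loss, assuming the
# tokenizer defines a pad token:
#
#     from transformers import DataCollatorForLanguageModeling
#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
#     # then pass data_collator=data_collator to the Trainer below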
print("π Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("π Adding labels to eval dataset...")
tokenized_eval = eval_dataset.map(add_labels, batched=True)
# --- Set training arguments with memory-saving parameters ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)
# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
print("π Starting training...")
trainer.train()
print("πΎ Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
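# Optional follow-up (sketch): HF_REPO is defined above but never used. Assuming
# you are authenticated (e.g. via `huggingface-cli login`) and the repo exists or
# can be created, the adapter and tokenizer could be pushed to the Hub, or the
# LoRA weights merged into the base model first:
#
#     model.push_to_hub(HF_REPO)
#     tokenizer.push_to_hub(HF_REPO)
#
#     # Or merge the adapter into the base weights before uploading:
#     merged_model = model.merge_and_unload()
#     merged_model.push_to_hub(HF_REPO)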