import os
import shutil
import pandas as pd
from datasets import Dataset

# Disable hf_transfer and set CUDA allocation configuration
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"

CONFIG_MAPPING["phi3"] = Phi3Config

# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Setup directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# Set environment variables for caching to local directories
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'  # Your merged dataset file
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    raise SystemExit(1)

# --- Load the dataset using pandas ---
print("πŸ“₯ Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)

# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("πŸ“₯ Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",         # Automatically map layers between GPU and CPU
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True    # Offload state dict from meta
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()

# --- Force materialize all parameters by re-loading state dict ---
print("Materializing model parameters...")
state = model.state_dict()
model.load_state_dict(state)
print("Model parameters are fully materialized.")

# --- Integrate PEFT (LoRA) ---
# Inspect your model's modules to determine the right target modules. 
# Based on your previous inspection, use "qkv_proj" if that's the correct layer.
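# For example, one optional way to list candidate Linear modules (a sketch; the
# exact names depend on the phi-4 implementation, so treat "qkv_proj" as an
# assumption to verify rather than a guaranteed match):
#     for name, module in model.named_modules():
#         if isinstance(module, torch.nn.Linear):
#             print(name)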
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],  # Update this list based on your model inspection
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Preprocess the dataset ---
def preprocess_function(examples):
    tweets = examples.get("tweet", [])
    lores = examples.get("lore", [])
    combined_texts = []
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)

print("πŸ›  Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("πŸ›  Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch
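# Note: for causal language modeling, Hugging Face models shift the labels
# internally, so copying input_ids unchanged is the standard way to build labels here.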

print("πŸ›  Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("πŸ›  Adding labels to eval dataset...")
tokenized_eval = eval_dataset.map(add_labels, batched=True)

# --- Set training arguments with memory-saving parameters ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)
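# With per_device_train_batch_size=1 and gradient_accumulation_steps=8, the
# effective batch size per optimizer step is 1 * 8 = 8.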

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

print("πŸŽ“ Starting training...")
trainer.train()

print("πŸ’Ύ Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
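
# (Optional sketch) Because the model was wrapped with PEFT, save_pretrained above
# stores a LoRA adapter rather than full model weights. Assuming the same base
# model, one way to reuse the adapter later could be:
#     from peft import PeftModel
#     base = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
#     finetuned = PeftModel.from_pretrained(base, output_dir)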