import os
import shutil
import sys

# Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# Cache locations must be exported before datasets/transformers are imported,
# since both libraries read these variables at import time.
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

import pandas as pd
from datasets import Dataset

# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"

# Register the dummy config for "phi3" only if this transformers build does
# not already ship one (newer releases support phi3 natively).
if "phi3" not in CONFIG_MAPPING:
    CONFIG_MAPPING["phi3"] = Phi3Config

# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Setup output and offload directories (cache env vars were set above) ---
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    sys.exit(1)

# --- Load the dataset using pandas ---
print("πŸ“₯ Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
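# For reference, each JSONL record is assumed to look roughly like this
# (field values here are hypothetical):
#   {"tweet": {"content": "..."}, "lore": {"response": "..."}}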

# Flatten nested JSON columns: extract "content" from tweet and "response" from lore.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))

# Optionally drop the original nested columns:
df = df.drop(columns=["tweet", "lore"])

# Now convert the flattened DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)
# Expected columns are now: ['tweet_text', 'lore_text'] plus any others

# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("πŸ“₯ Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The tokenizer may not define a pad token; fall back to EOS so that the
# padding in preprocess_function works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Cap GPU 0 at 10 GiB; accelerate offloads whatever does not fit to disk.
max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
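
# Optional sanity check (not part of the original flow): list the distinct
# linear projection-layer names so the LoRA target below can be confirmed
# when switching base models, since these names vary between architectures.
proj_names = {name.split(".")[-1] for name, m in model.named_modules()
              if isinstance(m, torch.nn.Linear) and name.endswith("proj")}
print("Projection layers found:", proj_names)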

# --- Integrate PEFT (LoRA) ---
# Based on your inspection, we target "qkv_proj". Update if necessary.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Preprocess the dataset ---
def preprocess_function(examples):
    combined_texts = []
    # Use the new flattened columns: "tweet_text" and "lore_text"
    tweets = examples.get("tweet_text", [])
    lores = examples.get("lore_text", [])
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)

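# Each training example is therefore a single string of the form:
#   "[PERSONALITY] <tweet text>\n[KNOWLEDGE] <lore text>"
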
print("πŸ›  Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("πŸ›  Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

def add_labels(batch):
    # Copy input_ids to labels, replacing pad-token positions with -100 so
    # the cross-entropy loss ignores padding.
    batch["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in ids]
        for ids in batch["input_ids"]
    ]
    return batch

print("πŸ›  Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("πŸ›  Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)

# --- Set training arguments ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)
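
# With per_device_train_batch_size=1 and gradient_accumulation_steps=8, the
# effective batch size is 1 * 8 = 8 sequences per optimizer update.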

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

print("πŸŽ“ Starting training...")
trainer.train()

print("πŸ’Ύ Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
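
# HF_REPO and HfApi are defined above but never used; a minimal, hypothetical
# upload step could look like this (assumes prior `huggingface-cli login`):
api = HfApi()
api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
api.upload_folder(folder_path=output_dir, repo_id=HF_REPO)
print(f"πŸš€ Uploaded to https://huggingface.co/{HF_REPO}")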