import os
import shutil
import pandas as pd
from datasets import Dataset
# Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
class Phi3Config(PretrainedConfig):
    model_type = "phi3"
# Register our dummy config class for "phi3"
CONFIG_MAPPING["phi3"] = Phi3Config
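# Note: recent transformers releases ship a native Phi3Config and expose
# AutoConfig.register(...) as the supported registration hook, so this patch is
# likely only needed on older versions that predate phi3 support.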
# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch
# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model
# --- Setup directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)
# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"
if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)
# --- Load the dataset using pandas ---
print("π₯ Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
# Flatten nested JSON columns: extract "content" from tweet and "response" from lore.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
# Optionally drop the original nested columns:
df = df.drop(columns=["tweet", "lore"])
# Now convert the flattened DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)
# Expected columns are now: ['tweet_text', 'lore_text'] plus any others
# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("π₯ Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
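# Some causal-LM tokenizers ship without a pad token; fall back to EOS so the
# padding=True call in preprocess_function below cannot fail.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token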
max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True,
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
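# Gradient checkpointing and the generation KV cache don't mix at train time;
# disabling the cache avoids the related warning (assumes the loaded config
# exposes a use_cache flag, as phi-style configs do).
model.config.use_cache = False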
# --- Integrate PEFT (LoRA) ---
# Based on your inspection, we target "qkv_proj". Update if necessary.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
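# With gradient checkpointing and a frozen base model, gradients can fail to
# propagate to the LoRA adapters; making the embedding outputs require grad is
# the usual workaround (a hedged addition -- harmless if not strictly needed here).
model.enable_input_require_grads()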
# --- Preprocess the dataset ---
def preprocess_function(examples):
    combined_texts = []
    # Use the new flattened columns: "tweet_text" and "lore_text"
    tweets = examples.get("tweet_text", [])
    lores = examples.get("lore_text", [])
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)
print("π Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("π Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
def add_labels(batch):
    # Use input_ids as labels; mask pad positions with -100 so they are ignored by the loss.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [[t if t != pad_id else -100 for t in ids] for ids in batch["input_ids"]]
    return batch
print("π Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("π Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)
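# Quick sanity check (assumes at least one training example exists): decode the
# start of the first tokenized example to confirm the [PERSONALITY]/[KNOWLEDGE]
# template survived preprocessing.
print(tokenizer.decode(tokenized_train[0]["input_ids"][:64]))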
# --- Set training arguments ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # Deprecated: use eval_strategy in newer transformers versions
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)
# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
print("π Starting training...")
trainer.train()
print("πΎ Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
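# Note that save_pretrained on a PEFT model stores only the LoRA adapter, not the
# full model. HF_REPO and HfApi are defined above but never used; if the goal is
# to publish merged weights (as the "-merged" output directory name suggests), a
# minimal sketch could look like the commented lines below -- assumptions: you are
# logged in to the Hub and have enough memory to materialize the merged model.
# merged_model = model.merge_and_unload()      # fold the LoRA deltas into the base weights
# merged_model.save_pretrained(output_dir)
# merged_model.push_to_hub(HF_REPO)
# tokenizer.push_to_hub(HF_REPO)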