import os
import shutil
import pandas as pd
from datasets import Dataset

# Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"

# Register our dummy config class for "phi3"
CONFIG_MAPPING["phi3"] = Phi3Config
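# Note: recent transformers releases (roughly >= 4.40) ship native Phi-3 support, so
# "phi3" may already be registered and this dummy config is only a fallback for older
# versions. If registration is needed, the supported API is
# AutoConfig.register("phi3", Phi3Config) rather than writing into CONFIG_MAPPING directly.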
# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch
# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Setup directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
# Clear any existing JSON cache
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)
# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)
# --- Load the dataset using pandas ---
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
# Flatten nested JSON columns: extract "content" from tweet and "response" from lore.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
# Optionally drop the original nested columns:
df = df.drop(columns=["tweet", "lore"])
# Now convert the flattened DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)
# Expected columns are now: ['tweet_text', 'lore_text'] plus any others
# --- Split the dataset into train and evaluation subsets ---
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
# --- Load the tokenizer and model with trust_remote_code=True and offloading ---
print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
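# Precaution: some causal-LM tokenizers ship without a pad token, which would break
# padding=True during preprocessing; fall back to the EOS token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token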
max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
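# Note: combining gradient checkpointing with LoRA (where most base weights are frozen)
# can raise "element 0 of tensors does not require grad"; if that happens here, the usual
# remedy is to make the input embeddings require gradients:
# model.enable_input_require_grads()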
# --- Integrate PEFT (LoRA) ---
# Based on your inspection, we target "qkv_proj". Update if necessary.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
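# Note: adding task_type="CAUSAL_LM" to the LoraConfig above is generally recommended so
# PEFT wraps the model as a causal-LM adapter; "qkv_proj" matches the fused attention
# projection used by phi3-style checkpoints.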
# --- Preprocess the dataset ---
def preprocess_function(examples):
    combined_texts = []
    # Use the new flattened columns: "tweet_text" and "lore_text"
    tweets = examples.get("tweet_text", [])
    lores = examples.get("lore_text", [])
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)
print("π Preprocessing train dataset...") | |
tokenized_train = train_dataset.map(preprocess_function, batched=True) | |
print("π Preprocessing eval dataset...") | |
tokenized_eval = eval_dataset.map(preprocess_function, batched=True) | |
def add_labels(batch): | |
batch["labels"] = batch["input_ids"].copy() | |
return batch | |
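# Note: copying input_ids verbatim also computes loss on padding tokens; a common
# refinement is to mask pad positions with -100, which the loss function ignores, e.g.:
# batch["labels"] = [
#     [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
#     for seq in batch["input_ids"]
# ]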
print("π Adding labels to train dataset...") | |
tokenized_train = tokenized_train.map(add_labels, batched=True) | |
print("π Adding labels to eval dataset...") | |
tokenized_eval = tokenized_eval.map(add_labels, batched=True) | |
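# Alternative (not used here): transformers' DataCollatorForLanguageModeling with mlm=False
# pads each batch and builds the labels on the fly, replacing the manual add_labels step:
# from transformers import DataCollatorForLanguageModeling
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# ...and then pass data_collator=data_collator to the Trainer below.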
# --- Set training arguments ---
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in newer versions)
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
)
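# Effective batch size per optimizer step = per_device_train_batch_size (1)
# * gradient_accumulation_steps (8) = 8 sequences per device.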
# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

print("🚀 Starting training...")
trainer.train()

print("💾 Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
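# HfApi and HF_REPO are set up above but never used; a minimal sketch for uploading the
# saved adapter and tokenizer (assumes you are authenticated, e.g. via `huggingface-cli login`
# or an HF_TOKEN environment variable):
# api = HfApi()
# api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
# api.upload_folder(folder_path=output_dir, repo_id=HF_REPO, repo_type="model")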