ivxxdegen committed on
Commit
1673a4a
·
1 Parent(s): 969e90e
Files changed (2)
  1. app_bak.py +84 -21
  2. datasets/finetune_dataset_ready.jsonl +0 -0
app_bak.py CHANGED
@@ -3,27 +3,37 @@ import shutil
import pandas as pd
from datasets import Dataset

+ # Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
+
# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"
-
+
# Register our dummy config class for "phi3"
CONFIG_MAPPING["phi3"] = Phi3Config

# --- Continue with standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
+ import torch
+
+ # Import PEFT for parameter-efficient fine-tuning
+ from peft import LoraConfig, get_peft_model

- # --- Setup local directories for cache and output ---
+ # --- Setup local directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
+ offload_folder = "./offload"
+ os.makedirs(offload_folder, exist_ok=True)

- # Set environment variables to force caching to local, writable directories
+ # Set environment variables for caching to local, writable directories
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
@@ -43,42 +53,95 @@ if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)

- # --- Load the dataset using pandas to bypass caching issues ---
+ # --- Load the dataset using pandas ---
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
+ print("Dataset columns:", dataset.column_names)

- # --- Load the tokenizer and model with trust_remote_code=True ---
- print("📥 Loading tokenizer and model with trust_remote_code=True...")
+ # --- Split the dataset into train and evaluation subsets ---
+ split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
+ train_dataset = split_dataset["train"]
+ eval_dataset = split_dataset["test"]
+
+ # --- Load the tokenizer and base model with trust_remote_code=True and offloading ---
+ print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+ max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     trust_remote_code=True,
+     device_map="auto",               # Automatically map layers between GPU and CPU
+     max_memory=max_memory,
+     offload_folder=offload_folder,
+     low_cpu_mem_usage=True,
+     offload_state_dict=True          # Offload state dict from meta
+ )
+ torch.cuda.empty_cache()
+
+ # --- Integrate PEFT (LoRA) ---
+ # Configure LoRA settings; adjust target_modules as appropriate for your model.
+ lora_config = LoraConfig(
+     r=16,                                 # LoRA rank
+     lora_alpha=32,                        # Scaling factor
+     target_modules=["q_proj", "v_proj"],  # Typical target modules for transformer models
+     lora_dropout=0.1,
+     bias="none"
+ )
+ # Wrap the model with PEFT
+ model = get_peft_model(model, lora_config)
+ model.print_trainable_parameters()
+
+ # Optionally enable gradient checkpointing to save memory
+ model.gradient_checkpointing_enable()

# --- Preprocess the dataset ---
def preprocess_function(examples):
-     return tokenizer(examples['text'], truncation=True, padding=True)
-
- print("🛠 Preprocessing dataset...")
- tokenized_dataset = dataset.map(preprocess_function, batched=True)
-
- # --- Set training arguments ---
+     tweets = examples.get("tweet", [])
+     lores = examples.get("lore", [])
+     combined_texts = []
+     for tweet, lore in zip(tweets, lores):
+         combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
+         combined_texts.append(combined_text)
+     return tokenizer(combined_texts, truncation=True, padding=True)
+
+ print("🛠 Preprocessing train dataset...")
+ tokenized_train = train_dataset.map(preprocess_function, batched=True)
+ print("🛠 Preprocessing eval dataset...")
+ tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
+
+ # --- Add labels to tokenized data ---
+ def add_labels(batch):
+     batch["labels"] = batch["input_ids"].copy()
+     return batch
+
+ print("🛠 Adding labels to train dataset...")
+ tokenized_train = tokenized_train.map(add_labels, batched=True)
+ print("🛠 Adding labels to eval dataset...")
+ tokenized_eval = tokenized_eval.map(add_labels, batched=True)
+
+ # --- Set training arguments with memory-saving parameters ---
training_args = TrainingArguments(
-     output_dir=output_dir,            # Where to save the fine-tuned model
-     evaluation_strategy="epoch",      # Evaluate at each epoch
-     logging_dir="./logs",             # Directory for logs
-     logging_steps=500,                # Log every 500 steps
-     num_train_epochs=3,               # Number of training epochs
-     per_device_train_batch_size=8,    # Batch size per device
+     output_dir=output_dir,
+     evaluation_strategy="epoch",      # (Deprecated: use eval_strategy in future versions)
+     logging_dir="./logs",
+     logging_steps=500,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,    # Very low batch size to minimize memory usage
+     gradient_accumulation_steps=8,    # Accumulate gradients to simulate a larger batch size
+     fp16=True,                        # Enable mixed precision training
)

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
-     train_dataset=tokenized_dataset,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

- # --- Clear the existing model repository on Hugging Face ---
+ # --- (Optional) Clear the existing model repository on Hugging Face ---
api = HfApi()
print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
try:
datasets/finetune_dataset_ready.jsonl CHANGED
The diff for this file is too large to render. See raw diff
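A note on the LoRA settings in the app_bak.py diff above: target_modules=["q_proj", "v_proj"] assumes the checkpoint exposes separate query and value projection layers, while some phi3-style checkpoints ship a fused qkv_proj instead. The sketch below is not part of the commit; it only lists the projection modules of the loaded base model (the model variable from app_bak.py, inspected before get_peft_model is applied) so the target names can be confirmed.

# Sketch (not in the commit): list candidate projection layers on the base model
# so target_modules can be chosen to match what actually exists.
proj_names = sorted({name.split(".")[-1]                  # keep only the leaf module name
                     for name, _ in model.named_modules()
                     if name.split(".")[-1].endswith("proj")})
print("Projection modules found:", proj_names)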
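The add_labels step in the diff copies input_ids into labels so the Trainer computes a causal language-modeling loss; because padding is applied at map time, padding tokens also end up in the labels. An alternative, not what this commit does, is transformers' DataCollatorForLanguageModeling with mlm=False, which pads each batch on the fly and builds labels with padding positions masked to -100. A minimal sketch, reusing the model, tokenizer, training_args, and tokenized datasets defined in app_bak.py and assuming the tokenizer has a pad token:

# Sketch (not in the commit): let a causal-LM collator handle padding and labels
# per batch instead of the explicit add_labels map step.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,   # tokenized without the add_labels step
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,     # pads each batch and sets labels, masking pads to -100
)

Either way, per_device_train_batch_size=1 combined with gradient_accumulation_steps=8 keeps the effective batch size at 8 sequences per optimizer step.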
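The output directory is named ./output/mibera-v1-merged, but the diff only wraps the base model in a LoRA adapter; nothing in this commit merges the adapter back or uploads the result. The sketch below shows one way that could be done after training, assuming the model, tokenizer, output_dir, api, and HF_REPO names from app_bak.py and enough memory to materialize the merged weights; merge_and_unload() is PEFT's call for folding LoRA weights into the base model. This is an assumption about intent, not something the commit performs.

# Sketch (not in the commit): after training, merge the LoRA adapter into the
# base weights and upload the result to the Hub repo referenced by HF_REPO.
trainer.train()

merged_model = model.merge_and_unload()     # fold the LoRA deltas into the base model
merged_model.save_pretrained(output_dir)    # "./output/mibera-v1-merged"
tokenizer.save_pretrained(output_dir)

api.upload_folder(folder_path=output_dir, repo_id=HF_REPO, repo_type="model")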