ivxxdegen committed on
Commit
1673a4a
·
1 Parent(s): 969e90e
Files changed (2)
  1. app_bak.py +84 -21
  2. datasets/finetune_dataset_ready.jsonl +0 -0
app_bak.py CHANGED
@@ -3,27 +3,37 @@ import shutil
import pandas as pd
from datasets import Dataset

+ # Disable hf_transfer and set CUDA allocation configuration to help with fragmentation
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
+
# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"
-
+
# Register our dummy config class for "phi3"
CONFIG_MAPPING["phi3"] = Phi3Config

# --- Continue with standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
+ import torch
+
+ # Import PEFT for parameter-efficient fine-tuning
+ from peft import LoraConfig, get_peft_model

- # --- Setup local directories for cache and output ---
+ # --- Setup local directories for cache, output, and offload ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
+ offload_folder = "./offload"
+ os.makedirs(offload_folder, exist_ok=True)

- # Set environment variables to force caching to local, writable directories
+ # Set environment variables for caching to local, writable directories
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")
@@ -43,42 +53,95 @@ if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    exit(1)

- # --- Load the dataset using pandas to bypass caching issues ---
+ # --- Load the dataset using pandas ---
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
+ print("Dataset columns:", dataset.column_names)

- # --- Load the tokenizer and model with trust_remote_code=True ---
- print("📥 Loading tokenizer and model with trust_remote_code=True...")
+ # --- Split the dataset into train and evaluation subsets ---
+ split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
+ train_dataset = split_dataset["train"]
+ eval_dataset = split_dataset["test"]
+
+ # --- Load the tokenizer and base model with trust_remote_code=True and offloading ---
+ print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+ max_memory = {0: "10GiB"}  # Limit GPU 0 usage to 10GiB; adjust as needed
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     trust_remote_code=True,
+     device_map="auto",               # Automatically map layers between GPU and CPU
+     max_memory=max_memory,
+     offload_folder=offload_folder,
+     low_cpu_mem_usage=True,
+     offload_state_dict=True          # Offload state dict from meta
+ )
+ torch.cuda.empty_cache()
+
+ # --- Integrate PEFT (LoRA) ---
+ # Configure LoRA settings; adjust target_modules as appropriate for your model.
+ lora_config = LoraConfig(
+     r=16,                                 # LoRA rank
+     lora_alpha=32,                        # Scaling factor
+     target_modules=["q_proj", "v_proj"],  # Typical target modules for transformer models
+     lora_dropout=0.1,
+     bias="none"
+ )
+ # Wrap the model with PEFT
+ model = get_peft_model(model, lora_config)
+ model.print_trainable_parameters()
+
+ # Optionally enable gradient checkpointing to save memory
+ model.gradient_checkpointing_enable()

# --- Preprocess the dataset ---
def preprocess_function(examples):
-     return tokenizer(examples['text'], truncation=True, padding=True)
-
- print("🛠 Preprocessing dataset...")
- tokenized_dataset = dataset.map(preprocess_function, batched=True)
-
- # --- Set training arguments ---
+     tweets = examples.get("tweet", [])
+     lores = examples.get("lore", [])
+     combined_texts = []
+     for tweet, lore in zip(tweets, lores):
+         combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
+         combined_texts.append(combined_text)
+     return tokenizer(combined_texts, truncation=True, padding=True)
+
+ print("🛠 Preprocessing train dataset...")
+ tokenized_train = train_dataset.map(preprocess_function, batched=True)
+ print("🛠 Preprocessing eval dataset...")
+ tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
+
+ # --- Add labels to tokenized data ---
+ def add_labels(batch):
+     batch["labels"] = batch["input_ids"].copy()
+     return batch
+
+ print("🛠 Adding labels to train dataset...")
+ tokenized_train = tokenized_train.map(add_labels, batched=True)
+ print("🛠 Adding labels to eval dataset...")
+ tokenized_eval = tokenized_eval.map(add_labels, batched=True)
+
+ # --- Set training arguments with memory-saving parameters ---
training_args = TrainingArguments(
-     output_dir=output_dir,            # Where to save the fine-tuned model
-     evaluation_strategy="epoch",      # Evaluate at each epoch
-     logging_dir="./logs",             # Directory for logs
-     logging_steps=500,                # Log every 500 steps
-     num_train_epochs=3,               # Number of training epochs
-     per_device_train_batch_size=8,    # Batch size per device
+     output_dir=output_dir,
+     evaluation_strategy="epoch",      # (Deprecated: use eval_strategy in future versions)
+     logging_dir="./logs",
+     logging_steps=500,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,    # Very low batch size to minimize memory usage
+     gradient_accumulation_steps=8,    # Accumulate gradients to simulate a larger batch size
+     fp16=True,                        # Enable mixed precision training
)

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
-     train_dataset=tokenized_dataset,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

- # --- Clear the existing model repository on Hugging Face ---
+ # --- (Optional) Clear the existing model repository on Hugging Face ---
api = HfApi()
print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
try:
datasets/finetune_dataset_ready.jsonl CHANGED
The diff for this file is too large to render. See raw diff
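A note on the LoRA settings in the app_bak.py diff above: target_modules=["q_proj", "v_proj"] assumes the checkpoint exposes separate query and value projection layers, while some phi3-style checkpoints ship a fused qkv_proj instead. The sketch below is not part of the commit; it only lists the projection modules of the loaded base model (the model variable from app_bak.py, inspected before get_peft_model is applied) so the target names can be confirmed.

# Sketch (not in the commit): list candidate projection layers on the base model
# so target_modules can be chosen to match what actually exists.
proj_names = sorted({name.split(".")[-1]                  # keep only the leaf module name
                     for name, _ in model.named_modules()
                     if name.split(".")[-1].endswith("proj")})
print("Projection modules found:", proj_names)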
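The add_labels step in the diff copies input_ids into labels so the Trainer computes a causal language-modeling loss; because padding is applied at map time, padding tokens also end up in the labels. An alternative, not what this commit does, is transformers' DataCollatorForLanguageModeling with mlm=False, which pads each batch on the fly and builds labels with padding positions masked to -100. A minimal sketch, reusing the model, tokenizer, training_args, and tokenized datasets defined in app_bak.py and assuming the tokenizer has a pad token:

# Sketch (not in the commit): let a causal-LM collator handle padding and labels
# per batch instead of the explicit add_labels map step.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,   # tokenized without the add_labels step
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,     # pads each batch and sets labels, masking pads to -100
)

Either way, per_device_train_batch_size=1 combined with gradient_accumulation_steps=8 keeps the effective batch size at 8 sequences per optimizer step.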
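The output directory is named ./output/mibera-v1-merged, but the diff only wraps the base model in a LoRA adapter; nothing in this commit merges the adapter back or uploads the result. The sketch below shows one way that could be done after training, assuming the model, tokenizer, output_dir, api, and HF_REPO names from app_bak.py and enough memory to materialize the merged weights; merge_and_unload() is PEFT's call for folding LoRA weights into the base model. This is an assumption about intent, not something the commit performs.

# Sketch (not in the commit): after training, merge the LoRA adapter into the
# base weights and upload the result to the Hub repo referenced by HF_REPO.
trainer.train()

merged_model = model.merge_and_unload()     # fold the LoRA deltas into the base model
merged_model.save_pretrained(output_dir)    # "./output/mibera-v1-merged"
tokenizer.save_pretrained(output_dir)

api.upload_folder(folder_path=output_dir, repo_id=HF_REPO, repo_type="model")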