ivxxdegen committed
Commit 860c901 · 1 Parent(s): 06204fd

requirements added

Files changed (1):
  app.py +150 -1
app.py CHANGED
@@ -1,3 +1,152 @@
import os
import shutil
import pandas as pd
from datasets import Dataset

# Disable hf_transfer and set the CUDA allocator configuration
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# --- Monkey-patch CONFIG_MAPPING to handle custom model type "phi3" ---
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

class Phi3Config(PretrainedConfig):
    model_type = "phi3"

CONFIG_MAPPING["phi3"] = Phi3Config

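# A sketch of an alternative, assuming a reasonably recent transformers
# release: the public AutoConfig.register API achieves the same mapping
# without writing into the private CONFIG_MAPPING dict.
#
#   from transformers import AutoConfig
#   AutoConfig.register("phi3", Phi3Config)
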
# --- Standard imports ---
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from huggingface_hub import HfApi
import torch

# Import PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# --- Setup directories ---
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
output_dir = "./output/mibera-v1-merged"
os.makedirs(output_dir, exist_ok=True)
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# Cache locations. Note that transformers and datasets read these variables
# when they are first imported, so ideally they would be set before the
# imports at the top of the file.
os.environ["HF_HOME"] = os.path.join(cache_dir, ".huggingface")
os.environ["HF_DATASETS_CACHE"] = os.path.join(cache_dir, "datasets_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_dir, "transformers")

# Drop any stale JSON dataset cache so the JSONL file is re-processed.
json_cache_dir = os.path.join(cache_dir, "datasets_cache", "json")
if os.path.exists(json_cache_dir):
    shutil.rmtree(json_cache_dir)

# --- Define paths ---
dataset_path = 'datasets/finetune_dataset_ready.jsonl'
model_name = "microsoft/phi-4"
HF_REPO = "ivxxdegen/mibera-v1-merged"

if not os.path.exists(dataset_path):
    print(f"Dataset file {dataset_path} not found. Please upload it!")
    raise SystemExit(1)

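# The pipeline below assumes one JSON object per line with string fields
# "tweet" and "lore" (the columns preprocess_function reads). A hypothetical
# example line:
#
#   {"tweet": "<tweet text>", "lore": "<background lore>"}
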
print("📥 Loading dataset using pandas...")
df = pd.read_json(dataset_path, lines=True)
dataset = Dataset.from_pandas(df)
print("Dataset columns:", dataset.column_names)

# 90/10 train/eval split, seeded for reproducibility.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print("📥 Loading tokenizer and model with trust_remote_code=True and offloading...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Causal-LM tokenizers often ship without a pad token; padding=True below
    # requires one, so fall back to EOS.
    tokenizer.pad_token = tokenizer.eos_token

# Cap GPU 0 at 10GiB and let accelerate offload the remainder.
max_memory = {0: "10GiB"}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=offload_folder,
    low_cpu_mem_usage=True,
    offload_state_dict=True
)
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing
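
# Sketch (assumption): the "10GiB" budget above is hard-coded. To derive it
# from the actual device instead, torch exposes free/total byte counts:
#
#   free_bytes, total_bytes = torch.cuda.mem_get_info(0)
#   max_memory = {0: f"{int(free_bytes * 0.9) // 2**30}GiB", "cpu": "32GiB"}
#
# Leaving ~10% headroom reduces fragmentation-driven OOMs while loading.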

# --- Inspect model modules to determine correct target_modules for LoRA ---
print("Inspecting model modules (filtering by 'attn', 'query', or 'value'):")
for name, module in model.named_modules():
    if "attn" in name or "query" in name or "value" in name:
        print(name)
# After inspecting the output, update target_modules below accordingly
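# A tighter inspection sketch: LoRA attaches to nn.Linear modules, so listing
# the distinct Linear layer names is usually more informative than substring
# matching:
#
#   import torch.nn as nn
#   linear_names = {n.split(".")[-1] for n, m in model.named_modules()
#                   if isinstance(m, nn.Linear)}
#   print(sorted(linear_names))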

# --- Configure PEFT (LoRA) ---
# Replace the target_modules list with the correct module names from the
# inspection step. For Phi-3-family checkpoints the attention projections are
# typically fused into a single "qkv_proj" rather than separate q_proj/v_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # <-- UPDATE THESE NAMES based on your model inspection
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Preprocess dataset ---
def preprocess_function(examples):
    tweets = examples.get("tweet", [])
    lores = examples.get("lore", [])
    combined_texts = []
    for tweet, lore in zip(tweets, lores):
        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
        combined_texts.append(combined_text)
    return tokenizer(combined_texts, truncation=True, padding=True)

print("🛠 Preprocessing train dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print("🛠 Preprocessing eval dataset...")
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# Causal-LM objective: the labels are the input_ids themselves.
def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch
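# Refinement sketch: copying input_ids verbatim also copies pad tokens into
# the labels, so padding contributes to the loss. The usual fix is to mask
# pads with -100, which the loss function ignores:
#
#   def add_labels(batch):
#       pad_id = tokenizer.pad_token_id
#       batch["labels"] = [
#           [tok if tok != pad_id else -100 for tok in ids]
#           for ids in batch["input_ids"]
#       ]
#       return batch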

print("🛠 Adding labels to train dataset...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
print("🛠 Adding labels to eval dataset...")
tokenized_eval = tokenized_eval.map(add_labels, batched=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per device
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
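
# Alternative sketch: transformers ships a collator that pads and builds
# causal-LM labels (with pads already masked to -100) at batch time, which
# would replace both padding=True above and the add_labels pass:
#
#   from transformers import DataCollatorForLanguageModeling
#   trainer = Trainer(
#       model=model,
#       args=training_args,
#       train_dataset=tokenized_train,
#       eval_dataset=tokenized_eval,
#       tokenizer=tokenizer,
#       data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
#   )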

# --- Optional: Clear existing model repo on HF Hub ---
api = HfApi()
print(f"🗑 Deleting previous version from Hugging Face: {HF_REPO}...")
try:
    api.delete_repo(HF_REPO, repo_type="model")
except Exception as e:
    print(f"⚠️ Could not delete the existing model: {e}. Proceeding with a clean upload...")

print("🎓 Starting training...")
trainer.train()

# Note: save_pretrained on a PEFT-wrapped model writes only the LoRA adapter
# weights, not a full merged checkpoint, despite the "merged" directory name.
print("💾 Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
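
# The script deletes the Hub repo above but never re-uploads, so the result
# lives only in ./output. A sketch of the missing upload step, assuming a
# write token is configured (e.g. via HF_TOKEN):
#
#   api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
#   model.push_to_hub(HF_REPO)
#   tokenizer.push_to_hub(HF_REPO)
#
# For a genuinely merged checkpoint, merge the adapter into the base weights
# first with model.merge_and_unload() and save the returned model.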