Improves the launcher with model-family selection and family-based defaults, updates the TRL trainer (drops the TRL config path in favor of the TrainingArguments class), passes the tokenizer to SFTTrainer, and resolves the evaluation_strategy error
Files changed:
- launch.sh +137 -104
- scripts/training/train_gpt_oss.py +5 -7
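At a glance, the trainer-side change is: build the training arguments with transformers.TrainingArguments instead of a TRL config object, and hand the tokenizer (plus dataset_text_field and max_seq_length) straight to trl.SFTTrainer. The sketch below is a minimal illustration of that pattern under stated assumptions, not the repository's script: the model name and the tiny in-memory dataset are placeholders, and it presumes a TRL release old enough for SFTTrainer to accept those constructor arguments (in newer TRL they move onto SFTConfig, and tokenizer becomes processing_class).

# Minimal sketch of the pattern this commit switches to. Placeholder model and
# dataset; assumes an older TRL release whose SFTTrainer still accepts
# tokenizer, dataset_text_field and max_seq_length as constructor arguments.
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

model_name = "HuggingFaceTB/SmolLM3-3B"  # placeholder; any causal LM works for the sketch
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tiny in-memory dataset with a "text" column, mirroring dataset_text_field="text"
train_dataset = Dataset.from_dict({"text": ["Hello world.", "Bonjour le monde."]})

enable_tracking = False  # stands in for the config flag used in the script

args = TrainingArguments(
    output_dir="./out",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    learning_rate=5e-6,
    report_to=("trackio" if enable_tracking else None),  # mirrors the commit's tracking switch
)

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,            # passed directly, as in this commit
    dataset_text_field="text",      # accepted by the trainer on older TRL versions
    max_seq_length=2048,
)
trainer.train()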
launch.sh (CHANGED)
@@ -158,112 +158,116 @@ except:
 fi
 }
 
-# Function to show training configurations
+# Function to show training configurations (optionally filtered by family)
 show_training_configs() {
+    local family="$1"  # Optional: "SmolLM3" or "GPT-OSS"
     echo ""
     print_header "Available Training Configurations"
     echo "======================================"
     echo ""
-    [old lines 167-266 removed; their content is collapsed in the diff view]
+
+    if [ -z "$family" ] || [ "$family" = "SmolLM3" ]; then
+        echo "=== SmolLM3 Configurations ==="
+        echo "1. Basic Training (Default)"
+        echo " - Model: SmolLM3-3B"
+        echo " - Dataset: SmolTalk"
+        echo " - Epochs: 3"
+        echo " - Batch Size: 2"
+        echo " - Learning Rate: 5e-6"
+        echo ""
+        echo "2. H100 Lightweight (Rapid)"
+        echo " - Model: SmolLM3-3B"
+        echo " - Dataset: OpenHermes-FR (80K samples)"
+        echo " - Epochs: 1"
+        echo " - Batch Size: 16"
+        echo " - Learning Rate: 8e-6"
+        echo " - Sequence Length: 8192"
+        echo " - Optimized for H100 rapid training"
+        echo ""
+        echo "3. A100 Large Scale"
+        echo " - Model: SmolLM3-3B"
+        echo " - Dataset: OpenHermes-FR"
+        echo " - Epochs: 1.3 passes"
+        echo " - Batch Size: 8"
+        echo " - Learning Rate: 5e-6"
+        echo " - Sequence Length: 8192"
+        echo ""
+        echo "4. Multiple Passes"
+        echo " - Model: SmolLM3-3B"
+        echo " - Dataset: OpenHermes-FR"
+        echo " - Epochs: 4 passes"
+        echo " - Batch Size: 6"
+        echo " - Learning Rate: 3e-6"
+        echo " - Sequence Length: 8192"
+        echo ""
+    fi
+
+    if [ -z "$family" ] || [ "$family" = "GPT-OSS" ]; then
+        echo "=== GPT-OSS Configurations ==="
+        echo "1. GPT-OSS Basic Training"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: Multilingual-Thinking"
+        echo " - Epochs: 1"
+        echo " - Batch Size: 4"
+        echo " - Learning Rate: 2e-4"
+        echo " - LoRA + MXFP4 Quantization"
+        echo " - Optimized for multilingual reasoning"
+        echo ""
+        echo "2. GPT-OSS H100 Optimized"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: Multilingual-Thinking"
+        echo " - Epochs: 2"
+        echo " - Batch Size: 8"
+        echo " - Learning Rate: 3e-4"
+        echo " - Enhanced LoRA (rank 16)"
+        echo " - Optimized for H100 performance"
+        echo ""
+        echo "3. GPT-OSS Multilingual Reasoning"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: Multilingual-Thinking"
+        echo " - Epochs: 1"
+        echo " - Batch Size: 4"
+        echo " - Learning Rate: 2e-4"
+        echo " - Specialized for reasoning tasks"
+        echo " - Supports 10+ languages"
+        echo ""
+        echo "4. GPT-OSS Memory Optimized"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: Multilingual-Thinking"
+        echo " - Epochs: 1"
+        echo " - Batch Size: 1 (effective 16 with accumulation)"
+        echo " - Learning Rate: 2e-4"
+        echo " - 4-bit quantization + reduced LoRA"
+        echo " - Optimized for limited GPU memory"
+        echo ""
+        echo "5. GPT-OSS OpenHermes-FR (Recommended)"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
+        echo " - Epochs: 1.5"
+        echo " - Batch Size: 6 (effective 36 with accumulation)"
+        echo " - Learning Rate: 2.5e-4"
+        echo " - Optimized for French language training"
+        echo " - Quality filtering enabled"
+        echo ""
+        echo "6. GPT-OSS OpenHermes-FR Memory Optimized"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: legmlai/openhermes-fr (200K samples)"
+        echo " - Epochs: 1"
+        echo " - Batch Size: 2 (effective 32 with accumulation)"
+        echo " - Learning Rate: 2e-4"
+        echo " - Native MXFP4 quantization"
+        echo " - Memory optimized for 40-80GB GPUs"
+        echo " - Harmony format compatible"
+        echo ""
+        echo "7. GPT-OSS Custom Dataset"
+        echo " - Model: openai/gpt-oss-20b"
+        echo " - Dataset: User-defined (fully customizable)"
+        echo " - Epochs: Configurable"
+        echo " - Batch Size: Configurable"
+        echo " - Learning Rate: Configurable"
+        echo " - Maximum flexibility with all parameters"
+        echo ""
+    fi
 }
 
 # Function to get training configuration
@@ -785,11 +789,40 @@ HF_TOKEN="$HF_WRITE_TOKEN"
 print_step "Step 2: Training Configuration"
 echo "=================================="
 
-
-select_option "Select
+# 2.1 Select model family first
+select_option "Select model family:" "SmolLM3" "GPT-OSS" MODEL_FAMILY
+
+# 2.2 Show only the configurations for the selected family and prompt selection
+show_training_configs "$MODEL_FAMILY"
+if [ "$MODEL_FAMILY" = "SmolLM3" ]; then
+    select_option "Select training configuration:" \
+        "Basic Training" \
+        "H100 Lightweight (Rapid)" \
+        "A100 Large Scale" \
+        "Multiple Passes" \
+        "Custom Configuration" \
+        TRAINING_CONFIG_TYPE
+else
+    select_option "Select training configuration:" \
+        "GPT-OSS Basic Training" \
+        "GPT-OSS H100 Optimized" \
+        "GPT-OSS Multilingual Reasoning" \
+        "GPT-OSS Memory Optimized" \
+        "GPT-OSS OpenHermes-FR (Recommended)" \
+        "GPT-OSS OpenHermes-FR Memory Optimized" \
+        "GPT-OSS Custom Dataset" \
+        TRAINING_CONFIG_TYPE
+fi
 
 get_training_config "$TRAINING_CONFIG_TYPE"
 
+# 2.3 Set a family-specific default model description for the model card
+if [ "$MODEL_FAMILY" = "GPT-OSS" ]; then
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
+else
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
+fi
+
 # Step 3: Get experiment details
 print_step "Step 3: Experiment Details"
 echo "=============================="
@@ -1042,7 +1075,7 @@ print_info "This information will be used in the model card and citation."
 get_input "Author name for model card" "$HF_USERNAME" AUTHOR_NAME
 
 print_info "Model description will be used in the model card and repository."
-get_input "Model description" "
+get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION
 
 # Step 9: Deploy Trackio Space (automated)
 print_step "Step 9: Deploying Trackio Space"
scripts/training/train_gpt_oss.py (CHANGED)
@@ -9,9 +9,9 @@ import os
 import sys
 import argparse
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
 from peft import LoraConfig, get_peft_model
-from trl import SFTTrainer
+from trl import SFTTrainer
 from datasets import load_dataset
 from pathlib import Path
 
@@ -353,7 +353,6 @@ def create_sft_config(config, output_dir):
     # Learning rate configuration
     learning_rate = config.learning_rate
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
-    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})
 
     # Batch configuration
     per_device_train_batch_size = config.batch_size
@@ -387,7 +386,7 @@ def create_sft_config(config, output_dir):
     print(f" • Gradient accumulation: {gradient_accumulation_steps}")
     print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")
 
-    sft_config =
+    sft_config = TrainingArguments(
         # Training duration
         num_train_epochs=num_train_epochs,
         max_steps=max_steps,
@@ -395,7 +394,6 @@ def create_sft_config(config, output_dir):
         # Learning rate
         learning_rate=learning_rate,
         lr_scheduler_type=lr_scheduler_type,
-        lr_scheduler_kwargs=lr_scheduler_kwargs,
         warmup_ratio=warmup_ratio,
         warmup_steps=warmup_steps,
 
@@ -442,7 +440,7 @@ def create_sft_config(config, output_dir):
         push_to_hub=push_to_hub,
 
         # Monitoring
-        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
+        report_to=("trackio" if getattr(config, 'enable_tracking', False) else None),
     )
 
     return sft_config
@@ -504,7 +502,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
         model=peft_model,
         args=sft_config,
         train_dataset=dataset,
-
+        tokenizer=tokenizer,
         dataset_text_field="text",
         max_seq_length=getattr(config, 'max_seq_length', 2048),
     )
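The evaluation_strategy error named in the commit message is not visible in the hunks above. A likely cause, stated as an assumption rather than something this diff shows: recent transformers releases renamed the TrainingArguments keyword evaluation_strategy to eval_strategy (deprecated in v4.41 and removed in a later release), so the old spelling raises an unexpected-keyword TypeError. A minimal sketch of the rename, with placeholder values:

# Hedged sketch of the evaluation_strategy -> eval_strategy rename; the
# output_dir and step values are placeholders, not the repository's config.
from transformers import TrainingArguments

# Old spelling, rejected once the deprecated alias is dropped:
#   TrainingArguments(output_dir="./out", evaluation_strategy="steps", eval_steps=100)

# Current spelling accepted by recent transformers releases:
args = TrainingArguments(
    output_dir="./out",
    eval_strategy="steps",  # renamed from evaluation_strategy
    eval_steps=100,
)
print(args.eval_strategy)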