improves launcher with model family selection and family-based defaults; updates the TRL trainer, removes the TRL config path by switching to the TrainingArguments class, passes tokenizer parameters to SFTTrainer, and resolves the evaluation_strategy error
Browse files
- launch.sh +137 -104
- scripts/training/train_gpt_oss.py +5 -7
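For orientation, this is the trainer wiring the commit moves to: a plain transformers TrainingArguments object handed to TRL's SFTTrainer, with the tokenizer passed as a constructor argument. A minimal sketch, assuming an older TRL release (roughly <= 0.11) in which SFTTrainer still accepts the tokenizer, dataset_text_field, and max_seq_length keywords; the model id, dataset id, output path, and hyperparameter values are illustrative placeholders rather than values taken from this repository.

# Sketch only: older TRL (<= ~0.11) accepted these SFTTrainer kwargs directly;
# newer releases moved dataset_text_field/max_seq_length into SFTConfig and
# renamed tokenizer to processing_class.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

model_id = "openai/gpt-oss-20b"  # placeholder; any causal LM shows the pattern
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")  # assumed dataset id

args = TrainingArguments(        # replaces TRL's SFTConfig, which subclasses it
    output_dir="./gpt-oss-sft",  # placeholder path
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    report_to="none",  # the string "none" disables integrations; None means "all" in recent transformers
)

trainer = SFTTrainer(
    model=model,
    args=args,                   # a TrainingArguments works since SFTConfig subclasses it
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=2048,
)
trainer.train()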
    	
launch.sh CHANGED

@@ -158,112 +158,116 @@ except:
     fi
 }

-# Function to show training configurations
+# Function to show training configurations (optionally filtered by family)
 show_training_configs() {
+    local family="$1"  # Optional: "SmolLM3" or "GPT-OSS"
     echo ""
     print_header "Available Training Configurations"
     echo "======================================"
     echo ""
-… (old, unfiltered configuration listing; content stripped in the page rendering)
+
+    if [ -z "$family" ] || [ "$family" = "SmolLM3" ]; then
+        echo "=== SmolLM3 Configurations ==="
+        echo "1. Basic Training (Default)"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: SmolTalk"
+        echo "   - Epochs: 3"
+        echo "   - Batch Size: 2"
+        echo "   - Learning Rate: 5e-6"
+        echo ""
+        echo "2. H100 Lightweight (Rapid)"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR (80K samples)"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 16"
+        echo "   - Learning Rate: 8e-6"
+        echo "   - Sequence Length: 8192"
+        echo "   - Optimized for H100 rapid training"
+        echo ""
+        echo "3. A100 Large Scale"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR"
+        echo "   - Epochs: 1.3 passes"
+        echo "   - Batch Size: 8"
+        echo "   - Learning Rate: 5e-6"
+        echo "   - Sequence Length: 8192"
+        echo ""
+        echo "4. Multiple Passes"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR"
+        echo "   - Epochs: 4 passes"
+        echo "   - Batch Size: 6"
+        echo "   - Learning Rate: 3e-6"
+        echo "   - Sequence Length: 8192"
+        echo ""
+    fi
+
+    if [ -z "$family" ] || [ "$family" = "GPT-OSS" ]; then
+        echo "=== GPT-OSS Configurations ==="
+        echo "1. GPT-OSS Basic Training"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 4"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - LoRA + MXFP4 Quantization"
+        echo "   - Optimized for multilingual reasoning"
+        echo ""
+        echo "2. GPT-OSS H100 Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 2"
+        echo "   - Batch Size: 8"
+        echo "   - Learning Rate: 3e-4"
+        echo "   - Enhanced LoRA (rank 16)"
+        echo "   - Optimized for H100 performance"
+        echo ""
+        echo "3. GPT-OSS Multilingual Reasoning"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 4"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - Specialized for reasoning tasks"
+        echo "   - Supports 10+ languages"
+        echo ""
+        echo "4. GPT-OSS Memory Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 1 (effective 16 with accumulation)"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - 4-bit quantization + reduced LoRA"
+        echo "   - Optimized for limited GPU memory"
+        echo ""
+        echo "5. GPT-OSS OpenHermes-FR (Recommended)"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: legmlai/openhermes-fr (800K French examples)"
+        echo "   - Epochs: 1.5"
+        echo "   - Batch Size: 6 (effective 36 with accumulation)"
+        echo "   - Learning Rate: 2.5e-4"
+        echo "   - Optimized for French language training"
+        echo "   - Quality filtering enabled"
+        echo ""
+        echo "6. GPT-OSS OpenHermes-FR Memory Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: legmlai/openhermes-fr (200K samples)"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 2 (effective 32 with accumulation)"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - Native MXFP4 quantization"
+        echo "   - Memory optimized for 40-80GB GPUs"
+        echo "   - Harmony format compatible"
+        echo ""
+        echo "7. GPT-OSS Custom Dataset"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: User-defined (fully customizable)"
+        echo "   - Epochs: Configurable"
+        echo "   - Batch Size: Configurable"
+        echo "   - Learning Rate: Configurable"
+        echo "   - Maximum flexibility with all parameters"
+        echo ""
+    fi
 }

 # Function to get training configuration

@@ -785,11 +789,40 @@ HF_TOKEN="$HF_WRITE_TOKEN"
 print_step "Step 2: Training Configuration"
 echo "=================================="

-
-select_option "Select …
+# 2.1 Select model family first
+select_option "Select model family:" "SmolLM3" "GPT-OSS" MODEL_FAMILY
+
+# 2.2 Show only the configurations for the selected family and prompt selection
+show_training_configs "$MODEL_FAMILY"
+if [ "$MODEL_FAMILY" = "SmolLM3" ]; then
+    select_option "Select training configuration:" \
+        "Basic Training" \
+        "H100 Lightweight (Rapid)" \
+        "A100 Large Scale" \
+        "Multiple Passes" \
+        "Custom Configuration" \
+        TRAINING_CONFIG_TYPE
+else
+    select_option "Select training configuration:" \
+        "GPT-OSS Basic Training" \
+        "GPT-OSS H100 Optimized" \
+        "GPT-OSS Multilingual Reasoning" \
+        "GPT-OSS Memory Optimized" \
+        "GPT-OSS OpenHermes-FR (Recommended)" \
+        "GPT-OSS OpenHermes-FR Memory Optimized" \
+        "GPT-OSS Custom Dataset" \
+        TRAINING_CONFIG_TYPE
+fi

 get_training_config "$TRAINING_CONFIG_TYPE"

+# 2.3 Set a family-specific default model description for the model card
+if [ "$MODEL_FAMILY" = "GPT-OSS" ]; then
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
+else
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
+fi
+
 # Step 3: Get experiment details
 print_step "Step 3: Experiment Details"
 echo "=============================="

@@ -1042,7 +1075,7 @@ print_info "This information will be used in the model card and citation."
 get_input "Author name for model card" "$HF_USERNAME" AUTHOR_NAME

 print_info "Model description will be used in the model card and repository."
-get_input "Model description" "…
+get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION

 # Step 9: Deploy Trackio Space (automated)
 print_step "Step 9: Deploying Trackio Space"
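The "effective" batch sizes quoted in the GPT-OSS listings above are per-device batch size multiplied by gradient accumulation steps (and by GPU count when training on several devices). A quick sketch of that arithmetic; the accumulation factors below are the ones implied by the quoted single-GPU figures, not values read from the config files:

def effective_batch_size(per_device: int, grad_accum: int, num_gpus: int = 1) -> int:
    """Number of examples contributing to one optimizer step."""
    return per_device * grad_accum * num_gpus

# Implied by the listing above (single GPU; accumulation factors inferred):
assert effective_batch_size(1, 16) == 16  # GPT-OSS Memory Optimized
assert effective_batch_size(6, 6) == 36   # GPT-OSS OpenHermes-FR (Recommended)
assert effective_batch_size(2, 16) == 32  # OpenHermes-FR Memory Optimized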
    	
scripts/training/train_gpt_oss.py CHANGED

@@ -9,9 +9,9 @@ import os
 import sys
 import argparse
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
 from peft import LoraConfig, get_peft_model
-from trl import SFTTrainer…
+from trl import SFTTrainer
 from datasets import load_dataset
 from pathlib import Path

@@ -353,7 +353,6 @@ def create_sft_config(config, output_dir):
     # Learning rate configuration
     learning_rate = config.learning_rate
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
-    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})

     # Batch configuration
     per_device_train_batch_size = config.batch_size

@@ -387,7 +386,7 @@ def create_sft_config(config, output_dir):
     print(f"  • Gradient accumulation: {gradient_accumulation_steps}")
     print(f"  • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")

-    sft_config = …
+    sft_config = TrainingArguments(
         # Training duration
         num_train_epochs=num_train_epochs,
         max_steps=max_steps,

@@ -395,7 +394,6 @@ def create_sft_config(config, output_dir):
         # Learning rate
         learning_rate=learning_rate,
         lr_scheduler_type=lr_scheduler_type,
-        lr_scheduler_kwargs=lr_scheduler_kwargs,
         warmup_ratio=warmup_ratio,
         warmup_steps=warmup_steps,

@@ -442,7 +440,7 @@ def create_sft_config(config, output_dir):
         push_to_hub=push_to_hub,

         # Monitoring
-        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
+        report_to=("trackio" if getattr(config, 'enable_tracking', False) else None),
     )

     return sft_config

@@ -504,7 +502,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
         model=peft_model,
         args=sft_config,
         train_dataset=dataset,
-        …
+        tokenizer=tokenizer,
         dataset_text_field="text",
         max_seq_length=getattr(config, 'max_seq_length', 2048),
     )
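On the evaluation_strategy error the commit message mentions: transformers renamed the TrainingArguments field evaluation_strategy to eval_strategy around v4.41, and a mismatched keyword fails when the arguments object is constructed. This commit fixes it by moving off the TRL config path onto TrainingArguments; a version-agnostic guard like the following is another common workaround (a sketch, not code from this repository):

import inspect
from transformers import TrainingArguments

kwargs = {"output_dir": "./out", "eval_strategy": "no"}  # placeholder values
# Fall back to the old keyword on transformers releases that predate the rename.
if "eval_strategy" not in inspect.signature(TrainingArguments.__init__).parameters:
    kwargs["evaluation_strategy"] = kwargs.pop("eval_strategy")
args = TrainingArguments(**kwargs)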
