""" GPT-OSS OpenHermes-FR Optimized Configuration Specifically optimized for the legmlai/openhermes-fr dataset 800K French instruction-response pairs with quality filtering """ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig # OpenHermes-FR optimized configuration config = GPTOSSEnhancedCustomConfig( # ============================================================================ # DATASET CONFIGURATION - OpenHermes-FR Specific # ============================================================================ dataset_name="legmlai/openhermes-fr", dataset_split="train", dataset_format="openhermes_fr", # OpenHermes-FR field mapping input_field="prompt", # French prompts target_field="accepted_completion", # GPT-4o generated completions # Quality filtering using OpenHermes-FR metadata filter_bad_entries=True, # Use built-in quality flags bad_entry_field="bad_entry", bad_prompt_field="bad_prompt_detected", bad_response_field="bad_response_detected", # Data processing optimized for French with GPT-OSS Harmony Format concatenate_fields=True, field_separator="\n\n### RΓ©ponse:\n", # Fallback separator (harmony format takes precedence) add_eos_token=True, use_harmony_format=True, # Enable GPT-OSS harmony format # Dataset sampling (use all 800K examples by default) max_samples=None, # Use full dataset min_length=20, # Minimum for meaningful French text max_length=None, # Auto-set to max_seq_length # ============================================================================ # TRAINING HYPERPARAMETERS - French Language Optimized # ============================================================================ num_train_epochs=1.5, # 1.5 epochs optimal for large dataset batch_size=6, # Balanced for most GPUs gradient_accumulation_steps=6, # Effective batch size: 36 # Learning rate schedule optimized for French fine-tuning learning_rate=2.5e-4, # Slightly higher for multilingual min_lr=2.5e-5, # 10% of max learning rate warmup_ratio=0.05, # 5% warmup for stability weight_decay=0.01, # Standard L2 regularization max_grad_norm=1.0, # Gradient clipping # ============================================================================ # MODEL CONFIGURATION - Optimized for French # ============================================================================ model_name="openai/gpt-oss-20b", max_seq_length=3072, # Balanced length for French use_flash_attention=True, use_gradient_checkpointing=True, # Mixed precision for efficiency fp16=False, bf16=True, # Better for GPT-OSS # ============================================================================ # LORA CONFIGURATION - Optimized for French Language Learning # ============================================================================ use_lora=True, lora_config={ "r": 24, # Higher rank for language adaptation "lora_alpha": 48, # 2x rank scaling "lora_dropout": 0.05, # Light regularization "target_modules": "all-linear", "target_parameters": [ "7.mlp.experts.gate_up_proj", "7.mlp.experts.down_proj", "15.mlp.experts.gate_up_proj", "15.mlp.experts.down_proj", "23.mlp.experts.gate_up_proj", "23.mlp.experts.down_proj", ], "bias": "none", "task_type": "CAUSAL_LM", }, # ============================================================================ # QUANTIZATION - Balanced Performance/Memory # ============================================================================ use_quantization=True, quantization_config={ "dequantize": True, # MXFP4 as per GPT-OSS tutorial "load_in_4bit": False, # Standard precision for quality }, # ============================================================================ # PERFORMANCE OPTIMIZATION # ============================================================================ # Data loading optimized for large dataset dataloader_num_workers=6, # More workers for large dataset dataloader_pin_memory=True, dataloader_prefetch_factor=3, # Higher prefetch for efficiency # Memory management low_cpu_mem_usage=True, group_by_length=True, # Efficient batching remove_unused_columns=True, # ============================================================================ # EVALUATION & LOGGING # ============================================================================ eval_strategy="steps", eval_steps=200, # Evaluate every 200 steps logging_steps=20, # Log every 20 steps save_strategy="steps", save_steps=500, # Save every 500 steps save_total_limit=3, # Keep 3 best checkpoints metric_for_best_model="eval_loss", greater_is_better=False, load_best_model_at_end=True, # Split ratios for automatic validation/test creation eval_ratio=0.01, test_ratio=0.01, # ============================================================================ # MULTILINGUAL & FRENCH SPECIFIC SETTINGS # ============================================================================ primary_language="fr", # French as primary language reasoning_languages=["French", "English"], # Bilingual reasoning domain_focus="instruction", # Instruction following # ============================================================================ # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format # ============================================================================ generation_config={ "max_new_tokens": 512, "do_sample": True, "temperature": 0.7, "top_p": 0.9, "top_k": 50, "repetition_penalty": 1.1, "pad_token_id": None, "eos_token_id": None, # GPT-OSS Harmony Format specific settings "reasoning_effort": "medium", # Configurable reasoning level "use_harmony_format": True, # Ensure harmony format in generation }, # ============================================================================ # HF HUB INTEGRATION # ============================================================================ push_to_hub=False, # Set to True to auto-push hub_model_id=None, # Will be set by launch script hub_private_repo=False, # ============================================================================ # MONITORING # ============================================================================ enable_tracking=True, # Trackio monitoring log_artifacts=True, log_metrics=True, log_config=True, ) # Print configuration summary on import print("\nπŸ‡«πŸ‡· OpenHermes-FR Configuration Loaded") print("=" * 50) print(f"πŸ“Š Dataset: {config.dataset_name}") print(f"πŸ—£οΈ Language: French (with {config.dataset_format} format)") print(f"πŸ“ˆ Training: {config.num_train_epochs} epochs") print(f"πŸ”„ Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}") print(f"🧠 LoRA Rank: {config.lora_config['r']}") print(f"πŸ“ Sequence Length: {config.max_seq_length}") print(f"πŸ” Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}") print(f"🎡 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}") print("=" * 50)