See axolotl config
axolotl version: 0.9.2
base_model: google/gemma-3-12b-it
#load_in_4bit: true
#auto_resume_from_checkpoints: true
# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true
tokenizer_config: le-llm/gemma-3-12b-it-reasoning-tokenizer
# added_tokens_overrides: {6: "<|begin_of_thought|>", 7: "<|end_of_thought|>", 8: "<|begin_of_solution|>", 9: "<|end_of_solution|>"}
chat_template: gemma3
eot_tokens:
- <end_of_turn>
datasets:
- path: le-llm/openthoughts-113k
  type: chat_template
  field_messages: conversations
  message_property_mappings:
    role: from
    content: value
dataset_processes: 64
#dataset_keep_in_memory: true
#dataloader_num_workers: 8
#dataloader_prefetch_factor: 16
dataset_prepared_path: last_run_prepared_reasoning
# val_set_size: 0.01
output_dir: ./outputs/gemma-3-12b-it-reasoning-tok-27b
#adapter: qlora
#lora_model_dir:
sequence_len: 32768 # 16384 # 2048
sample_packing: false # true
pad_to_sequence_len: true
train_on_inputs: true
tensor_parallel_size: 8
# tiled_mlp: true
#context_parallel_size: 8
# dp_shard_size: 4
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
# spectrum
#- axolotl.integrations.spectrum.SpectrumPlugin
#spectrum_top_fraction: 0.5
#spectrum_model_name: google/gemma-3-12b-it
wandb_project: gemma-3-12b-reasoning
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused # muon #adamw_bnb_8bit
lr_scheduler: warmup_stable_decay
learning_rate: 5e-5
lr_scheduler_kwargs: {"num_decay_steps": 150}
bf16: auto
# fp16:
tf32: false # TODO: double check precision impact
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json # deepspeed_configs/zero3_bf16.json
# TODO: When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
#fsdp:
# - full_shard
# - auto_wrap
#fsdp_config:
# fsdp_offload_params: true
# fsdp_state_dict_type: FULL_STATE_DICT
# fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
#fp8: true
#fp8_enable_fsdp_float8_all_gather: true
#torch_compile: true
#fsdp:
# - full_shard
# - auto_wrap
#fsdp_config:
# fsdp_version: 2
# fsdp_offload_params: false
# fsdp_cpu_ram_efficient_loading: false
# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
# fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
# fsdp_state_dict_type: FULL_STATE_DICT
# fsdp_sharding_strategy: FULL_SHARD
# fsdp_reshard_after_forward: true
# # fsdp_activation_checkpointing: true
gradient_checkpointing: true # required for activation offloading
activation_offloading: legacy
#gradient_checkpointing: true
#gradient_checkpointing_kwargs:
# use_reentrant: false
#activation_offloading: true
logging_steps: 1
flash_attention: true # not recommended for gemma3 due to soft logit capping, but it should be fixed in the latest flash attention
#eager_attention:
# torch_compile: True
warmup_steps: 150 #0.4
evals_per_epoch: 1
save_steps: 100
save_total_limit: 6
#saves_per_epoch: 1
weight_decay: 0.0
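The config above swaps in a custom tokenizer (tokenizer_config: le-llm/gemma-3-12b-it-reasoning-tokenizer), renders conversations with the gemma3 chat template, and closes turns with <end_of_turn>. Below is a minimal Python sketch of how one might sanity-check those pieces before launching a run; it assumes the tokenizer repo is accessible and transformers is installed, and the reasoning markers are copied from the commented added_tokens_overrides line above rather than confirmed to be registered.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("le-llm/gemma-3-12b-it-reasoning-tokenizer")

# Render one user turn through the tokenizer's chat template to confirm the
# <end_of_turn> terminator declared under eot_tokens in the config.
rendered = tok.apply_chat_template(
    [{"role": "user", "content": "hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(rendered)

# Check that the reasoning markers (taken from the commented
# added_tokens_overrides line) map to single token ids instead of being
# split into word pieces; an unknown-token id here would signal a problem.
for marker in (
    "<|begin_of_thought|>",
    "<|end_of_thought|>",
    "<|begin_of_solution|>",
    "<|end_of_solution|>",
):
    print(marker, tok.convert_tokens_to_ids(marker))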
outputs/gemma-3-12b-it-reasoning-tok-27b
This model is a fine-tuned version of google/gemma-3-12b-it on the le-llm/openthoughts-113k dataset.
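A quick way to try the model is through the transformers generation API. The snippet below is a sketch, not an official example: the repo id le-llm/gemma-3-12b-it-reasoning-tok-27b is a guess based on the output directory name (substitute the actual hub id), it assumes a bf16-capable GPU, and it loads the checkpoint with AutoModelForCausalLM even though multimodal Gemma 3 checkpoints may resolve to a different model class depending on how the weights were exported.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "le-llm/gemma-3-12b-it-reasoning-tok-27b"  # hypothetical hub id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # the run was trained in bf16
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Prove that the sum of two even integers is even."}
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=512)
# Keep special tokens so any reasoning markers the model emits stay visible.
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=False))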
Model description
More information needed
Intended uses & limitations
More information needed
Training and evaluation data
More information needed
Training procedure
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 2
- eval_batch_size: 2
- seed: 42
- distributed_type: multi-GPU
- num_devices: 32
- gradient_accumulation_steps: 2
- total_train_batch_size: 128 (see the arithmetic check after this list)
- total_eval_batch_size: 64
- optimizer: AdamW (adamw_torch_fused) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: warmup_stable_decay
- lr_scheduler_warmup_steps: 150
- num_epochs: 1.0
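The effective batch sizes in the list above follow from the per-device settings and the world size. A quick check of that arithmetic, assuming the usual Trainer relationship between per-device batch size, gradient accumulation, and device count:

# Effective batch sizes implied by the hyperparameters listed above.
micro_batch_size = 2       # per-device train batch size
grad_accum_steps = 2       # gradient_accumulation_steps
num_devices = 32           # GPUs in the multi-GPU run

total_train_batch_size = micro_batch_size * grad_accum_steps * num_devices
total_eval_batch_size = 2 * num_devices  # evaluation does not accumulate gradients

print(total_train_batch_size)  # 128, matching the reported value
print(total_eval_batch_size)   # 64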
Training results
Framework versions
- Transformers 4.51.3
- Pytorch 2.6.0+cu124
- Datasets 3.5.1
- Tokenizers 0.21.2