See axolotl config
axolotl version: 0.10.0.dev0
base_model: Qwen/Qwen3-0.6B-Base
hub_model_id: cyberbabooshka/base
wandb_name: base
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
num_processes: 64
dataset_processes: 64
dataset_prepared_path: last_run_prepared
chat_template: jinja
chat_template_jinja: >-
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}
datasets:
  - path: open-thoughts/OpenThoughts2-1M
    split: train[1%:]
    type: chat_template
    field_messages: conversations
    train_on_eos: turn
    train_on_eot: turn
    message_property_mappings:
      role: from
      content: value
    roles:
      user:
        - user
      assistant:
        - assistant
test_datasets:
  - path: open-thoughts/OpenThoughts2-1M
    split: train[:1%]
    type: chat_template
    field_messages: conversations
    train_on_eos: turn
    train_on_eot: turn
    message_property_mappings:
      role: from
      content: value
    roles:
      user:
        - user
      assistant:
        - assistant
output_dir: ./outputs
sequence_len: 9096
batch_flattening: true
sample_packing: false
# adapter: lora
lora_model_dir:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
  - embed_tokens
lora_target_linear: true
lora_on_cpu: false
wandb_project: mnlp
wandb_entity: aleksandr-dremov-epfl
wandb_watch:
wandb_log_model:
gradient_accumulation_steps: 2
eval_batch_size: 16
micro_batch_size: 4
optimizer: ademamix_8bit
weight_decay: 0.01
learning_rate: 0.00001
warmup_steps: 500
wsd_final_lr_factor: 0.0
wsd_init_div_factor: 100
wsd_fract_decay: 0.2
wsd_decay_type: "sqrt"
wsd_sqrt_power: 0.5
wsd_cooldown_start_lr_factor: 1.0
bf16: auto
tf32: false
torch_compile: true
flash_attention: true
gradient_checkpointing: false
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 16
eval_steps: 2000
save_steps: 1000
max_steps: 40000
num_epochs: 20000000
save_total_limit: 2
special_tokens:
  eos_token: "<|im_end|>"
  pad_token: "<|endoftext|>"
eot_tokens:
  - <|im_end|>
plugins:
  - axolotl_wsd.WSDSchedulerPlugin
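The `chat_template_jinja` above is the Qwen3-style ChatML template with `<think>` reasoning blocks and tool-call support. Below is a minimal sketch (not part of the original config) of how the rendered prompt can be inspected, assuming the same template is saved with the tokenizer pushed to `cyberbabooshka/base`:

```python
# Sketch: render the chat template above to inspect the exact prompt format.
# Assumes the template ships with the tokenizer on the Hub (cyberbabooshka/base).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cyberbabooshka/base")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# add_generation_prompt=True makes the template append the assistant header,
# so the model continues from there at inference time.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```

With `add_generation_prompt=True` the template ends the prompt with `<|im_start|>assistant\n<think>\n`, so the model is prompted to emit its reasoning before the final answer; per the template's last branch, passing `enable_thinking=False` would instead insert an empty `<think>\n\n</think>` block.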
base
This model is a fine-tuned version of Qwen/Qwen3-0.6B-Base on the open-thoughts/OpenThoughts2-1M dataset. It achieves the following results on the evaluation set:
- Loss: 0.5060
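A minimal inference sketch (my own illustration, not from the original card) for loading the published checkpoint with 🤗 Transformers and generating through the chat template:

```python
# Sketch: load the fine-tuned checkpoint and generate a response.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "cyberbabooshka/base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "Explain why the sky is blue."}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)

outputs = model.generate(inputs, max_new_tokens=512)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```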
Model description
More information needed
Intended uses & limitations
More information needed
Training and evaluation data
More information needed
Training procedure
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 4
- eval_batch_size: 16
- seed: 42
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 2
- total_train_batch_size: 32
- total_eval_batch_size: 64
- optimizer: ADEMAMIX_8BIT (no additional optimizer arguments)
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 500
- training_steps: 40000
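The derived totals above follow from the per-device settings; a quick consistency check of the arithmetic (my own illustration, assuming the usual product of micro batch size, gradient accumulation steps, and device count):

```python
# Effective batch sizes implied by the hyperparameters above.
micro_batch_size = 4      # per-device train batch size
grad_accum_steps = 2
num_devices = 4
eval_batch_size = 16      # per-device eval batch size

total_train_batch_size = micro_batch_size * grad_accum_steps * num_devices
total_eval_batch_size = eval_batch_size * num_devices

print(total_train_batch_size)  # 32, matching total_train_batch_size
print(total_eval_batch_size)   # 64, matching total_eval_batch_size
```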
Training results
| Training Loss | Epoch  | Step  | Validation Loss |
|:-------------:|:------:|:-----:|:---------------:|
| No log        | 0.0000 | 1     | 0.8524          |
| 0.5816        | 0.0671 | 2000  | 0.6038          |
| 0.554         | 0.1342 | 4000  | 0.5775          |
| 0.5746        | 0.2013 | 6000  | 0.5623          |
| 0.5304        | 0.2684 | 8000  | 0.5516          |
| 0.5334        | 0.3355 | 10000 | 0.5434          |
| 0.5378        | 0.4026 | 12000 | 0.5372          |
| 0.5205        | 0.4697 | 14000 | 0.5322          |
| 0.5301        | 0.5368 | 16000 | 0.5284          |
| 0.4979        | 0.6039 | 18000 | 0.5253          |
| 0.514         | 0.6710 | 20000 | 0.5225          |
| 0.5022        | 0.7381 | 22000 | 0.5202          |
| 0.5183        | 0.8052 | 24000 | 0.5187          |
| 0.4987        | 0.8724 | 26000 | 0.5175          |
| 0.5041        | 0.9395 | 28000 | 0.5161          |
| 0.4961        | 1.0066 | 30000 | 0.5159          |
| 0.4882        | 1.0737 | 32000 | 0.5161          |
| 0.5021        | 1.1408 | 34000 | 0.5117          |
| 0.4793        | 1.2079 | 36000 | 0.5093          |
| 0.4854        | 1.2750 | 38000 | 0.5071          |
| 0.4947        | 1.3421 | 40000 | 0.5060          |
Framework versions
- Transformers 4.51.3
- Pytorch 2.6.0+cu124
- Datasets 3.5.0
- Tokenizers 0.21.1