|
_wandb: |
|
value: |
|
cli_version: 0.19.11 |
|
m: |
|
- "1": gpu/memory_allocated_gb |
|
"6": |
|
- 3 |
|
"7": [] |
|
- "1": gpu/max_memory_allocated_gb |
|
"6": |
|
- 3 |
|
"7": [] |
|
- "1": gpu/memory_reserved_gb |
|
"6": |
|
- 3 |
|
"7": [] |
|
python_version: 3.11.10 |
|
t: |
|
"1": |
|
- 1 |
|
- 11 |
|
- 49 |
|
- 51 |
|
- 55 |
|
- 71 |
|
"2": |
|
- 1 |
|
- 11 |
|
- 49 |
|
- 51 |
|
- 55 |
|
- 71 |
|
"3": |
|
- 2 |
|
- 7 |
|
- 13 |
|
- 16 |
|
- 23 |
|
- 55 |
|
- 61 |
|
"4": 3.11.10 |
|
"5": 0.19.11 |
|
"6": 4.52.4 |
|
"8": |
|
- 5 |
|
"12": 0.19.11 |
|
"13": linux-x86_64 |
|
act_fn: |
|
value: relu |
|
batch_size: |
|
value: 8192 |
|
before_ln: |
|
value: false |
|
c_coeff: |
|
value: 4 |
|
cooldown_start_frac: |
|
value: 0.8 |
|
d_feature: |
|
value: 163840 |
|
d_model: |
|
value: 5120 |
|
device: |
|
value: cuda:0 |
|
initial_lr: |
|
value: 0.0002 |
|
layer_idx: |
|
value: 0 |
|
lr: |
|
value: 0.0002 |
|
min_lr_ratio: |
|
value: 0 |
|
model_name: |
|
value: Qwen/Qwen3-14B |
|
model_type: |
|
value: qwen |
|
n_batches: |
|
value: 277 |
|
n_grad_steps: |
|
value: 4 |
|
n_steps: |
|
value: 122070 |
|
preact_coeff: |
|
value: 6e-05 |
|
shuffle_buffer_batches: |
|
value: 32 |
|
skip_connections: |
|
value: false |
|
sparsity_coeff_final: |
|
value: 8 |
|
x_scale: |
|
value: 1 |
|
y_scale: |
|
value: 1 |
|
|