Upload folder using huggingface_hub
- README.md +18 -30
- model-00001-of-00009.safetensors +1 -1
- model-00002-of-00009.safetensors +1 -1
- model-00003-of-00009.safetensors +1 -1
- model-00004-of-00009.safetensors +1 -1
- model-00005-of-00009.safetensors +1 -1
- model-00006-of-00009.safetensors +1 -1
- model-00007-of-00009.safetensors +1 -1
- model-00008-of-00009.safetensors +1 -1
- model-00009-of-00009.safetensors +1 -1
- model.safetensors.index.json +1 -0
- training_args.bin +2 -2
README.md
CHANGED
@@ -7,7 +7,7 @@ tags:
 datasets:
 - HuggingFaceH4/Multilingual-Thinking
 model-index:
-- name: outputs/gpt-oss-out
+- name: workspace/data/outputs/gpt-oss-out/
   results: []
 ---
 
@@ -24,47 +24,34 @@ use_kernels: true
 model_quantization_config: Mxfp4Config
 model_quantization_config_kwargs:
   dequantize: true
-
-  strict: false # fallback to fp16 on any odd layers
-
+
 plugins:
 - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 
 experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
 
 datasets:
-  # - path: winglian/pirate-ultrachat-10k
-  #   type: chat_template
-  #   split: train
   - path: HuggingFaceH4/Multilingual-Thinking
     type: chat_template
     field_thinking: thinking
     template_thinking_key: thinking
-
+
 dataset_prepared_path: last_run_prepared
 val_set_size: 0
-output_dir:
+output_dir: /workspace/data/outputs/gpt-oss-out/
 
-sequence_len:
+sequence_len: 8196
 sample_packing: true
 pad_to_sequence_len: true
 
-# adapter: lora
-# lora_r: 8
-# lora_alpha: 16
-# lora_dropout: 0.0
-# lora_target_linear: true
-
 wandb_project: gpt-oss-20b
-wandb_name: multilingual-reasoning
+wandb_name: multilingual-reasoning-fft
 
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 1
 
-
-optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload
-
+optimizer: adamw_torch_fused
 lr_scheduler: constant_with_warmup
 learning_rate: 2e-5
 
@@ -75,7 +62,7 @@ flash_attention: true
 attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
-activation_offloading: true
+#activation_offloading: true
 
 logging_steps: 1
 saves_per_epoch: 1
@@ -87,20 +74,21 @@ eot_tokens:
 - "<|end|>"
 - "<|return|>"
 
-
-fsdp_config:
-  offload_params: true
-  state_dict_type: SHARDED_STATE_DICT
-  auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  transformer_layer_cls_to_wrap: GptOssDecoderLayer
-  reshard_after_forward: true
-# cpu_ram_efficient_loading: false
+deepspeed: /pi-workspace/zero3.json
+
+# fsdp_version: 2
+# fsdp_config:
+#   offload_params: false
+#   state_dict_type: SHARDED_STATE_DICT
+#   auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   transformer_layer_cls_to_wrap: GptOssDecoderLayer
+#   reshard_after_forward: true
+# # cpu_ram_efficient_loading: true
 ```
 
 </details><br>
 
-# outputs/gpt-oss-out
+# workspace/data/outputs/gpt-oss-out/
 
 This model is a fine-tuned version of [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) on the HuggingFaceH4/Multilingual-Thinking dataset.
 
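For context, the config above is an Axolotl full fine-tune of openai/gpt-oss-20b on HuggingFaceH4/Multilingual-Thinking. Below is a minimal sketch of loading the resulting checkpoint with transformers; the repo id is a placeholder, not something stated in this commit, so substitute the actual repository.

```python
# Minimal sketch, assuming the shards in this commit form a standard
# transformers checkpoint. The repo id below is hypothetical.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-username/gpt-oss-out"  # placeholder; use the actual repo
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype="auto", device_map="auto"
)

messages = [{"role": "user", "content": "Explain sample packing in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```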
model-00001-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1e66b52040a9a0777a9c94ad800d7dff662a58c96129efbfd77b54ef553560bd
 size 4504304664
model-00002-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9cbe12bb05ec8772ff87808f8916a52f9c330075813be3a978e6b968ba2aa52b
 size 4939127656
model-00003-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:77057a058259fdad6458a2cc253c248a9a308c43390b2e1eaeff108144d1502e
 size 4939127656
model-00004-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:72cc1b6291eec305ae8b01a4f7ab53eb15489496d4fe36b7613d822cf307ccf2
 size 4939127680
model-00005-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7c0775267ff51f292d952a8a06d7d7a4cb709ea9fa48433f5dde5dc1ead0d84f
 size 4939127704
model-00006-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:89e821b90576a93e94a9457d21012a184e3d3336cd8a499b5040dcc9fa8791c8
 size 4939127704
model-00007-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3f2b0083866777b6ef7405acca40a46242005bbebb423f09bd3223880cc94ee8
 size 4939127704
model-00008-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2b97d39f740e42cf0bbe9e15a6ad59dae1b93396ca2d85899c4bb9d0b6882192
 size 4939127704
model-00009-of-00009.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:549eb052e40666b8b016c7a6db7031e752785cec977beed6a315fea12489caef
 size 2751362856
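Each shard change above only swaps the sha256 in a Git LFS pointer (a three-line file: version, oid, size). A minimal sketch, not part of the repo, for verifying a downloaded shard against the oid recorded in the diff:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in 1 MiB chunks so multi-GB shards stay out of memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the model-00001-of-00009 pointer above
expected = "1e66b52040a9a0777a9c94ad800d7dff662a58c96129efbfd77b54ef553560bd"
actual = sha256_of("model-00001-of-00009.safetensors")  # assumes a local download
print("OK" if actual == expected else f"mismatch: {actual}")
```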
model.safetensors.index.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "metadata": {
+    "total_parameters": 335424,
     "total_size": 41829514368
   },
   "weight_map": {
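The index file's "weight_map" ties each tensor name to the shard that stores it, while "total_size" records the combined tensor bytes across all shards. A minimal sketch of reading it, assuming a local copy; the tensor name used in the lookup is illustrative, not taken from this repo:

```python
# Minimal sketch: how a loader consumes model.safetensors.index.json.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:  # assumes a local download
    index = json.load(f)

print(index["metadata"]["total_size"], "tensor bytes across all shards")
for shard, n in sorted(Counter(index["weight_map"].values()).items()):
    print(f"{shard}: {n} tensors")

# Look up one tensor name (hypothetical; inspect weight_map for real keys):
name = "model.embed_tokens.weight"
print(name, "->", index["weight_map"].get(name, "not found"))
```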
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:29e1c313582c8dd3b40fa45dcf6d6482aeabf058adc5837643ba6a5b2ecdb37c
+size 9489
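training_args.bin is the pickled training-arguments object that the Hugging Face Trainer writes with torch.save. A minimal sketch for inspecting it locally; it assumes transformers is installed (unpickling needs the class definition), and weights_only=False unpickles arbitrary objects, so only run this on files you trust:

```python
import torch

# assumes a local download of training_args.bin from this commit
args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)  # typically TrainingArguments or a subclass
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```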