bethrezen committed on
Commit
476ef85
·
verified ·
1 Parent(s): e83ce9f

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -7,7 +7,7 @@ tags:
7
  datasets:
8
  - HuggingFaceH4/Multilingual-Thinking
9
  model-index:
10
- - name: outputs/gpt-oss-out-fft/
11
  results: []
12
  ---
13
 
@@ -24,47 +24,34 @@ use_kernels: true
24
  model_quantization_config: Mxfp4Config
25
  model_quantization_config_kwargs:
26
  dequantize: true
27
- block_size: 32 # default, matches the OCP spec
28
- strict: false # fallback to fp16 on any odd layers
29
-
30
  plugins:
31
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
32
 
33
  experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
34
 
35
  datasets:
36
- # - path: winglian/pirate-ultrachat-10k
37
- # type: chat_template
38
- # split: train
39
  - path: HuggingFaceH4/Multilingual-Thinking
40
  type: chat_template
41
  field_thinking: thinking
42
  template_thinking_key: thinking
43
-
44
  dataset_prepared_path: last_run_prepared
45
  val_set_size: 0
46
- output_dir: ./outputs/gpt-oss-out-fft/
47
 
48
- sequence_len: 8192
49
  sample_packing: true
50
  pad_to_sequence_len: true
51
 
52
- # adapter: lora
53
- # lora_r: 8
54
- # lora_alpha: 16
55
- # lora_dropout: 0.0
56
- # lora_target_linear: true
57
-
58
  wandb_project: gpt-oss-20b
59
- wandb_name: multilingual-reasoning
60
 
61
  gradient_accumulation_steps: 1
62
  micro_batch_size: 2
63
  num_epochs: 1
64
 
65
- #optimizer: adamw_torch_8bit
66
- optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload
67
-
68
  lr_scheduler: constant_with_warmup
69
  learning_rate: 2e-5
70
 
@@ -75,7 +62,7 @@ flash_attention: true
75
  attn_implementation: kernels-community/vllm-flash-attn3
76
 
77
  gradient_checkpointing: true
78
- activation_offloading: true
79
 
80
  logging_steps: 1
81
  saves_per_epoch: 1
@@ -87,20 +74,21 @@ eot_tokens:
87
  - "<|end|>"
88
  - "<|return|>"
89
 
90
- fsdp_version: 2
91
- fsdp_config:
92
- offload_params: true
93
- state_dict_type: SHARDED_STATE_DICT
94
- auto_wrap_policy: TRANSFORMER_BASED_WRAP
95
- transformer_layer_cls_to_wrap: GptOssDecoderLayer
96
- reshard_after_forward: true
97
- # cpu_ram_efficient_loading: false
98
 
 
 
 
 
 
 
 
 
99
  ```
100
 
101
  </details><br>
102
 
103
- # outputs/gpt-oss-out-fft/
104
 
105
  This model is a fine-tuned version of [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) on the HuggingFaceH4/Multilingual-Thinking dataset.
106
 
 
7
  datasets:
8
  - HuggingFaceH4/Multilingual-Thinking
9
  model-index:
10
+ - name: workspace/data/outputs/gpt-oss-out/
11
  results: []
12
  ---
13
 
 
24
  model_quantization_config: Mxfp4Config
25
  model_quantization_config_kwargs:
26
  dequantize: true
27
+
 
 
28
  plugins:
29
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
30
 
31
  experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
32
 
33
  datasets:
 
 
 
34
  - path: HuggingFaceH4/Multilingual-Thinking
35
  type: chat_template
36
  field_thinking: thinking
37
  template_thinking_key: thinking
38
+
39
  dataset_prepared_path: last_run_prepared
40
  val_set_size: 0
41
+ output_dir: /workspace/data/outputs/gpt-oss-out/
42
 
43
+ sequence_len: 8196
44
  sample_packing: true
45
  pad_to_sequence_len: true
46
 
 
 
 
 
 
 
47
  wandb_project: gpt-oss-20b
48
+ wandb_name: multilingual-reasoning-fft
49
 
50
  gradient_accumulation_steps: 1
51
  micro_batch_size: 2
52
  num_epochs: 1
53
 
54
+ optimizer: adamw_torch_fused
 
 
55
  lr_scheduler: constant_with_warmup
56
  learning_rate: 2e-5
57
 
 
62
  attn_implementation: kernels-community/vllm-flash-attn3
63
 
64
  gradient_checkpointing: true
65
+ #activation_offloading: true
66
 
67
  logging_steps: 1
68
  saves_per_epoch: 1
 
74
  - "<|end|>"
75
  - "<|return|>"
76
 
77
+ deepspeed: /pi-workspace/zero3.json
 
 
 
 
 
 
 
78
 
79
+ # fsdp_version: 2
80
+ # fsdp_config:
81
+ # offload_params: false
82
+ # state_dict_type: SHARDED_STATE_DICT
83
+ # auto_wrap_policy: TRANSFORMER_BASED_WRAP
84
+ # transformer_layer_cls_to_wrap: GptOssDecoderLayer
85
+ # reshard_after_forward: true
86
+ # # cpu_ram_efficient_loading: true
87
  ```
88
 
89
  </details><br>
90
 
91
+ # workspace/data/outputs/gpt-oss-out/
92
 
93
  This model is a fine-tuned version of [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) on the HuggingFaceH4/Multilingual-Thinking dataset.
94
 
model-00001-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e5b710a03085747c068f0b9fc83d8e43f2f9a3ec3d6bc903594fbdf2899d4cd
3
  size 4504304664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e66b52040a9a0777a9c94ad800d7dff662a58c96129efbfd77b54ef553560bd
3
  size 4504304664
model-00002-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b773934f3578f0a413c72fe60d9496996ccad4adb577d725d3ed0d6581980948
3
  size 4939127656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cbe12bb05ec8772ff87808f8916a52f9c330075813be3a978e6b968ba2aa52b
3
  size 4939127656
model-00003-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:523ef402ee9d1873d922df48d1db37a27e4e77c7750806abfaf11e3c048b4e70
3
  size 4939127656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77057a058259fdad6458a2cc253c248a9a308c43390b2e1eaeff108144d1502e
3
  size 4939127656
model-00004-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd562d033a8c9eed46972cebff9957d3b2fc2fb210cad3820d766e0713b61104
3
  size 4939127680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72cc1b6291eec305ae8b01a4f7ab53eb15489496d4fe36b7613d822cf307ccf2
3
  size 4939127680
model-00005-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd2a2cad8877cba299fc6998489427dcf863550a4f7a0e583f120f4711ad9628
3
  size 4939127704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c0775267ff51f292d952a8a06d7d7a4cb709ea9fa48433f5dde5dc1ead0d84f
3
  size 4939127704
model-00006-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6222ca7f362227804c67f68f9405ba05f11afb6c270a59300f95ce9adf40e74f
3
  size 4939127704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89e821b90576a93e94a9457d21012a184e3d3336cd8a499b5040dcc9fa8791c8
3
  size 4939127704
model-00007-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab19707e4163f4ec06d11f82fdd8aaec555b06044f91508411fec3603db3f0d6
3
  size 4939127704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2b0083866777b6ef7405acca40a46242005bbebb423f09bd3223880cc94ee8
3
  size 4939127704
model-00008-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ac63ac294c4ab76a754786e597ae5a6177b8199cea60b6116a07a43007cfae8
3
  size 4939127704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b97d39f740e42cf0bbe9e15a6ad59dae1b93396ca2d85899c4bb9d0b6882192
3
  size 4939127704
model-00009-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d35d808eb3800c3ed4c057f09be24e86bfefc07820ee458486b2048baa0d3d0
3
  size 2751362856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:549eb052e40666b8b016c7a6db7031e752785cec977beed6a315fea12489caef
3
  size 2751362856
model.safetensors.index.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "metadata": {
 
3
  "total_size": 41829514368
4
  },
5
  "weight_map": {
 
1
  {
2
  "metadata": {
3
+ "total_parameters": 335424,
4
  "total_size": 41829514368
5
  },
6
  "weight_map": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fa4b843cb6bcb99be8589958426e121040c1346b3a28c5c5062def1beab2d33
3
- size 8723
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29e1c313582c8dd3b40fa45dcf6d6482aeabf058adc5837643ba6a5b2ecdb37c
3
+ size 9489