---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-14B
datasets:
- anthracite-org/kalo-opus-instruct-22k-no-refusal
- Nopm/Opus_WritingStruct
- Gryphe/Sonnet3.5-SlimOrcaDedupCleaned
- Gryphe/Sonnet3.5-Charcard-Roleplay
- Gryphe/ChatGPT-4o-Writing-Prompts
- Epiculous/Synthstruct-Gens-v1.1-Filtered-n-Cleaned
- Epiculous/SynthRP-Gens-v1.1-Filtered-n-Cleaned
- nothingiisreal/Reddit-Dirty-And-WritingPrompts
- allura-org/Celeste-1.x-data-mixture
- cognitivecomputations/dolphin-2.9.3
tags:
- generated_from_trainer
language:
- zho
- eng
- fra
- spa
- por
- deu
- ita
- rus
- jpn
- kor
- vie
- tha
- ara
model-index:
- name: EVA-Qwen2.5-14B-SFFT-v0.2
  results: []
---


# EVA Qwen2.5-14B v0.2

<p>
An RP/storywriting specialist model: a full-parameter finetune of Qwen2.5-14B on a mixture of synthetic and natural data.<br>
It uses the Celeste 70B 0.1 data mixture, greatly expanded to improve the versatility, creativity and "flavor" of the resulting model.<br>
</p>

<p><b>Version notes for 0.2</b>: Now using the refined dataset from 32B v0.2. Major improvements in coherence, instruction following and long-context comprehension over 14B v0.1.</p>

<p>Prompt format is ChatML.</p><br>
<h3>Recommended sampler values:</h3>
<ul>
<li>Temperature: 0.8</li>
<li>Min-P: 0.05</li>
<li>Top-A: 0.3</li>
<li>Repetition Penalty: 1.03</li>
</ul>
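For quick testing outside SillyTavern, the values above map onto standard sampling parameters. Below is a minimal inference sketch using Hugging Face Transformers; the repo id and example messages are placeholders, and Top-A is omitted because vanilla `generate()` does not expose it (backends that support it, such as SillyTavern, can add the 0.3 value on top of these settings):

```python
# Minimal sketch, assuming the bundled tokenizer carries the ChatML chat template used in training.
# Top-A is not a built-in Transformers sampling option, so only the other recommended values are set.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2"  # placeholder; point this at the repo you are actually loading

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [
    {"role": "system", "content": "You are a creative roleplay partner."},
    {"role": "user", "content": "Describe the abandoned observatory my character just entered."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(
    input_ids,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.8,          # recommended above
    min_p=0.05,               # recommended above
    repetition_penalty=1.03,  # recommended above
)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```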

<h3>Recommended SillyTavern presets (via CalamitousFelicitousness):</h3>

- [Context](https://huggingface.co/EVA-UNIT-01/EVA-Yi-1.5-9B-32K-V1/blob/main/%5BChatML%5D%20Roleplay-v1.9%20Context.json)
- [Instruct and System Prompt](https://huggingface.co/EVA-UNIT-01/EVA-Yi-1.5-9B-32K-V1/blob/main/%5BChatML%5D%20Roleplay-v1.9%20Instruct.json)

<h3>
Training data:
</h3>
<ul>
<li>Celeste 70B 0.1 data mixture minus the Opus Instruct subset. See that model's <a href=https://huggingface.co/nothingiisreal/L3.1-70B-Celeste-V0.1-BF16>card</a> for details.</li>
<li>Kalomaze's Opus_Instruct_25k dataset, filtered to remove refusals.</li>
<li>A subset (1k rows) of ChatGPT-4o-WritingPrompts by Gryphe.</li>
<li>A subset (2k rows) of Sonnet3.5-Charcard-Roleplay by Gryphe.</li>
<li>The Synthstruct and SynthRP datasets by Epiculous.</li>
<li>A subset of Dolphin-2.9.3, including a filtered version of not_samantha and a small subset of systemchat.</li>
</ul>
<h3>
Training time and hardware:
</h3>
<ul><li>3 hours on 8xH100 SXM, provided by <a href=https://featherless.ai/>FeatherlessAI</a></li></ul><br>

<p>The model was created by Kearm, Auri and Cahvay.</p>
<h4>Special thanks:</h4><ul>
<li><b>to Cahvay for investigating and reprocessing the corrupted dataset, removing the single biggest source of data poisoning.</b></li>
<li><b>to <a href=https://featherless.ai/>FeatherlessAI</a> for generously providing the 8xH100 SXM node used to train this model.</b></li>
<li>to Gryphe, Lemmy, Kalomaze, Nopm, Epiculous and CognitiveComputations for the data.</li>
<li>and to Allura-org for support, feedback, beta-testing and quality control of EVA models.</li></ul>


[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.1`
```yaml
base_model: Qwen/Qwen2.5-14B

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true

# plugins:
# - axolotl.integrations.spectrum.SpectrumPlugin

# spectrum_top_fraction: 0.5
# # Optional if using a pre-scanned model as your base_model. Useful if using a model mirror
# spectrum_model_name: Qwen/Qwen2.5-32B

datasets:
- path: datasets/Celeste_Filtered_utf8fix.jsonl
  type: sharegpt
- path: datasets/deduped_not_samantha_norefusals.jsonl
  type: sharegpt
- path: datasets/deduped_SynthRP-Gens_processed_ShareGPT_converted_cleaned.jsonl
  type: sharegpt
- path: datasets/deduped_Synthstruct-Gens_processed_sharegpt_converted_cleaned.jsonl
  type: sharegpt
- path: datasets/Gryphe-4o-WP-filtered-sharegpt_utf8fix.jsonl
  type: sharegpt
- path: datasets/opus-instruct-22k-no_refusals-filtered_utf8fix.jsonl
  type: sharegpt
- path: datasets/Sonnet3-5-charcard-names-filtered-sharegpt_utf8fix.jsonl
  type: sharegpt
- path: datasets/SystemChat_subset_filtered_sharegpt_utf8fix.jsonl
  type: sharegpt

chat_template: chatml
shuffle_merged_datasets: true
val_set_size: 0.001
output_dir: ./EVA-Qwen2.5-14B-SFFT-v0.2

sequence_len: 10240
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

# adapter: qlora
# lora_model_dir:
# lora_r: 64
# lora_alpha: 128
# lora_dropout: 0.05
# lora_target_linear: true
# peft_use_dora: true

base_model: Qwen/Qwen2.5-14B

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true

datasets:
- path: datasets/Celeste_Filtered_utf8fix.jsonl
  type: sharegpt
- path: datasets/deduped_not_samantha_norefusals.jsonl
  type: sharegpt
- path: datasets/deduped_SynthRP-Gens_processed_ShareGPT_converted_cleaned.jsonl
  type: sharegpt
- path: datasets/deduped_Synthstruct-Gens_processed_sharegpt_converted_cleaned.jsonl
  type: sharegpt
- path: datasets/Gryphe-4o-WP-filtered-sharegpt_utf8fix.jsonl
  type: sharegpt
- path: datasets/opus-instruct-22k-no_refusals-filtered_utf8fix.jsonl
  type: sharegpt
- path: datasets/Sonnet3-5-charcard-names-filtered-sharegpt_utf8fix.jsonl
  type: sharegpt
- path: datasets/SystemChat_subset_filtered_sharegpt_utf8fix.jsonl
  type: sharegpt

chat_template: chatml
shuffle_merged_datasets: true
val_set_size: 0.005
output_dir: ./EVA-Qwen2.5-14B-SFFT-v0.2

sequence_len: 10240
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

# adapter: qlora
# lora_model_dir:
# lora_r: 32
# lora_alpha: 16
# lora_dropout: 0.05
# lora_target_linear: true
# peft_use_dora: true

unfrozen_parameters:
- ^lm_head.weight$
- ^model.embed_tokens.weight$
# mlp.down_proj layers
- model.layers.1.mlp.down_proj
- model.layers.35.mlp.down_proj
- model.layers.38.mlp.down_proj
- model.layers.37.mlp.down_proj
- model.layers.36.mlp.down_proj
- model.layers.15.mlp.down_proj
- model.layers.11.mlp.down_proj
- model.layers.12.mlp.down_proj
- model.layers.34.mlp.down_proj
- model.layers.44.mlp.down_proj
- model.layers.45.mlp.down_proj
- model.layers.9.mlp.down_proj
- model.layers.41.mlp.down_proj
- model.layers.33.mlp.down_proj
- model.layers.43.mlp.down_proj
- model.layers.40.mlp.down_proj
- model.layers.13.mlp.down_proj
- model.layers.8.mlp.down_proj
- model.layers.39.mlp.down_proj
- model.layers.10.mlp.down_proj
- model.layers.14.mlp.down_proj
- model.layers.16.mlp.down_proj
- model.layers.31.mlp.down_proj
- model.layers.32.mlp.down_proj
# mlp.gate_proj layers
- model.layers.1.mlp.gate_proj
- model.layers.44.mlp.gate_proj
- model.layers.46.mlp.gate_proj
- model.layers.45.mlp.gate_proj
- model.layers.43.mlp.gate_proj
- model.layers.47.mlp.gate_proj
- model.layers.42.mlp.gate_proj
- model.layers.32.mlp.gate_proj
- model.layers.27.mlp.gate_proj
- model.layers.33.mlp.gate_proj
- model.layers.28.mlp.gate_proj
- model.layers.39.mlp.gate_proj
- model.layers.41.mlp.gate_proj
- model.layers.40.mlp.gate_proj
- model.layers.30.mlp.gate_proj
- model.layers.29.mlp.gate_proj
- model.layers.31.mlp.gate_proj
- model.layers.37.mlp.gate_proj
- model.layers.26.mlp.gate_proj
- model.layers.10.mlp.gate_proj
- model.layers.38.mlp.gate_proj
- model.layers.36.mlp.gate_proj
- model.layers.12.mlp.gate_proj
- model.layers.13.mlp.gate_proj
# mlp.up_proj layers
- model.layers.1.mlp.up_proj
- model.layers.13.mlp.up_proj
- model.layers.11.mlp.up_proj
- model.layers.14.mlp.up_proj
- model.layers.15.mlp.up_proj
- model.layers.12.mlp.up_proj
- model.layers.8.mlp.up_proj
- model.layers.16.mlp.up_proj
- model.layers.9.mlp.up_proj
- model.layers.19.mlp.up_proj
- model.layers.10.mlp.up_proj
- model.layers.7.mlp.up_proj
- model.layers.17.mlp.up_proj
- model.layers.20.mlp.up_proj
- model.layers.21.mlp.up_proj
- model.layers.18.mlp.up_proj
- model.layers.37.mlp.up_proj
- model.layers.38.mlp.up_proj
- model.layers.39.mlp.up_proj
- model.layers.42.mlp.up_proj
- model.layers.41.mlp.up_proj
- model.layers.27.mlp.up_proj
- model.layers.28.mlp.up_proj
- model.layers.36.mlp.up_proj
# self_attn.k_proj layers
- model.layers.47.self_attn.k_proj
- model.layers.39.self_attn.k_proj
- model.layers.41.self_attn.k_proj
- model.layers.37.self_attn.k_proj
- model.layers.35.self_attn.k_proj
- model.layers.44.self_attn.k_proj
- model.layers.38.self_attn.k_proj
- model.layers.14.self_attn.k_proj
- model.layers.7.self_attn.k_proj
- model.layers.12.self_attn.k_proj
- model.layers.11.self_attn.k_proj
- model.layers.32.self_attn.k_proj
- model.layers.10.self_attn.k_proj
- model.layers.8.self_attn.k_proj
- model.layers.6.self_attn.k_proj
- model.layers.9.self_attn.k_proj
- model.layers.45.self_attn.k_proj
- model.layers.42.self_attn.k_proj
- model.layers.40.self_attn.k_proj
- model.layers.5.self_attn.k_proj
- model.layers.0.self_attn.k_proj
- model.layers.33.self_attn.k_proj
- model.layers.34.self_attn.k_proj
- model.layers.13.self_attn.k_proj
# self_attn.o_proj layers
- model.layers.12.self_attn.o_proj
- model.layers.5.self_attn.o_proj
- model.layers.14.self_attn.o_proj
- model.layers.16.self_attn.o_proj
- model.layers.20.self_attn.o_proj
- model.layers.13.self_attn.o_proj
- model.layers.11.self_attn.o_proj
- model.layers.4.self_attn.o_proj
- model.layers.6.self_attn.o_proj
- model.layers.19.self_attn.o_proj
- model.layers.7.self_attn.o_proj
- model.layers.18.self_attn.o_proj
- model.layers.8.self_attn.o_proj
- model.layers.38.self_attn.o_proj
- model.layers.15.self_attn.o_proj
- model.layers.17.self_attn.o_proj
- model.layers.9.self_attn.o_proj
- model.layers.10.self_attn.o_proj
- model.layers.21.self_attn.o_proj
- model.layers.28.self_attn.o_proj
- model.layers.32.self_attn.o_proj
- model.layers.35.self_attn.o_proj
- model.layers.39.self_attn.o_proj
- model.layers.3.self_attn.o_proj
# self_attn.q_proj layers
- model.layers.1.self_attn.q_proj
- model.layers.2.self_attn.q_proj
- model.layers.3.self_attn.q_proj
- model.layers.44.self_attn.q_proj
- model.layers.29.self_attn.q_proj
- model.layers.45.self_attn.q_proj
- model.layers.43.self_attn.q_proj
- model.layers.32.self_attn.q_proj
- model.layers.38.self_attn.q_proj
- model.layers.19.self_attn.q_proj
- model.layers.42.self_attn.q_proj
- model.layers.34.self_attn.q_proj
- model.layers.36.self_attn.q_proj
- model.layers.40.self_attn.q_proj
- model.layers.26.self_attn.q_proj
- model.layers.20.self_attn.q_proj
- model.layers.28.self_attn.q_proj
- model.layers.39.self_attn.q_proj
- model.layers.41.self_attn.q_proj
- model.layers.33.self_attn.q_proj
- model.layers.35.self_attn.q_proj
- model.layers.25.self_attn.q_proj
- model.layers.30.self_attn.q_proj
- model.layers.27.self_attn.q_proj
# self_attn.v_proj layers
- model.layers.0.self_attn.v_proj
- model.layers.7.self_attn.v_proj
- model.layers.39.self_attn.v_proj
- model.layers.31.self_attn.v_proj
- model.layers.15.self_attn.v_proj
- model.layers.10.self_attn.v_proj
- model.layers.41.self_attn.v_proj
- model.layers.32.self_attn.v_proj
- model.layers.6.self_attn.v_proj
- model.layers.33.self_attn.v_proj
- model.layers.42.self_attn.v_proj
- model.layers.29.self_attn.v_proj
- model.layers.9.self_attn.v_proj
- model.layers.14.self_attn.v_proj
- model.layers.35.self_attn.v_proj
- model.layers.38.self_attn.v_proj
- model.layers.13.self_attn.v_proj
- model.layers.30.self_attn.v_proj
- model.layers.34.self_attn.v_proj
- model.layers.5.self_attn.v_proj
- model.layers.28.self_attn.v_proj
- model.layers.37.self_attn.v_proj
- model.layers.27.self_attn.v_proj
- model.layers.11.self_attn.v_proj

wandb_project: EVA-Qwen2.5-14B-SFFT-v0.2
wandb_entity:
wandb_watch:
wandb_name: Unit-02
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 2
num_epochs: 3
optimizer: paged_ademamix_8bit
lr_scheduler: cosine
learning_rate: 0.00005
max_grad_norm: 3

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: "unsloth"
# gradient_checkpointing_kwargs:
# use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 4
save_safetensors: true
hub_model_id:
hub_strategy:
debug:
deepspeed: deepspeed_configs/zero3_bf16.json
weight_decay: 0.1
# fsdp:
# - full_shard
# - auto_wrap
# fsdp_config:
# fsdp_limit_all_gathers: true
# fsdp_sync_module_states: false
# fsdp_offload_params: true
# fsdp_cpu_ram_efficient_loading: true
# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
# fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
# fsdp_activation_checkpointing: true
# fsdp_state_dict_type: SHARDED_STATE_DICT # Changed from FULL_STATE_DICT
# fsdp_sharding_strategy: FULL_SHARD
# fsdp_forward_prefetch: false # Added
# fsdp_backward_prefetch: "BACKWARD_PRE" # Added
# fsdp_backward_prefetch_limit: 1 # Added
# fsdp_mixed_precision: BF16 # Added
```

</details><br>
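A note on the `unfrozen_parameters` block in the config above: it lists name patterns, and only parameters whose names match one of them are updated during the full-parameter SFFT run, while the rest of the network stays frozen. The sketch below only illustrates that selection idea in plain PyTorch; it is not axolotl's implementation, and the pattern list is truncated to an excerpt:

```python
# Illustrative sketch only: freeze everything except parameters whose names match
# patterns like the ones listed under unfrozen_parameters in the config above.
import re

import torch
from transformers import AutoModelForCausalLM

# Excerpt of the patterns from the config; the full list also targets specific
# mlp and self_attn projections across many layers.
unfrozen_patterns = [
    r"^lm_head.weight$",
    r"^model.embed_tokens.weight$",
    r"model.layers.1.mlp.down_proj",
]

# Loading the full 14B model needs substantial memory; this is just to show the mechanism.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B", torch_dtype=torch.bfloat16)

for name, param in model.named_parameters():
    # A parameter stays trainable if any pattern matches somewhere in its name.
    param.requires_grad = any(re.search(pattern, name) for pattern in unfrozen_patterns)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / {total:,} parameters")
```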