---
library_name: transformers
tags:
- generated_from_trainer
license: apache-2.0
language:
- zho
- eng
- fra
- spa
- por
- deu
- ita
- rus
- jpn
- kor
- vie
- tha
- ara
base_model:
- Qwen/Qwen2.5-1.5B
datasets:
- anthracite-org/kalo-opus-instruct-22k-no-refusal
- Nopm/Opus_WritingStruct
- Gryphe/Sonnet3.5-SlimOrcaDedupCleaned
- Gryphe/Sonnet3.5-Charcard-Roleplay
- Gryphe/ChatGPT-4o-Writing-Prompts
- Epiculous/Synthstruct-Gens-v1.1-Filtered-n-Cleaned
- Epiculous/SynthRP-Gens-v1.1-Filtered-n-Cleaned
- nothingiisreal/Reddit-Dirty-And-WritingPrompts
- allura-org/Celeste-1.x-data-mixture
- cognitivecomputations/dolphin-2.9.3
model-index:
- name: EVA-Qwen2.5-1.5B-FFT-v0.0
  results: []
---
# EVA Qwen2.5-1.5B v0.0

<p>
A small-scale RP/storywriting specialist model: a full-parameter finetune of Qwen2.5-1.5B on a mixture of synthetic and natural data.<br>
It uses the Celeste 70B 0.1 data mixture, greatly expanding it to improve the versatility, creativity and "flavor" of the resulting model.<br>
Unlike EVA-D 1.5B v0.0, this model was created without DistillKit, and unlike other EVA versions, Spectrum wasn't used either, since layer freezing is inefficient at this small scale.
</p>
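
Below is a minimal inference sketch using Hugging Face Transformers. It is not an official snippet: the repository id is a placeholder and the sampling settings are only illustrative. The prompt is formatted with the ChatML template the model was trained on (`chat_template: chatml` in the axolotl config below).

```python
# Minimal inference sketch. The repo id below is a placeholder -- substitute the
# actual Hugging Face repository for this model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EVA-UNIT-01/EVA-Qwen2.5-1.5B-v0.0"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [
    {"role": "system", "content": "You are a creative writing partner."},
    {"role": "user", "content": "Write the opening scene of a slow-burn mystery set in a lighthouse."},
]
# apply_chat_template uses the tokenizer's ChatML template to build the prompt.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(
    input_ids, max_new_tokens=300, do_sample=True, temperature=0.8, top_p=0.95
)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```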

<p>
<br>
<h3>
Training data:
</h3>
<ul>
<li>Celeste 70B 0.1 data mixture minus the Opus Instruct subset. See that model's <a href="https://huggingface.co/nothingiisreal/L3.1-70B-Celeste-V0.1-BF16">card</a> for details.</li>
<li>Kalomaze's Opus_Instruct_25k dataset, filtered for refusals.</li>
<li>A subset (1k rows) of ChatGPT-4o-WritingPrompts by Gryphe</li>
<li>A subset (2k rows) of Sonnet3.5-Charcard-Roleplay by Gryphe</li>
<li>Synthstruct and SynthRP datasets by Epiculous</li>
<li>A subset of Dolphin-2.9.3, including a filtered version of not_samantha and a small subset of systemchat.</li>
</ul>
<h3>
Training time and hardware:
</h3>
<ul><li>9 hours on 4x3090Ti</li></ul>
</p>
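
All of the sources above were converted to ShareGPT-style JSONL before training (every dataset entry in the axolotl config below uses `type: sharegpt`). As a rough illustration, with entirely made-up content, building one such record could look like this:

```python
# Illustration only: a made-up record in the ShareGPT layout expected by the
# `type: sharegpt` dataset entries in the axolotl config below.
import json

record = {
    "conversations": [
        {"from": "system", "value": "You are Mira, a sardonic ship's navigator. Stay in character."},
        {"from": "human", "value": "Mira, plot us a course through the debris field."},
        {"from": "gpt", "value": "*taps the star chart* Through it? Bold of you to assume we go through anything."},
    ]
}

# Each record is one line of a .jsonl file, matching the datasets/*.jsonl paths in the config.
with open("example_sharegpt.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```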
<p>The model was created by Kearm, Auri and Cahvay.</p>
<h4>Special thanks:</h4><ul>
<li>to Cahvay for his work on investigating and reprocessing the corrupted dataset, removing the single biggest source of data poisoning,</li>
<li>to Gryphe, Lemmy, Kalomaze, Nopm, Epiculous and CognitiveComputations for the data,</li>
<li>and to Allura-org for support, feedback, beta-testing and quality control of EVA models.</li></ul>

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.1`
```yaml
base_model: /media/kearm/Disk_2/HF_FAST_MoE_Fodder/Qwen2.5-1.5B

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true

# plugins:
# - axolotl.integrations.spectrum.SpectrumPlugin

# spectrum_top_fraction: 0.5
# # Optional if using a pre-scanned model as your base_model. Useful if using a model mirror
# spectrum_model_name: Qwen/Qwen2.5-32B

datasets:
  - path: datasets/Celeste_Filtered_utf8fix.jsonl
    type: sharegpt
  - path: datasets/deduped_not_samantha_norefusals.jsonl
    type: sharegpt
  - path: datasets/deduped_SynthRP-Gens_processed_ShareGPT_converted_cleaned.jsonl
    type: sharegpt
  - path: datasets/deduped_Synthstruct-Gens_processed_sharegpt_converted_cleaned.jsonl
    type: sharegpt
  - path: datasets/Gryphe-4o-WP-filtered-sharegpt_utf8fix.jsonl
    type: sharegpt
  - path: datasets/Sonnet3-5-charcard-names-filtered-sharegpt_utf8fix.jsonl
    type: sharegpt
  - path: datasets/SystemChat_subset_filtered_sharegpt_utf8fix.jsonl
    type: sharegpt
  - path: datasets/S2.jsonl
    type: sharegpt
  - path: datasets/Turing.jsonl
    type: sharegpt

chat_template: chatml
shuffle_merged_datasets: true
val_set_size: 0.05
output_dir: EVA-Qwen2.5-1.5B-FFT-v0.0

sequence_len: 10240
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

# adapter: qlora
# lora_model_dir:
# lora_r: 64
# lora_alpha: 128
# lora_dropout: 0.05
# lora_target_linear: true
# peft_use_dora: true

wandb_project: EVA-Qwen2.5-1.5B-FFT-v0.0
wandb_entity:
wandb_watch:
wandb_name: Unit-00
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 3
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.000005
max_grad_norm: 1.5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: "unsloth"
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 4
save_safetensors: true
save_total_limit: 8
hub_model_id:
hub_strategy:
debug:
deepspeed: deepspeed_configs/zero3_bf16.json
weight_decay: 0.15
# fsdp:
# - full_shard
# - auto_wrap
# fsdp_config:
# fsdp_limit_all_gathers: true
# fsdp_sync_module_states: false
# fsdp_offload_params: true
# fsdp_cpu_ram_efficient_loading: true
# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
# fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
# fsdp_activation_checkpointing: true
# fsdp_state_dict_type: SHARDED_STATE_DICT # Changed from FULL_STATE_DICT
# fsdp_sharding_strategy: FULL_SHARD
# fsdp_forward_prefetch: false # Added
# fsdp_backward_prefetch: "BACKWARD_PRE" # Added
# fsdp_backward_prefetch_limit: 1 # Added
# fsdp_mixed_precision: BF16 # Added

```

</details><br>
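
For reference, the effective batch size implied by the config above, as a back-of-the-envelope sketch (assuming the 4 GPUs from the hardware section all act as data-parallel ranks under DeepSpeed ZeRO-3):

```python
# Rough arithmetic from the axolotl config above; num_gpus is an assumption
# based on the "9 hours on 4x3090Ti" note in the model card.
micro_batch_size = 1
gradient_accumulation_steps = 8
num_gpus = 4
sequence_len = 10240

sequences_per_step = micro_batch_size * gradient_accumulation_steps * num_gpus
tokens_per_step = sequences_per_step * sequence_len  # upper bound with sample_packing

print(f"Sequences per optimizer step: {sequences_per_step}")        # 32
print(f"Tokens per optimizer step (at most): {tokens_per_step:,}")  # 327,680
```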