rAIfle committed on
Commit
15a5947
·
verified ·
1 Parent(s): 3c0ae14

Upload 3 files

Browse files
Files changed (3) hide show
  1. train/acolyte.json +6 -0
  2. train/acolyte.toml +147 -0
  3. train/acolyte.yaml +19 -0
train/acolyte.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 8,
4
+ "gradient_clipping": 1.0,
5
+ "steps_per_print": 1
6
+ }
train/acolyte.toml ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #unsloth/Mistral-Small-Instruct-2409
2
+ #"hf_hub_url": "teknium/trismegistus-project",
3
+ #"hf_hub_url": "AIRRC/Eudaimonic",
4
+ #"hf_hub_url": "Gryphe/Sonnet3.5-Charcard-Roleplay",
5
+ #"hf_hub_url": "anthracite-org/kalo_misc_part2",
6
+ #"hf_hub_url": "anthracite-org/kalo_opus_misc_240827",
7
+ #"hf_hub_url":"AtlasUnified/atlas-converse",
8
+
9
+
10
+
11
+
12
+ # Paths
13
+ model = '/workspace/model'
14
+ output_dir = '/workspace/out'
15
+
16
+ # Lora configuration
17
+ # can use full_fine_tune=true and no quantization to train the whole model instead of a LoRA
18
+ #full_fine_tune = true
19
+ lora_rank = 1024
20
+ lora_alpha = 256
21
+ lora_dropout = 0.05
22
+
23
+ # Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
24
+ # If not set, adapt all linear modules.
25
+ # Note, this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
26
+ # of these keys as substring will have requires_grad. If not set everything is trained.
27
+ #target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
28
+
29
+ # can specify layers to adapt with LoRA if you want
30
+ #layers_to_transform = '16:31'
31
+
32
+ # for Mixtral, set the load balancing coefficient
33
+ # load_balancing_loss_coef = 0.02
34
+
35
+ # Optimization configuration
36
+ epochs = 2
37
+ lr_scheduler = 'cosine' # can also be 'constant'
38
+ warmup_steps = 50
39
+
40
+ # might be useful if resuming from a checkpoint and you want to change the LR and force it to something
41
+ #force_constant_lr = 5e-5
42
+
43
+ # hard clamp the magnitude of the LoRA weights
44
+ #scale_weight_norms = 1.0
45
+
46
+ # dynamic batch size, targeting this many tokens per batch, per device
47
+ # if set, completely ignores the batch size in the deepspeed JSON config file
48
+ # can be thought of as a replacement for sample packing
49
+ batch_size_tokens = 10000
50
+
51
+ # Performance settings
52
+ pipeline_stages = 2 # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
53
+ logging_steps = 10 # how often to log in Tensorboard
54
+ eval_steps = 500
55
+ save_steps = 500
56
+ checkpoint_every_n_minutes = 60
57
+ eval_before_first_step = false # do an eval before any training happens
58
+ # dtype to load the underlying model weights in
59
+ model_weight_dtype = 'bfloat16'
60
+ # dtype for the LoRA weights
61
+ lora_weight_dtype = 'bfloat16'
62
+ # Can have the saved weights be different dtype. Don't need to set this. Could be useful for
63
+ # training in float32 but saving with float16.
64
+ #save_dtype = 'bfloat16'
65
+ # Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
66
+ # (this only applies to the current training session, and resumed training sessions will not touch
67
+ # old saves)
68
+ keep_states = 5
69
+
70
+ # sort examples by length before dividing them into batches
71
+ # this makes all examples in a batch approximately the same length, to minimize padding
72
+ # the batches are still shuffled after that
73
+ # you should probably always have this set to true
74
+ group_by_length = true
75
+
76
+ # This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
77
+ # for a minor performance hit.
78
+ # Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
79
+ # true: 75s step time, 19.7G peak per-GPU VRAM usage.
80
+ # 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
81
+ activation_checkpointing = 'unsloth'
82
+
83
+ # Keep MLP weights on system RAM until they are needed. Can save a ton of VRAM with a
84
+ # moderate hit to performance. If using an MoE model, this can also be an integer, in
85
+ # which case only that many experts are offloaded (tradeoff between VRAM and speed).
86
+ offload_mlp_to_cpu = true
87
+
88
+ # Resume a prior run
89
+ # if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
90
+ # so, to resume, just run the exact same command but set this to true first
91
+ resume_from_checkpoint = false
92
+
93
+ # Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
94
+ # It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
95
+ # limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
96
+ # this to not load the optimizer states and hopefully the resumption won't OOM.
97
+ #load_optimizer_states = false
98
+
99
+
100
+ # Dataset configuration
101
+
102
+ # How to combine multiple datasets if you have more than one.
103
+ # Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
104
+ dataset_combination_mode = 'concatenate'
105
+ # When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
106
+ # Default if not set: 'first_exhausted'
107
+ # dataset_interleave_stopping_strategy = 'all_exhausted'
108
+ # Can set this lower than training, so we don't drop as many examples when trying to make equal-sized batches.
109
+ # Default if not set: same as training GAS.
110
+ eval_gradient_accumulation_steps = 1
111
+
112
+ # bitsandbytes 4 bit quantization. The parameters here become arguments to Transformers BitsAndBytesConfig.
113
+ #[quantization.bnb]
114
+ #load_in_4bit = true
115
+ #bnb_4bit_use_double_quant = false
116
+ #bnb_4bit_compute_dtype = 'bfloat16'
117
+
118
+ # HQQ quantization. The parameters here become arguments to CustomHQQConfig.
119
+ # [quantization.hqq]
120
+ # nbits = 4
121
+ # group_size = 64
122
+ # compute_dtype = 'bfloat16'
123
+
124
+ # (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
125
+ # is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
126
+ # [quantization.hqq.dynamic_config]
127
+ # gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
128
+ # up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
129
+ # down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
130
+
131
+ [optimizer]
132
+ # options: adamw_kahan, AdamW, AdamW8bit
133
+ type = 'adamw_kahan'
134
+ lr = 5e-5
135
+ beta1 = 0.9
136
+ beta2 = 0.99
137
+ weight_decay = 0.1
138
+
139
+ [[datasets]]
140
+ # Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc if not set.
141
+ name = 'acolyte'
142
+ dataset_type = 'axolotl'
143
+ dataset_path = './acolyte.yaml'
144
+ sequence_len = 16384
145
+ eval_size = 0.01
146
+ # Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
147
+ sample_weight = 1
train/acolyte.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: unsloth/Mistral-Small-Instruct-2409
2
+ model_type: MistralForCausalLM
3
+ tokenizer_type: AutoTokenizer
4
+
5
+ load_in_8bit: false
6
+ load_in_4bit: false
7
+ strict: false
8
+
9
+ datasets:
10
+ - path: teknium/trismegistus-project
11
+ type: sharegpt
12
+ - path: AIRRC/Eudaimonic
13
+ type: sharegpt
14
+ - path: Gryphe/Sonnet3.5-Charcard-Roleplay
15
+ type: sharegpt
16
+ - path: anthracite-org/kalo_misc_part2
17
+ type: sharegpt
18
+ - path: anthracite-org/kalo_opus_misc_240827
19
+ type: sharegpt