rAIfle committed on
Commit
15a5947
·
verified ·
1 Parent(s): 3c0ae14

Upload 3 files

Browse files
Files changed (3) hide show
  1. train/acolyte.json +6 -0
  2. train/acolyte.toml +147 -0
  3. train/acolyte.yaml +19 -0
train/acolyte.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 8,
4
+ "gradient_clipping": 1.0,
5
+ "steps_per_print": 1
6
+ }
train/acolyte.toml ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #unsloth/Mistral-Small-Instruct-2409
2
+ #"hf_hub_url": "teknium/trismegistus-project",
3
+ #"hf_hub_url": "AIRRC/Eudaimonic",
4
+ #"hf_hub_url": "Gryphe/Sonnet3.5-Charcard-Roleplay",
5
+ #"hf_hub_url": "anthracite-org/kalo_misc_part2",
6
+ #"hf_hub_url": "anthracite-org/kalo_opus_misc_240827",
7
+ #"hf_hub_url":"AtlasUnified/atlas-converse",
8
+
9
+
10
+
11
+
12
+ # Paths
13
+ model = '/workspace/model'
14
+ output_dir = '/workspace/out'
15
+
16
+ # Lora configuration
17
+ # can use full_fine_tune=true and no quantization to train the whole model instead of a LoRA
18
+ #full_fine_tune = true
19
+ lora_rank = 1024
20
+ lora_alpha = 256
21
+ lora_dropout = 0.05
22
+
23
+ # Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
24
+ # If not set, adapt all linear modules.
25
+ # Note, this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
26
+ # of these keys as substring will have requires_grad. If not set everything is trained.
27
+ #target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
28
+
29
+ # can specify layers to adapt with LoRA if you want
30
+ #layers_to_transform = '16:31'
31
+
32
+ # for Mixtral, set the load balancing coefficient
33
+ # load_balancing_loss_coef = 0.02
34
+
35
+ # Optimization configuration
36
+ epochs = 2
37
+ lr_scheduler = 'cosine' # can also be 'constant'
38
+ warmup_steps = 50
39
+
40
+ # might be useful if resuming from a checkpoint and you want to change the LR and force it to something
41
+ #force_constant_lr = 5e-5
42
+
43
+ # hard clamp the magnitude of the LoRA weights
44
+ #scale_weight_norms = 1.0
45
+
46
+ # dynamic batch size, targeting this many tokens per batch, per device
47
+ # if set, completely ignores the batch size in the deepspeed JSON config file
48
+ # can be thought of as a replacement for sample packing
49
+ batch_size_tokens = 10000
50
+
51
+ # Performance settings
52
+ pipeline_stages = 2 # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
53
+ logging_steps = 10 # how often to log in Tensorboard
54
+ eval_steps = 500
55
+ save_steps = 500
56
+ checkpoint_every_n_minutes = 60
57
+ eval_before_first_step = false # do an eval before any training happens
58
+ # dtype to load the underlying model weights in
59
+ model_weight_dtype = 'bfloat16'
60
+ # dtype for the LoRA weights
61
+ lora_weight_dtype = 'bfloat16'
62
+ # Can have the saved weights be different dtype. Don't need to set this. Could be useful for
63
+ # training in float32 but saving with float16.
64
+ #save_dtype = 'bfloat16'
65
+ # Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
66
+ # (this only applies to the current training session, and resumed training sessions will not touch
67
+ # old saves)
68
+ keep_states = 5
69
+
70
+ # sort examples by length before dividing them into batches
71
+ # this makes all examples in a batch approximately the same length, to minimize padding
72
+ # the batches are still shuffled after that
73
+ # you should probably always have this set to true
74
+ group_by_length = true
75
+
76
+ # This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
77
+ # for a minor performance hit.
78
+ # Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
79
+ # true: 75s step time, 19.7G peak per-GPU VRAM usage.
80
+ # 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
81
+ activation_checkpointing = 'unsloth'
82
+
83
+ # Keep MLP weights on system RAM until they are needed. Can save a ton of VRAM with a
84
+ # moderate hit to performance. If using an MoE model, this can also be an integer, in
85
+ # which case only that many experts are offloaded (tradeoff between VRAM and speed).
86
+ offload_mlp_to_cpu = true
87
+
88
+ # Resume a prior run
89
+ # if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
90
+ # so, to resume, just run the exact same command but set this to true first
91
+ resume_from_checkpoint = false
92
+
93
+ # Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
94
+ # It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
95
+ # limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
96
+ # this to not load the optimizer states and hopefully the resumption won't OOM.
97
+ #load_optimizer_states = false
98
+
99
+
100
+ # Dataset configuration
101
+
102
+ # How to combine multiple datasets if you have more than one.
103
+ # Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
104
+ dataset_combination_mode = 'concatenate'
105
+ # When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
106
+ # Default if not set: 'first_exhausted'
107
+ # dataset_interleave_stopping_strategy = 'all_exhausted'
108
+ # Can set this lower than training, so we don't drop as many examples when trying to make equal-sized batches.
109
+ # Default if not set: same as training GAS.
110
+ eval_gradient_accumulation_steps = 1
111
+
112
+ # bitsandbytes 4 bit quantization. The parameters here become arguments to Transformers BitsAndBytesConfig.
113
+ #[quantization.bnb]
114
+ #load_in_4bit = true
115
+ #bnb_4bit_use_double_quant = false
116
+ #bnb_4bit_compute_dtype = 'bfloat16'
117
+
118
+ # HQQ quantization. The parameters here become arguments to CustomHQQConfig.
119
+ # [quantization.hqq]
120
+ # nbits = 4
121
+ # group_size = 64
122
+ # compute_dtype = 'bfloat16'
123
+
124
+ # (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
125
+ # is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
126
+ # [quantization.hqq.dynamic_config]
127
+ # gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
128
+ # up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
129
+ # down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
130
+
131
+ [optimizer]
132
+ # options: adamw_kahan, AdamW, AdamW8bit
133
+ type = 'adamw_kahan'
134
+ lr = 5e-5
135
+ beta1 = 0.9
136
+ beta2 = 0.99
137
+ weight_decay = 0.1
138
+
139
+ [[datasets]]
140
+ # Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc if not set.
141
+ name = 'acolyte'
142
+ dataset_type = 'axolotl'
143
+ dataset_path = './acolyte.yaml'
144
+ sequence_len = 16384
145
+ eval_size = 0.01
146
+ # Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
147
+ sample_weight = 1
train/acolyte.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: unsloth/Mistral-Small-Instruct-2409
2
+ model_type: MistralForCausalLM
3
+ tokenizer_type: AutoTokenizer
4
+
5
+ load_in_8bit: false
6
+ load_in_4bit: false
7
+ strict: false
8
+
9
+ datasets:
10
+ - path: teknium/trismegistus-project
11
+ type: sharegpt
12
+ - path: AIRRC/Eudaimonic
13
+ type: sharegpt
14
+ - path: Gryphe/Sonnet3.5-Charcard-Roleplay
15
+ type: sharegpt
16
+ - path: anthracite-org/kalo_misc_part2
17
+ type: sharegpt
18
+ - path: anthracite-org/kalo_opus_misc_240827
19
+ type: sharegpt