Commit 5f8ef4c, committed by github-actions[bot]
🚀 Deploy method comparison app from GH action
This view is limited to 50 files because the commit contains too many changes.

Changed files:
- MetaMathQA/Makefile +90 -0
- MetaMathQA/README.md +241 -0
- MetaMathQA/cancelled_results/.gitkeep +0 -0
- MetaMathQA/data.py +109 -0
- MetaMathQA/default_training_params.json +26 -0
- MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json +39 -0
- MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json +11 -0
- MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json +6 -0
- MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json +20 -0
- MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json +19 -0
- MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json +19 -0
- MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json +21 -0
- MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json +6 -0
- MetaMathQA/experiments/delora/llama-3.2-3B-rank32/adapter_config.json +20 -0
- MetaMathQA/experiments/delora/llama-3.2-3B-rank32/training_params.json +6 -0
- MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json +23 -0
- MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json +23 -0
- MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json +6 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json +14 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json +14 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json +11 -0
- MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json +24 -0
- MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json +27 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank10-target-mlp/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json +9 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json +30 -0
- MetaMathQA/experiments/miss/llama-3.2-3B-bat/adapter_config.json +18 -0
- MetaMathQA/experiments/miss/llama-3.2-3B-default/adapter_config.json +18 -0
- MetaMathQA/experiments/miss/llama-3.2-3B-mini/adapter_config.json +18 -0
- MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json +27 -0
- MetaMathQA/experiments/osf/llama-3.2-3B-rank128/adapter_config.json +28 -0
- MetaMathQA/experiments/osf/llama-3.2-3B-rank128/training_params.json +6 -0
- MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json +15 -0
- MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json +17 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json +17 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/adapter_config.json +17 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json +17 -0
- MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json +22 -0
- MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/adapter_config.json +12 -0
- MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/training_params.json +5 -0
- MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/adapter_config.json +15 -0
- MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/training_params.json +6 -0
    	
MetaMathQA/Makefile (ADDED)

```makefile
# Makefile for running MetaMathQA experiments.

# --- Configuration ---
PYTHON := python
RUN_SCRIPT := run.py
EXPERIMENTS_DIR := experiments
RESULTS_DIR := results

# --- Automatic Experiment and Result Discovery ---

# 1. Find all experiment directories by looking for adapter_config.json files.
#    This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ...
EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \
		    -name "adapter_config.json" -or \
		    -name "training_params.json" | xargs dirname | sort -u)

# 2. Define a function to replace all occurrences of a character in a string.
#    This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora-foo").
#    Usage: $(call replace-all, string, char_to_replace, replacement_char)
replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1))

# 3. Define a function to convert an experiment path to its flat result file path.
#    e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora-llama-3.2-3B-rank32.json"
exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json

# 4. Generate the list of all target result files we want to build.
RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp)))


# --- Main Rules ---

# The default 'all' target depends on all possible result files.
# Running `make` or `make all` will check and run any outdated or missing experiments.
all: $(RESULT_FILES)


# --- Dynamic Rule Generation ---

# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found.
# This avoids a complex pattern rule and makes the logic clearer.
define EXPERIMENT_template
# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32)

# Define the rule:
# The target is the result file (e.g., results/lora-llama-3.2-3B-rank32.json).
# The dependencies are its config files; code changes need to be audited manually since they can
# vary in degree of importance. Note that we explicitly ignore when the script fails to run
# so that the other experiments still have a chance to run.
$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
	@echo "---"
	@echo "Running experiment: $(1)"
	-$(PYTHON) $(RUN_SCRIPT) -v $(1)
	@echo "Finished: $$@"
	@echo "---"

endef

# This command iterates through every found experiment path and evaluates the template,
# effectively stamping out a unique, explicit rule for each one.
$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path))))


# --- Utility Rules ---

.PHONY: all clean list dump_rules

# The 'clean' rule removes all generated results.
clean:
	@echo "Cleaning results directory..."
	@([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0

# The 'list' rule is for debugging. It shows the discovered experiments
# and the result files the Makefile expects to create for them.
list:
	@echo "Discovered experiment configurations:"
	@$(foreach exp,$(EXPERIMENT_PATHS),echo "  - $(exp)/adapter_config.json";)
	@echo "\nTarget result files:"
	@$(foreach res,$(RESULT_FILES),echo "  - $(res)";)

# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules.
define newline


endef
define DUMPED_RULES
	$(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path)))
endef

dump_rules:
	@echo -e "$(subst $(newline),\n,${DUMPED_RULES})"
```
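To make the naming scheme concrete, here is a small Python sketch (not part of the repository) of the mapping that `replace-all` and `exp_to_res` implement; following the `$(call replace-all,...,/,--)` invocation above, every `/` in the path relative to `EXPERIMENTS_DIR` is replaced by the separator before `.json` is appended:

```python
# Sketch of the Makefile's exp_to_res mapping; the separator mirrors the
# third argument of the replace-all call above.
def exp_to_res(experiment_path: str, results_dir: str = "results", sep: str = "--") -> str:
    relative = experiment_path.removeprefix("experiments/")  # drop EXPERIMENTS_DIR
    return f"{results_dir}/{relative.replace('/', sep)}.json"

print(exp_to_res("experiments/lora/llama-3.2-3B-rank32"))
# results/lora--llama-3.2-3B-rank32.json
```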
    	
MetaMathQA/README.md (ADDED)

# PEFT method comparison on the MetaMathQA and GSM8K datasets

## Goal

The goal is to provide a benchmarking framework for the different PEFT methods that are implemented. It is important that evaluating different PEFT methods is reproducible, idempotent, and version-controlled. Results for more PEFT methods can be added over time.

## Dataset

This task trains on the [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset and validates/tests on the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset ("main").

For the model to attain good accuracy, it needs to learn to adhere to the output format and it must express basic chain-of-thought reasoning capabilities to get to the correct result in the first place. The task is challenging for models in the sub-7B parameter range.

The train set uses the whole of MetaMathQA. The validation set is a random sample from the GSM8K train set. The test set is the whole of the GSM8K test set.

## Running

Create an experiment in the `experiments/<peft-method>` folder of your choice and give it a name (the name itself does not matter but helps identify the experiment). An example would be `experiments/lora/llama-3.2-3B-rank32/`. Inside that directory, create two files:

- `adapter_config.json`
- Optional: `training_params.json`

Once you have created these two files, you can either

- run the whole suite by simply calling `make` (takes >24h)
- run one specific experiment by calling `make results/<experiment_name>-<experiment_variation>.json`,
  for example `results/vblora-llama-3.2-3B-default.json`

You can get a list of all runnable experiments by running `make list`, e.g.:

```
% make list
Discovered experiment configurations:
  - experiments/ptuning/llama-3.2-3B-default/adapter_config.json
  [...]
  - experiments/vblora/llama-3.2-3B-default/adapter_config.json

Target result files:
  - results/ptuning-llama-3.2-3B-default.json
  [...]
  - results/vblora-llama-3.2-3B-default.json
```

In case you want to force the execution of an experiment, you can simply `touch` the respective adapter config without modifying it. For example:

    touch experiments/vblora/llama-3.2-3B-default/adapter_config.json
    make

to run the VBLoRA default experiment again.

### `adapter_config.json`

This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.:

```python
from peft import LoraConfig

config = LoraConfig(...)
config.save_pretrained(<path-to-experiment>)
```

### `training_params.json`

There is a default file for the non-PEFT parameters: `default_training_params.json`. This contains all the other parameters that are relevant for training, e.g. the base model id, number of steps, batch size, learning rate, etc. If parameters that differ from the defaults are needed for a specific experiment, place a `training_params.json` into the experiment directory and adjust the parameters that need changing. The other parameters are taken from the aforementioned default config.

For an overview of all possible arguments, you can also check the `TrainConfig` `dataclass` in `utils.py`.
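As a rough illustration (a sketch, not the repository's actual merging code, which lives behind `TrainConfig`), an experiment that only changes the learning rate could generate its override file like this:

```python
import json

# Hypothetical example: override only the learning rate; every parameter
# not listed here falls back to default_training_params.json.
override = {"optimizer_kwargs": {"lr": 5e-4}}

with open("experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json", "w") as f:
    json.dump(override, f, indent=2)
```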
### Runtime performance

Several factors should be considered to achieve fast runtime performance. Besides the obvious factors like `max_steps` or the base model size, we found the following factors to have a significant impact:

#### Eval batch size

The `batch_size_eval` parameter is quite critical, since evaluation takes up a significant portion of the training time and batching helps with reducing that. It should be possible to choose a value several times higher than the batch size used for training (`batch_size`). You should also pay attention to the size of the validation set -- e.g. if it's 50, don't choose a `batch_size_eval` of 40, as that results in a large batch of 40 and a small batch of 10; 25 might be a better choice. Also, ensure via a quick train run that the batch size does not lead to out-of-memory errors -- getting this error at the very end, when evaluating the test set, would be quite a loss of time.

#### Generation length

During testing, we discovered that the validation time is greatly inflated by just a few very long generations. Those can inflate the validation time by a factor of 3 or more. At the same time, we discovered that these long generations do not help with accuracy -- in fact, if they exceed the maximum configured length, they're just cut off mid-sentence and would thus produce an accuracy of 0 anyway.

To remedy this, we now set both `max_length` and `max_new_tokens` for the generation kwargs in the default training parameters. Normally, this is not possible when using transformers, as the latter argument overrides the former. However, we have added special logic inside of `get_generation_config` which takes both and chooses the smaller of the two. This way, we can get rid of these excessively long generations, thus considerably reducing eval times, while still guaranteeing a maximum total generation length to guard against OOM errors. Testing showed that this does not hamper test accuracy. It is therefore recommended not to change these settings.
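The following is a simplified sketch of that logic (the real `get_generation_config` may differ in details, e.g. how the prompt length is obtained):

```python
from transformers import GenerationConfig

def get_generation_config(prompt_len: int, *, max_length: int, max_new_tokens: int) -> GenerationConfig:
    # Enforce whichever limit is more restrictive: the absolute cap
    # (max_length) guards against OOM, while max_new_tokens cuts off
    # pathologically long generations early.
    effective_max_length = min(max_length, prompt_len + max_new_tokens)
    return GenerationConfig(max_length=effective_max_length)
```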
#### Bucketing

The length of the sequences in the training data can vary a lot. Therefore, if samples are taken randomly from the training dataset, we will end up with batches containing very short and very long sequences. This is bad because the batch will be padded to the longest sequence, slowing down training. The obvious solution would be to sort the whole dataset by sequence length, but this is also bad because it introduces an order bias (e.g. first training on only short and then on only long answers).

The solution is to find a trade-off between the two factors. This is achieved by the `BucketIterator`. It first creates buckets that contain multiple batches, e.g. 20x the batch size. Each bucket is then sorted by sequence length, and batches are yielded from the bucket. Therefore, we have a small order bias within a bucket but not between buckets, striking a good balance between training speed and training loss.

From practical experiments, for a batch size of 4, a bucket size of 80 provides a good balance, with only slightly lower training loss while cutting training time by 25%. For eval, we don't use the iterator, since there the batch size is relatively big and thus there is little upside.
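A minimal sketch of the bucketing idea (not the repository's actual `BucketIterator` implementation):

```python
import random

def iter_bucketed_batches(samples, batch_size=4, bucket_factor=20):
    """Yield batches of similar-length samples.

    Shuffle globally, then sort only within a bucket of
    bucket_factor * batch_size samples: padding is reduced without
    imposing a global length ordering on the training data.
    """
    samples = list(samples)
    random.shuffle(samples)
    bucket_size = bucket_factor * batch_size
    for start in range(0, len(samples), bucket_size):
        bucket = sorted(samples[start : start + bucket_size], key=len)
        for i in range(0, len(bucket), batch_size):
            yield bucket[i : i + batch_size]
```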
### Start a run

Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be:

```sh
python run.py -v experiments/lora/llama-3.2-3B-rank32/
```

By default, the adapter will be saved in a temporary file for further inspection if needed. To prevent this, add the `--clean` flag to the call.

### Run status

The run can be categorized into 3 different states:

1. Main run: You are on the `main` branch and the run ended successfully. The results are stored in the `results` folder and are used for further analysis.
2. Test run: You are not on the `main` branch and the run ended successfully. The results are stored in the `temporary_results` folder and are not used for further analysis.
3. The run was cancelled (`ctrl + c`). The results are stored in the `cancelled_results` folder and are not used for further analysis.

## Outputs

Results are stored in one of the result directories. An example output could look like so:

```js
{
  "run_info": {
    "created_at": "2025-03-05T13:50:05+00:00",
    "total_time": 2711.0915009640157,
    "experiment_name": "ia3/lr_0.001",
    "peft_branch": "ben-method-comparison",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 51,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_kwargs": {
        "lr": 0.001
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "generation_kwargs": {
        "max_length": 800
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "IA3",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "target_modules": [
        "v_proj",
        "k_proj",
        "down_proj"
      ],
      "exclude_modules": null,
      "feedforward_modules": [
        "down_proj"
      ],
      "fan_in_fan_out": false,
      "modules_to_save": null,
      "init_ia3_weights": true
    }
  },
  "train_info": {
    "accelerator_memory_reserved_avg": 14229219940,
    "accelerator_memory_max": 24847056896,
    "accelerator_memory_reserved_99th": 19115624366,
    "train_time": 2238.65277833899,
    "file_size": 1157064,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0784313725490196,
        "train loss": 1.1336498007774354,
        "train samples": 1000
      },
      [...]
      {
        "step": 5000,
        "valid accuracy": 0.21568627450980393,
        "train loss": 0.6345920492410659,
        "train samples": 20000
      },
      {
        "step": 5000,
        "test accuracy": 0.35129740518962077,
        "train loss": 0.6345920492410659,
        "train samples": 20000,
        "train total tokens": 4197579
      }
    ]
  },
  "meta_info": {
    "model_sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
    "model_created_at": "2024-09-18T15:23:48+00:00",
    "dataset_sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
    "dataset_created_at": "2023-09-21T17:22:46+00:00",
    "package_info": {
      "transformers-version": "4.50.0.dev0",
      "transformers-commit-hash": "752ef3fd4e70869626ec70657a770a85c0ad9219",
      "peft-version": "0.14.1.dev0",
      "peft-commit-hash": "a447a4e5ecd87b7d57733f4df9616a328cf130f4",
      "datasets-version": "3.3.2",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.45.2",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.6.0+cu124",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.11.0-17-generic",
      "version": "#17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2",
      "machine": "x86_64",
      "processor": "x86_64",
      "accelerator": "NVIDIA GeForce RTX 4090"
    },
    "pytorch_info": "PyTorch built with: [...]"
  }
}
```

## Dependencies

Apart from the normal PEFT dependencies, ensure that the packages in the `requirements.txt` are installed, e.g. via:

```sh
python -m pip install -r requirements.txt
```

Python 3.12+ is required.

## Open tasks

- consider using `DataLoader`
- consider adding https://github.com/huggingface/Math-Verify
- consider adding a `weight` argument to the cross-entropy calculation to downweight the EOS token, but this would require calculating the loss manually instead of relying on transformers (see https://github.com/huggingface/transformers/blob/6a876462c308bd7cd7d3ca8e93abaa7d5b02e90e/src/transformers/loss/loss_utils.py#L24-L48)
- do a sanity check against/comparison with the transformers Trainer
- consider using vLLM to potentially speed up generations, at least for the test set
- using `torch.compile` leads to a huge slowdown, investigate (maybe recompiles), although it does save memory
- AMP does not appear to help, investigate
- packing of sequences (but this probably requires adjusting the attention matrix)
- clean up what gets printed and where (stdout, stderr)
    	
MetaMathQA/cancelled_results/.gitkeep (ADDED)

(empty file)
    	
MetaMathQA/data.py (ADDED)

```python
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
All utilities related to data handling.
"""

from functools import partial
from typing import Callable

import datasets
import numpy as np
from datasets import Dataset, load_dataset


# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of
# the dataset
CHAR_LIMIT = 1300
# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
# since it's run multiple times during training; test is only run once at the end and thus can be larger
VALID_SIZE = 50


def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
    """Return the filtered dataset, with long queries removed.

    We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is
    a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each
    model, but we want the same filter for each model.

    """
    char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])]
    idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT]
    print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset")
    return ds.select(idx_filtered)


def get_train_valid_test_datasets(
    *, tokenizer, query_template: str, print_fn: Callable[..., None]
) -> tuple[Dataset, Dataset, Dataset]:
    """
    Return the train, valid, and test splits of the dataset.

    We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives:

    > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value.

    even after calling ds_filtered.class_encode_column("type"). Thus, using sklearn's StratifiedKFold instead.
    """
    metamath = load_dataset("meta-math/MetaMathQA")["train"]
    metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)

    # gsm8k does not need to be filtered, as query and response are short enough
    gsm8k = load_dataset("openai/gsm8k", "main")
    gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
    gsm8k_train = gsm8k["train"]
    gsm8k_test = gsm8k["test"]

    np.random.seed(0)
    indices = np.arange(len(gsm8k_train))
    np.random.shuffle(indices)
    idx_valid = indices[:VALID_SIZE]

    ds_train = metamath
    ds_valid = gsm8k_train.select(idx_valid)
    ds_test = gsm8k_test

    print_fn(f"Train size: {len(ds_train)}")
    print_fn(f"Valid size: {len(ds_valid)}")
    print_fn(f"Test size: {len(ds_test)}")

    tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
    tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
    ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
    ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
    ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])

    return ds_train, ds_valid, ds_test


def tokenize_with_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])]
    tokenized = tokenizer(queries)
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"]
    ]
    return tokenized


def tokenize_wo_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) for sample in samples["query"]]
    tokenized = tokenizer(queries)
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"]
    ]
    return tokenized
```
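For reference, a minimal invocation of these helpers might look like the following sketch (assuming access to the Llama tokenizer; the query template is the default from `default_training_params.json`):

```python
from transformers import AutoTokenizer

from data import get_train_valid_test_datasets

# Load the tokenizer of the base model used throughout the benchmark.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
    tokenizer=tokenizer,
    query_template="Question: {query} Think step by step.\nAnswer:",
    print_fn=print,
)
```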
    	
MetaMathQA/default_training_params.json (ADDED)

```json
{
  "model_id": "meta-llama/Llama-3.2-3B",
  "dtype": "bfloat16",
  "max_seq_length": 768,
  "batch_size": 4,
  "batch_size_eval": 50,
  "max_steps": 5000,
  "eval_steps": 250,
  "compile": false,
  "seed": 0,
  "grad_norm_clip": 1.0,
  "optimizer_type": "AdamW",
  "optimizer_kwargs": {
    "lr": 1e-4,
    "weight_decay": 0.1
  },
  "lr_scheduler": "cosine",
  "use_amp": false,
  "autocast_adapter_dtype": true,
  "attn_implementation": null,
  "generation_kwargs": {
    "max_length": 800,
    "max_new_tokens": 300
  },
  "query_template": "Question: {query} Think step by step.\nAnswer:"
}
```
    	
MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json (ADDED)

```json
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "beta1": 0.85,
  "beta2": 0.85,
  "bias": "none",
  "corda_config": null,
  "deltaT": 1,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "init_r": 64,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 8,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "orth_reg_weight": 0.5,
  "peft_type": "ADALORA",
  "r": 8,
  "rank_pattern": null,
  "revision": null,
  "target_modules": null,
  "target_r": 32,
  "task_type": null,
  "tfinal": 500,
  "tinit": 200,
  "total_step": 5000,
  "use_dora": false,
  "use_rslora": false
}
```
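Per the README's recommendation to generate configs programmatically, a config like the one above could plausibly have been created along these lines (a sketch; only values differing from the JSON's defaults are passed):

```python
from peft import AdaLoraConfig

# init_r/target_r and the rank-schedule values mirror the JSON above;
# everything else keeps the AdaLoraConfig defaults.
config = AdaLoraConfig(
    init_r=64,
    target_r=32,
    tinit=200,
    tfinal=500,
    total_step=5000,
)
config.save_pretrained("experiments/adalora/llama-3.2-3B-rank32")
```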
    	
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json (ADDED)

```json
{
  "adapter_layers": 28,
  "adapter_len": 100,
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "inference_mode": false,
  "peft_type": "ADAPTION_PROMPT",
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM"
}
```
    	
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json (ADDED)

```json
{
  "optimizer_kwargs": {
    "lr": 5e-4
  }
}
```
    	
MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json (ADDED)

```json
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "boft_block_num": 0,
  "boft_block_size": 4,
  "boft_dropout": 0.0,
  "boft_n_butterfly_factor": 1,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "BOFT",
  "revision": null,
  "target_modules": null,
  "task_type": null
}
```
    	
MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json (ADDED)

```json
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "inference_mode": false,
  "init_weights": "bat",
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "BONE",
  "r": 64,
  "revision": null,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
```
    	
MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "peft_type": "BONE",
+  "r": 64,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null
+}
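The two Bone experiments differ only in init_weights: "bat" switches to the block-affine (Bat) variant, while the default keeps plain Bone initialization. A minimal sketch of both configs, assuming the `peft` library:

from peft import BoneConfig

default = BoneConfig(r=64, target_modules=["v_proj", "q_proj"])
bat = BoneConfig(r=64, target_modules=["v_proj", "q_proj"], init_weights="bat")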
    	
MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": false,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "block_size": 64,
+  "block_size_pattern": {},
+  "peft_type": "C3A",
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null
+}
    	
MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 3e-1,
+    "weight_decay": 1e-5
+  }
+}
    	
MetaMathQA/experiments/delora/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "lambda_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "delora_lambda": 15,
+  "module_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "DELORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM"
+}
    	
MetaMathQA/experiments/delora/llama-3.2-3B-rank32/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
+
    	
MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": false,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "n_frequency": 1000,
+  "n_frequency_pattern": {},
+  "peft_type": "FOURIERFT",
+  "random_loc_seed": 777,
+  "revision": null,
+  "scaling": 300,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null
+}
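FourierFT trains only n_frequency spectral coefficients per targeted weight, so this run and the 5000-frequency run below differ in a single knob. A minimal sketch, assuming the `peft` library:

from peft import FourierFTConfig

config = FourierFTConfig(
    n_frequency=1000,    # learnable spectral entries per layer (5000 in the larger run)
    scaling=300,         # fixed scaling applied to the recovered dense update
    random_loc_seed=777, # seed choosing which frequency locations are trained
    target_modules=["v_proj", "q_proj"],
)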
    	
MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": false,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "n_frequency": 5000,
+  "n_frequency_pattern": {},
+  "peft_type": "FOURIERFT",
+  "random_loc_seed": 777,
+  "revision": null,
+  "scaling": 300,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null
+}
    	
MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-5
+  }
+}
+
    	
MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "feedforward_modules": null,
+  "inference_mode": false,
+  "init_ia3_weights": true,
+  "modules_to_save": null,
+  "peft_type": "IA3",
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
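IA³ learns per-channel scaling vectors rather than weight updates, which is why this config has no rank. A minimal sketch, assuming the `peft` library (target_modules and feedforward_modules left as None so PEFT picks its per-architecture defaults):

from peft import IA3Config

config = IA3Config(init_ia3_weights=True)  # learned vectors start at 1.0, i.e. a no-op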
    	
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "feedforward_modules": null,
+  "inference_mode": false,
+  "init_ia3_weights": true,
+  "modules_to_save": null,
+  "peft_type": "IA3",
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
+
    	
MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,11 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "exclude_modules": null,
+  "inference_mode": false,
+  "modules_to_save": null,
+  "peft_type": "LN_TUNING",
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "alpha": 64,
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "module_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "LOHA",
+  "r": 32,
+  "rank_dropout": 0.0,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": null,
+  "use_effective_conv2d": false
+}
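LoHa is the low-rank Hadamard-product variant from the LyCORIS family; alpha plays the same scaling role as lora_alpha does for LoRA. A minimal sketch, assuming the `peft` library:

from peft import LoHaConfig

config = LoHaConfig(r=32, alpha=64, target_modules=["q_proj", "v_proj"])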
    	
MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "alpha": 64,
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "decompose_both": false,
+  "decompose_factor": -1,
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "module_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "LOKR",
+  "r": 32,
+  "rank_dropout": 0.0,
+  "rank_dropout_scale": false,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": null,
+  "use_effective_conv2d": false
+}
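LoKr instead factorizes the update as a Kronecker product. A minimal sketch, assuming the `peft` library:

from peft import LoKrConfig

config = LoKrConfig(
    r=32,
    alpha=64,
    decompose_factor=-1,  # -1 = let PEFT choose the Kronecker factor sizes
    target_modules=["q_proj", "v_proj"],
)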
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank10-target-mlp/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 20,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 10,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": ["gate_proj", "up_proj", "down_proj"],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM",
+  "use_dora": true,
+  "use_rslora": false
+}
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "optimizer_type": "lora-fa",
+  "optimizer_kwargs": {
+    "r": 32,
+    "lora_alpha": 64,
+    "lr": 1e-4,
+    "weight_decay": 0.1
+  }
+}
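LoRA-FA keeps the A matrices frozen at their initialization and trains only B, which is why the optimizer needs r and lora_alpha. The benchmark selects a dedicated optimizer via "optimizer_type": "lora-fa"; the hand-rolled sketch below only illustrates the core idea (`model` is assumed to be a PEFT LoRA model):

import torch

# Freeze every lora_A matrix; only lora_B (and any modules_to_save) keep gradients.
for name, param in model.named_parameters():
    if "lora_A" in name:
        param.requires_grad = False

optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-4,
    weight_decay=0.1,
)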
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
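This is the plain-LoRA baseline that the dora/rslora/lorafa variants deviate from. A minimal sketch of applying it, assuming the `peft` and `transformers` libraries (for Llama models, target_modules=None resolves to PEFT's default attention projections):

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.0, task_type="CAUSAL_LM")
model = get_peft_model(base, config)
model.print_trainable_parameters()  # prints trainable vs. total parameter counts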
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": true
+}
    	
MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
    	
MetaMathQA/experiments/miss/llama-3.2-3B-bat/adapter_config.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": "bat",
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "mini_r": 1,
+  "miss_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "MISS",
+  "r": 64,
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/miss/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "mini_r": 1,
+  "miss_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "MISS",
+  "r": 64,
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/miss/llama-3.2-3B-mini/adapter_config.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "exclude_modules": null,
+  "inference_mode": false,
+  "init_weights": "mini",
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "mini_r": 64,
+  "miss_dropout": 0.0,
+  "modules_to_save": null,
+  "peft_type": "MISS",
+  "r": 64,
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "block_share": false,
+  "coft": false,
+  "eps": 6e-05,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "module_dropout": 0.0,
+  "modules_to_save": null,
+  "oft_block_size": 0,
+  "peft_type": "OFT",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": null
+}
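OFT constrains the adaptation to an orthogonal transform of each targeted weight; with oft_block_size=0 the block size is derived from r. A minimal sketch, assuming the `peft` library:

from peft import OFTConfig

config = OFTConfig(
    r=32,
    oft_block_size=0,  # 0 = derive the block size from r
    coft=False,        # no constrained-OFT projection
    eps=6e-5,
    target_modules=["q_proj", "v_proj"],
)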
    	
MetaMathQA/experiments/osf/llama-3.2-3B-rank128/adapter_config.json
ADDED
@@ -0,0 +1,28 @@
+{
+  "task_type": null,
+  "peft_type": "OSF",
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+  "revision": null,
+  "inference_mode": false,
+  "effective_rank": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "rank_pattern": {
+    "q_proj": 2944,
+    "o_proj": 2944,
+    "k_proj": 896,
+    "v_proj": 896,
+    "gate_proj": 2944,
+    "down_proj": 2944,
+    "up_proj": 2944
+  }
+}
+
    	
MetaMathQA/experiments/osf/llama-3.2-3B-rank128/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 5e-5
+  }
+}
+
    	
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "encoder_hidden_size": 3072,
+  "inference_mode": false,
+  "num_attention_heads": 24,
+  "num_layers": 28,
+  "num_transformer_submodules": 1,
+  "num_virtual_tokens": 200,
+  "peft_type": "PREFIX_TUNING",
+  "prefix_projection": false,
+  "revision": null,
+  "task_type": "CAUSAL_LM",
+  "token_dim": 3072
+}
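Prefix tuning prepends 200 trainable key/value vectors per layer; the shape fields (num_layers, num_attention_heads, token_dim) are normally filled in from the base model's config rather than written by hand. A minimal sketch, assuming the `peft` library:

from peft import PrefixTuningConfig

config = PrefixTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=200,
    prefix_projection=False,  # optimize the prefix directly, no reparameterization MLP
)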
    	
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
+
    	
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "inference_mode": false,
+  "num_attention_heads": 24,
+  "num_layers": 28,
+  "num_transformer_submodules": 1,
+  "num_virtual_tokens": 200,
+  "peft_type": "PROMPT_TUNING",
+  "prompt_tuning_init": "RANDOM",
+  "prompt_tuning_init_text": null,
+  "revision": null,
+  "task_type": "CAUSAL_LM",
+  "token_dim": 3072,
+  "tokenizer_kwargs": null,
+  "tokenizer_name_or_path": null
+}
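Prompt tuning learns 200 virtual tokens only at the input embedding layer, making it one of the cheapest methods in this sweep. A minimal sketch, assuming the `peft` library:

from peft import PromptTuningConfig

config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=200,
    prompt_tuning_init="RANDOM",  # the sample_vocab variant below uses "SAMPLE_VOCAB"
)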
    	
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "inference_mode": false,
+  "num_attention_heads": 24,
+  "num_layers": 28,
+  "num_transformer_submodules": 1,
+  "num_virtual_tokens": 200,
+  "peft_type": "PROMPT_TUNING",
+  "prompt_tuning_init": "RANDOM",
+  "prompt_tuning_init_text": null,
+  "revision": null,
+  "task_type": "CAUSAL_LM",
+  "token_dim": 3072,
+  "tokenizer_kwargs": null,
+  "tokenizer_name_or_path": null
+}
    	
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
+
    	
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "inference_mode": false,
+  "num_attention_heads": 24,
+  "num_layers": 28,
+  "num_transformer_submodules": 1,
+  "num_virtual_tokens": 200,
+  "peft_type": "PROMPT_TUNING",
+  "prompt_tuning_init": "SAMPLE_VOCAB",
+  "prompt_tuning_init_text": null,
+  "revision": null,
+  "task_type": "CAUSAL_LM",
+  "token_dim": 3072,
+  "tokenizer_kwargs": null,
+  "tokenizer_name_or_path": null
+}
    	
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
+
    	
MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "encoder_dropout": 0.0,
+  "encoder_hidden_size": 3072,
+  "encoder_num_layers": 2,
+  "encoder_reparameterization_type": "MLP",
+  "inference_mode": false,
+  "num_attention_heads": 24,
+  "num_layers": 28,
+  "num_transformer_submodules": 1,
+  "num_virtual_tokens": 20,
+  "peft_type": "P_TUNING",
+  "revision": null,
+  "task_type": "CAUSAL_LM",
+  "token_dim": 3072
+}
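P-tuning reparameterizes its 20 virtual tokens through a small MLP encoder during training. A minimal sketch, assuming the `peft` library (peft_type P_TUNING corresponds to PromptEncoderConfig):

from peft import PromptEncoderConfig

config = PromptEncoderConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=20,
    encoder_reparameterization_type="MLP",
    encoder_hidden_size=3072,
    encoder_num_layers=2,
)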
    	
MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "peft_type": "RANDLORA",
+  "projection_prng_key": 0,
+  "r": 32,
+  "randlora_alpha": 640,
+  "randlora_dropout": 0.0,
+  "revision": null,
+  "save_projection": true,
+  "sparse": false,
+  "target_modules": null,
+  "task_type": null,
+  "very_sparse": false
+}
    	
MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "group_size": 64,
+  "inference_mode": false,
+  "init_weights": true,
+  "peft_type": "ROAD",
+  "revision": null,
+  "target_modules": null,
+  "task_type": null,
+  "variant": "road_2"
+}
    	
MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "optimizer_kwargs": {
+    "lr": 1e-3
+  }
+}
    	
MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/adapter_config.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_weights": true,
+  "mask_type": "random",
+  "modules_to_save": null,
+  "peft_type": "SHIRA",
+  "r": 32,
+  "random_seed": 42,
+  "revision": null,
+  "target_modules": null,
+  "task_type": null
+}
    	
MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/training_params.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "optimizer_kwargs": {
+    "lr": 3e-4
+  }
+}
+