move filter to before saving so it doesn't happen every time, update runpod manual script
- README.md +1 -1
- scripts/setup-runpod.sh +3 -3
- src/axolotl/utils/data.py +12 -12
    	
README.md CHANGED

@@ -155,7 +155,7 @@ use_cpu: false
 - Once you start your runpod, and SSH into it:
 ```shell
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-source <(curl -s https://raw.githubusercontent.com/
+source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh)
 ```
 
 - Once the setup script completes
    	
scripts/setup-runpod.sh CHANGED

@@ -29,14 +29,14 @@ fi
 # install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
 mkdir -p /workspace/wheels
 cd /workspace/wheels
-curl -L -O https://github.com/
-curl -L -O https://github.com/
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
 pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
 
 cd /workspace/
-git clone https://github.com/
+git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
 cd axolotl
 pip install -e .[int4]
 mkdir -p ~/.cache/huggingface/accelerate/
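After the setup script finishes, a quick sanity check that the pre-built wheels and the editable axolotl install actually work can save a debugging round trip. A minimal sketch, not part of this commit; the script name and assertions are illustrative:

```python
# check_setup.py - hypothetical post-setup sanity check, not part of this commit.
# Confirms the GPU is visible and that the packages installed by setup-runpod.sh import.
import torch

assert torch.cuda.is_available(), "CUDA not visible; check the pod image/driver"
print(f"torch {torch.__version__} on {torch.cuda.get_device_name(0)}")

# Covers the pre-built wheels (deepspeed, flash-attn) and the git/editable installs (peft, axolotl).
import deepspeed   # noqa: E402,F401
import flash_attn  # noqa: E402,F401
import peft        # noqa: E402,F401
import axolotl     # noqa: E402,F401

print("deepspeed, flash_attn, peft, and axolotl all import cleanly")
```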
    	
src/axolotl/utils/data.py CHANGED

@@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             )
             dataset = Dataset.from_list([_ for _ in constant_len_dataset])
 
+            # filter out bad data
+            dataset = Dataset.from_list(
+                [
+                    d
+                    for d in dataset
+                    if len(d["input_ids"]) < cfg.sequence_len
+                       and len(d["input_ids"]) > 0
+                       and len(d["input_ids"]) == len(d["attention_mask"])
+                       and len(d["input_ids"]) == len(d["labels"])
+                ]
+            )
+
             if cfg.local_rank == 0:
                 logging.info(
                     f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
         tokenizer, cfg, default_dataset_prepared_path
     )
 
-    # filter out bad data
-    dataset = Dataset.from_list(
-        [
-            d
-            for d in dataset
-            if len(d["input_ids"]) < cfg.sequence_len
-               and len(d["input_ids"]) > 0
-               and len(d["input_ids"]) == len(d["attention_mask"])
-               and len(d["input_ids"]) == len(d["labels"])
-        ]
-    )
-
     if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
         logging.info(
             f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
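The data.py change is the substance of the commit message: the length/consistency filter now runs once, right before the packed dataset is saved to prepared_ds_path, instead of after every load, so runs that hit the cached copy skip the filtering entirely. A standalone sketch of the same save-then-reuse pattern; the path, sequence length, and rows are made up, and it uses `Dataset.filter` rather than the list comprehension in the diff:

```python
# Hypothetical illustration of filtering once before caching to disk; the path,
# SEQUENCE_LEN, and example rows are invented and not taken from axolotl.
from pathlib import Path

from datasets import Dataset, load_from_disk

PREPARED_PATH = Path("last_run_prepared/example")  # stand-in for prepared_ds_path
SEQUENCE_LEN = 2048                                # stand-in for cfg.sequence_len


def is_good(d) -> bool:
    # Same checks as the commit: non-empty, under the max length, aligned columns.
    return (
        0 < len(d["input_ids"]) < SEQUENCE_LEN
        and len(d["input_ids"]) == len(d["attention_mask"])
        and len(d["input_ids"]) == len(d["labels"])
    )


if PREPARED_PATH.exists():
    # The cached copy was already filtered before saving, so nothing to redo here.
    dataset = load_from_disk(str(PREPARED_PATH))
else:
    rows = [
        {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [1, 2, 3]},
        {"input_ids": [], "attention_mask": [], "labels": []},            # empty -> dropped
        {"input_ids": [1, 2], "attention_mask": [1], "labels": [1, 2]},   # mismatched -> dropped
    ]
    dataset = Dataset.from_list(rows).filter(is_good)  # filter once, before saving
    dataset.save_to_disk(str(PREPARED_PATH))

print(len(dataset))  # 1 row survives the filter
```

Because the filtered result is what gets written to disk, only the first (cache-building) run pays the filtering cost, which is the "doesn't happen every time" in the commit message.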
