add float16 docs and tweak typehints
- README.md +8 -0
- src/axolotl/utils/models.py +5 -3
README.md
CHANGED
@@ -264,6 +264,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
 bf16: true # require >=ampere
 fp16: true
 tf32: true # require >=ampere
+bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP
+float16: true # use instead of fp16 when you don't want AMP
 ```
 Note: Repo does not do 4-bit quantization.

@@ -522,6 +524,12 @@ Add below flag to train command above
 --merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
 ```

+If you run out of CUDA memory, you can try to merge in system RAM with
+
+```bash
+CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ...
+```
+
 ## Common Errors 🧰

 > Cuda out of memory

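For context on the documented flags: `bf16`/`fp16` enable mixed-precision (AMP) training while typically keeping full-precision weights, whereas the new `bfloat16`/`float16` options are for loading and training the weights directly in that dtype without AMP. Below is a minimal sketch of how such flags could map onto Hugging Face `transformers` arguments; `cfg`, `resolve_torch_dtype`, and `trainer_precision_kwargs` are illustrative names for this sketch, not axolotl's actual wiring.

```python
# Illustrative sketch only (not this repo's real code), assuming `cfg` exposes
# the YAML keys documented above as attributes.
import torch
from transformers import AutoModelForCausalLM, TrainingArguments


def resolve_torch_dtype(cfg):
    # bfloat16 / float16: load the weights themselves in half precision, no AMP.
    if cfg.bfloat16:
        return torch.bfloat16
    if cfg.float16:
        return torch.float16
    return torch.float32


def trainer_precision_kwargs(cfg):
    # bf16 / fp16: keep full-precision weights and let the Trainer autocast (AMP).
    return {"bf16": bool(cfg.bf16), "fp16": bool(cfg.fp16)}


# Hypothetical usage:
# model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=resolve_torch_dtype(cfg))
# training_args = TrainingArguments(output_dir="out", **trainer_precision_kwargs(cfg))
```
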
src/axolotl/utils/models.py
CHANGED
@@ -11,13 +11,14 @@ import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
-from transformers import
-from transformers import (
+from transformers import (  # noqa: F401
     AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
     LlamaConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
 )

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN

@@ -71,7 +72,7 @@ def load_tokenizer(
 def load_model(
     base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
 ):
-    # type: (str, str, str,
+    # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
     """
     Load a model from a base model and a model type.
     """

@@ -284,6 +285,7 @@ def load_model(
         model = AutoModelForCausalLM.from_pretrained(
             base_model,
             load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
+            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
             torch_dtype=torch_dtype,
             device_map=cfg.device_map,
             trust_remote_code=cfg.trust_remote_code or False,
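For readers less used to comment-style hints, the `# type:` comment added to `load_model` corresponds roughly to the annotation-style signature below. This is a sketch only: the `DictDefault` import path and the peft `PeftConfig` import are assumptions based on the surrounding codebase, not part of this diff.

```python
from typing import Optional, Tuple

from peft import PeftConfig
from transformers import PreTrainedModel, PreTrainedTokenizerBase

from axolotl.utils.dict import DictDefault  # assumed import path


def load_model(
    base_model: str,
    base_model_config: str,
    model_type: str,
    tokenizer: PreTrainedTokenizerBase,
    cfg: DictDefault,
    adapter: Optional[str] = "lora",
) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
    """Load a model from a base model and a model type."""
    ...
```

The `# noqa: F401` on the import block is presumably needed because `PreTrainedModel` and `PreTrainedTokenizerBase` are referenced only inside the type comment, which linters would otherwise flag as unused imports.

The new `load_in_4bit` kwarg mirrors the existing 8-bit gate: quantized loading is only requested when an adapter is configured. In the `transformers` API, passing `load_in_4bit=True` to `from_pretrained` is effectively shorthand for supplying a `BitsAndBytesConfig` with 4-bit loading enabled, as in this sketch (the `bnb_4bit_*` values are example choices, not this repo's settings):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Explicit form of 4-bit loading via bitsandbytes (sketch only).
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)
# model = AutoModelForCausalLM.from_pretrained(
#     base_model, quantization_config=quant_config, device_map="auto"
# )
```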