problem after training the model
We have managed to launch the training script by providing our own dataset, following this guide.
We can launch the model in chatbot format before training, but we are unable to launch it once it has been trained because RAM consumption skyrockets. Is there a configuration parameter we can modify to solve this problem?
We are currently following these steps, in colab free.
https://colab.research.google.com/drive/1n5U13L0Bzhs32QO_bls5jwuZR62GPSwE?usp=sharing#scrollTo=zlw7IxfUED0a
It's not clear what you're doing or on what hardware, but it sounds like you don't have enough memory to load the model. You need to load it on a GPU.
Hi @ivgome, your code runs well in Colab free with a few modifications, mostly due to memory limitations (see the adjusted copy of your file below, which runs):
# -*- coding: utf-8 -*-
"""Fine-tuning Dolly 2.0 with LoRA and Alpaca.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1-nyF2tdV7jOvxqR3OCw7Bv2IyZQpp_5D
Fine-tuning Dolly 2.0 with LoRA
- Dolly-v2-3b - https://huggingface.co/databricks/dolly-v2-3b
- LoRA paper - https://arxiv.org/abs/2106.09685
- Alpaca Cleaned Dataset - https://github.com/gururise/AlpacaDataCleaned
"""
!git clone https://github.com/gururise/AlpacaDataCleaned.git
ls AlpacaDataCleaned/
!pip install accelerate>=0.21.0 transformers[torch]==4.30.2
!pip install -q datasets loralib sentencepiece
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes
# Create Instruct Pipeline
import logging
import re
import numpy as np
from transformers import Pipeline, PreTrainedTokenizer
# Module-level logger, named after the importing module.
logger = logging.getLogger(__name__)

# Marker strings that delimit the sections of a prompt and the end of a response.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# This is the prompt that is used for generating responses using an already trained model. It ends with the response
# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
# The "{instruction}" placeholder is re-inserted literally so callers can fill it in later with .format().
PROMPT_FOR_GENERATION_FORMAT = """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)
def get_special_token_id(tokenizer: "PreTrainedTokenizer", key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.

    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.

    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token

    Raises:
        ValueError: if the key does not encode to exactly one token ID (zero or several)

    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # An empty encoding is rejected with the same exception type as a multi-token one, so callers
    # that catch ValueError handle both cases instead of hitting an IndexError below.
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]
class InstructionTextGenerationPipeline(Pipeline):
    """Pipeline that wraps an instruction in the generation prompt and extracts the model's response.

    The instruction is formatted with ``PROMPT_FOR_GENERATION_FORMAT``, passed to ``model.generate``, and the
    text between the response key and the end key is returned.
    """

    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        """Forward the sampling defaults, plus any other arguments, to the base ``Pipeline`` constructor."""
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_instruction_text=False, **generate_kwargs):
        """Split call-time keyword arguments into preprocess / forward / postprocess parameter dicts.

        Args:
            return_instruction_text: when true, ``postprocess`` returns a dict that also carries the instruction.
            **generate_kwargs: passed through to ``model.generate``.

        Returns:
            tuple: ``(preprocess_params, forward_params, postprocess_params)``
        """
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None

        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                # Deliberate best effort: if a key is not a single token, postprocess falls back to
                # locating the response with a regex over the decoded text.
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id,
            "return_instruction_text": return_instruction_text,
        }

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction into the generation prompt and tokenize it.

        The prompt text and the raw instruction are stored alongside the tokenizer output.
        """
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        """Run ``model.generate`` on the tokenized prompt and return the first generated sequence on CPU."""
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        device = self.model.device
        generated_sequence = self.model.generate(
            input_ids=input_ids.to(device),
            # The mask must live on the same device as the input ids.
            attention_mask=attention_mask.to(device) if attention_mask is not None else None,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )[0].cpu()
        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_instruction_text):
        """Extract the response text from the generated token sequence.

        Args:
            model_outputs: dict produced by ``_forward``.
            response_key_token_id: token ID of the response key, or None if it is not a single special token.
            end_key_token_id: token ID of the end key, or None if it is not a single special token.
            return_instruction_text: when true, return a dict with the instruction and the generated text.

        Returns:
            The decoded response (None if it could not be located), or a dict with ``instruction_text`` and
            ``generated_text`` when ``return_instruction_text`` is true.
        """
        sequence = model_outputs["generated_sequence"]
        instruction_text = model_outputs["instruction_text"]

        # The response will be set to this variable if we can identify it.
        decoded = None

        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        # Compare against None explicitly: 0 is a valid token ID.
        if response_key_token_id is not None and end_key_token_id is not None:
            # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
            # prompt, we should definitely find it.  We will return the tokens found after this token.
            response_pos = None
            response_positions = np.where(sequence == response_key_token_id)[0]
            if len(response_positions) == 0:
                logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
            else:
                response_pos = response_positions[0]

            # Position 0 is a valid match, so test for None rather than truthiness.
            if response_pos is not None:
                # Next find where "### End" is located.  The model has been trained to end its responses with this
                # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                # this token, as the response could be truncated.  If we don't find it then just return everything
                # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                end_pos = None
                end_positions = np.where(sequence == end_key_token_id)[0]
                if len(end_positions) > 0:
                    end_pos = end_positions[0]

                decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
        else:
            # Otherwise we'll decode everything and use a regex to find the response and end.
            fully_decoded = self.tokenizer.decode(sequence)

            # The response appears after "### Response:".  The model has been trained to append "### End" at the
            # end.
            m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

            if m:
                decoded = m.group(1).strip()
            else:
                # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                # return everything after "### Response:".
                m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                if m:
                    decoded = m.group(1).strip()
                else:
                    logger.warning(f"Failed to find response in:\n{fully_decoded}")

        if return_instruction_text:
            return {"instruction_text": instruction_text, "generated_text": decoded}

        return decoded
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Load the base Dolly tokenizer; padding is applied on the left.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

# 4-bit loading is requested through the quantization config itself.  When an explicit
# `quantization_config` is given, a separate `load_in_4bit=True` keyword to `from_pretrained` is
# either overridden by the config or rejected, depending on the transformers version.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)

# Pipeline around the (not yet fine-tuned) base model.
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

from datasets import load_dataset

# Load the Alpaca JSON file cloned earlier as a dataset.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data.json")
def generate_prompt(data_point):
    """Build the full training prompt (instruction, optional input, and expected output) for one record.

    Args:
        data_point: mapping with "instruction" and "output" entries and an optional "input" entry.

    Returns:
        str: the prompt including the "Input:" section when the record has a non-empty input, otherwise
        the shorter instruction-only prompt.

    NOTE(review): the section headers here are plain "Instruction:" / "Response:", whereas INSTRUCTION_KEY
    and RESPONSE_KEY elsewhere in this file are "### Instruction:" / "### Response:" -- confirm which
    format training is meant to use.
    """
    # taken from https://github.com/tloen/alpaca-lora
    # Branch on the *input* field: it decides whether the prompt carries an "Input:" section.
    if data_point.get("input"):
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
Instruction:
{data_point["instruction"]}
Input:
{data_point["input"]}
Response:
{data_point["output"]}"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
Instruction:
{data_point["instruction"]}
Response:
{data_point["output"]}"""
def _with_tokenized_prompt(data_point):
    """Return a ``prompt`` entry holding the tokenizer output for this record's generated prompt."""
    prompt_text = generate_prompt(data_point)
    return {"prompt": tokenizer(prompt_text)}


# Add the tokenized prompt to every record, then display the resulting dataset (notebook cell output).
data = data.map(_with_tokenized_prompt)
data
"""## Finetuning Dolly"""
import os
# Restrict CUDA to device 0.
# NOTE(review): torch is already imported and the model already loaded earlier in this file, so this
# assignment may come too late to have any effect -- confirm, or move it to the top of the script.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
# Settings for A100 - For 3090
# These are the effective values.  An earlier set that was identical except for BATCH_SIZE = 128 and
# CUTOFF_LEN = 256 was immediately overwritten by this one and has been folded into this comment.
MICRO_BATCH_SIZE = 4  # change to 4 for 3090
BATCH_SIZE = 32
# Number of micro-batches accumulated per optimizer step, so the effective batch is BATCH_SIZE.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 2  # paper uses 3
LEARNING_RATE = 2e-5
CUTOFF_LEN = 32  # maximum tokenized prompt length; longer prompts are truncated
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# Left disabled, as in the original:
#model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)

# Describe the LoRA adapter and wrap the loaded model with it.
config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)
model = get_peft_model(model, config)

tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token


def _tokenize_record(data_point):
    """Tokenize one record's prompt, truncated and padded to exactly CUTOFF_LEN tokens."""
    prompt_text = generate_prompt(data_point)
    return tokenizer(
        prompt_text,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )


# Reload the cleaned dataset, shuffle it, and tokenize every record; the bare name displays the
# result when run as a notebook cell.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data_cleaned.json")
shuffled = data.shuffle()
data = shuffled.map(_tokenize_record)
data
from transformers.training_args import ParallelMode

# Training configuration.  The distributed / sharding options that were commented out in the
# original (sharded_ddp, fsdp, model_parallel, parallel_mode, is_model_parallel) stay disabled.
training_args = transformers.TrainingArguments(
    output_dir="lora-dolly",
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    fp16=True,
    logging_steps=1,
    save_total_limit=3,
)

# Causal-LM collation (mlm=False).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=collator,
)

# Turn the generation cache off for the training run, then train from scratch (no checkpoint resume).
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
# Persist the trained adapter.
model.save_pretrained("alpaca-lora-dolly-2.0")

# Switch from training to inference: leave training mode so LoRA dropout is not applied during
# generation, and turn the generation cache back on (it was disabled for the training run).
model.eval()
model.config.use_cache = True

generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

# Sample prompts; each result is printed so it is visible outside a notebook cell as well.
# The French-translation prompt appears twice, as in the original (sampling is enabled, so outputs may differ).
for sample_instruction in (
    "Look up the boiling point of water.",
    "Find the capital of Spain.",
    "Translate the following phrase into French: I love my dog",
    "Given a set of numbers, find the maximum value: Set: {10, 3, 25, 62, 16}",
    "Translate the following phrase into French: I love my dog",
):
    print(generate_text(sample_instruction))