Tokenizer causes issues in fine-tuning because of special tokens in tokenization (<|X|>)
I'll run through my setup and then get to the problem.
I am setting up the config, tokenizer, model and PEFT model:
from peft import LoraConfig, TaskType
import torch

CHATPATH = "/notebooks/starchat-beta"
BASEPATH = "/notebooks/starcoderplus"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

peftconfig = LoraConfig(
    base_model_name_or_path=BASEPATH,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_proj", "c_attn", "q_attn"],
    bias="none",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.01,
)
from transformers import AutoTokenizer
system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"
tokenizer = AutoTokenizer.from_pretrained(BASEPATH)
tokenizer.pad_token = tokenizer.eos_token
added_tokens = tokenizer.add_special_tokens({"additional_special_tokens": [system_token, user_token, assistant_token, end_token]})
print("tokenizer.vocab_size", tokenizer.vocab_size, added_tokens)
> tokenizer.vocab_size 49152 0
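For reference, here is a quick way to inspect what the tokenizer actually holds (just a diagnostic, not part of the training script); len(tokenizer) counts added tokens, while vocab_size does not:
print("len(tokenizer):", len(tokenizer))
print("additional_special_tokens:", tokenizer.additional_special_tokens)
print("added vocab:", tokenizer.get_added_vocab())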
from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    BASEPATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).to(DEVICE)
freeze_model(model)  # my helper that freezes the base model weights

from peft import get_peft_model

peftmodel = get_peft_model(model, peftconfig)
peftmodel.resize_token_embeddings(len(tokenizer))
Now we have the PEFT model and the tokenizer set up. Note that even though I add the special tokens, the add_special_tokens call reports 0 tokens added.
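As a sanity check at this point, I can compare the tokenizer length against the model's embedding sizes (again just a diagnostic, using the objects defined above):
print("len(tokenizer):", len(tokenizer))
print("input embedding rows:", model.get_input_embeddings().weight.shape[0])
output_embeddings = model.get_output_embeddings()
if output_embeddings is not None:
    print("output embedding rows:", output_embeddings.weight.shape[0])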
I continue with setting up the data:
import pandas as pd
from datasets import Dataset

system_token = "<|system|>"
user_token = "<|user|>"
assistant_token = "<|assistant|>"
end_token = "<|end|>"
system_msg = "X"

def prepare_dialogue(row):
    # print(row)
    # format one row according to the chat template
    prompt = system_token + "\n" + system_msg + end_token + "\n"
    prompt += user_token + "\n" + row["prompt"] + end_token + "\n"
    prompt += assistant_token + "\n" + row["completion"] + end_token + "\n"
    row["dialogue"] = prompt
    return row

def strip_quotes(val):
    return val.strip('"') if isinstance(val, str) else val

def prepare_row(row):
    # strip the stray quotes/semicolons the CSV leaves on the values
    for col in row.index:
        row[col] = row[col].strip("'").strip("';")
    return prepare_dialogue(row)

def prepare_data(data):
    data.rename(columns={"'completion';": 'completion', "'prompt'": 'prompt'}, inplace=True)
    data = data.apply(prepare_row, axis=1)
    return data

def load_data(path):
    data = pd.read_csv(path, delimiter=";", quotechar="'", skipinitialspace=True)
    return Dataset.from_pandas(prepare_data(data))

trainingdata = load_data("./data/training.csv")
testingdata = load_data("./data/testing.csv")

def tokenize(batch):
    batch_dialogues = batch['dialogue']  # fetch the 'dialogue' field
    tokenization = tokenizer(batch_dialogues, padding=True, return_token_type_ids=False)
    labels = tokenization.input_ids.copy()
    # mask_user_labels(tokenizer, labels) # not working.
    tokenization['labels'] = labels
    return tokenization

from datasets import DatasetDict

dataset = DatasetDict({
    'train': trainingdata.map(tokenize, batched=True),
    'test': testingdata.map(tokenize, batched=True)
})
for key in dataset:
    dataset[key] = dataset[key].remove_columns(['dialogue', 'completion', 'prompt'])
Let me go through the important parts. The prepare_dialogue function takes the data from my CSV and formats it according to the dialogue template. The tokenize function takes a batch, tokenizes it, and copies the input_ids over as labels.
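To make the template concrete, this is what prepare_dialogue produces for a made-up row (the prompt/completion values here are placeholders, not my real data):
example = pd.Series({"prompt": "Write hello world in Python", "completion": "print('hello world')"})
print(prepare_dialogue(example)["dialogue"])
# <|system|>
# X<|end|>
# <|user|>
# Write hello world in Python<|end|>
# <|assistant|>
# print('hello world')<|end|>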
Here is the crux of the matter:
print(dataset['train'])
print('torch max: ', torch.max(torch.tensor(dataset['train']["labels"])))

final_layer = list(peftmodel.modules())[-1]
if isinstance(final_layer, torch.nn.Linear):
    print(f"The output dimension is {final_layer.out_features}")
else:
    print("Final layer is not a Linear layer.")
> Dataset({
features: ['input_ids', 'attention_mask', 'labels'],
num_rows: 228
})
> torch max: tensor(49155)
> The output dimension is 49152
Here is the problem: the largest label id found is 49155, but the output dimension of the model is only 49152.
print("system_token_id:", tokenizer.convert_tokens_to_ids(system_token))
print("user_token_id:", tokenizer.convert_tokens_to_ids(user_token))
print("assistant_token_id:", tokenizer.convert_tokens_to_ids(assistant_token))
print("end_token_id:", tokenizer.convert_tokens_to_ids(end_token))
> system_token_id: 49152
> user_token_id: 49154
> assistant_token_id: 49153
> end_token_id: 49155
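Restating the numbers (assuming the four chat tokens are the only added ones): the labels go up to 49155, so the head would need at least 49156 outputs, yet it still has 49152:
print("len(tokenizer):", len(tokenizer))                   # expected 49156 = 49152 + 4 chat tokens
print("lm_head out_features:", final_layer.out_features)   # 49152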
The added tokens are exactly the difference: their ids (49152–49155) lie beyond the 49152 outputs of the model head.
What am I to do here? Training like this throws errors because of the dimension mismatch, and not adding the tokens makes no sense according to the "documentation" or the code.