Tom Aarsen committed
Commit: a5e1612
Parent(s): 4111085

Add "add_eos_token" to tokenizer config; simplify usage

Files changed:
- README.md +1 -4
- tokenizer_config.json +1 -0
README.md CHANGED

@@ -6892,10 +6892,7 @@ model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')
 
 max_length = 4096
 # Tokenize the input texts
-batch_dict = tokenizer(input_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
-# append eos_token_id to every input_ids
-batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
-batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
+batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
 
 outputs = model(**batch_dict)
 embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
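With "add_eos_token": true in tokenizer_config.json (diff below), the tokenizer appends eos_token_id itself during encoding, so the README no longer needs to tokenize without padding, append EOS by hand, and pad in a second pass. A minimal sketch of the simplified call, assuming the updated config is in effect; the input strings are illustrative and not part of this commit:

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')

# Illustrative inputs; the README builds queries from an instruction template.
input_texts = [
    'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: how much protein should a female eat',
    'A short example passage that stands in for a retrieved document.',
]

# One call now truncates, appends EOS, pads, and returns tensors.
batch_dict = tokenizer(input_texts, max_length=4096, padding=True,
                       truncation=True, return_tensors='pt')

# Sanity check: the last attended token of each row is EOS
# (assumes the default right-side padding; last_token_pool handles both sides).
last_pos = batch_dict['attention_mask'].sum(dim=1) - 1
rows = torch.arange(batch_dict['input_ids'].size(0))
assert (batch_dict['input_ids'][rows, last_pos] == tokenizer.eos_token_id).all()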
tokenizer_config.json CHANGED

@@ -1,4 +1,5 @@
 {
+  "add_eos_token": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
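add_eos_token is an init option of the LLaMA-family tokenizer classes (it defaults to false), and values stored in tokenizer_config.json become the defaults for every AutoTokenizer.from_pretrained load. A hedged sketch of how to observe the new behavior, assuming the flag can also be overridden per load as a keyword argument:

from transformers import AutoTokenizer

repo = 'intfloat/e5-mistral-7b-instruct'
with_eos = AutoTokenizer.from_pretrained(repo)  # picks up add_eos_token=true from the config
without_eos = AutoTokenizer.from_pretrained(repo, add_eos_token=False)  # per-load override (assumed kwarg)

ids_with = with_eos('hello world')['input_ids']
ids_without = without_eos('hello world')['input_ids']

# The configured tokenizer emits exactly one extra trailing token: EOS.
assert ids_with[:-1] == ids_without
assert ids_with[-1] == with_eos.eos_token_id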