Tom Aarsen committed on
Commit
a5e1612
1 Parent(s): 4111085

Add "add_eos_token" to tokenizer config; simplify usage

Browse files
Files changed (2) hide show
  1. README.md +1 -4
  2. tokenizer_config.json +1 -0
README.md CHANGED
@@ -6892,10 +6892,7 @@ model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')
6892
 
6893
  max_length = 4096
6894
  # Tokenize the input texts
6895
- batch_dict = tokenizer(input_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
6896
- # append eos_token_id to every input_ids
6897
- batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
6898
- batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
6899
 
6900
  outputs = model(**batch_dict)
6901
  embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
6892
 
6893
  max_length = 4096
6894
  # Tokenize the input texts
6895
+ batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
 
 
 
6896
 
6897
  outputs = model(**batch_dict)
6898
  embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
 
1
  {
2
+ "add_eos_token": true,
3
  "added_tokens_decoder": {
4
  "0": {
5
  "content": "<unk>",