zhichao-geng committed
Commit adb6d9a · verified · 1 Parent(s): cb0e473

Use the DistilBertTokenizer for this DistilBERT-based model (#4)


- Use the DistilBertTokenizer for this DistilBERT-based model (e4469d87867f92a54ee361a46db996b0d96a1fad)
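
The rationale, illustrated with a small sketch that is not part of this commit: DistilBERT has no token-type embeddings, so its tokenizer does not emit `token_type_ids`, whereas `BertTokenizer` does and the README had to suppress them explicitly. The stock `distilbert-base-uncased` vocab is used here purely for illustration.

```python
from transformers import BertTokenizer, DistilBertTokenizer

# Both classes load the same WordPiece vocab; only the returned fields differ.
bert_tok = BertTokenizer.from_pretrained("distilbert-base-uncased")
distil_tok = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

print("token_type_ids" in bert_tok("hello world"))    # True  -> had to be suppressed for DistilBERT
print("token_type_ids" in distil_tok("hello world"))  # False -> no workaround needed
```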

Files changed (2)
  1. README.md +2 -2
  2. tokenizer_config.json +1 -1
README.md CHANGED

@@ -102,7 +102,7 @@ query = "What's the weather in ny now?"
 document = "Currently New York is rainy."
 
 # encode the query
-feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
+feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
 input_ids = feature_query["input_ids"]
 batch_size = input_ids.shape[0]
 query_vector = torch.zeros(batch_size, tokenizer.vocab_size)
@@ -110,7 +110,7 @@ query_vector[torch.arange(batch_size).unsqueeze(-1), input_ids] = 1
 query_sparse_vector = query_vector*idf
 
 # encode the document
-feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
+feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt')
 output = model(**feature_document)[0]
 document_sparse_vector = get_sparse_vector(feature_document, output)
 
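
For context, a self-contained sketch of how the updated README snippet runs end to end. The checkpoint id (`distilbert-base-uncased`), the uniform `idf` stand-in, and the simplified max-pooling used in place of the README's `get_sparse_vector` helper are assumptions made only so the sketch runs on its own; the real README loads this repo's checkpoint and its learned IDF weights.

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Placeholder checkpoint; the README uses this repo's DistilBERT-based model.
model_id = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

query = "What's the weather in ny now?"
document = "Currently New York is rainy."

# encode the query: with a DistilBERT tokenizer, no return_token_type_ids=False is needed
feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
input_ids = feature_query["input_ids"]
batch_size = input_ids.shape[0]
query_vector = torch.zeros(batch_size, tokenizer.vocab_size)
query_vector[torch.arange(batch_size).unsqueeze(-1), input_ids] = 1
idf = torch.ones(tokenizer.vocab_size)      # stand-in; the README loads learned IDF weights
query_sparse_vector = query_vector * idf

# encode the document; simplified pooling stands in for the README's get_sparse_vector
feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt')
output = model(**feature_document)[0]       # MLM logits: (batch, seq_len, vocab_size)
values, _ = torch.max(output * feature_document["attention_mask"].unsqueeze(-1), dim=1)
document_sparse_vector = torch.log1p(torch.relu(values))

# relevance score = dot product of the two vocabulary-sized sparse vectors
score = (query_sparse_vector * document_sparse_vector).sum()
print(score.item())
```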
 
tokenizer_config.json CHANGED

@@ -8,6 +8,6 @@
   "sep_token": "[SEP]",
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
+  "tokenizer_class": "DistilBertTokenizer",
   "unk_token": "[UNK]"
 }
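
A hedged note on the effect of this config change: `AutoTokenizer` resolves the class from the `tokenizer_class` field of `tokenizer_config.json`, so after this change the repo loads as a DistilBert tokenizer by default. The sketch below uses `distilbert-base-uncased` as a stand-in checkpoint for this repo.

```python
from transformers import AutoTokenizer

# AutoTokenizer reads "tokenizer_class" from tokenizer_config.json to pick the class.
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # stand-in for this repo
print(type(tok).__name__)       # DistilBertTokenizerFast (fast variant of DistilBertTokenizer)
print(tok.model_input_names)    # ['input_ids', 'attention_mask'] -- no token_type_ids
```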