Use the DistilBertTokenizer for this DistilBERT-based model (#4)
- Use the DistilBertTokenizer for this DistilBERT-based model (e4469d87867f92a54ee361a46db996b0d96a1fad)
- README.md +2 -2
- tokenizer_config.json +1 -1
README.md CHANGED
@@ -102,7 +102,7 @@ query = "What's the weather in ny now?"
 document = "Currently New York is rainy."

 # encode the query
-feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt'
+feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
 input_ids = feature_query["input_ids"]
 batch_size = input_ids.shape[0]
 query_vector = torch.zeros(batch_size, tokenizer.vocab_size)
@@ -110,7 +110,7 @@ query_vector[torch.arange(batch_size).unsqueeze(-1), input_ids] = 1
 query_sparse_vector = query_vector*idf

 # encode the document
-feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt'
+feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt')
 output = model(**feature_document)[0]
 document_sparse_vector = get_sparse_vector(feature_document, output)
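The two `+` lines close tokenizer calls that were left unbalanced, so the README snippet now parses. For context, here is a minimal end-to-end sketch of the flow those lines sit in, reusing the `tokenizer`, `model`, `idf`, and `get_sparse_vector` names from the README; the placeholder repo id, the dummy `idf`, and the `get_sparse_vector` body are illustrative assumptions, not this repository's exact code:

```python
# Sketch only: the repo id, idf weights, and get_sparse_vector body are assumptions.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_id = "path/to/this/model"  # placeholder; use the actual repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
idf = torch.ones(tokenizer.vocab_size)  # stand-in for the repo's learned IDF weights

def get_sparse_vector(feature, output):
    # Hypothetical pooling: max of the MLM logits over sequence positions,
    # masked by the attention mask, then log(1 + relu(.)) to keep values sparse.
    values, _ = torch.max(output * feature["attention_mask"].unsqueeze(-1), dim=1)
    return torch.log(1 + torch.relu(values))

query = "What's the weather in ny now?"
document = "Currently New York is rainy."

# encode the query: a binary bag-of-tokens vector reweighted by IDF
feature_query = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
input_ids = feature_query["input_ids"]
batch_size = input_ids.shape[0]
query_vector = torch.zeros(batch_size, tokenizer.vocab_size)
query_vector[torch.arange(batch_size).unsqueeze(-1), input_ids] = 1
query_sparse_vector = query_vector * idf

# encode the document with the masked-LM head
feature_document = tokenizer([document], padding=True, truncation=True, return_tensors='pt')
output = model(**feature_document)[0]
document_sparse_vector = get_sparse_vector(feature_document, output)

# relevance is the dot product of the two sparse vectors
score = (query_sparse_vector * document_sparse_vector).sum()
```

Note that the query side never runs the model: it is a binary bag-of-tokens vector reweighted by IDF, while only the document side pools the masked-LM logits, which keeps query-time encoding cheap.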
tokenizer_config.json CHANGED
@@ -8,6 +8,6 @@
   "sep_token": "[SEP]",
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "
+  "tokenizer_class": "DistilBertTokenizer",
   "unk_token": "[UNK]"
 }
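`AutoTokenizer` resolves the tokenizer class from this `tokenizer_class` field, so after the change the repo loads as a `DistilBertTokenizer`, whose `model_input_names` omit the `token_type_ids` that DistilBERT models do not accept. A quick sanity-check sketch (the repo id is a placeholder):

```python
# Sanity check; "path/to/this/model" is a placeholder for the actual repo id.
from transformers import AutoTokenizer, DistilBertTokenizer

# use_fast=False loads the plain class named in tokenizer_config.json;
# the default fast path resolves to DistilBertTokenizerFast instead.
tokenizer = AutoTokenizer.from_pretrained("path/to/this/model", use_fast=False)
assert isinstance(tokenizer, DistilBertTokenizer)

# DistilBertTokenizer emits no token_type_ids, so its output can be passed
# straight to a DistilBERT model via **features.
features = tokenizer(["Currently New York is rainy."], return_tensors="pt")
print(sorted(features.keys()))  # ['attention_mask', 'input_ids']
```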