sentence_transformers_support (#3)
- Add support for Sentence Transformers (161751154d1c93b1d0f2e3aee4f9b03aeeb9a2a9)
- 1_Pooling/config.json +0 -7
- 1_SpladePooling/config.json +5 -0
- README.md +83 -2
- config_sentence_transformers.json +14 -0
- modules.json +4 -10
- sentence_bert_config.json +3 -3
1_Pooling/config.json
DELETED
```diff
@@ -1,7 +0,0 @@
-{
-    "word_embedding_dimension": 384,
-    "pooling_mode_cls_token": false,
-    "pooling_mode_mean_tokens": false,
-    "pooling_mode_max_tokens": false,
-    "pooling_mode_mean_sqrt_len_tokens": false
-}
```
1_SpladePooling/config.json
ADDED
```diff
@@ -0,0 +1,5 @@
+{
+    "pooling_strategy": "max",
+    "activation_function": "relu",
+    "word_embedding_dimension": null
+}
```
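The new pooling config selects SPLADE-style max pooling with a ReLU activation over the MLM head's vocabulary logits. A minimal sketch of that computation, following the standard SPLADE formulation rather than the library's actual implementation (`splade_pool` is a hypothetical helper, not part of this repo):

```python
import torch

def splade_pool(mlm_logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Sketch of SPLADE pooling: log(1 + relu(logits)), then max over the sequence."""
    # mlm_logits: (batch, seq_len, vocab_size) logits from the MLM head
    # attention_mask: (batch, seq_len) with 1 for real tokens, 0 for padding
    values = torch.log1p(torch.relu(mlm_logits))    # "relu" activation + log saturation
    values = values * attention_mask.unsqueeze(-1)  # ignore padding positions
    return values.max(dim=1).values                 # "max" pooling -> (batch, vocab_size)
```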
README.md
CHANGED
````diff
@@ -2,12 +2,16 @@
 language:
 - en
 license: apache-2.0
-library_name: transformers
 tags:
 - language
 - granite
 - embeddings
-
+- sentence-transformers
+- sparse-encoder
+- sparse
+- splade
+pipeline_tag: feature-extraction
+library_name: sentence-transformers
 ---
 # Granite-Embedding-30m-Sparse
 
@@ -117,6 +121,83 @@ for r in res:
 print(r)
 
 ```
+
+**Usage with Sentence Transformers:**
+
+First install the Sentence Transformers library:
+
+```bash
+pip install -U sentence-transformers
+```
+
+Then you can load this model and run inference.
+```python
+
+from sentence_transformers import SparseEncoder
+
+# Download from the 🤗 Hub
+model = SparseEncoder("ibm-granite/granite-embedding-30m-sparse")
+
+# Run inference
+docs = [
+    "Artificial intelligence was founded as an academic discipline in 1956.",
+    "Alan Turing was the first person to conduct substantial research in AI.",
+    "Born in Maida Vale, London, Turing was raised in southern England.",
+]
+docs_embeddings = model.encode_document(docs, max_active_dims=192)
+print(docs_embeddings.shape)
+# [3, 50265]
+
+queries = ["When was artificial intelligence founded", "Where was Turing born?"]
+queries_embeddings = model.encode_query(queries, max_active_dims=50)
+print(queries_embeddings.shape)
+# [2, 50265]
+
+# Get the similarity scores for the embeddings
+similarities = model.similarity(queries_embeddings, docs_embeddings)
+print(similarities.shape)
+# [2, 3]
+
+for i, query in enumerate(queries):
+    best_doc_index = similarities[i].argmax().item()
+
+    print(f"Query: {query}")
+    print(f"Best matching doc: Similarity: {similarities[i][best_doc_index]:.4f}, Doc: {docs[best_doc_index]}")
+    intersection = model.intersection(queries_embeddings[i], docs_embeddings[best_doc_index])
+    decoded_intersection = model.decode(intersection, top_k=10)
+    print("Top 10 tokens influencing the similarity:")
+    for token, score in decoded_intersection:
+        print(f"Token: {token}, Score: {score:.4f}")
+
+# Query: When was artificial intelligence founded
+# Best matching doc: Similarity: 12.3641, Doc: Artificial intelligence was founded as an academic discipline in 1956.
+# Top 10 tokens influencing the similarity:
+# Token: ĠAI, Score: 2.7591
+# Token: Ġintelligence, Score: 2.2971
+# Token: Ġartificial, Score: 1.7654
+# Token: Ġfounded, Score: 1.3254
+# Token: Ġinvention, Score: 0.9808
+# Token: Ġlearning, Score: 0.4847
+# Token: Ġcomputer, Score: 0.4789
+# Token: Ġrobot, Score: 0.3466
+# Token: Ġestablishment, Score: 0.3371
+# Token: Ġscientific, Score: 0.2804
+# Query: Where was Turing born?
+# Best matching doc: Similarity: 17.1359, Doc: Born in Maida Vale, London, Turing was raised in southern England.
+# Top 10 tokens influencing the similarity:
+# Token: uring, Score: 2.9761
+# Token: ĠTuring, Score: 2.4544
+# Token: Ġborn, Score: 2.4314
+# Token: ing, Score: 1.7760
+# Token: ure, Score: 1.7626
+# Token: Ġcomput, Score: 1.3356
+# Token: Ġraised, Score: 1.3285
+# Token: able, Score: 1.1940
+# Token: Ġphilosopher, Score: 0.4118
+# Token: Ġmachine, Score: 0.3977
+
+```
+
 **Evaluation:**
 
 Granite-Embedding-30m-Sparse is competitive in performance with naver/splade-v3-distilbert despite being half the parameter size. We also compare the sparse model with its similarly sized dense embedding counterpart, `ibm-granite/granite-embedding-30m-english`. The performance of the models on MTEB Retrieval (i.e., BEIR) is reported below.
````
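The usage snippet above caps the number of non-zero dimensions per embedding with `max_active_dims` (192 for documents, 50 for queries). A small hedged sketch, reusing the same model and one of the README documents, that only counts how many dimensions stay active under different caps (variable names are illustrative and not part of the model card; embeddings are assumed to come back as torch sparse tensors):

```python
from sentence_transformers import SparseEncoder

model = SparseEncoder("ibm-granite/granite-embedding-30m-sparse")
doc = ["Artificial intelligence was founded as an academic discipline in 1956."]

for cap in (None, 192, 50):
    emb = model.encode_document(doc, max_active_dims=cap)
    # Densify before counting non-zeros, in case the output is a sparse tensor.
    active = int((emb[0].to_dense() != 0).sum())
    print(f"max_active_dims={cap}: {active} active dimensions out of {emb.shape[1]}")
```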
config_sentence_transformers.json
ADDED
```diff
@@ -0,0 +1,14 @@
+{
+  "model_type": "SparseEncoder",
+  "__version__": {
+    "sentence_transformers": "5.0.0",
+    "transformers": "4.50.3",
+    "pytorch": "2.6.0+cu124"
+  },
+  "prompts": {
+    "query": "",
+    "document": ""
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "dot"
+}
```
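`"similarity_fn_name": "dot"` configures raw dot-product scoring with no length normalization (unlike cosine), the usual choice for SPLADE-style sparse vectors. A toy illustration with made-up vectors:

```python
import torch

q = torch.tensor([0.0, 2.0, 0.0, 1.5])  # toy query vector (mostly zeros, like a sparse embedding)
d = torch.tensor([0.5, 1.0, 0.0, 2.0])  # toy document vector
print(torch.dot(q, d).item())  # 5.0 = 2.0*1.0 + 1.5*2.0; only overlapping non-zero dims contribute
```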
modules.json
CHANGED
```diff
@@ -3,18 +3,12 @@
     "idx": 0,
     "name": "0",
     "path": "",
-    "type": "sentence_transformers.models.
+    "type": "sentence_transformers.sparse_encoder.models.MLMTransformer"
   },
   {
     "idx": 1,
     "name": "1",
-    "path": "
-    "type": "sentence_transformers.models.
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
+    "path": "1_SpladePooling",
+    "type": "sentence_transformers.sparse_encoder.models.SpladePooling"
   }
-]
+]
```
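The updated modules.json chains an `MLMTransformer` (the masked-LM backbone plus its vocabulary head) into the `SpladePooling` module configured above. As a rough, hedged sketch, the same pipeline could be assembled by hand instead of being loaded from modules.json; the constructor arguments below are assumptions based on the config files in this PR, not a documented recipe:

```python
from sentence_transformers import SparseEncoder
from sentence_transformers.sparse_encoder.models import MLMTransformer, SpladePooling

# Assumed equivalent of what SparseEncoder builds from modules.json:
# module 0 -> MLMTransformer over the repo root, module 1 -> SpladePooling from 1_SpladePooling/.
mlm = MLMTransformer("ibm-granite/granite-embedding-30m-sparse")
pooling = SpladePooling(pooling_strategy="max", activation_function="relu")

model = SparseEncoder(modules=[mlm, pooling], similarity_fn_name="dot")
print(model.encode_query(["Where was Turing born?"]).shape)
```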
sentence_bert_config.json
CHANGED
```diff
@@ -1,4 +1,4 @@
 {
-
-
-}
+    "max_seq_length": 512,
+    "do_lower_case": false
+}
```