Sentence Similarity
Transformers
Safetensors
multilingual
nllb-llm2vec
feature-extraction
text-embedding
embeddings
information-retrieval
beir
text-classification
language-model
text-clustering
text-semantic-similarity
text-evaluation
text-reranking
Sentence Similarity
natural_questions
ms_marco
fever
hotpot_qa
mteb
custom_code
Fabian-David Schmidt
committed on
Commit
·
9fcef4c
1
Parent(s):
954e323
fix(README): example usage for encoding
Browse files
README.md
CHANGED
|
@@ -43,10 +43,7 @@ import torch
|
|
| 43 |
import torch.nn.functional as F
|
| 44 |
from transformers import AutoTokenizer, AutoModel, AutoConfig
|
| 45 |
|
| 46 |
-
|
| 47 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 48 |
-
"facebook/nllb-200-distilled-600M"
|
| 49 |
-
)
|
| 50 |
|
| 51 |
model = AutoModel.from_pretrained(
|
| 52 |
"fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
|
|
@@ -55,32 +52,30 @@ model = AutoModel.from_pretrained(
|
|
| 55 |
device_map="cuda" if torch.cuda.is_available() else "cpu",
|
| 56 |
)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"
|
| 61 |
-
)
|
| 62 |
-
queries = [
|
| 63 |
-
[instruction, "how much protein should a female eat"],
|
| 64 |
-
[instruction, "summit define"],
|
| 65 |
]
|
| 66 |
-
q_reps = l2v.encode(queries)
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
"
|
|
|
|
| 72 |
]
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Compute cosine similarity
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
cos_sim =
|
| 79 |
|
| 80 |
print(cos_sim)
|
| 81 |
"""
|
| 82 |
-
tensor([[0.
|
| 83 |
-
[0.
|
| 84 |
"""
|
| 85 |
```
|
| 86 |
|
|
|
|
| 43 |
import torch.nn.functional as F
|
| 44 |
from transformers import AutoTokenizer, AutoModel, AutoConfig
|
| 45 |
|
| 46 |
+
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
model = AutoModel.from_pretrained(
|
| 49 |
"fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
|
|
|
|
| 52 |
device_map="cuda" if torch.cuda.is_available() else "cpu",
|
| 53 |
)
|
| 54 |
|
| 55 |
+
flores_en = [
|
| 56 |
+
"Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.",
|
| 57 |
+
"Enceladus is the most reflective object in the solar system, reflecting about 90 percent of the sunlight that hits it.",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
]
|
|
|
|
| 59 |
|
| 60 |
+
en_embeds = model.encode(flores_en)
|
| 61 |
+
|
| 62 |
+
flores_de = [
|
| 63 |
+
"Führende Forscher sagen, dass dies die Früherkennung von Krebs, Tuberkulose, HIV und Malaria für Patienten in einkommensschwachen Ländern fördern könnte, wo die Überlebensraten bei Krankheiten wie Brustkrebs teilweise nur halb so hoch sind wie in reicheren Ländern.",
|
| 64 |
+
"Enceladus ist das Objekt im Sonnensystem, das am stärksten reflektiert. Er wirft etwa 90 Prozent des auf ihn treffenden Sonnenlichts zurück.",
|
| 65 |
]
|
| 66 |
+
|
| 67 |
+
de_embeds = model.encode(flores_de, src_lang="deu_Latn")
|
| 68 |
+
|
| 69 |
|
| 70 |
# Compute cosine similarity
|
| 71 |
+
en_embeds_norm = F.normalize(en_embeds, p=2, dim=1)
|
| 72 |
+
de_embeds_norm = F.normalize(de_embeds, p=2, dim=1)
|
| 73 |
+
cos_sim = en_embeds_norm @ de_embeds_norm.T
|
| 74 |
|
| 75 |
print(cos_sim)
|
| 76 |
"""
|
| 77 |
+
tensor([[0.9062, 0.0894],
|
| 78 |
+
[0.1289, 0.8633]])
|
| 79 |
"""
|
| 80 |
```
|
| 81 |
|