Fabian-David Schmidt committed on
Commit
9fcef4c
·
1 Parent(s): 954e323

fix(README): example usage for encoding

Browse files
Files changed (1) hide show
  1. README.md +17 -22
README.md CHANGED
@@ -43,10 +43,7 @@ import torch
43
  import torch.nn.functional as F
44
  from transformers import AutoTokenizer, AutoModel, AutoConfig
45
 
46
- # Loading base Mistral model, along with custom code that enables bidirectional connections in decoder-only LLMs.
47
- tokenizer = AutoTokenizer.from_pretrained(
48
- "facebook/nllb-200-distilled-600M"
49
- )
50
 
51
  model = AutoModel.from_pretrained(
52
  "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
@@ -55,32 +52,30 @@ model = AutoModel.from_pretrained(
55
  device_map="cuda" if torch.cuda.is_available() else "cpu",
56
  )
57
 
58
- # Encoding queries using instructions
59
- instruction = (
60
- "Given a web search query, retrieve relevant passages that answer the query:"
61
- )
62
- queries = [
63
- [instruction, "how much protein should a female eat"],
64
- [instruction, "summit define"],
65
  ]
66
- q_reps = l2v.encode(queries)
67
 
68
- # Encoding documents. Instruction are not required for documents
69
- documents = [
70
- "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
71
- "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
 
72
  ]
73
- d_reps = l2v.encode(documents)
 
 
74
 
75
  # Compute cosine similarity
76
- q_reps_norm = F.normalize(q_reps, p=2, dim=1)
77
- d_reps_norm = F.normalize(d_reps, p=2, dim=1)
78
- cos_sim = q_reps_norm @ d_reps_norm.T
79
 
80
  print(cos_sim)
81
  """
82
- tensor([[0.7740, 0.5580],
83
- [0.4845, 0.4993]])
84
  """
85
  ```
86
 
 
43
  import torch.nn.functional as F
44
  from transformers import AutoTokenizer, AutoModel, AutoConfig
45
 
46
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 
 
 
47
 
48
  model = AutoModel.from_pretrained(
49
  "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
 
52
  device_map="cuda" if torch.cuda.is_available() else "cpu",
53
  )
54
 
55
+ flores_en = [
56
+ "Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.",
57
+ "Enceladus is the most reflective object in the solar system, reflecting about 90 percent of the sunlight that hits it.",
 
 
 
 
58
  ]
 
59
 
60
+ en_embeds = model.encode(flores_en)
61
+
62
+ flores_de = [
63
+ "Führende Forscher sagen, dass dies die Früherkennung von Krebs, Tuberkulose, HIV und Malaria für Patienten in einkommensschwachen Ländern fördern könnte, wo die Überlebensraten bei Krankheiten wie Brustkrebs teilweise nur halb so hoch sind wie in reicheren Ländern.",
64
+ "Enceladus ist das Objekt im Sonnensystem, das am stärksten reflektiert. Er wirft etwa 90 Prozent des auf ihn treffenden Sonnenlichts zurück.",
65
  ]
66
+
67
+ de_embeds = model.encode(flores_de, src_lang="deu_Latn")
68
+
69
 
70
  # Compute cosine similarity
71
+ en_embeds_norm = F.normalize(en_embeds, p=2, dim=1)
72
+ de_embeds_norm = F.normalize(de_embeds, p=2, dim=1)
73
+ cos_sim = en_embeds_norm @ de_embeds_norm.T
74
 
75
  print(cos_sim)
76
  """
77
+ tensor([[0.9062, 0.0894],
78
+ [0.1289, 0.8633]])
79
  """
80
  ```
81