Fabian-David Schmidt committed on
Commit
9fcef4c
·
1 Parent(s): 954e323

fix(README): example usage for encoding

Browse files
Files changed (1) hide show
  1. README.md +17 -22
README.md CHANGED
@@ -43,10 +43,7 @@ import torch
43
  import torch.nn.functional as F
44
  from transformers import AutoTokenizer, AutoModel, AutoConfig
45
 
46
- # Loading base Mistral model, along with custom code that enables bidirectional connections in decoder-only LLMs.
47
- tokenizer = AutoTokenizer.from_pretrained(
48
- "facebook/nllb-200-distilled-600M"
49
- )
50
 
51
  model = AutoModel.from_pretrained(
52
  "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
@@ -55,32 +52,30 @@ model = AutoModel.from_pretrained(
55
  device_map="cuda" if torch.cuda.is_available() else "cpu",
56
  )
57
 
58
- # Encoding queries using instructions
59
- instruction = (
60
- "Given a web search query, retrieve relevant passages that answer the query:"
61
- )
62
- queries = [
63
- [instruction, "how much protein should a female eat"],
64
- [instruction, "summit define"],
65
  ]
66
- q_reps = l2v.encode(queries)
67
 
68
- # Encoding documents. Instruction are not required for documents
69
- documents = [
70
- "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
71
- "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
 
72
  ]
73
- d_reps = l2v.encode(documents)
 
 
74
 
75
  # Compute cosine similarity
76
- q_reps_norm = F.normalize(q_reps, p=2, dim=1)
77
- d_reps_norm = F.normalize(d_reps, p=2, dim=1)
78
- cos_sim = q_reps_norm @ d_reps_norm.T
79
 
80
  print(cos_sim)
81
  """
82
- tensor([[0.7740, 0.5580],
83
- [0.4845, 0.4993]])
84
  """
85
  ```
86
 
 
43
  import torch.nn.functional as F
44
  from transformers import AutoTokenizer, AutoModel, AutoConfig
45
 
46
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 
 
 
47
 
48
  model = AutoModel.from_pretrained(
49
  "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse",
 
52
  device_map="cuda" if torch.cuda.is_available() else "cpu",
53
  )
54
 
55
+ flores_en = [
56
+ "Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.",
57
+ "Enceladus is the most reflective object in the solar system, reflecting about 90 percent of the sunlight that hits it.",
 
 
 
 
58
  ]
 
59
 
60
+ en_embeds = model.encode(flores_en)
61
+
62
+ flores_de = [
63
+ "Führende Forscher sagen, dass dies die Früherkennung von Krebs, Tuberkulose, HIV und Malaria für Patienten in einkommensschwachen Ländern fördern könnte, wo die Überlebensraten bei Krankheiten wie Brustkrebs teilweise nur halb so hoch sind wie in reicheren Ländern.",
64
+ "Enceladus ist das Objekt im Sonnensystem, das am stärksten reflektiert. Er wirft etwa 90 Prozent des auf ihn treffenden Sonnenlichts zurück.",
65
  ]
66
+
67
+ de_embeds = model.encode(flores_de, src_lang="deu_Latn")
68
+
69
 
70
  # Compute cosine similarity
71
+ en_embeds_norm = F.normalize(en_embeds, p=2, dim=1)
72
+ de_embeds_norm = F.normalize(de_embeds, p=2, dim=1)
73
+ cos_sim = en_embeds_norm @ de_embeds_norm.T
74
 
75
  print(cos_sim)
76
  """
77
+ tensor([[0.9062, 0.0894],
78
+ [0.1289, 0.8633]])
79
  """
80
  ```
81