Ayesha931 committed
Commit c6b1ceb · verified · 1 Parent(s): 0ceb1ec

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +15 -16
src/streamlit_app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import pickle
 import numpy as np
 from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
 from PyPDF2 import PdfReader
 import json
@@ -52,7 +53,7 @@ JSON_TEMPLATE = {
 
 # --- Load Model ---
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
@@ -60,26 +61,24 @@ def mean_pooling(model_output, attention_mask):
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 def embed_text(texts):
-    # encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
-    # with torch.no_grad():
-    #     model_output = model(**encoded_input)
-    # return mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
 
-    # Use the correct device
-    device = torch.device("cpu")  # or "cuda" if you have a GPU
+    # # Use the correct device
+    # device = torch.device("cpu")  # or "cuda" if you have a GPU
 
-    # Move model to the correct device
-    model.to(device)
+    # # Move model to the correct device
+    # model.to(device)
 
-    # Prepare the inputs
-    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
+    # # Prepare the inputs
+    # encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
 
-    # Generate embeddings
-    with torch.no_grad():
-        model_output = model(**encoded_input)
+    # # Generate embeddings
+    # with torch.no_grad():
+    #     model_output = model(**encoded_input)
 
-    # Return embeddings as numpy array
-    return mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
+    # # Return embeddings as numpy array
+    # return mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
+    embeddings = model.encode(texts)
+    return embeddings
 
 
 # --- Load & Chunk Text ---
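
The change above swaps the manual AutoModel + mean-pooling path for SentenceTransformer.encode. As a quick, hypothetical sanity check (not part of the commit; the texts and variable names below are made up for illustration), the two paths can be compared directly: the Sentence-Transformers pipeline for all-MiniLM-L6-v2 applies the same mean pooling internally (and, per its model card, L2-normalizes the result), so the cosine similarity between corresponding rows should come out close to 1.0.

# Hypothetical comparison script (illustration only, not part of the app)
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

texts = ["example sentence one", "example sentence two"]  # made-up inputs

# Old path: raw transformer + manual mean pooling (as embed_text used to do)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
hf_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    token_embeddings = hf_model(**encoded)[0]  # last hidden state, (batch, seq, dim)
mask = encoded["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
manual = (torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)).numpy()

# New path: SentenceTransformer handles tokenization and pooling internally
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
st_emb = st_model.encode(texts)  # numpy array, (batch, dim)

# Cosine similarity is insensitive to L2 normalization, so this should print ~1.0 per row
cos = np.sum(manual * st_emb, axis=1) / (
    np.linalg.norm(manual, axis=1) * np.linalg.norm(st_emb, axis=1)
)
print(cos)

A side effect visible in the diff: embed_text no longer calls tokenizer or mean_pooling (though both are still defined), and model.encode returns a NumPy array directly, so downstream code that expected NumPy output should keep working.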