Update app.py
Browse files
app.py
CHANGED
|
@@ -39,8 +39,6 @@ from typing import List, Dict, Tuple
|
|
| 39 |
import datetime
|
| 40 |
from abc import ABC, abstractmethod
|
| 41 |
from typing import List, Dict, Any
|
| 42 |
-
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
|
| 43 |
-
|
| 44 |
|
| 45 |
# Automatically get the current year
|
| 46 |
CURRENT_YEAR = datetime.datetime.now().year
|
|
@@ -107,9 +105,8 @@ groq_client = Groq(api_key=GROQ_API_KEY)
|
|
| 107 |
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
| 108 |
mistral_client = Mistral(api_key=MISTRAL_API_KEY)
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
)
|
| 113 |
|
| 114 |
# Step 1: Create a base class for AI models
|
| 115 |
class AIModel(ABC):
|
|
@@ -648,9 +645,9 @@ def rerank_documents(query: str, documents: List[Dict],
|
|
| 648 |
bm25_scores = bm25.get_scores(query)
|
| 649 |
|
| 650 |
# Step 4: Get semantic similarity scores
|
| 651 |
-
query_embedding = similarity_model.
|
| 652 |
doc_summaries = [doc['summary'] for doc in valid_docs]
|
| 653 |
-
doc_embeddings = similarity_model.
|
| 654 |
semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
| 655 |
|
| 656 |
# Step 5: Combine scores (normalize first)
|
|
@@ -682,11 +679,11 @@ def rerank_documents(query: str, documents: List[Dict],
|
|
| 682 |
continue
|
| 683 |
|
| 684 |
# Check similarity with already selected documents
|
| 685 |
-
doc_embedding = similarity_model.
|
| 686 |
is_similar = False
|
| 687 |
|
| 688 |
for content in added_contents:
|
| 689 |
-
content_embedding = similarity_model.
|
| 690 |
similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
|
| 691 |
if similarity > similarity_threshold:
|
| 692 |
is_similar = True
|
|
@@ -708,8 +705,8 @@ def rerank_documents(query: str, documents: List[Dict],
|
|
| 708 |
|
| 709 |
def compute_similarity(text1, text2):
|
| 710 |
# Encode the texts
|
| 711 |
-
embedding1 = similarity_model.
|
| 712 |
-
embedding2 = similarity_model.
|
| 713 |
|
| 714 |
# Compute cosine similarity
|
| 715 |
cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
|
|
|
|
| 39 |
import datetime
|
| 40 |
from abc import ABC, abstractmethod
|
| 41 |
from typing import List, Dict, Any
|
|
|
|
|
|
|
| 42 |
|
| 43 |
# Automatically get the current year
|
| 44 |
CURRENT_YEAR = datetime.datetime.now().year
|
|
|
|
| 105 |
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
| 106 |
mistral_client = Mistral(api_key=MISTRAL_API_KEY)
|
| 107 |
|
| 108 |
+
# Initialize the similarity model
|
| 109 |
+
similarity_model = SentenceTransformer('BAAI/bge-small-en-v1.5')
|
|
|
|
| 110 |
|
| 111 |
# Step 1: Create a base class for AI models
|
| 112 |
class AIModel(ABC):
|
|
|
|
| 645 |
bm25_scores = bm25.get_scores(query)
|
| 646 |
|
| 647 |
# Step 4: Get semantic similarity scores
|
| 648 |
+
query_embedding = similarity_model.encode(query)
|
| 649 |
doc_summaries = [doc['summary'] for doc in valid_docs]
|
| 650 |
+
doc_embeddings = similarity_model.encode(doc_summaries)
|
| 651 |
semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
| 652 |
|
| 653 |
# Step 5: Combine scores (normalize first)
|
|
|
|
| 679 |
continue
|
| 680 |
|
| 681 |
# Check similarity with already selected documents
|
| 682 |
+
doc_embedding = similarity_model.encode(doc['summary'])
|
| 683 |
is_similar = False
|
| 684 |
|
| 685 |
for content in added_contents:
|
| 686 |
+
content_embedding = similarity_model.encode(content)
|
| 687 |
similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
|
| 688 |
if similarity > similarity_threshold:
|
| 689 |
is_similar = True
|
|
|
|
| 705 |
|
| 706 |
def compute_similarity(text1, text2):
|
| 707 |
# Encode the texts
|
| 708 |
+
embedding1 = similarity_model.encode(text1)
|
| 709 |
+
embedding2 = similarity_model.encode(text2)
|
| 710 |
|
| 711 |
# Compute cosine similarity
|
| 712 |
cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
|