Spaces:
Runtime error
Runtime error
File size: 2,723 Bytes
fc850f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
from typing import List
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import matplotlib.pyplot as plt
def calculate_word_overlaps(documents: List[str], query: str):
    """
    Return the mean number of distinct query words found in each document.

    Both the query and every document are lower-cased and split on
    whitespace; a word counts once per document no matter how often it
    occurs. With no documents the result is 0.0.
    """
    query_terms = set(query.lower().split())
    shared_counts = [
        len(query_terms & set(text.lower().split())) for text in documents
    ]
    # Guard first: np.mean of an empty list would yield nan plus a warning.
    if not shared_counts:
        return 0.0
    return np.mean(shared_counts)
def calculate_duplication_rate(documents: List[str]):
    """
    Return the fraction of word occurrences that are repeats.

    Words are lower-cased, whitespace-separated tokens pooled across all
    documents. The rate is (total tokens - distinct tokens) / total tokens,
    or 0.0 when there are no tokens at all.
    """
    tokenized = [text.lower().split() for text in documents]
    token_count = sum(len(tokens) for tokens in tokenized)
    # No tokens means nothing can be duplicated (and avoids dividing by zero).
    if not token_count:
        return 0.0
    distinct_tokens = set().union(*tokenized)
    return (token_count - len(distinct_tokens)) / token_count
def cosine_similarity_score(documents: List[str], query: str):
    """
    Calculate TF-IDF cosine similarity between the query and each document.

    The query and the documents are vectorized together with a single
    ``TfidfVectorizer`` so they share one vocabulary; the query vector is
    then compared against every document vector.

    Args:
        documents: Texts to compare against the query.
        query: The reference text.

    Returns:
        A 1-D numpy array holding one similarity score per document, in the
        same order as ``documents``. An empty array is returned when there
        are no documents.
    """
    corpus = [query, *documents]
    # With no documents the document slice of the matrix would have zero
    # rows, which cosine_similarity rejects with a ValueError. Return an
    # empty result instead, matching how the other metrics treat empty input.
    if len(corpus) == 1:
        return np.array([])

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    # Row 0 is the query; the remaining rows are the documents.
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return cosine_similarities[0]
def jaccard_similarity_score(documents: List[str], query: str):
    """
    Return the Jaccard similarity of the query to each document.

    Texts are lower-cased and split on whitespace into word sets; each
    score is |intersection| / |union|. When both the query and a document
    have no words the score is 0. Scores are returned as a list in
    document order.
    """
    query_terms = set(query.lower().split())

    def _jaccard(text):
        terms = set(text.lower().split())
        combined = query_terms | terms
        # An empty union means both sides are empty; score it as 0.
        if not combined:
            return 0
        return len(query_terms & terms) / len(combined)

    return [_jaccard(text) for text in documents]
def _render_score_bars(heading, scores, y_label, **bar_style):
    """Show a subheader followed by a bar chart of ``scores`` on its own figure."""
    st.subheader(heading)
    # A dedicated figure per chart: drawing on the implicit global pyplot
    # figure and passing it to st.pyplot leaves earlier bars in place, so a
    # later chart would be painted on top of the previous one.
    fig, ax = plt.subplots()
    ax.bar(range(len(scores)), scores, **bar_style)
    ax.set_xlabel("Documents")
    ax.set_ylabel(y_label)
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def display_similarity_results(cosine_scores, jaccard_scores, title):
    """
    Render cosine and Jaccard similarity scores as two Streamlit bar charts.

    Args:
        cosine_scores: Sequence of cosine similarity scores, one per document.
        jaccard_scores: Sequence of Jaccard similarity scores, one per document.
        title: Prefix used in the subheader above each chart.

    Each chart plots the document index on the x-axis and the score on the
    y-axis; the Jaccard chart uses orange bars.
    """
    _render_score_bars(
        f"{title} - Cosine Similarity to Query",
        cosine_scores,
        "Cosine Similarity",
    )
    _render_score_bars(
        f"{title} - Jaccard Similarity to Query",
        jaccard_scores,
        "Jaccard Similarity",
        color='orange',
    )