"""Similarity metrics between a query and a list of documents, plus Streamlit charts."""

from typing import List

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def calculate_word_overlaps(documents: List[str], query: str) -> float:
    """Return the mean number of distinct query words found in each document.

    Words are produced by lower-casing and splitting on whitespace, so
    punctuation stays attached to the word it touches.

    Args:
        documents: Texts to compare against the query.
        query: The query text.

    Returns:
        The average, over all documents, of the count of distinct words the
        document shares with the query; ``0.0`` when ``documents`` is empty.
    """
    query_words = set(query.lower().split())
    overlaps = [
        len(query_words.intersection(doc.lower().split())) for doc in documents
    ]
    # np.mean of an empty list yields NaN (with a warning), so short-circuit.
    return np.mean(overlaps) if overlaps else 0.0


def calculate_duplication_rate(documents: List[str]) -> float:
    """Return the fraction of word occurrences that repeat an earlier word.

    All documents are pooled: the rate is
    ``(total words - distinct words) / total words`` using lower-cased,
    whitespace-split words.

    Args:
        documents: Texts to analyse.

    Returns:
        A value in ``[0, 1)``; ``0.0`` when the documents contain no words.
    """
    distinct_words = set()
    total_words = 0
    for doc in documents:
        doc_words = doc.lower().split()
        distinct_words.update(doc_words)
        total_words += len(doc_words)

    if total_words == 0:
        return 0.0
    return (total_words - len(distinct_words)) / total_words


def cosine_similarity_score(documents: List[str], query: str):
    """Return the TF-IDF cosine similarity between the query and each document.

    The vectorizer is fitted on the query together with the documents, so the
    vocabulary and IDF weights depend on both.

    Args:
        documents: Texts to compare against the query.
        query: The query text.

    Returns:
        A 1-D NumPy array with one similarity per document, in input order.
        An empty array is returned when ``documents`` is empty.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when the combined
            texts yield no vocabulary terms (for example, all strings empty).
    """
    if not documents:
        # cosine_similarity rejects a 0-row matrix; mirror the empty result
        # that jaccard_similarity_score gives for the same input.
        return np.array([])

    tfidf_matrix = TfidfVectorizer().fit_transform([query] + list(documents))
    # Row 0 is the query; the remaining rows are the documents.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return similarities[0]


def jaccard_similarity_score(documents: List[str], query: str) -> List[float]:
    """Return the Jaccard similarity between the query and each document.

    Each text is reduced to its set of lower-cased, whitespace-split words;
    the similarity is ``|intersection| / |union|`` of the two sets.

    Args:
        documents: Texts to compare against the query.
        query: The query text.

    Returns:
        A list with one similarity per document, in input order. A pair whose
        union is empty (both texts have no words) scores ``0.0``.
    """
    query_words = set(query.lower().split())
    return [
        _jaccard(query_words, set(doc.lower().split())) for doc in documents
    ]


def _jaccard(left: set, right: set) -> float:
    """Return ``|left & right| / |left | right|``, or 0.0 when both are empty."""
    union_size = len(left | right)
    if union_size == 0:
        return 0.0
    return len(left & right) / union_size


def display_similarity_results(cosine_scores, jaccard_scores, title):
    """Render cosine and Jaccard scores as two bar charts in Streamlit.

    Args:
        cosine_scores: One cosine similarity per document.
        jaccard_scores: One Jaccard similarity per document.
        title: Prefix used in both sub-headers.
    """
    st.subheader(f"{title} - Cosine Similarity to Query")
    _show_bar_chart(cosine_scores, "Cosine Similarity")

    st.subheader(f"{title} - Jaccard Similarity to Query")
    _show_bar_chart(jaccard_scores, "Jaccard Similarity", color='orange')


def _show_bar_chart(scores, ylabel, **bar_kwargs):
    """Draw ``scores`` as a bar chart on its own figure and send it to Streamlit."""
    # A dedicated Figure per chart keeps the second chart from being drawn on
    # top of the first, which happens when both share pyplot's implicit
    # current figure.
    fig, ax = plt.subplots()
    ax.bar(range(len(scores)), scores, **bar_kwargs)
    ax.set_xlabel("Documents")
    ax.set_ylabel(ylabel)
    st.pyplot(fig)
    # Release the figure so repeated Streamlit reruns do not accumulate them.
    plt.close(fig)