File size: 2,723 Bytes
fc850f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from typing import List
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import matplotlib.pyplot as plt

def calculate_word_overlaps(documents: List[str], query: str):
    """
    Calculate the average word overlaps between documents and the query.
    """
    query_words = set(query.lower().split())
    word_overlaps = []

    for doc in documents:
        doc_words = set(doc.lower().split())
        overlap = len(query_words.intersection(doc_words))
        word_overlaps.append(overlap)

    if len(word_overlaps) > 0:
        average_word_overlap = np.mean(word_overlaps)
    else:
        average_word_overlap = 0.0

    return average_word_overlap

def calculate_duplication_rate(documents: List[str]):
    """
    Return the fraction of word occurrences that repeat an earlier word.

    All documents are lower-cased and split on whitespace, then pooled
    together. The rate is (total words - distinct words) / total words,
    or 0.0 when the pooled documents contain no words at all.
    """
    # Flatten every document into a single stream of lower-cased tokens.
    tokens = [word for text in documents for word in text.lower().split()]

    token_count = len(tokens)
    if token_count == 0:
        return 0.0

    distinct_count = len(set(tokens))
    return (token_count - distinct_count) / token_count


def cosine_similarity_score(documents: List[str], query: str) -> np.ndarray:
    """
    Calculate TF-IDF cosine similarity between the query and each document.

    A TfidfVectorizer is fitted on the query together with all documents,
    and the query's vector is then compared against every document vector.

    Args:
        documents: Texts to score against the query. Any iterable of
            strings is accepted.
        query: The reference text.

    Returns:
        A 1-D array with one similarity score per document, in input
        order. An empty array is returned when there are no documents.
    """
    # Materialise once so tuples/generators work and emptiness can be tested.
    documents = list(documents)

    # With no documents there is nothing to compare against; return early
    # instead of handing scikit-learn a zero-row matrix.
    if not documents:
        return np.empty(0, dtype=float)

    # NOTE(review): if neither the query nor any document yields a token,
    # TfidfVectorizer is expected to raise ValueError (empty vocabulary);
    # that is left to propagate to the caller -- confirm this is desired.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([query, *documents])

    # Row 0 is the query; the remaining rows are the documents.
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return cosine_similarities[0]

def jaccard_similarity_score(documents: List[str], query: str):
    """
    Return the Jaccard similarity of the query against each document.

    Both sides are reduced to sets of lower-cased, whitespace-separated
    words. Each score is |intersection| / |union|; when both sets are
    empty the score is 0. Scores are returned in document order.
    """
    query_terms = set(query.lower().split())

    def _score(text):
        # Jaccard index of one document's word set against the query's.
        doc_terms = set(text.lower().split())
        combined = query_terms | doc_terms
        if not combined:
            return 0
        return len(query_terms & doc_terms) / len(combined)

    return [_score(text) for text in documents]


def display_similarity_results(cosine_scores, jaccard_scores, title):
    """
    Render two Streamlit bar charts: cosine and Jaccard similarity per document.

    Each chart is drawn on its own Matplotlib figure. Drawing both on the
    implicit global pyplot figure would overlay the second chart's bars on
    the first, because the figure is not cleared between the two renders.

    Args:
        cosine_scores: Sequence of cosine similarity scores, one per document.
        jaccard_scores: Sequence of Jaccard similarity scores, one per document.
        title: Prefix used in both sub-headers.
    """
    # (metric label, scores, bar colour); None keeps Matplotlib's default colour.
    charts = (
        ("Cosine Similarity", cosine_scores, None),
        ("Jaccard Similarity", jaccard_scores, "orange"),
    )

    for metric, scores, color in charts:
        st.subheader(f"{title} - {metric} to Query")

        # A fresh figure per chart keeps the two plots independent.
        fig, ax = plt.subplots()
        ax.bar(range(len(scores)), scores, color=color)
        ax.set_xlabel("Documents")
        ax.set_ylabel(metric)
        st.pyplot(fig)

        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)