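"""Streamlit app that compares two documents (.txt or .docx) and reports
word-level cosine similarity, sentence-level similarity, an averaged
plagiarism percentage, matched sentences, and WordNet-based rewrite
suggestions."""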
import streamlit as st
from docx import Document
import re
from collections import Counter
from math import sqrt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
import base64

# Download the NLTK data these functions need (quiet suppresses output).
# Newer NLTK releases resolve 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older resource names, so both generations are requested;
# nltk.download() simply reports failure for a name the installed index lacks.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

def read_file_content(uploaded_file):
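    """Return the text of an uploaded .txt or .docx file.

    Raises ValueError for any other MIME type.
    """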
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join([paragraph.text for paragraph in doc.paragraphs])
    else:
        raise ValueError("Unsupported file type")

def preprocess_text(text):
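    """Lowercase the text, strip punctuation, tokenize, and drop English stopwords."""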
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

def cosine_similarity(vec1, vec2):
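    """Compute cosine similarity between two term-frequency Counters."""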
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = sqrt(sum1) * sqrt(sum2)
    
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def calculate_word_similarity(text1, text2):
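    """Bag-of-words cosine similarity between two texts, as a percentage."""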
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)
    
    vec1 = Counter(words1)
    vec2 = Counter(words2)
    
    similarity = cosine_similarity(vec1, vec2)
    return similarity * 100

def calculate_sentence_similarity(text1, text2):
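    """For each sentence in text1, find its best match in text2 and
    return the average of those best-match similarities (0-100)."""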
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    
    average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
    return average_similarity

def longest_common_subsequence(text1, text2):
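    """Return the longest common subsequence of exactly matching
    sentences between the two texts, via standard LCS dynamic programming."""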
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    
    m, n = len(sentences1), len(sentences2)
    L = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i-1] == sentences2[j-1]:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
    
    # Backtrack to find the LCS
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i-1] == sentences2[j-1]:
            lcs.append(sentences1[i-1])
            i -= 1
            j -= 1
        elif L[i-1][j] > L[i][j-1]:
            i -= 1
        else:
            j -= 1
    
    return list(reversed(lcs))

def suggest_rewrites(sentence):
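    """Rebuild the sentence, swapping each content word (noun, verb,
    adjective, adverb) for a WordNet synonym when one exists."""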
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    
    rewrites = []
    
    for word, tag in tagged_words:
        synonym = word
        # Only substitute content words: nouns, verbs, adjectives, adverbs.
        if tag[:1] in ('N', 'V', 'J', 'R'):
            # The first lemma of the first synset is usually the word itself,
            # so scan the synsets for the first lemma that actually differs.
            for syn in wordnet.synsets(word):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    if candidate.lower() != word.lower():
                        synonym = candidate
                        break
                if synonym != word:
                    break
        rewrites.append(synonym)
    
    return " ".join(rewrites)

def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
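    """Plagiarism score: the mean of word- and sentence-level similarity."""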
    return (word_similarity + sentence_similarity) / 2

def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
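    """Render the three scores as a matplotlib bar chart inside Streamlit."""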
    fig, ax = plt.subplots()
    ax.bar(["Word-Level Similarity", "Sentence-Level Similarity", "Plagiarism Percentage"], 
           [word_similarity, sentence_similarity, plagiarism_percentage], 
           color=["blue", "green", "red"])
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity and Plagiarism")
    st.pyplot(fig)
    plt.close(fig)  # release the figure so Streamlit reruns don't accumulate open figures

def download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches):
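    """Assemble a plain-text report and expose it as a base64 download link."""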
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Plagiarism Percentage: {plagiarism_percentage:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"

    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)

def main():
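    """Streamlit entry point: upload two documents, score them, and report."""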
    st.title("High-Accuracy Document Plagiarism Checker")

    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])

    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # Original Document
            text2 = read_file_content(doc2)  # Created Document

            # Calculate word-level cosine similarity
            word_similarity = calculate_word_similarity(text1, text2)
            # Calculate sentence-level similarity
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            # Calculate plagiarism percentage
            plagiarism_percentage = calculate_plagiarism_percentage(word_similarity, sentence_similarity)

            # Find longest common subsequences for sentence matches (from the created document)
            matched_sequences = longest_common_subsequence(text1, text2)

            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Plagiarism Percentage: {plagiarism_percentage:.2f}%")

            create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage)

            if plagiarism_percentage < 20:
                st.write("The created document is mostly original.")
            elif plagiarism_percentage < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")

            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")
                
                # Rewriting the matched content
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]

                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")
                
                download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")

        except ValueError as e:
            st.error(f"Error: {str(e)}")

if __name__ == "__main__":
    main()