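"""Streamlit app that compares two documents (.txt or .docx) and reports
word-level cosine similarity, sentence-level similarity, an overall plagiarism
percentage, matched sentences, and WordNet-based rewrite suggestions."""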
import streamlit as st
from docx import Document
import re
from collections import Counter
from math import sqrt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
import io
import base64
# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
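
# Extract plain text from an uploaded file: .txt uploads are decoded as UTF-8,
# .docx uploads are parsed with python-docx and their paragraphs joined with spaces.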
def read_file_content(uploaded_file):
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join([paragraph.text for paragraph in doc.paragraphs])
    else:
        raise ValueError("Unsupported file type")
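
# Normalize text for comparison: lowercase, strip punctuation, tokenize,
# and drop English stopwords so only content-bearing words are compared.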
def preprocess_text(text):
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]
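
# Cosine similarity between two term-frequency vectors (Counters):
# dot product over the shared terms divided by the product of the vector norms.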
def cosine_similarity(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator
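
# Word-level similarity: build bag-of-words frequency vectors for both texts
# and return their cosine similarity scaled to a 0-100 percentage.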
def calculate_word_similarity(text1, text2):
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)
    vec1 = Counter(words1)
    vec2 = Counter(words2)
    similarity = cosine_similarity(vec1, vec2)
    return similarity * 100
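
# Sentence-level similarity: for each sentence in the original document, find the
# best-matching sentence in the created document and average those best scores.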
def calculate_sentence_similarity(text1, text2):
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
    return average_similarity
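
# Longest common subsequence over sentences: classic dynamic-programming table
# followed by a backtrack that recovers the shared sentences in document order.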
def longest_common_subsequence(text1, text2):
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i-1] == sentences2[j-1]:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
    # Backtrack to find the LCS
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i-1] == sentences2[j-1]:
            lcs.append(sentences1[i-1])
            i -= 1
            j -= 1
        elif L[i-1][j] > L[i][j-1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))
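
# Suggest a paraphrase by swapping content words (nouns, verbs, adjectives, adverbs)
# for the first WordNet synonym found; all other words are kept unchanged.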
def suggest_rewrites(sentence):
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        syns = wordnet.synsets(word)
        # Only substitute content words: nouns, verbs, adjectives, and adverbs.
        if syns and tag.startswith(('N', 'V', 'J', 'R')):
            # WordNet lemma names use underscores for multi-word lemmas.
            synonym = syns[0].lemmas()[0].name().replace('_', ' ')
            rewrites.append(synonym if synonym != word else word)
        else:
            rewrites.append(word)
    return " ".join(rewrites)
def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
    return (word_similarity + sentence_similarity) / 2
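
# Render the three scores as a bar chart directly in the Streamlit app.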
def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
    fig, ax = plt.subplots()
    ax.bar(["Word-Level Similarity", "Sentence-Level Similarity", "Plagiarism Percentage"],
           [word_similarity, sentence_similarity, plagiarism_percentage],
           color=["blue", "green", "red"])
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity and Plagiarism")
    st.pyplot(fig)
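
# Build a plain-text report and expose it as a base64 data-URI download link.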
def download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches):
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Plagiarism Percentage: {plagiarism_percentage:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)
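
# Streamlit entry point: upload two documents, compute similarity metrics,
# display the results, and offer rewrite suggestions plus a downloadable report.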
def main():
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # Original Document
            text2 = read_file_content(doc2)  # Created Document
            # Calculate word-level cosine similarity
            word_similarity = calculate_word_similarity(text1, text2)
            # Calculate sentence-level similarity
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            # Calculate plagiarism percentage
            plagiarism_percentage = calculate_plagiarism_percentage(word_similarity, sentence_similarity)
            # Find longest common subsequences for sentence matches (from the created document)
            matched_sequences = longest_common_subsequence(text1, text2)
            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Plagiarism Percentage: {plagiarism_percentage:.2f}%")
            create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage)
            if plagiarism_percentage < 20:
                st.write("The created document is mostly original.")
            elif plagiarism_percentage < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")
            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")
                # Rewriting the matched content
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")
                download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")
        except ValueError as e:
            st.error(f"Error: {str(e)}")

if __name__ == "__main__":
    main()