import streamlit as st
from docx import Document
import re
from collections import Counter
from math import sqrt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
import base64

# Download the NLTK data this app relies on
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# Recent NLTK releases ship some of these resources under new names; the
# extra downloads below are harmless on older versions (quiet failures).
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('omw-1.4', quiet=True)
def read_file_content(uploaded_file):
    """Return the plain text of an uploaded .txt or .docx file."""
    # Streamlit's UploadedFile exposes the browser-reported MIME type
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")
def preprocess_text(text):
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]
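
# Illustrative example (not part of the app): with NLTK's English stopword
# list, preprocess_text("The cat sat on the mat!") returns
# ['cat', 'sat', 'mat'] -- "the" and "on" are stopwords, and the "!" is
# stripped by the regex before tokenization.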
def cosine_similarity(vec1, vec2):
    # Dot product over the words the two count vectors share
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    # Euclidean norm of each count vector
    sum1 = sum(count ** 2 for count in vec1.values())
    sum2 = sum(count ** 2 for count in vec2.values())
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator
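
# Worked example (assumed inputs): vec1 = Counter({'data': 2, 'science': 1})
# and vec2 = Counter({'data': 1, 'analysis': 1}) share only 'data', so the
# numerator is 2 * 1 = 2 and the denominator is sqrt(5) * sqrt(2) ~= 3.162,
# giving a similarity of ~0.632.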
def calculate_word_similarity(text1, text2):
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)
    vec1 = Counter(words1)
    vec2 = Counter(words2)
    return cosine_similarity(vec1, vec2) * 100
def calculate_sentence_similarity(text1, text2):
    # For each sentence of text1, keep its best match in text2, then average.
    # Note the score is asymmetric: swapping the arguments can change it.
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0.0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    return sum(similarities) / len(similarities) if similarities else 0.0
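
# Rough intuition (illustrative): if every sentence of text1 has a verbatim
# counterpart in text2, each best-match score is 100 and so is the average;
# a sentence with no word overlap contributes 0 and pulls the average down.
# The nested loop costs O(len(sentences1) * len(sentences2)) comparisons.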
def longest_common_subsequence(text1, text2):
    # Classic dynamic-programming LCS over whole sentences; only sentences
    # that match verbatim count as common.
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i - 1] == sentences2[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Backtrack through the table to recover the matched sentences in order
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i - 1] == sentences2[j - 1]:
            lcs.append(sentences1[i - 1])
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))
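
# Illustrative example (assumed inputs): if text1 tokenizes to the sentences
# ["A.", "B.", "C."] and text2 to ["B.", "X.", "C."], the table yields the
# common subsequence ["B.", "C."] -- order is preserved, gaps are allowed.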
def suggest_rewrites(sentence):
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        syns = wordnet.synsets(word)
        # Only swap content words: nouns (N), verbs (V), adjectives (J), adverbs (R)
        if syns and tag[0] in ('N', 'V', 'J', 'R'):
            # Take the first lemma of the first synset; WordNet joins
            # multi-word lemmas with underscores, so restore the spaces
            synonym = syns[0].lemmas()[0].name().replace('_', ' ')
            if synonym.lower() != word.lower():
                rewrites.append(synonym)
            else:
                rewrites.append(word)
        else:
            rewrites.append(word)
    return " ".join(rewrites)
def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
    return (word_similarity + sentence_similarity) / 2
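
# Worked instance (assumed values): word_similarity=56.0 and
# sentence_similarity=32.0 give (56.0 + 32.0) / 2 = 44.0, which falls in the
# "some similarities" band reported by main() below.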
def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
    fig, ax = plt.subplots()
    ax.bar(["Word-Level Similarity", "Sentence-Level Similarity", "Plagiarism Percentage"],
           [word_similarity, sentence_similarity, plagiarism_percentage],
           color=["blue", "green", "red"])
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity and Plagiarism")
    st.pyplot(fig)
    plt.close(fig)  # release the figure so repeated reruns don't leak memory
def download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches):
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Plagiarism Percentage: {plagiarism_percentage:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    # Serve the report as a base64 data URI inside an HTML download link;
    # st.download_button is a simpler alternative on current Streamlit releases
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)
def main():
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # original document
            text2 = read_file_content(doc2)  # created document
            # Word-level cosine similarity over the full documents
            word_similarity = calculate_word_similarity(text1, text2)
            # Average best-match similarity per sentence
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            # Overall score: mean of the two measures
            plagiarism_percentage = calculate_plagiarism_percentage(word_similarity, sentence_similarity)
            # Sentences common to both documents, found via LCS
            matched_sequences = longest_common_subsequence(text1, text2)
            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Plagiarism Percentage: {plagiarism_percentage:.2f}%")
            create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage)
            if plagiarism_percentage < 20:
                st.write("The created document is mostly original.")
            elif plagiarism_percentage < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")
            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")
                # Suggest paraphrases for each matched sentence
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")
                download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")
        except ValueError as e:
            st.error(f"Error: {str(e)}")

if __name__ == "__main__":
    main()