import streamlit as st
from docx import Document
import re
from collections import Counter
from math import sqrt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
import base64

# Download the NLTK data this app relies on
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# Recent NLTK releases ship some of these resources under new names; the
# extra downloads below are harmless on older versions (quiet failures).
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('omw-1.4', quiet=True)
def read_file_content(uploaded_file):
    """Return the plain text of an uploaded .txt or .docx file."""
    # Streamlit's UploadedFile exposes the browser-reported MIME type
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")
def preprocess_text(text):
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]
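
# Illustrative example (not part of the app): with NLTK's English stopword
# list, preprocess_text("The cat sat on the mat!") returns
# ['cat', 'sat', 'mat'] -- "the" and "on" are stopwords, and the "!" is
# stripped by the regex before tokenization.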
def cosine_similarity(vec1, vec2):
    # Dot product over the words the two count vectors share
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    # Euclidean norm of each count vector
    sum1 = sum(count ** 2 for count in vec1.values())
    sum2 = sum(count ** 2 for count in vec2.values())
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator
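
# Worked example (assumed inputs): vec1 = Counter({'data': 2, 'science': 1})
# and vec2 = Counter({'data': 1, 'analysis': 1}) share only 'data', so the
# numerator is 2 * 1 = 2 and the denominator is sqrt(5) * sqrt(2) ~= 3.162,
# giving a similarity of ~0.632.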
def calculate_word_similarity(text1, text2):
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)
    vec1 = Counter(words1)
    vec2 = Counter(words2)
    return cosine_similarity(vec1, vec2) * 100
def calculate_sentence_similarity(text1, text2):
    # For each sentence of text1, keep its best match in text2, then average.
    # Note the score is asymmetric: swapping the arguments can change it.
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0.0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    return sum(similarities) / len(similarities) if similarities else 0.0
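
# Rough intuition (illustrative): if every sentence of text1 has a verbatim
# counterpart in text2, each best-match score is 100 and so is the average;
# a sentence with no word overlap contributes 0 and pulls the average down.
# The nested loop costs O(len(sentences1) * len(sentences2)) comparisons.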
def longest_common_subsequence(text1, text2):
    # Classic dynamic-programming LCS over whole sentences; only sentences
    # that match verbatim count as common.
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i - 1] == sentences2[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Backtrack through the table to recover the matched sentences in order
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i - 1] == sentences2[j - 1]:
            lcs.append(sentences1[i - 1])
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))
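
# Illustrative example (assumed inputs): if text1 tokenizes to the sentences
# ["A.", "B.", "C."] and text2 to ["B.", "X.", "C."], the table yields the
# common subsequence ["B.", "C."] -- order is preserved, gaps are allowed.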
def suggest_rewrites(sentence):
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        syns = wordnet.synsets(word)
        # Only swap content words: nouns (N), verbs (V), adjectives (J), adverbs (R)
        if syns and tag[0] in ('N', 'V', 'J', 'R'):
            # Take the first lemma of the first synset; WordNet joins
            # multi-word lemmas with underscores, so restore the spaces
            synonym = syns[0].lemmas()[0].name().replace('_', ' ')
            if synonym.lower() != word.lower():
                rewrites.append(synonym)
            else:
                rewrites.append(word)
        else:
            rewrites.append(word)
    return " ".join(rewrites)
def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
    return (word_similarity + sentence_similarity) / 2
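
# Worked instance (assumed values): word_similarity=56.0 and
# sentence_similarity=32.0 give (56.0 + 32.0) / 2 = 44.0, which falls in the
# "some similarities" band reported by main() below.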
def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
    fig, ax = plt.subplots()
    ax.bar(["Word-Level Similarity", "Sentence-Level Similarity", "Plagiarism Percentage"],
           [word_similarity, sentence_similarity, plagiarism_percentage],
           color=["blue", "green", "red"])
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity and Plagiarism")
    st.pyplot(fig)
    plt.close(fig)  # release the figure so repeated reruns don't leak memory
def download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches):
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Plagiarism Percentage: {plagiarism_percentage:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    # Serve the report as a base64 data URI inside an HTML download link;
    # st.download_button is a simpler alternative on current Streamlit releases
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)
def main():
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # original document
            text2 = read_file_content(doc2)  # created document
            # Word-level cosine similarity over the full documents
            word_similarity = calculate_word_similarity(text1, text2)
            # Average best-match similarity per sentence
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            # Overall score: mean of the two measures
            plagiarism_percentage = calculate_plagiarism_percentage(word_similarity, sentence_similarity)
            # Sentences common to both documents, found via LCS
            matched_sequences = longest_common_subsequence(text1, text2)
            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Plagiarism Percentage: {plagiarism_percentage:.2f}%")
            create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage)
            if plagiarism_percentage < 20:
                st.write("The created document is mostly original.")
            elif plagiarism_percentage < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")
            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")
                # Suggest paraphrases for each matched sentence
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")
                download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")
        except ValueError as e:
            st.error(f"Error: {str(e)}")

if __name__ == "__main__":
    main()