Update app.py
app.py CHANGED
@@ -16,11 +16,115 @@ nltk.download('stopwords', quiet=True)
 nltk.download('wordnet', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 
-
+def read_file_content(uploaded_file):
+    if uploaded_file.type == "text/plain":
+        return uploaded_file.getvalue().decode("utf-8")
+    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        doc = Document(uploaded_file)
+        return " ".join([paragraph.text for paragraph in doc.paragraphs])
+    else:
+        raise ValueError("Unsupported file type")
+
+def preprocess_text(text):
+    # Convert to lowercase and remove punctuation
+    text = re.sub(r'[^\w\s]', '', text.lower())
+
+    # Tokenize and remove stopwords
+    stop_words = set(stopwords.words('english'))
+    tokens = word_tokenize(text)
+    return [word for word in tokens if word not in stop_words]
+
+def cosine_similarity(vec1, vec2):
+    intersection = set(vec1.keys()) & set(vec2.keys())
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
+    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
+    denominator = sqrt(sum1) * sqrt(sum2)
+
+    if not denominator:
+        return 0.0
+    else:
+        return float(numerator) / denominator
+
+def calculate_word_similarity(text1, text2):
+    words1 = preprocess_text(text1)
+    words2 = preprocess_text(text2)
+
+    vec1 = Counter(words1)
+    vec2 = Counter(words2)
+
+    similarity = cosine_similarity(vec1, vec2)
+    return similarity * 100
+
+def calculate_sentence_similarity(text1, text2):
+    sentences1 = sent_tokenize(text1)
+    sentences2 = sent_tokenize(text2)
+
+    similarities = []
+    for sent1 in sentences1:
+        max_similarity = 0
+        for sent2 in sentences2:
+            similarity = calculate_word_similarity(sent1, sent2)
+            if similarity > max_similarity:
+                max_similarity = similarity
+        similarities.append(max_similarity)
+
+    average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
+    return average_similarity
+
+def longest_common_subsequence(text1, text2):
+    sentences1 = sent_tokenize(text1)
+    sentences2 = sent_tokenize(text2)
+
+    m, n = len(sentences1), len(sentences2)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if sentences1[i-1] == sentences2[j-1]:
+                L[i][j] = L[i-1][j-1] + 1
+            else:
+                L[i][j] = max(L[i-1][j], L[i][j-1])
+
+    # Backtrack to find the LCS
+    lcs = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if sentences1[i-1] == sentences2[j-1]:
+            lcs.append(sentences1[i-1])
+            i -= 1
+            j -= 1
+        elif L[i-1][j] > L[i][j-1]:
+            i -= 1
+        else:
+            j -= 1
+
+    return list(reversed(lcs))
+
+def suggest_rewrites(sentence):
+    words = word_tokenize(sentence)
+    tagged_words = nltk.pos_tag(words)
+
+    rewrites = []
+
+    for word, tag in tagged_words:
+        syns = wordnet.synsets(word)
+        if syns:
+            if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
+                synonym = syns[0].lemmas()[0].name()
+                if synonym != word:
+                    rewrites.append(synonym)
+                else:
+                    rewrites.append(word)
+            else:
+                rewrites.append(word)
+        else:
+            rewrites.append(word)
+
+    return " ".join(rewrites)
 
 def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
-    # This is a simple way to calculate plagiarism percentage
-    # You may want to adjust the weights or use a more sophisticated method
     return (word_similarity + sentence_similarity) / 2
 
 def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
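As a quick sanity check on the cosine_similarity helper added above: it computes the standard cosine of two sparse count vectors, the dot product over shared keys divided by the product of the vector norms. The following standalone sketch is not part of the commit; the toy vectors are invented for illustration, and it reproduces the same arithmetic inline.

from collections import Counter
from math import sqrt

# Two toy "documents" sharing two of three words
vec1 = Counter(["quick", "brown", "fox"])   # {'quick': 1, 'brown': 1, 'fox': 1}
vec2 = Counter(["quick", "brown", "dog"])

# Dot product over the shared keys: 1*1 + 1*1 = 2
numerator = sum(vec1[x] * vec2[x] for x in set(vec1) & set(vec2))

# Product of the Euclidean norms: sqrt(3) * sqrt(3) = 3
denominator = sqrt(sum(v ** 2 for v in vec1.values())) * sqrt(sum(v ** 2 for v in vec2.values()))

print(numerator / denominator)  # 0.666...

calculate_word_similarity scales this value by 100, so two texts sharing two of three distinct words would score roughly 66.7%.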
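For the end-to-end picture, here is a hedged usage sketch of how the new helpers compose. It assumes the functions are importable from app.py and that the module's own imports (the nltk tokenizers, Counter, sqrt, re, python-docx's Document) sit above this hunk; the sample strings are invented, and importing a Streamlit app this way may execute its page code as a side effect.

from app import (
    calculate_word_similarity,
    calculate_sentence_similarity,
    calculate_plagiarism_percentage,
    longest_common_subsequence,
    suggest_rewrites,
)

text1 = "The quick brown fox jumps over the lazy dog. It was a sunny day."
text2 = "A quick brown fox leaped over a lazy dog. It was a sunny day."

word_sim = calculate_word_similarity(text1, text2)      # cosine over word counts, scaled to 0-100
sent_sim = calculate_sentence_similarity(text1, text2)  # mean best-match score per sentence of text1
overall = calculate_plagiarism_percentage(word_sim, sent_sim)  # simple average of the two

print(f"Word similarity:     {word_sim:.1f}%")
print(f"Sentence similarity: {sent_sim:.1f}%")
print(f"Plagiarism estimate: {overall:.1f}%")

# Sentences that appear verbatim in both texts, in order, with a
# WordNet-based paraphrase suggestion for each
for sentence in longest_common_subsequence(text1, text2):
    print("Matched:", sentence)
    print("Suggested rewrite:", suggest_rewrites(sentence))

Note that longest_common_subsequence only matches sentences that are character-for-character identical, so on these samples it would return just "It was a sunny day."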