srikanththirumani committed
Commit bf669a0 · verified · 1 Parent(s): 3643a93

Update app.py

Files changed (1): app.py (+107 -3)
app.py CHANGED
@@ -16,11 +16,115 @@ nltk.download('stopwords', quiet=True)
 nltk.download('wordnet', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 
-# ... (keep all the existing functions)
+def read_file_content(uploaded_file):
+    if uploaded_file.type == "text/plain":
+        return uploaded_file.getvalue().decode("utf-8")
+    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        doc = Document(uploaded_file)
+        return " ".join([paragraph.text for paragraph in doc.paragraphs])
+    else:
+        raise ValueError("Unsupported file type")
+
+def preprocess_text(text):
+    # Convert to lowercase and remove punctuation
+    text = re.sub(r'[^\w\s]', '', text.lower())
+
+    # Tokenize and remove stopwords
+    stop_words = set(stopwords.words('english'))
+    tokens = word_tokenize(text)
+    return [word for word in tokens if word not in stop_words]
+
+def cosine_similarity(vec1, vec2):
+    intersection = set(vec1.keys()) & set(vec2.keys())
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
+    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
+    denominator = sqrt(sum1) * sqrt(sum2)
+
+    if not denominator:
+        return 0.0
+    else:
+        return float(numerator) / denominator
+
+def calculate_word_similarity(text1, text2):
+    words1 = preprocess_text(text1)
+    words2 = preprocess_text(text2)
+
+    vec1 = Counter(words1)
+    vec2 = Counter(words2)
+
+    similarity = cosine_similarity(vec1, vec2)
+    return similarity * 100
+
+def calculate_sentence_similarity(text1, text2):
+    sentences1 = sent_tokenize(text1)
+    sentences2 = sent_tokenize(text2)
+
+    similarities = []
+    for sent1 in sentences1:
+        max_similarity = 0
+        for sent2 in sentences2:
+            similarity = calculate_word_similarity(sent1, sent2)
+            if similarity > max_similarity:
+                max_similarity = similarity
+        similarities.append(max_similarity)
+
+    average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
+    return average_similarity
+
+def longest_common_subsequence(text1, text2):
+    sentences1 = sent_tokenize(text1)
+    sentences2 = sent_tokenize(text2)
+
+    m, n = len(sentences1), len(sentences2)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if sentences1[i-1] == sentences2[j-1]:
+                L[i][j] = L[i-1][j-1] + 1
+            else:
+                L[i][j] = max(L[i-1][j], L[i][j-1])
+
+    # Backtrack to find the LCS
+    lcs = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if sentences1[i-1] == sentences2[j-1]:
+            lcs.append(sentences1[i-1])
+            i -= 1
+            j -= 1
+        elif L[i-1][j] > L[i][j-1]:
+            i -= 1
+        else:
+            j -= 1
+
+    return list(reversed(lcs))
+
+def suggest_rewrites(sentence):
+    words = word_tokenize(sentence)
+    tagged_words = nltk.pos_tag(words)
+
+    rewrites = []
+
+    for word, tag in tagged_words:
+        syns = wordnet.synsets(word)
+        if syns:
+            if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
+                synonym = syns[0].lemmas()[0].name()
+                if synonym != word:
+                    rewrites.append(synonym)
+                else:
+                    rewrites.append(word)
+            else:
+                rewrites.append(word)
+        else:
+            rewrites.append(word)
+
+    return " ".join(rewrites)
 
127
  def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
 
 
128
  return (word_similarity + sentence_similarity) / 2
129
 
130
  def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
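
For reference, below is a minimal sketch of how the helpers added in this commit might be wired together in the Streamlit app. The uploader labels, variable names, and layout are illustrative assumptions, not part of this commit; it also assumes the imports already at the top of app.py (streamlit, docx.Document, math.sqrt, collections.Counter, and the nltk tokenizers) and the functions above are in scope.

# Usage sketch (illustrative, not part of this commit). Assumes the
# functions from this diff and app.py's existing imports are available.
import streamlit as st

# Hypothetical uploader labels; both txt and docx are handled by read_file_content
file1 = st.file_uploader("Original document", type=["txt", "docx"])
file2 = st.file_uploader("Document to check", type=["txt", "docx"])

if file1 and file2:
    text1 = read_file_content(file1)
    text2 = read_file_content(file2)

    word_sim = calculate_word_similarity(text1, text2)          # 0-100 scale
    sentence_sim = calculate_sentence_similarity(text1, text2)  # 0-100 scale
    plagiarism = calculate_plagiarism_percentage(word_sim, sentence_sim)
    st.write(f"Estimated plagiarism: {plagiarism:.1f}%")

    # Sentences shared verbatim, each with a synonym-based rewrite suggestion
    for sent in longest_common_subsequence(text1, text2):
        st.write(f"Matched: {sent}")
        st.write(f"Rewrite: {suggest_rewrites(sent)}")

One caveat on suggest_rewrites: it takes the first WordNet lemma, and multi-word lemma names use underscores (e.g. travel_rapidly), so replacing underscores with spaces before display would be a reasonable refinement.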