Jay-Rajput committed
Commit bea1d24 · 1 Parent(s): d3b9cb7

Add application file

Files changed (4)
  1. Dockerfile +22 -0
  2. app.py +45 -0
  3. requirements.txt +5 -0
  4. text_humanizer.py +200 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ FROM python:3.10
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # download the spaCy model and NLTK resources at build time (text_humanizer.py is copied first so it can be imported here; heredoc RUN requires BuildKit)
+ RUN python -m spacy download en_core_web_sm || true
+ COPY --chown=user ./text_humanizer.py text_humanizer.py
+ RUN python - <<'PY'
+ from text_humanizer import download_nltk_resources
+ download_nltk_resources()
+ PY
+ EXPOSE 7860
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,45 @@
+ import os
+ from fastapi import FastAPI, Header, HTTPException, Depends
+ from pydantic import BaseModel
+ from text_humanizer import TextHumanizer, download_nltk_resources
+ import spacy
+
+ API_KEY = os.environ.get("API_KEY", "dev-key")
+ PORT = int(os.environ.get("PORT", 7860))
+
+ app = FastAPI()
+ humanizer = None
+
+ class HumanizeReq(BaseModel):
+     text: str
+     use_passive: bool = False
+     use_synonyms: bool = False
+
+ def verify_key(x_api_key: str = Header(None)):
+     if x_api_key != API_KEY:
+         raise HTTPException(status_code=403, detail="Forbidden")
+     return True
+
+ @app.get("/")
+ def greet_json():
+     return {"Hello": "World!"}
+
+ @app.on_event("startup")
+ def startup():
+     # ensure NLTK resources and spacy model are available at runtime
+     download_nltk_resources()
+     try:
+         spacy.load("en_core_web_sm")
+     except OSError:
+         import spacy.cli
+         spacy.cli.download("en_core_web_sm")
+     global humanizer
+     humanizer = TextHumanizer()
+
+ @app.post("/humanize")
+ def humanize(req: HumanizeReq, _=Depends(verify_key)):
+     return {"humanized": humanizer.humanize_text(req.text, req.use_passive, req.use_synonyms)}
+
+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=PORT)
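The /humanize endpoint expects a JSON body matching HumanizeReq and the API key in an x-api-key header (FastAPI maps the x_api_key parameter to that header name). A minimal client sketch, assuming the app is reachable on localhost:7860 and API_KEY was left at its "dev-key" default, might look like this:

import json
import urllib.request

# POST a HumanizeReq payload with the API key header; both the URL and the
# "dev-key" value are assumptions, not guarantees of the deployed configuration
payload = json.dumps({"text": "We can't ignore these results.", "use_synonyms": True}).encode()
req = urllib.request.Request(
    "http://localhost:7860/humanize",
    data=payload,
    headers={"Content-Type": "application/json", "x-api-key": "dev-key"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["humanized"])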
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ uvicorn[standard]
+ spacy
+ nltk
+ sentence-transformers
text_humanizer.py ADDED
@@ -0,0 +1,200 @@
+ import ssl
+ import random
+ import warnings
+
+ import nltk
+ import spacy
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import wordnet
+ from sentence_transformers import SentenceTransformer, util
+
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+ NLP_GLOBAL = spacy.load("en_core_web_sm") if spacy.util.is_package("en_core_web_sm") else None  # unused below; guarded so a missing model cannot break import (app.py downloads it at startup)
+
+ def download_nltk_resources():
+     """
+     Download required NLTK resources if not already installed.
+     """
+     try:
+         _create_unverified_https_context = ssl._create_unverified_context
+     except AttributeError:
+         pass
+     else:
+         ssl._create_default_https_context = _create_unverified_https_context
+
+     resources = ['punkt', 'punkt_tab', 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng', 'wordnet']
+     for resource in resources:
+         try:
+             nltk.download(resource, quiet=True)
+         except Exception as e:
+             print(f"Error downloading {resource}: {str(e)}")
+
+
+ # This class contains methods to humanize academic text, such as improving readability or
+ # simplifying complex language.
+ class TextHumanizer:
+     """
+     Transforms text into a more formal (academic) style:
+     - Expands contractions
+     - Adds academic transitions
+     - Optionally converts some sentences to passive voice
+     - Optionally replaces words with synonyms for more formality
+     """
+
+     def __init__(
+         self,
+         model_name='paraphrase-MiniLM-L6-v2',
+         p_passive=0.2,
+         p_synonym_replacement=0.3,
+         p_academic_transition=0.3,
+         seed=None
+     ):
+         if seed is not None:
+             random.seed(seed)
+
+         self.nlp = spacy.load("en_core_web_sm")
+         self.model = SentenceTransformer(model_name)
+
+         # Transformation probabilities
+         self.p_passive = p_passive
+         self.p_synonym_replacement = p_synonym_replacement
+         self.p_academic_transition = p_academic_transition
+
+         # Common academic transitions
+         self.academic_transitions = [
+             "Moreover,", "Additionally,", "Furthermore,", "Hence,",
+             "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,"
+         ]
+
+     def humanize_text(self, text, use_passive=False, use_synonyms=False):
+         doc = self.nlp(text)
+         transformed_sentences = []
+
+         for sent in doc.sents:
+             sentence_str = sent.text.strip()
+
+             # 1. Expand contractions
+             sentence_str = self.expand_contractions(sentence_str)
+
+             # 2. Possibly add academic transitions
+             # if random.random() < self.p_academic_transition:
+             #     sentence_str = self.add_academic_transitions(sentence_str)
+
+             # 3. Optionally convert to passive
+             if use_passive and random.random() < self.p_passive:
+                 sentence_str = self.convert_to_passive(sentence_str)
+
+             # 4. Optionally replace words with synonyms
+             if use_synonyms and random.random() < self.p_synonym_replacement:
+                 sentence_str = self.replace_with_synonyms(sentence_str)
+
+             transformed_sentences.append(sentence_str)
+
+         return ' '.join(transformed_sentences)
+
+     def expand_contractions(self, sentence):
+         contraction_map = {
+             "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
+             "'ve": " have", "'d": " would", "'m": " am"
+         }
+         tokens = word_tokenize(sentence)
+         expanded_tokens = []
+         for token in tokens:
+             lower_token = token.lower()
+             replaced = False
+             for contraction, expansion in contraction_map.items():
+                 if contraction in lower_token and lower_token.endswith(contraction):
+                     new_token = lower_token.replace(contraction, expansion)
+                     if token[0].isupper():
+                         new_token = new_token.capitalize()
+                     expanded_tokens.append(new_token)
+                     replaced = True
+                     break
+             if not replaced:
+                 expanded_tokens.append(token)
+
+         return ' '.join(expanded_tokens)
+
+     def add_academic_transitions(self, sentence):
+         transition = random.choice(self.academic_transitions)
+         return f"{transition} {sentence}"
+
+     def convert_to_passive(self, sentence):
+         doc = self.nlp(sentence)
+         subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
+         dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
+
+         if subj_tokens and dobj_tokens:
+             subject = subj_tokens[0]
+             dobj = dobj_tokens[0]
+             verb = subject.head
+             if subject.i < verb.i < dobj.i:
+                 passive_str = f"{dobj.text} {verb.lemma_} by {subject.text}"
+                 original_str = ' '.join(token.text for token in doc)
+                 chunk = f"{subject.text} {verb.text} {dobj.text}"
+                 if chunk in original_str:
+                     sentence = original_str.replace(chunk, passive_str)
+         return sentence
+
+     def replace_with_synonyms(self, sentence):
+         tokens = word_tokenize(sentence)
+         pos_tags = nltk.pos_tag(tokens)
+
+         new_tokens = []
+         for (word, pos) in pos_tags:
+             if pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word):
+                 if random.random() < 0.5:
+                     synonyms = self._get_synonyms(word, pos)
+                     if synonyms:
+                         best_synonym = self._select_closest_synonym(word, synonyms)
+                         new_tokens.append(best_synonym if best_synonym else word)
+                     else:
+                         new_tokens.append(word)
+                 else:
+                     new_tokens.append(word)
+             else:
+                 new_tokens.append(word)
+
+         # Join cleanly with punctuation fix
+         sentence = " ".join(new_tokens)
+         sentence = (
+             sentence.replace(" ,", ",")
+             .replace(" .", ".")
+             .replace(" !", "!")
+             .replace(" ?", "?")
+             .replace(" :", ":")
+             .replace(" '", "'")
+         )
+         return sentence
+
+     def _get_synonyms(self, word, pos):
+         wn_pos = None
+         if pos.startswith('J'):
+             wn_pos = wordnet.ADJ
+         elif pos.startswith('N'):
+             wn_pos = wordnet.NOUN
+         elif pos.startswith('R'):
+             wn_pos = wordnet.ADV
+         elif pos.startswith('V'):
+             wn_pos = wordnet.VERB
+
+         synonyms = set()
+         for syn in wordnet.synsets(word, pos=wn_pos):
+             for lemma in syn.lemmas():
+                 lemma_name = lemma.name().replace('_', ' ')
+                 if lemma_name.lower() != word.lower():
+                     synonyms.add(lemma_name)
+         return list(synonyms)
+
+     def _select_closest_synonym(self, original_word, synonyms):
+         if not synonyms:
+             return None
+         original_emb = self.model.encode(original_word, convert_to_tensor=True)
+         synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
+         cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
+         max_score_index = cos_scores.argmax().item()
+         max_score = cos_scores[max_score_index].item()
+         if max_score >= 0.5:
+             return synonyms[max_score_index]
+         return None
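TextHumanizer can also be used directly, without the FastAPI wrapper. A minimal sketch, assuming en_core_web_sm, the NLTK resources, and the paraphrase-MiniLM-L6-v2 model can be fetched in the local environment (the sample sentence is only illustrative):

from text_humanizer import TextHumanizer, download_nltk_resources

# fetch punkt, wordnet and the taggers, then build the humanizer;
# seed makes the random passive/synonym choices repeatable
download_nltk_resources()
humanizer = TextHumanizer(seed=42)
print(humanizer.humanize_text("We can't ignore these results.", use_passive=False, use_synonyms=True))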