# humanizer/text_humanizer.py
import ssl
import random
import warnings

import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer, util

warnings.filterwarnings("ignore", category=FutureWarning)

# Shared spaCy pipeline, loaded once at import time so it can be reused.
NLP_GLOBAL = spacy.load("en_core_web_sm")
def download_nltk_resources():
    """
    Download required NLTK resources if they are not already installed.
    """
    # Work around SSL certificate errors on some systems by allowing
    # unverified HTTPS downloads for NLTK data.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    resources = [
        'punkt',
        'averaged_perceptron_tagger',
        'punkt_tab',
        'wordnet',
        'averaged_perceptron_tagger_eng',
    ]
    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"Error downloading {resource}: {str(e)}")

class TextHumanizer:
    """
    Transforms text into a more formal (academic) style, e.g. to improve
    readability of generated prose:
      - Expands contractions
      - Optionally adds academic transitions
      - Optionally converts some sentences to passive voice
      - Optionally replaces words with more formal synonyms
    """
    def __init__(
        self,
        model_name='paraphrase-MiniLM-L6-v2',
        p_passive=0.2,
        p_synonym_replacement=0.3,
        p_academic_transition=0.3,
        seed=None
    ):
        if seed is not None:
            random.seed(seed)
        # Reuse the module-level pipeline rather than loading spaCy twice.
        self.nlp = NLP_GLOBAL
        self.model = SentenceTransformer(model_name)
        # Transformation probabilities
        self.p_passive = p_passive
        self.p_synonym_replacement = p_synonym_replacement
        self.p_academic_transition = p_academic_transition
        # Common academic transitions
        self.academic_transitions = [
            "Moreover,", "Additionally,", "Furthermore,", "Hence,",
            "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,"
        ]
    def humanize_text(self, text, use_passive=False, use_synonyms=False):
        doc = self.nlp(text)
        transformed_sentences = []
        for sent in doc.sents:
            sentence_str = sent.text.strip()
            # 1. Expand contractions
            sentence_str = self.expand_contractions(sentence_str)
            # 2. Possibly add academic transitions (currently disabled)
            # if random.random() < self.p_academic_transition:
            #     sentence_str = self.add_academic_transitions(sentence_str)
            # 3. Optionally convert to passive voice
            if use_passive and random.random() < self.p_passive:
                sentence_str = self.convert_to_passive(sentence_str)
            # 4. Optionally replace words with synonyms
            if use_synonyms and random.random() < self.p_synonym_replacement:
                sentence_str = self.replace_with_synonyms(sentence_str)
            transformed_sentences.append(sentence_str)
        return ' '.join(transformed_sentences)
    def expand_contractions(self, sentence):
        # Note: "'s" is ambiguous ("he's" vs. possessive "John's"); this
        # naive map always expands it to " is".
        contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am"
        }
        tokens = word_tokenize(sentence)
        expanded_tokens = []
        for token in tokens:
            lower_token = token.lower()
            replaced = False
            for contraction, expansion in contraction_map.items():
                if lower_token.endswith(contraction):
                    # word_tokenize splits clitics into their own tokens
                    # (e.g. "don't" -> ["do", "n't"]), so strip the leading
                    # space left by the replacement.
                    new_token = lower_token.replace(contraction, expansion).strip()
                    if token[0].isupper():
                        new_token = new_token.capitalize()
                    expanded_tokens.append(new_token)
                    replaced = True
                    break
            if not replaced:
                expanded_tokens.append(token)
        return ' '.join(expanded_tokens)
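
    # For illustration (not in the original file):
    #   expand_contractions("He's done, isn't he?")
    #   -> "He is done , is not he ?"
    # The clitic map is applied blindly and punctuation is not rejoined here,
    # so tag questions and possessives can come out awkward.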
    def add_academic_transitions(self, sentence):
        transition = random.choice(self.academic_transitions)
        return f"{transition} {sentence}"
    def convert_to_passive(self, sentence):
        doc = self.nlp(sentence)
        subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
        dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
        if subj_tokens and dobj_tokens:
            subject = subj_tokens[0]
            dobj = dobj_tokens[0]
            verb = subject.head
            if subject.i < verb.i < dobj.i:
                # Naive heuristic: uses the verb's lemma rather than a true
                # past participle, and only fires when subject, verb, and
                # object appear as one contiguous chunk in the sentence.
                passive_str = f"{dobj.text} {verb.lemma_} by {subject.text}"
                original_str = ' '.join(token.text for token in doc)
                chunk = f"{subject.text} {verb.text} {dobj.text}"
                if chunk in original_str:
                    sentence = original_str.replace(chunk, passive_str)
        return sentence
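
    # For illustration (not in the original file): the heuristic above only
    # rewrites contiguous subject-verb-object spans, e.g.
    #   "She reads books."  ->  "books read by She ."
    # Sentences like "The committee approved the proposal." pass through
    # unchanged, because the determiner breaks the contiguous chunk.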
    def replace_with_synonyms(self, sentence):
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        new_tokens = []
        for (word, pos) in pos_tags:
            # Only consider adjectives, nouns, verbs, and adverbs that have
            # WordNet entries; each candidate is replaced with 50% probability.
            if pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word):
                if random.random() < 0.5:
                    synonyms = self._get_synonyms(word, pos)
                    if synonyms:
                        best_synonym = self._select_closest_synonym(word, synonyms)
                        new_tokens.append(best_synonym if best_synonym else word)
                    else:
                        new_tokens.append(word)
                else:
                    new_tokens.append(word)
            else:
                new_tokens.append(word)
        # Rejoin and undo the spaces word_tokenize puts before punctuation
        sentence = " ".join(new_tokens)
        sentence = (
            sentence.replace(" ,", ",")
            .replace(" .", ".")
            .replace(" !", "!")
            .replace(" ?", "?")
            .replace(" :", ":")
            .replace(" '", "'")
        )
        return sentence
    def _get_synonyms(self, word, pos):
        # Map Penn Treebank tags to WordNet POS constants
        wn_pos = None
        if pos.startswith('J'):
            wn_pos = wordnet.ADJ
        elif pos.startswith('N'):
            wn_pos = wordnet.NOUN
        elif pos.startswith('R'):
            wn_pos = wordnet.ADV
        elif pos.startswith('V'):
            wn_pos = wordnet.VERB
        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                lemma_name = lemma.name().replace('_', ' ')
                if lemma_name.lower() != word.lower():
                    synonyms.add(lemma_name)
        return list(synonyms)
    def _select_closest_synonym(self, original_word, synonyms):
        if not synonyms:
            return None
        original_emb = self.model.encode(original_word, convert_to_tensor=True)
        synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
        cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
        max_score_index = cos_scores.argmax().item()
        max_score = cos_scores[max_score_index].item()
        # Only accept a synonym that is reasonably close in embedding space
        if max_score >= 0.5:
            return synonyms[max_score_index]
        return None
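
# A minimal, self-contained usage sketch (an addition for illustration, not
# part of the original file; the sample sentence and seed are arbitrary):
if __name__ == "__main__":
    download_nltk_resources()
    humanizer = TextHumanizer(seed=42)
    sample = "She reads books. It's a small test, isn't it?"
    result = humanizer.humanize_text(sample, use_passive=True, use_synonyms=True)
    print(result)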