import ssl
import random
import warnings

import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer, util

warnings.filterwarnings("ignore", category=FutureWarning)

# Shared spaCy pipeline, loaded once at import time and reused by TextHumanizer.
NLP_GLOBAL = spacy.load("en_core_web_sm")
def download_nltk_resources():
    """
    Download required NLTK resources if not already installed.
    """
    # Work around SSL certificate errors that can block nltk.download()
    # on some systems by falling back to an unverified HTTPS context.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    resources = [
        'punkt', 'punkt_tab', 'wordnet',
        'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    ]
    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"Error downloading {resource}: {e}")
class TextHumanizer:
    """
    Transforms text into a more formal (academic) style:
    - Expands contractions
    - Optionally adds academic transitions
    - Optionally converts some sentences to passive voice
    - Optionally replaces words with synonyms for more formality
    """

    def __init__(
        self,
        model_name='paraphrase-MiniLM-L6-v2',
        p_passive=0.2,
        p_synonym_replacement=0.3,
        p_academic_transition=0.3,
        seed=None,
    ):
        if seed is not None:
            random.seed(seed)
        # Reuse the module-level pipeline instead of loading a second copy.
        self.nlp = NLP_GLOBAL
        self.model = SentenceTransformer(model_name)

        # Transformation probabilities
        self.p_passive = p_passive
        self.p_synonym_replacement = p_synonym_replacement
        self.p_academic_transition = p_academic_transition

        # Common academic transitions
        self.academic_transitions = [
            "Moreover,", "Additionally,", "Furthermore,", "Hence,",
            "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,",
        ]
    def humanize_text(self, text, use_passive=False, use_synonyms=False,
                      use_transitions=False):
        """
        Apply the transformations sentence by sentence. Passive voice,
        synonym replacement, and academic transitions are each gated by
        a flag and fire with their configured probability.
        """
        doc = self.nlp(text)
        transformed_sentences = []
        for sent in doc.sents:
            sentence_str = sent.text.strip()

            # 1. Expand contractions
            sentence_str = self.expand_contractions(sentence_str)

            # 2. Optionally add academic transitions
            if use_transitions and random.random() < self.p_academic_transition:
                sentence_str = self.add_academic_transitions(sentence_str)

            # 3. Optionally convert to passive
            if use_passive and random.random() < self.p_passive:
                sentence_str = self.convert_to_passive(sentence_str)

            # 4. Optionally replace words with synonyms
            if use_synonyms and random.random() < self.p_synonym_replacement:
                sentence_str = self.replace_with_synonyms(sentence_str)

            transformed_sentences.append(sentence_str)
        return ' '.join(transformed_sentences)
    def expand_contractions(self, sentence):
        """
        Expand common English contractions (e.g. "don't" -> "do not").
        Note: "'s" is always expanded to " is", a heuristic that will
        occasionally misfire on possessives.
        """
        contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am",
        }
        tokens = word_tokenize(sentence)
        expanded_tokens = []
        for token in tokens:
            lower_token = token.lower()
            replaced = False
            for contraction, expansion in contraction_map.items():
                if lower_token.endswith(contraction):
                    # strip() removes the stray leading space left when the
                    # token is a bare clitic such as "'re" or "n't".
                    new_token = lower_token.replace(contraction, expansion).strip()
                    if token[0].isupper():
                        new_token = new_token.capitalize()
                    expanded_tokens.append(new_token)
                    replaced = True
                    break
            if not replaced:
                expanded_tokens.append(token)
        return ' '.join(expanded_tokens)
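    # Illustrative call (word_tokenize splits the clitic off first):
    #   expand_contractions("They're ready") -> "They are ready"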
    def add_academic_transitions(self, sentence):
        """Prefix the sentence with a randomly chosen academic transition."""
        transition = random.choice(self.academic_transitions)
        return f"{transition} {sentence}"
    def convert_to_passive(self, sentence):
        """
        Naive active-to-passive rewrite for simple subject-verb-object
        sentences. The rewrite uses the bare verb lemma rather than an
        auxiliary plus past participle, so the output is approximate
        rather than fully grammatical.
        """
        doc = self.nlp(sentence)
        subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
        dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
        if subj_tokens and dobj_tokens:
            subject = subj_tokens[0]
            dobj = dobj_tokens[0]
            verb = subject.head
            # Only rewrite when the tokens appear in subject-verb-object order.
            if subject.i < verb.i < dobj.i:
                passive_str = f"{dobj.text} {verb.lemma_} by {subject.text}"
                original_str = ' '.join(token.text for token in doc)
                chunk = f"{subject.text} {verb.text} {dobj.text}"
                if chunk in original_str:
                    sentence = original_str.replace(chunk, passive_str)
        return sentence
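    # Parse-dependent example: the chunk check only matches when subject,
    # verb, and object are adjacent, so
    #   convert_to_passive("Dogs chase cats") -> "cats chase by Dogs"
    # while sentences with intervening determiners pass through unchanged.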
    def replace_with_synonyms(self, sentence):
        """
        Replace roughly half of the eligible content words (adjectives,
        nouns, verbs, adverbs) with their semantically closest WordNet
        synonym.
        """
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        new_tokens = []
        for word, pos in pos_tags:
            new_token = word
            if (pos.startswith(('J', 'N', 'V', 'R'))
                    and wordnet.synsets(word)
                    and random.random() < 0.5):
                synonyms = self._get_synonyms(word, pos)
                if synonyms:
                    best_synonym = self._select_closest_synonym(word, synonyms)
                    if best_synonym:
                        new_token = best_synonym
            new_tokens.append(new_token)

        # Join tokens, then reattach punctuation that word_tokenize split off.
        sentence = " ".join(new_tokens)
        sentence = (
            sentence.replace(" ,", ",")
            .replace(" .", ".")
            .replace(" !", "!")
            .replace(" ?", "?")
            .replace(" :", ":")
            .replace(" '", "'")
        )
        return sentence
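    # Stochastic by design: repeated calls on the same sentence can yield
    # different rewrites, e.g. "important" -> "significant" (an illustrative
    # pairing, not a guaranteed output).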
    def _get_synonyms(self, word, pos):
        """
        Collect WordNet lemma names for `word`, restricted to the WordNet
        POS corresponding to the Penn Treebank tag `pos`.
        """
        if pos.startswith('J'):
            wn_pos = wordnet.ADJ
        elif pos.startswith('N'):
            wn_pos = wordnet.NOUN
        elif pos.startswith('R'):
            wn_pos = wordnet.ADV
        elif pos.startswith('V'):
            wn_pos = wordnet.VERB
        else:
            wn_pos = None
        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                lemma_name = lemma.name().replace('_', ' ')
                if lemma_name.lower() != word.lower():
                    synonyms.add(lemma_name)
        return list(synonyms)
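    # e.g. _get_synonyms("happy", "JJ") may return ['felicitous', 'glad', ...]
    # (built from a set, so ordering varies across runs).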
    def _select_closest_synonym(self, original_word, synonyms):
        """
        Return the synonym whose sentence-transformer embedding is most
        similar to the original word, or None if no candidate clears the
        0.5 cosine-similarity threshold.
        """
        if not synonyms:
            return None
        original_emb = self.model.encode(original_word, convert_to_tensor=True)
        synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
        cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
        max_score_index = cos_scores.argmax().item()
        max_score = cos_scores[max_score_index].item()
        if max_score >= 0.5:
            return synonyms[max_score_index]
        return None
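

# Minimal usage sketch. The sample sentence and parameter values below are
# illustrative assumptions, not taken from the original app.
if __name__ == "__main__":
    download_nltk_resources()
    humanizer = TextHumanizer(seed=42)
    sample = "We don't ignore the results, and they're significant."
    print(humanizer.humanize_text(sample, use_passive=True, use_synonyms=True))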