import ssl
import random
import warnings
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer, util
warnings.filterwarnings("ignore", category=FutureWarning)
# Module-level spaCy pipeline, loaded once and shared.
NLP_GLOBAL = spacy.load("en_core_web_sm")
def download_nltk_resources():
    """
    Download required NLTK resources if not already installed.

    Intended to be called once at startup, before using TextHumanizer.
    """
    # Fall back to an unverified HTTPS context so downloads work on systems
    # with missing SSL certificates (a common issue on macOS).
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    resources = [
        'punkt', 'punkt_tab', 'wordnet',
        'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    ]
    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"Error downloading {resource}: {e}")
class TextHumanizer:
    """
    Transforms text into a more formal (academic) style:
    - Expands contractions
    - Adds academic transitions
    - Optionally converts some sentences to passive voice
    - Optionally replaces words with synonyms for more formality
    """
    def __init__(
        self,
        model_name='paraphrase-MiniLM-L6-v2',
        p_passive=0.2,
        p_synonym_replacement=0.3,
        p_academic_transition=0.3,
        seed=None
    ):
        if seed is not None:
            random.seed(seed)  # make the randomized transformations reproducible
        self.nlp = NLP_GLOBAL  # reuse the module-level pipeline instead of loading a second copy
        self.model = SentenceTransformer(model_name)
        # Transformation probabilities
        self.p_passive = p_passive
        self.p_synonym_replacement = p_synonym_replacement
        self.p_academic_transition = p_academic_transition
        # Common academic transitions
        self.academic_transitions = [
            "Moreover,", "Additionally,", "Furthermore,", "Hence,",
            "Therefore,", "Consequently,", "Nonetheless,", "Nevertheless,"
        ]
    def humanize_text(self, text, use_passive=False, use_synonyms=False):
        """Apply the sentence-level transformations and return the rewritten text."""
        doc = self.nlp(text)
        transformed_sentences = []
        for sent in doc.sents:
            sentence_str = sent.text.strip()
            # 1. Expand contractions
            sentence_str = self.expand_contractions(sentence_str)
            # 2. Possibly add academic transitions (currently disabled)
            # if random.random() < self.p_academic_transition:
            #     sentence_str = self.add_academic_transitions(sentence_str)
            # 3. Optionally convert to passive voice
            if use_passive and random.random() < self.p_passive:
                sentence_str = self.convert_to_passive(sentence_str)
            # 4. Optionally replace words with synonyms
            if use_synonyms and random.random() < self.p_synonym_replacement:
                sentence_str = self.replace_with_synonyms(sentence_str)
            transformed_sentences.append(sentence_str)
        return ' '.join(transformed_sentences)
    def expand_contractions(self, sentence):
        """Expand common English contractions (e.g. "don't" -> "do not")."""
        # Note: "'s" is expanded to " is" indiscriminately, so possessives
        # ("John's book") are mangled; this mirrors the simple original mapping.
        contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am"
        }
        tokens = word_tokenize(sentence)
        expanded_tokens = []
        for token in tokens:
            lower_token = token.lower()
            replaced = False
            for contraction, expansion in contraction_map.items():
                if lower_token.endswith(contraction):
                    # strip() avoids a double space after ' '.join, since
                    # word_tokenize emits contractions as standalone tokens.
                    new_token = lower_token.replace(contraction, expansion).strip()
                    if token[0].isupper():
                        new_token = new_token.capitalize()
                    expanded_tokens.append(new_token)
                    replaced = True
                    break
            if not replaced:
                expanded_tokens.append(token)
        return ' '.join(expanded_tokens)
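
    # Illustrative example (not from the original module):
    #   expand_contractions("She's here and they're happy")
    #   -> "She is here and they are happy"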
    def add_academic_transitions(self, sentence):
        """Prefix the sentence with a randomly chosen academic transition."""
        transition = random.choice(self.academic_transitions)
        return f"{transition} {sentence}"
    def convert_to_passive(self, sentence):
        """Naive active-to-passive rewrite for simple subject-verb-object sentences."""
        doc = self.nlp(sentence)
        subj_tokens = [t for t in doc if t.dep_ == 'nsubj' and t.head.dep_ == 'ROOT']
        dobj_tokens = [t for t in doc if t.dep_ == 'dobj']
        if subj_tokens and dobj_tokens:
            subject = subj_tokens[0]
            dobj = dobj_tokens[0]
            verb = subject.head
            if subject.i < verb.i < dobj.i:
                # Naive regular inflection ("-ed"/"-d"); irregular participles
                # (e.g. "take" -> "taken") are not handled.
                participle = verb.lemma_ + ('d' if verb.lemma_.endswith('e') else 'ed')
                passive_str = f"{dobj.text} is {participle} by {subject.text}"
                original_str = ' '.join(token.text for token in doc)
                # Only rewrite when subject, verb, and object are adjacent,
                # so the string replacement cannot corrupt the sentence.
                chunk = f"{subject.text} {verb.text} {dobj.text}"
                if chunk in original_str:
                    sentence = original_str.replace(chunk, passive_str)
        return sentence
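
    # Illustrative example (not from the original module): with the naive
    # heuristic above, convert_to_passive("John loves Mary.") returns
    # "Mary is loved by John ." (the space before the period comes from
    # rejoining spaCy tokens with ' ').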
    def replace_with_synonyms(self, sentence):
        """Randomly swap adjectives, nouns, verbs, and adverbs for close synonyms."""
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        new_tokens = []
        for word, pos in pos_tags:
            # Only consider content words (J/N/V/R tags) that WordNet knows,
            # and only attempt a swap half the time.
            if (pos.startswith(('J', 'N', 'V', 'R')) and wordnet.synsets(word)
                    and random.random() < 0.5):
                synonyms = self._get_synonyms(word, pos)
                best_synonym = self._select_closest_synonym(word, synonyms) if synonyms else None
                new_tokens.append(best_synonym if best_synonym else word)
            else:
                new_tokens.append(word)
        # Join cleanly with punctuation fix
        sentence = " ".join(new_tokens)
        sentence = (
            sentence.replace(" ,", ",")
            .replace(" .", ".")
            .replace(" !", "!")
            .replace(" ?", "?")
            .replace(" :", ":")
            .replace(" '", "'")
        )
        return sentence
    def _get_synonyms(self, word, pos):
        """Collect WordNet lemmas for `word`, restricted to its Penn POS class."""
        # Map Penn Treebank tag prefixes to WordNet POS constants.
        if pos.startswith('J'):
            wn_pos = wordnet.ADJ
        elif pos.startswith('N'):
            wn_pos = wordnet.NOUN
        elif pos.startswith('R'):
            wn_pos = wordnet.ADV
        elif pos.startswith('V'):
            wn_pos = wordnet.VERB
        else:
            wn_pos = None  # unrestricted lookup
        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                lemma_name = lemma.name().replace('_', ' ')
                if lemma_name.lower() != word.lower():
                    synonyms.add(lemma_name)
        return list(synonyms)
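
    # Illustrative example (not from the original module): for ("happy", "JJ")
    # this typically yields candidates such as "glad" and "felicitous"; the
    # exact set depends on the installed WordNet version.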
    def _select_closest_synonym(self, original_word, synonyms):
        """Return the synonym most similar to the original word, or None.

        Similarity is cosine similarity between sentence-transformer embeddings;
        candidates below a 0.5 threshold are rejected to avoid drastic meaning shifts.
        """
        if not synonyms:
            return None
        original_emb = self.model.encode(original_word, convert_to_tensor=True)
        synonym_embs = self.model.encode(synonyms, convert_to_tensor=True)
        cos_scores = util.cos_sim(original_emb, synonym_embs)[0]
        max_score_index = cos_scores.argmax().item()
        max_score = cos_scores[max_score_index].item()
        if max_score >= 0.5:
            return synonyms[max_score_index]
        return None
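

# Minimal usage sketch (assumptions: run as a script with the models above
# installed; the sample sentence and seed are illustrative, not from the
# original module).
if __name__ == "__main__":
    download_nltk_resources()
    humanizer = TextHumanizer(seed=42)
    sample = "We didn't expect it, but it's producing readable results."
    print(humanizer.humanize_text(sample, use_passive=True, use_synonyms=True))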