"""Regex helpers that spell out "No.", "%", "½" and common title abbreviations."""

import re

# "No." (capital N only) when followed by a digit, optionally after one space.
_no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])")
# A percent sign together with one optional preceding space.
_percent_re = re.compile(r"([ ]?[%])")
# Group 1: a digit directly followed by "½"; group 2: a standalone "½".
_half_re = re.compile(r"([0-9]½)|(½)")

# List of (regular expression, replacement) pairs for abbreviations.
# Each pattern matches the abbreviation at a word boundary followed by a
# literal period, case-insensitively.
# NOTE(review): this table is not applied by normalize_abbreviations below;
# presumably it is consumed elsewhere -- verify against callers.
_abbreviations = [
    (re.compile(r"\b%s\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in [
        ("mrs", "misess"),
        ("ms", "miss"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    ]
]


def _expand_no_period(m):
    """Return the replacement for a "No." match.

    The capitalisation of the matched text is preserved ("Number" vs.
    "number"). With ``_no_period_re`` only the capitalised form can match,
    so the lowercase branch is a fallback for other patterns.

    When the digit follows the period directly (e.g. "No.5"), a space is
    appended so the result reads "Number 5" instead of "Number5".
    """
    word = "Number" if m.group(0)[0] == "N" else "number"
    # The pattern's lookahead does not consume the following character, so
    # inspect it in the original string to decide whether a space is needed.
    following = m.string[m.end():m.end() + 1]
    if following.isdigit():
        word += " "
    return word


def _expand_percent(m):
    """Return " percent" for a percent-sign match (optional space included)."""
    return " percent"


def _expand_half(m):
    """Return the spoken form of a "½" match.

    A digit directly followed by "½" becomes "<digit> and a half"; a
    standalone "½" becomes "half".
    """
    digit_and_half = m.group(1)
    if digit_and_half is None:
        return "half"
    return digit_and_half[0] + " and a half"


def normalize_abbreviations(text):
    """Expand "No." before digits, percent signs and "½" in *text*.

    The substitutions run in that order, each on the output of the
    previous one. Returns the rewritten string.
    """
    text = _no_period_re.sub(_expand_no_period, text)
    text = _percent_re.sub(_expand_percent, text)
    text = _half_re.sub(_expand_half, text)
    return text