# Source: Hugging Face Space (status at capture time: running on CPU Upgrade).
"""Text preprocessing for radiology reports with complex Unicode and formatting. | |
Handles reports containing complex Unicode symbolic characters and non-standard | |
structural formatting that are not currently supported by the prompt and LangExtract | |
library. Prevents timeout issues by normalizing problematic characters and structures | |
to formats compatible with downstream processing. | |
Typical usage example: | |
from sanitize import preprocess_report | |
clean_text = preprocess_report(raw_report) | |
""" | |
from __future__ import annotations

import re

import ftfy
# Translation table mapping Unicode symbols to ASCII replacements.
# Keys are code points; values are the replacement text (may be multi-character).
_TRANSLATE = str.maketrans(
    {
        # Bullets
        0x2022: "*",  # BULLET
        0x25CF: "*",  # BLACK CIRCLE
        # Arrows
        0x27A1: "->",  # BLACK RIGHTWARDS ARROW
        0xF0E0: "->",  # Private Use Area; presumably a symbol-font arrow - TODO confirm
        0x2192: "->",  # RIGHTWARDS ARROW
        0x2190: "<-",  # LEFTWARDS ARROW
        # Math / clinical shorthand
        0x00D7: "x",  # MULTIPLICATION SIGN
        0x2191: "up",  # UPWARDS ARROW
        0x2642: "male",  # MALE SIGN
        0x2640: "female",  # FEMALE SIGN
        # Hyphens and dashes
        0x2010: "-",  # HYPHEN
        0x2013: "-",  # EN DASH
        0x2014: "-",  # EM DASH
        # Spaces
        0x00A0: " ",  # NO-BREAK SPACE
    }
)
# Whitespace normalization patterns.
_WS = re.compile(r"[ \t]+")  # a run of spaces and/or tabs
_BLANKS = re.compile(r"\n\s*\n\s*\n+")  # 3+ newlines separated only by whitespace

# Structure normalization patterns
# Wrapper lines such as "--- BEGIN X-RAY REPORT ---". The title is matched
# lazily up to the closing "---" on the same line, so titles may themselves
# contain hyphens and a wrapper can never swallow text from following lines.
_BEGIN = re.compile(r"---\s*BEGIN [^\n]+?---\n*", re.I)
_END = re.compile(r"\n*---\s*END [^\n]+?---\s*", re.I)
# "*** TITLE ***" style headers; group 1 is the title without the asterisks.
_HEADER = re.compile(r"\*{3}\s*([^*]+?)\s*\*{3}", re.I)
# Leading bullet markers at the start of a line. A hyphen directly followed by
# ">" is NOT treated as a bullet, so an ASCII arrow "->" at the start of a line
# (which _TRANSLATE produces from Unicode arrows) is left intact.
_BULLET_HDR = re.compile(r"^[ \t]*(?:[\*\u2022\u25CF]|-(?!>))+\s*", re.M)
# Line-leading enumerators "1)" or "1."; group 1 is the number.
_ENUM = re.compile(r"^[ \t]*(\d+)[\)\.][ \t]+", re.M)
def sanitize_text(text: str) -> str:
    """Sanitizes Unicode characters and normalizes whitespace.

    Applies ftfy text repair, translates problematic Unicode symbols to ASCII
    equivalents, normalizes whitespace, and removes excessive blank lines.

    Args:
        text: The input text to sanitize.

    Returns:
        Sanitized text with Unicode issues resolved and whitespace normalized.
        Leading and trailing whitespace is stripped.
    """
    # Repair the text with ftfy, dropping control characters and composing to NFC.
    out = ftfy.fix_text(text, remove_control_chars=True, normalization="NFC")
    # Replace symbols (bullets, arrows, dashes, NBSP, ...) with ASCII text.
    # This runs before whitespace collapsing so that no-break spaces, once
    # mapped to plain spaces, are collapsed together with their neighbours.
    out = out.translate(_TRANSLATE)
    # Collapse each run of spaces/tabs into a single space.
    out = _WS.sub(" ", out)
    # Normalize any remaining CRLF / CR line endings to LF.
    out = out.replace("\r\n", "\n").replace("\r", "\n")
    # Reduce runs of blank lines to exactly one blank line.
    out = _BLANKS.sub("\n\n", out)
    return out.strip()
def normalize_structure(text: str) -> str:
    """Normalizes structural elements in radiology reports.

    Removes report wrappers, converts asterisk headers to colon format,
    removes bullet prefixes, and standardizes enumerations.

    Args:
        text: The input text to normalize.

    Returns:
        Text with structural elements normalized for consistent formatting.
        Leading and trailing whitespace is stripped.
    """
    # Drop the "--- BEGIN ... ---" / "--- END ... ---" wrapper lines first:
    # the bullet pattern below also matches leading hyphens and would
    # otherwise mangle them.
    text = _BEGIN.sub("", text)
    text = _END.sub("", text)
    # "*** TITLE ***" -> "TITLE:". Must run before bullet stripping, because
    # the bullet pattern also matches leading asterisks.
    text = _HEADER.sub(lambda m: f"{m.group(1).strip()}:", text)
    # Remove bullet markers at the start of each line.
    text = _BULLET_HDR.sub("", text)
    # "1)" / "1." followed by spaces or tabs -> "1. " (single space).
    text = _ENUM.sub(lambda m: f"{m.group(1)}. ", text)
    return text.strip()
def preprocess_report(raw: str) -> str:
    """Preprocesses radiology reports with sanitization and normalization.

    Combines Unicode sanitization and structural normalization to prepare
    radiology reports for downstream processing. This is the main entry point
    for text preprocessing.

    Args:
        raw: The raw radiology report text.

    Returns:
        Preprocessed text ready for structured extraction.
    """
    # Sanitize first so that structure patterns see ASCII bullets/dashes and
    # normalized line endings.
    sanitized = sanitize_text(raw)
    return normalize_structure(sanitized)