File size: 822 Bytes
f80cf2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
helper.py

Utility functions for text processing and data cleaning.
"""

import re


def clean_text_whitespace(text: str) -> str:
    """
    Clean up text by normalizing whitespace and newlines.

    Runs of non-newline whitespace (spaces, tabs, carriage returns, ...)
    become a single space, whitespace left dangling at the end of a line is
    removed, runs of blank or whitespace-only lines collapse to a single
    newline, and the result is stripped at both ends.

    Args:
        text (str): Input text string to clean

    Returns:
        str: Cleaned text with normalized whitespace and newlines. Falsy
        values (``""``, ``None``) and non-string inputs are returned
        unchanged.
    """
    if not text or not isinstance(text, str):
        return text

    # Collapse every run of whitespace that is not a newline into one space.
    # `[^\S\n]` means "whitespace, but not \n", so tabs and the `\r` of a
    # CRLF line ending are covered too.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # After the step above at most one space can precede a newline. Drop it,
    # otherwise whitespace-only lines ("a\n \nb") and CRLF input
    # ("a\r\n\r\nb") would keep a space between the newlines and escape the
    # collapse below.
    text = text.replace(' \n', '\n')

    # Replace multiple consecutive newlines with a single newline.
    text = re.sub(r'\n{2,}', '\n', text)

    # Strip leading and trailing whitespace (including newlines).
    return text.strip()