# Spaces: Running
import logging
import re
from io import BytesIO
def setup_logging():
    """Configure root logging for console output and return this module's logger.

    The root logger is set to INFO with a single stream (console) handler and
    a ``timestamp - logger name - level - message`` line format.  Note that
    ``logging.basicConfig`` leaves an already-configured root logger untouched.

    Returns:
        The ``logging.Logger`` named after this module (``__name__``).
    """
    console_handler = logging.StreamHandler()  # console only, no file output
    line_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=line_format,
        handlers=[console_handler],
    )
    return logging.getLogger(__name__)
def meters_to_miles(meters):
    """Return the distance *meters* expressed in miles.

    Uses the rounded conversion factor 0.000621371 miles per meter.
    """
    miles_per_meter = 0.000621371
    return meters * miles_per_meter
def validate_excel_file(file_stream: BytesIO) -> tuple[bool, str]:
    """Check whether an uploaded stream starts with an Excel file signature.

    Only the first four bytes are inspected, so any ZIP archive or compound
    file passes, not just genuine workbooks.  The stream is rewound to
    position 0 after the header has been read.

    Args:
        file_stream: Binary stream holding the uploaded file.

    Returns:
        ``(True, message)`` when the signature matches, otherwise
        ``(False, reason)``.  Any exception raised while reading is reported
        as a ``(False, "Validation error: ...")`` result instead of
        propagating.
    """
    excel_signatures = (
        b'\x50\x4B\x03\x04',  # ZIP archive (xlsx)
        b'\xD0\xCF\x11\xE0',  # Compound File (xls)
    )
    try:
        magic = file_stream.read(4)
        file_stream.seek(0)  # rewind so later readers see the whole file
        if magic in excel_signatures:
            return True, "Valid Excel file"
        return False, "Invalid file type: Not an Excel file"
    except Exception as exc:
        return False, f"Validation error: {exc!s}"
# Upper-case street-type abbreviations and the full words they expand to.
_STREET_ABBREVIATIONS = {
    "ST": "STREET",
    "AVE": "AVENUE",
    "RD": "ROAD",
    "BLVD": "BOULEVARD",
    "DR": "DRIVE",
}

# Matches one abbreviation as a whole word, plus an optional trailing period.
# The word boundaries keep "WEST", "3RD" or "BOULEVARD" from being rewritten.
_STREET_ABBREVIATION_RE = re.compile(
    r"\b(" + "|".join(_STREET_ABBREVIATIONS) + r")\b\.?"
)


def clean_address(address):
    """Clean and standardize an address string.

    Runs of whitespace are collapsed to single spaces, leading and trailing
    whitespace is removed, and the upper-case street-type abbreviations
    ST, AVE, RD, BLVD and DR (with or without a trailing period) are expanded
    to their full words.  Matching is case-sensitive, so mixed- or lower-case
    abbreviations such as "St." are left untouched.

    Abbreviations are only expanded when they stand alone as a word; they are
    recognised at the end of the string and before punctuation as well as
    before a space.

    Args:
        address: The raw address.  Any non-string value is treated as missing.

    Returns:
        The cleaned address, or ``""`` when *address* is not a string.
    """
    if not isinstance(address, str):
        return ""

    # split()/join collapses every whitespace run and trims both ends.
    normalized = " ".join(address.split())

    return _STREET_ABBREVIATION_RE.sub(
        lambda match: _STREET_ABBREVIATIONS[match.group(1)],
        normalized,
    )
def handle_empty_values(df, required_columns):
    """Return a copy of *df* with missing values in required columns blanked.

    Missing entries in each listed column are replaced by the empty string.
    Names in *required_columns* that are not columns of *df* are skipped, and
    the input DataFrame itself is never modified.

    Args:
        df: The source DataFrame.
        required_columns: Iterable of column names to fill.

    Returns:
        A new DataFrame with the fills applied.
    """
    result = df.copy()  # work on a copy so the caller's frame stays intact
    present = [name for name in required_columns if name in result.columns]
    for name in present:
        result[name] = result[name].fillna("")
    return result