Spaces:
Sleeping
Sleeping
import pandas as pd | |
from smolagents import tool | |
from typing import Union, Optional | |
def get_csv_data_summary(file_path: str) -> str:
    """
    Loads a CSV file from disk and describes its contents as plain text.
    The report lists the row count, the column count, the column names, and the
    output of pandas' descriptive statistics for every column.

    Args:
        file_path (str): The absolute local path to the CSV file.
            This path should be obtained from the 'File Information' section if the file was downloaded by the agent.

    Returns:
        str: The textual report (shape, columns, descriptive statistics), or a message starting
            with "Error" when the file cannot be read or summarised. No exception is propagated.
    """
    try:
        frame = pd.read_csv(file_path)
        row_count, column_count = frame.shape
        column_listing = ', '.join(frame.columns.astype(str))
        # include='all' so that non-numeric columns are described as well.
        stats_table = frame.describe(include='all').to_string()
        # NOTE: for very wide frames a frame.head() preview may be more useful
        # than the full describe() table; only describe() is emitted here.
        report_lines = [
            f"Successfully read CSV file: '{file_path}'",
            f"Number of rows: {row_count}",
            f"Number of columns: {column_count}",
            f"Column names: {column_listing}",
            "",  # blank line separating the header from the statistics
            "Descriptive statistics:",
            stats_table,
        ]
        return "\n".join(report_lines)
    except ImportError:
        return "Error: The 'pandas' library is required but not installed. Please ensure it is available in the agent's environment."
    except FileNotFoundError:
        return f"Error: The CSV file was not found at the specified path: '{file_path}'. Please verify the path."
    except pd.errors.EmptyDataError:
        return f"Error: The CSV file at '{file_path}' is empty."
    except Exception as exc:
        # Last-resort boundary: the caller always receives a string, never a traceback.
        return f"Error processing CSV file '{file_path}': {type(exc).__name__} - {str(exc)}"
def get_excel_data_summary(file_path: str, sheet_name: Optional[str] = "0") -> str:
    """
    Reads an Excel file (supports .xls and .xlsx) from the given file path and returns a summary of the specified sheet's content.
    The summary includes the number of rows and columns, column names, and basic descriptive statistics for that sheet.

    Args:
        file_path (str): The absolute local path to the Excel file.
            This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
        sheet_name (str | int | None, optional): The name of the sheet to read (e.g., "Sheet1") or its 0-indexed position,
            given either as an integer (e.g., 0) or as a string of decimal digits (e.g., "0").
            A string consisting only of decimal digits is always treated as a position, not as a name.
            If None, the first sheet is read. Defaults to "0" (the first sheet).

    Returns:
        str: A string containing the data summary from the specified sheet (shape, columns, descriptive statistics) or an error message.
            No exception is propagated to the caller.
    """
    try:
        # Pin the engine by file extension so a missing engine produces a clear error.
        engine = None
        if file_path.endswith('.xlsx'):
            engine = 'openpyxl'
        elif file_path.endswith('.xls'):
            engine = 'xlrd'

        # Translate the (string-typed) tool argument into what pandas expects.
        actual_sheet_name_for_pandas: Union[str, int]
        if sheet_name is None:
            actual_sheet_name_for_pandas = 0  # Default to first sheet
        elif isinstance(sheet_name, int):
            # The docstring advertises integer indices; accept them directly
            # instead of failing on the string-only digit check below.
            actual_sheet_name_for_pandas = sheet_name
        elif sheet_name.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
            actual_sheet_name_for_pandas = int(sheet_name)
        else:
            actual_sheet_name_for_pandas = sheet_name

        df = pd.read_excel(
            file_path, sheet_name=actual_sheet_name_for_pandas, engine=engine)

        if df.columns.empty:
            # describe() raises on a frame without columns; report the empty
            # sheet with the same message used for EmptyDataError below.
            return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."

        sheet_identifier = f"sheet '{sheet_name}'" if sheet_name is not None else "the first sheet"
        summary_lines = [
            f"Successfully read {sheet_identifier} from Excel file: '{file_path}'",
            f"Number of rows: {len(df)}",
            f"Number of columns: {len(df.columns)}",
            f"Column names: {', '.join(df.columns.astype(str))}",
            "",  # blank line before the statistics table
            "Descriptive statistics:",
            # include='all' so non-numeric columns are described as well.
            df.describe(include='all').to_string(),
        ]
        return "\n".join(summary_lines)
    except ImportError:
        return ("Error: The 'pandas' library and an Excel engine ('openpyxl' for .xlsx, 'xlrd' for .xls) "
                "are required. Please ensure they are available in the agent's environment.")
    except FileNotFoundError:
        return f"Error: The Excel file was not found at the specified path: '{file_path}'. Please verify the path."
    except pd.errors.EmptyDataError:  # Though less common for Excel sheets than CSVs
        return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."
    except ValueError as ve:  # Catches incorrect sheet names/indices from pandas
        ve_text = str(ve).lower()
        # pandas words these errors as "Worksheet named '...' not found" /
        # "Worksheet index N is invalid"; "sheet_name" is kept for other wordings.
        if "sheet_name" in ve_text or "worksheet" in ve_text:
            return f"Error: Sheet '{sheet_name}' not found in Excel file '{file_path}'. Please check the sheet name or index."
        return f"Error processing Excel file '{file_path}': ValueError - {str(ve)}"
    except Exception as e:
        # Specific check for missing engines, as pandas might raise a general Exception or ValueError
        err_str = str(e).lower()
        if "openpyxl" in err_str and "install openpyxl" in err_str:
            return f"Error: Missing 'openpyxl' engine for Excel file '{file_path}'. Please install it."
        if "xlrd" in err_str and ("install xlrd" in err_str or "support for .xls files" in err_str):
            return f"Error: Missing 'xlrd' engine for .xls Excel file '{file_path}'. Please install it or try 'openpyxl' if compatible."
        return f"Error processing Excel file '{file_path}': {type(e).__name__} - {str(e)}"