agent-course-gaia / file_processing_tools.py
kirbah's picture
support xls,csv files
e77bace
import pandas as pd
from smolagents import tool
from typing import Union, Optional
@tool
def get_csv_data_summary(file_path: str) -> str:
"""
Reads a CSV file from the given file path and returns a summary of its content.
The summary includes the number of rows and columns, column names, and basic descriptive statistics.
Args:
file_path (str): The absolute local path to the CSV file.
This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
Returns:
str: A string containing the data summary (shape, columns, descriptive statistics) or an error message if processing fails.
"""
try:
df = pd.read_csv(file_path)
summary = f"Successfully read CSV file: '{file_path}'\n"
summary += f"Number of rows: {len(df)}\n"
summary += f"Number of columns: {len(df.columns)}\n"
summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
summary += "Descriptive statistics:\n"
# include='all' for mixed types
summary += df.describe(include='all').to_string()
# For very wide dataframes, head might be more useful than full describe in limited contexts
# if len(df.columns) > 15:
# summary += "\n\nFirst 5 rows (due to large number of columns):\n"
# summary += df.head().to_string()
return summary
except ImportError:
return "Error: The 'pandas' library is required but not installed. Please ensure it is available in the agent's environment."
except FileNotFoundError:
return f"Error: The CSV file was not found at the specified path: '{file_path}'. Please verify the path."
except pd.errors.EmptyDataError:
return f"Error: The CSV file at '{file_path}' is empty."
except Exception as e:
return f"Error processing CSV file '{file_path}': {type(e).__name__} - {str(e)}"
@tool
def get_excel_data_summary(file_path: str, sheet_name: Optional[str] = "0") -> str:
"""
Reads an Excel file (supports .xls and .xlsx) from the given file path and returns a summary of the specified sheet's content.
The summary includes the number of rows and columns, column names, and basic descriptive statistics for that sheet.
Args:
file_path (str): The absolute local path to the Excel file.
This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
sheet_name (str | int | None, optional): The name of the sheet to read (e.g., "Sheet1") or its 0-indexed position (e.g., 0).
If None or 0, the first sheet is read. Defaults to 0 (the first sheet).
Returns:
str: A string containing the data summary from the specified sheet (shape, columns, descriptive statistics) or an error message.
"""
try:
# Determine engine based on file extension for clearer error messages if engine is missing
engine = None
if file_path.endswith('.xlsx'):
engine = 'openpyxl'
elif file_path.endswith('.xls'):
# or 'openpyxl' if xlrd is not available and openpyxl can handle it.
engine = 'xlrd'
actual_sheet_name_for_pandas: Union[str, int, None]
if sheet_name is None:
actual_sheet_name_for_pandas = 0 # Default to first sheet
elif sheet_name.isdigit():
actual_sheet_name_for_pandas = int(sheet_name)
else:
actual_sheet_name_for_pandas = sheet_name
df = pd.read_excel(
file_path, sheet_name=actual_sheet_name_for_pandas, engine=engine)
sheet_identifier = f"sheet '{sheet_name}'" if sheet_name is not None else "the first sheet"
summary = f"Successfully read {sheet_identifier} from Excel file: '{file_path}'\n"
summary += f"Number of rows: {len(df)}\n"
summary += f"Number of columns: {len(df.columns)}\n"
summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
summary += "Descriptive statistics:\n"
summary += df.describe(include='all').to_string()
return summary
except ImportError:
return ("Error: The 'pandas' library and an Excel engine ('openpyxl' for .xlsx, 'xlrd' for .xls) "
"are required. Please ensure they are available in the agent's environment.")
except FileNotFoundError:
return f"Error: The Excel file was not found at the specified path: '{file_path}'. Please verify the path."
except pd.errors.EmptyDataError: # Though less common for Excel sheets than CSVs
return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."
except ValueError as ve: # Catches incorrect sheet names/indices from pandas
if "sheet_name" in str(ve).lower():
return f"Error: Sheet '{sheet_name}' not found in Excel file '{file_path}'. Please check the sheet name or index."
return f"Error processing Excel file '{file_path}': ValueError - {str(ve)}"
except Exception as e:
# Specific check for missing engines, as pandas might raise a general Exception or ValueError
err_str = str(e).lower()
if "openpyxl" in err_str and "install openpyxl" in err_str:
return f"Error: Missing 'openpyxl' engine for Excel file '{file_path}'. Please install it."
if "xlrd" in err_str and ("install xlrd" in err_str or "support for .xls files" in err_str):
return f"Error: Missing 'xlrd' engine for .xls Excel file '{file_path}'. Please install it or try 'openpyxl' if compatible."
return f"Error processing Excel file '{file_path}': {type(e).__name__} - {str(e)}"