agent-course-gaia

Sleeping

App Files Files Community

agent-course-gaia / file_processing_tools.py

kirbah

support xls,csv files

e77bace 8 days ago

raw

history blame contribute delete

5.79 kB

	import pandas as pd
	from smolagents import tool
	from typing import Union, Optional


	@tool
	def get_csv_data_summary(file_path: str) -> str:
	"""
	Reads a CSV file from the given file path and returns a summary of its content.
	The summary includes the number of rows and columns, column names, and basic descriptive statistics.

	Args:
	file_path (str): The absolute local path to the CSV file.
	This path should be obtained from the 'File Information' section if the file was downloaded by the agent.

	Returns:
	str: A string containing the data summary (shape, columns, descriptive statistics) or an error message if processing fails.
	"""
	try:
	df = pd.read_csv(file_path)

	summary = f"Successfully read CSV file: '{file_path}'\n"
	summary += f"Number of rows: {len(df)}\n"
	summary += f"Number of columns: {len(df.columns)}\n"
	summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
	summary += "Descriptive statistics:\n"
	# include='all' for mixed types
	summary += df.describe(include='all').to_string()

	# For very wide dataframes, head might be more useful than full describe in limited contexts
	# if len(df.columns) > 15:
	# summary += "\n\nFirst 5 rows (due to large number of columns):\n"
	# summary += df.head().to_string()

	return summary
	except ImportError:
	return "Error: The 'pandas' library is required but not installed. Please ensure it is available in the agent's environment."
	except FileNotFoundError:
	return f"Error: The CSV file was not found at the specified path: '{file_path}'. Please verify the path."
	except pd.errors.EmptyDataError:
	return f"Error: The CSV file at '{file_path}' is empty."
	except Exception as e:
	return f"Error processing CSV file '{file_path}': {type(e).__name__} - {str(e)}"


	@tool
	def get_excel_data_summary(file_path: str, sheet_name: Optional[str] = "0") -> str:
	"""
	Reads an Excel file (supports .xls and .xlsx) from the given file path and returns a summary of the specified sheet's content.
	The summary includes the number of rows and columns, column names, and basic descriptive statistics for that sheet.

	Args:
	file_path (str): The absolute local path to the Excel file.
	This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
	sheet_name (str \| int \| None, optional): The name of the sheet to read (e.g., "Sheet1") or its 0-indexed position (e.g., 0).
	If None or 0, the first sheet is read. Defaults to 0 (the first sheet).

	Returns:
	str: A string containing the data summary from the specified sheet (shape, columns, descriptive statistics) or an error message.
	"""
	try:
	# Determine engine based on file extension for clearer error messages if engine is missing
	engine = None
	if file_path.endswith('.xlsx'):
	engine = 'openpyxl'
	elif file_path.endswith('.xls'):
	# or 'openpyxl' if xlrd is not available and openpyxl can handle it.
	engine = 'xlrd'

	actual_sheet_name_for_pandas: Union[str, int, None]
	if sheet_name is None:
	actual_sheet_name_for_pandas = 0 # Default to first sheet
	elif sheet_name.isdigit():
	actual_sheet_name_for_pandas = int(sheet_name)
	else:
	actual_sheet_name_for_pandas = sheet_name

	df = pd.read_excel(
	file_path, sheet_name=actual_sheet_name_for_pandas, engine=engine)

	sheet_identifier = f"sheet '{sheet_name}'" if sheet_name is not None else "the first sheet"
	summary = f"Successfully read {sheet_identifier} from Excel file: '{file_path}'\n"
	summary += f"Number of rows: {len(df)}\n"
	summary += f"Number of columns: {len(df.columns)}\n"
	summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
	summary += "Descriptive statistics:\n"
	summary += df.describe(include='all').to_string()

	return summary
	except ImportError:
	return ("Error: The 'pandas' library and an Excel engine ('openpyxl' for .xlsx, 'xlrd' for .xls) "
	"are required. Please ensure they are available in the agent's environment.")
	except FileNotFoundError:
	return f"Error: The Excel file was not found at the specified path: '{file_path}'. Please verify the path."
	except pd.errors.EmptyDataError: # Though less common for Excel sheets than CSVs
	return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."
	except ValueError as ve: # Catches incorrect sheet names/indices from pandas
	if "sheet_name" in str(ve).lower():
	return f"Error: Sheet '{sheet_name}' not found in Excel file '{file_path}'. Please check the sheet name or index."
	return f"Error processing Excel file '{file_path}': ValueError - {str(ve)}"
	except Exception as e:
	# Specific check for missing engines, as pandas might raise a general Exception or ValueError
	err_str = str(e).lower()
	if "openpyxl" in err_str and "install openpyxl" in err_str:
	return f"Error: Missing 'openpyxl' engine for Excel file '{file_path}'. Please install it."
	if "xlrd" in err_str and ("install xlrd" in err_str or "support for .xls files" in err_str):
	return f"Error: Missing 'xlrd' engine for .xls Excel file '{file_path}'. Please install it or try 'openpyxl' if compatible."
	return f"Error processing Excel file '{file_path}': {type(e).__name__} - {str(e)}"