eyetrack-to-sacc-pipeline / eyetrack2saccade.py

Georg Willer

Change default header names to start with an uppercase letter & add checks for alternative time_header names

b9f7415 2 months ago

5.61 kB

	import pandas as pd
	import numpy as np
	from .constants import TIME_ALIASES, LEFT_X_ALIASES, LEFT_Y_ALIASES, RIGHT_X_ALIASES, RIGHT_Y_ALIASES, X_ALIASES, \
	Y_ALIASES
	from .detectors import saccade_detection
	from typing import Union

	class Eye2SacExtractor:
	    """Load eye-tracking samples and run saccade detection on them.

	    The extractor accepts either a DataFrame or a path to a ``.csv``/``.txt``
	    file, drops rows containing missing values, resolves the time/x/y columns
	    (falling back to lists of known alias names) and passes the resulting
	    arrays to ``saccade_detection``.
	    """

	    # Table currently being processed (set by ``extract_features``).
	    data: Union[pd.DataFrame, None] = None
	    # Coordinate and time arrays resolved by ``_map_relevant_data``.
	    x: Union[np.ndarray, None] = None
	    y: Union[np.ndarray, None] = None
	    time: Union[np.ndarray, None] = None

	    def _load_data(self, file_path: str) -> pd.DataFrame:
	        """Read a ``.csv`` or ``.txt`` file into a DataFrame.

	        ``.csv`` files may be comma- or semicolon-separated (detected
	        automatically); ``.txt`` files are read as tab-separated. The
	        extension check is case-insensitive.

	        Raises:
	            ValueError: if the extension is neither ``.csv`` nor ``.txt`` or
	                the CSV separator cannot be determined.
	        """
	        lowered_path = file_path.lower()
	        if lowered_path.endswith('.csv'):
	            sep = self.__derive_separator(file_path)
	            # ``sep`` must be passed by keyword: pandas >= 2.0 made every
	            # read_csv argument after the path keyword-only.
	            return pd.read_csv(file_path, sep=sep)
	        if lowered_path.endswith('.txt'):
	            return pd.read_csv(file_path, sep='\t')
	        raise ValueError('File format not supported. Please provide a csv or txt file.')

	    def _clean_data(self):
	        """Drop every row of ``self.data`` that contains a missing value.

	        A new frame is assigned instead of dropping in place so that a
	        DataFrame handed in by the caller is never modified.
	        """
	        self.data = self.data.dropna()

	    def _map_relevant_data(self, time_header: str, x_headers: Union[str, list],
	                           y_headers: Union[str, list]):
	        """Resolve the time, x and y columns into ``self.time``/``self.x``/``self.y``.

	        Args:
	            time_header: name of the time column.
	            x_headers: one column name, or a list of one or two names. Two
	                names are treated as left/right columns and averaged
	                element-wise.
	            y_headers: same as ``x_headers`` for the y coordinate.

	        Raises:
	            ValueError: if a column cannot be found or a header list has an
	                unsupported size.
	        """
	        try:
	            self.time = self._get_value_array(time_header, TIME_ALIASES)
	            self.x = self._resolve_axis(
	                x_headers, X_ALIASES, LEFT_X_ALIASES, RIGHT_X_ALIASES, 'x_headers')
	            self.y = self._resolve_axis(
	                y_headers, Y_ALIASES, LEFT_Y_ALIASES, RIGHT_Y_ALIASES, 'y_headers')
	        except KeyError as err:
	            raise ValueError(
	                'Required data columns are missing or not in the correct naming format.'
	            ) from err

	    def _resolve_axis(self, headers, single_aliases: list, left_aliases: list,
	                      right_aliases: list, label: str) -> np.ndarray:
	        """Return the value array for one axis from one or two headers."""
	        if isinstance(headers, str):
	            return self._get_value_array(headers, single_aliases)
	        if isinstance(headers, (list, tuple)):
	            # A single header wrapped in a list is accepted as a convenience.
	            if len(headers) == 1:
	                return self._get_value_array(headers[0], single_aliases)
	            if len(headers) == 2:
	                left = self._get_value_array(headers[0], left_aliases)
	                right = self._get_value_array(headers[1], right_aliases)
	                # Element-wise mean of the left and right columns.
	                return np.mean([left, right], axis=0)
	        raise ValueError(f'invalid size of {label}')

	    def _get_value_array(self, header_name: str, known_names: list) -> np.ndarray:
	        """Return the values of one column as a flat NumPy array.

	        ``header_name`` is used when it exists in ``self.data``. Otherwise the
	        aliases in ``known_names`` are compared case-insensitively against the
	        column names, and the first alias (in list order) that matches a
	        column selects that single column.

	        Raises:
	            ValueError: if neither ``header_name`` nor any alias matches.
	        """
	        if header_name in self.data.columns:
	            return self.data[header_name].to_numpy().flatten()

	        # Map the lower-cased name back to the real column label so the frame
	        # is indexed with a label that actually exists.
	        original_by_lower = {}
	        for column in self.data.columns:
	            original_by_lower.setdefault(str(column).lower(), column)

	        for alias in known_names:
	            alias_key = str(alias).lower()
	            if alias_key in original_by_lower:
	                column = original_by_lower[alias_key]
	                print(f"Using alternative columns: {column}")
	                return self.data[column].to_numpy().flatten()

	        raise ValueError(f'Invalid data format: header {header_name} not found.')

	    def extract_features(self, data: Union[pd.DataFrame, str], time_header: str,
	                         x_headers: Union[str, list], y_headers: Union[str, list],
	                         missing: float = 0.0, minlen: int = 5, maxvel: int = 40,
	                         maxacc: int = 340) -> pd.DataFrame:
	        """Run the full pipeline: load, clean, map columns, detect saccades.

	        Args:
	            data: a DataFrame, or a path to a ``.csv``/``.txt`` file.
	            time_header: name of the time column.
	            x_headers: x column name, or list of one or two names.
	            y_headers: y column name, or list of one or two names.
	            missing, minlen, maxvel, maxacc: forwarded unchanged to
	                ``saccade_detection``; defaults mirror ``_extract_features``.

	        Returns:
	            DataFrame with columns ``starttime``, ``endtime``, ``duration``,
	            ``startx``, ``starty``, ``endx``, ``endy``.

	        Raises:
	            ValueError: if ``data`` is neither a DataFrame nor a string, or
	                if loading / column mapping fails.
	        """
	        if isinstance(data, pd.DataFrame):
	            self.data = data
	        elif isinstance(data, str):
	            self.data = self._load_data(data)
	        else:
	            raise ValueError('Data must be a pandas DataFrame or a file path to a csv or txt file.')
	        self._clean_data()
	        self._map_relevant_data(time_header, x_headers, y_headers)

	        return self._extract_features(missing, minlen, maxvel, maxacc)

	    def _extract_features(self, missing: float = 0.0, minlen: int = 5, maxvel: int = 40,
	                          maxacc: int = 340) -> pd.DataFrame:
	        """Call ``saccade_detection`` on the mapped arrays and tabulate its second result."""
	        # Only the second element of the returned pair is used here.
	        _, esac = saccade_detection(self.x, self.y, self.time, missing=missing,
	                                    minlen=minlen, maxvel=maxvel, maxacc=maxacc)
	        return pd.DataFrame(
	            esac,
	            columns=['starttime', 'endtime', 'duration', 'startx', 'starty', 'endx', 'endy'],
	        )

	    def __derive_separator(self, file_path):
	        """Guess whether a CSV file uses ``,`` or ``;`` as column separator.

	        The file is parsed once with each candidate; the candidate that
	        yields more columns wins.

	        Raises:
	            ValueError: if both candidates yield the same number of columns
	                (including the case where both parses fail).
	        """
	        column_counts = {}
	        for candidate in (',', ';'):
	            try:
	                column_counts[candidate] = len(pd.read_csv(file_path, sep=candidate).columns)
	            except Exception:
	                # Deliberate best effort: a failed parse counts as zero
	                # columns so the other candidate can still win.
	                column_counts[candidate] = 0

	        if column_counts[','] > column_counts[';']:
	            return ','
	        if column_counts[';'] > column_counts[',']:
	            return ';'
	        raise ValueError('Columns separator in CSV not supported. Make sure to use either , or ; as separator')