File size: 5,614 Bytes
45856e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341a65f
 
45856e0
 
 
 
 
 
 
 
9b37b0e
45856e0
 
b9f7415
9b37b0e
b91f3bc
 
9b37b0e
b91f3bc
9b37b0e
 
 
b91f3bc
 
9b37b0e
45856e0
9b37b0e
 
 
b91f3bc
 
 
 
9b37b0e
 
 
b91f3bc
 
9b37b0e
 
 
45856e0
 
 
e76f24f
b91f3bc
 
 
 
 
e76f24f
 
 
b91f3bc
 
e76f24f
 
 
45856e0
9b37b0e
45856e0
 
 
 
 
 
 
9b37b0e
45856e0
fb2fb4c
45856e0
 
 
 
 
 
341a65f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
from .constants import TIME_ALIASES, LEFT_X_ALIASES, LEFT_Y_ALIASES, RIGHT_X_ALIASES, RIGHT_Y_ALIASES, X_ALIASES, \
    Y_ALIASES
from .detectors import saccade_detection
from typing import Union

class Eye2SacExtractor:
    """Turn raw gaze samples into a table of detected saccades.

    Gaze samples are taken from a DataFrame or from a ``.csv``/``.txt`` file,
    rows with missing values are dropped, the time/x/y columns are located
    (by explicit header or by known aliases) and the resulting arrays are
    handed to ``saccade_detection``.
    """

    # Placeholders; every call of extract_features() overwrites them on the
    # instance.
    data: pd.DataFrame = None
    x: np.ndarray = None
    y: np.ndarray = None
    time: np.ndarray = None

    def _load_data(self, file_path: str) -> pd.DataFrame:
        """Read a gaze file into a DataFrame.

        Args:
            file_path: Path ending in ``.csv`` (comma or semicolon separated,
                detected automatically) or ``.txt`` (tab separated). The
                extension is matched case-insensitively.

        Returns:
            The parsed file.

        Raises:
            ValueError: If the extension is neither ``.csv`` nor ``.txt``, or
                if the CSV separator cannot be determined.
        """
        lowered_path = file_path.lower()
        if lowered_path.endswith('.csv'):
            sep = self.__derive_separator(file_path)
            # ``sep`` has to be passed by keyword: since pandas 2.0 every
            # read_csv argument after the path is keyword-only.
            return pd.read_csv(file_path, sep=sep)
        if lowered_path.endswith('.txt'):
            return pd.read_csv(file_path, sep='\t')
        raise ValueError('File format not supported. Please provide a csv or txt file.')

    def _clean_data(self):
        """Drop every row of ``self.data`` that contains a missing value.

        The filtered frame is re-assigned instead of dropping in place, so a
        DataFrame passed in by the caller is left unmodified.
        """
        self.data = self.data.dropna()

    def _map_relevant_data(self, time_header: str, x_headers: Union[str, list], y_headers: Union[str, list]):
        """Fill ``self.time``, ``self.x`` and ``self.y`` from ``self.data``.

        Args:
            time_header: Name of the timestamp column.
            x_headers: One column name, or a list/tuple holding one name or a
                left/right pair of names. A pair is averaged element-wise.
            y_headers: Same as ``x_headers`` for the vertical coordinate.

        Raises:
            ValueError: If a column cannot be found or a header list has a
                size other than 1 or 2.
        """
        try:
            self.time = self._get_value_array(time_header, TIME_ALIASES)
            self.x = self._resolve_axis(
                x_headers, X_ALIASES, LEFT_X_ALIASES, RIGHT_X_ALIASES, 'x_headers')
            self.y = self._resolve_axis(
                y_headers, Y_ALIASES, LEFT_Y_ALIASES, RIGHT_Y_ALIASES, 'y_headers')
        except KeyError as err:
            raise ValueError(
                'Required data columns are missing or not in the correct naming format.'
            ) from err

    def _resolve_axis(self, headers, single_aliases, left_aliases, right_aliases, label: str) -> np.ndarray:
        """Return the coordinate array for one axis (shared by x and y)."""
        if isinstance(headers, str):
            return self._get_value_array(headers, single_aliases)
        if isinstance(headers, (list, tuple)):
            # A single header wrapped in a list is treated like a plain string.
            if len(headers) == 1:
                return self._get_value_array(headers[0], single_aliases)
            if len(headers) == 2:
                left = self._get_value_array(headers[0], left_aliases)
                right = self._get_value_array(headers[1], right_aliases)
                # Combine both eyes into one signal by averaging sample-wise.
                return np.mean([left, right], axis=0)
        raise ValueError(f'invalid size of {label}')

    def _get_value_array(self, header_name: str, known_names: list) -> np.ndarray:
        """Return one column of ``self.data`` as a flat numpy array.

        The column is looked up by ``header_name`` first. If no column has
        exactly that label, ``header_name`` and then each entry of
        ``known_names`` (in order) are compared case-insensitively against the
        column labels, and the first match is used.

        Args:
            header_name: Requested column label.
            known_names: Fallback aliases, highest priority first.

        Returns:
            The values of the selected column.

        Raises:
            ValueError: If neither the header nor any alias matches a column.
        """
        if header_name in self.data.columns:
            return self.data[header_name].to_numpy().flatten()

        # Map lower-cased label -> real label so the frame is always indexed
        # with a label that actually exists; the first occurrence wins.
        actual_by_lowercase = {}
        for column in self.data.columns:
            actual_by_lowercase.setdefault(str(column).lower(), column)

        candidates = list(known_names)
        if isinstance(header_name, str):
            candidates.insert(0, header_name)

        for candidate in candidates:
            key = str(candidate).lower()
            if key in actual_by_lowercase:
                column = actual_by_lowercase[key]
                print(f"Using alternative columns: {column}")
                # Exactly one column is returned, so the result keeps one
                # value per sample even when several aliases are present.
                return self.data[column].to_numpy().flatten()

        raise ValueError(f'Invalid data format: header {header_name} not found.')

    def extract_features(self, data: Union[pd.DataFrame, str], time_header: str, x_headers: Union[str, list], y_headers: Union[str, list], missing: float = 0.0, minlen: int = 5, maxvel: int = 40, maxacc: int = 340) -> pd.DataFrame:
        """Load, clean and map the gaze data, then run saccade detection.

        Args:
            data: A DataFrame or a path to a ``.csv``/``.txt`` file.
            time_header: Name of the timestamp column.
            x_headers: Column name(s) of the horizontal gaze coordinate.
            y_headers: Column name(s) of the vertical gaze coordinate.
            missing: Forwarded to ``saccade_detection`` as ``missing``.
            minlen: Forwarded to ``saccade_detection`` as ``minlen``.
            maxvel: Forwarded to ``saccade_detection`` as ``maxvel``.
            maxacc: Forwarded to ``saccade_detection`` as ``maxacc``.

        The defaults of the four detector parameters mirror those of
        ``_extract_features``.

        Returns:
            One row per detected saccade, see ``_extract_features``.

        Raises:
            ValueError: If ``data`` is neither a DataFrame nor a string, or if
                loading or column mapping fails.
        """
        if isinstance(data, pd.DataFrame):
            self.data = data
        elif isinstance(data, str):
            self.data = self._load_data(data)
        else:
            raise ValueError('Data must be a pandas DataFrame or a file path to a csv or txt file.')
        self._clean_data()
        self._map_relevant_data(time_header, x_headers, y_headers)

        return self._extract_features(missing, minlen, maxvel, maxacc)

    def _extract_features(self, missing: float = 0.0, minlen: int = 5, maxvel: int = 40, maxacc: int = 340) -> pd.DataFrame:
        """Run ``saccade_detection`` on the mapped arrays.

        Only the second element of the detector's return value is used; it is
        wrapped in a DataFrame with the columns ``starttime``, ``endtime``,
        ``duration``, ``startx``, ``starty``, ``endx`` and ``endy``.
        """
        _, esac = saccade_detection(
            self.x, self.y, self.time,
            missing=missing, minlen=minlen, maxvel=maxvel, maxacc=maxacc,
        )
        return pd.DataFrame(
            esac,
            columns=['starttime', 'endtime', 'duration', 'startx', 'starty', 'endx', 'endy'],
        )

    def __derive_separator(self, file_path):
        """Guess whether a CSV file is separated by ``,`` or ``;``.

        The header is parsed once per candidate and the candidate yielding
        more columns wins.

        Raises:
            ValueError: If both candidates yield the same number of columns
                (including the case where neither can be parsed).
        """
        column_counts = {}
        for candidate in (',', ';'):
            try:
                # nrows=0 parses the header only; the column count does not
                # depend on the data rows, so the file body is not read here.
                header_only = pd.read_csv(file_path, sep=candidate, nrows=0)
                column_counts[candidate] = len(header_only.columns)
            except Exception:
                # Deliberate best-effort probe: a candidate that cannot be
                # parsed simply counts as zero columns.
                column_counts[candidate] = 0

        if column_counts[','] > column_counts[';']:
            return ','
        if column_counts[';'] > column_counts[',']:
            return ';'
        raise ValueError('Columns separator in CSV not supported. Make sure to use either , or ; as separator')