hugpv committed
Commit da572bf · 1 Parent(s): 5c5f561

initial commit

Files changed (38)
  1. .gitignore +162 -0
  2. README.md +29 -13
  3. analysis_funcs.py +338 -0
  4. app.py +0 -0
  5. chars_df_columns.md +24 -0
  6. classic_correction_algos.py +552 -0
  7. emreading_funcs.py +994 -0
  8. eyekit_measures.py +194 -0
  9. fixations_df_columns.md +88 -0
  10. item_df_columns.md +4 -0
  11. loss_functions.py +97 -0
  12. models.py +892 -0
  13. models/BERT_20240104-223349_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00430.ckpt +3 -0
  14. models/BERT_20240104-233803_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00719.ckpt +3 -0
  15. models/BERT_20240107-152040_loop_restrict_sim_data_to_4000_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00515.ckpt +3 -0
  16. models/BERT_20240108-000344_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00706.ckpt +3 -0
  17. models/BERT_20240108-011230_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00560.ckpt +3 -0
  18. models/BERT_20240109-090419_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00518.ckpt +3 -0
  19. models/BERT_20240122-183729_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00523.ckpt +3 -0
  20. models/BERT_20240122-194041_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00462.ckpt +3 -0
  21. models/BERT_fin_exp_20240104-223349.yaml +100 -0
  22. models/BERT_fin_exp_20240104-233803.yaml +100 -0
  23. models/BERT_fin_exp_20240107-152040.yaml +100 -0
  24. models/BERT_fin_exp_20240108-000344.yaml +100 -0
  25. models/BERT_fin_exp_20240108-011230.yaml +100 -0
  26. models/BERT_fin_exp_20240109-090419.yaml +100 -0
  27. models/BERT_fin_exp_20240122-183729.yaml +102 -0
  28. models/BERT_fin_exp_20240122-194041.yaml +102 -0
  29. multi_proc_funcs.py +2415 -0
  30. popEye_funcs.py +1373 -0
  31. process_asc_files_in_multi_p.py +149 -0
  32. requirements.txt +25 -0
  33. saccades_df_columns.md +38 -0
  34. sentence_measures.md +35 -0
  35. subject_measures.md +15 -0
  36. trials_df_columns.md +36 -0
  37. utils.py +1349 -0
  38. word_measures.md +58 -0
.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
README.md CHANGED
@@ -1,13 +1,29 @@
1
- ---
2
- title: GazeGenie
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.38.0
8
- app_file: app.py
9
- pinned: false
10
- license: unknown
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # GazeGenie
2
+ A versatile tool for parsing, cleaning, aligning and analysing fixations from eye-tracking reading experiments
3
+
4
+ ## Use via huggingface spaces
5
+ In your browser, navigate to:
6
+
7
+ ## Run via Docker
8
+ mkdir results
9
+ docker run --name gazegenie_app -p 8501:8501 -v $pwd/results:/app/results dockinthehubbing/gaze_genie:latest
10
+
11
+ In your browser, navigate to: http://localhost:8501
12
+
13
+ To restart container later:
14
+ docker start -a gazegenie_app
15
+
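Note: `$pwd` resolves as the working-directory variable in PowerShell; in a bash shell the same volume mount would typically be written with command substitution, e.g. (hypothetical equivalent of the command above):

docker run --name gazegenie_app -p 8501:8501 -v "$(pwd)/results:/app/results" dockinthehubbing/gaze_genie:latest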
16
+ ## Local installation
17
+ #### Install conda to get python
18
+ https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Windows-x86_64.exe
19
+
20
+ #### Package installation in Terminal
21
+ mamba create -n eye python=3.11 -y
22
+ mamba activate eye
23
+ mamba install conda-forge::cairo
24
+ pip install -r requirements.txt
25
+
26
+ #### Run program from Terminal
27
+ conda activate eye
28
+ streamlit run app.py
29
+ In your browser, navigate to: http://localhost:8501
analysis_funcs.py ADDED
@@ -0,0 +1,338 @@
1
+ """
2
+ Partially taken and adapted from: https://github.com/jwcarr/eyekit/blob/1db1913411327b108b87e097a00278b6e50d0751/eyekit/measure.py
3
+ Functions for calculating common reading measures, such as gaze duration or initial landing position.
4
+ """
5
+
6
+ import pandas as pd
7
+ from icecream import ic
8
+
9
+ ic.configureOutput(includeContext=True)
10
+
11
+
12
+ def fix_in_ia(fix_x, fix_y, ia_x_min, ia_x_max, ia_y_min, ia_y_max):
13
+ in_x = ia_x_min <= fix_x <= ia_x_max
14
+ in_y = ia_y_min <= fix_y <= ia_y_max
15
+ if in_x and in_y:
16
+ return True
17
+ else:
18
+ return False
19
+
20
+
21
+ def fix_in_ia_default(fixation, ia_row, prefix):
22
+ return fix_in_ia(
23
+ fixation.x,
24
+ fixation.y,
25
+ ia_row[f"{prefix}_xmin"],
26
+ ia_row[f"{prefix}_xmax"],
27
+ ia_row[f"{prefix}_ymin"],
28
+ ia_row[f"{prefix}_ymax"],
29
+ )
30
+
31
+
32
+ def number_of_fixations_own(trial, dffix, prefix, correction_algo):
33
+ """
34
+ Return the number of fixations on that interest area.
35
+ """
36
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
37
+ counts = []
38
+ for cidx, ia_row in ia_df.iterrows():
39
+ count = 0
40
+ for idx, fixation in dffix.iterrows():
41
+ if fix_in_ia(
42
+ fixation.x,
43
+ fixation.y,
44
+ ia_row[f"{prefix}_xmin"],
45
+ ia_row[f"{prefix}_xmax"],
46
+ ia_row[f"{prefix}_ymin"],
47
+ ia_row[f"{prefix}_ymax"],
48
+ ):
49
+ count += 1
50
+ counts.append(
51
+ {
52
+ f"{prefix}_number": cidx,
53
+ prefix: ia_row[f"{prefix}"],
54
+ f"number_of_fixations_{correction_algo}": count,
55
+ }
56
+ )
57
+ return pd.DataFrame(counts)
58
+
59
+
60
+ def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
61
+ """
62
+ The duration of the initial fixation on that interest area for each word.
63
+ """
64
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
65
+ durations = []
66
+
67
+ for cidx, ia_row in ia_df.iterrows():
68
+ initial_duration = 0
69
+ for idx, fixation in dffix.iterrows():
70
+ if fix_in_ia_default(fixation, ia_row, prefix):
71
+ initial_duration = fixation.duration
72
+ break # Exit the loop after finding the initial fixation for the word
73
+ durations.append(
74
+ {
75
+ f"{prefix}_number": cidx,
76
+ prefix: ia_row[f"{prefix}"],
77
+ f"initial_fixation_duration_{correction_algo}": initial_duration,
78
+ }
79
+ )
80
+
81
+ return pd.DataFrame(durations)
82
+
83
+
84
+ def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
85
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
86
+ durations = []
87
+ for cidx, ia_row in ia_df.iterrows():
88
+ fixation_durations = []
89
+ for idx, fixation in dffix.iterrows():
90
+ if fix_in_ia_default(fixation, ia_row, prefix):
91
+ fixation_durations.append(fixation.duration)
92
+ if len(fixation_durations) > 1:
93
+ durations.append(
94
+ {
95
+ f"{prefix}_number": cidx,
96
+ prefix: ia_row[f"{prefix}"],
97
+ f"first_of_many_duration_{correction_algo}": fixation_durations[0],
98
+ }
99
+ )
100
+ else:
101
+ durations.append(
102
+ {
103
+ f"{prefix}_number": cidx,
104
+ prefix: ia_row[f"{prefix}"],
105
+ f"first_of_many_duration_{correction_algo}": None,
106
+ }
107
+ )
108
+ if len(durations) > 0:
109
+ return pd.DataFrame(durations)
110
+ else:
111
+ return pd.DataFrame()
112
+
113
+
114
+ def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
115
+ """
116
+ Return the sum duration of all fixations on that interest area.
117
+ """
118
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
119
+ durations = []
120
+ for cidx, ia_row in ia_df.iterrows():
121
+ total_duration = 0
122
+ for idx, fixation in dffix.iterrows():
123
+ if fix_in_ia_default(fixation, ia_row, prefix):
124
+ total_duration += fixation.duration
125
+ durations.append(
126
+ {
127
+ f"{prefix}_number": cidx,
128
+ prefix: ia_row[f"{prefix}"],
129
+ f"total_fixation_duration_{correction_algo}": total_duration,
130
+ }
131
+ )
132
+ return pd.DataFrame(durations)
133
+
134
+
135
+ def gaze_duration_own(trial, dffix, prefix, correction_algo):
136
+ """
137
+ Gaze duration is the sum duration of all fixations
138
+ inside an interest area until the area is exited for the first time.
139
+ """
140
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
141
+ durations = []
142
+ for cidx, ia_row in ia_df.iterrows():
143
+ duration = 0
144
+ in_ia = False
145
+ for idx, fixation in dffix.iterrows():
146
+ if fix_in_ia_default(fixation, ia_row, prefix):
147
+ duration += fixation.duration
148
+ in_ia = True
149
+ elif in_ia:
150
+ break
151
+ durations.append(
152
+ {
153
+ f"{prefix}_number": cidx,
154
+ prefix: ia_row[f"{prefix}"],
155
+ f"gaze_duration_{correction_algo}": duration,
156
+ }
157
+ )
158
+ return pd.DataFrame(durations)
159
+
160
+
161
+ def go_past_duration_own(trial, dffix, prefix, correction_algo):
162
+ """
163
+ Given an interest area and fixation sequence, return the go-past time on
164
+ that interest area. Go-past time is the sum duration of all fixations from
165
+ when the interest area is first entered until when it is first exited to
166
+ the right, including any regressions to the left that occur during that
167
+ time period (and vice versa in the case of right-to-left text).
168
+ """
169
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
170
+ results = []
171
+
172
+ for cidx, ia_row in ia_df.iterrows():
173
+ entered = False
174
+ go_past_time = 0
175
+
176
+ for idx, fixation in dffix.iterrows():
177
+ if fix_in_ia_default(fixation, ia_row, prefix):
178
+ if not entered:
179
+ entered = True
180
+ go_past_time += fixation.duration
181
+ elif entered:
182
+ if ia_row[f"{prefix}_xmax"] < fixation.x: # Interest area has been exited to the right
183
+ break
184
+ go_past_time += fixation.duration
185
+
186
+ results.append(
187
+ {f"{prefix}_number": cidx, prefix: ia_row[f"{prefix}"], f"go_past_duration_{correction_algo}": go_past_time}
188
+ )
189
+
190
+ return pd.DataFrame(results)
191
+
192
+
193
+ def second_pass_duration_own(trial, dffix, prefix, correction_algo):
194
+ """
195
+ Given an interest area and fixation sequence, return the second pass
196
+ duration on that interest area for each word.
197
+ """
198
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
199
+ durations = []
200
+
201
+ for cidx, ia_row in ia_df.iterrows():
202
+ current_pass = None
203
+ next_pass = 1
204
+ pass_duration = 0
205
+ for idx, fixation in dffix.iterrows():
206
+ if fix_in_ia_default(fixation, ia_row, prefix):
207
+ if current_pass is None: # first fixation in a new pass
208
+ current_pass = next_pass
209
+ if current_pass == 2:
210
+ pass_duration += fixation.duration
211
+ elif current_pass == 1: # first fixation to exit the first pass
212
+ current_pass = None
213
+ next_pass += 1
214
+ elif current_pass == 2: # first fixation to exit the second pass
215
+ break
216
+ durations.append(
217
+ {
218
+ f"{prefix}_number": cidx,
219
+ prefix: ia_row[f"{prefix}"],
220
+ f"second_pass_duration_{correction_algo}": pass_duration,
221
+ }
222
+ )
223
+
224
+ return pd.DataFrame(durations)
225
+
226
+
227
+ def initial_landing_position_own(trial, dffix, prefix, correction_algo):
228
+ """
229
+ Return the initial landing position (expressed in character positions) on that interest area.
230
+ Counting is from 1. Returns `None` if no fixation
231
+ landed on the interest area.
232
+ """
233
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
234
+ if prefix == "word":
235
+ chars_df = pd.DataFrame(trial[f"chars_list"])
236
+ else:
237
+ chars_df = None
238
+ results = []
239
+ for cidx, ia_row in ia_df.iterrows():
240
+ landing_position = None
241
+ for idx, fixation in dffix.iterrows():
242
+ if fix_in_ia_default(fixation, ia_row, prefix):
243
+ if prefix == "char":
244
+ landing_position = 1
245
+ else:
246
+ prefix_temp = "char"
247
+ matched_chars_df = chars_df.loc[
248
+ (chars_df.char_xmin >= ia_row[f"{prefix}_xmin"])
249
+ & (chars_df.char_xmax <= ia_row[f"{prefix}_xmax"])
250
+ & (chars_df.char_ymin >= ia_row[f"{prefix}_ymin"])
251
+ & (chars_df.char_ymax <= ia_row[f"{prefix}_ymax"]),
252
+ :,
253
+ ] # TODO need to find way to count correct letter number
254
+ for char_idx, (rowidx, char_row) in enumerate(matched_chars_df.iterrows()):
255
+ if fix_in_ia_default(fixation, char_row, prefix_temp):
256
+ landing_position = char_idx + 1 # starts at 1
257
+ break
258
+ break
259
+ results.append(
260
+ {
261
+ f"{prefix}_number": cidx,
262
+ prefix: ia_row[f"{prefix}"],
263
+ f"initial_landing_position_{correction_algo}": landing_position,
264
+ }
265
+ )
266
+ return pd.DataFrame(results)
267
+
268
+
269
+ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
270
+ """
271
+ Given an interest area and fixation sequence, return the initial landing
272
+ distance on that interest area. The initial landing distance is the pixel
273
+ distance between the first fixation to land in an interest area and the
274
+ left edge of that interest area (or, in the case of right-to-left text,
275
+ the right edge). Technically, the distance is measured from the text onset
276
+ without including any padding. Returns `None` if no fixation landed on the
277
+ interest area.
278
+ """
279
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
280
+ distances = []
281
+ for cidx, ia_row in ia_df.iterrows():
282
+ initial_distance = None
283
+ for idx, fixation in dffix.iterrows():
284
+ if fix_in_ia_default(fixation, ia_row, prefix):
285
+ distance = abs(ia_row[f"{prefix}_xmin"] - fixation.x)
286
+ if initial_distance is None:
287
+ initial_distance = distance
288
+ break
289
+ distances.append(
290
+ {
291
+ f"{prefix}_number": cidx,
292
+ prefix: ia_row[f"{prefix}"],
293
+ f"initial_landing_distance_{correction_algo}": initial_distance,
294
+ }
295
+ )
296
+ return pd.DataFrame(distances)
297
+
298
+
299
+ def landing_distances_own(trial, dffix, prefix, correction_algo):
300
+ """
301
+ Given an interest area and fixation sequence, return a dataframe with
302
+ the landing distances of all fixations that landed in each interest area.
303
+ """
304
+ ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
305
+ distances = []
306
+ for cidx, ia_row in ia_df.iterrows():
307
+ landing_distances = []
308
+ for idx, fixation in dffix.iterrows():
309
+ if fix_in_ia_default(fixation, ia_row, prefix):
310
+ landing_distance = abs(ia_row[f"{prefix}_xmin"] - fixation.x)
311
+ landing_distances.append(round(landing_distance, ndigits=2))
312
+ distances.append(
313
+ {
314
+ f"{prefix}_number": cidx,
315
+ prefix: ia_row[f"{prefix}"],
316
+ f"landing_distances_{correction_algo}": landing_distances,
317
+ }
318
+ )
319
+ return pd.DataFrame(distances)
320
+
321
+
322
+ def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
323
+ word_reg_in_count = (
324
+ dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])[
325
+ f"{prefix}_reg_in_{correction_algo}"
326
+ ]
327
+ .sum()
328
+ .reset_index()
329
+ .rename(
330
+ columns={
331
+ f"on_{prefix}_number_{correction_algo}": f"{prefix}_number",
332
+ f"{prefix}_reg_in_{correction_algo}": f"number_of_regressions_in_{correction_algo}",
333
+ f"on_{prefix}_{correction_algo}": prefix,
334
+ }
335
+ )
336
+ )
337
+
338
+ return word_reg_in_count
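
For orientation, here is a minimal, hypothetical usage sketch of the word-level measures defined above. The trial dictionary keys and fixation columns mirror what the functions read; the values and the `"slice"` algorithm label are made up for illustration, and the import assumes the module is available as `analysis_funcs`:

```python
# Hypothetical sketch only: assumes analysis_funcs.py is importable and that a
# trial provides a "words_list" of word bounding boxes while fixations carry
# x, y and duration columns, as the functions above expect.
import pandas as pd
from analysis_funcs import gaze_duration_own, number_of_fixations_own

trial = {
    "words_list": [
        {"word": "The", "word_xmin": 100, "word_xmax": 140, "word_ymin": 90, "word_ymax": 110},
        {"word": "cat", "word_xmin": 150, "word_xmax": 190, "word_ymin": 90, "word_ymax": 110},
    ]
}
dffix = pd.DataFrame({"x": [110, 160, 120], "y": [100, 100, 100], "duration": [180, 220, 150]})

gaze = gaze_duration_own(trial, dffix, prefix="word", correction_algo="slice")
counts = number_of_fixations_own(trial, dffix, prefix="word", correction_algo="slice")
print(gaze.merge(counts, on=["word_number", "word"]))
```

Each helper returns one row per interest area, keyed by `word_number`, so the per-measure frames can be merged column-wise as shown.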
app.py ADDED
The diff for this file is too large to render. See raw diff
 
chars_df_columns.md ADDED
@@ -0,0 +1,24 @@
1
+ #### Column names for Character Dataframe
2
+ - subject: Subject name or ID (derived from filename)
3
+ - trial_id: Trial ID
4
+ - condition: Condition (if applicable)
5
+ - item: Item ID
6
+ - index:
7
+ - letternum: Number of the character
8
+ - char: The character
9
+ - char_xmin: x start position (in pixel)
10
+ - char_ymin: y start position (in pixel)
11
+ - char_xmax: x end position (in pixel)
12
+ - char_ymax: y end position (in pixel)
13
+ - char_y_center: y center position (in pixel)
14
+ - char_x_center: x center position (in pixel)
15
+ - assigned_line: Line of text the character belongs to
16
+ - in_word_number: Number of word the character belongs to
17
+ - in_word: Word the character belongs to
18
+ - num_letters_from_start_of_word: Number of characters since the start of the word
19
+ - in_sentence_number: Number of sentence the character belongs to
20
+ - in_sentence: Sentence the character belongs to
21
+ - letline: Character position from start of line
22
+ - wordline: Word position from start of line for the word the character belongs to
23
+ - wordsent: Word position from start of the sentence for the word the character belongs to
24
+ - letword: Character position from the start of the word (counting from the space before the word)
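
To make the schema concrete, here is a small, fabricated example with a subset of the columns listed above (hypothetical values only):

```python
# Two characters of the word "The"; values are invented to illustrate the schema.
import pandas as pd

chars_df = pd.DataFrame(
    [
        {"subject": "s01", "trial_id": 1, "letternum": 0, "char": "T",
         "char_xmin": 100, "char_xmax": 110, "char_ymin": 90, "char_ymax": 110,
         "assigned_line": 0, "in_word_number": 0, "in_word": "The"},
        {"subject": "s01", "trial_id": 1, "letternum": 1, "char": "h",
         "char_xmin": 110, "char_xmax": 120, "char_ymin": 90, "char_ymax": 110,
         "assigned_line": 0, "in_word_number": 0, "in_word": "The"},
    ]
)
print(chars_df[["char", "in_word", "assigned_line"]])
```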
classic_correction_algos.py ADDED
@@ -0,0 +1,552 @@
1
+ """
2
+ Mostly adapted from https://github.com/jwcarr/eyekit/blob/350d055eecaa1581b03db5a847424825ffbb10f6/eyekit/_snap.py
3
+ """
4
+
5
+ import os
6
+ import numpy as np
7
+ from sklearn.cluster import KMeans
8
+ from icecream import ic
9
+
10
+ ic.configureOutput(includeContext=True)
11
+
12
+ os.environ["OMP_NUM_THREADS"] = "1" # Prevents KMeans memory leak on windows
13
+
14
+
15
+ def apply_classic_algo(
16
+ dffix,
17
+ trial,
18
+ algo="slice",
19
+ algo_params=dict(x_thresh=192, y_thresh=32, w_thresh=32, n_thresh=90),
20
+ ):
21
+ fixation_array = dffix.loc[:, ["x", "y"]].values
22
+ y_diff = trial["y_diff"]
23
+ if "y_char_unique" in trial:
24
+ midlines = trial["y_char_unique"]
25
+ else:
26
+ midlines = trial["y_midline"]
27
+ if len(midlines) == 1:
28
+ corrected_fix_y_vals = np.ones((fixation_array.shape[0])) * midlines[0]
29
+ elif fixation_array.shape[0] <= 2:
30
+ corrected_fix_y_vals = np.ones((fixation_array.shape[0])) * midlines[0]
31
+
32
+ else:
33
+ if algo == "slice":
34
+ corrected_fix_y_vals = slice(fixation_array, midlines, line_height=y_diff, **algo_params)
35
+ elif algo == "warp":
36
+ word_center_list = [(word["word_x_center"], word["word_y_center"]) for word in trial["words_list"]]
37
+ corrected_fix_y_vals = warp(fixation_array, word_center_list)
38
+ elif algo == "chain":
39
+ corrected_fix_y_vals = chain(fixation_array, midlines, **algo_params)
40
+ elif algo == "cluster":
41
+ corrected_fix_y_vals = cluster(fixation_array, midlines)
42
+ elif algo == "merge":
43
+ corrected_fix_y_vals = merge(fixation_array, midlines, **algo_params)
44
+ elif algo == "regress":
45
+ corrected_fix_y_vals = regress(fixation_array, midlines, **algo_params)
46
+ elif algo == "segment":
47
+ corrected_fix_y_vals = segment(fixation_array, midlines, **algo_params)
48
+ elif algo == "split":
49
+ corrected_fix_y_vals = split(fixation_array, midlines, **algo_params)
50
+ elif algo == "stretch":
51
+ corrected_fix_y_vals = stretch(fixation_array, midlines, **algo_params)
52
+ elif algo == "attach":
53
+ corrected_fix_y_vals = attach(fixation_array, midlines)
54
+ elif algo == "compare":
55
+ word_center_list = [(word["word_x_center"], word["word_y_center"]) for word in trial["words_list"]]
56
+ n_nearest_lines = min(algo_params["n_nearest_lines"], len(midlines) - 1)
57
+ algo_params["n_nearest_lines"] = n_nearest_lines
58
+ corrected_fix_y_vals = compare(fixation_array, np.array(word_center_list), **algo_params)
59
+ else:
60
+ raise NotImplementedError(f"{algo} not implemented")
61
+ corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=2)
62
+ corrected_line_nums = [trial["y_char_unique"].index(y) for y in corrected_fix_y_vals]
63
+ dffix[f"y_{algo}"] = corrected_fix_y_vals
64
+ dffix[f"line_num_{algo}"] = corrected_line_nums
65
+ dffix = dffix.copy()
66
+ return dffix
67
+
68
+
69
+ def slice(fixation_XY, midlines, line_height: float, x_thresh=192, y_thresh=32, w_thresh=32, n_thresh=90):
70
+ """
71
+ Form a set of runs and then reduce the set to *m* by repeatedly merging
72
+ those that appear to be on the same line. Merged sequences are then
73
+ assigned to text lines in positional order. Default params:
74
+ `x_thresh=192`, `y_thresh=32`, `w_thresh=32`, `n_thresh=90`. Requires
75
+ NumPy. Original method by [Glandorf & Schroeder (2021)](https://doi.org/10.1016/j.procs.2021.09.069).
76
+ """
77
+ fixation_XY = np.array(fixation_XY, dtype=float)
78
+ line_Y = np.array(midlines, dtype=float)
79
+ proto_lines, phantom_proto_lines = {}, {}
80
+ # 1. Segment runs
81
+ dist_X = abs(np.diff(fixation_XY[:, 0]))
82
+ dist_Y = abs(np.diff(fixation_XY[:, 1]))
83
+ end_run_indices = list(np.where(np.logical_or(dist_X > x_thresh, dist_Y > y_thresh))[0] + 1)
84
+ run_starts = [0] + end_run_indices
85
+ run_ends = end_run_indices + [len(fixation_XY)]
86
+ runs = [list(range(start, end)) for start, end in zip(run_starts, run_ends)]
87
+ # 2. Determine starting run
88
+ longest_run_i = np.argmax([fixation_XY[run[-1], 0] - fixation_XY[run[0], 0] for run in runs])
89
+ proto_lines[0] = runs.pop(longest_run_i)
90
+ # 3. Group runs into proto lines
91
+ while runs:
92
+ merger_on_this_iteration = False
93
+ for proto_line_i, direction in [(min(proto_lines), -1), (max(proto_lines), 1)]:
94
+ # Create new proto line above or below (depending on direction)
95
+ proto_lines[proto_line_i + direction] = []
96
+ # Get current proto line XY coordinates (if proto line is empty, get phantom coordinates)
97
+ if proto_lines[proto_line_i]:
98
+ proto_line_XY = fixation_XY[proto_lines[proto_line_i]]
99
+ else:
100
+ proto_line_XY = phantom_proto_lines[proto_line_i]
101
+ # Compute differences between current proto line and all runs
102
+ run_differences = np.zeros(len(runs))
103
+ for run_i, run in enumerate(runs):
104
+ y_diffs = [y - proto_line_XY[np.argmin(abs(proto_line_XY[:, 0] - x)), 1] for x, y in fixation_XY[run]]
105
+ run_differences[run_i] = np.mean(y_diffs)
106
+ # Find runs that can be merged into this proto line
107
+ merge_into_current = list(np.where(abs(run_differences) < w_thresh)[0])
108
+ # Find runs that can be merged into the adjacent proto line
109
+ merge_into_adjacent = list(
110
+ np.where(
111
+ np.logical_and(
112
+ run_differences * direction >= w_thresh,
113
+ run_differences * direction < n_thresh,
114
+ )
115
+ )[0]
116
+ )
117
+ # Perform mergers
118
+ for index in merge_into_current:
119
+ proto_lines[proto_line_i].extend(runs[index])
120
+ for index in merge_into_adjacent:
121
+ proto_lines[proto_line_i + direction].extend(runs[index])
122
+ # If no mergers into the adjacent were made, create a phantom line for the adjacent
123
+ if not merge_into_adjacent:
124
+ average_x, average_y = np.mean(proto_line_XY, axis=0)
125
+ adjacent_y = average_y + line_height * direction
126
+ phantom_proto_lines[proto_line_i + direction] = np.array([[average_x, adjacent_y]])
127
+ # Remove all runs that were merged on this iteration
128
+ for index in sorted(merge_into_current + merge_into_adjacent, reverse=True):
129
+ del runs[index]
130
+ merger_on_this_iteration = True
131
+ # If no mergers were made, break the while loop
132
+ if not merger_on_this_iteration:
133
+ break
134
+ # 4. Assign any leftover runs to the closest proto lines
135
+ for run in runs:
136
+ best_pl_distance = np.inf
137
+ best_pl_assignment = None
138
+ for proto_line_i in proto_lines:
139
+ if proto_lines[proto_line_i]:
140
+ proto_line_XY = fixation_XY[proto_lines[proto_line_i]]
141
+ else:
142
+ proto_line_XY = phantom_proto_lines[proto_line_i]
143
+ y_diffs = [y - proto_line_XY[np.argmin(abs(proto_line_XY[:, 0] - x)), 1] for x, y in fixation_XY[run]]
144
+ pl_distance = abs(np.mean(y_diffs))
145
+ if pl_distance < best_pl_distance:
146
+ best_pl_distance = pl_distance
147
+ best_pl_assignment = proto_line_i
148
+ proto_lines[best_pl_assignment].extend(run)
149
+ # 5. Prune proto lines
150
+ while len(proto_lines) > len(line_Y):
151
+ top, bot = min(proto_lines), max(proto_lines)
152
+ if len(proto_lines[top]) < len(proto_lines[bot]):
153
+ proto_lines[top + 1].extend(proto_lines[top])
154
+ del proto_lines[top]
155
+ else:
156
+ proto_lines[bot - 1].extend(proto_lines[bot])
157
+ del proto_lines[bot]
158
+ # 6. Map proto lines to text lines
159
+ for line_i, proto_line_i in enumerate(sorted(proto_lines)):
160
+ fixation_XY[proto_lines[proto_line_i], 1] = line_Y[line_i]
161
+ return fixation_XY[:, 1]
162
+
163
+
164
+ def attach(fixation_XY, line_Y):
165
+ n = len(fixation_XY)
166
+ for fixation_i in range(n):
167
+ line_i = np.argmin(abs(line_Y - fixation_XY[fixation_i, 1]))
168
+ fixation_XY[fixation_i, 1] = line_Y[line_i]
169
+ return fixation_XY[:, 1]
170
+
171
+
172
+ def chain(fixation_XY, midlines, x_thresh=192, y_thresh=32):
173
+ """
174
+ Chain consecutive fixations that are sufficiently close to each other, and
175
+ then assign chains to their closest text lines. Default params:
176
+ `x_thresh=192`, `y_thresh=32`. Requires NumPy. Original method
177
+ implemented in [popEye](https://github.com/sascha2schroeder/popEye/).
178
+ """
179
+ try:
180
+ import numpy as np
181
+ except ModuleNotFoundError as e:
182
+ e.msg = "The chain method requires NumPy."
183
+ raise
184
+ fixation_XY = np.array(fixation_XY)
185
+ line_Y = np.array(midlines)
186
+ dist_X = abs(np.diff(fixation_XY[:, 0]))
187
+ dist_Y = abs(np.diff(fixation_XY[:, 1]))
188
+ end_chain_indices = list(np.where(np.logical_or(dist_X > x_thresh, dist_Y > y_thresh))[0] + 1)
189
+ end_chain_indices.append(len(fixation_XY))
190
+ start_of_chain = 0
191
+ for end_of_chain in end_chain_indices:
192
+ mean_y = np.mean(fixation_XY[start_of_chain:end_of_chain, 1])
193
+ line_i = np.argmin(abs(line_Y - mean_y))
194
+ fixation_XY[start_of_chain:end_of_chain, 1] = line_Y[line_i]
195
+ start_of_chain = end_of_chain
196
+ return fixation_XY[:, 1]
197
+
198
+
199
+ def cluster(fixation_XY, line_Y):
200
+ m = len(line_Y)
201
+ fixation_Y = fixation_XY[:, 1].reshape(-1, 1)
202
+ if fixation_Y.shape[0] < m:
203
+ ic(f"CLUSTER failed because of low number of fixations: {fixation_XY.shape}")
204
+ ic("Assigned all fixation to first line")
205
+ return np.ones_like(fixation_XY[:, 1]) * line_Y[0]
206
+ clusters = KMeans(m, n_init=100, max_iter=300).fit_predict(fixation_Y)
207
+ centers = [fixation_Y[clusters == i].mean() for i in range(m)]
208
+ ordered_cluster_indices = np.argsort(centers)
209
+ for fixation_i, cluster_i in enumerate(clusters):
210
+ line_i = np.where(ordered_cluster_indices == cluster_i)[0][0]
211
+ fixation_XY[fixation_i, 1] = line_Y[line_i]
212
+ return fixation_XY[:, 1]
213
+
214
+
215
+ def compare(fixation_XY, word_XY, x_thresh=512, n_nearest_lines=3):
216
+ # COMPARE
217
+ #
218
+ # Lima Sanches, C., Kise, K., & Augereau, O. (2015). Eye gaze and text
219
+ # line matching for reading analysis. In Adjunct proceedings of the
220
+ # 2015 ACM International Joint Conference on Pervasive and
221
+ # Ubiquitous Computing and proceedings of the 2015 ACM International
222
+ # Symposium on Wearable Computers (pp. 1227–1233). Association for
223
+ # Computing Machinery.
224
+ #
225
+ # https://doi.org/10.1145/2800835.2807936
226
+ line_Y = np.unique(word_XY[:, 1])
227
+ n = len(fixation_XY)
228
+ diff_X = np.diff(fixation_XY[:, 0])
229
+ end_line_indices = list(np.where(diff_X < -x_thresh)[0] + 1)
230
+ end_line_indices.append(n)
231
+ start_of_line = 0
232
+ for end_of_line in end_line_indices:
233
+ gaze_line = fixation_XY[start_of_line:end_of_line]
234
+ mean_y = np.mean(gaze_line[:, 1])
235
+ lines_ordered_by_proximity = np.argsort(abs(line_Y - mean_y))
236
+ nearest_line_I = lines_ordered_by_proximity[:n_nearest_lines]
237
+ line_costs = np.zeros(n_nearest_lines)
238
+ for candidate_i in range(n_nearest_lines):
239
+ candidate_line_i = nearest_line_I[candidate_i]
240
+ text_line = word_XY[word_XY[:, 1] == line_Y[candidate_line_i]]
241
+ dtw_cost, dtw_path = dynamic_time_warping(gaze_line[:, 0:1], text_line[:, 0:1])
242
+ line_costs[candidate_i] = dtw_cost
243
+ line_i = nearest_line_I[np.argmin(line_costs)]
244
+ fixation_XY[start_of_line:end_of_line, 1] = line_Y[line_i]
245
+ start_of_line = end_of_line
246
+ return fixation_XY[:, 1]
247
+
248
+
249
+ def merge(fixation_XY, midlines, text_right_to_left=False, y_thresh=32, gradient_thresh=0.1, error_thresh=20):
250
+ """
251
+ Form a set of progressive sequences and then reduce the set to *m* by
252
+ repeatedly merging those that appear to be on the same line. Merged
253
+ sequences are then assigned to text lines in positional order. Default
254
+ params: `y_thresh=32`, `gradient_thresh=0.1`, `error_thresh=20`. Requires
255
+ NumPy. Original method by [Špakov et al. (2019)](https://doi.org/10.3758/s13428-018-1120-x).
256
+ """
257
+ try:
258
+ import numpy as np
259
+ except ModuleNotFoundError as e:
260
+ e.msg = "The merge method requires NumPy."
261
+ raise
262
+ fixation_XY = np.array(fixation_XY)
263
+ line_Y = np.array(midlines)
264
+ diff_X = np.diff(fixation_XY[:, 0])
265
+ dist_Y = abs(np.diff(fixation_XY[:, 1]))
266
+ if text_right_to_left:
267
+ sequence_boundaries = list(np.where(np.logical_or(diff_X > 0, dist_Y > y_thresh))[0] + 1)
268
+ else:
269
+ sequence_boundaries = list(np.where(np.logical_or(diff_X < 0, dist_Y > y_thresh))[0] + 1)
270
+ sequence_starts = [0] + sequence_boundaries
271
+ sequence_ends = sequence_boundaries + [len(fixation_XY)]
272
+ sequences = [list(range(start, end)) for start, end in zip(sequence_starts, sequence_ends)]
273
+ for min_i, min_j, remove_constraints in [
274
+ (3, 3, False), # Phase 1
275
+ (1, 3, False), # Phase 2
276
+ (1, 1, False), # Phase 3
277
+ (1, 1, True), # Phase 4
278
+ ]:
279
+ while len(sequences) > len(line_Y):
280
+ best_merger = None
281
+ best_error = np.inf
282
+ for i in range(len(sequences) - 1):
283
+ if len(sequences[i]) < min_i:
284
+ continue # first sequence too short, skip to next i
285
+ for j in range(i + 1, len(sequences)):
286
+ if len(sequences[j]) < min_j:
287
+ continue # second sequence too short, skip to next j
288
+ candidate_XY = fixation_XY[sequences[i] + sequences[j]]
289
+ gradient, intercept = np.polyfit(candidate_XY[:, 0], candidate_XY[:, 1], 1)
290
+ residuals = candidate_XY[:, 1] - (gradient * candidate_XY[:, 0] + intercept)
291
+ error = np.sqrt(sum(residuals**2) / len(candidate_XY))
292
+ if remove_constraints or (abs(gradient) < gradient_thresh and error < error_thresh):
293
+ if error < best_error:
294
+ best_merger = (i, j)
295
+ best_error = error
296
+ if best_merger is None:
297
+ break # no possible mergers, break while and move to next phase
298
+ merge_i, merge_j = best_merger
299
+ merged_sequence = sequences[merge_i] + sequences[merge_j]
300
+ sequences.append(merged_sequence)
301
+ del sequences[merge_j], sequences[merge_i]
302
+ mean_Y = [fixation_XY[sequence, 1].mean() for sequence in sequences]
303
+ ordered_sequence_indices = np.argsort(mean_Y)
304
+ for line_i, sequence_i in enumerate(ordered_sequence_indices):
305
+ fixation_XY[sequences[sequence_i], 1] = line_Y[line_i]
306
+ return fixation_XY[:, 1]
307
+
308
+
309
+ def regress(
310
+ fixation_XY,
311
+ midlines,
312
+ slope_bounds=(-0.1, 0.1),
313
+ offset_bounds=(-50, 50),
314
+ std_bounds=(1, 20),
315
+ ):
316
+ """
317
+ Find *m* regression lines that best fit the fixations and group fixations
318
+ according to best fit regression lines, and then assign groups to text
319
+ lines in positional order. Default params: `slope_bounds=(-0.1, 0.1)`,
320
+ `offset_bounds=(-50, 50)`, `std_bounds=(1, 20)`. Requires SciPy.
321
+ Original method by [Cohen (2013)](https://doi.org/10.3758/s13428-012-0280-3).
322
+ """
323
+ try:
324
+ import numpy as np
325
+ from scipy.optimize import minimize
326
+ from scipy.stats import norm
327
+ except ModuleNotFoundError as e:
328
+ e.msg = "The regress method requires SciPy."
329
+ raise
330
+ fixation_XY = np.array(fixation_XY)
331
+ line_Y = np.array(midlines)
332
+ density = np.zeros((len(fixation_XY), len(line_Y)))
333
+
334
+ def fit_lines(params):
335
+ k = slope_bounds[0] + (slope_bounds[1] - slope_bounds[0]) * norm.cdf(params[0])
336
+ o = offset_bounds[0] + (offset_bounds[1] - offset_bounds[0]) * norm.cdf(params[1])
337
+ s = std_bounds[0] + (std_bounds[1] - std_bounds[0]) * norm.cdf(params[2])
338
+ predicted_Y_from_slope = fixation_XY[:, 0] * k
339
+ line_Y_plus_offset = line_Y + o
340
+ for line_i in range(len(line_Y)):
341
+ fit_Y = predicted_Y_from_slope + line_Y_plus_offset[line_i]
342
+ density[:, line_i] = norm.logpdf(fixation_XY[:, 1], fit_Y, s)
343
+ return -sum(density.max(axis=1))
344
+
345
+ best_fit = minimize(fit_lines, [0, 0, 0], method="powell")
346
+ fit_lines(best_fit.x)
347
+ return line_Y[density.argmax(axis=1)]
348
+
349
+
350
+ def segment(fixation_XY, midlines, text_right_to_left=False):
351
+ """
352
+ Segment fixation sequence into *m* subsequences based on *m*–1 most-likely
353
+ return sweeps, and then assign subsequences to text lines in chronological
354
+ order. Requires NumPy. Original method by
355
+ [Abdulin & Komogortsev (2015)](https://doi.org/10.1109/BTAS.2015.7358786).
356
+ """
357
+ try:
358
+ import numpy as np
359
+ except ModuleNotFoundError as e:
360
+ e.msg = "The segment method requires NumPy."
361
+ raise
362
+ fixation_XY = np.array(fixation_XY)
363
+ line_Y = np.array(midlines)
364
+ diff_X = np.diff(fixation_XY[:, 0])
365
+ saccades_ordered_by_length = np.argsort(diff_X)
366
+ if text_right_to_left:
367
+ line_change_indices = saccades_ordered_by_length[-(len(line_Y) - 1) :]
368
+ else:
369
+ line_change_indices = saccades_ordered_by_length[: len(line_Y) - 1]
370
+ current_line_i = 0
371
+ for fixation_i in range(len(fixation_XY)):
372
+ fixation_XY[fixation_i, 1] = line_Y[current_line_i]
373
+ if fixation_i in line_change_indices:
374
+ current_line_i += 1
375
+ return fixation_XY[:, 1]
376
+
377
+
378
+ def split(fixation_XY, midlines, text_right_to_left=False):
379
+ """
380
+ Split fixation sequence into subsequences based on best candidate return
381
+ sweeps, and then assign subsequences to closest text lines. Requires
382
+ SciPy. Original method by [Carr et al. (2022)](https://doi.org/10.3758/s13428-021-01554-0).
383
+ """
384
+ try:
385
+ import numpy as np
386
+ from scipy.cluster.vq import kmeans2
387
+ except ModuleNotFoundError as e:
388
+ e.msg = "The split method requires SciPy."
389
+ raise
390
+ fixation_XY = np.array(fixation_XY)
391
+ line_Y = np.array(midlines)
392
+ diff_X = np.array(np.diff(fixation_XY[:, 0]), dtype=float).reshape(-1, 1)
393
+ centers, clusters = kmeans2(diff_X, 2, iter=100, minit="++", missing="raise")
394
+ if text_right_to_left:
395
+ sweep_marker = np.argmax(centers)
396
+ else:
397
+ sweep_marker = np.argmin(centers)
398
+ end_line_indices = list(np.where(clusters == sweep_marker)[0] + 1)
399
+ end_line_indices.append(len(fixation_XY))
400
+ start_of_line = 0
401
+ for end_of_line in end_line_indices:
402
+ mean_y = np.mean(fixation_XY[start_of_line:end_of_line, 1])
403
+ line_i = np.argmin(abs(line_Y - mean_y))
404
+ fixation_XY[start_of_line:end_of_line, 1] = line_Y[line_i]
405
+ start_of_line = end_of_line
406
+ return fixation_XY[:, 1]
407
+
408
+
409
+ def stretch(fixation_XY, midlines, stretch_bounds=(0.9, 1.1), offset_bounds=(-50, 50)):
410
+ """
411
+ Find a stretch factor and offset that results in a good alignment between
412
+ the fixations and lines of text, and then assign the transformed fixations
413
+ to the closest text lines. Default params: `stretch_bounds=(0.9, 1.1)`,
414
+ `offset_bounds=(-50, 50)`. Requires SciPy.
415
+ Original method by [Lohmeier (2015)](http://www.monochromata.de/master_thesis/ma1.3.pdf).
416
+ """
417
+ try:
418
+ import numpy as np
419
+ from scipy.optimize import minimize
420
+ except ModuleNotFoundError as e:
421
+ e.msg = "The stretch method requires SciPy."
422
+ raise
423
+ fixation_Y = np.array(fixation_XY)[:, 1]
424
+ line_Y = np.array(midlines)
425
+ n = len(fixation_Y)
426
+ corrected_Y = np.zeros(n)
427
+
428
+ def fit_lines(params):
429
+ candidate_Y = fixation_Y * params[0] + params[1]
430
+ for fixation_i in range(n):
431
+ line_i = np.argmin(abs(line_Y - candidate_Y[fixation_i]))
432
+ corrected_Y[fixation_i] = line_Y[line_i]
433
+ return sum(abs(candidate_Y - corrected_Y))
434
+
435
+ best_fit = minimize(fit_lines, [1, 0], method="powell", bounds=[stretch_bounds, offset_bounds])
436
+ fit_lines(best_fit.x)
437
+ return corrected_Y
438
+
439
+
440
+ def warp(fixation_XY, word_center_list):
441
+ """
442
+ Map fixations to word centers using [Dynamic Time
443
+ Warping](https://en.wikipedia.org/wiki/Dynamic_time_warping). This finds a
444
+ monotonically increasing mapping between fixations and words with the
445
+ shortest overall distance, effectively resulting in *m* subsequences.
446
+ Fixations are then assigned to the lines that their mapped words belong
447
+ to, effectively assigning subsequences to text lines in chronological
448
+ order. Requires NumPy.
449
+ Original method by [Carr et al. (2022)](https://doi.org/10.3758/s13428-021-01554-0).
450
+ """
451
+ try:
452
+ import numpy as np
453
+ except ModuleNotFoundError as e:
454
+ e.msg = "The warp method requires NumPy."
455
+ raise
456
+ fixation_XY = np.array(fixation_XY)
457
+ word_XY = np.array(word_center_list)
458
+ n1 = len(fixation_XY)
459
+ n2 = len(word_XY)
460
+ cost = np.zeros((n1 + 1, n2 + 1))
461
+ cost[0, :] = np.inf
462
+ cost[:, 0] = np.inf
463
+ cost[0, 0] = 0
464
+ for fixation_i in range(n1):
465
+ for word_i in range(n2):
466
+ distance = np.sqrt(sum((fixation_XY[fixation_i] - word_XY[word_i]) ** 2))
467
+ cost[fixation_i + 1, word_i + 1] = distance + min(
468
+ cost[fixation_i, word_i + 1],
469
+ cost[fixation_i + 1, word_i],
470
+ cost[fixation_i, word_i],
471
+ )
472
+ cost = cost[1:, 1:]
473
+ warping_path = [[] for _ in range(n1)]
474
+ while fixation_i > 0 or word_i > 0:
475
+ warping_path[fixation_i].append(word_i)
476
+ possible_moves = [np.inf, np.inf, np.inf]
477
+ if fixation_i > 0 and word_i > 0:
478
+ possible_moves[0] = cost[fixation_i - 1, word_i - 1]
479
+ if fixation_i > 0:
480
+ possible_moves[1] = cost[fixation_i - 1, word_i]
481
+ if word_i > 0:
482
+ possible_moves[2] = cost[fixation_i, word_i - 1]
483
+ best_move = np.argmin(possible_moves)
484
+ if best_move == 0:
485
+ fixation_i -= 1
486
+ word_i -= 1
487
+ elif best_move == 1:
488
+ fixation_i -= 1
489
+ else:
490
+ word_i -= 1
491
+ warping_path[0].append(0)
492
+ for fixation_i, words_mapped_to_fixation_i in enumerate(warping_path):
493
+ candidate_Y = list(word_XY[words_mapped_to_fixation_i, 1])
494
+ fixation_XY[fixation_i, 1] = max(set(candidate_Y), key=candidate_Y.count)
495
+ return fixation_XY[:, 1]
496
+
497
+
498
+ def dynamic_time_warping(sequence1, sequence2):
499
+ n1 = len(sequence1)
500
+ n2 = len(sequence2)
501
+ dtw_cost = np.zeros((n1 + 1, n2 + 1))
502
+ dtw_cost[0, :] = np.inf
503
+ dtw_cost[:, 0] = np.inf
504
+ dtw_cost[0, 0] = 0
505
+ for i in range(n1):
506
+ for j in range(n2):
507
+ this_cost = np.sqrt(sum((sequence1[i] - sequence2[j]) ** 2))
508
+ dtw_cost[i + 1, j + 1] = this_cost + min(dtw_cost[i, j + 1], dtw_cost[i + 1, j], dtw_cost[i, j])
509
+ dtw_cost = dtw_cost[1:, 1:]
510
+ dtw_path = [[] for _ in range(n1)]
511
+ while i > 0 or j > 0:
512
+ dtw_path[i].append(j)
513
+ possible_moves = [np.inf, np.inf, np.inf]
514
+ if i > 0 and j > 0:
515
+ possible_moves[0] = dtw_cost[i - 1, j - 1]
516
+ if i > 0:
517
+ possible_moves[1] = dtw_cost[i - 1, j]
518
+ if j > 0:
519
+ possible_moves[2] = dtw_cost[i, j - 1]
520
+ best_move = np.argmin(possible_moves)
521
+ if best_move == 0:
522
+ i -= 1
523
+ j -= 1
524
+ elif best_move == 1:
525
+ i -= 1
526
+ else:
527
+ j -= 1
528
+ dtw_path[0].append(0)
529
+ return dtw_cost[-1, -1], dtw_path
530
+
531
+
532
+ def wisdom_of_the_crowd(assignments):
533
+ """
534
+ For each fixation, choose the y-value with the most votes across multiple
535
+ algorithms. In the event of a tie, the left-most algorithm is given
536
+ priority.
537
+ """
538
+ assignments = np.column_stack(assignments)
539
+ correction = []
540
+ for row in assignments:
541
+ candidates = list(row)
542
+ candidate_counts = {y: candidates.count(y) for y in set(candidates)}
543
+ best_count = max(candidate_counts.values())
544
+ best_candidates = [y for y, c in candidate_counts.items() if c == best_count]
545
+ if len(best_candidates) == 1:
546
+ correction.append(best_candidates[0])
547
+ else:
548
+ for y in row:
549
+ if y in best_candidates:
550
+ correction.append(y)
551
+ break
552
+ return correction
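
A rough, hypothetical usage sketch for the dispatcher above (toy coordinates; the trial keys `y_diff` and `y_char_unique` mirror what `apply_classic_algo` reads, and the import assumes the module is available as `classic_correction_algos`):

```python
# Hypothetical sketch only: assumes classic_correction_algos.py is importable.
import pandas as pd
from classic_correction_algos import apply_classic_algo

trial = {
    "y_diff": 100,                # vertical spacing between text lines (px)
    "y_char_unique": [100, 200],  # y midline of each text line (px)
}
# Six fixations drifting around two text lines
dffix = pd.DataFrame(
    {"x": [100, 180, 260, 105, 190, 270], "y": [98, 102, 101, 198, 205, 202]}
)

dffix = apply_classic_algo(dffix, trial, algo="slice")
print(dffix[["y", "y_slice", "line_num_slice"]])
```

Outputs from several algorithms (e.g. `y_slice`, `y_chain`, ...) can then be combined per fixation with `wisdom_of_the_crowd`, which takes the majority vote across the assignments.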
emreading_funcs.py ADDED
@@ -0,0 +1,994 @@
1
+ """Mostly adapted from https://github.com/martin-vasilev/EMreading
2
+ Mostly deprecated in favour of alternative methods."""
3
+
4
+ from icecream import ic
5
+ from io import StringIO
6
+ import re
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
+ def assign_chars_to_words(df):
12
+ df.reset_index(inplace=True, names="index_temp")
13
+ df["wordID"] = ""
14
+ df["char_word"] = -1
15
+ word_list = []
16
+ cols = []
17
+ sent_list = df["sent"].unique()
18
+
19
+ for i in sent_list: # for each sentence (robust if sentence numbers are not 0..n-1)
20
+ word_list = df[df["sent"] == i]["word"].unique()
21
+ for j in range(len(word_list)):
22
+ cols = df[(df["sent"] == i) & (df["word"] == word_list[j])].index
23
+ df.loc[cols, "wordID"] = "".join(df["char"].loc[cols])
24
+ df.loc[(df["sent"] == i) & (df["word"] == word_list[j]), "char_word"] = [k for k in range(len(cols))]
25
+ df.set_index("index_temp", inplace=True)
26
+ return df
27
+
28
+
29
+ def round_and_int(value):
30
+ if not pd.isna(value):
31
+ return int(round(value))
32
+ else:
33
+ return None
34
+
35
+
36
+ def get_coord_map(coords, x=1920, y=1080):
37
+ """
38
+ Original R version:
39
+ ```R
40
+ # Use stimuli information to create a coordinate map_arr for each pixel on the screen
41
+ # This makes it possible to find exactly what participants were fixating
42
+ coord_map_arr<- function(coords, x=resolution_x, y= resolution_y){
43
+
44
+ coords$id<- 1:nrow(coords)
45
+ map_arr<- data.frame(matrix(NA, nrow = y, ncol = x))
46
+
47
+ for(i in 1:nrow(coords)){
48
+ map_arr[coords$y1[i]:coords$y2[i],coords$x1[i]:coords$x2[i]]<- coords$id[i]
49
+
50
+ }
51
+
52
+ return(map_arr)
53
+ }```
54
+ """
55
+ coords.reset_index(drop=True, inplace=True)
56
+ y1 = coords["char_ymin"].map(round_and_int)
57
+ y2 = coords["char_ymax"].map(round_and_int)
58
+ x1 = coords["char_xmin"].map(round_and_int)
59
+ x2 = coords["char_xmax"].map(round_and_int)
60
+ coords["id"] = np.arange(len(coords))
61
+ map_arr = np.full((y, x), np.nan)
62
+
63
+ for i in range(len(coords)):
64
+ map_arr[y1[i] : y2[i] + 1, x1[i] : x2[i] + 1] = coords["id"].iloc[i]
65
+
66
+ np.sum(pd.isna(map_arr), axis=None)
67
+ return map_arr
68
+
69
+
70
+ def get_char_num_for_each_line(df):
71
+ df.reset_index(inplace=True, names="index_temp")
72
+ df["line_char"] = np.nan
73
+ unq_line = df["assigned_line"].unique()
74
+ for i in unq_line:
75
+ assigned_line = df[df["assigned_line"] == i].index
76
+ df.loc[assigned_line, "line_char"] = range(len(assigned_line))
77
+ df.set_index("index_temp", inplace=True)
78
+ return df
79
+
80
+
81
+ def parse_fix(
82
+ file,
83
+ trial_db,
84
+ ):
85
+
86
+ indexrange = list(range(trial_db["trial_start_idx"], trial_db["trial_end_idx"] + 1))
87
+
88
+ sfix_stamps = [i for i in indexrange if re.search(r"(?i)(SFIX)", file[i])]
89
+
90
+ efix_stamps = [i for i in indexrange if re.search(r"(?i)EFIX", file[i])]
91
+
92
+ if len(sfix_stamps) > (len(efix_stamps) + 1):
93
+ ic(f"length mismatch parse_fix of {len(sfix_stamps) - (len(efix_stamps))}")
94
+
95
+ if not sfix_stamps or not efix_stamps:
96
+ raw_fix = None
97
+ return raw_fix
98
+ for safe_num in range(25):
99
+ if efix_stamps[0] < sfix_stamps[0]:
100
+ efix_stamps = efix_stamps[1:]
101
+ elif efix_stamps[-1] <= sfix_stamps[-1]:
102
+ sfix_stamps = sfix_stamps[:-1]
103
+ elif efix_stamps[0] >= sfix_stamps[0]:
104
+ sfix_stamps = sfix_stamps[1:]
105
+ if not (len(efix_stamps) != len(sfix_stamps) and len(efix_stamps) > 1 and len(sfix_stamps) > 1):
106
+ break
107
+
108
+ def parse_sacc(string):
109
+ a = string.split(" ")
110
+ return float(a[2])
111
+
112
+ esacc_flag = [file[f - 1] if "ESACC" in file[f - 1] else None for f in sfix_stamps]
113
+ saccDur = []
114
+ for k in esacc_flag:
115
+ if k is None:
116
+ saccDur.append(None)
117
+ else:
118
+ saccDur.append(parse_sacc(k))
119
+
120
+ s_time = [int(file[s].strip().split(" ")[-1]) for s in sfix_stamps]
121
+
122
+ e_time = [int(file[s - 1].strip().split(" ")[0]) for s in efix_stamps]
123
+ if len(s_time) != len(e_time):
124
+ if s_time[-1] > e_time[-1]:
125
+ s_time = s_time[:-1]
126
+
127
+ fixDur = [e_time[index] - s_time[index] for index in range(len(s_time))]
128
+ fixDur = [e - s for e, s in zip(e_time, s_time)]
129
+ assert ~(np.asarray(fixDur) < 0).any()
130
+ x = [float(file[fidx].split("\t")[3]) for fidx in efix_stamps]
131
+ y = [float(file[fidx].split("\t")[4]) for fidx in efix_stamps]
132
+ blink_stamp = [index for index in indexrange if "EBLINK" in file[index]]
133
+ blink_time = [float(file[index].strip().replace("\t", " ").split(" ")[2]) - 1 for index in blink_stamp]
134
+ index = np.searchsorted(s_time, blink_time, side="right") - 1
135
+ blink = np.zeros((len(s_time)))
136
+ blink[index] = -1
137
+ raw_fix = pd.DataFrame(
138
+ {"s_time": s_time, "e_time": e_time, "fixDur": fixDur, "saccDur": saccDur, "x": x, "y": y, "blink": blink}
139
+ )
140
+ return raw_fix
141
+
142
+
143
+ def process_fix_EM(fix, coords_map, coords, SL):
144
+ resolution_y, resolution_x = coords_map.shape
145
+ loc = None
146
+ raw_fix = pd.DataFrame()
147
+ num_fixations = len(fix)
148
+ SFIX = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
149
+ EFIX = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
150
+ x = np.full(num_fixations, np.nan)
151
+ y = np.full(num_fixations, np.nan)
152
+ fix_num = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
153
+ fix_dur = np.full(num_fixations, None)
154
+ sent = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
155
+ line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
156
+ word = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
157
+ char_trial = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
158
+ char_line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
159
+ word_line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
160
+ max_sent = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
161
+ max_word = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
162
+ regress = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
163
+ blink = pd.array([None] * num_fixations, dtype=pd.BooleanDtype())
164
+ outOfBnds = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
165
+ outsideText = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
166
+ wordID = np.full(num_fixations, None)
167
+ land_pos = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
168
+ sacc_len = np.full(num_fixations, np.nan)
169
+
170
+ max_sentence = coords["in_sentence_number"].max()
171
+
172
+ curr_sent = np.zeros((max_sentence + 1, 2))
173
+ curr_sent[: max_sentence + 1, 0] = np.arange(0, max_sentence + 1)
174
+
175
+ if isinstance(coords["index"], str):
176
+ coords["index"] = pd.to_numeric(coords["index"], errors="coerce")
177
+
178
+ for j in range(len(fix)):
179
+ if (fix["y"][j] > 0) and (fix["x"][j] > 0) and (fix["y"][j] <= resolution_y) and (fix["x"][j] <= resolution_x):
180
+ loc = coords_map[round(fix["y"][j]), round(fix["x"][j])]
181
+ if pd.isnull(loc):
182
+ loc = None
183
+ else:
184
+ loc = None
185
+
186
+ fix_num[j] = j
187
+ fix_dur[j] = fix["duration"][j]
188
+ SFIX[j] = fix["start_uncorrected"][j]
189
+ EFIX[j] = fix["stop_uncorrected"][j]
190
+ x[j] = fix["x"][j]
191
+ y[j] = fix["y"][j]
192
+ blink[j] = fix["blink"][j]
193
+
194
+ if x[j] < 1 or x[j] > resolution_x or y[j] < 1 or y[j] > resolution_y:
195
+ outOfBnds[j] = 1
196
+ else:
197
+ outOfBnds[j] = 0
198
+ outsideText[j] = 1 if loc is None else 0
199
+
200
+ if fix["x"][j] < 0:
201
+ loc = None
202
+ outOfBnds[j] = 1
203
+ outsideText[j] = 1
204
+
205
+ if loc is not None:
206
+ sent[j] = coords["in_sentence_number"][loc]
207
+ line[j] = coords["assigned_line"][loc]
208
+ word[j] = coords["in_word_number"][loc]
209
+ word_line[j] = coords["wordline"][loc]
210
+ char_trial[j] = coords["index"][loc] + 1
211
+ char_line[j] = coords["letline"][loc]
212
+ wordID[j] = coords["in_word"][loc]
213
+ land_pos[j] = coords["letword"][loc]
214
+
215
+ if j > 0 and not pd.isna(char_trial[j]) and not pd.isna(char_trial[j - 1]):
216
+ sacc_len[j] = abs(char_trial[j] - char_trial[j - 1])
217
+ else:
218
+ sacc_len[j] = np.nan
219
+ else:
220
+ sent[j] = np.nan
221
+ line[j] = np.nan
222
+ word[j] = np.nan
223
+ word_line[j] = np.nan
224
+ char_trial[j] = np.nan
225
+ char_line[j] = np.nan
226
+ wordID[j] = np.nan
227
+ land_pos[j] = np.nan
228
+ sacc_len[j] = np.nan
229
+
230
+ if SL:
231
+ if loc is not None:
232
+ if j == 0:
233
+ max_sent[j] = sent[j]
234
+ else:
235
+ max_sent[j] = max_sent[j - 1] if pd.isna(sent[j]) or pd.isna(max_sent[j - 1]) else max_sent[j - 1]
236
+ if not (pd.isna(max_sent[j]) or pd.isna(sent[j])) and sent[j] > max_sent[j]:
237
+ max_sent[j] = sent[j]
238
+
239
+ if j == 0:
240
+ max_word[j] = abs(word[j])
241
+ curr_sent[sent[j] - 1, 1] = abs(word[j])
242
+ else:
243
+ max_word[j] = (
244
+ curr_sent[sent[j] - 1, 1]
245
+ if pd.isna(word[j]) or pd.isna(curr_sent[sent[j] - 1, 1])
246
+ else curr_sent[sent[j] - 1, 1]
247
+ )
248
+ if not (pd.isna(word[j]) or pd.isna(max_word[j])) and abs(word[j]) > curr_sent[sent[j] - 1, 1]:
249
+ max_word[j] = abs(word[j])
250
+ curr_sent[sent[j] - 1, 1] = abs(word[j])
251
+
252
+ if not (pd.isna(word[j]) or pd.isna(max_word[j])) and abs(word[j]) < max_word[j]:
253
+ regress[j] = 1
254
+ else:
255
+ regress[j] = 0
256
+
257
+ if j > 0 and not pd.isna(word[j]):
258
+ if pd.isna(regress[j - 1]):
259
+ regress[j] = np.nan
260
+ else:
261
+ if abs(word[j]) == max_word[j] and regress[j - 1] == 1 and word[j] in np.unique(word[:j]):
262
+ regress[j] = 1
263
+
264
+ raw_fix = pd.DataFrame(
265
+ {
266
+ "start_uncorrected": SFIX,
267
+ "stop_uncorrected": EFIX,
268
+ "x": x,
269
+ "y": y,
270
+ "fixation_number": fix_num,
271
+ "on_sentence_number_EM": sent,
272
+ "line_EM": line,
273
+ "word_EM": word,
274
+ "word_line_EM": word_line,
275
+ "char_trial_EM": char_trial,
276
+ "char_line_EM": char_line,
277
+ "regress_EM": regress,
278
+ "wordID_EM": wordID,
279
+ "land_pos_EM": land_pos,
280
+ "sacc_len_EM": sacc_len,
281
+ "blink_EM": blink,
282
+ "outOfBnds_EM": outOfBnds,
283
+ "outsideText_EM": outsideText,
284
+ }
285
+ )
286
+
287
+ fix2 = fix.merge(
288
+ raw_fix,
289
+ on=[
290
+ "start_uncorrected",
291
+ "stop_uncorrected",
292
+ "x",
293
+ "y",
294
+ "fixation_number",
295
+ ],
296
+ how="left",
297
+ )
298
+ return fix2
299
+
300
+
301
+ def RS(i, rawfix, coords, reqYthresh, reqXthresh, Ythresh, Xthresh, threshSimilar):
302
+
303
+ if i == 0:
304
+ return 0
305
+
306
+ lw = coords["char_xmax"][0] - coords["char_xmin"][0]
307
+ lh = coords["char_ymax"][0] - coords["char_ymin"][0]
308
+ meetXthresh = False
309
+ meetYthresh = False
310
+
311
+ leftSacc = rawfix["x"][i] < rawfix["x"][i - 1]
312
+ downSacc = rawfix["y"][i] > rawfix["y"][i - 1]
313
+
314
+ if downSacc & reqYthresh:
315
+ Ydiff = lh * Ythresh
316
+ trueYdiff = rawfix["y"][i] - rawfix["y"][i - 1]
317
+ meetYthresh = trueYdiff >= Ydiff
318
+
319
+ if leftSacc & reqXthresh:
320
+ Xdiff = lw * Xthresh
321
+ trueXdiff = rawfix["x"][i - 1] - rawfix["x"][i]
322
+ meetXthresh = trueXdiff >= Xdiff
323
+
324
+ maxPoints = 1 + 2
325
+ if reqYthresh:
326
+ maxPoints += 1
327
+ if reqXthresh:
328
+ maxPoints += 1
329
+
330
+ currPoints = 0
331
+ if leftSacc:
332
+ currPoints = currPoints + (1 / maxPoints)
333
+ if meetXthresh:
334
+ currPoints = currPoints + (1 / maxPoints)
335
+
336
+ if downSacc:
337
+ currPoints = currPoints + 2 * (1 / maxPoints)
338
+ if meetYthresh:
339
+ currPoints = currPoints + (1 / maxPoints)
340
+
341
+ return round(currPoints, 2)
342
+
343
+
344
+ def reMap(rawfix, i, coords_map, coords, newY=None):
345
+ rawfix.set_index("fixation_number", inplace=True)
346
+ assert i in rawfix.index, "Not in index"
347
+ rawfix.loc[i, "reAligned"] = True
348
+ rawfix.loc[i, "previous_line"] = rawfix.loc[i, "line_EM"]
349
+ rawfix.loc[i, "previous_y"] = rawfix.loc[i, "y"]
350
+ if newY is not None:
351
+ rawfix.loc[i, "y"] = newY
352
+ loc = coords_map[round(rawfix.loc[i, "y"]), round(rawfix.loc[i, "x"])]
353
+ if pd.isnull(loc):
354
+ return rawfix
355
+ rawfix.loc[i, "on_sentence_number_EM"] = coords["in_sentence_number"][loc]
356
+ rawfix.loc[i, "word_EM"] = coords["in_word_number"][loc]
357
+ rawfix.loc[i, "line_EM"] = coords["assigned_line"][loc]
358
+
359
+ return rawfix.reset_index(drop=False, names=["fixation_number"])
360
+
361
+
362
+ def reAlign(rawfix, coords, coords_map, RSpar):
363
+
364
+ ystart = coords["char_ymin"].min()
365
+ yend = coords["char_ymax"].max()
366
+ nlines = coords["assigned_line"].max()
367
+ letterHeight = coords["char_ymax"][0] - coords["char_ymin"][0]
368
+ xstart = pd.DataFrame(columns=["1", "2"])
369
+ xstart["1"] = np.arange(nlines + 1)
370
+ ystart = pd.DataFrame(columns=["1", "2"])
371
+ ystart["1"] = np.arange(nlines + 1)
372
+ xend = pd.DataFrame(columns=["1", "2"])
373
+ xend["1"] = np.arange(nlines + 1)
374
+ yend = pd.DataFrame(columns=["1", "2"])
375
+ yend["1"] = np.arange(nlines + 1)
376
+ rawfix["previous_x"] = np.nan
377
+
378
+ for i in coords["assigned_line"].unique():
379
+ a = coords[coords["assigned_line"] == i]
380
+ xstart.loc[i, "2"] = a["char_xmin"].min()
381
+ xend.loc[i, "2"] = a["char_xmax"].max()
382
+ ystart.loc[i, "2"] = a["char_ymin"].min()
383
+ yend.loc[i, "2"] = a["char_ymax"].min()
384
+
385
+ lineCenter = ystart["2"] + letterHeight / 2
386
+
387
+ rawfix["prob_return_sweep"] = np.nan
388
+ rawfix["prob_interline_saccade"] = np.nan
389
+ rawfix["reAligned"] = False
390
+ rawfix["previous_y"] = np.nan
391
+ rawfix["previous_line"] = np.nan
392
+
393
+ for i in range(rawfix.shape[0]):
394
+ rawfix.loc[i, "prob_return_sweep"] = RS(
395
+ i,
396
+ rawfix,
397
+ coords,
398
+ reqYthresh=True,
399
+ reqXthresh=True,
400
+ Ythresh=RSpar[0],
401
+ Xthresh=RSpar[1],
402
+ threshSimilar=RSpar[2],
403
+ )
404
+
405
+ if i > 0:
406
+ if (rawfix["prob_return_sweep"][i] < 1) & (rawfix["y"][i] > rawfix["y"][i - 1] + letterHeight / 2):
407
+ rawfix.loc[i, "prob_return_sweep"] = 1
408
+
409
+ rawfix.loc[i, "previous_x"] = rawfix["x"][i]
410
+ rawfix.loc[i, "previous_y"] = rawfix["y"][i]
411
+
412
+ if i > 0:
413
+ if rawfix["y"][i] < rawfix["y"][i - 1] - letterHeight / 2:
414
+ rawfix.loc[i, "prob_interline_saccade"] = 1
415
+ else:
416
+ rawfix.loc[i, "prob_interline_saccade"] = 0
417
+
418
+ RsweepFix = np.sort(
419
+ np.concatenate(
420
+ (np.where(rawfix["prob_return_sweep"] == 1)[0], np.where(rawfix["prob_interline_saccade"] == 1)[0])
421
+ )
422
+ )
423
+
424
+ for i in range(len(RsweepFix)):
425
+ if i == 0:
426
+ linePass = rawfix.loc[: RsweepFix[0] - 1]
427
+
428
+ elif i >= len(RsweepFix):
429
+ linePass = rawfix.loc[RsweepFix[-1] :]
430
+
431
+ else:
432
+ linePass = rawfix.loc[RsweepFix[i - 1] : RsweepFix[i] - 1]
433
+
434
+ if linePass.shape[0] == 1:
435
+ continue
436
+
437
+ avgYpos = linePass["y"].mean(skipna=True)
438
+ whichLine = min(range(len(lineCenter)), key=lambda index: abs(lineCenter[index] - avgYpos))
439
+ linePass.reset_index(inplace=True, drop=True)
440
+ for j in range(linePass.shape[0]):
441
+ onLine = (linePass["y"][j] >= ystart["2"][whichLine]) & (linePass["y"][j] <= yend["2"][whichLine])
442
+
443
+ if not onLine:
444
+ if linePass["y"][j] < ystart["2"][whichLine]:
445
+ rawfix = reMap(
446
+ rawfix, linePass.loc[j, "fixation_number"], coords_map, coords, newY=ystart["2"][whichLine] + 5
447
+ )
448
+ else:
449
+ rawfix = reMap(
450
+ rawfix, linePass.loc[j, "fixation_number"], coords_map, coords, newY=yend["2"][whichLine] - 5
451
+ )
452
+ rawfix.loc[linePass.loc[j, "fixation_number"], "reAligned"] = True
453
+ else:
454
+ rawfix.loc[linePass.loc[j, "fixation_number"], "reAligned"] = False
455
+
456
+ return rawfix
457
+
458
+
459
+ def cleanData(
460
+ raw_fix,
461
+ algo_choice,
462
+ removeBlinks=True,
463
+ combineNearbySmallFix=True,
464
+ combineMethod="char",
465
+ combineDist=1,
466
+ removeSmallFix=True,
467
+ smallFixCutoff=80,
468
+ remove_duration_outliers=True,
469
+ outlierMethod="ms",
470
+ outlierCutoff=800,
471
+ keepRS=False,
472
+ ):
473
+
474
+ if combineNearbySmallFix:
475
+ nbefore = raw_fix.shape[0]
476
+ which_comb = []
477
+
478
+ for i in range(nbefore):  # iterate over fixation rows, not DataFrame columns
479
+ prev_line_same = False
480
+ next_line_same = False
481
+
482
+ if (i > 0) and (i < nbefore - 1):
483
+ if combineMethod == "char":
484
+ if (
485
+ pd.isna(raw_fix[f"letternum_{algo_choice}"][i])
486
+ or pd.isna(raw_fix[f"letternum_{algo_choice}"][i - 1])
487
+ or pd.isna(raw_fix[f"letternum_{algo_choice}"][i + 1])
488
+ ):
489
+ continue
490
+
491
+ if raw_fix["duration"][i] < smallFixCutoff:
492
+ if (
493
+ not pd.isna(raw_fix[f"line_num_{algo_choice}"][i])
494
+ and not pd.isna(raw_fix[f"line_num_{algo_choice}"][i - 1])
495
+ and not pd.isna(raw_fix[f"line_num_{algo_choice}"][i + 1])
496
+ ):
497
+
498
+ if raw_fix[f"line_num_{algo_choice}"][i] == raw_fix[f"line_num_{algo_choice}"][i - 1]:
499
+ prev_line_same = True
500
+ if raw_fix[f"line_num_{algo_choice}"][i] == raw_fix[f"line_num_{algo_choice}"][i + 1]:
501
+ next_line_same = True
502
+
503
+ if combineMethod == "char":
504
+ prev = abs(raw_fix[f"letternum_{algo_choice}"][i] - raw_fix[f"letternum_{algo_choice}"][i - 1])
505
+ after = abs(raw_fix[f"letternum_{algo_choice}"][i] - raw_fix[f"letternum_{algo_choice}"][i + 1])
506
+
507
+ else:
508
+ prev = abs(round(raw_fix["x"][i]) - round(raw_fix["x"][i - 1]))
509
+ after = abs(round(raw_fix["x"][i]) - round(raw_fix["x"][i + 1]))
510
+
511
+ if prev <= combineDist:
512
+ which_comb.append(i)
513
+
514
+ if prev_line_same:
515
+
516
+ raw_fix["duration"][i - 1] += raw_fix["duration"][i]
517
+
518
+ if keepRS and (raw_fix["Rtn_sweep"][i] == 1):
519
+
520
+ raw_fix["Rtn_sweep"][i - 1] = 1
521
+
522
+ if after <= combineDist:
523
+ which_comb.append(i)
524
+
525
+ if next_line_same:
526
+
527
+ raw_fix["duration"][i + 1] += raw_fix["duration"][i]
528
+
529
+ if keepRS and (raw_fix["Rtn_sweep"][i] == 1):
530
+
531
+ raw_fix["Rtn_sweep"][i + 1] = 1
532
+
533
+ which_comb = list(set(which_comb))
534
+
535
+ if len(which_comb) > 0:
536
+ raw_fix = raw_fix.drop(labels=which_comb, axis=0)
537
+ nstart = raw_fix.shape[0]
538
+
539
+ if removeBlinks:
540
+ raw_fix = raw_fix[~raw_fix["blink"]].copy()
541
+ nblink = nstart - raw_fix.shape[0]
542
+
543
+ if remove_duration_outliers:
544
+ if outlierMethod == "ms":
545
+ outIndices = np.where(raw_fix["duration"] > outlierCutoff)[0]
546
+ if len(outIndices) > 0:
547
+ raw_fix = raw_fix.drop(outIndices).copy()
548
+ elif outlierMethod == "std":
549
+ nSubCutoff, nOutliers = [], 0
550
+ subM = np.mean(raw_fix["duration"])
551
+ subSTD = np.std(raw_fix["duration"])
552
+ cutoff = subM + outlierCutoff * subSTD
553
+ nSubCutoff.append(len(np.where(raw_fix["duration"] > cutoff)[0]))
554
+ nOutliers = sum(nSubCutoff)
555
+
556
+ return raw_fix.reset_index(drop=True)
557
+
558
+
559
+ def get_space(s):
560
+ if len(s) == 0 or s == " ":
561
+ return 1
562
+ else:
563
+ return None
564
+
565
+
566
+ def get_num(string):
567
+ strr = "".join([i for i in string if i.isdigit()])
568
+ if len(strr) > 0:
569
+ return int(strr)
570
+ else:
571
+ ic(string)
572
+ return strr
573
+
574
+
575
+ def parse_itemID(trialid):
576
+ I = re.search(r"I", trialid).start()
577
+ condition = get_num(trialid[:I])
578
+
579
+ D = re.search(r"D", trialid).start()
580
+ item = get_num(trialid[I + 1 : D])
581
+ depend = get_num(trialid[D:])
582
+
583
+ E = trialid[0]
584
+
585
+ return {"trialid": trialid, "condition": condition, "item": item, "depend": depend, "trial_is": E}
586
+
587
+
588
+ def get_coord(str_input):
589
+ string = "\n".join(
590
+ [l.split("\t")[1].strip() for l in str_input if (("DELAY" not in l) & ("BUTTON" not in l) & ("REGION" in l))]
591
+ )
592
+
593
+ df = pd.read_table(
594
+ StringIO(string),
595
+ sep=" ",
596
+ names=["X" + str(i) for i in range(1, 12)],
597
+ )
598
+ df.loc[:, ["char_xmin", "char_ymin", "char_xmax", "char_ymax", "X11"]] = df[
599
+ ["char_xmin", "char_ymin", "char_xmax", "char_ymax", "X11"]
600
+ ].apply(pd.to_numeric, errors="coerce")
601
+ df.char = df.char.fillna("")
602
+
603
+ a = df[df["char"] == ""].index
604
+ for i in a:
605
+ if "space" not in df.columns:
606
+ df.loc[:, "space"] = None
607
+ df.at[i, "space"] = 1
608
+
609
+ if "char_xmin" in df.columns and "char_ymin" in df.columns:
610
+ df.at[i, "char_xmin"], df.at[i, "char_ymin"] = df.at[i, "char_ymin"], df.at[i, "char_xmax"]
611
+
612
+ if "char_ymin" in df.columns and "char_xmax" in df.columns:
613
+ df.at[i, "char_ymin"], df.at[i, "char_xmax"] = df.at[i, "char_xmax"], df.at[i, "char_ymax"]
614
+
615
+ if "char_xmax" in df.columns and "char_ymax" in df.columns:
616
+ df.at[i, "char_xmax"], df.at[i, "char_ymax"] = df.at[i, "char_ymax"], df.at[i, "X11"]
617
+ df = df.drop(columns=["X1", "X2", "X3", "X5"])
618
+ return df
619
+
620
+
621
+ def map_sent(df):
622
+
623
+ sent_bnd = df[(df.char == ".") | (df.char == "?") | (df.char == "!")].index.tolist()
624
+
625
+ if len(sent_bnd) > 0:
626
+ sent = pd.Series([-1] * len(df))
627
+
628
+ for i, eidx in enumerate(sent_bnd):
629
+ sidx = sent_bnd[i - 1] if i > 0 else 0
630
+ if i == len(sent_bnd) - 1:
631
+ sent.loc[sidx:] = len(sent_bnd) - 1
632
+ else:
633
+ sent.loc[sidx:eidx] = i
634
+ df["sent"] = sent
635
+ else:
636
+ df["sent"] = 1
637
+ return df
638
+
639
+
640
+ def map_line(df):
641
+ df = df[~pd.isnull(df["char_ymin"])].reset_index(names="index_temp")
642
+
643
+ lines = sorted(set(df["char_ymin"].values))
644
+
645
+ assigned_line = np.array([], dtype=int)
646
+
647
+ for i in range(len(lines)):
648
+ loc_lines = np.where(df["char_ymin"].values == lines[i])[0]
649
+ assigned_line = np.concatenate((assigned_line, np.full(len(loc_lines), fill_value=i)))
650
+ df.loc[len(assigned_line) - 1, "space"] = 2
651
+
652
+ df["assigned_line"] = assigned_line
653
+ df.set_index("index_temp", inplace=True)
654
+
655
+ return df
656
+
657
+
658
+ def map_words(df):
659
+ curr_sent, curr_line, curr_word = 0, 0, 0
660
+ df["space"] == 2
661
+
662
+ for i in df.index:
663
+ newSent = curr_sent != df.loc[i, "sent"]
664
+ newLine = curr_line != df.loc[i, "assigned_line"]
665
+
666
+ df.loc[i, "word"] = curr_word
667
+ if df.loc[i, "char"] == "" and not newSent:
668
+ curr_word += 1
669
+ df.loc[i, "word"] = curr_word
670
+
671
+ elif newLine:
672
+ if df.loc[i, "char"] != ".":
673
+ curr_word += 1
674
+ df.loc[i, "word"] = curr_word
675
+ curr_line += 1
676
+
677
+ elif newSent:
678
+ curr_sent += 1
679
+ curr_word = 0
680
+ df.loc[i, "word"] = curr_word
681
+
682
+ return df
683
+
684
+
685
+ def get_return_sweeps(raw_fix_new, coords, algo_choice): # TODO Check if covered by popEye
686
+ currentSent = 0
687
+ currentLine = 0
688
+ maxLine = 0
689
+ inReg = False
690
+
691
+ curr_sent = np.zeros((max(coords["in_sentence_number"]) + 1, 4))
692
+ curr_sent[:, 0] = np.arange(0, max(coords["in_sentence_number"]) + 1)
693
+
694
+ diff_sent = coords["in_sentence_number"].diff().fillna(0)
695
+ last_words = coords.loc[np.where(diff_sent == 1), "in_word_number"]
696
+ curr_sent[:, 2] = np.append(last_words.values, coords["in_word_number"].iloc[-1])
697
+ for m in range(1, len(raw_fix_new)):
698
+ if not (pd.isna(raw_fix_new["char_line_EM"][m - 1]) or pd.isna(raw_fix_new["char_line_EM"][m])):
699
+ raw_fix_new.at[m, "sacc_len_EM"] = abs(raw_fix_new["char_line_EM"][m] - raw_fix_new["char_line_EM"][m - 1])
700
+
701
+ if not pd.isna(raw_fix_new["line_EM"][m]):
702
+ currentLine = raw_fix_new["line_EM"][m]
703
+
704
+ if currentLine > maxLine:
705
+ maxLine = currentLine
706
+ raw_fix_new.at[m, "Rtn_sweep"] = 1
707
+
708
+ if m < len(raw_fix_new) - 1:
709
+ sameLine = (
710
+ not (pd.isna(raw_fix_new["line_EM"][m + 1]) or pd.isna(raw_fix_new["line_EM"][m]))
711
+ and raw_fix_new["line_EM"][m + 1] == raw_fix_new["line_EM"][m]
712
+ )
713
+
714
+ if raw_fix_new["x"][m + 1] < raw_fix_new["x"][m]:
715
+ raw_fix_new.at[m, "Rtn_sweep_type"] = "undersweep" if sameLine else None
716
+ else:
717
+ raw_fix_new.at[m, "Rtn_sweep_type"] = "accurate" if sameLine else None
718
+ else:
719
+ raw_fix_new.at[m, "Rtn_sweep_type"] = np.nan
720
+ else:
721
+ raw_fix_new.at[m, "Rtn_sweep"] = 0
722
+
723
+ if not pd.isna(raw_fix_new["on_sentence_number_EM"][m]):
724
+ if m == 1:
725
+ curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2] = raw_fix_new["word_EM"][m]
726
+ raw_fix_new.at[m, "regress_EM"] = 0
727
+ else:
728
+ if raw_fix_new["word_EM"][m] > curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]:
729
+ curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2] = raw_fix_new["word_EM"][m]
730
+ inReg = False
731
+
732
+ if currentSent < raw_fix_new["on_sentence_number_EM"][m]:
733
+ curr_sent[currentSent, 3] = 1
734
+ currentSent = raw_fix_new["on_sentence_number_EM"][m]
735
+
736
+ if (
737
+ not pd.isna(raw_fix_new["on_sentence_number_EM"][m - 1])
738
+ and raw_fix_new["on_sentence_number_EM"][m] > raw_fix_new["on_sentence_number_EM"][m - 1]
739
+ ):
740
+ curr_sent[int(raw_fix_new["on_sentence_number_EM"][m - 1]), 3] = 1
741
+
742
+ if (
743
+ raw_fix_new["word_EM"][m] < curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]
744
+ and curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 3] == 0
745
+ ):
746
+ raw_fix_new.at[m, "regress_EM"] = 1
747
+ inReg = True
748
+ else:
749
+ if curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 3] == 0:
750
+ raw_fix_new.at[m, "regress_EM"] = 0
751
+
752
+ if (
753
+ raw_fix_new["word_EM"][m] == curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]
754
+ and inReg
755
+ ):
756
+ raw_fix_new.at[m, "regress_EM"] = 1
757
+ else:
758
+ raw_fix_new.at[m, "regress_EM"] = 1
759
+ raw_fix_new.at[m, "regress2nd_EM"] = 1
760
+ inReg = True
761
+ return raw_fix_new
762
+
763
+
764
+ def word_m_EM(n2):
765
+ sub_list = []
766
+ item_list = []
767
+ cond_list = []
768
+ seq_list = []
769
+ word_list = []
770
+ wordID_list = []
771
+ sent_list = []
772
+ FFD_list = []
773
+ SFD_list = []
774
+ GD_list = []
775
+ TVT_list = []
776
+ nfix1_list = []
777
+ nfix2_list = []
778
+ nfixAll_list = []
779
+ regress_list = []
780
+ o = n2["sent"].unique()
781
+ for k in range(len(o)):
782
+ q = n2[n2["sent"] == o[k]]
783
+ r = sorted(q["word"].unique())
784
+
785
+ for l in range(len(r)):
786
+ word_list.append(r[l])
787
+ sub_list.append(n2["sub"].iloc[0])
788
+ item_list.append(n2["item"].iloc[0])
789
+ seq_list.append(n2["seq"].iloc[0])
790
+ cond_list.append(n2["cond"].iloc[0])
791
+ sent_list.append(o[k])
792
+
793
+ p = q[q["word"] == r[l]]
794
+
795
+ if p.shape[0] == 0:
796
+ FFD_list.append(None)
797
+ SFD_list.append(None)
798
+ GD_list.append(None)
799
+ TVT_list.append(None)
800
+ nfix1_list.append(0)
801
+ nfix2_list.append(0)
802
+ nfixAll_list.append(0)
803
+ else:
804
+ p1 = p[p["regress"] == 0]
805
+ p2 = p[p["regress"] == 1]
806
+
807
+ if p1.shape[0] == 0:
808
+ FFD_list.append(None)
809
+ SFD_list.append(None)
810
+ GD_list.append(None)
811
+ elif p1.shape[0] == 1:
812
+ FFD_list.append(p1["fix_dur"].iloc[0])
813
+ SFD_list.append(p1["fix_dur"].iloc[0])
814
+ GD_list.append(p1["fix_dur"].iloc[0])
815
+ else:
816
+ FFD_list.append(p1["fix_dur"].iloc[0])
817
+ SFD_list.append(None)
818
+ GD_list.append(p1["fix_dur"].sum())
819
+
820
+ TVT_list.append(p["fix_dur"].sum())
821
+ nfix1_list.append(p1.shape[0])
822
+ nfix2_list.append(p2.shape[0])
823
+ nfixAll_list.append(p1.shape[0] + p2.shape[0])
824
+
825
+ wordID_list.append(p["wordID"].iloc[0])
826
+
827
+ if nfix2_list[-1] == 0:
828
+ regress_list.append(0)
829
+ else:
830
+ regress_list.append(1)
831
+
832
+ dataT = pd.DataFrame(
833
+ {
834
+ "sub": sub_list,
835
+ "item": item_list,
836
+ "cond": cond_list,
837
+ "seq": seq_list,
838
+ "word": word_list,
839
+ "wordID": wordID_list,
840
+ "sent": sent_list,
841
+ "FFD": FFD_list,
842
+ "SFD": SFD_list,
843
+ "GD": GD_list,
844
+ "TVT": TVT_list,
845
+ "nfix1": nfix1_list,
846
+ "nfix2": nfix2_list,
847
+ "nfixAll": nfixAll_list,
848
+ "regress": regress_list,
849
+ }
850
+ )
851
+
852
+ sub_list = []
853
+ item_list = []
854
+ cond_list = []
855
+ seq_list = []
856
+ word_list = []
857
+ wordID_list = []
858
+ sent_list = []
859
+ FFD_list = []
860
+ SFD_list = []
861
+ GD_list = []
862
+ TVT_list = []
863
+ nfix1_list = []
864
+ nfix2_list = []
865
+ nfixAll_list = []
866
+ regress_list = []
867
+
868
+ if "dataN" in locals():
869
+ dataN = pd.concat([dataN, dataT], ignore_index=True)
870
+ else:
871
+ dataN = dataT
872
+
873
+
874
+ def word_measures_EM(data, algo_choice, include_time_stamps=False):
875
+ add_blanks = False
876
+
877
+ if "blink" in data.columns:
878
+ required_columns = ["blink", "prev_blink", "after_blink"]
879
+ if all(col in data.columns for col in required_columns):
880
+ if (data["blink"] + data["prev_blink"] + data["after_blink"]).sum() == 0:
881
+ ic("Blinks appear to be already excluded! \n\n")
882
+ else:
883
+ add_blanks = True
884
+ ic("There appears to be valid blink data! We will map blinks to individual words. \n\n")
885
+
886
+ regress_blinks = data[(data["blink"] == 1) & (~data["regress_EM"].isna())].index
887
+
888
+ if len(regress_blinks) < 1:
889
+ BlinkFixTypeNotMapped = True
890
+ ic(
891
+ "Fixation type is not mapped for observations with blinks. Therefore, blinks can't be mapped in terms of 1st and 2nd pass reading."
892
+ )
893
+ ic(
894
+ "Please note that, by default, blink fixation durations will also not be added to fixation duration measures for that word since it's assumed you will delete this word from analysis.\n"
895
+ )
896
+ ic("If you need to change this, see settings in the pre-processing function.\n\n")
897
+
898
+ data_n = pd.DataFrame()
899
+
900
+ o_k = sorted(np.unique(data[f"on_sentence_num_{algo_choice}"]))
901
+
902
+ for k, sent_k in enumerate(o_k):
903
+ q_k = data[data[f"on_sentence_num_{algo_choice}"] == sent_k]
904
+
905
+ p1_k = q_k[q_k["regress_EM"] == 0].copy()
906
+ p2_k = q_k[q_k["regress_EM"] == 1].copy()
907
+
908
+ RS_word = np.nan
909
+ check_next = False
910
+
911
+ if max(data[f"line_num_{algo_choice}"]) > 1:
912
+ for z, q_row in q_k.iterrows():
913
+ if not pd.isna(q_row["Rtn_sweep"]):
914
+ if q_row["Rtn_sweep"] == 1:
915
+ check_next = True
916
+ RS_word = (
917
+ q_row[f"line_word_{algo_choice}"]
918
+ if not pd.isna(q_row[f"line_word_{algo_choice}"])
919
+ else np.nan
920
+ )
921
+ elif check_next and (pd.notna(q_row[f"line_word_{algo_choice}"])) and (q_row["regress_EM"]):
922
+ break
923
+
924
+ word_l = []
925
+ sub_l = [data.loc[0, "subject"]] * len(q_k)
926
+ item_l = [data.loc[0, "item"]] * len(q_k)
927
+ cond_l = [1] * len(q_k)
928
+
929
+ for l, q_row in q_k.iterrows():
930
+ word_l.append(q_row[f"on_word_number_{algo_choice}"])
931
+
932
+ if add_blanks:
933
+ sum_1st_pass = (
934
+ q_row["blink"]
935
+ + p1_k[p1_k.index[q_row.name]]["prev_blink"]
936
+ + p2_k[p2_k.index[q_row.name]]["after_blink"]
937
+ ).sum()
938
+
939
+ blinks_l = [0] * len(word_l)
940
+ if sum_1st_pass > 0:
941
+ blinks_l[l] = 1
942
+
943
+ for l, q_row in q_k.iterrows():
944
+ word_line_l = [q_row[f"line_word_{algo_choice}"]]
945
+
946
+ line_l = [q_row[f"line_num_{algo_choice}"]]
947
+
948
+ if include_time_stamps:
949
+ EFIX_SFD_l = [np.nan]
950
+
951
+ for l, q_row in q_k.iterrows():
952
+ word_line_l.append(q_row[f"line_word_{algo_choice}"])
953
+ line_l.append(q_row[f"line_num_{algo_choice}"])
954
+
955
+ if include_time_stamps:
956
+
957
+ if len(p1_k) > 0:
958
+
959
+ if len(p1_k) == 1:
960
+ EFIX_SFD_l.append(p1_k["stop_uncorrected"][0])
961
+
962
+ data_t = pd.DataFrame(
963
+ list(
964
+ zip(
965
+ sub_l,
966
+ item_l,
967
+ cond_l,
968
+ word_l,
969
+ line_l,
970
+ )
971
+ ),
972
+ columns=[
973
+ "subject",
974
+ "item",
975
+ "condition",
976
+ f"on_word_number_{algo_choice}",
977
+ f"line_num_{algo_choice}",
978
+ "FFD",
979
+ "SFD",
980
+ "GD",
981
+ "TVT",
982
+ "nfix1",
983
+ "nfix2",
984
+ "nfixAll",
985
+ "regress",
986
+ ],
987
+ )
988
+
989
+ if add_blanks:
990
+ data_t["blinks_1stPass"] = blinks_l
991
+
992
+ data_n = pd.concat([data_n, data_t], ignore_index=True)
993
+
994
+ return data_n
eyekit_measures.py ADDED
@@ -0,0 +1,194 @@
1
+ import copy
2
+ import eyekit as ek
3
+ import numpy as np
4
+ import pandas as pd
5
+ from PIL import Image
6
+ from icecream import ic
7
+ import time
8
+
9
+ ic.configureOutput(includeContext=True)
10
+ MEASURES_DICT = {
11
+ "number_of_fixations": [],
12
+ "initial_fixation_duration": [],
13
+ "first_of_many_duration": [],
14
+ "total_fixation_duration": [],
15
+ "gaze_duration": [],
16
+ "go_past_duration": [],
17
+ "second_pass_duration": [],
18
+ "initial_landing_position": [],
19
+ "initial_landing_distance": [],
20
+ "landing_distances": [],
21
+ "number_of_regressions_in": [],
22
+ }
23
+
24
+
25
+ def get_fix_seq_and_text_block(
26
+ dffix,
27
+ trial,
28
+ x_txt_start=None,
29
+ y_txt_start=None,
30
+ font_face="Courier New",
31
+ font_size=None,
32
+ line_height=None,
33
+ use_corrected_fixations=True,
34
+ correction_algo="warp",
35
+ ):
36
+ if use_corrected_fixations and correction_algo is not None:
37
+ fixations_tuples = [
38
+ (
39
+ (x[1]["x"], x[1][f"y_{correction_algo}"], x[1]["corrected_start_time"], x[1]["corrected_end_time"])
40
+ if x[1]["corrected_start_time"] < x[1]["corrected_end_time"]
41
+ else (x[1]["x"], x[1]["y"], x[1]["corrected_start_time"], x[1]["corrected_end_time"] + 1)
42
+ )
43
+ for x in dffix.iterrows()
44
+ ]
45
+ else:
46
+ fixations_tuples = [
47
+ (
48
+ (x[1]["x"], x[1]["y"], x[1]["corrected_start_time"], x[1]["corrected_end_time"])
49
+ if x[1]["corrected_start_time"] < x[1]["corrected_end_time"]
50
+ else (x[1]["x"], x[1]["y"], x[1]["corrected_start_time"], x[1]["corrected_end_time"] + 1)
51
+ )
52
+ for x in dffix.iterrows()
53
+ ]
54
+
55
+ if "display_coords" in trial:
56
+ display_coords = trial["display_coords"]
57
+ else:
58
+ display_coords = (0, 0, 1920, 1080)
59
+ screen_size = ((display_coords[2] - display_coords[0]), (display_coords[3] - display_coords[1]))
60
+
61
+ try:
62
+ fixation_sequence = ek.FixationSequence(fixations_tuples)
63
+ except Exception as e:
64
+ ic(e)
65
+ ic(f"Creating fixation failed for {trial['trial_id']} {trial['filename']}")
66
+ return None, None, screen_size
67
+
68
+ y_diffs = np.unique(trial["line_heights"])
69
+ if len(y_diffs) == 1:
70
+ y_diff = y_diffs[0]
71
+ else:
72
+ y_diff = np.min(y_diffs)
73
+ chars_list = trial["chars_list"]
74
+ max_line = int(chars_list[-1]["assigned_line"])
75
+ words_on_lines = {x: [] for x in range(int(max_line) + 1)}
76
+ [words_on_lines[x["assigned_line"]].append(x["char"]) for x in chars_list]
77
+ sentence_list = ["".join([s for s in v]) for idx, v in words_on_lines.items()]
78
+
79
+ if x_txt_start is None:
80
+ x_txt_start = float(chars_list[0]["char_xmin"])
81
+ if y_txt_start is None:
82
+ y_txt_start = float(chars_list[0]["char_ymax"])
83
+
84
+ if font_face is None and "font" in trial:
85
+ font_face = trial["font"]
86
+ elif font_face is None:
87
+ font_face = "DejaVu Sans Mono"
88
+
89
+ if font_size is None and "font_size" in trial:
90
+ font_size = trial["font_size"]
91
+ elif font_size is None:
92
+ font_size = float(y_diff * 0.333) # pixel to point conversion
93
+ if line_height is None:
94
+ line_height = float(y_diff)
95
+ textblock_input_dict = dict(
96
+ text=sentence_list,
97
+ position=(float(x_txt_start), float(y_txt_start)),
98
+ font_face=font_face,
99
+ line_height=line_height,
100
+ font_size=font_size,
101
+ anchor="left",
102
+ align="left",
103
+ )
104
+ textblock = ek.TextBlock(**textblock_input_dict)
105
+
106
+ ek.io.save(fixation_sequence, f'results/fixation_sequence_eyekit_{trial["trial_id"]}.json', compress=False)
107
+ ek.io.save(textblock, f'results/textblock_eyekit_{trial["trial_id"]}.json', compress=False)
108
+
109
+ return fixations_tuples, textblock_input_dict, screen_size
110
+
111
+
112
+ def eyekit_plot(fixations_tuples, textblock_input_dict, screen_size):
113
+ textblock = ek.TextBlock(**textblock_input_dict)
114
+ img = ek.vis.Image(*screen_size)
115
+ img.draw_text_block(textblock)
116
+ for word in textblock.words():
117
+ img.draw_rectangle(word, color="hotpink")
118
+ fixation_sequence = ek.FixationSequence(fixations_tuples)
119
+ img.draw_fixation_sequence(fixation_sequence)
120
+ img.save("temp_eyekit_img.png", crop_margin=200)
121
+ img_png = Image.open("temp_eyekit_img.png")
122
+ return img_png
123
+
124
+
125
+ def plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure, use_characters=False):
126
+ textblock = ek.TextBlock(**textblock_input_dict)
127
+ fixation_sequence = ek.FixationSequence(fixations_tuples)
128
+
129
+ eyekitplot_img = eyekit_plot(fixations_tuples, textblock_input_dict, screen_size)
130
+ eyekitplot_img = ek.vis.Image(*screen_size)
131
+ eyekitplot_img.draw_text_block(textblock)
132
+ if use_characters:
133
+ measure_results = getattr(ek.measure, measure)(textblock.characters(), fixation_sequence)
134
+ enum = textblock.characters()
135
+ else:
136
+ measure_results = getattr(ek.measure, measure)(textblock.words(), fixation_sequence)
137
+ enum = textblock.words()
138
+ for word in enum:
139
+ eyekitplot_img.draw_rectangle(word, color="lightseagreen")
140
+ x = word.onset
141
+ y = word.y_br - 3
142
+ label = f"{measure_results[word.id]}"
143
+ eyekitplot_img.draw_annotation((x, y), label, color="lightseagreen", font_face="Arial bold", font_size=15)
144
+ eyekitplot_img.draw_fixation_sequence(fixation_sequence, color="gray")
145
+ eyekitplot_img.save("multiline_passage_piccol.png", crop_margin=100)
146
+ img_png = Image.open("multiline_passage_piccol.png")
147
+ return img_png
148
+
149
+
150
+ def get_eyekit_measures(fixations_tuples, textblock_input_dict, trial, get_char_measures=False):
151
+ textblock = ek.TextBlock(**textblock_input_dict)
152
+ fixation_sequence = ek.FixationSequence(fixations_tuples)
153
+ measures = copy.deepcopy(MEASURES_DICT)
154
+ words = []
155
+ for w in textblock.words():
156
+ words.append(w.text)
157
+ for m in measures.keys():
158
+ measures[m].append(getattr(ek.measure, m)(w, fixation_sequence))
159
+ word_measures_df = pd.DataFrame(measures)
160
+ word_measures_df["word_number"] = np.arange(0, len(words))
161
+ word_measures_df["word"] = words
162
+
163
+ first_column = word_measures_df.pop("word")
164
+ word_measures_df.insert(0, "word", first_column)
165
+ first_column = word_measures_df.pop("word_number")
166
+ word_measures_df.insert(0, "word_number", first_column)
167
+
168
+ if "item" in trial and "item" not in word_measures_df.columns:
169
+ word_measures_df.insert(loc=0, column="item", value=trial["item"])
170
+ if "condition" in trial and "condition" not in word_measures_df.columns:
171
+ word_measures_df.insert(loc=0, column="condition", value=trial["condition"])
172
+ if "trial_id" in trial and "trial_id" not in word_measures_df.columns:
173
+ word_measures_df.insert(loc=0, column="trial_id", value=trial["trial_id"])
174
+ if "subject" in trial and "subject" not in word_measures_df.columns:
175
+ word_measures_df.insert(loc=0, column="subject", value=trial["subject"])
176
+ if get_char_measures:
177
+ measures = copy.deepcopy(MEASURES_DICT)
178
+
179
+ characters = []
180
+ for c in textblock.characters():
181
+ characters.append(c.text)
182
+ for m in measures.keys():
183
+ measures[m].append(getattr(ek.measure, m)(c, fixation_sequence))
184
+ character_measures_df = pd.DataFrame(measures)
185
+ character_measures_df["char_number"] = np.arange(0, len(characters))
186
+ character_measures_df["character"] = characters
187
+
188
+ first_column = character_measures_df.pop("character")
189
+ character_measures_df.insert(0, "character", first_column)
190
+ first_column = character_measures_df.pop("char_number")
191
+ character_measures_df.insert(0, "char_number", first_column)
192
+ else:
193
+ character_measures_df = None
194
+ return word_measures_df, character_measures_df
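+ 
+ 
+ # Minimal usage sketch (an illustrative helper, not called anywhere else in this repo): it assumes
+ # `dffix` is a fixation DataFrame with the "x", "y", "corrected_start_time" and
+ # "corrected_end_time" columns used above, and `trial` is a dict containing at least
+ # "chars_list", "line_heights", "trial_id" and "filename".
+ def example_word_measures(dffix, trial):
+     fixations_tuples, textblock_input_dict, _screen_size = get_fix_seq_and_text_block(
+         dffix, trial, use_corrected_fixations=False, correction_algo=None
+     )
+     if fixations_tuples is None:  # FixationSequence creation failed
+         return None
+     # Word-level eyekit measures; character-level measures are skipped here.
+     word_measures_df, _ = get_eyekit_measures(
+         fixations_tuples, textblock_input_dict, trial, get_char_measures=False
+     )
+     return word_measures_df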
fixations_df_columns.md ADDED
@@ -0,0 +1,88 @@
1
+ #### Column names for Fixation Dataframe
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye))
3
+ If a column depends on a line assignment, then its name ends with an _ALGORITHM_NAME suffix (see the sketch after this list).
4
+ - subject: Subject name or ID
5
+ - trial_id: Trial ID
6
+ - item: Item ID
7
+ - condition: Condition (if applicable)
8
+ - fixation_number: Index of fixation
9
+ - start_uncorrected: Starting timestamp of event as recorded by EyeLink
10
+ - stop_uncorrected: End timestamp of event as recorded by EyeLink
11
+ - start_time: Start time (in ms since start of the trial)
12
+ - end_time: End time (in ms since start of the trial)
13
+ - corrected_start_time: Start time of the event, measured from the first fixation
14
+ - corrected_end_time: End time of the event, measured from the first fixation
15
+ - x: Raw x position (in pixel)
16
+ - y: Raw y position (in pixel)
17
+ - pupil_size: Size of pupil as recorded by EyeLink
18
+ - distance_in_char_widths: Horizontal distance to previous fixation in number of character widths
19
+ - y_ALGORITHM: Corrected y position (in pixel), i.e. after line assignment
20
+ - y_ALGORITHM_correction: Difference between corrected and raw y position (in pixel)
21
+ - duration: Duration (in ms)
22
+ - sac_in: Incoming saccade length (in letters)
23
+ - sac_out: Outgoing saccade length (in letters)
24
+ - type: Whether fixation is an outlier fixation ("out"), i.e. located outside the text area (see assign.outlier and assign.outlier.dist arguments)
25
+ - blink: Whether a blink occurred directly before or after the fixation
26
+ - run: Number of run the fixation was assigned to (if applicable)
27
+ - linerun: Number of run on the line the fixation was assigned to (if applicable)
28
+ - line_num: Number of line the fixation was assigned to
29
+ - line_change: Difference between the line of the current and the last fixation
30
+ - line_let: Number of letter on line
31
+ - line_word: Number of word on line
32
+ - letternum: Number of letter in trial
33
+ - letter: Name of Letter
34
+ - on_word_number: Number of word in trial
35
+ - on_word: Name of Word
36
+ - ianum: Number of IA in trial
37
+ - ia: Name of IA
38
+ - on_sentence_num: Number of sentence in trial
39
+ - on_sentence: Sentence text
40
+ - sentence_nwords: Number of words in sentence
41
+ - trial: Name of trial (abbreviated)
42
+ - trial_nwords: Number of words in trial
43
+ - word_fix: Number of fixation on word
44
+ - word_run: Number of the run in which the word was read
45
+ - word_runid: Number of the word run the fixation belongs to
46
+ - word_run_fix: Number of fixation within the run
47
+ - word_firstskip: Whether word has been skipped during first-pass reading
48
+ - word_refix: Whether word has been refixated with current fixation
49
+ - word_launch: Launch site distance from the beginning of the word
50
+ - word_land: Landing position with word
51
+ - word_cland: Centered landing position (e.g., calculated from the center of the word)
52
+ - word_reg_out: Whether a regression was made out of the word
53
+ - word_reg_in: Whether a regression was made into the word
54
+ - sentence_word: Number of word in sentence
55
+ - sentence_fix: Number of fixation on sentence
56
+ - sentence_run: Number of run on sentence
57
+ - sentence_runid: Number of the sentence run the fixation belongs to
58
+ - sentence_firstskip: Whether the sentence has been skipped during first-pass reading
59
+ - sentence_refix: Whether sentence was refixated with the current fixation
60
+ - sentence_reg_out: Whether a regression was made out of the sentence
61
+ - sentence_reg_in: Whether a regression was made into the sentence
62
+ - sac_in_ALGORITHM_NAME: Incoming saccade length (in letters)
63
+ - sac_out_ALGORITHM_NAME: Outgoing saccade length (in letters)
64
+ - blink_before: Whether a blink was recorded before the event
65
+ - blink_after: Whether a blink was recorded after the event
66
+ - blink: Whether a blink was recorded before or after the event
67
+ - duration: Duration of the event
68
+ - line_change_ALGORITHM_NAME: Difference between the line of the current and the previous fixation
69
+ - on_word_number_ALGORITHM_NAME: Index of word that the fixation has been assigned to
70
+ - num_words_in_sentence_ALGORITHM_NAME: Number of words in sentence to which fixation has been assigned
71
+ - word_land_ALGORITHM_NAME: Landing position of fixation within word in number of letters
72
+ - line_let_ALGORITHM_NAME: Index of letter on line
73
+ - line_let_from_last_letter_ALGORITHM_NAME: Letter number on line counted from last letter of line
74
+ - line_word_ALGORITHM_NAME: Number of word on line
75
+ - sentence_word_ALGORITHM_NAME: Number of word in sentence
76
+ - is_far_out_of_text_uncorrected: Indicates if a fixation is far outside the stimulus area as determined by the vertical and horizontal margins
77
+ - line_let_previous_ALGORITHM_NAME: Index of letter on line for previous fixations
78
+ - line_let_next_ALGORITHM_NAME: Index of letter on line for next fixations
79
+ - sentence_reg_out_to_ALGORITHM_NAME: Whether a regression was made out of the sentence
80
+ - sentence_reg_in_from_ALGORITHM_NAME: Whether a regression was made into the sentence
81
+ - word_reg_in_from_ALGORITHM_NAME: Whether a regression was made into the word
82
+ - word_reg_out_to_ALGORITHM_NAME: Whether a regression was made out of the word
83
+ - word_firstskip_ALGORITHM_NAME: Whether word has been skipped during first-pass reading
84
+ - sentence_firstskip_ALGORITHM_NAME: Whether the sentence has been skipped during first-pass reading
85
+ - sentence_runid_ALGORITHM_NAME: Number of the sentence run the fixation belongs to
86
+ - sentence_run_fix_ALGORITHM_NAME:
87
+ - angle_incoming: Angle based on position of previous fixation
88
+ - angle_outgoing: Angle based on position of next fixation
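+ 
+ A minimal sketch, assuming the fixation dataframe is loaded as a pandas DataFrame named `fixations_df` and that `"warp"` stands in for one of the line-assignment algorithm names, showing how the _ALGORITHM_NAME suffix can be used to pull out the columns produced by a single algorithm:
+ 
+ ```python
+ algo = "warp"  # hypothetical algorithm name
+ algo_cols = [c for c in fixations_df.columns if c.endswith(f"_{algo}")]
+ shared_cols = ["subject", "trial_id", "fixation_number", "duration", "x", "y"]
+ per_algo_view = fixations_df[shared_cols + algo_cols]
+ ```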
item_df_columns.md ADDED
@@ -0,0 +1,4 @@
1
+ #### Column names for Item Dataframe
2
+ - item: Item ID
3
+ - condition: Condition (if applicable)
4
+ - text: Stimulus text for item
loss_functions.py ADDED
@@ -0,0 +1,97 @@
1
+ import torch as t
2
+
3
+
4
+ def corn_loss(logits, y_train, num_classes):
5
+ """Computes the CORN loss described in our forthcoming
6
+ 'Deep Neural Networks for Rank Consistent Ordinal
7
+ Regression based on Conditional Probabilities'
8
+ manuscript.
9
+ Parameters
10
+ ----------
11
+ logits : torch.tensor, shape=(num_examples, num_classes-1)
12
+ Outputs of the CORN layer.
13
+ y_train : torch.tensor, shape=(num_examples)
14
+ Torch tensor containing the class labels.
15
+ num_classes : int
16
+ Number of unique class labels (class labels should start at 0).
17
+ Returns
18
+ ----------
19
+ loss : torch.tensor
20
+ A torch.tensor containing a single loss value.
21
+ Examples
22
+ ----------
23
+ >>> import torch
24
+ >>> from coral_pytorch.losses import corn_loss
25
+ >>> # Consider 8 training examples
26
+ >>> _ = torch.manual_seed(123)
27
+ >>> X_train = torch.rand(8, 99)
28
+ >>> y_train = torch.tensor([0, 1, 2, 2, 2, 3, 4, 4])
29
+ >>> NUM_CLASSES = 5
30
+ >>> #
31
+ >>> #
32
+ >>> # def __init__(self):
33
+ >>> corn_net = torch.nn.Linear(99, NUM_CLASSES-1)
34
+ >>> #
35
+ >>> #
36
+ >>> # def forward(self, X_train):
37
+ >>> logits = corn_net(X_train)
38
+ >>> logits.shape
39
+ torch.Size([8, 4])
40
+ >>> corn_loss(logits, y_train, NUM_CLASSES)
41
+ tensor(0.7127, grad_fn=<DivBackward0>)
42
+ https://github.com/Raschka-research-group/coral-pytorch/blob/c6ab93afd555a6eac708c95ae1feafa15f91c5aa/coral_pytorch/losses.py
43
+ """
44
+ sets = []
45
+ for i in range(num_classes - 1):
46
+ label_mask = y_train > i - 1
47
+ label_tensor = (y_train[label_mask] > i).to(t.int64)
48
+ sets.append((label_mask, label_tensor))
49
+
50
+ num_examples = 0
51
+ losses = 0.0
52
+ for task_index, s in enumerate(sets):
53
+ train_examples = s[0]
54
+ train_labels = s[1]
55
+
56
+ if len(train_labels) < 1:
57
+ continue
58
+
59
+ num_examples += len(train_labels)
60
+ pred = logits[train_examples, task_index]
61
+
62
+ loss = -t.sum(
63
+ t.nn.functional.logsigmoid(pred) * train_labels
64
+ + (t.nn.functional.logsigmoid(pred) - pred) * (1 - train_labels)
65
+ )
66
+ losses += loss
67
+
68
+ return losses / num_examples
69
+
70
+
71
+ def corn_label_from_logits(logits):
72
+ """
73
+ Returns the predicted rank label from logits for a
74
+ network trained via the CORN loss.
75
+ Parameters
76
+ ----------
77
+ logits : torch.tensor, shape=(n_examples, n_classes)
78
+ Torch tensor consisting of logits returned by the
79
+ neural net.
80
+ Returns
81
+ ----------
82
+ labels : torch.tensor, shape=(n_examples)
83
+ Integer tensor containing the predicted rank (class) labels
84
+ Examples
85
+ ----------
86
+ >>> # 2 training examples, 5 classes
87
+ >>> logits = torch.tensor([[14.152, -6.1942, 0.47710, 0.96850],
88
+ ... [65.667, 0.303, 11.500, -4.524]])
89
+ >>> corn_label_from_logits(logits)
90
+ tensor([1, 3])
91
+ https://github.com/Raschka-research-group/coral-pytorch/blob/c6ab93afd555a6eac708c95ae1feafa15f91c5aa/coral_pytorch/dataset.py
92
+ """
93
+ probas = t.sigmoid(logits)
94
+ probas = t.cumprod(probas, dim=1)
95
+ predict_levels = probas > 0.5
96
+ predicted_labels = t.sum(predict_levels, dim=1)
97
+ return predicted_labels
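+ 
+ 
+ # Minimal end-to-end sketch combining the two helpers above (illustrative only, not used
+ # elsewhere in this repo); shapes mirror the docstring example: 8 examples, 5 ordinal classes.
+ def _corn_example():
+     t.manual_seed(123)
+     num_classes = 5
+     corn_net = t.nn.Linear(99, num_classes - 1)  # CORN head outputs num_classes - 1 logits
+     x_train = t.rand(8, 99)
+     y_train = t.tensor([0, 1, 2, 2, 2, 3, 4, 4])
+     logits = corn_net(x_train)
+     loss = corn_loss(logits, y_train, num_classes)  # scalar training loss
+     preds = corn_label_from_logits(logits)  # predicted rank labels, shape (8,)
+     return loss, preds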
models.py ADDED
@@ -0,0 +1,892 @@
1
+ import timm
2
+ import os
3
+ from typing import Any
4
+ from pytorch_lightning.utilities.types import LRSchedulerTypeUnion
5
+ import torch as t
6
+ from torch import nn
7
+ import transformers
8
+ import pytorch_lightning as plight
9
+ import torchmetrics
10
+ import einops as eo
11
+ from loss_functions import corn_loss, corn_label_from_logits
12
+
13
+ t.set_float32_matmul_precision("medium")
14
+ global_settings = dict(try_using_torch_compile=False)
15
+
16
+
17
+ class EnsembleModel(plight.LightningModule):
18
+ def __init__(self, models_without_norm_df, models_with_norm_df, learning_rate=0.0002, use_simple_average=False):
19
+ super().__init__()
20
+ self.models_without_norm = nn.ModuleList(list(models_without_norm_df))
21
+ self.models_with_norm = nn.ModuleList(list(models_with_norm_df))
22
+ self.learning_rate = learning_rate
23
+ self.use_simple_average = use_simple_average
24
+
25
+ if not self.use_simple_average:
26
+ self.combiner = nn.Linear(
27
+ self.models_with_norm[0].num_classes * (len(self.models_with_norm) + len(self.models_without_norm)),
28
+ self.models_with_norm[0].num_classes,
29
+ )
30
+
31
+ def forward(self, x):
32
+ x_unnormed, x_normed = x
33
+ if not self.use_simple_average:
34
+ out_unnormed = t.cat([model.model_step(x_unnormed, 0)[0] for model in self.models_without_norm], dim=-1)
35
+ out_normed = t.cat([model.model_step(x_normed, 0)[0] for model in self.models_with_norm], dim=-1)
36
+ out_avg = self.combiner(t.cat((out_unnormed, out_normed), dim=-1))
37
+ else:
38
+ out_unnormed = [model.model_step(x_unnormed, 0)[0] for model in self.models_without_norm]
39
+ out_normed = [model.model_step(x_normed, 0)[0] for model in self.models_with_norm]
40
+
41
+ out_avg = (t.stack(out_unnormed + out_normed, dim=-1) / 2).mean(-1)
42
+ return {"out_avg": out_avg, "out_unnormed": out_unnormed, "out_normed": out_normed}, x_unnormed[-1]
43
+
44
+ def training_step(self, batch, batch_idx):
45
+ out, y = self(batch)
46
+ loss = self.models_with_norm[0]._get_loss(out["out_avg"], y, batch[0])
47
+ self.log("train_loss", loss, on_epoch=True, on_step=True, sync_dist=True)
48
+ return loss
49
+
50
+ def validation_step(self, batch, batch_idx):
51
+ out, y = self(batch)
52
+ preds, y_onecold, ignore_index_val = self.models_with_norm[0]._get_preds_reals(out["out_avg"], y)
53
+ acc = torchmetrics.functional.accuracy(
54
+ preds,
55
+ y_onecold.to(t.long),
56
+ ignore_index=ignore_index_val,
57
+ num_classes=self.models_with_norm[0].num_classes,
58
+ task="multiclass",
59
+ )
60
+ self.log("acc", acc * 100, prog_bar=True, sync_dist=True)
61
+ loss = self.models_with_norm[0]._get_loss(out["out_avg"], y, batch[0])
62
+ self.log("val_loss", loss, prog_bar=True, sync_dist=True)
63
+ return loss
64
+
65
+ def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
66
+ out, y = self(batch)
67
+ preds, y_onecold, ignore_index_val = self.models_with_norm[0]._get_preds_reals(out["out_avg"], y)
68
+ return preds, out, y_onecold
69
+
70
+ def configure_optimizers(self):
71
+ return t.optim.Adam(self.parameters(), lr=self.learning_rate)
72
+
73
+
74
+ class TimmHeadReplace(nn.Module):
75
+ def __init__(self, pooling=None, in_channels=512, pooling_output_dimension=1, all_identity=False) -> None:
76
+ super().__init__()
77
+
78
+ if all_identity:
79
+ self.head = nn.Identity()
80
+ self.pooling = None
81
+ else:
82
+ self.pooling = pooling
83
+ if pooling is not None:
84
+ self.pooling_output_dimension = pooling_output_dimension
85
+ if self.pooling == "AdaptiveAvgPool2d":
86
+ self.pooling_layer = nn.AdaptiveAvgPool2d(pooling_output_dimension)
87
+ elif self.pooling == "AdaptiveMaxPool2d":
88
+ self.pooling_layer = nn.AdaptiveMaxPool2d(pooling_output_dimension)
89
+ self.head = nn.Flatten()
90
+
91
+ def forward(self, x, pre_logits=False):
92
+ if self.pooling is not None:
93
+ if self.pooling == "stack_avg_max_attn":
94
+ x = t.cat([layer(x) for layer in self.pooling_layer], dim=-1)
95
+ else:
96
+ x = self.pooling_layer(x)
97
+ return self.head(x)
98
+
99
+
100
+ class CVModel(nn.Module):
101
+ def __init__(
102
+ self,
103
+ modelname,
104
+ in_shape,
105
+ num_classes,
106
+ loss_func,
107
+ last_activation: str,
108
+ input_padding_val=10,
109
+ char_dims=2,
110
+ max_seq_length=1000,
111
+ ) -> None:
112
+ super().__init__()
113
+ self.modelname = modelname
114
+ self.loss_func = loss_func
115
+ self.in_shape = in_shape
116
+ self.char_dims = char_dims
117
+ self.x_shape = in_shape
118
+ self.last_activation = last_activation
119
+ self.max_seq_length = max_seq_length
120
+ self.num_classes = num_classes
121
+ if self.loss_func == "OrdinalRegLoss":
122
+ self.out_shape = 1
123
+ else:
124
+ self.out_shape = num_classes
125
+
126
+ self.cv_model = timm.create_model(modelname, pretrained=True, num_classes=0)
127
+ self.cv_model.classifier = nn.Identity()
128
+ with t.inference_mode():
129
+ test_out = self.cv_model(t.ones(self.in_shape, dtype=t.float32))
130
+ self.cv_model_out_dim = test_out.shape[1]
131
+ self.cv_model.classifier = nn.Sequential(nn.Flatten(), nn.Linear(self.cv_model_out_dim, self.max_seq_length))
132
+ if self.out_shape == 1:
133
+ self.logit_norm = nn.Identity()
134
+ self.out_project = nn.Identity()
135
+ else:
136
+ self.logit_norm = nn.LayerNorm(self.max_seq_length)
137
+ self.out_project = nn.Linear(1, self.out_shape)
138
+
139
+ if last_activation == "Softmax":
140
+ self.final_activation = nn.Softmax(dim=-1)
141
+ elif last_activation == "Sigmoid":
142
+ self.final_activation = nn.Sigmoid()
143
+ elif last_activation == "LogSigmoid":
144
+ self.final_activation = nn.LogSigmoid()
145
+ elif last_activation == "Identity":
146
+ self.final_activation = nn.Identity()
147
+ else:
148
+ raise NotImplementedError(f"{last_activation} not implemented")
149
+
150
+ def forward(self, x):
151
+ if isinstance(x, list):
152
+ x = x[0]
153
+ x = self.cv_model(x)
154
+ x = self.cv_model.classifier(x).unsqueeze(-1)
155
+ x = self.out_project(x)
156
+ return self.final_activation(x)
157
+
158
+
159
+ class LitModel(plight.LightningModule):
160
+ def __init__(
161
+ self,
162
+ in_shape: tuple,
163
+ hidden_dim: int,
164
+ num_attention_heads: int,
165
+ num_layers: int,
166
+ loss_func: str,
167
+ learning_rate: float,
168
+ weight_decay: float,
169
+ cfg: dict,
170
+ use_lr_warmup: bool,
171
+ use_reduce_on_plateau: bool,
172
+ track_gradient_histogram=False,
173
+ register_forw_hook=False,
174
+ char_dims=2,
175
+ ) -> None:
176
+ super().__init__()
177
+ if "only_use_2nd_input_stream" not in cfg:
178
+ cfg["only_use_2nd_input_stream"] = False
179
+
180
+ if "gamma_step_size" not in cfg:
181
+ cfg["gamma_step_size"] = 5
182
+ if "gamma_step_factor" not in cfg:
183
+ cfg["gamma_step_factor"] = 0.5
184
+ self.save_hyperparameters(
185
+ dict(
186
+ in_shape=in_shape,
187
+ hidden_dim=hidden_dim,
188
+ num_attention_heads=num_attention_heads,
189
+ num_layers=num_layers,
190
+ loss_func=loss_func,
191
+ learning_rate=learning_rate,
192
+ cfg=cfg,
193
+ x_shape=in_shape,
194
+ num_classes=cfg["num_classes"],
195
+ use_lr_warmup=use_lr_warmup,
196
+ num_warmup_steps=cfg["num_warmup_steps"],
197
+ use_reduce_on_plateau=use_reduce_on_plateau,
198
+ weight_decay=weight_decay,
199
+ track_gradient_histogram=track_gradient_histogram,
200
+ register_forw_hook=register_forw_hook,
201
+ char_dims=char_dims,
202
+ remove_timm_classifier_head_pooling=cfg["remove_timm_classifier_head_pooling"],
203
+ change_pooling_for_timm_head_to=cfg["change_pooling_for_timm_head_to"],
204
+ chars_conv_pooling_out_dim=cfg["chars_conv_pooling_out_dim"],
205
+ )
206
+ )
207
+ self.model_to_use = cfg["model_to_use"]
208
+ self.num_classes = cfg["num_classes"]
209
+ self.x_shape = in_shape
210
+ self.in_shape = in_shape
211
+ self.hidden_dim = hidden_dim
212
+ self.num_attention_heads = num_attention_heads
213
+ self.num_layers = num_layers
214
+
215
+ self.use_lr_warmup = use_lr_warmup
216
+ self.num_warmup_steps = cfg["num_warmup_steps"]
217
+ self.warmup_exponent = cfg["warmup_exponent"]
218
+
219
+ self.use_reduce_on_plateau = use_reduce_on_plateau
220
+ self.loss_func = loss_func
221
+ self.learning_rate = learning_rate
222
+ self.weight_decay = weight_decay
223
+ self.using_one_hot_targets = cfg["one_hot_y"]
224
+ self.track_gradient_histogram = track_gradient_histogram
225
+ self.register_forw_hook = register_forw_hook
226
+ if self.loss_func == "OrdinalRegLoss":
227
+ self.ord_reg_loss_max = cfg["ord_reg_loss_max"]
228
+ self.ord_reg_loss_min = cfg["ord_reg_loss_min"]
229
+
230
+ self.num_lin_layers = cfg["num_lin_layers"]
231
+ self.linear_activation = cfg["linear_activation"]
232
+ self.last_activation = cfg["last_activation"]
233
+
234
+ self.max_seq_length = cfg["manual_max_sequence_for_model"]
235
+
236
+ self.use_char_embed_info = cfg["use_embedded_char_pos_info"]
237
+
238
+ self.method_chars_into_model = cfg["method_chars_into_model"]
239
+ self.source_for_pretrained_cv_model = cfg["source_for_pretrained_cv_model"]
240
+ self.method_to_include_char_positions = cfg["method_to_include_char_positions"]
241
+
242
+ self.char_dims = char_dims
243
+ self.char_sequence_length = cfg["max_len_chars_list"] if self.use_char_embed_info else 0
244
+
245
+ self.chars_conv_lr_reduction_factor = cfg["chars_conv_lr_reduction_factor"]
246
+ if self.use_char_embed_info:
247
+ self.chars_bert_reduction_factor = cfg["chars_bert_reduction_factor"]
248
+
249
+ self.use_in_projection_bias = cfg["use_in_projection_bias"]
250
+ self.add_layer_norm_to_in_projection = cfg["add_layer_norm_to_in_projection"]
251
+
252
+ self.hidden_dropout_prob = cfg["hidden_dropout_prob"]
253
+ self.layer_norm_after_in_projection = cfg["layer_norm_after_in_projection"]
254
+ self.method_chars_into_model = cfg["method_chars_into_model"]
255
+ self.input_padding_val = cfg["input_padding_val"]
256
+ self.cv_char_modelname = cfg["cv_char_modelname"]
257
+ self.char_plot_shape = cfg["char_plot_shape"]
258
+
259
+ self.remove_timm_classifier_head_pooling = cfg["remove_timm_classifier_head_pooling"]
260
+ self.change_pooling_for_timm_head_to = cfg["change_pooling_for_timm_head_to"]
261
+ self.chars_conv_pooling_out_dim = cfg["chars_conv_pooling_out_dim"]
262
+
263
+ self.add_layer_norm_to_char_mlp = cfg["add_layer_norm_to_char_mlp"]
264
+ if "profile_torch_run" in cfg:
265
+ self.profile_torch_run = cfg["profile_torch_run"]
266
+ else:
267
+ self.profile_torch_run = False
268
+ if self.loss_func == "OrdinalRegLoss":
269
+ self.out_shape = 1
270
+ else:
271
+ self.out_shape = cfg["num_classes"]
272
+
273
+ if not self.hparams.cfg["only_use_2nd_input_stream"]:
274
+ if (
275
+ self.method_chars_into_model == "dense"
276
+ and self.use_char_embed_info
277
+ and self.method_to_include_char_positions == "concat"
278
+ ):
279
+ self.project = nn.Linear(self.x_shape[-1], self.hidden_dim // 2, bias=self.use_in_projection_bias)
280
+ elif (
281
+ self.method_chars_into_model == "bert"
282
+ and self.use_char_embed_info
283
+ and self.method_to_include_char_positions == "concat"
284
+ ):
285
+ self.hidden_dim_chars = self.hidden_dim // 2
286
+ self.project = nn.Linear(self.x_shape[-1], self.hidden_dim_chars, bias=self.use_in_projection_bias)
287
+ elif (
288
+ self.method_chars_into_model == "resnet"
289
+ and self.method_to_include_char_positions == "concat"
290
+ and self.use_char_embed_info
291
+ ):
292
+ self.project = nn.Linear(self.x_shape[-1], self.hidden_dim // 2, bias=self.use_in_projection_bias)
293
+ elif self.model_to_use == "cv_only_model":
294
+ self.project = nn.Identity()
295
+ else:
296
+ self.project = nn.Linear(self.x_shape[-1], self.hidden_dim, bias=self.use_in_projection_bias)
297
+ if self.add_layer_norm_to_in_projection:
298
+ self.project = nn.Sequential(
299
+ nn.Linear(self.project.in_features, self.project.out_features, bias=self.use_in_projection_bias),
300
+ nn.LayerNorm(self.project.out_features),
301
+ )
302
+
303
+ if hasattr(self, "project") and "posix" in os.name and global_settings["try_using_torch_compile"]:
304
+ self.project = t.compile(self.project)
305
+
306
+ if self.use_char_embed_info:
307
+ self._create_char_model()
308
+
309
+ if self.layer_norm_after_in_projection:
310
+ if self.hparams.cfg["only_use_2nd_input_stream"]:
311
+ self.layer_norm_in = nn.LayerNorm(self.hidden_dim // 2)
312
+ else:
313
+ self.layer_norm_in = nn.LayerNorm(self.hidden_dim)
314
+
315
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
316
+ self.layer_norm_in = t.compile(self.layer_norm_in)
317
+
318
+ self._create_main_seq_model(cfg)
319
+
320
+ if register_forw_hook:
321
+ self.register_hooks()
322
+ if self.hparams.cfg["only_use_2nd_input_stream"]:
323
+ linear_in_dim = self.hidden_dim // 2
324
+ else:
325
+ linear_in_dim = self.hidden_dim
326
+
327
+ if self.num_lin_layers == 1:
328
+ self.linear = nn.Linear(linear_in_dim, self.out_shape)
329
+ else:
330
+ lin_layers = []
331
+ for _ in range(self.num_lin_layers - 1):
332
+ lin_layers.extend(
333
+ [
334
+ nn.Linear(linear_in_dim, linear_in_dim),
335
+ getattr(nn, self.linear_activation)(),
336
+ ]
337
+ )
338
+ self.linear = nn.Sequential(*lin_layers, nn.Linear(linear_in_dim, self.out_shape))
339
+
340
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
341
+ self.linear = t.compile(self.linear)
342
+
343
+ if self.last_activation == "Softmax":
344
+ self.final_activation = nn.Softmax(dim=-1)
345
+ elif self.last_activation == "Sigmoid":
346
+ self.final_activation = nn.Sigmoid()
347
+ elif self.last_activation == "Identity":
348
+ self.final_activation = nn.Identity()
349
+ else:
350
+ raise NotImplementedError(f"{self.last_activation} not implemented")
351
+
352
+ if self.profile_torch_run:
353
+ self.profilerr = t.profiler.profile(
354
+ schedule=t.profiler.schedule(wait=1, warmup=10, active=10, repeat=1),
355
+ on_trace_ready=t.profiler.tensorboard_trace_handler("tblogs"),
356
+ with_stack=True,
357
+ record_shapes=True,
358
+ profile_memory=False,
359
+ )
360
+
361
+ def _create_main_seq_model(self, cfg):
362
+ if self.hparams.cfg["only_use_2nd_input_stream"]:
363
+ hidden_dim = self.hidden_dim // 2
364
+ else:
365
+ hidden_dim = self.hidden_dim
366
+ if self.model_to_use == "BERT":
367
+ self.bert_config = transformers.BertConfig(
368
+ vocab_size=self.x_shape[-1],
369
+ hidden_size=hidden_dim,
370
+ num_hidden_layers=self.num_layers,
371
+ intermediate_size=hidden_dim,
372
+ num_attention_heads=self.num_attention_heads,
373
+ max_position_embeddings=self.max_seq_length,
374
+ )
375
+ self.bert_model = transformers.BertModel(self.bert_config)
376
+ elif self.model_to_use == "cv_only_model":
377
+ self.bert_model = CVModel(
378
+ modelname=cfg["cv_modelname"],
379
+ in_shape=self.in_shape,
380
+ num_classes=cfg["num_classes"],
381
+ loss_func=cfg["loss_function"],
382
+ last_activation=cfg["last_activation"],
383
+ input_padding_val=cfg["input_padding_val"],
384
+ char_dims=self.char_dims,
385
+ max_seq_length=cfg["manual_max_sequence_for_model"],
386
+ )
387
+ else:
388
+ raise NotImplementedError(f"{self.model_to_use} not implemented")
389
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
390
+ self.bert_model = t.compile(self.bert_model)
391
+ return 0
392
+
393
+ def _create_char_model(self):
394
+ if self.method_chars_into_model == "dense":
395
+ self.chars_project_0 = nn.Linear(self.char_dims, 1, bias=self.use_in_projection_bias)
396
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
397
+ self.chars_project_0 = t.compile(self.chars_project_0)
398
+ if self.method_to_include_char_positions == "concat":
399
+ self.chars_project_1 = nn.Linear(
400
+ self.char_sequence_length, self.hidden_dim // 2, bias=self.use_in_projection_bias
401
+ )
402
+ else:
403
+ self.chars_project_1 = nn.Linear(
404
+ self.char_sequence_length, self.hidden_dim, bias=self.use_in_projection_bias
405
+ )
406
+
407
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
408
+ self.chars_project_1 = t.compile(self.chars_project_1)
409
+ elif not self.method_chars_into_model == "resnet":
410
+ self.chars_project = nn.Linear(self.char_dims, self.hidden_dim_chars, bias=self.use_in_projection_bias)
411
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
412
+ self.chars_project = t.compile(self.chars_project)
413
+
414
+ if self.method_chars_into_model == "bert":
415
+ if not hasattr(self, "hidden_dim_chars"):
416
+ if self.hidden_dim // self.chars_bert_reduction_factor > 1:
417
+ self.hidden_dim_chars = self.hidden_dim // self.chars_bert_reduction_factor
418
+ else:
419
+ self.hidden_dim_chars = self.hidden_dim
420
+ self.num_attention_heads_chars = self.hidden_dim_chars // (self.hidden_dim // self.num_attention_heads)
421
+ self.chars_bert_config = transformers.BertConfig(
422
+ vocab_size=self.x_shape[-1],
423
+ hidden_size=self.hidden_dim_chars,
424
+ num_hidden_layers=self.num_layers,
425
+ intermediate_size=self.hidden_dim_chars,
426
+ num_attention_heads=self.num_attention_heads_chars,
427
+ max_position_embeddings=self.char_sequence_length + 1,
428
+ num_labels=1,
429
+ )
430
+ self.chars_bert = transformers.BertForSequenceClassification(self.chars_bert_config)
431
+
432
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
433
+ self.chars_bert = t.compile(self.chars_bert)
434
+ self.chars_project_class_output = nn.Linear(1, self.hidden_dim_chars, bias=self.use_in_projection_bias)
435
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
436
+ self.chars_project_class_output = t.compile(self.chars_project_class_output)
437
+ elif self.method_chars_into_model == "resnet":
438
+ if self.source_for_pretrained_cv_model == "timm":
439
+ self.chars_conv = timm.create_model(
440
+ self.cv_char_modelname,
441
+ pretrained=True,
442
+ num_classes=0, # remove classifier nn.Linear
443
+ )
444
+ if self.remove_timm_classifier_head_pooling:
445
+ self.chars_conv.head = TimmHeadReplace(all_identity=True)
446
+ with t.inference_mode():
447
+ test_out = self.chars_conv(
448
+ t.ones((1, 3, self.char_plot_shape[0], self.char_plot_shape[1]), dtype=t.float32)
449
+ )
450
+ if test_out.ndim > 3:
451
+ self.chars_conv.head = TimmHeadReplace(
452
+ self.change_pooling_for_timm_head_to,
453
+ test_out.shape[1],
454
+ )
455
+ elif self.source_for_pretrained_cv_model == "huggingface":
456
+ self.chars_conv = transformers.AutoModelForImageClassification.from_pretrained(self.cv_char_modelname)
457
+ elif self.source_for_pretrained_cv_model == "torch_hub":
458
+ self.chars_conv = t.hub.load(*self.cv_char_modelname.split(","))
459
+
460
+ if hasattr(self.chars_conv, "classifier"):
461
+ self.chars_conv.classifier = nn.Identity()
462
+ elif hasattr(self.chars_conv, "cls_classifier"):
463
+ self.chars_conv.cls_classifier = nn.Identity()
464
+ elif hasattr(self.chars_conv, "fc"):
465
+ self.chars_conv.fc = nn.Identity()
466
+
467
+ if hasattr(self.chars_conv, "distillation_classifier"):
468
+ self.chars_conv.distillation_classifier = nn.Identity()
469
+ with t.inference_mode():
470
+ test_out = self.chars_conv(
471
+ t.ones((1, 3, self.char_plot_shape[0], self.char_plot_shape[1]), dtype=t.float32)
472
+ )
473
+ if hasattr(test_out, "last_hidden_state"):
474
+ self.chars_conv_out_dim = test_out.last_hidden_state.shape[1]
475
+ elif hasattr(test_out, "logits"):
476
+ self.chars_conv_out_dim = test_out.logits.shape[1]
477
+ elif isinstance(test_out, list):
478
+ self.chars_conv_out_dim = test_out[0].shape[1]
479
+ else:
480
+ self.chars_conv_out_dim = test_out.shape[1]
481
+
482
+ char_lin_layers = [nn.Flatten(), nn.Linear(self.chars_conv_out_dim, self.hidden_dim // 2)]
483
+ if self.add_layer_norm_to_char_mlp:
484
+ char_lin_layers.append(nn.LayerNorm(self.hidden_dim // 2))
485
+ self.chars_classifier = nn.Sequential(*char_lin_layers)
486
+ if hasattr(self.chars_conv, "distillation_classifier"):
487
+ self.chars_conv.distillation_classifier = nn.Sequential(
488
+ nn.Flatten(), nn.Linear(self.chars_conv_out_dim, self.hidden_dim // 2)
489
+ )
490
+
491
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
492
+ self.chars_classifier = t.compile(self.chars_classifier)
493
+ if "posix" in os.name and global_settings["try_using_torch_compile"]:
494
+ self.chars_conv = t.compile(self.chars_conv)
495
+ return 0
496
+
497
+ def register_hooks(self):
498
+ def add_to_tb(layer):
499
+ def hook(model, input, output):
500
+ if hasattr(output, "detach"):
501
+ for logger in self.loggers:
502
+ if hasattr(logger.experiment, "add_histogram"):
503
+ logger.experiment.add_histogram(
504
+ tag=f"{layer}_{str(list(output.shape))}",
505
+ values=output.detach(),
506
+ global_step=self.trainer.global_step,
507
+ )
508
+
509
+ return hook
510
+
511
+ for layer_id, layer in dict([*self.named_modules()]).items():
512
+ layer.register_forward_hook(add_to_tb(f"act_{layer_id}"))
513
+
514
+ def on_after_backward(self) -> None:
515
+ if self.track_gradient_histogram:
516
+ if self.trainer.global_step % 200 == 0:
517
+ for logger in self.loggers:
518
+ if hasattr(logger.experiment, "add_histogram"):
519
+ for layer_id, layer in dict([*self.named_modules()]).items():
520
+ parameters = layer.parameters()
521
+ for idx2, p in enumerate(parameters):
522
+ grad_val = p.grad
523
+ if grad_val is not None:
524
+ grad_name = f"grad_{idx2}_{layer_id}_{str(list(p.grad.shape))}"
525
+ logger.experiment.add_histogram(
526
+ tag=grad_name, values=grad_val, global_step=self.trainer.global_step
527
+ )
528
+
529
+ return super().on_after_backward()
530
+
531
+ def _fold_in_seq_dim(self, out, y):
532
+ batch_size, seq_len, num_classes = out.shape
533
+ out = eo.rearrange(out, "b s c -> (b s) c", s=seq_len)
534
+ if y is None:
535
+ return out, None
536
+ if len(y.shape) > 2:
537
+ y = eo.rearrange(y, "b s c -> (b s) c", s=seq_len)
538
+ else:
539
+ y = eo.rearrange(y, "b s -> (b s)", s=seq_len)
540
+ return out, y
541
+
542
+ def _get_loss(self, out, y, batch):
543
+ attention_mask = batch[-2]
544
+ if self.loss_func == "BCELoss":
545
+ if self.last_activation == "Identity":
546
+ loss = t.nn.functional.binary_cross_entropy_with_logits(out, y, reduction="none")
547
+ else:
548
+ loss = t.nn.functional.binary_cross_entropy(out, y, reduction="none")
549
+
550
+ replace_tensor = t.zeros(loss[1, 1, :].shape, device=loss.device, dtype=loss.dtype, requires_grad=False)
551
+ loss[~attention_mask.bool()] = replace_tensor
552
+ loss = loss.mean()
553
+ elif self.loss_func == "CrossEntropyLoss":
554
+ if len(out.shape) > 2:
555
+ out, y = self._fold_in_seq_dim(out, y)
556
+ loss = t.nn.functional.cross_entropy(out, y, reduction="mean", ignore_index=-100)
557
+ else:
558
+ loss = t.nn.functional.cross_entropy(out, y, reduction="mean", ignore_index=-100)
559
+
560
+ elif self.loss_func == "OrdinalRegLoss":
561
+ loss = t.nn.functional.mse_loss(out, y, reduction="none")
562
+ loss = loss[attention_mask.bool()].sum() * 10.0 / attention_mask.sum()
563
+ elif self.loss_func == "corn_loss":
564
+ out, y = self._fold_in_seq_dim(out, y)
565
+ loss = corn_loss(out, y.squeeze(), self.out_shape)
566
+ else:
567
+ raise ValueError("Loss Function not reckognized")
568
+ return loss
569
+
570
+ def training_step(self, batch, batch_idx):
571
+ if self.profile_torch_run:
572
+ self.profilerr.step()
573
+ out, y = self.model_step(batch, batch_idx)
574
+ loss = self._get_loss(out, y, batch)
575
+ self.log("train_loss", loss, on_epoch=True, on_step=True, sync_dist=True)
576
+ return loss
577
+
578
+ def forward(*args):
579
+ return forward(args[0], args[1:])
580
+
581
+ def model_step(self, batch, batch_idx):
582
+ out = self.forward(batch)
583
+ return out, batch[-1]
584
+
585
+ def optimizer_step(
586
+ self,
587
+ epoch,
588
+ batch_idx,
589
+ optimizer,
590
+ optimizer_closure,
591
+ ):
592
+ optimizer.step(closure=optimizer_closure)
593
+
594
+ if self.use_lr_warmup and self.hparams["cfg"]["lr_scheduling"] != "OneCycleLR":
595
+ if self.trainer.global_step < self.num_warmup_steps:
596
+ lr_scale = min(1.0, float(self.trainer.global_step + 1) / self.num_warmup_steps) ** self.warmup_exponent
597
+ for pg in optimizer.param_groups:
598
+ pg["lr"] = lr_scale * self.hparams.learning_rate
599
+ if self.trainer.global_step % 10 == 0 or self.trainer.global_step == 0:
600
+ for idx, pg in enumerate(optimizer.param_groups):
601
+ self.log(f"lr_{idx}", pg["lr"], prog_bar=True, sync_dist=True)
602
+
603
+ def lr_scheduler_step(self, scheduler: LRSchedulerTypeUnion, metric: Any | None) -> None:
604
+ if self.use_lr_warmup and self.hparams["cfg"]["lr_scheduling"] != "OneCycleLR":
605
+ if self.trainer.global_step > self.num_warmup_steps:
606
+ if metric is None:
607
+ scheduler.step()
608
+ else:
609
+ scheduler.step(metric)
610
+ else:
611
+ if metric is None:
612
+ scheduler.step()
613
+ else:
614
+ scheduler.step(metric)
615
+
616
+ def _get_preds_reals(self, out, y):
617
+ if self.loss_func == "corn_loss":
618
+ seq_len = out.shape[1]
619
+ out, y = self._fold_in_seq_dim(out, y)
620
+ preds = corn_label_from_logits(out)
621
+ preds = eo.rearrange(preds, "(b s) -> b s", s=seq_len)
622
+ if y is not None:
623
+ y = eo.rearrange(y.squeeze(), "(b s) -> b s", s=seq_len)
624
+
625
+ elif self.loss_func == "OrdinalRegLoss":
626
+ preds = out * (self.ord_reg_loss_max - self.ord_reg_loss_min)
627
+ preds = (preds + self.ord_reg_loss_min).round().to(t.long)
628
+
629
+ else:
630
+ preds = t.argmax(out, dim=-1)
631
+ if y is None:
632
+ return preds, y, -100
633
+ else:
634
+ if self.using_one_hot_targets:
635
+ y_onecold = t.argmax(y, dim=-1)
636
+ ignore_index_val = 0
637
+ elif self.loss_func == "OrdinalRegLoss":
638
+ y_onecold = (y * self.num_classes).round().to(t.long)
639
+
640
+ y_onecold = y * (self.ord_reg_loss_max - self.ord_reg_loss_min)
641
+ y_onecold = (y_onecold + self.ord_reg_loss_min).round().to(t.long)
642
+ ignore_index_val = t.min(y_onecold).to(t.long)
643
+ else:
644
+ y_onecold = y
645
+ ignore_index_val = -100
646
+
647
+ if len(preds.shape) > len(y_onecold.shape):
648
+ preds = preds.squeeze()
649
+ return preds, y_onecold, ignore_index_val
650
+
651
+ def validation_step(self, batch, batch_idx):
652
+ out, y = self.model_step(batch, batch_idx)
653
+ preds, y_onecold, ignore_index_val = self._get_preds_reals(out, y)
654
+
655
+ if self.loss_func == "OrdinalRegLoss":
656
+ y_onecold = y_onecold.flatten()
657
+ preds = preds.flatten()[y_onecold != ignore_index_val]
658
+ y_onecold = y_onecold[y_onecold != ignore_index_val]
659
+ acc = (preds == y_onecold).sum() / len(y_onecold)
660
+ else:
661
+ acc = torchmetrics.functional.accuracy(
662
+ preds,
663
+ y_onecold.to(t.long),
664
+ ignore_index=ignore_index_val,
665
+ num_classes=self.num_classes,
666
+ task="multiclass",
667
+ )
668
+ self.log("acc", acc * 100, prog_bar=True, sync_dist=True)
669
+ loss = self._get_loss(out, y, batch)
670
+ self.log("val_loss", loss, prog_bar=True, sync_dist=True)
671
+
672
+ return loss
673
+
674
+ def predict_step(self, batch, batch_idx):
675
+ out, y = self.model_step(batch, batch_idx)
676
+ preds, y_onecold, ignore_index_val = self._get_preds_reals(out, y)
677
+ return preds, y_onecold
678
+
679
+ def configure_optimizers(self):
680
+ params = list(self.named_parameters())
681
+
682
+ def is_chars_conv(n):
683
+ if "chars_conv" not in n:
684
+ return False
685
+ if "chars_conv" in n and "classifier" in n:
686
+ return False
687
+ else:
688
+ return True
689
+
690
+ grouped_parameters = [
691
+ {
692
+ "params": [p for n, p in params if is_chars_conv(n)],
693
+ "lr": self.learning_rate / self.chars_conv_lr_reduction_factor,
694
+ "weight_decay": self.weight_decay,
695
+ },
696
+ {
697
+ "params": [p for n, p in params if not is_chars_conv(n)],
698
+ "lr": self.learning_rate,
699
+ "weight_decay": self.weight_decay,
700
+ },
701
+ ]
702
+ opti = t.optim.AdamW(grouped_parameters, lr=self.learning_rate, weight_decay=self.weight_decay)
703
+ if self.use_reduce_on_plateau:
704
+ opti_dict = {
705
+ "optimizer": opti,
706
+ "lr_scheduler": {
707
+ "scheduler": t.optim.lr_scheduler.ReduceLROnPlateau(opti, mode="min", patience=2, factor=0.5),
708
+ "monitor": "val_loss",
709
+ "frequency": 1,
710
+ "interval": "epoch",
711
+ },
712
+ }
713
+ return opti_dict
714
+ else:
715
+ cfg = self.hparams["cfg"]
716
+ if cfg["use_reduce_on_plateau"]:
717
+ scheduler = None
718
+ elif cfg["lr_scheduling"] == "multistep":
719
+ scheduler = t.optim.lr_scheduler.MultiStepLR(
720
+ opti, milestones=cfg["multistep_milestones"], gamma=cfg["gamma_multistep"], verbose=False
721
+ )
722
+ interval = "step" if cfg["use_training_steps_for_end_and_lr_decay"] else "epoch"
723
+ elif cfg["lr_scheduling"] == "StepLR":
724
+ scheduler = t.optim.lr_scheduler.StepLR(
725
+ opti, step_size=cfg["gamma_step_size"], gamma=cfg["gamma_step_factor"]
726
+ )
727
+ interval = "step" if cfg["use_training_steps_for_end_and_lr_decay"] else "epoch"
728
+ elif cfg["lr_scheduling"] == "anneal":
729
+ scheduler = t.optim.lr_scheduler.CosineAnnealingLR(
730
+ opti, 250, eta_min=cfg["min_lr_anneal"], last_epoch=-1, verbose=False
731
+ )
732
+ interval = "step"
733
+ elif cfg["lr_scheduling"] == "ExponentialLR":
734
+ scheduler = t.optim.lr_scheduler.ExponentialLR(opti, gamma=cfg["lr_sched_exp_fac"])
735
+ interval = "step"
736
+ else:
737
+ scheduler = None
738
+ if scheduler is None:
739
+ return [opti]
740
+ else:
741
+ opti_dict = {
742
+ "optimizer": opti,
743
+ "lr_scheduler": {
744
+ "scheduler": scheduler,
745
+ "monitor": "global_step",
746
+ "frequency": 1,
747
+ "interval": interval,
748
+ },
749
+ }
750
+ return opti_dict
751
+
752
+ def on_fit_start(self) -> None:
753
+ if self.profile_torch_run:
754
+ self.profilerr.start()
755
+ return super().on_fit_start()
756
+
757
+ def on_fit_end(self) -> None:
758
+ if self.profile_torch_run:
759
+ self.profilerr.stop()
760
+ return super().on_fit_end()
761
+
762
+
763
+ def prep_model_input(self, batch):
764
+ if len(batch) == 1:
765
+ batch = batch[0]
766
+ if self.use_char_embed_info:
767
+ if len(batch) == 5:
768
+ x, chars_coords, ims, attention_mask, _ = batch
769
+ elif batch[1].ndim == 4:
770
+ x, ims, attention_mask, _ = batch
771
+ else:
772
+ x, chars_coords, attention_mask, _ = batch
773
+ padding_list = None
774
+ else:
775
+ if len(batch) > 3:
776
+ x = batch[0]
777
+ y = batch[-1]
778
+ attention_mask = batch[1]
779
+ else:
780
+ x, attention_mask, y = batch
781
+
782
+ if self.model_to_use != "cv_only_model" and not self.hparams.cfg["only_use_2nd_input_stream"]:
783
+ x_embedded = self.project(x)
784
+ else:
785
+ x_embedded = x
786
+ if self.use_char_embed_info:
787
+ if self.method_chars_into_model == "dense":
788
+ bool_mask = chars_coords == self.input_padding_val
789
+ bool_mask = bool_mask[:, :, 0]
790
+ chars_coords_projected = self.chars_project_0(chars_coords).squeeze(-1)
791
+ chars_coords_projected = chars_coords_projected * bool_mask
792
+ if self.chars_project_1.in_features == chars_coords_projected.shape[-1]:
793
+ chars_coords_projected = self.chars_project_1(chars_coords_projected)
794
+ else:
795
+ chars_coords_projected = chars_coords_projected.mean(dim=-1)
796
+ chars_coords_projected = chars_coords_projected.unsqueeze(1).repeat(1, x_embedded.shape[2])
797
+ elif self.method_chars_into_model == "bert":
798
+ chars_mask = chars_coords != self.input_padding_val
799
+ chars_mask = t.cat(
800
+ (
801
+ t.ones(chars_mask[:, :1, 0].shape, dtype=t.long, device=chars_coords.device),
802
+ chars_mask[:, :, 0].to(t.long),
803
+ ),
804
+ dim=1,
805
+ )
806
+ chars_coords_projected = self.chars_project(chars_coords)
807
+
808
+ position_ids = t.arange(
809
+ 0, chars_coords_projected.shape[1] + 1, dtype=t.long, device=chars_coords_projected.device
810
+ )
811
+ token_type_ids = t.zeros(
812
+ (chars_coords_projected.size()[0], chars_coords_projected.size()[1] + 1),
813
+ dtype=t.long,
814
+ device=chars_coords_projected.device,
815
+ ) # +1 for CLS
816
+ chars_coords_projected = t.cat(
817
+ (t.ones_like(chars_coords_projected[:, :1, :]), chars_coords_projected), dim=1
818
+ ) # to add CLS token
819
+ chars_coords_projected = self.chars_bert(
820
+ position_ids=position_ids,
821
+ inputs_embeds=chars_coords_projected,
822
+ token_type_ids=token_type_ids,
823
+ attention_mask=chars_mask,
824
+ )
825
+ if hasattr(chars_coords_projected, "last_hidden_state"):
826
+ chars_coords_projected = chars_coords_projected.last_hidden_state[:, 0, :]
827
+ elif hasattr(chars_coords_projected, "logits"):
828
+ chars_coords_projected = chars_coords_projected.logits
829
+ else:
830
+ chars_coords_projected = chars_coords_projected.hidden_states[-1][:, 0, :]
831
+ elif self.method_chars_into_model == "resnet":
832
+ chars_conv_out = self.chars_conv(ims)
833
+ if isinstance(chars_conv_out, list):
834
+ chars_conv_out = chars_conv_out[0]
835
+ if hasattr(chars_conv_out, "logits"):
836
+ chars_conv_out = chars_conv_out.logits
837
+ chars_coords_projected = self.chars_classifier(chars_conv_out)
838
+
839
+ chars_coords_projected = chars_coords_projected.unsqueeze(1).repeat(1, x_embedded.shape[1], 1)
840
+ if hasattr(self, "chars_project_class_output"):
841
+ chars_coords_projected = self.chars_project_class_output(chars_coords_projected)
842
+
843
+ if self.hparams.cfg["only_use_2nd_input_stream"]:
844
+ x_embedded = chars_coords_projected
845
+ elif self.method_to_include_char_positions == "concat":
846
+ x_embedded = t.cat((x_embedded, chars_coords_projected), dim=-1)
847
+ else:
848
+ x_embedded = x_embedded + chars_coords_projected
849
+ return x_embedded, attention_mask
850
+
851
+
852
+ def forward(self, batch):
853
+ prepped_input = prep_model_input(self, batch)
854
+
855
+ if len(batch) > 5:
856
+ x_embedded, padding_list, attention_mask, attention_mask_for_prediction = prepped_input
857
+ elif len(batch) > 2:
858
+ x_embedded, attention_mask = prepped_input
859
+ else:
860
+ x_embedded = prepped_input[0]
861
+ attention_mask = prepped_input[-1]
862
+
863
+ position_ids = t.arange(0, x_embedded.shape[1], dtype=t.long, device=x_embedded.device)
864
+ token_type_ids = t.zeros(x_embedded.size()[:-1], dtype=t.long, device=x_embedded.device)
865
+
866
+ if self.layer_norm_after_in_projection:
867
+ x_embedded = self.layer_norm_in(x_embedded)
868
+
869
+ if self.model_to_use == "LSTM":
870
+ bert_out = self.bert_model(x_embedded)
871
+ elif self.model_to_use in ["ProphetNet", "T5", "FunnelModel"]:
872
+ bert_out = self.bert_model(inputs_embeds=x_embedded, attention_mask=attention_mask)
873
+ elif self.model_to_use == "xBERT":
874
+ bert_out = self.bert_model(x_embedded, mask=attention_mask.to(bool))
875
+ elif self.model_to_use == "cv_only_model":
876
+ bert_out = self.bert_model(x_embedded)
877
+ else:
878
+ bert_out = self.bert_model(
879
+ position_ids=position_ids,
880
+ inputs_embeds=x_embedded,
881
+ token_type_ids=token_type_ids,
882
+ attention_mask=attention_mask,
883
+ )
884
+ if hasattr(bert_out, "last_hidden_state"):
885
+ last_hidden_state = bert_out.last_hidden_state
886
+ out = self.linear(last_hidden_state)
887
+ elif hasattr(bert_out, "logits"):
888
+ out = bert_out.logits
889
+ else:
890
+ out = bert_out
891
+ out = self.final_activation(out)
892
+ return out
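For orientation (not part of the diff): with loss_function: corn_loss and last_activation: Identity, the forward pass above returns raw CORN logits of shape (batch, seq_len, out_shape). Below is a minimal sketch of how such logits are decoded into per-fixation line labels, mirroring _get_preds_reals above; the dummy shape (2, 500, 16) simply assumes the max_seq_length and num_classes used in the shipped configs.

import torch as t
import einops as eo
from loss_functions import corn_label_from_logits

logits = t.randn(2, 500, 16)                              # dummy (batch, seq_len, out_shape) model output
seq_len = logits.shape[1]
flat = eo.rearrange(logits, "b s c -> (b s) c")           # fold the sequence dim, as _fold_in_seq_dim does
labels = corn_label_from_logits(flat)                     # ordinal decoding of the CORN logits
labels = eo.rearrange(labels, "(b s) -> b s", s=seq_len)  # back to (batch, seq_len) line assignments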
models/BERT_20240104-223349_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00430.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c4ae65e81c722f3732563942ab40447a186869bebb1bbc8433a782805e73ac3
3
+ size 86691676
models/BERT_20240104-233803_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00719.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7588696e4afc4c8ffb0ff361d9566b7b360c61a3bb6fd6fcb484942b6d2568b
3
+ size 86692053
models/BERT_20240107-152040_loop_restrict_sim_data_to_4000_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00515.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:815b5500a1ae0a04bb55ae58c3896f07981757a2e1a2adf2cbc8a346551d88df
3
+ size 86686270
models/BERT_20240108-000344_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00706.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2e56e1e33da611622315995e0cdf4db5aad6a086420401ca3ee95393b8977ac
3
+ size 86692053
models/BERT_20240108-011230_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00560.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f060242cf0bc494d2908e0e99e9d411c9a9b131443cff91bb245229dad2f783
3
+ size 86691676
models/BERT_20240109-090419_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00518.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbf23ac7baa88a957e1782158bd7a32aedcfcb0527b203079191ac259ec146c5
3
+ size 86692053
models/BERT_20240122-183729_loop_normalize_by_line_height_and_width_True_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00523.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb7c8238752af51b64a23291080bb30edf9e090defcb2ec4015ddc8d543a9de
3
+ size 86691740
models/BERT_20240122-194041_loop_normalize_by_line_height_and_width_False_dataset_folder_idx_evaluation_8_epoch=41-val_loss=0.00462.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54fedcc5bdeda01bfae26bafcb7542c766807f1af9da7731aaa7ed38e93743d8
3
+ size 86692117
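Note (not part of the repository): the eight .ckpt entries above are Git LFS pointer files, each consisting of the three key/value lines shown, not the serialized weights themselves; the binaries are fetched separately, for example with git lfs pull. A minimal, hypothetical parser for such a pointer file (the helper name read_lfs_pointer is an assumption for illustration):

from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    """Parse the three 'key value' lines of a Git LFS pointer file into a dict."""
    fields = dict(line.split(" ", 1) for line in Path(path).read_text().splitlines() if line.strip())
    fields["size"] = int(fields["size"])
    return fields

# For the first checkpoint above this would return roughly:
# {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:9c4ae65e...', 'size': 86691676}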
models/BERT_fin_exp_20240104-223349.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: false
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: false
47
+ normalize_by_line_height_and_width: true
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 0.7326
71
+ - 6.6381
72
+ - 2.4717
73
+ sample_std:
74
+ - 0.2778
75
+ - 1.882
76
+ - 1.8562
77
+ sample_std_unscaled:
78
+ - 285.193
79
+ - 131.1842
80
+ - 1.8562
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240104-233803.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: false
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: false
47
+ normalize_by_line_height_and_width: false
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 710.6114
71
+ - 473.7518
72
+ - 2.4717
73
+ sample_std:
74
+ - 285.1937
75
+ - 131.1842
76
+ - 1.8562
77
+ sample_std_unscaled:
78
+ - 285.193
79
+ - 131.1842
80
+ - 1.8562
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240107-152040.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: false
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: true
47
+ normalize_by_line_height_and_width: true
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 0.4423
71
+ - 3.1164
72
+ - 2.4717
73
+ sample_std:
74
+ - 0.2778
75
+ - 1.882
76
+ - 1.8562
77
+ sample_std_unscaled:
78
+ - 285.193
79
+ - 131.1842
80
+ - 1.8562
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240108-000344.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: true
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: true
47
+ normalize_by_line_height_and_width: false
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 455.5905
71
+ - 218.0598
72
+ - 2.4717
73
+ sample_std:
74
+ - 285.1936
75
+ - 131.1842
76
+ - 1.8562
77
+ sample_std_unscaled:
78
+ - 285.1939
79
+ - 131.1844
80
+ - 1.8562
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240108-011230.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: true
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: true
47
+ normalize_by_line_height_and_width: true
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 0.4423
71
+ - 3.1164
72
+ - 2.4717
73
+ sample_std:
74
+ - 0.2778
75
+ - 1.882
76
+ - 1.8562
77
+ sample_std_unscaled:
78
+ - 285.1939
79
+ - 131.1844
80
+ - 1.8562
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240109-090419.yaml ADDED
@@ -0,0 +1,100 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
6
+ char_dims: 0
7
+ char_plot_shape:
8
+ - 224
9
+ - 224
10
+ chars_bert_reduction_factor: 4
11
+ chars_conv_lr_reduction_factor: 1
12
+ chars_conv_pooling_out_dim: 1
13
+ convert_posix: false
14
+ convert_winpath: true
15
+ cv_char_modelname: coatnet_nano_rw_224
16
+ cv_modelname: null
17
+ early_stopping_patience: 15
18
+ gamma_multistep: null
19
+ gamma_step_factor: 0.5
20
+ gamma_step_size: 3000
21
+ head_multiplication_factor: 64
22
+ hidden_dim_bert: 512
23
+ hidden_dropout_prob: 0.0
24
+ im_partial_string: fixations_chars_channel_sep
25
+ input_padding_val: 10
26
+ last_activation: Identity
27
+ layer_norm_after_in_projection: true
28
+ linear_activation: GELU
29
+ load_best_checkpoint_at_end: false
30
+ loss_function: corn_loss
31
+ lr: 0.0004
32
+ lr_initial: '0.0004'
33
+ lr_sched_exp_fac: null
34
+ lr_scheduling: StepLR
35
+ manual_max_sequence_for_model: 500
36
+ max_len_chars_list: 0
37
+ max_seq_length: 500
38
+ method_chars_into_model: resnet
39
+ method_to_include_char_positions: concat
40
+ min_lr_anneal: 1e-6
41
+ model_to_use: BERT
42
+ multistep_milestones: null
43
+ n_layers_BERT: 4
44
+ norm_by_char_averages: false
45
+ norm_by_line_width: false
46
+ norm_coords_by_letter_min_x_y: true
47
+ normalize_by_line_height_and_width: false
48
+ num_attention_heads: 8
49
+ num_classes: 16
50
+ num_lin_layers: 1
51
+ num_warmup_steps: 3000
52
+ one_hot_y: false
53
+ ord_reg_loss_max: 16
54
+ ord_reg_loss_min: -1
55
+ padding_at_end: true
56
+ plot_histogram: true
57
+ plot_learning_curves: true
58
+ precision: 16-mixed
59
+ prediction_only: false
60
+ pretrained_model_name_to_load: null
61
+ profile_torch_run: false
62
+ reload_model: false
63
+ reload_model_date: null
64
+ remove_eval_idx_from_train_idx: true
65
+ remove_timm_classifier_head_pooling: true
66
+ sample_cols:
67
+ - x
68
+ - y
69
+ sample_means:
70
+ - 455.708
71
+ - 217.8342
72
+ - 2.4706
73
+ sample_std:
74
+ - 285.2534
75
+ - 131.0263
76
+ - 1.8542
77
+ sample_std_unscaled:
78
+ - 285.2527
79
+ - 131.0262
80
+ - 1.8543
81
+ save_weights_only: true
82
+ set_max_seq_len_manually: true
83
+ set_num_classes_manually: true
84
+ source_for_pretrained_cv_model: timm
85
+ target_padding_number: -100
86
+ track_activations_via_hook: false
87
+ track_gradient_histogram: false
88
+ use_char_bounding_boxes: true
89
+ use_early_stopping: false
90
+ use_embedded_char_pos_info: true
91
+ use_fixation_duration_information: false
92
+ use_in_projection_bias: false
93
+ use_lr_warmup: true
94
+ use_pupil_size_information: false
95
+ use_reduce_on_plateau: false
96
+ use_start_time_as_input_col: false
97
+ use_training_steps_for_end_and_lr_decay: true
98
+ use_words_coords: false
99
+ warmup_exponent: 1
100
+ weight_decay: 0.0
models/BERT_fin_exp_20240122-183729.yaml ADDED
@@ -0,0 +1,102 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ add_woc_feature: false
6
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
7
+ char_dims: 0
8
+ char_plot_shape:
9
+ - 224
10
+ - 224
11
+ chars_bert_reduction_factor: 4
12
+ chars_conv_lr_reduction_factor: 1
13
+ chars_conv_pooling_out_dim: 1
14
+ convert_posix: false
15
+ convert_winpath: false
16
+ cv_char_modelname: coatnet_nano_rw_224
17
+ cv_modelname: null
18
+ early_stopping_patience: 15
19
+ gamma_multistep: null
20
+ gamma_step_factor: 0.5
21
+ gamma_step_size: 3000
22
+ head_multiplication_factor: 64
23
+ hidden_dim_bert: 512
24
+ hidden_dropout_prob: 0.0
25
+ im_partial_string: fixations_chars_channel_sep
26
+ input_padding_val: 10
27
+ last_activation: Identity
28
+ layer_norm_after_in_projection: true
29
+ linear_activation: GELU
30
+ load_best_checkpoint_at_end: false
31
+ loss_function: corn_loss
32
+ lr: 0.0004
33
+ lr_initial: '0.0004'
34
+ lr_sched_exp_fac: null
35
+ lr_scheduling: StepLR
36
+ manual_max_sequence_for_model: 500
37
+ max_len_chars_list: 0
38
+ max_seq_length: 500
39
+ method_chars_into_model: resnet
40
+ method_to_include_char_positions: concat
41
+ min_lr_anneal: 1e-6
42
+ model_to_use: BERT
43
+ multistep_milestones: null
44
+ n_layers_BERT: 4
45
+ norm_by_char_averages: false
46
+ norm_by_line_width: false
47
+ norm_coords_by_letter_min_x_y: true
48
+ normalize_by_line_height_and_width: true
49
+ num_attention_heads: 8
50
+ num_classes: 16
51
+ num_lin_layers: 1
52
+ num_warmup_steps: 3000
53
+ one_hot_y: false
54
+ only_use_2nd_input_stream: false
55
+ ord_reg_loss_max: 16
56
+ ord_reg_loss_min: -1
57
+ padding_at_end: true
58
+ plot_histogram: true
59
+ plot_learning_curves: true
60
+ precision: 16-mixed
61
+ prediction_only: false
62
+ pretrained_model_name_to_load: null
63
+ profile_torch_run: false
64
+ reload_model: false
65
+ reload_model_date: null
66
+ remove_eval_idx_from_train_idx: true
67
+ remove_timm_classifier_head_pooling: true
68
+ sample_cols:
69
+ - x
70
+ - y
71
+ sample_means:
72
+ - 0.4433
73
+ - 2.9599
74
+ - 2.3264
75
+ sample_std:
76
+ - 0.2782
77
+ - 1.7872
78
+ - 1.7619
79
+ sample_std_unscaled:
80
+ - 287.0107
81
+ - 124.4113
82
+ - 1.7619
83
+ save_weights_only: true
84
+ set_max_seq_len_manually: true
85
+ set_num_classes_manually: true
86
+ source_for_pretrained_cv_model: timm
87
+ target_padding_number: -100
88
+ track_activations_via_hook: false
89
+ track_gradient_histogram: false
90
+ use_char_bounding_boxes: true
91
+ use_early_stopping: false
92
+ use_embedded_char_pos_info: true
93
+ use_fixation_duration_information: false
94
+ use_in_projection_bias: false
95
+ use_lr_warmup: true
96
+ use_pupil_size_information: false
97
+ use_reduce_on_plateau: false
98
+ use_start_time_as_input_col: false
99
+ use_training_steps_for_end_and_lr_decay: true
100
+ use_words_coords: false
101
+ warmup_exponent: 1
102
+ weight_decay: 0.0
models/BERT_fin_exp_20240122-194041.yaml ADDED
@@ -0,0 +1,102 @@
1
+ add_layer_norm_to_char_mlp: true
2
+ add_layer_norm_to_in_projection: false
3
+ add_line_overlap_feature: true
4
+ add_normalised_values_as_features: false
5
+ add_woc_feature: false
6
+ change_pooling_for_timm_head_to: AdaptiveAvgPool2d
7
+ char_dims: 0
8
+ char_plot_shape:
9
+ - 224
10
+ - 224
11
+ chars_bert_reduction_factor: 4
12
+ chars_conv_lr_reduction_factor: 1
13
+ chars_conv_pooling_out_dim: 1
14
+ convert_posix: false
15
+ convert_winpath: false
16
+ cv_char_modelname: coatnet_nano_rw_224
17
+ cv_modelname: null
18
+ early_stopping_patience: 15
19
+ gamma_multistep: null
20
+ gamma_step_factor: 0.5
21
+ gamma_step_size: 3000
22
+ head_multiplication_factor: 64
23
+ hidden_dim_bert: 512
24
+ hidden_dropout_prob: 0.0
25
+ im_partial_string: fixations_chars_channel_sep
26
+ input_padding_val: 10
27
+ last_activation: Identity
28
+ layer_norm_after_in_projection: true
29
+ linear_activation: GELU
30
+ load_best_checkpoint_at_end: false
31
+ loss_function: corn_loss
32
+ lr: 0.0004
33
+ lr_initial: '0.0004'
34
+ lr_sched_exp_fac: null
35
+ lr_scheduling: StepLR
36
+ manual_max_sequence_for_model: 500
37
+ max_len_chars_list: 0
38
+ max_seq_length: 500
39
+ method_chars_into_model: resnet
40
+ method_to_include_char_positions: concat
41
+ min_lr_anneal: 1e-6
42
+ model_to_use: BERT
43
+ multistep_milestones: null
44
+ n_layers_BERT: 4
45
+ norm_by_char_averages: false
46
+ norm_by_line_width: false
47
+ norm_coords_by_letter_min_x_y: true
48
+ normalize_by_line_height_and_width: false
49
+ num_attention_heads: 8
50
+ num_classes: 16
51
+ num_lin_layers: 1
52
+ num_warmup_steps: 3000
53
+ one_hot_y: false
54
+ only_use_2nd_input_stream: false
55
+ ord_reg_loss_max: 16
56
+ ord_reg_loss_min: -1
57
+ padding_at_end: true
58
+ plot_histogram: true
59
+ plot_learning_curves: true
60
+ precision: 16-mixed
61
+ prediction_only: false
62
+ pretrained_model_name_to_load: null
63
+ profile_torch_run: false
64
+ reload_model: false
65
+ reload_model_date: null
66
+ remove_eval_idx_from_train_idx: true
67
+ remove_timm_classifier_head_pooling: true
68
+ sample_cols:
69
+ - x
70
+ - y
71
+ sample_means:
72
+ - 459.3367
73
+ - 206.88
74
+ - 2.3264
75
+ sample_std:
76
+ - 287.0111
77
+ - 124.4113
78
+ - 1.7619
79
+ sample_std_unscaled:
80
+ - 287.0107
81
+ - 124.4113
82
+ - 1.7619
83
+ save_weights_only: true
84
+ set_max_seq_len_manually: true
85
+ set_num_classes_manually: true
86
+ source_for_pretrained_cv_model: timm
87
+ target_padding_number: -100
88
+ track_activations_via_hook: false
89
+ track_gradient_histogram: false
90
+ use_char_bounding_boxes: true
91
+ use_early_stopping: false
92
+ use_embedded_char_pos_info: true
93
+ use_fixation_duration_information: false
94
+ use_in_projection_bias: false
95
+ use_lr_warmup: true
96
+ use_pupil_size_information: false
97
+ use_reduce_on_plateau: false
98
+ use_start_time_as_input_col: false
99
+ use_training_steps_for_end_and_lr_decay: true
100
+ use_words_coords: false
101
+ warmup_exponent: 1
102
+ weight_decay: 0.0
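The eight BERT_fin_exp_*.yaml files above hold the cfg dictionaries consumed by the model's __init__ in models.py (keys such as num_classes, loss_function and model_to_use map directly onto the cfg[...] look-ups there). A minimal sketch of reading one back, using values visible in the last file above:

import yaml

with open("models/BERT_fin_exp_20240122-194041.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_to_use"], cfg["loss_function"], cfg["num_classes"])  # BERT corn_loss 16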
multi_proc_funcs.py ADDED
@@ -0,0 +1,2415 @@
1
+ from icecream import ic
2
+ from matplotlib import pyplot as plt
3
+ import pathlib as pl
4
+ import json
5
+ from PIL import Image
6
+ from torch.utils.data.dataloader import DataLoader as dl
7
+ import matplotlib.patches as patches
8
+ from torch.utils.data import Dataset as torch_dset
9
+ import torchvision.transforms.functional as tvfunc
10
+ import einops as eo
11
+ from collections.abc import Iterable
12
+ import numpy as np
13
+ import pandas as pd
14
+ from matplotlib import font_manager
15
+ from matplotlib.font_manager import FontProperties
16
+ from matplotlib.patches import Rectangle
17
+ from tqdm.auto import tqdm
18
+ import torch as t
19
+ import plotly.express as px
20
+ import copy
21
+
22
+ import yaml
23
+ import classic_correction_algos as calgo
24
+ import analysis_funcs as anf
25
+ import models
26
+ import popEye_funcs as pf
27
+ from loss_functions import corn_label_from_logits
28
+ import torch.multiprocessing
29
+ torch.multiprocessing.set_sharing_strategy('file_system') # Needed to make multi proc not fail on linux
30
+
31
+ ic.configureOutput(includeContext=True)
32
+
33
+ PLOTS_FOLDER = pl.Path("plots")
34
+ event_strs = [
35
+ "EFIX",
36
+ "EFIX R",
37
+ "EFIX L",
38
+ "SSACC",
39
+ "ESACC",
40
+ "SFIX",
41
+ "MSG",
42
+ "SBLINK",
43
+ "EBLINK",
44
+ "BUTTON",
45
+ "INPUT",
46
+ "END",
47
+ "START",
48
+ "DISPLAY ON",
49
+ ]
50
+ AVAILABLE_FONTS = [x.name for x in font_manager.fontManager.ttflist]
51
+ COLORS = px.colors.qualitative.Alphabet
52
+ RESULTS_FOLDER = pl.Path("results")
53
+ PLOTS_FOLDER = pl.Path("plots")
54
+
55
+ DIST_MODELS_FOLDER = pl.Path("models")
56
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
57
+ IMAGENET_STD = [0.229, 0.224, 0.225]
58
+ DEFAULT_FIX_MEASURES = [
59
+ "letternum",
60
+ "letter",
61
+ "on_word_number",
62
+ "on_word",
63
+ "on_sentence",
64
+ "num_words_in_sentence",
65
+ "on_sentence_num",
66
+ "word_land",
67
+ "line_let",
68
+ "line_word",
69
+ "sac_in",
70
+ "sac_out",
71
+ "word_launch",
72
+ "word_refix",
73
+ "word_reg_in",
74
+ "word_reg_out",
75
+ "sentence_reg_in",
76
+ "word_firstskip",
77
+ "word_run",
78
+ "sentence_run",
79
+ "word_run_fix",
80
+ "word_cland",
81
+ ]
82
+ ALL_FIX_MEASURES = DEFAULT_FIX_MEASURES + [
83
+ "angle_incoming",
84
+ "angle_outgoing",
85
+ "line_let_from_last_letter",
86
+ "sentence_word",
87
+ "line_let_previous",
88
+ "line_let_next",
89
+ "sentence_refix",
90
+ "word_reg_out_to",
91
+ "word_reg_in_from",
92
+ "sentence_reg_out",
93
+ "sentence_reg_in_from",
94
+ "sentence_reg_out_to",
95
+ "sentence_firstskip",
96
+ "word_runid",
97
+ "sentence_runid",
98
+ "word_fix",
99
+ "sentence_fix",
100
+ "sentence_run_fix",
101
+ ]
102
+
103
+
104
+ class DSet(torch_dset):
105
+ def __init__(
106
+ self,
107
+ in_sequence: t.Tensor,
108
+ chars_center_coords_padded: t.Tensor,
109
+ out_categories: t.Tensor,
110
+ trialslist: list,
111
+ padding_list: list = None,
112
+ padding_at_end: bool = False,
113
+ return_images_for_conv: bool = False,
114
+ im_partial_string: str = "fixations_chars_channel_sep",
115
+ input_im_shape=[224, 224],
116
+ ) -> None:
117
+ super().__init__()
118
+
119
+ self.in_sequence = in_sequence
120
+ self.chars_center_coords_padded = chars_center_coords_padded
121
+ self.out_categories = out_categories
122
+ self.padding_list = padding_list
123
+ self.padding_at_end = padding_at_end
124
+ self.trialslist = trialslist
125
+ self.return_images_for_conv = return_images_for_conv
126
+ self.input_im_shape = input_im_shape
127
+ if return_images_for_conv:
128
+ self.im_partial_string = im_partial_string
129
+ self.plot_files = [
130
+ str(x["plot_file"]).replace("fixations_words", im_partial_string) for x in self.trialslist
131
+ ]
132
+
133
+ def __getitem__(self, index):
134
+
135
+ if self.return_images_for_conv:
136
+ im = Image.open(self.plot_files[index])
137
+ if [im.size[1], im.size[0]] != self.input_im_shape:
138
+ im = tvfunc.resize(im, self.input_im_shape)
139
+ im = tvfunc.normalize(tvfunc.to_tensor(im), IMAGENET_MEAN, IMAGENET_STD)
140
+ if self.chars_center_coords_padded is not None:
141
+ if self.padding_list is not None:
142
+ attention_mask = t.ones(self.in_sequence[index].shape[:-1], dtype=t.long)
143
+ if self.padding_at_end:
144
+ if self.padding_list[index] > 0:
145
+ attention_mask[-self.padding_list[index] :] = 0
146
+ else:
147
+ attention_mask[: self.padding_list[index]] = 0
148
+ if self.return_images_for_conv:
149
+ return (
150
+ self.in_sequence[index],
151
+ self.chars_center_coords_padded[index],
152
+ im,
153
+ attention_mask,
154
+ self.out_categories[index],
155
+ )
156
+ return (
157
+ self.in_sequence[index],
158
+ self.chars_center_coords_padded[index],
159
+ attention_mask,
160
+ self.out_categories[index],
161
+ )
162
+ else:
163
+ if self.return_images_for_conv:
164
+ return (
165
+ self.in_sequence[index],
166
+ self.chars_center_coords_padded[index],
167
+ im,
168
+ self.out_categories[index],
169
+ )
170
+ else:
171
+ return (self.in_sequence[index], self.chars_center_coords_padded[index], self.out_categories[index])
172
+
173
+ if self.padding_list is not None:
174
+ attention_mask = t.ones(self.in_sequence[index].shape[:-1], dtype=t.long)
175
+ if self.padding_at_end:
176
+ if self.padding_list[index] > 0:
177
+ attention_mask[-self.padding_list[index] :] = 0
178
+ else:
179
+ attention_mask[: self.padding_list[index]] = 0
180
+ if self.return_images_for_conv:
181
+ return (self.in_sequence[index], im, attention_mask, self.out_categories[index])
182
+ else:
183
+ return (self.in_sequence[index], attention_mask, self.out_categories[index])
184
+ if self.return_images_for_conv:
185
+ return (self.in_sequence[index], im, self.out_categories[index])
186
+ else:
187
+ return (self.in_sequence[index], self.out_categories[index])
188
+
189
+ def __len__(self):
190
+ if isinstance(self.in_sequence, t.Tensor):
191
+ return self.in_sequence.shape[0]
192
+ else:
193
+ return len(self.in_sequence)
194
+
195
+
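+ # Minimal usage sketch for the DSet dataset defined above (illustrative only; the
+ # tensors, trial dicts and padding list are assumed to come from the preprocessing
+ # functions further below):
+ # dset = DSet(in_sequence, chars_center_coords_padded, out_categories, trialslist, padding_list)
+ # loader = dl(dset, batch_size=32, shuffle=False)
+ # in_seq, char_coords, attention_mask, targets = next(iter(loader))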
196
+ def remove_compile_from_model(model):
197
+ if hasattr(model.project, "_orig_mod"):
198
+ model.project = model.project._orig_mod
199
+ model.chars_conv = model.chars_conv._orig_mod
200
+ model.chars_classifier = model.chars_classifier._orig_mod
201
+ model.layer_norm_in = model.layer_norm_in._orig_mod
202
+ model.bert_model = model.bert_model._orig_mod
203
+ model.linear = model.linear._orig_mod
204
+ return model
205
+
206
+
207
+ def remove_compile_from_dict(state_dict):
208
+ for key in list(state_dict.keys()):
209
+ newkey = key.replace("._orig_mod.", ".")
210
+ state_dict[newkey] = state_dict.pop(key)
211
+ return state_dict
212
+
213
+
214
+ def load_model(model_file, cfg):
215
+ try:
216
+ model_loaded = t.load(model_file, map_location="cpu", weights_only=True)
217
+ if "hyper_parameters" in model_loaded.keys():
218
+ model_cfg_temp = model_loaded["hyper_parameters"]["cfg"]
219
+ else:
220
+ model_cfg_temp = cfg
221
+ model_state_dict = model_loaded["state_dict"]
222
+ except Exception as e:
223
+ ic(e)
224
+ ic(f"Failed to load {model_file}")
225
+ return None
226
+ model = models.LitModel(
227
+ [1, 500, 3],
228
+ model_cfg_temp["hidden_dim_bert"],
229
+ model_cfg_temp["num_attention_heads"],
230
+ model_cfg_temp["n_layers_BERT"],
231
+ model_cfg_temp["loss_function"],
232
+ 1e-4,
233
+ model_cfg_temp["weight_decay"],
234
+ model_cfg_temp,
235
+ model_cfg_temp["use_lr_warmup"],
236
+ model_cfg_temp["use_reduce_on_plateau"],
237
+ track_gradient_histogram=model_cfg_temp["track_gradient_histogram"],
238
+ register_forw_hook=model_cfg_temp["track_activations_via_hook"],
239
+ char_dims=model_cfg_temp["char_dims"],
240
+ )
241
+ model = remove_compile_from_model(model)
242
+ model_state_dict = remove_compile_from_dict(model_state_dict)
243
+ with t.no_grad():
244
+ model.load_state_dict(model_state_dict, strict=False)
245
+ model.eval()
246
+ model.freeze()
247
+ return model
248
+
249
+
250
+ def find_and_load_model(model_date: str):
251
+ model_cfg_file = list(DIST_MODELS_FOLDER.glob(f"*{model_date}*.yaml"))
252
+ if len(model_cfg_file) == 0:
253
+ ic(f"No model cfg yaml found for {model_date}")
254
+ return None, None
255
+ model_cfg_file = model_cfg_file[0]
256
+ with open(model_cfg_file) as f:
257
+ model_cfg = yaml.safe_load(f)
258
+
259
+ model_file = list(pl.Path("models").glob(f"*{model_date}*.ckpt"))[0]
260
+ model = load_model(model_file, model_cfg)
261
+
262
+ return model, model_cfg
263
+
264
+
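+ # Example use of find_and_load_model (the timestamp is an example; it must match a
+ # BERT_<timestamp>*.yaml / *.ckpt pair inside the models folder):
+ # model, model_cfg = find_and_load_model(model_date="20240104-223349")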
265
+ def set_up_models(dist_models_folder):
266
+ out_dict = {}
267
+ dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
268
+ dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
269
+ DIST_MODEL_DATE_WITH_NORM = dist_models_with_norm[0].stem.split("_")[1]
270
+
271
+ models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
272
+ models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
273
+
274
+ model_cfg_without_norm_df = [x[1] for x in models_without_norm_df if x[1] is not None][0]
275
+ model_cfg_with_norm_df = [x[1] for x in models_with_norm_df if x[1] is not None][0]
276
+
277
+ models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
278
+ models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
279
+
280
+ ensemble_model_avg = models.EnsembleModel(
281
+ models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
282
+ )
283
+ out_dict["ensemble_model_avg"] = ensemble_model_avg
284
+
285
+ out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
286
+ out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
287
+
288
+ single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=DIST_MODEL_DATE_WITH_NORM)
289
+ out_dict["single_DIST_model"] = single_DIST_model
290
+ out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
291
+ return out_dict
292
+
293
+
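+ # Sketch of how the models are assembled (assumes the models folder holds at least one
+ # checkpoint trained with and one without line-height/width normalisation):
+ # model_dict = set_up_models(DIST_MODELS_FOLDER)
+ # ensemble = model_dict["ensemble_model_avg"]
+ # single_model = model_dict["single_DIST_model"]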
294
+ def reorder_columns(
295
+ df,
296
+ cols=[
297
+ "subject",
298
+ "trial_id",
299
+ "item",
300
+ "condition",
301
+ "fixation_number",
302
+ "num",
303
+ "word_number",
304
+ "sentence_number",
305
+ "duration",
306
+ "start_uncorrected",
307
+ "stop_uncorrected",
308
+ "start_time",
309
+ "end_time",
310
+ "corrected_start_time",
311
+ "corrected_end_time",
312
+ "dX",
313
+ "dY",
314
+ ],
315
+ ):
316
+ existing_cols = [col for col in cols if col in df.columns]
317
+ other_cols = [col for col in df.columns if col not in cols]
318
+ return df[existing_cols + other_cols]
319
+
320
+
321
+ def nan_or_int_minus_one(x):
322
+ if not pd.isna(x):
323
+ return int(x - 1.0)
324
+ else:
325
+ return pd.NA
326
+
327
+
328
+ def add_popEye_cols_to_chars_df(chars_df):
329
+
330
+ if "letternum" not in chars_df.columns or "letline" not in chars_df.columns:
331
+ chars_df.reset_index(drop=False, inplace=True)
332
+ chars_df.rename({"index": "letternum"}, axis=1, inplace=True)
333
+ chars_df.loc[:, "letline"] = -1
334
+ chars_df["wordline"] = (
335
+ chars_df.groupby("assigned_line")["in_word_number"].rank(method="dense").map(nan_or_int_minus_one)
336
+ )
337
+ chars_df["wordsent"] = (
338
+ chars_df.groupby("in_sentence_number")["in_word_number"].rank(method="dense").map(nan_or_int_minus_one)
339
+ )
340
+ chars_df["letword"] = (
341
+ chars_df.groupby("in_word_number")["letternum"].rank(method="dense").map(nan_or_int_minus_one)
342
+ )
343
+ for line_idx in chars_df.assigned_line.unique():
344
+ chars_df.loc[chars_df.assigned_line == line_idx, "letline"] = (
345
+ chars_df.loc[chars_df.assigned_line == line_idx, "char"].reset_index().index
346
+ )
347
+ return chars_df
348
+
349
+
350
+ def add_boxes_to_ax(
351
+ chars_list,
352
+ ax,
353
+ font_to_use="DejaVu Sans Mono",
354
+ fontsize=21,
355
+ prefix="char",
356
+ box_annotations: list = None,
357
+ edgecolor="grey",
358
+ linewidth=0.8,
359
+ ):
360
+ if box_annotations is None:
361
+ enum = chars_list
362
+ else:
363
+ enum = zip(chars_list, box_annotations)
364
+ for v in enum:
365
+ if box_annotations is not None:
366
+ v, annot_text = v
367
+ x0, y0 = v[f"{prefix}_xmin"], v[f"{prefix}_ymin"]
368
+ xdiff, ydiff = v[f"{prefix}_xmax"] - v[f"{prefix}_xmin"], v[f"{prefix}_ymax"] - v[f"{prefix}_ymin"]
369
+ ax.add_patch(Rectangle((x0, y0), xdiff, ydiff, edgecolor=edgecolor, facecolor="none", lw=linewidth, alpha=0.4))
370
+ if box_annotations is not None:
371
+ ax.annotate(
372
+ str(annot_text),
373
+ (x0 + xdiff / 2, y0),
374
+ horizontalalignment="center",
375
+ verticalalignment="center",
376
+ fontproperties=FontProperties(family=font_to_use, style="normal", size=fontsize / 1.5),
377
+ )
378
+
379
+
380
+ def add_text_to_ax(
381
+ chars_list,
382
+ ax,
383
+ font_to_use="DejaVu Sans Mono",
384
+ fontsize=21,
385
+ prefix="char",
386
+ ):
387
+ font_props = FontProperties(family=font_to_use, style="normal", size=fontsize)
388
+ enum = chars_list
389
+ for v in enum:
390
+ ax.text(
391
+ v[f"{prefix}_x_center"],
392
+ v[f"{prefix}_y_center"],
393
+ v[prefix],
394
+ horizontalalignment="center",
395
+ verticalalignment="center",
396
+ fontproperties=font_props,
397
+ )
398
+
399
+
400
+ def set_font_from_chars_list(trial):
401
+
402
+ if "chars_list" in trial:
403
+ chars_df = pd.DataFrame(trial["chars_list"])
404
+ line_diffs = np.diff(chars_df.char_y_center.unique())
405
+ y_diffs = np.unique(line_diffs)
406
+ if len(y_diffs) == 1:
407
+ y_diff = y_diffs[0]
408
+ else:
409
+ y_diff = np.min(y_diffs)
410
+ y_diff = round(y_diff * 2) / 2
411
+
412
+ else:
413
+ y_diff = 1 / 0.333 * 18
414
+ font_size = y_diff * 0.333 # pixel to point conversion
415
+ return round((font_size) * 4, ndigits=0) / 4
416
+
417
+
418
+ def get_plot_props(trial, available_fonts):
419
+ if "font" in trial.keys():
420
+ font = trial["font"]
421
+ font_size = trial["font_size"]
422
+ if font not in available_fonts:
423
+ font = "DejaVu Sans Mono"
424
+ else:
425
+ font = "DejaVu Sans Mono"
426
+ font_size = 21
427
+ dpi = 96
428
+ if "display_coords" in trial.keys() and trial["display_coords"] is not None:
429
+ screen_res = (trial["display_coords"][2], trial["display_coords"][3])
430
+ else:
431
+ screen_res = (1920, 1080)
432
+ return font, font_size, dpi, screen_res
433
+
434
+
435
+ def get_font_and_font_size_from_trial(trial):
436
+ font_face, font_size, dpi, screen_res = get_plot_props(trial, AVAILABLE_FONTS)
437
+
438
+ if font_size is None and "font_size" in trial:
439
+ font_size = trial["font_size"]
440
+ elif font_size is None:
441
+ font_size = set_font_from_chars_list(trial)
442
+ return font_face, font_size
443
+
444
+
445
+ def sigmoid(x):
446
+ return 1 / (1 + np.exp(-1 * x))
447
+
448
+
449
+ def matplotlib_plot_df(
450
+ dffix,
451
+ trial,
452
+ algo_choice,
453
+ dffix_no_clean=None,
454
+ desired_dpi=300,
455
+ fix_to_plot=[],
456
+ stim_info_to_plot=["Characters", "Word boxes"],
457
+ box_annotations: list = None,
458
+ font=None,
459
+ use_duration_arrow_sizes=True,
460
+ ):
461
+ chars_df = pd.DataFrame(trial["chars_list"]) if "chars_list" in trial else None
462
+
463
+ if chars_df is not None:
464
+ font_face, font_size = get_font_and_font_size_from_trial(trial)
465
+ font_size = font_size * 0.65
466
+ else:
467
+ ic("No character or word information available to plot")
468
+
469
+ if "display_coords" in trial:
470
+ desired_width_in_pixels = trial["display_coords"][2] + 1
471
+ desired_height_in_pixels = trial["display_coords"][3] + 1
472
+ else:
473
+ desired_width_in_pixels = 1920
474
+ desired_height_in_pixels = 1080
475
+
476
+ figure_width = desired_width_in_pixels / desired_dpi
477
+ figure_height = desired_height_in_pixels / desired_dpi
478
+
479
+ fig = plt.figure(figsize=(figure_width, figure_height), dpi=desired_dpi)
480
+ ax = fig.add_subplot(1, 1, 1)
481
+ fig.subplots_adjust(bottom=0)
482
+ fig.subplots_adjust(top=1)
483
+ fig.subplots_adjust(right=1)
484
+ fig.subplots_adjust(left=0)
485
+ if font is None:
486
+ if "font" in trial and trial["font"] in AVAILABLE_FONTS:
487
+ font_to_use = trial["font"]
488
+ else:
489
+ font_to_use = "DejaVu Sans Mono"
490
+ else:
491
+ font_to_use = font
492
+ if "font_size" in trial:
493
+ font_size = trial["font_size"]
494
+ else:
495
+ font_size = 20
496
+
497
+ if "Words" in stim_info_to_plot and "words_list" in trial:
498
+ add_text_to_ax(
499
+ trial["words_list"],
500
+ ax,
501
+ font_to_use,
502
+ prefix="word",
503
+ fontsize=font_size / 3.89,
504
+ )
505
+ if "Word boxes" in stim_info_to_plot and "words_list" in trial:
506
+ add_boxes_to_ax(
507
+ trial["words_list"],
508
+ ax,
509
+ font_to_use,
510
+ prefix="word",
511
+ fontsize=font_size / 3.89,
512
+ box_annotations=box_annotations,
513
+ edgecolor="black",
514
+ linewidth=0.9,
515
+ )
516
+
517
+ if "Characters" in stim_info_to_plot and "chars_list" in trial:
518
+ add_text_to_ax(
519
+ trial["chars_list"],
520
+ ax,
521
+ font_to_use,
522
+ prefix="char",
523
+ fontsize=font_size / 3.89,
524
+ )
525
+ if "Character boxes" in stim_info_to_plot and "chars_list" in trial:
526
+ add_boxes_to_ax(
527
+ trial["chars_list"],
528
+ ax,
529
+ font_to_use,
530
+ prefix="char",
531
+ fontsize=font_size / 3.89,
532
+ box_annotations=box_annotations,
533
+ )
534
+
535
+ if "Uncorrected Fixations" in fix_to_plot and dffix_no_clean is None:
536
+ if use_duration_arrow_sizes and "duration" in dffix.columns:
537
+ duration_scaled = dffix.duration - dffix.duration.min()
538
+ duration_scaled = (((duration_scaled / duration_scaled.max()) - 0.5) * 3).values
539
+ durations = sigmoid(duration_scaled) * 50 * 0.5
540
+ if use_duration_arrow_sizes and "duration" in dffix.columns:
541
+ ax.plot(
542
+ dffix.x,
543
+ dffix.y,
544
+ label="Raw fixations",
545
+ color="blue",
546
+ alpha=0.5,
547
+ )
548
+ add_arrow_annotations(dffix, "y", ax, "blue", durations[:-1])
549
+ else:
550
+ ax.plot(
551
+ dffix.x,
552
+ dffix.y,
553
+ label="Remaining fixations",
554
+ color="blue",
555
+ alpha=0.5,
556
+ )
557
+ add_arrow_annotations(dffix, "y", ax, "blue", 4)
558
+
559
+ if dffix_no_clean is not None and "Uncorrected Fixations" in fix_to_plot:
560
+
561
+ ax.plot(
562
+ dffix_no_clean.x,
563
+ dffix_no_clean.y,
564
+ # marker='.',
565
+ label="All fixations",
566
+ color="k",
567
+ alpha=0.5,
568
+ lw=1,
569
+ )
570
+ add_arrow_annotations(dffix_no_clean, "y", ax, "k", 4)
571
+ if "was_discarded_due_blinks" in dffix_no_clean.columns and dffix_no_clean["was_discarded_due_blinks"].any():
572
+ discarded_blink_fix = dffix_no_clean.loc[dffix_no_clean["was_discarded_due_blinks"], :].copy()
573
+ ax.scatter(
574
+ discarded_blink_fix.x,
575
+ discarded_blink_fix.y,
576
+ s=12,
577
+ label="Discarded due to blinks",
578
+ lw=1.5,
579
+ edgecolors="orange",
580
+ facecolors="none",
581
+ )
582
+ if (
583
+ "was_discarded_due_to_long_duration" in dffix_no_clean.columns
584
+ and dffix_no_clean["was_discarded_due_to_long_duration"].any()
585
+ ):
586
+ discarded_long_fix = dffix_no_clean.loc[dffix_no_clean["was_discarded_due_to_long_duration"], :].copy()
587
+ ax.scatter(
588
+ discarded_long_fix.x,
589
+ discarded_long_fix.y,
590
+ s=18,
591
+ label="Overly long fixations",
592
+ lw=0.8,
593
+ edgecolors="purple",
594
+ facecolors="none",
595
+ )
596
+ if "was_merged" in dffix_no_clean.columns:
597
+ merged_fix = dffix_no_clean.loc[dffix_no_clean["was_merged"], :].copy()
598
+ if not merged_fix.empty:
599
+ ax.scatter(
600
+ merged_fix.x,
601
+ merged_fix.y,
602
+ s=7,
603
+ label="Merged short fixations",
604
+ lw=1,
605
+ edgecolors="red",
606
+ facecolors="none",
607
+ )
608
+ if "was_discarded_outside_text" in dffix_no_clean.columns:
609
+ was_discarded_outside_text_fix = dffix_no_clean.loc[dffix_no_clean["was_discarded_outside_text"], :].copy()
610
+ if not was_discarded_outside_text_fix.empty:
611
+ ax.scatter(
612
+ was_discarded_outside_text_fix.x,
613
+ was_discarded_outside_text_fix.y,
614
+ s=8,
615
+ label="Outside text fixations",
616
+ lw=1.2,
617
+ edgecolors="blue",
618
+ facecolors="none",
619
+ )
620
+ if "was_discarded_short_fix" in dffix_no_clean.columns:
621
+ was_discarded_short_fix_fix = dffix_no_clean.loc[dffix_no_clean["was_discarded_short_fix"], :].copy()
622
+ if not was_discarded_short_fix_fix.empty:
623
+ ax.scatter(
624
+ was_discarded_short_fix_fix.x,
625
+ was_discarded_short_fix_fix.y,
626
+ label="Discarded short fixations",
627
+ s=9,
628
+ lw=1.5,
629
+ edgecolors="green",
630
+ facecolors="none",
631
+ )
632
+ if "Corrected Fixations" in fix_to_plot:
633
+ if isinstance(algo_choice, list):
634
+ algo_choices = algo_choice
635
+ repeats = range(len(algo_choice))
636
+ else:
637
+ algo_choices = [algo_choice]
638
+ repeats = range(1)
639
+ for algoIdx in repeats:
640
+ algo_choice = algo_choices[algoIdx]
641
+ if f"y_{algo_choice}" in dffix.columns:
642
+ ax.plot(
643
+ dffix.x,
644
+ dffix.loc[:, f"y_{algo_choice}"],
645
+ label=algo_choice,
646
+ color=COLORS[algoIdx],
647
+ alpha=0.6,
648
+ linewidth=0.6,
649
+ )
650
+
651
+ add_arrow_annotations(dffix, f"y_{algo_choice}", ax, COLORS[algoIdx], 6)
652
+
653
+ ax.set_xlim((0, desired_width_in_pixels))
654
+ ax.set_ylim((0, desired_height_in_pixels))
655
+ ax.invert_yaxis()
656
+ if "Corrected Fixations" in fix_to_plot or "Uncorrected Fixations" in fix_to_plot:
657
+ ax.legend(prop={"size": 5})
658
+
659
+ return fig, desired_width_in_pixels, desired_height_in_pixels
660
+
661
+
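+ # Illustrative call to matplotlib_plot_df: plotting corrected fixations requires that
+ # dffix already contains a y_<algo_choice> column produced by one of the correction
+ # algorithms, e.g.:
+ # fig, width_px, height_px = matplotlib_plot_df(
+ #     dffix, trial, algo_choice, fix_to_plot=["Corrected Fixations"],
+ #     stim_info_to_plot=["Characters", "Word boxes"],
+ # )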
662
+ def add_arrow_annotations(dffix, y_col, ax, color, size):
663
+ x = dffix.x.values
664
+
665
+ y = dffix.loc[:, y_col].values
666
+
667
+ x = x[:-1]
668
+ y = y[:-1]
669
+ dX = -(x[1:] - x[:-1])
670
+ dY = -(y[1:] - y[:-1])
671
+
672
+ xpos = x[1:]
673
+ ypos = y[1:]
674
+ if isinstance(size, Iterable):
675
+ use_size_idx = True
676
+ else:
677
+ use_size_idx = False
678
+ s = size
679
+ for fidx, (X, Y, dX, dY) in enumerate(zip(xpos, ypos, dX, dY)):
680
+ if use_size_idx:
681
+ s = size[fidx]
682
+ ax.annotate(
683
+ "",
684
+ xytext=(X + 0.001 * dX, Y + 0.001 * dY),
685
+ xy=(X, Y),
686
+ arrowprops=dict(arrowstyle="fancy", color=color),
687
+ size=s,
688
+ alpha=0.3,
689
+ )
690
+
691
+
692
+ def plot_saccade_df(fix_df, sac_df, trial, show_numbers=False, add_lines_to_fix_df=False):
693
+ stim_only_fig, _, _ = matplotlib_plot_df(
694
+ fix_df,
695
+ trial,
696
+ None,
697
+ dffix_no_clean=None,
698
+ desired_dpi=300,
699
+ fix_to_plot=[],
700
+ stim_info_to_plot=["Characters", "Word boxes"],
701
+ box_annotations=None,
702
+ font=None,
703
+ )
704
+ if stim_only_fig is None:
705
+ fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
706
+ invert_ax_needed = True
707
+ else:
708
+ fig = stim_only_fig
709
+ ax = fig.axes[0]
710
+ invert_ax_needed = False
711
+
712
+ def plot_arrow(x1, y1, x2, y2, scale_factor):
713
+ """Plot an arrow from (x1,y1) to (x2,y2) with adjustable size"""
714
+ ax.arrow(
715
+ x1,
716
+ y1,
717
+ (x2 - x1),
718
+ (y2 - y1),
719
+ color="k",
720
+ alpha=0.7,
721
+ length_includes_head=True,
722
+ width=3 * scale_factor,
723
+ head_width=15 * scale_factor,
724
+ head_length=15 * scale_factor,
725
+ )
726
+
727
+ xs = sac_df["xs"].values
728
+ ys = sac_df["ys"].values
729
+ xe = sac_df["xe"].values
730
+ ye = sac_df["ye"].values
731
+ extent = np.sqrt((xs.min() - xe.max()) ** 2 + (ys.min() - ye.max()) ** 2)
732
+ scale_factor = 0.0005 * extent
733
+ for i in range(len(xs)):
734
+ plot_arrow(xs[i], ys[i], xe[i], ye[i], scale_factor=scale_factor)
735
+ if add_lines_to_fix_df:
736
+ plotfunc = ax.plot
737
+ else:
738
+ plotfunc = ax.scatter
739
+ if "x" in fix_df.columns:
740
+ plotfunc(fix_df["x"], fix_df["y"], marker=".")
741
+ else:
742
+ plotfunc(fix_df["xs"], fix_df["ys"], marker=".")
743
+
744
+ if invert_ax_needed:
745
+ ax.invert_yaxis()
746
+ if show_numbers:
747
+ size = 8 * scale_factor
748
+
749
+ xytext = (
750
+ 1,
751
+ -1,
752
+ )
753
+ for index, row in fix_df.iterrows():
754
+ ax.annotate(
755
+ index,
756
+ xy=(row["x"], row["y"]),
757
+ textcoords="offset points",
758
+ ha="center",
759
+ xytext=xytext,
760
+ va="bottom",
761
+ color="k",
762
+ size=size,
763
+ )
764
+
765
+ for index, row in sac_df.iterrows():
766
+ ax.annotate(
767
+ index,
768
+ xy=(row["xs"], row["ys"]),
769
+ textcoords="offset points",
770
+ ha="center",
771
+ xytext=xytext,
772
+ va="top",
773
+ color="r",
774
+ size=size,
775
+ )
776
+ return fig
777
+
778
+
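+ # Illustrative use of plot_saccade_df with the long-format events dataframe built below
+ # (fixations and saccades are split by their "msg" label; xs/ys/xe/ye hold saccade start
+ # and end coordinates):
+ # fix_df = events_df[events_df["msg"] == "FIX"]
+ # sac_df = events_df[events_df["msg"] == "SAC"]
+ # fig = plot_saccade_df(fix_df, sac_df, trial, show_numbers=True)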
779
+ def get_events_df_from_lines_and_trial_selection(trial, trial_lines, discard_fixations_without_sfix):
780
+
781
+ line_dicts = []
782
+ fixations_dicts = []
783
+ events_dicts = []
784
+ blink_started = False
785
+
786
+ fixation_started = False
787
+ esac_count = 0
788
+ efix_count = 0
789
+ sfix_count = 0
790
+ sblink_count = 0
791
+ eblink_times = []
792
+
793
+ eye_to_use = "R"
794
+ for l in trial_lines:
795
+ if "EFIX R" in l:
796
+ eye_to_use = "R"
797
+ break
798
+ elif "EFIX L" in l:
799
+ eye_to_use = "L"
800
+ break
801
+ for l in trial_lines:
802
+ parts = [x.strip() for x in l.split("\t")]
803
+ if f"EFIX {eye_to_use}" in l:
804
+ efix_count += 1
805
+ if fixation_started:
806
+ had_SFIX_before_it = True
807
+ if parts[1] == "." and parts[2] == ".":
808
+ continue
809
+ fixation_started = False
810
+ else:
811
+ had_SFIX_before_it = False
812
+ fix_dict = {
813
+ "fixation_number": efix_count,
814
+ "start_time": float(pd.to_numeric(parts[0].split()[-1].strip(), errors="coerce")),
815
+ "end_time": float(pd.to_numeric(parts[1].strip(), errors="coerce")),
816
+ "duration": float(pd.to_numeric(parts[2].strip(), errors="coerce")),
817
+ "x": float(pd.to_numeric(parts[3].strip(), errors="coerce")),
818
+ "y": float(pd.to_numeric(parts[4].strip(), errors="coerce")),
819
+ "pupil_size": float(pd.to_numeric(parts[5].strip(), errors="coerce")),
820
+ "had_SFIX_before_it": had_SFIX_before_it,
821
+ "msg": "FIX",
822
+ }
823
+ if not discard_fixations_without_sfix or had_SFIX_before_it:
824
+ fixations_dicts.append(fix_dict)
825
+ events_dicts.append(
826
+ {
827
+ "num": efix_count - 1,
828
+ "start": float(pd.to_numeric(parts[0].split()[-1].strip(), errors="coerce")),
829
+ "stop": float(pd.to_numeric(parts[1].strip(), errors="coerce")),
830
+ "duration": float(pd.to_numeric(parts[2].strip(), errors="coerce")),
831
+ "xs": float(pd.to_numeric(parts[3].strip(), errors="coerce")),
832
+ "xe": None,
833
+ "ys": float(pd.to_numeric(parts[4].strip(), errors="coerce")),
834
+ "ye": None,
835
+ "ampl": None,
836
+ "pv": None,
837
+ "pupil_size": float(pd.to_numeric(parts[5].strip(), errors="coerce")),
838
+ "msg": "FIX",
839
+ }
840
+ )
841
+ if len(fixations_dicts) >= 2:
842
+ assert fixations_dicts[-1]["start_time"] > fixations_dicts[-2]["start_time"], "start times not in order"
843
+ elif f"SFIX {eye_to_use}" in l:
844
+ sfix_count += 1
845
+ fixation_started = True
846
+ elif f"SBLINK {eye_to_use}" in l:
847
+ sblink_count += 1
848
+ blink_started = True
849
+ elif f"EBLINK {eye_to_use}" in l:
850
+ blink_started = False
851
+ blink_dict = {
852
+ "num": len(eblink_times),
853
+ "start": float(pd.to_numeric(parts[0].split()[-1].strip(), errors="coerce")),
854
+ "stop": float(pd.to_numeric(parts[1].strip(), errors="coerce")),
855
+ "duration": float(pd.to_numeric(parts[2].strip(), errors="coerce")),
856
+ "xs": None,
857
+ "xe": None,
858
+ "ys": None,
859
+ "ye": None,
860
+ "ampl": None,
861
+ "pv": None,
862
+ "pupil_size": None,
863
+ "msg": "BLINK",
864
+ }
865
+ events_dicts.append(blink_dict)
866
+ eblink_times.append(float(pd.to_numeric(parts[-1], errors="coerce")))
867
+ elif "ESACC" in l:
868
+ sac_dict = {
869
+ "num": esac_count,
870
+ "start": float(pd.to_numeric(parts[0].split()[-1].strip(), errors="coerce")),
871
+ "stop": float(pd.to_numeric(parts[1].strip(), errors="coerce")),
872
+ "duration": float(pd.to_numeric(parts[2].strip(), errors="coerce")),
873
+ "xs": float(pd.to_numeric(parts[3].strip(), errors="coerce")),
874
+ "ys": float(pd.to_numeric(parts[4].strip(), errors="coerce")),
875
+ "xe": float(pd.to_numeric(parts[5].strip(), errors="coerce")),
876
+ "ye": float(pd.to_numeric(parts[6].strip(), errors="coerce")),
877
+ "ampl": float(pd.to_numeric(parts[7].strip(), errors="coerce")),
878
+ "pv": float(pd.to_numeric(parts[8].strip(), errors="coerce")),
879
+ "pupil_size": None,
880
+ "msg": "SAC",
881
+ }
882
+ events_dicts.append(sac_dict)
883
+ esac_count += 1
884
+ if not blink_started and not any([True for x in event_strs if x in l]):
885
+ if len(parts) < 3 or (parts[1] == "." and parts[2] == "."):
886
+ continue
887
+ line_dicts.append(
888
+ {
889
+ "idx": float(pd.to_numeric(parts[0].strip(), errors="coerce")),
890
+ "x": float(pd.to_numeric(parts[1].strip(), errors="coerce")),
891
+ "y": float(pd.to_numeric(parts[2].strip(), errors="coerce")),
892
+ "p": float(pd.to_numeric(parts[3].strip(), errors="coerce")),
893
+ "part_of_fixation": fixation_started,
894
+ "fixation_number": sfix_count,
895
+ "part_of_blink": blink_started,
896
+ "blink_number": sblink_count,
897
+ }
898
+ )
899
+
900
+ trial["eblink_times"] = eblink_times
901
+ df = pd.DataFrame(line_dicts)
902
+ df["x_smoothed"] = np.convolve(df.x, np.ones((5,)) / 5, mode="same") # popEye smoothes this way
903
+ df["y_smoothed"] = np.convolve(df.y, np.ones((5,)) / 5, mode="same")
904
+ df["time"] = df["idx"] - df["idx"].iloc[0]
905
+ df = pf.compute_velocity(df)
906
+ events_df = pd.DataFrame(events_dicts)
907
+ events_df["start_uncorrected"] = events_df.start
908
+ events_df["stop_uncorrected"] = events_df.stop
909
+ events_df["start"] = events_df.start - trial["trial_start_time"]
910
+ events_df["stop"] = events_df.stop - trial["trial_start_time"]
911
+ events_df["start"] = events_df["start"].clip(0, events_df["start"].max())
912
+ events_df.sort_values(by="start", inplace=True) # Needed because blinks can happen during other events, I think
913
+ events_df.reset_index(drop=True, inplace=True)
914
+ events_df = pf.event_long(events_df)
915
+ events_df["duration"] = events_df["stop"] - events_df["start"]
916
+
917
+ trial["efix_count"] = efix_count
918
+ trial["eye_to_use"] = eye_to_use
919
+ trial["sfix_count"] = sfix_count
920
+ trial["sblink_count"] = sblink_count
921
+ return trial, df, events_df
922
+
923
+
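+ # get_events_df_from_lines_and_trial_selection (above) returns the updated trial dict,
+ # a raw gaze-sample dataframe with smoothed coordinates and velocity, and a long-format
+ # events dataframe in which every row is a parsed fixation ("FIX"), saccade ("SAC") or
+ # blink ("BLINK") event from the ASC trial lines.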
924
+ def add_default_font_and_character_props_to_state(trial):
925
+ chars_list = trial["chars_list"]
926
+ chars_df = pd.DataFrame(trial["chars_list"])
927
+ line_diffs = np.diff(chars_df.char_y_center.unique())
928
+ y_diffs = np.unique(line_diffs)
929
+ if len(y_diffs) > 1:
930
+ y_diff = np.min(y_diffs)
931
+ else:
932
+ y_diff = y_diffs[0]
933
+
934
+ y_diff = round(y_diff * 2) / 2
935
+ x_txt_start = chars_list[0]["char_xmin"]
936
+ y_txt_start = chars_list[0]["char_y_center"]
937
+
938
+ font_face, font_size = get_font_and_font_size_from_trial(trial)
939
+
940
+ line_height = y_diff
941
+ return y_diff, x_txt_start, y_txt_start, font_face, font_size, line_height
942
+
943
+
944
+ def get_raw_events_df_and_trial(trial, discard_fixations_without_sfix):
945
+ fname = pl.Path(trial["filename"]).stem
946
+ trial_id = trial["trial_id"]
947
+ trial_lines = trial.pop("trial_lines")
948
+
949
+ trial["plot_file"] = str(PLOTS_FOLDER.joinpath(f"{fname}_{trial_id}_2ndInput_chars_channel_sep.png"))
950
+
951
+ trial, df, events_df = get_events_df_from_lines_and_trial_selection(
952
+ trial, trial_lines, discard_fixations_without_sfix
953
+ )
954
+ trial["gaze_df"] = df
955
+ font, font_size, dpi, screen_res = get_plot_props(trial, AVAILABLE_FONTS)
956
+ trial["font"] = font
957
+ trial["font_size"] = font_size
958
+ trial["dpi"] = dpi
959
+ trial["screen_res"] = screen_res
960
+ if "chars_list" in trial:
961
+ chars_df = pd.DataFrame(trial["chars_list"])
962
+
963
+ chars_df = add_popEye_cols_to_chars_df(chars_df)
964
+
965
+ if "index" not in chars_df.columns:
966
+ chars_df.reset_index(inplace=True)
967
+ trial["chars_df"] = chars_df.to_dict()
968
+ trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
969
+ return reorder_columns(events_df), trial
970
+
971
+
972
+ def get_outlier_indeces(
973
+ dffix, chars_df, x_thres_in_chars, y_thresh_in_heights, xcol, ycol, letter_width_avg, line_heights_avg
974
+ ):
975
+ indeces_out = []
976
+ for linenum, line_chars_subdf in chars_df.groupby("assigned_line"):
977
+ left = line_chars_subdf["char_xmin"].min()
978
+ right = line_chars_subdf["char_xmax"].max()
979
+ top = line_chars_subdf["char_ymin"].min()
980
+ bottom = line_chars_subdf["char_ymax"].max()
981
+ left_min = left - (x_thres_in_chars * letter_width_avg)
982
+ right_max = right + (x_thres_in_chars * letter_width_avg)
983
+ top_max = top - (line_heights_avg * y_thresh_in_heights)
984
+ bottom_min = bottom + (line_heights_avg * y_thresh_in_heights)
985
+ indeces_out_line = []
986
+ indeces_out_line.extend(list(dffix.loc[dffix[xcol] < left_min, :].index))
987
+ indeces_out_line.extend(list(dffix.loc[dffix[xcol] > right_max, :].index))
988
+ indeces_out_line.extend(list(dffix.loc[dffix[ycol] < top_max, :].index))
989
+ indeces_out_line.extend(list(dffix.loc[dffix[ycol] > bottom_min, :].index))
990
+ indeces_out_line_set = set(indeces_out_line)
991
+ indeces_out.append(indeces_out_line_set)
992
+ return list(set.intersection(*indeces_out))
993
+
994
+
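+ # Note on get_outlier_indeces above: the thresholds are expressed in average letter
+ # widths (x) and line heights (y), and because the per-line index sets are intersected,
+ # a fixation counts as an outlier only if it lies outside the margins of every text line.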
995
+ def get_distance_between_fixations_in_characters_and_recalc_duration(
996
+ fix, letter_width_avg, start_colname="start", stop_colname="stop", xcol="xs"
997
+ ):
998
+ fix.reset_index(drop=True, inplace=True)
999
+ fix.loc[:, "duration"] = fix[stop_colname] - fix[start_colname]
1000
+ fix.loc[:, "distance_in_char_widths"] = 0.0
1001
+ for i in range(1, len(fix)):
1002
+ fix.loc[i, "distance_in_char_widths"] = np.round(
1003
+ np.abs(fix.loc[i, xcol] - fix.loc[i - 1, xcol]) / letter_width_avg, decimals=3
1004
+ )
1005
+ return fix
1006
+
1007
+
1008
+ def clean_fixations_popeye_no_sacc(fix, trial, duration_threshold, distance_threshold):
1009
+ if "letter_width_avg" in trial:
1010
+ letter_width_avg = trial["letter_width_avg"]
1011
+ else:
1012
+ letter_width_avg = 12
1013
+
1014
+ stop_time_col, start_time_col = get_time_cols(fix)
1015
+ if "xs" in fix.columns:
1016
+ x_colname = "xs"
1017
+ y_colname = "ys"
1018
+ else:
1019
+ x_colname = "x"
1020
+ y_colname = "y"
1021
+ if "blink" not in fix.columns:
1022
+ fix["blink"] = 0
1023
+ fix.dropna(subset=[x_colname, y_colname], how="any", axis=0, inplace=True)
1024
+ fix.reset_index(drop=True, inplace=True)
1025
+ fix = get_distance_between_fixations_in_characters_and_recalc_duration(
1026
+ fix, letter_width_avg, start_time_col, stop_time_col, x_colname
1027
+ )
1028
+
1029
+ fix["num"] = np.arange(len(fix), dtype=int)
1030
+ i = 0
1031
+ while i <= len(fix) - 1:
1032
+
1033
+ merge_before = False
1034
+ merge_after = False
1035
+
1036
+ if fix["duration"].iloc[i] <= duration_threshold:
1037
+
1038
+ # check fixation n - 1
1039
+ if i > 1:
1040
+ if (
1041
+ fix["duration"].iloc[i - 1] > duration_threshold
1042
+ and fix["blink"].iloc[i - 1] == 0
1043
+ and fix["distance_in_char_widths"].iloc[i] <= distance_threshold
1044
+ ):
1045
+ merge_before = True
1046
+ # check fixation n + 1
1047
+ if i < len(fix) - 1:
1048
+ if (
1049
+ fix["duration"].iloc[i + 1] > duration_threshold
1050
+ and fix["blink"].iloc[i + 1] == 0
1051
+ and fix["distance_in_char_widths"].iloc[i + 1] <= distance_threshold
1052
+ ):
1053
+ merge_after = True
1054
+
1055
+ # check merge.status
1056
+ if merge_before and not merge_after:
1057
+ merge = -1
1058
+ elif not merge_before and merge_after:
1059
+ merge = 1
1060
+ elif not merge_before and not merge_after:
1061
+ merge = 0
1062
+ elif merge_before and merge_after:
1063
+ if fix["duration"].iloc[i - 1] >= fix["duration"].iloc[i + 1]:
1064
+ merge = -1
1065
+ else:
1066
+ merge = 1
1067
+
1068
+ # close if above duration threshold
1069
+ else:
1070
+ merge = 0
1071
+
1072
+ if merge == 0:
1073
+ i += 1
1074
+
1075
+ elif merge == -1:
1076
+
1077
+ fix.loc[i - 1, stop_time_col] = fix.loc[i, stop_time_col]
1078
+ fix.loc[i - 1, x_colname] = round((fix.loc[i - 1, x_colname] + fix.loc[i, x_colname]) / 2)
1079
+ fix.loc[i - 1, y_colname] = round((fix.loc[i - 1, y_colname] + fix.loc[i, y_colname]) / 2)
1080
+
1081
+ fix = fix.drop(i, axis=0)
1082
+ fix.reset_index(drop=True, inplace=True)
1083
+
1084
+ start = fix[start_time_col].iloc[i - 1]
1085
+ stop = fix[stop_time_col].iloc[i - 1]
1086
+
1087
+ fix = get_distance_between_fixations_in_characters_and_recalc_duration(
1088
+ fix, letter_width_avg, start_time_col, stop_time_col, x_colname
1089
+ )
1090
+
1091
+ elif merge == 1:
1092
+ fix.loc[i + 1, start_time_col] = fix.loc[i, start_time_col]
1093
+ fix.loc[i + 1, x_colname] = round((fix.loc[i, x_colname] + fix.loc[i + 1, x_colname]) / 2)
1094
+ fix.loc[i + 1, y_colname] = round((fix.loc[i, y_colname] + fix.loc[i + 1, y_colname]) / 2)
1095
+
1096
+ fix.drop(index=i, inplace=True)
1097
+ fix.reset_index(drop=True, inplace=True)
1098
+
1099
+ start = fix.loc[i, start_time_col]
1100
+ stop = fix.loc[i, stop_time_col]
1101
+
1102
+ fix = get_distance_between_fixations_in_characters_and_recalc_duration(
1103
+ fix, letter_width_avg, start_time_col, stop_time_col, x_colname
1104
+ )
1105
+
1106
+ fix.loc[:, "num"] = np.arange(len(fix), dtype=int)
1107
+
1108
+ # delete last fixation
1109
+ if fix.iloc[-1]["duration"] < duration_threshold:
1110
+ fix = fix.iloc[:-1]
1111
+ trial["last_fixation_was_discarded_because_too_short"] = True
1112
+ else:
1113
+ trial["last_fixation_was_discarded_because_too_short"] = False
1114
+ fix.reset_index(drop=True, inplace=True)
1115
+ return fix.copy()
1116
+
1117
+
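+ # Illustrative call to clean_fixations_popeye_no_sacc (threshold values are examples,
+ # not prescribed defaults): fixations shorter than duration_threshold ms are merged into
+ # a neighbouring fixation lying within distance_threshold average character widths.
+ # fix_clean = clean_fixations_popeye_no_sacc(fix_df, trial, duration_threshold=80, distance_threshold=1)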
1118
+ def clean_dffix_own(
1119
+ trial: dict,
1120
+ choice_handle_short_and_close_fix: str,
1121
+ discard_far_out_of_text_fix,
1122
+ x_thres_in_chars,
1123
+ y_thresh_in_heights,
1124
+ short_fix_threshold,
1125
+ merge_distance_threshold: float,
1126
+ discard_long_fix: bool,
1127
+ discard_long_fix_threshold: int,
1128
+ discard_blinks: bool,
1129
+ dffix: pd.DataFrame,
1130
+ ):
1131
+ dffix = dffix.dropna(how="all", axis=1).copy()
1132
+ if dffix.empty:
1133
+ return dffix, trial
1134
+ dffix = dffix.rename(
1135
+ {
1136
+ k: v
1137
+ for k, v in {
1138
+ "xs": "x",
1139
+ "ys": "y",
1140
+ "num": "fixation_number",
1141
+ }.items()
1142
+ if v not in dffix.columns
1143
+ },
1144
+ axis=1,
1145
+ )
1146
+ stop_time_col, start_time_col = get_time_cols(dffix)
1147
+ add_time_cols(dffix, stop_time_col, start_time_col)
1148
+ if "dffix_no_clean" not in trial:
1149
+ trial["dffix_no_clean"] = (
1150
+ dffix.copy()
1151
+ ) # TODO check if cleaning can be dialed in or if dffix get overwritten every time
1152
+ add_time_cols(trial["dffix_no_clean"], stop_time_col, start_time_col)
1153
+
1154
+ trial["dffix_no_clean"]["was_merged"] = False
1155
+ trial["dffix_no_clean"]["was_discarded_short_fix"] = False
1156
+ trial["dffix_no_clean"]["was_discarded_outside_text"] = False
1157
+
1158
+ num_fix_before_clean = trial["dffix_no_clean"].shape[0]
1159
+ trial["Fixation Cleaning Stats"] = {}
1160
+ trial["Fixation Cleaning Stats"]["Number of fixations before cleaning"] = num_fix_before_clean
1161
+
1162
+ trial["Fixation Cleaning Stats"]["Discard fixation before or after blinks"] = discard_blinks
1163
+
1164
+ if discard_blinks and "blink" in dffix.columns:
1165
+ trial["dffix_no_clean"]["was_discarded_due_blinks"] = False
1166
+ dffix = dffix[dffix["blink"] == False].copy()
1167
+ trial["dffix_no_clean"].loc[
1168
+ ~trial["dffix_no_clean"]["start_time"].isin(dffix["start_time"]), "was_discarded_due_blinks"
1169
+ ] = True
1170
+ trial["Fixation Cleaning Stats"]["Number of discarded fixations due to blinks"] = (
1171
+ num_fix_before_clean - dffix.shape[0]
1172
+ )
1173
+ trial["Fixation Cleaning Stats"]["Number of discarded fixations due to blinks (%)"] = round(
1174
+ 100
1175
+ * (trial["Fixation Cleaning Stats"]["Number of discarded fixations due to blinks"] / num_fix_before_clean),
1176
+ 2,
1177
+ )
1178
+
1179
+ trial["Fixation Cleaning Stats"]["Discard long fixations"] = discard_long_fix
1180
+
1181
+ if discard_long_fix and not dffix.empty:
1182
+ dffix_before_long_fix_removal = dffix.copy()
1183
+ trial["dffix_no_clean"]["was_discarded_due_to_long_duration"] = False
1184
+ dffix = dffix[dffix["duration"] < discard_long_fix_threshold].copy()
1185
+ dffix_after_long_fix_removal = dffix.copy()
1186
+ trial["dffix_no_clean"].loc[
1187
+ (
1188
+ ~trial["dffix_no_clean"]["start_time"].isin(dffix_after_long_fix_removal["start_time"])
1189
+ & (trial["dffix_no_clean"]["start_time"].isin(dffix_before_long_fix_removal["start_time"]))
1190
+ ),
1191
+ "was_discarded_due_to_long_duration",
1192
+ ] = True
1193
+ trial["Fixation Cleaning Stats"]["Number of discarded long fixations"] = num_fix_before_clean - dffix.shape[0]
1194
+ trial["Fixation Cleaning Stats"]["Number of discarded long fixations (%)"] = round(
1195
+ 100 * (trial["Fixation Cleaning Stats"]["Number of discarded long fixations"] / num_fix_before_clean), 2
1196
+ )
1197
+ num_fix_before_merge = dffix.shape[0]
1198
+ trial["Fixation Cleaning Stats"]["How short and close fixations were handled"] = choice_handle_short_and_close_fix
1199
+ if (
1200
+ choice_handle_short_and_close_fix == "Merge" or choice_handle_short_and_close_fix == "Merge then discard"
1201
+ ) and not dffix.empty:
1202
+ dffix_before_merge = dffix.copy()
1203
+ dffix = clean_fixations_popeye_no_sacc(dffix, trial, short_fix_threshold, merge_distance_threshold)
1204
+ dffix_after_merge = dffix.copy()
1205
+ trial["dffix_no_clean"].loc[
1206
+ (~trial["dffix_no_clean"]["start_time"].isin(dffix_after_merge["start_time"]))
1207
+ & (trial["dffix_no_clean"]["start_time"].isin(dffix_before_merge["start_time"])),
1208
+ "was_merged",
1209
+ ] = True
1210
+ if trial["last_fixation_was_discarded_because_too_short"]:
1211
+ trial["dffix_no_clean"].iloc[-1, trial["dffix_no_clean"].columns.get_loc("was_merged")] = False
1212
+ trial["dffix_no_clean"].iloc[-1, trial["dffix_no_clean"].columns.get_loc("was_discarded_short_fix")] = True
1213
+ trial["Fixation Cleaning Stats"]["Number of merged fixations"] = (
1214
+ num_fix_before_merge - dffix_after_merge.shape[0]
1215
+ )
1216
+ trial["Fixation Cleaning Stats"]["Number of merged fixations (%)"] = round(
1217
+ 100 * (trial["Fixation Cleaning Stats"]["Number of merged fixations"] / num_fix_before_merge), 2
1218
+ )
1219
+
1220
+ if not dffix.empty:
1221
+ dffix.reset_index(drop=True, inplace=True)
1222
+ dffix.loc[:, "fixation_number"] = np.arange(dffix.shape[0])
1223
+ trial["x_thres_in_chars"], trial["y_thresh_in_heights"] = x_thres_in_chars, y_thresh_in_heights
1224
+ if "chars_list" in trial and not dffix.empty:
1225
+ indeces_out = get_outlier_indeces(
1226
+ dffix,
1227
+ pd.DataFrame(trial["chars_list"]),
1228
+ x_thres_in_chars,
1229
+ y_thresh_in_heights,
1230
+ "x",
1231
+ "y",
1232
+ trial["letter_width_avg"],
1233
+ np.mean(trial["line_heights"]),
1234
+ )
1235
+ else:
1236
+ indeces_out = []
1237
+ dffix["is_far_out_of_text_uncorrected"] = "in"
1238
+ if len(indeces_out) > 0:
1239
+ times_out = dffix.loc[indeces_out, "start_time"].copy()
1240
+ dffix.loc[indeces_out, "is_far_out_of_text_uncorrected"] = "out"
1241
+ trial["Fixation Cleaning Stats"]["Far out of text fixations were discarded"] = discard_far_out_of_text_fix
1242
+ if discard_far_out_of_text_fix and len(indeces_out) > 0:
1243
+ num_fix_before_clean_via_discard_far_out_of_text_fix = dffix.shape[0]
1244
+ trial["dffix_no_clean"].loc[
1245
+ trial["dffix_no_clean"]["start_time"].isin(times_out), "was_discarded_outside_text"
1246
+ ] = True
1247
+ dffix = dffix.loc[dffix["is_far_out_of_text_uncorrected"] == "in", :].reset_index(drop=True).copy()
1248
+ trial["Fixation Cleaning Stats"]["Number of discarded far-out-of-text fixations"] = (
1249
+ num_fix_before_clean_via_discard_far_out_of_text_fix - dffix.shape[0]
1250
+ )
1251
+ trial["Fixation Cleaning Stats"]["Number of discarded far-out-of-text fixations (%)"] = round(
1252
+ 100
1253
+ * (
1254
+ trial["Fixation Cleaning Stats"]["Number of discarded far-out-of-text fixations"]
1255
+ / num_fix_before_clean_via_discard_far_out_of_text_fix
1256
+ ),
1257
+ 2,
1258
+ )
1259
+ dffix = dffix.drop(columns="is_far_out_of_text_uncorrected")
1260
+ if (
1261
+ choice_handle_short_and_close_fix == "Discard"
1262
+ or choice_handle_short_and_close_fix == "Merge then discard"
1263
+ and not dffix.empty
1264
+ ):
1265
+ num_fix_before_clean_via_discard_short = dffix.shape[0]
1266
+ times_out = dffix.loc[(dffix["duration"] < short_fix_threshold), "start_time"].copy()
1267
+ if len(times_out) > 0:
1268
+ trial["dffix_no_clean"].loc[
1269
+ trial["dffix_no_clean"]["start_time"].isin(times_out), "was_discarded_short_fix"
1270
+ ] = True
1271
+ dffix = dffix[(dffix["duration"] >= short_fix_threshold)].reset_index(drop=True).copy()
1272
+ trial["Fixation Cleaning Stats"]["Number of discarded short fixations"] = (
1273
+ num_fix_before_clean_via_discard_short - dffix.shape[0]
1274
+ )
1275
+ trial["Fixation Cleaning Stats"]["Number of discarded short fixations (%)"] = round(
1276
+ 100
1277
+ * (trial["Fixation Cleaning Stats"]["Number of discarded short fixations"])
1278
+ / num_fix_before_clean_via_discard_short,
1279
+ 2,
1280
+ )
1281
+
1282
+ trial["Fixation Cleaning Stats"]["Total number of discarded and merged fixations"] = (
1283
+ num_fix_before_clean - dffix.shape[0]
1284
+ )
1285
+ trial["Fixation Cleaning Stats"]["Total number of discarded and merged fixations (%)"] = round(
1286
+ 100 * trial["Fixation Cleaning Stats"]["Total number of discarded and merged fixations"] / num_fix_before_clean,
1287
+ 2,
1288
+ )
1289
+
1290
+ if not dffix.empty:
1291
+ droplist = ["num", "msg"]
1292
+ if discard_blinks:
1293
+ droplist += ["blink", "blink_before", "blink_after"]
1294
+ for col in droplist:
1295
+ if col in dffix.columns:
1296
+ dffix = dffix.drop(col, axis=1)
1297
+
1298
+ if "start" in dffix.columns:
1299
+ dffix = dffix.drop(axis=1, labels=["start", "stop"])
1300
+ if "corrected_start_time" not in dffix.columns:
1301
+ min_start_time = min(dffix["start_uncorrected"])
1302
+ dffix["corrected_start_time"] = dffix["start_uncorrected"] - min_start_time
1303
+ dffix["corrected_end_time"] = dffix["stop_uncorrected"] - min_start_time
1304
+ assert all(np.diff(dffix["corrected_start_time"]) > 0), "start times not in order"
1305
+
1306
+ dffix_no_clean_fig, _, _ = matplotlib_plot_df(
1307
+ dffix,
1308
+ trial,
1309
+ None,
1310
+ trial["dffix_no_clean"],
1311
+ box_annotations=None,
1312
+ fix_to_plot=["Uncorrected Fixations"],
1313
+ stim_info_to_plot=["Characters", "Word boxes"],
1314
+ )
1315
+ savename = f"{trial['subject']}_{trial['trial_id']}_clean_compare.png"
1316
+ dffix_no_clean_fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300, bbox_inches="tight")
1317
+ plt.close(dffix_no_clean_fig)
1318
+
1319
+ dffix_clean_fig, _, _ = matplotlib_plot_df(
1320
+ dffix,
1321
+ trial,
1322
+ None,
1323
+ None,
1324
+ box_annotations=None,
1325
+ fix_to_plot=["Uncorrected Fixations"],
1326
+ stim_info_to_plot=["Characters", "Word boxes"],
1327
+ use_duration_arrow_sizes=False,
1328
+ )
1329
+ savename = f"{trial['subject']}_{trial['trial_id']}_after_clean.png"
1330
+ dffix_clean_fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300, bbox_inches="tight")
1331
+ plt.close(dffix_clean_fig)
1332
+ if "item" not in dffix.columns and "item" in trial:
1333
+ dffix.insert(loc=0, column="item", value=trial["item"])
1334
+ if "condition" not in dffix.columns and "condition" in trial:
1335
+ dffix.insert(loc=0, column="condition", value=trial["condition"])
1336
+ if "subject" not in dffix.columns and "subject" in trial:
1337
+ dffix.insert(loc=0, column="subject", value=trial["subject"])
1338
+ if "trial_id" not in dffix.columns and "trial_id" in trial:
1339
+ dffix.insert(loc=0, column="trial_id", value=trial["trial_id"])
1340
+ dffix = reorder_columns(dffix)
1341
+ return dffix, trial
1342
+
1343
+
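+ # clean_dffix_own (above) keeps an untouched copy of the fixations in trial["dffix_no_clean"],
+ # flags every merged or discarded fixation there (was_merged / was_discarded_* columns),
+ # stores per-step counts in trial["Fixation Cleaning Stats"], and returns only the
+ # fixations that survived cleaning, alongside diagnostic plots saved to the results folder.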
1344
+ def add_time_cols(dffix, stop_time_col, start_time_col):
1345
+ if "start_time" not in dffix.columns:
1346
+ dffix["start_time"] = dffix[start_time_col]
1347
+ if "end_time" not in dffix.columns:
1348
+ dffix["end_time"] = dffix[stop_time_col]
1349
+ if "duration" not in dffix.columns:
1350
+ dffix["duration"] = dffix["end_time"] - dffix["start_time"]
1351
+
1352
+
1353
+ def get_time_cols(dffix):
1354
+ if "stop" in dffix.columns:
1355
+ stop_time_col = "stop"
1356
+ elif "end_time" in dffix.columns:
1357
+ stop_time_col = "end_time"
1358
+ elif "corrected_end_time" in dffix.columns:
1359
+ stop_time_col = "corrected_end_time"
1360
+ if "start" in dffix.columns:
1361
+ start_time_col = "start"
1362
+ elif "start_time" in dffix.columns:
1363
+ start_time_col = "start_time"
1364
+ elif "corrected_start_time" in dffix.columns:
1365
+ start_time_col = "corrected_start_time"
1366
+ return stop_time_col, start_time_col
1367
+
1368
+
1369
+ def trial_to_dfs(
1370
+ trial: dict,
1371
+ discard_fixations_without_sfix,
1372
+ choice_handle_short_and_close_fix,
1373
+ discard_far_out_of_text_fix,
1374
+ x_thres_in_chars,
1375
+ y_thresh_in_heights,
1376
+ short_fix_threshold,
1377
+ merge_distance_threshold,
1378
+ discard_long_fix,
1379
+ discard_long_fix_threshold,
1380
+ discard_blinks,
1381
+ ):
1382
+ events_df, trial = get_raw_events_df_and_trial(trial, discard_fixations_without_sfix)
1383
+ dffix, trial = clean_dffix_own(
1384
+ trial,
1385
+ choice_handle_short_and_close_fix,
1386
+ discard_far_out_of_text_fix,
1387
+ x_thres_in_chars,
1388
+ y_thresh_in_heights,
1389
+ short_fix_threshold,
1390
+ merge_distance_threshold,
1391
+ discard_long_fix,
1392
+ discard_long_fix_threshold,
1393
+ discard_blinks,
1394
+ events_df[events_df["msg"] == "FIX"].copy(),
1395
+ )
1396
+
1397
+ dffix = dffix.dropna(how="all", axis=1).copy()
1398
+ trial["dffix"] = dffix
1399
+ trial["events_df"] = events_df
1400
+ return dffix, trial
1401
+
1402
+
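+ # trial_to_dfs ties the parsing and cleaning steps together. Example call with
+ # illustrative settings (all thresholds are assumptions, not prescribed defaults):
+ # dffix, trial = trial_to_dfs(
+ #     trial, discard_fixations_without_sfix=True,
+ #     choice_handle_short_and_close_fix="Merge", discard_far_out_of_text_fix=False,
+ #     x_thres_in_chars=10, y_thresh_in_heights=1, short_fix_threshold=80,
+ #     merge_distance_threshold=1, discard_long_fix=True,
+ #     discard_long_fix_threshold=800, discard_blinks=True,
+ # )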
1403
+ def get_all_measures(
1404
+ trial,
1405
+ dffix,
1406
+ prefix,
1407
+ use_corrected_fixations=True,
1408
+ correction_algo="Wisdom_of_Crowds",
1409
+ measures_to_calculate=["initial_landing_position"],
1410
+ include_coords=False,
1411
+ save_to_csv=False,
1412
+ ):
1413
+ stim_df = pd.DataFrame(trial[f"{prefix}s_list"])
1414
+ if f"{prefix}_number" not in stim_df.columns:
1415
+ stim_df[f"{prefix}_number"] = np.arange(stim_df.shape[0])
1416
+ if use_corrected_fixations:
1417
+ dffix_copy = copy.deepcopy(dffix)
1418
+ dffix_copy["y"] = dffix_copy[f"y_{correction_algo}"]
1419
+ else:
1420
+ dffix_copy = dffix
1421
+ correction_algo = "uncorrected"
1422
+ res_dfs = []
1423
+ for measure in measures_to_calculate:
1424
+ if hasattr(anf, f"{measure}_own"):
1425
+ function = getattr(anf, f"{measure}_own")
1426
+ result = function(trial, dffix_copy, prefix, correction_algo)
1427
+ res_dfs.append(result)
1428
+ dfs_list = [df for df in [stim_df] + res_dfs if not df.empty]
1429
+ own_measure_df = stim_df
1430
+ if len(dfs_list) > 1:
1431
+ for df in dfs_list[1:]:
1432
+ droplist = [col for col in df.columns if (col != f"{prefix}_number" and col in stim_df.columns)]
1433
+ own_measure_df = own_measure_df.merge(df.drop(columns=droplist), how="left", on=[f"{prefix}_number"])
1434
+ first_column = own_measure_df.pop(prefix)
1435
+ own_measure_df.insert(0, prefix, first_column)
1436
+ wordfirst = pf.aggregate_words_firstrun(dffix_copy, correction_algo, measures_to_calculate)
1437
+ wordtmp = pf.aggregate_words(dffix_copy, pd.DataFrame(trial["words_list"]), correction_algo, measures_to_calculate)
1438
+ out = pf.combine_words(
1439
+ dffix_copy,
1440
+ wordfirst=wordfirst,
1441
+ wordtmp=wordtmp,
1442
+ algo_choice=correction_algo,
1443
+ measures_to_calculate=measures_to_calculate,
1444
+ )
1445
+
1446
+ extra_cols = list(set(out.columns) - set(own_measure_df.columns))
1447
+ cols_to_add = ["word_number"] + extra_cols
1448
+ own_measure_df = pd.merge(own_measure_df, out.loc[:, cols_to_add], on="word_number", how="left")
1449
+
1450
+ first_cols = [
1451
+ "subject",
1452
+ "trial_id",
1453
+ "item",
1454
+ "condition",
1455
+ "question_correct",
1456
+ "word_number",
1457
+ "word",
1458
+ ]
1459
+ for col in first_cols:
1460
+ if col in trial and col not in own_measure_df.columns:
1461
+ own_measure_df.insert(loc=0, column=col, value=trial[col])
1462
+
1463
+ own_measure_df = own_measure_df.dropna(how="all", axis=1).copy()
1464
+ if not include_coords:
1465
+ word_cols = ["word_xmin", "word_xmax", "word_ymax", "word_xmin", "word_ymin", "word_x_center", "word_y_center"]
1466
+ own_measure_df = own_measure_df.drop(columns=word_cols)
1467
+
1468
+ own_measure_df = reorder_columns(own_measure_df)
1469
+ if "question_correct" in own_measure_df.columns:
1470
+ own_measure_df = own_measure_df.drop(columns=["question_correct"])
1471
+ if save_to_csv:
1472
+ own_measure_df.to_csv(
1473
+ RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_{correction_algo}_word_measures.csv"
1474
+ )
1475
+ return own_measure_df
1476
+
1477
+
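+ # Example get_all_measures call (illustrative): with use_corrected_fixations=True the
+ # fixation dataframe must already contain a y_<correction_algo> column; measure names
+ # are looked up as <measure>_own functions in analysis_funcs, with word-level popEye
+ # measures added via the aggregation helpers.
+ # word_measures = get_all_measures(
+ #     trial, dffix, prefix="word", correction_algo="Wisdom_of_Crowds",
+ #     measures_to_calculate=["initial_landing_position"], save_to_csv=False,
+ # )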
1478
+ def add_line_overlaps_to_sample(trial, sample):
1479
+ char_df = pd.DataFrame(trial["chars_list"])
1480
+ line_overlaps = []
1481
+ for arr in sample:
1482
+ y_val = arr[1]
1483
+ line_overlap = t.tensor(-1, dtype=t.float32)
1484
+ for idx, (x1, x2) in enumerate(zip(char_df.char_ymin.unique(), char_df.char_ymax.unique())):
1485
+ if x1 <= y_val <= x2:
1486
+ line_overlap = t.tensor(idx, dtype=t.float32)
1487
+ break
1488
+ line_overlaps.append(line_overlap)
1489
+ line_olaps_tensor = t.stack(line_overlaps, dim=0)
1490
+ sample = t.cat([sample, line_olaps_tensor.unsqueeze(1)], dim=1)
1491
+ return sample
1492
+
1493
+
1494
+ def norm_coords_by_letter_min_x_y(
1495
+ sample_idx: int,
1496
+ trialslist: list,
1497
+ samplelist: list,
1498
+ chars_center_coords_list: list = None,
1499
+ ):
1500
+ chars_df = pd.DataFrame(trialslist[sample_idx]["chars_list"])
1501
+ trialslist[sample_idx]["x_char_unique"] = list(chars_df.char_xmin.unique())
1502
+
1503
+ min_x_chars = chars_df.char_xmin.min()
1504
+ min_y_chars = chars_df.char_ymin.min()
1505
+
1506
+ norm_vector_substract = t.zeros(
1507
+ (1, samplelist[sample_idx].shape[1]), dtype=samplelist[sample_idx].dtype, device=samplelist[sample_idx].device
1508
+ )
1509
+ norm_vector_substract[0, 0] = norm_vector_substract[0, 0] + 1 * min_x_chars
1510
+ norm_vector_substract[0, 1] = norm_vector_substract[0, 1] + 1 * min_y_chars
1511
+
1512
+ samplelist[sample_idx] = samplelist[sample_idx] - norm_vector_substract
1513
+
1514
+ if chars_center_coords_list is not None:
1515
+ norm_vector_substract = norm_vector_substract.squeeze(0)[:2]
1516
+ if chars_center_coords_list[sample_idx].shape[-1] == norm_vector_substract.shape[-1] * 2:
1517
+ chars_center_coords_list[sample_idx][:, :2] -= norm_vector_substract
1518
+ chars_center_coords_list[sample_idx][:, 2:] -= norm_vector_substract
1519
+ else:
1520
+ chars_center_coords_list[sample_idx] -= norm_vector_substract
1521
+ return trialslist, samplelist, chars_center_coords_list
1522
+
1523
+
1524
+ def norm_coords_by_letter_positions(
1525
+ sample_idx: int,
1526
+ trialslist: list,
1527
+ samplelist: list,
1528
+ meanlist: list = None,
1529
+ stdlist: list = None,
1530
+ return_mean_std_lists=False,
1531
+ norm_by_char_averages=False,
1532
+ chars_center_coords_list: list = None,
1533
+ add_normalised_values_as_features=False,
1534
+ ):
1535
+ chars_df = pd.DataFrame(trialslist[sample_idx]["chars_list"])
1536
+ trialslist[sample_idx]["x_char_unique"] = list(chars_df.char_xmin.unique())
1537
+
1538
+ min_x_chars = chars_df.char_xmin.min()
1539
+ max_x_chars = chars_df.char_xmax.max()
1540
+
1541
+ norm_vector_multi = t.ones(
1542
+ (1, samplelist[sample_idx].shape[1]), dtype=samplelist[sample_idx].dtype, device=samplelist[sample_idx].device
1543
+ )
1544
+ if norm_by_char_averages:
1545
+ chars_list = trialslist[sample_idx]["chars_list"]
1546
+ char_widths = np.asarray([x["char_xmax"] - x["char_xmin"] for x in chars_list])
1547
+ char_heights = np.asarray([x["char_ymax"] - x["char_ymin"] for x in chars_list])
1548
+ char_widths_average = np.mean(char_widths[char_widths > 0])
1549
+ char_heights_average = np.mean(char_heights[char_heights > 0])
1550
+
1551
+ norm_vector_multi[0, 0] = norm_vector_multi[0, 0] * char_widths_average
1552
+ norm_vector_multi[0, 1] = norm_vector_multi[0, 1] * char_heights_average
1553
+
1554
+ else:
1555
+ line_height = min(np.unique(trialslist[sample_idx]["line_heights"]))
1556
+ line_width = max_x_chars - min_x_chars
1557
+ norm_vector_multi[0, 0] = norm_vector_multi[0, 0] * line_width
1558
+ norm_vector_multi[0, 1] = norm_vector_multi[0, 1] * line_height
1559
+ assert ~t.any(t.isnan(norm_vector_multi)), "Nan found in char norming vector"
1560
+
1561
+ norm_vector_multi = norm_vector_multi.squeeze(0)
1562
+ if add_normalised_values_as_features:
1563
+ norm_vector_multi = norm_vector_multi[norm_vector_multi != 1]
1564
+ normed_features = samplelist[sample_idx][:, : norm_vector_multi.shape[0]] / norm_vector_multi
1565
+ samplelist[sample_idx] = t.cat([samplelist[sample_idx], normed_features], dim=1)
1566
+ else:
1567
+ samplelist[sample_idx] = samplelist[sample_idx] / norm_vector_multi # in case time or pupil size is included
1568
+ if chars_center_coords_list is not None:
1569
+ norm_vector_multi = norm_vector_multi[:2]
1570
+ if chars_center_coords_list[sample_idx].shape[-1] == norm_vector_multi.shape[-1] * 2:
1571
+ chars_center_coords_list[sample_idx][:, :2] /= norm_vector_multi
1572
+ chars_center_coords_list[sample_idx][:, 2:] /= norm_vector_multi
1573
+ else:
1574
+ chars_center_coords_list[sample_idx] /= norm_vector_multi
1575
+ if return_mean_std_lists:
1576
+ mean_val = samplelist[sample_idx].mean(axis=0).cpu().numpy()
1577
+ meanlist.append(mean_val)
1578
+ std_val = samplelist[sample_idx].std(axis=0).cpu().numpy()
1579
+ stdlist.append(std_val)
1580
+ assert ~any(pd.isna(mean_val)), "Nan found in mean_val"
1581
+ assert ~any(pd.isna(mean_val)), "Nan found in std_val"
1582
+
1583
+ return trialslist, samplelist, meanlist, stdlist, chars_center_coords_list
1584
+ return trialslist, samplelist, chars_center_coords_list
1585
+
1586
+
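+ # The two norm_coords_* helpers above prepare model inputs: fixation coordinates are
+ # first shifted so the top-left character of the stimulus sits at the origin, then
+ # scaled either by average character width/height or by line width/height, so values
+ # are comparable across stimuli with different fonts and layouts.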
1587
+ def get_fig_ax(screen_res, dpi, words_df, x_margin, y_margin, dffix=None, prefix="word"):
1588
+ fig = plt.figure(figsize=(screen_res[0] / dpi, screen_res[1] / dpi), dpi=dpi)
1589
+ ax = plt.Axes(fig, [0.0, 0.0, 1.0, 1.0])
1590
+ ax.set_axis_off()
1591
+ if dffix is not None:
1592
+ ax.set_ylim((dffix.y.min(), dffix.y.max()))
1593
+ ax.set_xlim((dffix.x.min(), dffix.x.max()))
1594
+ else:
1595
+ ax.set_ylim((words_df[f"{prefix}_y_center"].min() - y_margin, words_df[f"{prefix}_y_center"].max() + y_margin))
1596
+ ax.set_xlim((words_df[f"{prefix}_x_center"].min() - x_margin, words_df[f"{prefix}_x_center"].max() + x_margin))
1597
+ ax.invert_yaxis()
1598
+ fig.add_axes(ax)
1599
+ return fig, ax
1600
+
1601
+
1602
+ def get_save_path(fpath, fname_ending):
1603
+ save_path = PLOTS_FOLDER.joinpath(f"{fpath.stem}_{fname_ending}.png")
1604
+ return save_path
1605
+
1606
+
1607
+ def save_im_load_convert(fpath, fig, fname_ending, mode):
1608
+ save_path = get_save_path(fpath, fname_ending)
1609
+ fig.savefig(save_path)
1610
+ im = Image.open(save_path).convert(mode)
1611
+ im.save(save_path)
1612
+ return im
1613
+
1614
+
1615
+ def plot_text_boxes_fixations(
1616
+ fpath,
1617
+ dpi,
1618
+ screen_res,
1619
+ set_font_size: bool,
1620
+ font_size: int,
1621
+ dffix=None,
1622
+ trial=None,
1623
+ ):
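+ # Renders three greyscale layers (character text, character boxes, fixation scatter) and stacks them into a single 3-channel image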
1624
+ if isinstance(fpath, str):
1625
+ fpath = pl.Path(fpath)
1626
+ prefix = "char"
1627
+
1628
+ if dffix is None:
1629
+ dffix = pd.read_csv(fpath)
1630
+ if trial is None:
1631
+ json_fpath = str(fpath).replace("_fixations.csv", "_trial.json")
1632
+ with open(json_fpath, "r") as f:
1633
+ trial = json.load(f)
1634
+ words_df = pd.DataFrame(trial[f"{prefix}s_list"])
1635
+ x_right = words_df[f"{prefix}_xmin"]
1636
+ x_left = words_df[f"{prefix}_xmax"]
1637
+ y_top = words_df[f"{prefix}_ymax"]
1638
+ y_bottom = words_df[f"{prefix}_ymin"]
1639
+
1640
+ if f"{prefix}_x_center" not in words_df.columns:
1641
+ words_df[f"{prefix}_x_center"] = (words_df[f"{prefix}_xmax"] - words_df[f"{prefix}_xmin"]) / 2 + words_df[
1642
+ f"{prefix}_xmin"
1643
+ ]
1644
+ words_df[f"{prefix}_y_center"] = (words_df[f"{prefix}_ymax"] - words_df[f"{prefix}_ymin"]) / 2 + words_df[
1645
+ f"{prefix}_ymin"
1646
+ ]
1647
+
1648
+ x_margin = words_df[f"{prefix}_x_center"].mean() / 8
1649
+ y_margin = words_df[f"{prefix}_y_center"].mean() / 4
1650
+ times = dffix.corrected_start_time - dffix.corrected_start_time.min()
1651
+ times = times / times.max()
1652
+ times = np.linspace(0.25, 1, len(times))
1653
+
1654
+ font = "monospace"
1655
+ if not set_font_size:
1656
+ # fall back to a font size derived from the trial's recorded font size
1657
+ font_size = trial["font_size"] * 27 // dpi
1658
+
1659
+ font_props = FontProperties(family=font, style="normal", size=font_size)
1660
+
1661
+ fig, ax = get_fig_ax(screen_res, dpi, words_df, x_margin, y_margin, prefix=prefix)
1662
+
1663
+ ax.scatter(words_df[f"{prefix}_x_center"], words_df[f"{prefix}_y_center"], s=1, facecolor="k", alpha=0.01)
1664
+ for idx in range(len(x_left)):
1665
+ ax.text(
1666
+ words_df[f"{prefix}_x_center"][idx],
1667
+ words_df[f"{prefix}_y_center"][idx],
1668
+ words_df[prefix][idx],
1669
+ horizontalalignment="center",
1670
+ verticalalignment="center",
1671
+ fontproperties=font_props,
1672
+ )
1673
+ fname_ending = f"{prefix}s_grey"
1674
+ words_grey_im = save_im_load_convert(fpath, fig, fname_ending, "L")
1675
+
1676
+ plt.close("all")
1677
+ fig, ax = get_fig_ax(screen_res, dpi, words_df, x_margin, y_margin, prefix=prefix)
1678
+
1679
+ ax.scatter(words_df[f"{prefix}_x_center"], words_df[f"{prefix}_y_center"], s=1, facecolor="k", alpha=0.1)
1680
+ for idx in range(len(x_left)):
1681
+ xdiff = x_right[idx] - x_left[idx]
1682
+ ydiff = y_top[idx] - y_bottom[idx]
1683
+ rect = patches.Rectangle(
1684
+ (x_left[idx] - 1, y_bottom[idx] - 1), xdiff, ydiff, alpha=0.9, linewidth=1, edgecolor="k", facecolor="grey"
1685
+ ) # seems to need one pixel offset
1686
+ ax.add_patch(rect)
1687
+ fname_ending = f"{prefix}_boxes_grey"
1688
+ word_boxes_grey_im = save_im_load_convert(fpath, fig, fname_ending, "L")
1689
+
1690
+ plt.close("all")
1691
+
1692
+ fig, ax = get_fig_ax(screen_res, dpi, words_df, x_margin, y_margin, prefix=prefix)
1693
+
1694
+ ax.scatter(dffix.x, dffix.y, facecolor="k", alpha=times)
1695
+ fname_ending = "fix_scatter_grey"
1696
+ fix_scatter_grey_im = save_im_load_convert(fpath, fig, fname_ending, "L")
1697
+
1698
+ plt.close("all")
1699
+
1700
+ arr_combo = np.stack(
1701
+ [
1702
+ np.asarray(words_grey_im),
1703
+ np.asarray(word_boxes_grey_im),
1704
+ np.asarray(fix_scatter_grey_im),
1705
+ ],
1706
+ axis=2,
1707
+ )
1708
+
1709
+ im_combo = Image.fromarray(arr_combo)
1710
+ fname_ending = f"{prefix}s_channel_sep"
1711
+
1712
+ im_combo.save(fpath)
1713
+
1714
+ return im_combo
1715
+
1716
+
1717
+ def prep_data_for_dist(model_cfg, dffix, trial):
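+ # Build the model input: optional coordinate normalisation, z-scoring with the stored training means/stds, and a single-trial DataLoader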
1718
+ if isinstance(dffix, dict):
1719
+ dffix = dffix["value"]
1720
+ sample_tensor = t.tensor(dffix.loc[:, model_cfg["sample_cols"]].to_numpy(), dtype=t.float32)
1721
+
1722
+ if model_cfg["add_line_overlap_feature"]:
1723
+ sample_tensor = add_line_overlaps_to_sample(trial, sample_tensor)
1724
+
1725
+ has_nans = t.any(t.isnan(sample_tensor))
1726
+ assert not has_nans, "NaNs found in sample tensor"
1727
+ samplelist_eval = [sample_tensor]
1728
+ trialslist_eval = [trial]
1729
+ chars_center_coords_list_eval = None
1730
+ if model_cfg["norm_coords_by_letter_min_x_y"]:
1731
+ for sample_idx, _ in enumerate(samplelist_eval):
1732
+ trialslist_eval, samplelist_eval, chars_center_coords_list_eval = norm_coords_by_letter_min_x_y(
1733
+ sample_idx,
1734
+ trialslist_eval,
1735
+ samplelist_eval,
1736
+ chars_center_coords_list=chars_center_coords_list_eval,
1737
+ )
1738
+
1739
+ if model_cfg["normalize_by_line_height_and_width"]:
1740
+ meanlist_eval, stdlist_eval = [], []
1741
+ for sample_idx, _ in enumerate(samplelist_eval):
1742
+ (
1743
+ trialslist_eval,
1744
+ samplelist_eval,
1745
+ meanlist_eval,
1746
+ stdlist_eval,
1747
+ chars_center_coords_list_eval,
1748
+ ) = norm_coords_by_letter_positions(
1749
+ sample_idx,
1750
+ trialslist_eval,
1751
+ samplelist_eval,
1752
+ meanlist_eval,
1753
+ stdlist_eval,
1754
+ return_mean_std_lists=True,
1755
+ norm_by_char_averages=model_cfg["norm_by_char_averages"],
1756
+ chars_center_coords_list=chars_center_coords_list_eval,
1757
+ add_normalised_values_as_features=model_cfg["add_normalised_values_as_features"],
1758
+ )
1759
+ sample_tensor = samplelist_eval[0]
1760
+ sample_means = t.tensor(model_cfg["sample_means"], dtype=t.float32)
1761
+ sample_std = t.tensor(model_cfg["sample_std"], dtype=t.float32)
1762
+ sample_tensor = (sample_tensor - sample_means) / sample_std
1763
+ sample_tensor = sample_tensor.unsqueeze(0)
1764
+ if not pl.Path(trial["plot_file"]).exists():
1765
+ plot_text_boxes_fixations(
1766
+ fpath=trial["plot_file"],
1767
+ dpi=250,
1768
+ screen_res=(1024, 768),
1769
+ set_font_size=True,
1770
+ font_size=4,
1771
+ dffix=dffix,
1772
+ trial=trial,
1773
+ )
1774
+
1775
+ val_set = DSet(
1776
+ sample_tensor,
1777
+ None,
1778
+ t.zeros((1, sample_tensor.shape[1])),
1779
+ trialslist_eval,
1780
+ padding_list=[0],
1781
+ padding_at_end=model_cfg["padding_at_end"],
1782
+ return_images_for_conv=True,
1783
+ im_partial_string=model_cfg["im_partial_string"],
1784
+ input_im_shape=model_cfg["char_plot_shape"],
1785
+ )
1786
+ val_loader = dl(val_set, batch_size=1, shuffle=False, num_workers=0)
1787
+ return val_loader, val_set
1788
+
1789
+
1790
+ def fold_in_seq_dim(out, y=None):
1791
+ batch_size, seq_len, num_classes = out.shape
1792
+
1793
+ out = eo.rearrange(out, "b s c -> (b s) c", s=seq_len)
1794
+ if y is None:
1795
+ return out, None
1796
+ if len(y.shape) > 2:
1797
+ y = eo.rearrange(y, "b s c -> (b s) c", s=seq_len)
1798
+ else:
1799
+ y = eo.rearrange(y, "b s -> (b s)", s=seq_len)
1800
+ return out, y
1801
+
1802
+
1803
+ def logits_to_pred(out, y=None):
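+ # Flatten the sequence dimension, decode the ordinal (CORN) logits into line indices, then reshape back to (batch, seq)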
1804
+ seq_len = out.shape[1]
1805
+ out, y = fold_in_seq_dim(out, y)
1806
+ preds = corn_label_from_logits(out)
1807
+ preds = eo.rearrange(preds, "(b s) -> b s", s=seq_len)
1808
+ if y is not None:
1809
+ y = eo.rearrange(y.squeeze(), "(b s) -> b s", s=seq_len)
1810
+ y = y
1811
+ return preds, y
1812
+
1813
+
1814
+ def get_DIST_preds(dffix, trial, models_dict):
1815
+ algo_choice = "DIST"
1816
+
1817
+ model = models_dict["single_DIST_model"]
1818
+ loader, dset = prep_data_for_dist(models_dict["single_DIST_model_cfg"], dffix, trial)
1819
+ batch = next(iter(loader))
1820
+
1821
+ if "cpu" not in str(model.device):
1822
+ batch = [x.cuda() for x in batch]
1823
+ try:
1824
+ out = model(batch)
1825
+ preds, y = logits_to_pred(out, y=None)
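+ # Clamp predicted line indices to the number of text lines and map them to the y-centres of the character lines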
1826
+ if len(trial["y_char_unique"]) < 1:
1827
+ y_char_unique = pd.DataFrame(trial["chars_list"]).char_y_center.sort_values().unique()
1828
+ else:
1829
+ y_char_unique = trial["y_char_unique"]
1830
+ num_lines = trial["num_char_lines"] - 1
1831
+ preds = t.clamp(preds, 0, num_lines).squeeze().cpu().numpy()
1832
+ y_pred_DIST = [y_char_unique[idx] for idx in preds]
1833
+
1834
+ dffix[f"line_num_{algo_choice}"] = preds
1835
+ dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=2)
1836
+ dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(2)
1837
+ except Exception as e:
1838
+ ic(f"Exception on model(batch) for DIST \n{e}")
1839
+ return dffix
1840
+
1841
+
1842
+ def get_DIST_ensemble_preds(
1843
+ dffix,
1844
+ trial,
1845
+ model_cfg_without_norm_df,
1846
+ model_cfg_with_norm_df,
1847
+ ensemble_model_avg,
1848
+ ):
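+ # Average the outputs of the two DIST model variants (with and without line-height/width normalisation) before decoding line assignments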
1849
+ algo_choice = "DIST-Ensemble"
1850
+ loader_without_norm, dset_without_norm = prep_data_for_dist(model_cfg_without_norm_df, dffix, trial)
1851
+ loader_with_norm, dset_with_norm = prep_data_for_dist(model_cfg_with_norm_df, dffix, trial)
1852
+ batch_without_norm = next(iter(loader_without_norm))
1853
+ batch_with_norm = next(iter(loader_with_norm))
1854
+ out = ensemble_model_avg((batch_without_norm, batch_with_norm))
1855
+ preds, y = logits_to_pred(out[0]["out_avg"], y=None)
1856
+ if len(trial["y_char_unique"]) < 1:
1857
+ y_char_unique = pd.DataFrame(trial["chars_list"]).char_y_center.sort_values().unique()
1858
+ else:
1859
+ y_char_unique = trial["y_char_unique"]
1860
+ num_lines = trial["num_char_lines"] - 1
1861
+ preds = t.clamp(preds, 0, num_lines).squeeze().cpu().numpy()
1862
+ y_pred_DIST = [y_char_unique[idx] for idx in preds]
1863
+
1864
+ dffix[f"line_num_{algo_choice}"] = preds
1865
+ dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=1)
1866
+ dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
1867
+ return dffix
1868
+
1869
+
1870
+ def get_EDIST_preds_with_model_check(dffix, trial, models_dict):
1871
+
1872
+ dffix = get_DIST_ensemble_preds(
1873
+ dffix,
1874
+ trial,
1875
+ models_dict["model_cfg_without_norm_df"],
1876
+ models_dict["model_cfg_with_norm_df"],
1877
+ models_dict["ensemble_model_avg"],
1878
+ )
1879
+ return dffix
1880
+
1881
+
1882
+ def get_all_classic_preds(dffix, trial, classic_algos_cfg):
1883
+ corrections = []
1884
+ for algo, classic_params in copy.deepcopy(classic_algos_cfg).items():
1885
+ dffix = calgo.apply_classic_algo(dffix, trial, algo, classic_params)
1886
+ corrections.append(np.asarray(dffix.loc[:, f"y_{algo}"]))
1887
+ return dffix, corrections
1888
+
1889
+
1890
+ def apply_woc(dffix, trial, corrections, algo_choice):
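+ # Combine the candidate corrections into a consensus ("wisdom of the crowd") y value per fixation and derive line numbers from it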
1891
+
1892
+ corrected_Y = calgo.wisdom_of_the_crowd(corrections)
1893
+ dffix.loc[:, f"y_{algo_choice}"] = corrected_Y
1894
+ dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
1895
+ corrected_line_nums = [trial["y_char_unique"].index(y) for y in corrected_Y]
1896
+ dffix.loc[:, f"line_num_y_{algo_choice}"] = corrected_line_nums
1897
+ dffix.loc[:, f"line_num_{algo_choice}"] = corrected_line_nums
1898
+ return dffix
1899
+
1900
+
1901
+ def apply_correction_algo(dffix, algo_choice, trial, models_dict, classic_algos_cfg):
1902
+
1903
+ if algo_choice == "DIST":
1904
+ dffix = get_DIST_preds(dffix, trial, models_dict=models_dict)
1905
+
1906
+ elif algo_choice == "DIST-Ensemble":
1907
+ dffix = get_EDIST_preds_with_model_check(dffix, trial, models_dict=models_dict)
1908
+ elif algo_choice == "Wisdom_of_Crowds_with_DIST":
1909
+ dffix, corrections = get_all_classic_preds(dffix, trial, classic_algos_cfg)
1910
+ dffix = get_DIST_preds(dffix, trial, models_dict=models_dict)
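+ # Append the DIST prediction three times so it carries extra weight in the crowd vote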
1911
+ for _ in range(3):
1912
+ corrections.append(np.asarray(dffix.loc[:, "y_DIST"]))
1913
+ dffix = apply_woc(dffix, trial, corrections, algo_choice)
1914
+ elif algo_choice == "Wisdom_of_Crowds_with_DIST_Ensemble":
1915
+ dffix, corrections = get_all_classic_preds(dffix, trial, classic_algos_cfg)
1916
+ dffix = get_EDIST_preds_with_model_check(dffix, trial, models_dict=models_dict)
1917
+ for _ in range(3):
1918
+ corrections.append(np.asarray(dffix.loc[:, "y_DIST-Ensemble"]))
1919
+ dffix = apply_woc(dffix, trial, corrections, algo_choice)
1920
+ elif algo_choice == "Wisdom_of_Crowds":
1921
+ dffix, corrections = get_all_classic_preds(dffix, trial, classic_algos_cfg)
1922
+ dffix = apply_woc(dffix, trial, corrections, algo_choice)
1923
+
1924
+ else:
1925
+ algo_cfg = classic_algos_cfg[algo_choice]
1926
+ dffix = calgo.apply_classic_algo(dffix, trial, algo_choice, algo_cfg)
1927
+ dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
1928
+ dffix = dffix.copy() # apparently helps with fragmentation
1929
+ return dffix
1930
+
1931
+
1932
+ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_add: list):
1933
+ """
1934
+ Required for word or sentence measures:
1935
+ - letternum
1936
+ - letter
1937
+ - on_word_number
1938
+ - on_word
1939
+ - on_sentence
1940
+ - num_words_in_sentence
1941
+ - on_sentence_num
1942
+ - word_land
1943
+ - line_let
1944
+ - line_word
1945
+ - sac_in
1946
+ - sac_out
1947
+ - word_launch
1948
+ - word_refix
1949
+ - word_reg_in
1950
+ - word_reg_out
1951
+ - sentence_reg_in
1952
+ - word_firstskip
1953
+ - word_run
1954
+ - sentence_run
1955
+ - word_run_fix
1956
+ - word_cland
1957
+ Optional:
1958
+ - line_let_from_last_letter
1959
+ - sentence_word
1960
+ - line_let_previous
1961
+ - line_let_next
1962
+ - sentence_refix
1963
+ - word_reg_out_to
1964
+ - word_reg_in_from
1965
+ - sentence_reg_out
1966
+ - sentence_reg_in_from
1967
+ - sentence_reg_out_to
1968
+ - sentence_firstskip
1969
+ - word_runid
1970
+ - sentence_runid
1971
+ - word_fix
1972
+ - sentence_fix
1973
+ """
1974
+ if "angle_incoming" in cols_to_add:
1975
+ x_diff_incoming = dffix[xcol].values - dffix[xcol].shift(1).values
1976
+ y_diff_incoming = dffix["y"].values - dffix["y"].shift(1).values
1977
+ angle_incoming = np.arctan2(y_diff_incoming, x_diff_incoming) * (180 / np.pi)
1978
+ dffix["angle_incoming"] = angle_incoming
1979
+ if "angle_outgoing" in cols_to_add:
1980
+ x_diff_outgoing = dffix[xcol].shift(-1).values - dffix[xcol].values
1981
+ y_diff_outgoing = dffix["y"].shift(-1).values - dffix["y"].values
1982
+ angle_outgoing = np.arctan2(y_diff_outgoing, x_diff_outgoing) * (180 / np.pi)
1983
+ dffix["angle_outgoing"] = angle_outgoing
1984
+ dffix[f"line_change_{algo_choice}"] = np.concatenate(
1985
+ ([0], np.diff(dffix[f"line_num_{algo_choice}"])), axis=0
1986
+ ).astype(int)
1987
+
1988
+ for i in list(dffix.index):
1989
+ if dffix.loc[i, f"line_num_{algo_choice}"] > -1 and not pd.isna(dffix.loc[i, f"line_num_{algo_choice}"]):
1990
+ selected_stimmat = chars_df[
1991
+ chars_df["assigned_line"] == dffix.loc[i, f"line_num_{algo_choice}"]
1992
+ ].reset_index()
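+ # Map the fixation to the closest character (by horizontal distance to the character centre) on its assigned line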
1993
+ selected_stimmat.loc[:, "letword"] = selected_stimmat.groupby("in_word_number")["letternum"].rank()
1994
+ letters_on_line = selected_stimmat.shape[0]
1995
+ out = dffix.loc[i, xcol] - selected_stimmat["char_x_center"]
1996
+ min_idx = out.abs().idxmin()
1997
+ dffix.loc[i, f"letternum_{algo_choice}"] = selected_stimmat.loc[min_idx, "letternum"]
1998
+ dffix.loc[i, f"letter_{algo_choice}"] = selected_stimmat.loc[min_idx, "char"]
1999
+ dffix.loc[i, f"line_let_{algo_choice}"] = selected_stimmat.loc[min_idx, "letline"]
2000
+ if "line_let_from_last_letter" in cols_to_add:
2001
+ dffix.loc[i, f"line_let_from_last_letter_{algo_choice}"] = (
2002
+ letters_on_line - dffix.loc[i, f"line_let_{algo_choice}"]
2003
+ )
2004
+ word_min_idx = min_idx
2005
+ if (
2006
+ selected_stimmat.loc[min_idx, "char"] == " "
2007
+ and (min_idx - 1) in selected_stimmat.index
2008
+ and (min_idx + 1) in selected_stimmat.index
2009
+ ):
2010
+ dist_to_previous_letter = np.abs(
2011
+ dffix.loc[i, xcol] - selected_stimmat.loc[min_idx - 1, "char_x_center"]
2012
+ )
2013
+ dist_to_following_letter = np.abs(
2014
+ dffix.loc[i, xcol] - selected_stimmat.loc[min_idx + 1, "char_x_center"]
2015
+ )
2016
+ if dist_to_previous_letter < dist_to_following_letter:
2017
+ word_min_idx = min_idx - 1
2018
+ if not pd.isna(selected_stimmat.loc[min_idx, "in_word_number"]):
2019
+ dffix.loc[i, f"on_word_number_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "in_word_number"]
2020
+ dffix.loc[i, f"on_word_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "in_word"]
2021
+ dffix.loc[i, f"word_land_{algo_choice}"] = selected_stimmat.loc[
2022
+ word_min_idx, "num_letters_from_start_of_word"
2023
+ ]
2024
+ dffix.loc[i, f"line_word_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "wordline"]
2025
+ if "sentence_word" in cols_to_add:
2026
+ dffix.loc[i, f"sentence_word_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "wordsent"]
2027
+ dffix.loc[i, "num_words_in_sentence"] = len(selected_stimmat.loc[word_min_idx, "in_sentence"].split(" "))
2028
+ dffix.loc[i, f"on_sentence_num_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "in_sentence_number"]
2029
+ dffix.loc[i, f"on_sentence_{algo_choice}"] = selected_stimmat.loc[word_min_idx, "in_sentence"]
2030
+ if "line_let_previous" in cols_to_add:
2031
+ dffix[f"line_let_previous_{algo_choice}"] = dffix[f"line_let_{algo_choice}"].shift(-1)
2032
+ if "line_let_next" in cols_to_add:
2033
+ dffix[f"line_let_next_{algo_choice}"] = dffix[f"line_let_{algo_choice}"].shift(1)
2034
+ dffix = pf.compute_saccade_length(dffix, chars_df, algo_choice)
2035
+ dffix = pf.compute_launch_distance(dffix, algo_choice)
2036
+ dffix = pf.compute_refixation(dffix, algo_choice)
2037
+ dffix = pf.compute_regression(dffix, algo_choice)
2038
+ dffix = pf.compute_firstskip(dffix, algo_choice)
2039
+ dffix = pf.compute_run(dffix, algo_choice)
2040
+ dffix = pf.compute_landing_position(dffix, algo_choice)
2041
+ dffix = dffix.loc[:, ~dffix.columns.duplicated()]
2042
+ return dffix
2043
+
2044
+
2045
+ def export_dataframe(df: pd.DataFrame, csv_name: str):
2046
+ if isinstance(df, dict):
2047
+ df = df["value"]
2048
+ df.to_csv(csv_name)
2049
+ return csv_name
2050
+
2051
+
2052
+ def _convert_to_json(obj):
2053
+ if isinstance(obj, (int, float, str, bool)):
2054
+ return obj
2055
+ elif isinstance(obj, dict):
2056
+ return {k: _convert_to_json(v) for k, v in obj.items()}
2057
+ elif isinstance(obj, list) or isinstance(obj, tuple):
2058
+ return [_convert_to_json(item) for item in obj]
2061
+ elif hasattr(obj, "to_dict"):
2062
+ return _convert_to_json(obj.to_dict())
2063
+ elif hasattr(obj, "tolist"):
2064
+ return _convert_to_json(obj.tolist())
2065
+ elif obj is None:
2066
+ return None
2067
+ else:
2068
+ raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
2069
+
2070
+
2071
+ def save_trial_to_json(trial, savename):
2072
+ filtered_trial = {}
2073
+ for key, value in trial.items():
2074
+ try:
2075
+ filtered_trial[key] = _convert_to_json(value)
2076
+ except TypeError as e:
2077
+ ic(f"Warning: Skipping non-serializable value for key '{key}' due to error: {e}")
2078
+
2079
+ with open(savename, "w", encoding="utf-8") as f:
2080
+ json.dump(filtered_trial, f, ensure_ascii=False, indent=4)
2081
+
2082
+
2083
+ def export_trial(trial: dict):
2084
+
2085
+ trial_id = trial["trial_id"]
2086
+ savename = RESULTS_FOLDER.joinpath(pl.Path(trial["filename"]).stem)
2087
+ trial_name = f"{savename}_{trial_id}_trial_info.json"
2088
+
2089
+ filtered_trial = copy.deepcopy(trial)
2090
+ _ = [filtered_trial.pop(k) for k in list(filtered_trial.keys()) if isinstance(filtered_trial[k], pd.DataFrame)]
2091
+ _ = [
2092
+ filtered_trial.pop(k)
2093
+ for k in list(filtered_trial.keys())
2094
+ if k
2095
+ in [
2096
+ "words_list",
2097
+ "chars_list",
2098
+ "chars_df_alt",
2099
+ "EMReading_fix",
2100
+ "chars_df",
2101
+ "dffix_sacdf_popEye",
2102
+ "fixdf_popEye",
2103
+ "sacdf_popEye",
2104
+ "saccade_df",
2105
+ "combined_df",
2106
+ "own_sentence_measures_dfs_for_algo",
2107
+ "own_word_measures_dfs_for_algo",
2108
+ ]
2109
+ ]
2110
+
2111
+ filtered_trial["line_heights"] = list(np.unique(filtered_trial["line_heights"]))
2112
+ save_trial_to_json(filtered_trial, trial_name)
2113
+ return trial_name
2114
+
2115
+
2116
+ def add_cols_from_trial(trial, df, cols=["item", "condition", "trial_id", "subject"]):
2117
+ for col in cols:
2118
+ if col not in df.columns:
2119
+ df.insert(loc=0, column=col, value=trial[col])
2120
+
2121
+
2122
+ def correct_df(
2123
+ dffix,
2124
+ algo_choice,
2125
+ trial,
2126
+ for_multi,
2127
+ is_outside_of_streamlit,
2128
+ classic_algos_cfg,
2129
+ models_dict,
2130
+ measures_to_calculate_multi_asc=[],
2131
+ include_coords_multi_asc=False,
2132
+ sent_measures_to_calc_multi=[],
2133
+ fix_cols_to_add=[],
2134
+ ):
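+ # Apply each requested line-assignment algorithm, save diagnostic plots, add popEye-style fixation columns and, for batch processing, per-word and per-sentence measures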
2135
+ if is_outside_of_streamlit:
2136
+ stqdm = tqdm
2137
+ else:
2138
+ from stqdm import stqdm
2139
+
2140
+ if isinstance(dffix, dict):
2141
+ dffix = dffix["value"]
2142
+ if "x" not in dffix.keys() or "x" not in dffix.keys():
2143
+ ic(f"x or y not in dffix")
2144
+ ic(dffix.columns)
2145
+ return dffix
2146
+
2147
+ if isinstance(algo_choice, list):
2148
+ algo_choices = algo_choice
2149
+ repeats = range(len(algo_choice))
2150
+ else:
2151
+ algo_choices = [algo_choice]
2152
+ repeats = range(1)
2153
+
2154
+ chars_df = pd.DataFrame(trial["chars_df"]) if "chars_df" in trial else pd.DataFrame(trial["chars_list"])
2155
+ if for_multi:
2156
+ own_word_measures_dfs_for_algo = []
2157
+ own_sentence_measures_dfs_for_algo = []
2158
+ trial["average_y_corrections"] = []
2159
+ for algoIdx in stqdm(repeats, desc="Applying line-assignment algorithms"):
2160
+ algo_choice = algo_choices[algoIdx]
2161
+ dffix = apply_correction_algo(dffix, algo_choice, trial, models_dict, classic_algos_cfg)
2162
+ average_y_correction = (dffix[f"y_{algo_choice}"] - dffix["y"]).mean().round(1)
2163
+ trial["average_y_corrections"].append({"Algorithm": algo_choice, "average_y_correction": average_y_correction})
2164
+ fig, desired_width_in_pixels, desired_height_in_pixels = matplotlib_plot_df(
2165
+ dffix,
2166
+ trial,
2167
+ algo_choice,
2168
+ None,
2169
+ box_annotations=None,
2170
+ fix_to_plot=["Uncorrected Fixations", "Corrected Fixations"],
2171
+ stim_info_to_plot=["Characters", "Word boxes"],
2172
+ )
2173
+ savename = f"{trial['subject']}_{trial['trial_id']}_corr_{algo_choice}_fix.png"
2174
+ fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300)
2175
+ plt.close(fig)
2176
+ dffix = add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, "x", cols_to_add=fix_cols_to_add)
2177
+
2178
+ if for_multi and len(measures_to_calculate_multi_asc) > 0 and dffix.shape[0] > 1:
2179
+ own_word_measures = get_all_measures(
2180
+ trial,
2181
+ dffix,
2182
+ prefix="word",
2183
+ use_corrected_fixations=True,
2184
+ correction_algo=algo_choice,
2185
+ measures_to_calculate=measures_to_calculate_multi_asc,
2186
+ include_coords=include_coords_multi_asc,
2187
+ )
2188
+ own_word_measures_dfs_for_algo.append(own_word_measures)
2189
+ sent_measures_multi = pf.compute_sentence_measures(
2190
+ dffix, pd.DataFrame(trial["chars_df"]), algo_choice, sent_measures_to_calc_multi
2191
+ )
2192
+ own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
2193
+
2194
+ if for_multi and len(own_word_measures_dfs_for_algo) > 0:
2195
+ words_df = (
2196
+ pd.DataFrame(trial["chars_df"])
2197
+ .drop_duplicates(subset="in_word_number", keep="first")
2198
+ .loc[:, ["in_word_number", "in_word"]]
2199
+ .rename({"in_word_number": "word_number", "in_word": "word"}, axis=1)
2200
+ .reset_index(drop=True)
2201
+ )
2202
+ add_cols_from_trial(trial, words_df, cols=["item", "condition", "trial_id", "subject"])
2203
+ words_df["subject_trialID"] = [f"{id}_{num}" for id, num in zip(words_df["subject"], words_df["trial_id"])]
2204
+ words_df = words_df.merge(
2205
+ own_word_measures_dfs_for_algo[0],
2206
+ how="left",
2207
+ on=["subject", "trial_id", "item", "condition", "word_number", "word"],
2208
+ )
2209
+ for word_measure_df in own_word_measures_dfs_for_algo[1:]:
2210
+ words_df = words_df.merge(
2211
+ word_measure_df, how="left", on=["subject", "trial_id", "item", "condition", "word_number", "word"]
2212
+ )
2213
+ words_df = reorder_columns(words_df, ["subject", "trial_id", "item", "condition", "word_number", "word"])
2214
+
2215
+ sentence_df = (
2216
+ pd.DataFrame(trial["chars_df"])
2217
+ .drop_duplicates(subset="in_sentence_number", keep="first")
2218
+ .loc[
2219
+ :,
2220
+ [
2221
+ "in_sentence_number",
2222
+ "in_sentence",
2223
+ ],
2224
+ ]
2225
+ .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
2226
+ .reset_index(drop=True)
2227
+ )
2228
+ add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
2229
+ sentence_df["subject_trialID"] = [
2230
+ f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
2231
+ ]
2232
+ sentence_df = sentence_df.merge(
2233
+ own_sentence_measures_dfs_for_algo[0],
2234
+ how="left",
2235
+ on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
2236
+ )
2237
+ for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
2238
+ sentence_df = sentence_df.merge(
2239
+ sent_measure_df,
2240
+ how="left",
2241
+ on=["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
2242
+ )
2243
+ sentence_df = reorder_columns(
2244
+ sentence_df, ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"]
2245
+ )
2246
+
2247
+ trial["own_word_measures_dfs_for_algo"] = words_df
2248
+
2249
+ trial["own_sentence_measures_dfs_for_algo"] = sentence_df
2250
+ dffix = reorder_columns(dffix)
2251
+ if for_multi:
2252
+ return dffix
2253
+ else:
2254
+ fix_cols_to_keep = [
2255
+ c
2256
+ for c in dffix.columns
2257
+ if (
2258
+ (any([lname in c for lname in ALL_FIX_MEASURES]) and any([lname in c for lname in fix_cols_to_add]))
2259
+ or (not any([lname in c for lname in ALL_FIX_MEASURES]))
2260
+ )
2261
+ ]
2262
+
2263
+ savename = RESULTS_FOLDER.joinpath(pl.Path(trial["filename"]).stem)
2264
+ csv_name = f"{savename}_{trial['trial_id']}_corrected_fixations.csv"
2265
+ csv_name = export_dataframe(dffix.loc[:, fix_cols_to_keep].copy(), csv_name)
2266
+
2267
+ export_trial(trial)
2268
+ return dffix
2269
+
2270
+
2271
+ def process_trial_choice(
2272
+ trial: dict,
2273
+ algo_choice: str,
2274
+ choice_handle_short_and_close_fix,
2275
+ for_multi,
2276
+ discard_fixations_without_sfix,
2277
+ discard_far_out_of_text_fix,
2278
+ x_thres_in_chars,
2279
+ y_thresh_in_heights,
2280
+ short_fix_threshold,
2281
+ merge_distance_threshold,
2282
+ discard_long_fix,
2283
+ discard_long_fix_threshold,
2284
+ discard_blinks,
2285
+ measures_to_calculate_multi_asc,
2286
+ include_coords_multi_asc,
2287
+ sent_measures_to_calculate_multi_asc,
2288
+ classic_algos_cfg,
2289
+ models_dict,
2290
+ fix_cols_to_add,
2291
+ ):
2292
+
2293
+ dffix, trial = trial_to_dfs(
2294
+ trial=trial,
2295
+ choice_handle_short_and_close_fix=choice_handle_short_and_close_fix,
2296
+ discard_fixations_without_sfix=discard_fixations_without_sfix,
2297
+ discard_far_out_of_text_fix=discard_far_out_of_text_fix,
2298
+ x_thres_in_chars=x_thres_in_chars,
2299
+ y_thresh_in_heights=y_thresh_in_heights,
2300
+ short_fix_threshold=short_fix_threshold,
2301
+ discard_long_fix=discard_long_fix,
2302
+ discard_long_fix_threshold=discard_long_fix_threshold,
2303
+ merge_distance_threshold=merge_distance_threshold,
2304
+ discard_blinks=discard_blinks,
2305
+ )
2306
+ if "chars_list" in trial:
2307
+ chars_df = pd.DataFrame(trial["chars_df"])
2308
+
2309
+ trial["chars_df"] = chars_df.to_dict()
2310
+ trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
2311
+ if algo_choice is not None and ("chars_list" in trial or "words_list" in trial):
2312
+ if dffix.shape[0] > 1:
2313
+ dffix = correct_df(
2314
+ dffix,
2315
+ algo_choice,
2316
+ trial,
2317
+ for_multi=for_multi,
2318
+ is_outside_of_streamlit=False,
2319
+ classic_algos_cfg=classic_algos_cfg,
2320
+ models_dict=models_dict,
2321
+ measures_to_calculate_multi_asc=measures_to_calculate_multi_asc,
2322
+ include_coords_multi_asc=include_coords_multi_asc,
2323
+ sent_measures_to_calc_multi=sent_measures_to_calculate_multi_asc,
2324
+ fix_cols_to_add=fix_cols_to_add,
2325
+ )
2326
+
2327
+ saccade_df = get_saccade_df(dffix, trial, algo_choice, trial.pop("events_df"))
2328
+ trial["saccade_df"] = saccade_df.to_dict()
2329
+
2330
+ fig = plot_saccade_df(dffix, saccade_df, trial, True, False)
2331
+ fig.savefig(RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_saccades.png")
2332
+ plt.close(fig)
2333
+ else:
2334
+ ic(
2335
+ f"🚨 Only {dffix.shape[0]} fixation left after processing. saccade_df not created for trial {trial['trial_id']} 🚨"
2336
+ )
2337
+
2338
+ else:
2339
+ ic("🚨 Stimulus information needed for fixation line-assignment 🚨")
2340
+ for c in ["gaze_df", "dffix"]:
2341
+ if c in trial:
2342
+ trial.pop(c)
2343
+ return dffix, trial
2344
+
2345
+
2346
+ def get_saccade_df(dffix, trial, algo_choices, events_df):
2347
+ if not isinstance(algo_choices, list):
2348
+ algo_choices = [algo_choices]
2349
+ sac_df_as_detected = events_df[events_df["msg"] == "SAC"].copy()
2350
+ last_sacc_stop_time = sac_df_as_detected["stop_uncorrected"].iloc[-1]
2351
+ dffix_after_last_sacc = dffix.loc[dffix["start_uncorrected"] > last_sacc_stop_time, :].copy()
2352
+ if not dffix_after_last_sacc.empty:
2353
+ dffix_before_last_sacc = dffix.loc[dffix["start_uncorrected"] < last_sacc_stop_time, :].copy()
2354
+ dffix = pd.concat([dffix_before_last_sacc, dffix_after_last_sacc.iloc[[0], :]], axis=0)
2355
+ sac_df_as_detected = sac_df_as_detected[sac_df_as_detected["start"] >= dffix["end_time"].iloc[0]]
2356
+ sac_df_as_detected = sac_df_as_detected[sac_df_as_detected["stop"] <= dffix["start_time"].iloc[-1]]
2357
+
2358
+ sac_index_keep = [
2359
+ i for i, row in sac_df_as_detected.iterrows() if np.abs(row["start"] - dffix["start_time"].values).min() < 100
2360
+ ]
2361
+ sac_df_as_detected = sac_df_as_detected.loc[sac_index_keep, :]
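+ # Pair each saccade with the fixation ending at or before its start (ffill) and the fixation starting at or after its stop (bfill) so fixation attributes can be copied onto the saccade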
2362
+
2363
+ starts = pd.Series(dffix["start_time"].values, dffix["start_time"])
2364
+ ends = pd.Series(dffix["end_time"].values, dffix["end_time"])
2365
+ starts_reind = starts.reindex(sac_df_as_detected["stop"], method="bfill").dropna()
2366
+ ends_reind = ends.reindex(sac_df_as_detected["start"], method="ffill").dropna()
2367
+
2368
+ sac_df_as_detected_start_indexed = sac_df_as_detected.copy().set_index("start")
2369
+ saccade_df = (
2370
+ sac_df_as_detected_start_indexed.loc[ends_reind.index, :]
2371
+ .reset_index(drop=False)
2372
+ .rename({"start": "start_time", "stop": "end_time"}, axis=1)
2373
+ )
2374
+
2375
+ saccade_df = pf.get_angle_and_eucl_dist(saccade_df)
2376
+ # TODO maybe add incoming outgoing angle from sacc_df to dffix
2377
+
2378
+ dffix_start_indexed = dffix.copy().set_index("start_time")
2379
+ dffix_end_indexed = dffix.copy().set_index("end_time")
2380
+ for algo_choice in algo_choices:
2381
+
2382
+ saccade_df[f"ys_{algo_choice}"] = dffix_end_indexed.loc[ends_reind.values, f"y_{algo_choice}"].values
2383
+ saccade_df[f"ye_{algo_choice}"] = dffix_start_indexed.loc[starts_reind.values, f"y_{algo_choice}"].values
2384
+ saccade_df = pf.get_angle_and_eucl_dist(saccade_df, algo_choice)
2385
+
2386
+ saccade_df[f"lines_{algo_choice}"] = dffix_end_indexed.loc[ends_reind.values, f"line_num_{algo_choice}"].values
2387
+ saccade_df[f"linee_{algo_choice}"] = dffix_start_indexed.loc[
2388
+ starts_reind.values, f"line_num_{algo_choice}"
2389
+ ].values
2390
+
2391
+ saccade_df[f"line_word_s_{algo_choice}"] = dffix_end_indexed.loc[
2392
+ ends_reind.values, f"line_word_{algo_choice}"
2393
+ ].values
2394
+ saccade_df[f"line_word_e_{algo_choice}"] = dffix_start_indexed.loc[
2395
+ starts_reind.values, f"line_word_{algo_choice}"
2396
+ ].values
2397
+
2398
+ saccade_df[f"lets_{algo_choice}"] = dffix_end_indexed.loc[ends_reind.values, f"letternum_{algo_choice}"].values
2399
+ saccade_df[f"lete_{algo_choice}"] = dffix_start_indexed.loc[
2400
+ starts_reind.values, f"letternum_{algo_choice}"
2401
+ ].values
2402
+
2403
+ blink_df = events_df[events_df["msg"] == "BLINK"]
2404
+ for i in range(len(saccade_df)):
2405
+ if saccade_df.loc[i, "start_time"] in blink_df["start"]:
2406
+ saccade_df.loc[i, "blink"] = True
2407
+
2408
+ saccade_df = pf.compute_non_line_dependent_saccade_measures(saccade_df, trial)
2409
+ for algo_choice in algo_choices:
2410
+ saccade_df = pf.compute_saccade_measures(saccade_df, trial, algo_choice)
2411
+
2412
+ if "msg" in saccade_df.columns:
2413
+ saccade_df = saccade_df.drop(axis=1, labels=["msg"])
2414
+ saccade_df = reorder_columns(saccade_df)
2415
+ return saccade_df.dropna(how="all", axis=1).copy()
popEye_funcs.py ADDED
@@ -0,0 +1,1373 @@
1
+ """
2
+ Mostly adapted from: https://github.com/sascha2schroeder/popEye
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from icecream import ic
8
+ from scipy import stats
9
+ import pathlib as pl
10
+
11
+ RESULTS_FOLDER = pl.Path("results")
12
+
13
+
14
+ def compute_velocity(xy):
15
+ samp = 1000
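+ # Smoothed five-sample central-difference velocity estimate; samp is the assumed sampling rate in Hz, with two-sample differences at the edges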
16
+
17
+ N = xy.shape[0]
18
+ v = pd.DataFrame(data=np.zeros((N, 3)), columns=["time", "vx", "vy"])
19
+ v["time"] = xy["time"]
20
+
21
+ v.iloc[2 : (N - 2), 1:3] = (
22
+ samp
23
+ / 6
24
+ * (
25
+ xy.iloc[4:N, 1:3].values
26
+ + xy.iloc[3 : (N - 1), 1:3].values
27
+ - xy.iloc[1 : (N - 3), 1:3].values
28
+ - xy.iloc[0 : (N - 4), 1:3].values
29
+ )
30
+ )
31
+ v.iloc[1, 1:3] = samp / 2 * (xy.iloc[2, 1:3].values - xy.iloc[0, 1:3].values)
32
+ v.iloc[(N - 2), 1:3] = samp / 2 * (xy.iloc[N - 1, 1:3].values - xy.iloc[N - 3, 1:3].values)
33
+
34
+ xy = pd.concat([xy.set_index("time"), v.set_index("time")], axis=1).reset_index()
35
+ return xy
36
+
37
+
38
+ def event_long(events_df):
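+ # Drop zero-duration events, merge each BLINK with the preceding event (which is removed), and flag neighbouring events via blink_before/blink_after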
39
+ events_df["duration"] = events_df["stop"] - events_df["start"]
40
+ events_df = events_df[events_df["duration"] > 0]
41
+ events_df = events_df.drop(columns=["duration"])
42
+ events_df.reset_index(drop=True, inplace=True)
43
+ tmplong_cols = list(events_df.columns)
44
+ tmplong_cols.remove("msg")
45
+ events_df["del"] = 0
46
+ for i in events_df.index:
47
+ if events_df.loc[i, "msg"] == "BLINK":
48
+ if i == 0:
49
+ continue
50
+ for col in tmplong_cols:
51
+ events_df.loc[i, col] = events_df.loc[i - 1, col]
52
+ events_df.loc[i - 1, "del"] = 1
53
+
54
+ events_df = events_df[events_df["del"] == 0]
55
+ events_df = events_df.drop(columns=["del"])
56
+ events_df.reset_index(drop=True, inplace=True)
57
+ events_df["num"] = range(len(events_df))
58
+ # compute blinks
59
+ # ---------------
60
+
61
+ events_df["blink_before"] = 0
62
+ events_df["blink_after"] = 0
63
+
64
+ for i in events_df.index:
65
+ if events_df.loc[i, "msg"] == "BLINK":
66
+ events_df.loc[i - 1, "blink_after"] = 1
67
+ if i < len(events_df) - 1:
68
+ events_df.loc[i + 1, "blink_before"] = 1
69
+
70
+ # combine
71
+ events_df["blink"] = (events_df["blink_before"] == 1) | (events_df["blink_after"] == 1)
72
+ return events_df.copy()
73
+
74
+
75
+ def compute_non_line_dependent_saccade_measures(saccade_df, trial_dict):
76
+
77
+ saccade_df["trial_id"] = trial_dict["trial_id"]
78
+ gaze_df = trial_dict["gaze_df"]
79
+ for s in range(len(saccade_df)):
80
+ is_directional_deviation = False
81
+ a = saccade_df["start_time"][s]
82
+ b = saccade_df["end_time"][s]
83
+
84
+ if not gaze_df["x"][[True if (a <= x <= b) else False for x in gaze_df["time"]]].any():
85
+ gaze_df.loc[a:b, "x"] = np.nan
86
+
87
+ bool_vec = (gaze_df["time"] >= a) & (gaze_df["time"] <= b)
88
+ if (not gaze_df["x"][bool_vec].isna().any()) and bool_vec.any():
89
+ # saccade amplitude (dX, dY)
90
+ minx = min(gaze_df.loc[bool_vec, "x"])
91
+ maxx = max(gaze_df.loc[bool_vec, "x"])
92
+ if "calibration_method" not in trial_dict or trial_dict["calibration_method"] != "H3":
93
+ miny = min(gaze_df.loc[bool_vec, "y"])
94
+ maxy = max(gaze_df.loc[bool_vec, "y"])
95
+ ix1 = gaze_df.loc[bool_vec, "x"].index[np.argmin(gaze_df.loc[bool_vec, "x"])]
96
+ ix2 = gaze_df.loc[bool_vec, "x"].index[np.argmax(gaze_df.loc[bool_vec, "x"])]
97
+ if "calibration_method" not in trial_dict or trial_dict["calibration_method"] != "H3":
98
+ iy1 = gaze_df.loc[bool_vec, "y"].index[np.argmin(gaze_df.loc[bool_vec, "y"])]
99
+ iy2 = gaze_df.loc[bool_vec, "y"].index[np.argmax(gaze_df.loc[bool_vec, "y"])]
100
+ saccade_df.loc[s, "dX"] = round(np.sign(ix2 - ix1) * (maxx - minx))
101
+ if "calibration_method" not in trial_dict or trial_dict["calibration_method"] != "H3":
102
+ saccade_df.loc[s, "dY"] = round(np.sign(iy2 - iy1) * (maxy - miny))
103
+
104
+ # saccade amplitude/angle
105
+ if "calibration_method" not in trial_dict or trial_dict["calibration_method"] != "H3":
106
+ saccade_df.loc[s, "amp_px"] = round(
107
+ np.sqrt(saccade_df.loc[s, "dX"] ** 2 + saccade_df.loc[s, "dY"] ** 2)
108
+ )
109
+ saccade_df.loc[s, "amp_angle"] = round(np.arctan2(saccade_df.loc[s, "dY"], saccade_df.loc[s, "dX"]), 2)
110
+ saccade_df.loc[s, "amp_angle_deg"] = round(
111
+ np.arctan2(saccade_df.loc[s, "dY"], saccade_df.loc[s, "dX"]) * (180 / np.pi), 2
112
+ )
113
+
114
+ else:
115
+ saccade_df.loc[s, "amp_px"] = np.nan
116
+ saccade_df.loc[s, "amp_angle"] = np.nan
117
+ saccade_df.loc[s, "amp_angle_deg"] = np.nan
118
+
119
+ if 35 <= abs(saccade_df.loc[s, "angle"]) <= 145:
120
+ if saccade_df.loc[s, "xe"] - saccade_df.loc[s, "xs"] > 0 and not (
121
+ "blink_before" in saccade_df.columns
122
+ and (saccade_df.loc[s, "blink_before"] or saccade_df.loc[s, "blink_after"])
123
+ ):
124
+ is_directional_deviation = True
125
+
126
+ saccade_df.loc[s, "is_directional_deviation"] = is_directional_deviation
127
+
128
+ return saccade_df
129
+
130
+
131
+ def compute_saccade_measures(saccade_df, trial_dict, algo_choice):
132
+
133
+ if algo_choice is not None:
134
+ algo_str = f"_{algo_choice}"
135
+ else:
136
+ algo_str = ""
137
+ gaze_df = trial_dict["gaze_df"]
138
+ saccade_df.reset_index(drop=True, inplace=True)
139
+ saccade_df.loc[:, f"has_line_change{algo_str}"] = (
140
+ saccade_df.loc[:, f"lines{algo_str}"] != saccade_df.loc[:, f"linee{algo_str}"]
141
+ )
142
+ saccade_df.loc[:, f"goes_to_next_line{algo_str}"] = saccade_df.loc[:, f"linee{algo_str}"] == (
143
+ saccade_df.loc[:, f"lines{algo_str}"] + 1
144
+ )
145
+ saccade_df.loc[:, f"is_directional_deviation{algo_str}"] = False
146
+ saccade_df.loc[:, f"is_return_sweep{algo_str}"] = False
147
+
148
+ for sidx, subdf in saccade_df.groupby(f"lines{algo_str}"):
149
+ if subdf.iloc[-1][f"goes_to_next_line{algo_str}"]:
150
+ saccade_df.loc[subdf.index[-1], f"is_return_sweep{algo_str}"] = True
151
+
152
+ for s in range(len(saccade_df)):
153
+ is_directional_deviation = False
154
+ a = saccade_df["start_time"][s]
155
+ b = saccade_df["end_time"][s]
156
+
157
+ if not gaze_df["x"][[True if (a <= x <= b) else False for x in gaze_df["time"]]].any():
158
+ gaze_df.loc[a:b, "x"] = np.nan
159
+
160
+ # saccade distance in letters
161
+ if saccade_df.loc[s, f"lete{algo_str}"] is None or saccade_df.loc[s, f"lets{algo_str}"] is None:
162
+ ic(
163
+ f"None found for compute_saccade_measures at index {s} for subj {trial_dict['subject']} and trial {trial_dict['trial_id']}"
164
+ )
165
+ else:
166
+ saccade_df.loc[s, f"dist_let{algo_str}"] = (
167
+ saccade_df.loc[s, f"lete{algo_str}"] - saccade_df.loc[s, f"lets{algo_str}"]
168
+ )
169
+
170
+ bool_vec = (gaze_df["time"] >= a) & (gaze_df["time"] <= b)
171
+ if (not gaze_df["x"][bool_vec].isna().any()) and bool_vec.any():
172
+ # saccade peak velocity (vpeak)
173
+ if "calibration_method" not in trial_dict or trial_dict["calibration_method"] != "H3":
174
+ vx = gaze_df.vx[bool_vec]
175
+ vy = gaze_df.vy[bool_vec]
176
+ if not vx.empty and not vy.empty:
177
+ saccade_df.loc[s, f"peak_vel{algo_str}"] = round(np.nanmax(np.sqrt(vx**2 + vy**2)))
178
+ else:
179
+ saccade_df.loc[s, f"peak_vel{algo_str}"] = round(np.nanmax(np.sqrt(gaze_df.vx[bool_vec] ** 2)))
180
+
181
+ if 35 <= abs(saccade_df.loc[s, f"angle{algo_str}"]) <= 145:
182
+ if saccade_df.loc[s, "xe"] - saccade_df.loc[s, "xs"] > 0 and not (
183
+ "blink_before" in saccade_df.columns
184
+ and (saccade_df.loc[s, "blink_before"] or saccade_df.loc[s, "blink_after"])
185
+ ):
186
+ is_directional_deviation = True
187
+
188
+ saccade_df.loc[s, f"is_directional_deviation{algo_str}"] = is_directional_deviation
189
+ return saccade_df.copy()
190
+
191
+
192
+ def get_angle_and_eucl_dist(saccade_df, algo_choice=None):
193
+ if algo_choice is not None:
194
+ algo_str = f"_{algo_choice}"
195
+ else:
196
+ algo_str = ""
197
+ saccade_df["xe_minus_xs"] = saccade_df["xe"] - saccade_df["xs"]
198
+ saccade_df[f"ye_minus_ys{algo_str}"] = saccade_df[f"ye{algo_str}"] - saccade_df[f"ys{algo_str}"]
199
+ saccade_df["eucledian_distance"] = (
200
+ saccade_df["xe_minus_xs"].map(np.square) + saccade_df[f"ye_minus_ys{algo_str}"].map(np.square)
201
+ ).map(np.sqrt)
202
+ saccade_df[f"angle{algo_str}"] = np.arctan2(
203
+ saccade_df.loc[:, f"ye_minus_ys{algo_str}"], saccade_df.loc[:, "xe_minus_xs"]
204
+ ) * (180 / np.pi)
205
+ return saccade_df
206
+
207
+
208
+ def compute_saccade_length(dffix, stimulus_df, algo_choice):
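+ # Incoming (sac_in) and outgoing (sac_out) saccade lengths in letters, handling same-line, forward and backward line changes separately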
209
+
210
+ for j in dffix.index:
211
+ if (
212
+ j == 0
213
+ or pd.isna(dffix.at[j, f"line_num_{algo_choice}"])
214
+ or pd.isna(dffix.at[j - 1, f"line_num_{algo_choice}"])
215
+ or dffix.at[j, f"letternum_{algo_choice}"] is None
216
+ or dffix.at[j - 1, f"letternum_{algo_choice}"] is None
217
+ ):
218
+ continue
219
+
220
+ # Same line, calculate saccade length as difference in letter numbers
221
+ if dffix.at[j - 1, f"line_num_{algo_choice}"] == dffix.at[j, f"line_num_{algo_choice}"]:
222
+ dffix.at[j, f"sac_in_{algo_choice}"] = (
223
+ dffix.at[j, f"letternum_{algo_choice}"] - dffix.at[j - 1, f"letternum_{algo_choice}"]
224
+ )
225
+
226
+ # Go to line ahead, calculate saccade length as difference in minimum letter numbers in target and previous lines, respectively
227
+ elif dffix.at[j - 1, f"line_num_{algo_choice}"] < dffix.at[j, f"line_num_{algo_choice}"]:
228
+ min_stim_j = np.min(
229
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j, f"line_num_{algo_choice}"]]["letternum"]
230
+ )
231
+ min_stim_j_1 = np.min(
232
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j - 1, f"line_num_{algo_choice}"]]["letternum"]
233
+ )
234
+ dffix.at[j, f"sac_in_{algo_choice}"] = (dffix.at[j, f"letternum_{algo_choice}"] - min_stim_j) - (
235
+ dffix.at[j - 1, f"letternum_{algo_choice}"] - min_stim_j_1
236
+ )
237
+
238
+ # Return to line visited before, calculate saccade length as difference in minimum letter numbers in target and next lines, respectively
239
+ elif dffix.at[j - 1, f"line_num_{algo_choice}"] > dffix.at[j, f"line_num_{algo_choice}"]:
240
+ min_stim_j_1 = np.min(
241
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j - 1, f"line_num_{algo_choice}"]]["letternum"]
242
+ )
243
+ min_stim_j = np.min(
244
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j, f"line_num_{algo_choice}"]]["letternum"]
245
+ )
246
+ dffix.at[j, f"sac_in_{algo_choice}"] = (dffix.at[j - 1, f"letternum_{algo_choice}"] - min_stim_j_1) - (
247
+ dffix.at[j, f"letternum_{algo_choice}"] - min_stim_j
248
+ )
249
+
250
+ for j in range(len(dffix) - 1):
251
+ if (
252
+ pd.isna(dffix.at[j, f"line_num_{algo_choice}"])
253
+ or pd.isna(dffix.at[j + 1, f"line_num_{algo_choice}"])
254
+ or dffix.at[j + 1, f"letternum_{algo_choice}"] is None
255
+ or dffix.at[j, f"letternum_{algo_choice}"] is None
256
+ ):
257
+ continue
258
+
259
+ # Same line, calculate saccade length as difference in letter numbers
260
+ if dffix.at[j + 1, f"line_num_{algo_choice}"] == dffix.at[j, f"line_num_{algo_choice}"]:
261
+ dffix.at[j, f"sac_out_{algo_choice}"] = (
262
+ dffix.at[j + 1, f"letternum_{algo_choice}"] - dffix.at[j, f"letternum_{algo_choice}"]
263
+ )
264
+
265
+ elif dffix.at[j + 1, f"line_num_{algo_choice}"] > dffix.at[j, f"line_num_{algo_choice}"]:
266
+ min_stim_j_1 = np.min(
267
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j + 1, f"line_num_{algo_choice}"]]["letternum"]
268
+ )
269
+ min_stim_j = np.min(
270
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j, f"line_num_{algo_choice}"]]["letternum"]
271
+ )
272
+ dffix.at[j, f"sac_out_{algo_choice}"] = (dffix.at[j + 1, f"letternum_{algo_choice}"] - min_stim_j_1) - (
273
+ dffix.at[j, f"letternum_{algo_choice}"] - min_stim_j
274
+ )
275
+
276
+ elif dffix.at[j + 1, f"line_num_{algo_choice}"] < dffix.at[j, f"line_num_{algo_choice}"]:
277
+ min_stim_j_1 = np.min(
278
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j, f"line_num_{algo_choice}"]]["letternum"]
279
+ )
280
+ min_stim_j = np.min(
281
+ stimulus_df[stimulus_df["assigned_line"] == dffix.at[j + 1, f"line_num_{algo_choice}"]]["letternum"]
282
+ )
283
+ dffix.at[j, f"sac_out_{algo_choice}"] = (dffix.at[j, f"letternum_{algo_choice}"] - min_stim_j) - (
284
+ dffix.at[j + 1, f"letternum_{algo_choice}"] - min_stim_j_1
285
+ )
286
+
287
+ return dffix
288
+
289
+
290
+ def compute_launch_distance(dffix, algo_choice):
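+ # Launch-site distance in letters relative to the beginning of the fixated word, derived from the incoming saccade length and the landing position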
291
+
292
+ for i in range(1, dffix.shape[0]):
293
+ if pd.isna(dffix.loc[i, f"sac_in_{algo_choice}"]):
294
+ continue
295
+
296
+ if dffix.loc[i, f"sac_in_{algo_choice}"] >= 0:
297
+ dffix.loc[i, f"word_launch_{algo_choice}"] = (
298
+ dffix.loc[i, f"sac_in_{algo_choice}"] - dffix.loc[i, f"word_land_{algo_choice}"]
299
+ )
300
+
301
+ else:
302
+ dffix.loc[i, f"word_launch_{algo_choice}"] = (
303
+ dffix.loc[i, f"sac_in_{algo_choice}"] + dffix.loc[i - 1, f"word_land_{algo_choice}"]
304
+ )
305
+
306
+ return dffix
307
+
308
+
309
+ def compute_refixation(dffix, algo_choice):
310
+ dffix.loc[:, f"word_refix_{algo_choice}"] = False
311
+ dffix.loc[:, f"sentence_refix_{algo_choice}"] = False
312
+ for j in dffix.index:
313
+ if (
314
+ j == 0
315
+ or pd.isna(dffix.loc[j, f"on_word_number_{algo_choice}"])
316
+ or pd.isna(dffix.loc[j - 1, f"on_word_number_{algo_choice}"])
317
+ ):
318
+ continue
319
+ dffix.loc[j, f"word_refix_{algo_choice}"] = (
320
+ dffix.loc[j, f"on_word_number_{algo_choice}"] == dffix.loc[j - 1, f"on_word_number_{algo_choice}"]
321
+ )
322
+ dffix.loc[j, f"sentence_refix_{algo_choice}"] = (
323
+ dffix.loc[j, f"on_sentence_num_{algo_choice}"] == dffix.loc[j - 1, f"on_sentence_num_{algo_choice}"]
324
+ )
325
+ return dffix
326
+
327
+
328
+ def compute_regression(dffix, algo_choice):
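+ # A fixation counts as a regression-in when it lands on an earlier word/sentence than the previous fixation; the previous fixation is flagged as regression-out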
329
+ tmp = dffix.copy()
330
+ tmp.reset_index(drop=True, inplace=True)
331
+ tmp.loc[:, f"word_reg_out_{algo_choice}"] = False
332
+ tmp.loc[:, f"word_reg_in_{algo_choice}"] = False
333
+ tmp.loc[:, f"word_reg_out_to_{algo_choice}"] = float("nan")
334
+ tmp.loc[:, f"word_reg_in_from_{algo_choice}"] = float("nan")
335
+ tmp.loc[:, f"sentence_reg_out_{algo_choice}"] = False
336
+ tmp.loc[:, f"sentence_reg_in_{algo_choice}"] = False
337
+ tmp.loc[:, f"sentence_reg_out_to_{algo_choice}"] = float("nan")
338
+ tmp.loc[:, f"sentence_reg_in_from_{algo_choice}"] = float("nan")
339
+
340
+ if len(tmp) > 1:
341
+ for j in range(1, len(tmp)):
342
+ # Skip outliers
343
+ if pd.isnull(tmp.iloc[j][f"on_word_number_{algo_choice}"]) or pd.isnull(
344
+ tmp.iloc[j - 1][f"on_word_number_{algo_choice}"]
345
+ ):
346
+ continue
347
+
348
+ # Word
349
+ if tmp.iloc[j][f"on_word_number_{algo_choice}"] < tmp.iloc[j - 1][f"on_word_number_{algo_choice}"]:
350
+ tmp.loc[j, f"word_reg_in_{algo_choice}"] = True
351
+ tmp.loc[j - 1, f"word_reg_out_{algo_choice}"] = True
352
+ tmp.loc[j, f"word_reg_in_from_{algo_choice}"] = tmp.iloc[j - 1][f"on_word_number_{algo_choice}"]
353
+ tmp.loc[j - 1, f"word_reg_out_to_{algo_choice}"] = tmp.iloc[j][f"on_word_number_{algo_choice}"]
354
+
355
+ # Sentence
356
+ if tmp.iloc[j][f"on_sentence_num_{algo_choice}"] < tmp.iloc[j - 1][f"on_sentence_num_{algo_choice}"]:
357
+ tmp.loc[j, f"sentence_reg_in_{algo_choice}"] = True
358
+ tmp.loc[j - 1, f"sentence_reg_out_{algo_choice}"] = True
359
+ tmp.loc[j, f"sentence_reg_in_from_{algo_choice}"] = tmp.iloc[j - 1][f"on_sentence_num_{algo_choice}"]
360
+ tmp.loc[j - 1, f"sentence_reg_out_to_{algo_choice}"] = tmp.iloc[j][f"on_sentence_num_{algo_choice}"]
361
+
362
+ extra_cols = list(set(tmp.columns) - set(dffix.columns))
363
+ # select these columns from tmp and add the 'fixation_number'
364
+ cols_to_add = ["fixation_number"] + extra_cols
365
+
366
+ # merge selected columns to dffix with 'outer' how and 'fixation_number' as common key
367
+ dffix = pd.merge(dffix, tmp[cols_to_add], on="fixation_number", how="outer")
368
+ return dffix
369
+
370
+
371
+ def compute_firstskip(dffix, algo_choice):
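+ # A word/sentence counts as first-pass skipped if it receives its first fixation only after a later word/sentence has already been fixated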
372
+ dffix[f"word_firstskip_{algo_choice}"] = 0
373
+ word_mem = []
374
+
375
+ dffix[f"sentence_firstskip_{algo_choice}"] = 0
376
+ sentence_mem = []
377
+ dffix.reset_index(inplace=True)
378
+ for j in range(dffix.shape[0]):
379
+
380
+ # word
381
+ if (
382
+ dffix.loc[j, f"on_word_number_{algo_choice}"] < np.max(word_mem, initial=0)
383
+ and dffix.loc[j, f"on_word_number_{algo_choice}"] not in word_mem
384
+ ):
385
+ dffix.loc[j, f"word_firstskip_{algo_choice}"] = 1
386
+
387
+ # sent
388
+ if (
389
+ dffix.loc[j, f"on_sentence_num_{algo_choice}"] < np.max(sentence_mem, initial=0)
390
+ and dffix.loc[j, f"on_sentence_num_{algo_choice}"] not in sentence_mem
391
+ ):
392
+ dffix.loc[j, f"sentence_firstskip_{algo_choice}"] = 1
393
+
394
+ word_mem.append(dffix.loc[j, f"on_word_number_{algo_choice}"])
395
+ sentence_mem.append(dffix.loc[j, f"on_sentence_num_{algo_choice}"])
396
+
397
+ # set NA values for missing line numbers
398
+ dffix.loc[dffix[f"line_num_{algo_choice}"].isna(), f"word_firstskip_{algo_choice}"] = np.nan
399
+ dffix.loc[dffix[f"line_num_{algo_choice}"].isna(), f"sentence_firstskip_{algo_choice}"] = np.nan
400
+ dffix.set_index("index", inplace=True)
401
+ return dffix
402
+
403
+
404
+ def compute_run(dffix, algo_choice):
405
+ if "fixation_number" not in dffix.columns and "num" in dffix.columns:
406
+ dffix["fixation_number"] = dffix["num"]
407
+ tmp = dffix.copy()
408
+ tmp.reset_index(inplace=True, drop=True)
409
+ # initialize
410
+ tmp.loc[~tmp[f"on_word_{algo_choice}"].isna(), f"word_runid_{algo_choice}"] = 0
411
+ tmp[f"sentence_runid_{algo_choice}"] = 0
412
+
413
+ # fixation loop
414
+ if len(tmp) > 1:
415
+ for j in range(1, len(tmp)):
416
+
417
+ # word
418
+ if tmp[f"word_reg_in_{algo_choice}"][j] == 1 and tmp[f"word_reg_in_{algo_choice}"][j - 1] != 1:
419
+ tmp.loc[j, f"word_runid_{algo_choice}"] = tmp[f"word_runid_{algo_choice}"][j - 1] + 1
420
+ else:
421
+ tmp.loc[j, f"word_runid_{algo_choice}"] = tmp.loc[j - 1, f"word_runid_{algo_choice}"]
422
+
423
+ # sentence
424
+ if tmp[f"sentence_reg_in_{algo_choice}"][j] == 1 and tmp[f"sentence_reg_in_{algo_choice}"][j - 1] != 1:
425
+ tmp.loc[j, f"sentence_runid_{algo_choice}"] = tmp[f"sentence_runid_{algo_choice}"][j - 1] + 1
426
+ else:
427
+ tmp.loc[j, f"sentence_runid_{algo_choice}"] = tmp[f"sentence_runid_{algo_choice}"][j - 1]
428
+ tmp[f"word_runid_{algo_choice}"] = tmp[f"word_runid_{algo_choice}"] - 1
429
+ tmp[f"sentence_runid_{algo_choice}"] = tmp[f"sentence_runid_{algo_choice}"] - 1
430
+ # fixid in word
431
+ tmp[f"word_fix_{algo_choice}"] = tmp.groupby(f"on_word_number_{algo_choice}")["fixation_number"].transform(
432
+ lambda x: stats.rankdata(x, method="min")
433
+ )
434
+ # fixid in sent
435
+ tmp[f"sentence_fix_{algo_choice}"] = tmp.groupby(f"on_sentence_num_{algo_choice}")["fixation_number"].transform(
436
+ lambda x: stats.rankdata(x, method="min")
437
+ )
438
+
439
+ # runid in word
440
+ tmp["id"] = tmp[f"on_word_number_{algo_choice}"].astype(str) + ":" + tmp[f"word_runid_{algo_choice}"].astype(str)
441
+ fix_tmp = tmp.copy().drop_duplicates(subset="id")
442
+ fix_tmp[f"word_run_{algo_choice}"] = fix_tmp.groupby(f"on_word_number_{algo_choice}")[
443
+ f"word_runid_{algo_choice}"
444
+ ].transform(lambda x: stats.rankdata(x, method="min"))
445
+
446
+ if f"word_run_{algo_choice}" in tmp.columns:
447
+ tmp = tmp.drop(columns=[f"word_run_{algo_choice}"])
448
+ tmp = pd.merge(tmp, fix_tmp[["id", f"word_run_{algo_choice}"]], on="id")
449
+ del tmp["id"]
450
+ tmp = tmp.sort_values("fixation_number")
451
+
452
+ # runid in sentence
453
+ tmp["id"] = (
454
+ tmp[f"on_sentence_num_{algo_choice}"].astype(str) + ":" + tmp[f"sentence_runid_{algo_choice}"].astype(str)
455
+ )
456
+ fix_tmp = tmp.copy().drop_duplicates(subset="id")
457
+ fix_tmp[f"sentence_run_{algo_choice}"] = fix_tmp.groupby(f"on_sentence_num_{algo_choice}")["id"].transform(
458
+ lambda x: stats.rankdata(x, method="min")
459
+ )
460
+ if f"sentence_run_{algo_choice}" in tmp.columns:
461
+ tmp = tmp.drop(columns=[f"sentence_run_{algo_choice}"])
462
+ tmp = pd.merge(tmp, fix_tmp[["id", f"sentence_run_{algo_choice}"]], on="id")
463
+ del tmp["id"]
464
+ tmp = tmp.sort_values("fixation_number")
465
+
466
+ # fixnum in word_run
467
+ tmp["id"] = tmp[f"on_word_number_{algo_choice}"].astype(str) + ":" + tmp[f"word_run_{algo_choice}"].astype(str)
468
+ tmp[f"word_run_fix_{algo_choice}"] = tmp.groupby(["id"])["fixation_number"].rank("first").values
469
+ del tmp["id"]
470
+ tmp = tmp.sort_values("fixation_number")
471
+
472
+ # fixnum in sentence_run
473
+ tmp["id"] = tmp[f"on_sentence_num_{algo_choice}"].astype(str) + ":" + tmp[f"sentence_run_{algo_choice}"].astype(str)
474
+ tmp[f"sentence_run_fix_{algo_choice}"] = tmp.groupby(["id"])["fixation_number"].rank("first").values
475
+ del tmp["id"]
476
+ tmp = tmp.sort_values("fixation_number")
477
+ names = [
478
+ "fixation_number",
479
+ f"word_runid_{algo_choice}",
480
+ f"sentence_runid_{algo_choice}",
481
+ f"word_fix_{algo_choice}",
482
+ f"sentence_fix_{algo_choice}",
483
+ f"word_run_{algo_choice}",
484
+ f"sentence_run_{algo_choice}",
485
+ f"word_run_fix_{algo_choice}",
486
+ f"sentence_run_fix_{algo_choice}",
487
+ ]
488
+ dffix = pd.merge(dffix, tmp[names], on="fixation_number", how="left")
489
+ return dffix.copy()
490
+
491
+
492
+ def compute_landing_position(dffix, algo_choice):
493
+ dffix[f"word_cland_{algo_choice}"] = (
494
+ dffix[f"word_land_{algo_choice}"] - (dffix[f"on_word_{algo_choice}"].str.len() + 1) / 2
495
+ )
496
+ return dffix
497
+
498
+
499
+ def aggregate_words_firstrun(
500
+ fix,
501
+ algo_choice,
502
+ measures_to_calculate=[
503
+ "firstrun_blink",
504
+ "firstrun_skip",
505
+ "firstrun_refix",
506
+ "firstrun_reg_in",
507
+ "firstrun_reg_out",
508
+ "firstrun_dur",
509
+ "firstrun_gopast",
510
+ "firstrun_gopast_sel",
511
+ ],
512
+ ):
513
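+ # A word's "first run" is its first sequence of consecutive fixations
+ # (word_run == 1); all firstrun_* measures below are computed on that subset.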
+ firstruntmp = fix.loc[fix[f"word_run_{algo_choice}"] == 1].copy()
514
+
515
+ firstrun = firstruntmp.drop_duplicates(subset=f"on_word_number_{algo_choice}", keep="first").copy()
516
+
517
+ names = [
518
+ "subject",
519
+ "trial_id",
520
+ "item",
521
+ "condition",
522
+ f"on_word_number_{algo_choice}",
523
+ f"on_word_{algo_choice}",
524
+ "fixation_number",
525
+ ]
526
+ firstrun = firstrun[names].sort_values(f"on_word_number_{algo_choice}")
527
+
528
+ # compute measures
529
+ firstrun[f"firstrun_nfix_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
530
+ "fixation_number"
531
+ ].transform(
532
+ "count"
533
+ ) # Required for many other measures
534
+ firstrun[f"firstrun_nfix_{algo_choice}"] = firstrun[f"firstrun_nfix_{algo_choice}"].fillna(0)
535
+ if "firstrun_blink" in measures_to_calculate:
536
+ if "blink" in firstruntmp:
537
+ firstrun[f"firstrun_blink_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
538
+ "blink"
539
+ ].transform("max")
540
+ else:
541
+ firstrun[f"firstrun_blink_{algo_choice}"] = 0
542
+
543
+ if "firstrun_skip" in measures_to_calculate:
544
+ firstrun[f"firstrun_skip_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
545
+ f"word_firstskip_{algo_choice}"
546
+ ].transform("max")
547
+ if "firstrun_refix" in measures_to_calculate:
548
+ firstrun[f"firstrun_refix_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
549
+ f"word_refix_{algo_choice}"
550
+ ].transform("max")
551
+ if "firstrun_reg_in" in measures_to_calculate:
552
+ firstrun[f"firstrun_reg_in_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
553
+ f"word_reg_out_{algo_choice}"
554
+ ].transform("max")
555
+ if "firstrun_reg_out" in measures_to_calculate:
556
+ firstrun[f"firstrun_reg_out_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
557
+ f"word_reg_in_{algo_choice}"
558
+ ].transform("max")
559
+ if "firstrun_dur" in measures_to_calculate:
560
+ firstrun[f"firstrun_dur_{algo_choice}"] = firstruntmp.groupby(f"on_word_number_{algo_choice}")[
561
+ "duration"
562
+ ].transform("sum")
563
+ firstrun = firstrun.sort_values(["trial_id", f"on_word_number_{algo_choice}"]).copy()
564
+
565
+ return firstrun
566
+
567
+
568
+ def compute_gopast_word(fixations_dataframe, algo_choice):
569
+
570
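+ # For each word, gopast sums the durations of all fixations from the first
+ # fixation on the word until a later word in the text is fixated (regression
+ # path duration), while selgopast only counts the fixations that land on the
+ # word itself (selective go-past time).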
+ ias = np.unique(fixations_dataframe.loc[:, f"on_word_number_{algo_choice}"])
571
+
572
+ for j in range(len(ias) - 1):
573
+ fixations_dataframe.loc[
574
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] == ias[j]), f"gopast_{algo_choice}"
575
+ ] = np.nansum(
576
+ fixations_dataframe.loc[
577
+ (
578
+ fixations_dataframe["fixation_number"]
579
+ >= np.min(
580
+ fixations_dataframe.loc[
581
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] == ias[j]), "fixation_number"
582
+ ]
583
+ )
584
+ )
585
+ & (
586
+ fixations_dataframe["fixation_number"]
587
+ < np.min(
588
+ fixations_dataframe.loc[
589
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] > ias[j]), "fixation_number"
590
+ ]
591
+ )
592
+ )
593
+ & (~fixations_dataframe[f"on_word_number_{algo_choice}"].isna())
594
+ ]["duration"]
595
+ )
596
+
597
+ fixations_dataframe.loc[
598
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] == ias[j]), f"selgopast_{algo_choice}"
599
+ ] = np.nansum(
600
+ fixations_dataframe.loc[
601
+ (
602
+ fixations_dataframe["fixation_number"]
603
+ >= np.min(
604
+ fixations_dataframe.loc[
605
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] == ias[j]), "fixation_number"
606
+ ]
607
+ )
608
+ )
609
+ & (
610
+ fixations_dataframe["fixation_number"]
611
+ < np.min(
612
+ fixations_dataframe.loc[
613
+ (fixations_dataframe[f"on_word_number_{algo_choice}"] > ias[j]), "fixation_number"
614
+ ]
615
+ )
616
+ )
617
+ & (fixations_dataframe[f"on_word_number_{algo_choice}"] == ias[j])
618
+ & (~fixations_dataframe[f"on_word_number_{algo_choice}"].isna())
619
+ ]["duration"]
620
+ )
621
+ return fixations_dataframe
622
+
623
+
624
+ def aggregate_words(
625
+ fix,
626
+ word_item,
627
+ algo_choice,
628
+ measures_to_calculate=[
629
+ "blink",
630
+ ],
631
+ ):
632
+ wordtmp = fix.copy()
633
+
634
+ word = wordtmp.drop_duplicates(subset=f"on_word_number_{algo_choice}", keep="first").copy()
635
+ names = [
636
+ f"on_sentence_num_{algo_choice}",
637
+ f"on_word_number_{algo_choice}",
638
+ f"on_word_{algo_choice}",
639
+ ]
640
+ word = word.loc[:, names].sort_values(by=f"on_word_number_{algo_choice}")
641
+
642
+ wordtmp = compute_gopast_word(wordtmp, algo_choice)
643
+
644
+ if "blink" in measures_to_calculate:
645
+ if "blink" in wordtmp:
646
+ word[f"blink_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")["blink"].transform("max")
647
+ else:
648
+ word[f"blink_{algo_choice}"] = 0
649
+ if "nrun" in measures_to_calculate or "reread" in measures_to_calculate:
650
+ word[f"nrun_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
651
+ f"word_run_{algo_choice}"
652
+ ].transform("max")
653
+ if "reread" in measures_to_calculate:
654
+ word[f"reread_{algo_choice}"] = word[f"nrun_{algo_choice}"] > 1
655
+ word[f"number_of_fixations_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
656
+ "fixation_number"
657
+ ].transform("count")
658
+ if "refix" in measures_to_calculate:
659
+ word[f"refix_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
660
+ f"word_refix_{algo_choice}"
661
+ ].transform("max")
662
+ if "reg_in" in measures_to_calculate:
663
+ word[f"reg_in_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
664
+ f"word_reg_in_{algo_choice}"
665
+ ].transform("max")
666
+ if "reg_out" in measures_to_calculate:
667
+ word[f"reg_out_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
668
+ f"word_reg_out_{algo_choice}"
669
+ ].transform("max")
670
+ if "total_fixation_duration" in measures_to_calculate:
671
+ word[f"total_fixation_duration_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
672
+ "duration"
673
+ ].transform("sum")
674
+ if "gopast" in measures_to_calculate and f"gopast_{algo_choice}" in wordtmp.columns:
675
+ word[f"gopast_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
676
+ f"gopast_{algo_choice}"
677
+ ].transform("max")
678
+ word[f"gopast_{algo_choice}"] = word[f"gopast_{algo_choice}"].fillna(0)
679
+
680
+ if "gopast_sel" in measures_to_calculate and f"selgopast_{algo_choice}" in wordtmp.columns:
681
+ word[f"gopast_sel_{algo_choice}"] = wordtmp.groupby(f"on_word_number_{algo_choice}")[
682
+ f"selgopast_{algo_choice}"
683
+ ].transform("max")
684
+ word[f"gopast_sel_{algo_choice}"] = word[f"gopast_sel_{algo_choice}"].fillna(0)
685
+
686
+ word.rename({f"on_word_number_{algo_choice}": "word_number"}, axis=1, inplace=True)
687
+ word = pd.merge(
688
+ word.reset_index(drop=True), word_item.reset_index(drop=True), on="word_number", how="right", validate="1:1"
689
+ )
690
+ word[f"number_of_fixations_{algo_choice}"] = word[f"number_of_fixations_{algo_choice}"].fillna(0)
691
+ if "total_fixation_duration" in measures_to_calculate:
692
+ word[f"total_fixation_duration_{algo_choice}"] = word[f"total_fixation_duration_{algo_choice}"].fillna(0)
693
+
694
+ word[f"skip_{algo_choice}"] = 0
695
+ if "blink" in measures_to_calculate:
696
+ word.loc[word[f"blink_{algo_choice}"].isna(), f"skip_{algo_choice}"] = 1
697
+ word.loc[word[f"number_of_fixations_{algo_choice}"] == 0, f"skip_{algo_choice}"] = 1
698
+ word[f"skip_{algo_choice}"] = word[f"skip_{algo_choice}"].astype("boolean")
699
+
700
+ if "number_of_fixations" not in measures_to_calculate:
701
+ word = word.drop(columns=f"number_of_fixations_{algo_choice}")
702
+ if "blink" in measures_to_calculate:
703
+ word[f"blink_{algo_choice}"] = word[f"blink_{algo_choice}"].astype("boolean")
704
+
705
+ word = word.sort_values(by=["word_number"])
706
+
707
+ if "condition" in wordtmp.columns and "condition" not in word.columns:
708
+ word.insert(loc=0, column="condition", value=wordtmp["condition"].iloc[0])
709
+ if "item" in wordtmp.columns and "item" not in word.columns:
710
+ word.insert(loc=0, column="item", value=wordtmp["item"].iloc[0])
711
+ if "trial_id" in wordtmp.columns and "trial_id" not in word.columns:
712
+ word.insert(loc=0, column="trial_id", value=wordtmp["trial_id"].iloc[0])
713
+ if "subject" in wordtmp.columns and "subject" not in word.columns:
714
+ word.insert(loc=0, column="subject", value=wordtmp["subject"].iloc[0])
715
+
716
+ return word
717
+
718
+
719
+ def combine_words(fix, wordfirst, wordtmp, algo_choice, measures_to_calculate):
720
+
721
+ subject = wordtmp["subject"].values[0]
722
+ trial_id = wordtmp["trial_id"].values[0]
723
+ item = wordtmp["item"].values[0]
724
+ condition = wordtmp["condition"].values[0]
725
+ wordtmp = wordtmp.loc[
726
+ :,
727
+ [
728
+ c
729
+ for c in [
730
+ "word_number",
731
+ "word",
732
+ f"blink_{algo_choice}",
733
+ f"skip_{algo_choice}",
734
+ f"nrun_{algo_choice}",
735
+ f"reread_{algo_choice}",
736
+ f"number_of_fixations_{algo_choice}",
737
+ f"refix_{algo_choice}",
738
+ f"reg_in_{algo_choice}",
739
+ f"reg_out_{algo_choice}",
740
+ f"total_fixation_duration_{algo_choice}",
741
+ f"gopast_{algo_choice}",
742
+ f"gopast_sel_{algo_choice}",
743
+ ]
744
+ if c in wordtmp.columns
745
+ ],
746
+ ]
747
+
748
+ wordfirsttmp = wordfirst.loc[
749
+ :,
750
+ [
751
+ c
752
+ for c in [
753
+ f"on_word_number_{algo_choice}",
754
+ f"firstrun_skip_{algo_choice}",
755
+ f"firstrun_nfix_{algo_choice}",
756
+ f"firstrun_refix_{algo_choice}",
757
+ f"firstrun_reg_in_{algo_choice}",
758
+ f"firstrun_reg_out_{algo_choice}",
759
+ f"firstrun_dur_{algo_choice}",
760
+ f"firstrun_gopast_{algo_choice}",
761
+ f"firstrun_gopast_sel_{algo_choice}",
762
+ ]
763
+ if c in wordfirst.columns
764
+ ],
765
+ ]
766
+
767
+ fixtmp = fix[(fix[f"word_run_{algo_choice}"] == 1) & (fix[f"word_run_fix_{algo_choice}"] == 1)].copy()
768
+ names = [
769
+ c
770
+ for c in [
771
+ f"on_word_number_{algo_choice}",
772
+ f"sac_in_{algo_choice}",
773
+ f"sac_out_{algo_choice}",
774
+ f"word_launch_{algo_choice}",
775
+ f"word_land_{algo_choice}",
776
+ f"word_cland_{algo_choice}",
777
+ f"duration",
778
+ ]
779
+ if c in fixtmp.columns
780
+ ]
781
+ fixtmp = fixtmp[names].copy()
782
+ fixtmp.rename(
783
+ {
784
+ f"sac_in_{algo_choice}": f"firstfix_sac_in_{algo_choice}",
785
+ f"sac_out_{algo_choice}": f"firstfix_sac_out_{algo_choice}",
786
+ f"word_launch_{algo_choice}": f"firstfix_launch_{algo_choice}",
787
+ f"word_land_{algo_choice}": f"firstfix_land_{algo_choice}",
788
+ f"word_cland_{algo_choice}": f"firstfix_cland_{algo_choice}",
789
+ f"duration": f"firstfix_dur_{algo_choice}",
790
+ },
791
+ axis=1,
792
+ inplace=True,
793
+ )
794
+ comb = pd.merge(
795
+ pd.merge(
796
+ wordtmp,
797
+ wordfirsttmp.rename({f"on_word_number_{algo_choice}": "word_number"}, axis=1),
798
+ on="word_number",
799
+ how="left",
800
+ ),
801
+ fixtmp.rename({f"on_word_number_{algo_choice}": "word_number"}, axis=1),
802
+ on="word_number",
803
+ how="left",
804
+ )
805
+
806
+ dropcols = [
807
+ c
808
+ for c in [
809
+ f"firstrun_skip_{algo_choice}",
810
+ f"firstrun_refix_{algo_choice}",
811
+ f"firstrun_reg_in_{algo_choice}",
812
+ f"firstrun_reg_out_{algo_choice}",
813
+ f"firstrun_dur_{algo_choice}",
814
+ f"firstrun_gopast_{algo_choice}",
815
+ f"firstrun_gopast_sel_{algo_choice}",
816
+ f"firstfix_sac_in_{algo_choice}",
817
+ f"firstfix_sac_out_{algo_choice}",
818
+ f"firstfix_launch_{algo_choice}",
819
+ f"firstfix_land_{algo_choice}",
820
+ f"firstfix_cland_{algo_choice}",
821
+ f"firstfix_dur_{algo_choice}",
822
+ ]
823
+ if ((c.replace(f"_{algo_choice}", "") not in measures_to_calculate) & (c in comb.columns))
824
+ ]
825
+ comb = comb.drop(columns=dropcols).copy()
826
+ comb.sort_values(by="word_number", inplace=True)
827
+
828
+ # recompute firstrun skip (word skips also count as firstrun skips)
829
+ if f"skip_{algo_choice}" in comb.columns and f"firstrun_skip_{algo_choice}" in comb.columns:
830
+ comb.loc[comb[f"skip_{algo_choice}"] == 1, f"firstrun_skip_{algo_choice}"] = 1
831
+
832
+ # gopast time in firstrun
833
+ if f"gopast_{algo_choice}" in comb.columns and "firstrun_gopast" in measures_to_calculate:
834
+ comb[f"firstrun_gopast_{algo_choice}"] = comb[f"gopast_{algo_choice}"]
835
+ if f"gopast_sel_{algo_choice}" in comb.columns and "firstrun_gopast_sel" in measures_to_calculate:
836
+ comb[f"firstrun_gopast_sel_{algo_choice}"] = comb[f"gopast_sel_{algo_choice}"]
837
+ if f"gopast_{algo_choice}" in comb.columns:
838
+ comb.drop(columns=[f"gopast_{algo_choice}"], inplace=True)
839
+
840
+ if f"gopast_sel_{algo_choice}" in comb.columns:
841
+ comb.drop(columns=[f"gopast_sel_{algo_choice}"], inplace=True)
842
+
843
+ if f"firstrun_nfix_{algo_choice}" in comb.columns and "singlefix" in measures_to_calculate:
844
+ comb[f"singlefix_{algo_choice}"] = 0
845
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_{algo_choice}"] = 1
846
+
847
+ if f"firstfix_sac_in_{algo_choice}" in comb.columns and "singlefix_sac_in" in measures_to_calculate:
848
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_sac_in_{algo_choice}"] = comb[
849
+ f"firstfix_sac_in_{algo_choice}"
850
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
851
+
852
+ if f"firstfix_sac_out_{algo_choice}" in comb.columns and "singlefix_sac_out" in measures_to_calculate:
853
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_sac_out_{algo_choice}"] = comb[
854
+ f"firstfix_sac_out_{algo_choice}"
855
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
856
+
857
+ if f"firstfix_launch_{algo_choice}" in comb.columns and "singlefix_launch" in measures_to_calculate:
858
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_launch_{algo_choice}"] = comb[
859
+ f"firstfix_launch_{algo_choice}"
860
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
861
+
862
+ if f"firstfix_land_{algo_choice}" in comb.columns and "singlefix_land" in measures_to_calculate:
863
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_land_{algo_choice}"] = comb[
864
+ f"firstfix_land_{algo_choice}"
865
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
866
+
867
+ if f"firstfix_cland_{algo_choice}" in comb.columns and "singlefix_cland" in measures_to_calculate:
868
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_cland_{algo_choice}"] = comb[
869
+ f"firstfix_cland_{algo_choice}"
870
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
871
+
872
+ if f"firstfix_dur_{algo_choice}" in comb.columns and "singlefix_dur" in measures_to_calculate:
873
+ comb.loc[(comb[f"firstrun_nfix_{algo_choice}"] == 1), f"singlefix_dur_{algo_choice}"] = comb[
874
+ f"firstfix_dur_{algo_choice}"
875
+ ][(comb[f"firstrun_nfix_{algo_choice}"] == 1)]
876
+
877
+ if "condition" not in comb.columns:
878
+ comb.insert(loc=0, column="condition", value=condition)
879
+ if "item" not in comb.columns:
880
+ comb.insert(loc=0, column="item", value=item)
881
+ if "trial_id" not in comb.columns:
882
+ comb.insert(loc=0, column="trial_id", value=trial_id)
883
+ if "subject" not in comb.columns:
884
+ comb.insert(loc=0, column="subject", value=subject)
885
+ return comb.copy()
886
+
887
+
888
+ def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
889
+ sentitem = stimmat.drop_duplicates(
890
+ subset="in_sentence_number", keep="first"
891
+ ) # TODO check why there are rows with sent number None
892
+ fixin = fix.copy().reset_index(drop=True)
893
+
894
+ fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
895
+
896
+ # Recompute sentence number (two fixation exception rule)
897
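+ # If a fixation falls on a different sentence than its neighbours but one of the
+ # next two fixations returns to the preceding sentence, the stray fixation is
+ # re-assigned to that preceding sentence.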
+ for j in range(1, len(fixin) - 1):
898
+ if fixin.loc[j, "on_sentence_num2"] != fixin.loc[j - 1, "on_sentence_num2"]:
899
+ if j + 1 in fixin.index and fixin.loc[j + 1, "on_sentence_num2"] == fixin.loc[j - 1, "on_sentence_num2"]:
900
+ fixin.loc[j, "on_sentence_num2"] = fixin.loc[j - 1, "on_sentence_num2"]
901
+ elif j + 2 in fixin.index and fixin.loc[j + 2, "on_sentence_num2"] == fixin.loc[j - 1, "on_sentence_num2"]:
902
+ fixin.loc[j, "on_sentence_num2"] = fixin.loc[j - 1, "on_sentence_num2"]
903
+
904
+ fixin["id"] = fixin.apply(lambda row: f"{row['on_sentence_num2']}", axis=1)
905
+
906
+ fixin[f"sent_reg_in2_{algo_choice}"] = 0
907
+ fixin[f"sent_reg_out2_{algo_choice}"] = 0
908
+
909
+ fixin[f"sent_runid2_{algo_choice}"] = 1
910
+
911
+ fixin.loc[0, "last"] = fixin.loc[0, "id"]
912
+ fixin.loc[0, f"firstpass_{algo_choice}"] = 1
913
+ mem = [fixin.loc[0, "on_sentence_num2"]]
914
+ wordmem = [fixin.loc[0, f"on_word_number_{algo_choice}"]]
915
+ fixin.loc[0, f"forward_{algo_choice}"] = 1
916
+
917
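+ # Single pass over fixations: record the previous sentence id, flag transitions
+ # between sentences (reg_in on the fixation entering a sentence, reg_out on the
+ # one just left), extend the sentence run id, and mark first-pass fixations as
+ # well as forward fixations (fixations landing on a word further to the right
+ # than any word fixated so far).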
+ for j in range(1, len(fixin)):
918
+ fixin.loc[j, "last"] = fixin.loc[j - 1, "id"]
919
+
920
+ if fixin.loc[j, "on_sentence_num2"] != fixin.loc[j - 1, "on_sentence_num2"]:
921
+ fixin.loc[j, f"sent_reg_in2_{algo_choice}"] = 1
922
+ fixin.loc[j - 1, f"sent_reg_out2_{algo_choice}"] = 1
923
+ fixin.loc[j, f"sent_reg_in_from2_{algo_choice}"] = fixin.loc[j - 1, "on_sentence_num2"]
924
+ fixin.loc[j - 1, f"sent_reg_out_to2_{algo_choice}"] = fixin.loc[j, "on_sentence_num2"]
925
+
926
+ if fixin.loc[j, f"sent_reg_in2_{algo_choice}"] == 1 and fixin.loc[j - 1, f"sent_reg_in2_{algo_choice}"] != 1:
927
+ fixin.loc[j, f"sent_runid2_{algo_choice}"] = fixin.loc[j - 1, f"sent_runid2_{algo_choice}"] + 1
928
+ else:
929
+ fixin.loc[j, f"sent_runid2_{algo_choice}"] = fixin.loc[j - 1, f"sent_runid2_{algo_choice}"]
930
+
931
+ if fixin.loc[j, "on_sentence_num2"] >= fixin.loc[j - 1, "on_sentence_num2"]:
932
+ if fixin.loc[j, "on_sentence_num2"] in mem:
933
+ if fixin.loc[j, "on_sentence_num2"] == max(mem):
934
+ fixin.loc[j, f"firstpass_{algo_choice}"] = 1
935
+ else:
936
+ fixin.loc[j, f"firstpass_{algo_choice}"] = 0
937
+ else:
938
+ mem.append(fixin.loc[j, "on_sentence_num2"])
939
+ fixin.loc[j, f"firstpass_{algo_choice}"] = 1
940
+ else:
941
+ fixin.loc[j, f"firstpass_{algo_choice}"] = 0
942
+
943
+ if fixin.loc[j, f"on_word_number_{algo_choice}"] > max(wordmem):
944
+ wordmem.append(fixin.loc[j, f"on_word_number_{algo_choice}"])
945
+ fixin.loc[j, f"forward_{algo_choice}"] = 1
946
+ elif fixin.loc[j, f"on_word_number_{algo_choice}"] < max(wordmem):
947
+ fixin.loc[j, f"forward_{algo_choice}"] = 0
948
+
949
+ for i in range(len(fixin) - 3):
950
+ if fixin.loc[i, f"line_change_{algo_choice}"] > 0:
951
+ fixin.loc[i, "on_word_number"] = 0
952
+ fixin.loc[i + 1, f"forward_{algo_choice}"] = 1
953
+ fixin.loc[i + 2, f"forward_{algo_choice}"] = 1
954
+ fixin.loc[i + 3, f"forward_{algo_choice}"] = 1
955
+
956
+ for i in range(1, len(fixin) - 3):
957
+ if fixin.loc[i, "on_sentence_num2"] > fixin.loc[i - 1, "on_sentence_num2"]:
958
+ fixin.loc[i + 1, f"forward_{algo_choice}"] = 1
959
+ fixin.loc[i + 2, f"forward_{algo_choice}"] = 1
960
+
961
+ fixin["id2"] = fixin["id"] + ":" + fixin[f"sent_runid2_{algo_choice}"].astype(str)
962
+
963
+ fixin = fixin.sort_values(["trial_id", "fixation_number"])
964
+
965
+ sent = fixin.copy().drop_duplicates(subset="id", keep="first")
966
+ names = [
967
+ "id",
968
+ "subject",
969
+ "trial_id",
970
+ "item",
971
+ "condition",
972
+ "on_sentence_num2",
973
+ f"on_sentence_num_{algo_choice}",
974
+ f"on_sentence_{algo_choice}",
975
+ "num_words_in_sentence",
976
+ ]
977
+ sent = sent[names].reset_index(drop=True)
978
+
979
+ sent[f"firstrun_skip_{algo_choice}"] = 0
980
+
981
+ mem = []
982
+ for j in range(len(sent)):
983
+ if not pd.isna(sent.loc[j, f"on_sentence_num_{algo_choice}"]):
984
+ if len(mem) > 0 and sent.loc[j, f"on_sentence_num_{algo_choice}"] < max(mem) and not pd.isna(max(mem)):
985
+ sent.loc[j, f"firstrun_skip_{algo_choice}"] = 1
986
+ if (
987
+ not pd.isna(sent.loc[j, f"on_sentence_num_{algo_choice}"])
988
+ and sent.loc[j, f"on_sentence_num_{algo_choice}"] not in mem
989
+ ):
990
+ mem.append(sent.loc[j, f"on_sentence_num_{algo_choice}"])
991
+
992
+ if "total_n_fixations" in measures_to_calc:
993
+ tmp = fixin.groupby("id")["duration"].count().reset_index()
994
+ tmp.columns = ["id", f"total_n_fixations_{algo_choice}"]
995
+ sent = pd.merge(sent, tmp, on="id", how="left")
996
+ sent.fillna({f"total_n_fixations_{algo_choice}": 0}, inplace=True)
997
+
998
+ tmp = fixin.groupby("id")["duration"].sum().reset_index()
999
+ tmp.columns = ["id", f"total_dur_{algo_choice}"]
1000
+ sent = pd.merge(sent, tmp, on="id", how="left")
1001
+ sent.fillna({f"total_dur_{algo_choice}": 0}, inplace=True)
1002
+
1003
+ if "firstpass_n_fixations" in measures_to_calc:
1004
+ tmp = fixin[fixin[f"firstpass_{algo_choice}"] == 1].groupby("id")["duration"].count().reset_index()
1005
+ tmp.columns = ["id", f"firstpass_n_fixations_{algo_choice}"]
1006
+ sent = pd.merge(sent, tmp, on="id", how="left")
1007
+ sent.fillna({f"firstpass_n_fixations_{algo_choice}": 0}, inplace=True)
1008
+
1009
+ if "firstpass_dur" in measures_to_calc:
1010
+ tmp = fixin[fixin[f"firstpass_{algo_choice}"] == 1].groupby("id")["duration"].sum().reset_index()
1011
+ tmp.columns = ["id", f"firstpass_dur_{algo_choice}"]
1012
+ sent = pd.merge(sent, tmp, on="id", how="left")
1013
+ sent.fillna({f"firstpass_dur_{algo_choice}": 0}, inplace=True)
1014
+
1015
+ if "firstpass_forward_n_fixations" in measures_to_calc:
1016
+ tmp = (
1017
+ fixin[(fixin[f"firstpass_{algo_choice}"] == 1) & (fixin[f"forward_{algo_choice}"] == 1)]
1018
+ .groupby("id")["duration"]
1019
+ .count()
1020
+ .reset_index()
1021
+ )
1022
+ tmp.columns = ["id", f"firstpass_forward_n_fixations_{algo_choice}"]
1023
+ sent = pd.merge(sent, tmp, on="id", how="left")
1024
+ sent.fillna({f"firstpass_forward_n_fixations_{algo_choice}": 0}, inplace=True)
1025
+
1026
+ if "firstpass_forward_dur" in measures_to_calc:
1027
+ tmp = (
1028
+ fixin[(fixin[f"firstpass_{algo_choice}"] == 1) & (fixin[f"forward_{algo_choice}"] == 1)]
1029
+ .groupby("id")["duration"]
1030
+ .sum()
1031
+ .reset_index()
1032
+ )
1033
+ tmp.columns = ["id", f"firstpass_forward_dur_{algo_choice}"]
1034
+ sent = pd.merge(sent, tmp, on="id", how="left")
1035
+ sent.fillna({f"firstpass_forward_dur_{algo_choice}": 0}, inplace=True)
1036
+
1037
+ if "firstpass_reread_n_fixations" in measures_to_calc:
1038
+ tmp = (
1039
+ fixin[(fixin[f"firstpass_{algo_choice}"] == 1) & (fixin[f"forward_{algo_choice}"] == 0)]
1040
+ .groupby("id")["duration"]
1041
+ .count()
1042
+ .reset_index()
1043
+ )
1044
+ tmp.columns = ["id", f"firstpass_reread_n_fixations_{algo_choice}"]
1045
+ sent = pd.merge(sent, tmp, on="id", how="left")
1046
+ sent.fillna({f"firstpass_reread_n_fixations_{algo_choice}": 0}, inplace=True)
1047
+
1048
+ if "firstpass_reread_dur" in measures_to_calc:
1049
+ tmp = (
1050
+ fixin[(fixin[f"firstpass_{algo_choice}"] == 1) & (fixin[f"forward_{algo_choice}"] == 0)]
1051
+ .groupby("id")["duration"]
1052
+ .sum()
1053
+ .reset_index()
1054
+ )
1055
+ tmp.columns = ["id", f"firstpass_reread_dur_{algo_choice}"]
1056
+ sent = pd.merge(sent, tmp, on="id", how="left")
1057
+ sent.fillna({f"firstpass_reread_dur_{algo_choice}": 0}, inplace=True)
1058
+
1059
+ if sum(fixin[f"firstpass_{algo_choice}"] == 0) != 0:
1060
+ if "lookback_n_fixations" in measures_to_calc:
1061
+ tmp = fixin[fixin[f"firstpass_{algo_choice}"] == 0].groupby("id")["duration"].count().reset_index()
1062
+ tmp.columns = ["id", f"lookback_n_fixations_{algo_choice}"]
1063
+ sent = pd.merge(sent, tmp, on="id", how="left")
1064
+ sent.fillna({f"lookback_n_fixations_{algo_choice}": 0}, inplace=True)
1065
+
1066
+ if "lookback_dur" in measures_to_calc:
1067
+ tmp = fixin[fixin[f"firstpass_{algo_choice}"] == 0].groupby("id")["duration"].sum().reset_index()
1068
+ tmp.columns = ["id", f"lookback_dur_{algo_choice}"]
1069
+ sent = pd.merge(sent, tmp, on="id", how="left")
1070
+ sent.fillna({f"lookback_dur_{algo_choice}": 0}, inplace=True)
1071
+
1072
+ fixin["id2"] = fixin.apply(lambda row: f"{row['id']}:{row[f'sent_runid2_{algo_choice}']}", axis=1)
1073
+ sent2 = fixin.drop_duplicates(subset="id2", keep="first")
1074
+ sent3 = sent2[(sent2[f"firstpass_{algo_choice}"] == 0) & (~pd.isna(sent2[f"sent_reg_in_from2_{algo_choice}"]))]
1075
+
1076
+ tmp = fixin[fixin["id2"].isin(sent3["id2"])].groupby("id")["duration"].count().reset_index()
1077
+ tmp.columns = ["id", f"lookfrom_n_fixations_{algo_choice}"]
1078
+ tmp2 = pd.merge(tmp, sent3)
1079
+ tmp3 = tmp2.groupby("last")[f"lookfrom_n_fixations_{algo_choice}"].sum().reset_index()
1080
+ tmp3.columns = ["last", f"lookfrom_n_fixations_{algo_choice}"]
1081
+ sent = pd.merge(sent, tmp3, left_on="id", right_on="last", how="left")
1082
+ sent.fillna({f"lookfrom_n_fixations_{algo_choice}": 0}, inplace=True)
1083
+
1084
+ if "lookfrom_dur" in measures_to_calc:
1085
+ tmp = fixin[fixin["id2"].isin(sent3["id2"])].groupby("id")["duration"].sum().reset_index()
1086
+ tmp.columns = ["id", f"lookfrom_dur_{algo_choice}"]
1087
+ tmp2 = pd.merge(tmp, sent3)
1088
+ tmp3 = tmp2.groupby("last")[f"lookfrom_dur_{algo_choice}"].sum().reset_index()
1089
+ tmp3.columns = ["last", f"lookfrom_dur_{algo_choice}"]
1090
+ sent = pd.merge(sent, tmp3, left_on="id", right_on="last", how="left")
1091
+ sent.fillna({f"lookfrom_dur_{algo_choice}": 0}, inplace=True)
1092
+
1093
+ # Firstrun
1094
+ firstruntmp = fixin[fixin[f"sentence_run_{algo_choice}"] == 1]
1095
+
1096
+ if "firstrun_reg_in" in measures_to_calc:
1097
+ tmp = firstruntmp.groupby("id")[f"sent_reg_in2_{algo_choice}"].max().reset_index()
1098
+ tmp.columns = ["id", f"firstrun_reg_in_{algo_choice}"]
1099
+ sent = pd.merge(sent, tmp, on="id", how="left")
1100
+ sent.fillna({f"firstrun_reg_in_{algo_choice}": 0}, inplace=True)
1101
+
1102
+ if "firstrun_reg_out" in measures_to_calc:
1103
+ tmp = firstruntmp.groupby("id")[f"sent_reg_out2_{algo_choice}"].max().reset_index()
1104
+ tmp.columns = ["id", f"firstrun_reg_out_{algo_choice}"]
1105
+ sent = pd.merge(sent, tmp, on="id", how="left")
1106
+ sent.fillna({f"firstrun_reg_out_{algo_choice}": 0}, inplace=True)
1107
+
1108
+ # Complete sentence
1109
+ gopasttmp = fixin.copy()
1110
+ gopasttmp[f"on_sentence_num_{algo_choice}"] = gopasttmp["on_sentence_num2"]
1111
+ tmp = compute_gopast_sentence(gopasttmp, algo_choice)
1112
+ names = ["id", f"gopast_{algo_choice}", f"selgopast_{algo_choice}"]
1113
+ tmp = tmp[names]
1114
+ tmp = tmp.drop_duplicates(subset="id", keep="first")
1115
+ tmp.columns = ["id", f"gopast_{algo_choice}", f"gopast_sel_{algo_choice}"]
1116
+ sent = pd.merge(sent, tmp, on="id", how="left")
1117
+
1118
+ # Nrun
1119
+ tmp = fixin.groupby("id")[f"sentence_run_{algo_choice}"].max().reset_index()
1120
+ tmp.columns = ["id", f"nrun_{algo_choice}"]
1121
+ sent = pd.merge(sent, tmp, on="id", how="left")
1122
+
1123
+ # Reread
1124
+ sent[f"reread_{algo_choice}"] = sent.apply(lambda row: 1 if row[f"nrun_{algo_choice}"] > 1 else 0, axis=1)
1125
+
1126
+ # Reg_in
1127
+ tmp = fixin.groupby("id")[f"sent_reg_in2_{algo_choice}"].max().reset_index()
1128
+ tmp.columns = ["id", f"reg_in_{algo_choice}"]
1129
+ sent = pd.merge(sent, tmp, on="id", how="left")
1130
+
1131
+ # Reg_out
1132
+ tmp = fixin.groupby("id")[f"sent_reg_out2_{algo_choice}"].max().reset_index()
1133
+ tmp.columns = ["id", f"reg_out_{algo_choice}"]
1134
+ sent = pd.merge(sent, tmp, on="id", how="left")
1135
+
1136
+ sent = sent.sort_values(by=f"on_sentence_num_{algo_choice}").reset_index(drop=True)
1137
+
1138
+ # Rate: reading rate in words per minute (durations are in ms)
1139
+ sent[f"rate_{algo_choice}"] = round(60000 / (sent[f"total_dur_{algo_choice}"] / sent["num_words_in_sentence"]))
1140
+
1141
+ # Write out
1142
+ item = sentitem.copy()
1143
+
1144
+ sent = pd.merge(
1145
+ sent,
1146
+ item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
1147
+ on=f"on_sentence_num_{algo_choice}",
1148
+ how="left",
1149
+ )
1150
+ sent[f"skip_{algo_choice}"] = 0
1151
+ sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
1152
+
1153
+ names = [
1154
+ "subject",
1155
+ "trial_id",
1156
+ "item",
1157
+ "condition",
1158
+ ] + [
1159
+ c
1160
+ for c in [
1161
+ f"on_sentence_num_{algo_choice}",
1162
+ f"on_sentence_{algo_choice}",
1163
+ "num_words_in_sentence",
1164
+ f"skip_{algo_choice}",
1165
+ f"nrun_{algo_choice}",
1166
+ f"reread_{algo_choice}",
1167
+ f"reg_in_{algo_choice}",
1168
+ f"reg_out_{algo_choice}",
1169
+ f"total_n_fixations_{algo_choice}",
1170
+ f"total_dur_{algo_choice}",
1171
+ f"rate_{algo_choice}",
1172
+ f"gopast_{algo_choice}",
1173
+ f"gopast_sel_{algo_choice}",
1174
+ f"firstrun_skip_{algo_choice}",
1175
+ f"firstrun_reg_in_{algo_choice}",
1176
+ f"firstrun_reg_out_{algo_choice}",
1177
+ f"firstpass_n_fixations_{algo_choice}",
1178
+ f"firstpass_dur_{algo_choice}",
1179
+ f"firstpass_forward_n_fixations_{algo_choice}",
1180
+ f"firstpass_forward_dur_{algo_choice}",
1181
+ f"firstpass_reread_n_fixations_{algo_choice}",
1182
+ f"firstpass_reread_dur_{algo_choice}",
1183
+ f"lookback_n_fixations_{algo_choice}",
1184
+ f"lookback_dur_{algo_choice}",
1185
+ f"lookfrom_n_fixations_{algo_choice}",
1186
+ f"lookfrom_dur_{algo_choice}",
1187
+ ]
1188
+ if (c in sent.columns and c.replace(f"_{algo_choice}", "") in measures_to_calc)
1189
+ ]
1190
+ sent = sent[names].copy()
1191
+ sent.rename(
1192
+ {
1193
+ f"on_sentence_num_{algo_choice}": "sentence_number",
1194
+ f"on_sentence_{algo_choice}": "sentence",
1195
+ "num_words_in_sentence": "number_of_words",
1196
+ },
1197
+ axis=1,
1198
+ inplace=True,
1199
+ )
1200
+
1201
+ if save_to_csv:
1202
+ subj = fix["subject"].iloc[0]
1203
+ trial_id = fix["trial_id"].iloc[0]
1204
+ sent.to_csv(RESULTS_FOLDER / f"{subj}_{trial_id}_{algo_choice}_sentence_measures.csv")
1205
+ return sent.copy()
1206
+
1207
+
1208
+ def compute_gopast_sentence(fixin, algo_choice):
1209
+ # create response vectors
1210
+ fixin[f"gopast_{algo_choice}"] = np.nan
1211
+ fixin[f"selgopast_{algo_choice}"] = np.nan
1212
+
1213
+ # unique sentence numbers fixated in this trial
1214
+ ias = fixin[f"on_sentence_num_{algo_choice}"].unique()
1215
+
1216
+ # compute measures
1217
+ for j in ias:
1218
+ min_fixation_number_j = fixin.loc[fixin[f"on_sentence_num_{algo_choice}"] == j, "fixation_number"].min(
1219
+ skipna=True
1220
+ )
1221
+ next_min_fixation_number = (
1222
+ fixin.loc[fixin[f"on_sentence_num_{algo_choice}"] > j, "fixation_number"].min(skipna=True)
1223
+ if j != ias[-1]
1224
+ else float("inf")
1225
+ )
1226
+
1227
+ mask = (
1228
+ (fixin["fixation_number"] >= min_fixation_number_j)
1229
+ & (fixin["fixation_number"] < next_min_fixation_number)
1230
+ & (~fixin[f"on_sentence_num_{algo_choice}"].isna())
1231
+ )
1232
+ fixin.loc[fixin[f"on_sentence_num_{algo_choice}"] == j, f"gopast_{algo_choice}"] = fixin.loc[
1233
+ mask, "duration"
1234
+ ].sum(skipna=True)
1235
+
1236
+ mask_j = (
1237
+ (fixin["fixation_number"] >= min_fixation_number_j)
1238
+ & (fixin["fixation_number"] < next_min_fixation_number)
1239
+ & (~fixin[f"on_sentence_num_{algo_choice}"].isna())
1240
+ & (fixin[f"on_sentence_num_{algo_choice}"] == j)
1241
+ )
1242
+ fixin.loc[fixin[f"on_sentence_num_{algo_choice}"] == j, f"selgopast_{algo_choice}"] = fixin.loc[
1243
+ mask_j, "duration"
1244
+ ].sum(skipna=True)
1245
+
1246
+ return fixin
1247
+
1248
+
1249
+ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices):
1250
+ tmp = dffix_combined.copy()
1251
+
1252
+ trial = tmp.drop_duplicates(subset="subject_trialID", keep="first")
1253
+ names = ["subject_trialID", "subject", "trial_id", "item", "condition"]
1254
+ trial = trial[names].copy()
1255
+
1256
+ for index, row in trial.iterrows():
1257
+ selected_trial = all_trials_by_subj[row["subject"]][row["trial_id"]]
1258
+ info_keys = [
1259
+ k for k in selected_trial.keys() if k in ["trial_start_time", "trial_end_time", "question_correct"]
1260
+ ]
1261
+ if row["subject"] in all_trials_by_subj and row["trial_id"] in all_trials_by_subj[row["subject"]]:
1262
+ if selected_trial["Fixation Cleaning Stats"]["Discard fixation before or after blinks"]:
1263
+ trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"][
1264
+ "Number of discarded fixations due to blinks"
1265
+ ]
1266
+ for key, value in selected_trial.items():
1267
+ if key in info_keys:
1268
+ trial.at[index, key] = value
1269
+
1270
+ subdf = wordcomb.copy().loc[:, ["subject_trialID"]].drop_duplicates(subset=["subject_trialID"], keep="first")
1271
+ trial = pd.merge(trial, subdf, on="subject_trialID", how="left")
1272
+ for sub, subdf in wordcomb.groupby("subject"):
1273
+ for trialid, trialdf in subdf.groupby("trial_id"):
1274
+ trial.loc[((trial["subject"] == sub) & (trial["trial_id"] == trialid)), "number_of_words_in_trial"] = (
1275
+ trialdf["word"].count()
1276
+ )
1277
+ trial.sort_values(by="subject_trialID", inplace=True)
1278
+
1279
+ if "blink" in tmp.columns:
1280
+ blink = tmp.groupby("subject_trialID")["blink"].sum() / 2
1281
+ blink = blink.round().reset_index()
1282
+ trial = pd.merge(trial, blink, on="subject_trialID", how="left")
1283
+
1284
+ trial["nfix"] = tmp.groupby("subject_trialID")["fixation_number"].agg("count").values
1285
+ new_col_dfs = []
1286
+ new_col_dfs.append(tmp.groupby("subject_trialID")["duration"].agg("mean").reset_index(name="mean_fix_duration"))
1287
+
1288
+ new_col_dfs.append(tmp.groupby("subject_trialID")["duration"].agg("sum").reset_index(name="total_fix_duration"))
1289
+ for algo_choice in algo_choices:
1290
+ new_col_dfs.append(
1291
+ tmp.groupby("subject_trialID")[f"word_runid_{algo_choice}"]
1292
+ .agg("max")
1293
+ .reset_index(name=f"nrun_{algo_choice}")
1294
+ )
1295
+ tmp[f"saccade_length_{algo_choice}"] = tmp[f"word_land_{algo_choice}"] + tmp[f"word_launch_{algo_choice}"]
1296
+ new_col_dfs.append(
1297
+ tmp[(tmp[f"saccade_length_{algo_choice}"] >= 0) & tmp[f"saccade_length_{algo_choice}"].notna()]
1298
+ .groupby("subject_trialID")[f"saccade_length_{algo_choice}"]
1299
+ .agg("mean")
1300
+ .reset_index(name=f"saccade_length_{algo_choice}")
1301
+ )
1302
+
1303
+ word = wordcomb.copy()
1304
+ if f"firstrun_skip_{algo_choice}" in wordcomb.columns:
1305
+ new_col_dfs.append(
1306
+ word.groupby("subject_trialID")[f"firstrun_skip_{algo_choice}"]
1307
+ .agg("mean")
1308
+ .reset_index(name=f"skip_{algo_choice}")
1309
+ )
1310
+ if f"refix_{algo_choice}" in wordcomb.columns:
1311
+ new_col_dfs.append(
1312
+ word.groupby("subject_trialID")[f"refix_{algo_choice}"]
1313
+ .agg("mean")
1314
+ .reset_index(name=f"refix_{algo_choice}")
1315
+ )
1316
+ if f"reg_in_{algo_choice}" in wordcomb.columns:
1317
+ new_col_dfs.append(
1318
+ word.groupby("subject_trialID")[f"reg_in_{algo_choice}"]
1319
+ .agg("mean")
1320
+ .reset_index(name=f"reg_{algo_choice}")
1321
+ )
1322
+
1323
+ if f"firstrun_dur_{algo_choice}" in wordcomb.columns:
1324
+ new_col_dfs.append(
1325
+ word.groupby("subject_trialID")[f"firstrun_dur_{algo_choice}"]
1326
+ .agg("sum")
1327
+ .reset_index(name=f"firstpass_{algo_choice}")
1328
+ )
1329
+
1330
+ if f"total_fixation_duration_{algo_choice}" in wordcomb.columns:
1331
+ new_col_dfs.append(
1332
+ (word[f"total_fixation_duration_{algo_choice}"] - word[f"firstrun_dur_{algo_choice}"])
1333
+ .groupby(word["subject_trialID"])
1334
+ .agg("sum")
1335
+ .reset_index(name=f"rereading_{algo_choice}")
1336
+ )
1337
+ trial = pd.concat(
1338
+ [trial.set_index("subject_trialID")] + [df.set_index("subject_trialID") for df in new_col_dfs], axis=1
1339
+ ).reset_index()
1340
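+ # Reading rate in words per minute: total fixation duration is in ms, so
+ # 60000 ms per minute is divided by the mean fixation time per word.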
+ trial[f"reading_rate_{algo_choice}"] = (
1341
+ 60000 / (trial["total_fix_duration"] / trial["number_of_words_in_trial"])
1342
+ ).round()
1343
+
1344
+ return trial.copy()
1345
+
1346
+
1347
+ def aggregate_subjects(trials, algo_choices):
1348
+ trial_aggregates = trials.groupby("subject")[["nfix", "blink"]].mean().round(3).reset_index()
1349
+ trial_aggregates = trial_aggregates.merge(
1350
+ trials.groupby("subject")["question_correct"].sum().reset_index(name="n_question_correct"), on="subject"
1351
+ )
1352
+ trial_aggregates = trial_aggregates.merge(
1353
+ trials.groupby("subject")["trial_id"].count().reset_index(name="ntrial"), on="subject"
1354
+ )
1355
+ for algo_choice in algo_choices:
1356
+ cols_to_do = [
1357
+ c
1358
+ for c in [
1359
+ f"saccade_length_{algo_choice}",
1360
+ f"reg_{algo_choice}",
1361
+ f"mean_fix_duration_{algo_choice}",
1362
+ f"total_fix_duration_{algo_choice}",
1363
+ f"reading_rate_{algo_choice}",
1364
+ f"refix_{algo_choice}",
1365
+ f"nrun_{algo_choice}",
1366
+ f"skip_{algo_choice}",
1367
+ ]
1368
+ if c in trials.columns
1369
+ ]
1370
+ trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
1371
+ trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
1372
+
1373
+ return trial_aggregates
process_asc_files_in_multi_p.py ADDED
@@ -0,0 +1,149 @@
1
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
2
+ import json
3
+ from sys import platform as _platform
4
+ from functools import partial
5
+ import multiprocessing
6
+ import os
7
+ from tqdm.auto import tqdm
8
+ from multi_proc_funcs import DIST_MODELS_FOLDER, process_trial_choice, set_up_models
9
+ import sys
10
+ import pandas as pd
11
+
12
+
13
+ def get_cpu_count():
14
+ if os.sys.platform in ("linux", "linux2", "darwin"):
15
+ return os.cpu_count()
16
+ elif os.sys.platform == "win32":
17
+ return multiprocessing.cpu_count()
18
+ else:
19
+ return 1
20
+
21
+
22
+ def process_asc_files_in_multi_proc(
23
+ algo_choice,
24
+ choice_handle_short_and_close_fix,
25
+ discard_fixations_without_sfix,
26
+ discard_far_out_of_text_fix,
27
+ x_thres_in_chars,
28
+ y_thresh_in_heights,
29
+ short_fix_threshold,
30
+ merge_distance_threshold,
31
+ discard_long_fix,
32
+ discard_long_fix_threshold,
33
+ discard_blinks,
34
+ measures_to_calculate_multi_asc,
35
+ include_coords_multi_asc,
36
+ sent_measures_to_calculate_multi_asc,
37
+ trials_by_ids,
38
+ classic_algos_cfg,
39
+ models_dict,
40
+ fix_cols_to_add_multi_asc,
41
+ ):
42
+ funcc = partial(
43
+ process_trial_choice,
44
+ algo_choice=algo_choice,
45
+ choice_handle_short_and_close_fix=choice_handle_short_and_close_fix,
46
+ for_multi=True,
47
+ discard_fixations_without_sfix=discard_fixations_without_sfix,
48
+ discard_far_out_of_text_fix=discard_far_out_of_text_fix,
49
+ x_thres_in_chars=x_thres_in_chars,
50
+ y_thresh_in_heights=y_thresh_in_heights,
51
+ short_fix_threshold=short_fix_threshold,
52
+ merge_distance_threshold=merge_distance_threshold,
53
+ discard_long_fix=discard_long_fix,
54
+ discard_long_fix_threshold=discard_long_fix_threshold,
55
+ discard_blinks=discard_blinks,
56
+ measures_to_calculate_multi_asc=measures_to_calculate_multi_asc,
57
+ include_coords_multi_asc=include_coords_multi_asc,
58
+ sent_measures_to_calculate_multi_asc=sent_measures_to_calculate_multi_asc,
59
+ classic_algos_cfg=classic_algos_cfg,
60
+ models_dict=models_dict,
61
+ fix_cols_to_add=fix_cols_to_add_multi_asc,
62
+ )
63
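+ # Use one worker per trial, but never more than 32, and leave one CPU core free.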
+ workers = min(len(trials_by_ids), 32, get_cpu_count() - 1)
64
+ with multiprocessing.Pool(workers) as pool:
65
+ out = pool.map(funcc, trials_by_ids.values())
66
+ return out
67
+
68
+
69
+ def make_json_compatible(obj):
70
+ if isinstance(obj, dict):
71
+ return {k: make_json_compatible(v) for k, v in obj.items()}
72
+ elif isinstance(obj, list):
73
+ return [make_json_compatible(v) for v in obj]
74
+ elif isinstance(obj, pd.DataFrame):
75
+ return obj.to_dict(orient="records")
76
+ elif isinstance(obj, pd.Series):
77
+ return obj.to_dict()
78
+ else:
79
+ return obj
80
+
81
+
82
+ def main():
83
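+ # Reads a JSON-encoded argument list from stdin and writes the JSON-encoded
+ # results (or an {"error": ...} object) to stdout, so the script can be driven
+ # as a subprocess by a parent process.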
+ try:
84
+ input_data = sys.stdin.buffer.read()
85
+
86
+ (
87
+ algo_choice,
88
+ choice_handle_short_and_close_fix,
89
+ discard_fixations_without_sfix,
90
+ discard_far_out_of_text_fix,
91
+ x_thres_in_chars,
92
+ y_thresh_in_heights,
93
+ short_fix_threshold,
94
+ merge_distance_threshold,
95
+ discard_long_fix,
96
+ discard_long_fix_threshold,
97
+ discard_blinks,
98
+ measures_to_calculate_multi_asc,
99
+ include_coords_multi_asc,
100
+ sent_measures_to_calculate_multi_asc,
101
+ trials_by_ids,
102
+ classic_algos_cfg,
103
+ models_dict,
104
+ fix_cols_to_add_multi_asc,
105
+ ) = json.loads(input_data)
106
+ if (
107
+ "DIST" in algo_choice
108
+ or "Wisdom_of_Crowds_with_DIST" in algo_choice
109
+ or "DIST-Ensemble" in algo_choice
110
+ or "Wisdom_of_Crowds_with_DIST_Ensemble" in algo_choice
111
+ ):
112
+ del models_dict # Needed to stop pickling from failing for multiproc
113
+ models_dict = set_up_models(DIST_MODELS_FOLDER)
114
+ else:
115
+ models_dict = {}
116
+ out = process_asc_files_in_multi_proc(
117
+ algo_choice,
118
+ choice_handle_short_and_close_fix,
119
+ discard_fixations_without_sfix,
120
+ discard_far_out_of_text_fix,
121
+ x_thres_in_chars,
122
+ y_thresh_in_heights,
123
+ short_fix_threshold,
124
+ merge_distance_threshold,
125
+ discard_long_fix,
126
+ discard_long_fix_threshold,
127
+ discard_blinks,
128
+ measures_to_calculate_multi_asc,
129
+ include_coords_multi_asc,
130
+ sent_measures_to_calculate_multi_asc,
131
+ trials_by_ids,
132
+ classic_algos_cfg,
133
+ models_dict,
134
+ fix_cols_to_add_multi_asc,
135
+ )
136
+ out2 = []
137
+ for dffix, trial in out:
138
+ dffix = dffix.to_dict("records")
139
+ trial = make_json_compatible(trial)
140
+ out2.append((dffix, trial))
141
+ json_data_out = json.dumps(out2)
142
+ sys.stdout.flush()
143
+ print(json_data_out)
144
+ except Exception as e:
145
+ print(json.dumps({"error": str(e)}))
146
+
147
+
148
+ if __name__ == "__main__":
149
+ main()
requirements.txt ADDED
@@ -0,0 +1,25 @@
1
+ datasets
2
+ einops
3
+ matplotlib
4
+ numpy
5
+ pandas
6
+ PyYAML
7
+ seaborn
8
+ tqdm
9
+ transformers==4.*
10
+ tensorboard
11
+ torchmetrics
12
+ pytorch-lightning
13
+ scikit-learn
14
+ plotly
15
+ lovely-tensors
16
+ timm
17
+ openpyxl
18
+ torch==2.*
19
+ pydantic==1.10
20
+ streamlit >= 1.35
21
+ pycairo
22
+ eyekit
23
+ stqdm
24
+ jellyfish
25
+ icecream
saccades_df_columns.md ADDED
@@ -0,0 +1,38 @@
1
+ #### Column names for Saccades Dataframe
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye))
3
+ If a column depends on a line assignment, a _ALGORITHM_NAME suffix is appended to its name.
4
+ - subject: Subject name or ID (derived from filename)
5
+ - trial_id: Trial ID
6
+ - item: Item ID
7
+ - condition: Condition (if applicable)
8
+ - num: Saccade number
9
+ - start_time: Start time (in ms since start of the trial)
10
+ - end_time: End time (in ms since start of the trial)
11
+ - xs: Raw x start position (in pixel)
12
+ - ys: Raw y start position (in pixel)
13
+ - xe: Raw x end position (in pixel)
14
+ - ye: Raw y end position (in pixel)
15
+ - ampl: saccadic amplitude (degrees)
16
+ - pv: peak velocity (degrees/sec)
17
+ - start_uncorrected: Start time (in ms as recorded by EyeLink)
18
+ - stop_uncorrected: End time (in ms as recorded by EyeLink)
19
+ - blink_before: Whether a blink occurred directly before the saccade
20
+ - blink_after: Whether a blink occurred directly after the saccade
21
+ - blink: Whether a blink occurred directly before or after the saccade
22
+ - duration: Duration (in ms)
23
+ - xe_minus_xs: Horizontal saccade distance
24
+ - ye_minus_ys: Vertical saccade distance
25
+ - eucledian_distance: Euclidean distance
26
+ - angle: Angle
27
+ - dX: Horizontal saccade amplitude
28
+ - dY: Vertical saccade amplitude
29
+ - ys_ALGORITHM_NAME: Corrected y start position (in pixel), i.e. after line assignment
30
+ - ye_ALGORITHM_NAME: Corrected y end position (in pixel), i.e. after line assignment
31
+ - ye_minus_ys_ALGORITHM_NAME: Vertical saccade distance after being snapped to line
32
+ - angle_ALGORITHM_NAME: Angle after being snapped to line
33
+ - lines_ALGORITHM_NAME: Starting line of saccade
34
+ - linee_ALGORITHM_NAME: Landing line of saccade
35
+ - line_word_s_ALGORITHM_NAME: Number of the word on the line from which the saccade starts
36
+ - line_word_e_ALGORITHM_NAME: Number of the word on the line where the saccade ends
37
+ - lets_ALGORITHM_NAME: Number of the letter from which the saccade starts
38
+ - lete_ALGORITHM_NAME: Number of the letter where the saccade ends
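+
+ A small consistency sketch for the geometric columns (this is an assumption about how the columns relate, not taken from the package code; the angle convention in particular may differ):
+
+ ```python
+ import numpy as np
+ import pandas as pd
+
+ # A toy saccade built from the raw start/end positions described above
+ sacc = pd.DataFrame({"xs": [100.0], "ys": [200.0], "xe": [160.0], "ye": [210.0]})
+ dx = sacc["xe"] - sacc["xs"]    # should match xe_minus_xs
+ dy = sacc["ye"] - sacc["ys"]    # should match ye_minus_ys
+ dist = np.sqrt(dx**2 + dy**2)   # should match eucledian_distance
+ print(dx.iloc[0], dy.iloc[0], dist.iloc[0])
+ ```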
sentence_measures.md ADDED
@@ -0,0 +1,35 @@
1
+ #### Column names for Sentence measures
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye))
3
+ If a column depends on a line assignment, a _ALGORITHM_NAME suffix is appended to its name.
4
+ - subject: Participant ID
5
+ - trial_id: Position of trial in analysis
6
+ - item: Item ID
7
+ - condition: Condition (if applicable)
8
+ - sentence_number: Number of sentence in trial
9
+ - sentence: Sentence Text
10
+ - number_of_words: Number of words in sentence
11
+ - skip: Whether the sentence has been skipped
12
+ - nrun: Number of times the sentence has been read
13
+ - reread: Whether the sentence has been read more than one time
14
+ - reg_in: Whether a regression has been made into the sentence
15
+ - reg_out: Whether a regression has been made out of the sentence
16
+ - total_n_fixations: Number of fixations made on the sentence
17
+ - total_dur: Total sentence reading time
18
+ - rate: Reading rate (number of words per minute)
19
+ - gopast: Sum of all fixation durations from the time the sentence was entered until it was left to the right (regression path duration)
20
+ - gopast_sel: Sum of the durations of all fixations on the sentence itself from the time it was entered until it was left to the right (selective go-past time: the regression path duration minus the time spent on fixations outside the sentence during that path)
21
+ - firstrun_skip: Whether sentence has been skipped during first-pass reading
22
+ - firstrun_reg_in: Whether a regression has been made into the sentence during first-pass reading
23
+ - firstrun_reg_out: Whether a regression has been made out of the sentence during first-pass reading
24
+ - firstpass_n_fixations: Number of fixation made during first-pass reading
25
+ - firstpass_dur: First-pass reading time
26
+ - firstpass_forward_n_fixations: Number of first-pass forward fixations (landing on one of the upcoming words of a sentence)
27
+ - firstpass_forward_dur: Duration of forward fixations during first-pass reading
28
+ - firstpass_reread_n_fixations: Number of first-pass rereading fixations (landing one of the words of the sentence that have been read previously)
29
+ - firstpass_reread_dur: Duration of rereading fixations during first-pass reading
30
+ - lookback_n_fixations: Number of fixations made on the sentence after regressing into it from another sentence
31
+ - lookback_dur: Duration of lookback fixations on the sentence
32
+ - lookfrom_n_fixations: Number of rereading fixations on another sentence initiated from the sentence
33
+ - lookfrom_dur: Duration of lookfrom fixations on the sentence
34
+
35
+ The forward, rereading, look-back, and look-from measures are computed in a similar way to the SR "Getting Reading Measures" tool (https://www.sr-support.com/thread-350.html), which is based on the Eyelink Analysojia software (developed by the Turku Eye Labs).
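+
+ A minimal usage sketch (the folder, subject, trial, and algorithm parts of the file name are placeholders; the CSV is written as SUBJECT_TRIALID_ALGORITHM_NAME_sentence_measures.csv to the results folder when per-trial saving is enabled, and total_dur/rate are only present if they were among the selected measures):
+
+ ```python
+ import pandas as pd
+
+ sent = pd.read_csv("results/SUBJECT_TRIALID_ALGORITHM_NAME_sentence_measures.csv")
+ print(sent[["sentence_number", "number_of_words",
+             "total_dur_ALGORITHM_NAME", "rate_ALGORITHM_NAME"]].head())
+ ```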
subject_measures.md ADDED
@@ -0,0 +1,15 @@
1
+ #### Column names for Subject level summary statistics
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye))
3
+ If a column depends on a line assignment, a _ALGORITHM_NAME suffix is appended to its name.
4
+
5
+ - subject: Subject identifier, taken from filename
6
+ - ntrial: Number of trials for the subject
7
+ - n_question_correct: Total number of correctly answered questions
8
+ - blink: Mean number of blinks across trials
9
+ - nfix: Mean number of fixations across trials
10
+ - skip_ALGORITHM_NAME: Mean proportion of words that have been skipped during first-pass reading across trials
11
+ - saccade_length_ALGORITHM_NAME: Mean (forward) saccade length
12
+ - refix_ALGORITHM_NAME: Mean proportion of words that have been refixated across trials
13
+ - reg_ALGORITHM_NAME: Mean proportion of words which have been regressed into across trials
14
+ - mean_fixation_duration: Mean fixation duration
15
+ - total_fix_duration: Mean total reading time across trials
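+
+ A minimal sketch of how these summaries are produced from the trial-level output (aggregate_trials and aggregate_subjects are defined in this repository; the algorithm name is a placeholder):
+
+ ```python
+ # trials_df: the trial-level dataframe returned by aggregate_trials
+ subject_df = aggregate_subjects(trials_df, ["ALGORITHM_NAME"])
+ print(subject_df[["subject", "ntrial", "n_question_correct", "nfix", "blink"]])
+ ```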
trials_df_columns.md ADDED
@@ -0,0 +1,36 @@
1
+ #### Column names for Trials Dataframe
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye))
3
+ If a column depends on a line assignment, a _ALGORITHM_NAME suffix is appended to its name.
4
+ - subject: Subject name or ID (derived from filename)
5
+ - trial_id: Trial ID
6
+ - item: Item ID
7
+ - condition: Condition (if applicable)
8
+ - average_y_correction_ALGORITHM_NAME: Average difference between raw y position of a fixation and the center of the line to which it was assigned in pixels
9
+ - Number of fixations before cleaning: Number of fixations found for the trial before any cleaning is done
10
+ - Discard long fixations: Indicates if overly long fixations were discarded
11
+ - Number of discarded long fixations: Number of fixations that were discarded due to being overly long
12
+ - Number of discarded long fixations (%): Number of fixations that were discarded due to being overly long as a percentage of the total number of fixations
13
+ - How short and close fixations were handled: Which option was chosen for handling short fixations
14
+ - Number of merged fixations: Number of fixations that were merged because their duration was below the set threshold and they were in horizontal proximity to the preceding or subsequent fixation
15
+ - Number of merged fixations (%): Number of fixations that were merged because their duration was below the set threshold and they were in horizontal proximity to the preceding or subsequent fixation, as a percentage of the total number of fixations
16
+ - Far out of text fixations were discarded: Whether fixations were discarded if they were far outside the stimulus text
17
+ - Number of discarded far-out-of-text fixations: Number of fixations that were discarded due to being far outside the stimulus text
18
+ - Number of discarded far-out-of-text fixations (%): Number of fixations that were discarded due to being far outside the stimulus text as a percentage of the total number of fixations
19
+ - Total number of discarded and merged fixations: Number of fixations that were cleaned up
20
+ - Total number of discarded and merged fixations (%): Number of fixations that were cleaned up as a percentage of the total number of fixations
21
+ - trial_start_time: Timestamp of the start of the trial
22
+ - trial_end_time: Timestamp of the end of the trial
23
+ - question_correct: Whether the question associated with the trial was answered correctly. This will be blank if it could not be determined
24
+ - number_of_words_in_trial: Total number of words in the stimulus used for the trial
25
+ - blink: Number of blinks detected during the trial
26
+ - nfix: Number of fixations remaining after cleaning
27
+ - nrun_ALGORITHM_NAME: Number of runs on trial
28
+ - saccade_length_ALGORITHM_NAME: Average saccade length across the trial
29
+ - mean_fix_duration_ALGORITHM_NAME: Average fixation duration across the trial
30
+ - total_fix_duration_ALGORITHM_NAME: Total fixation duration across the trial
31
+ - skip_ALGORITHM_NAME: Proportion of words in the trial that have been skipped during first-pass reading
32
+ - refix_ALGORITHM_NAME: Proportion of words in the trial that have been refixated
33
+ - reg_ALGORITHM_NAME: Proportion of words which have been regressed into
34
+ - firstpass_ALGORITHM_NAME: First-pass reading time
35
+ - rereading_ALGORITHM_NAME: Re-reading time (total reading time minus first-pass reading time)
36
+ - reading_rate_ALGORITHM_NAME: Reading rate (words per minute)
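+
+ For example (made-up numbers for illustration), a trial containing 250 words with a total fixation duration of 60000 ms yields a reading rate of 60000 / (60000 / 250) = 250 words per minute.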
utils.py ADDED
@@ -0,0 +1,1349 @@
1
+ import pickle
2
+ from io import StringIO
3
+ import re
4
+ import zipfile
5
+ import os
6
+ import plotly.graph_objects as go
7
+ from io import StringIO
8
+ import numpy as np
9
+ import pandas as pd
10
+ from PIL import Image
11
+ import json
12
+ from matplotlib import pyplot as plt
13
+ import pathlib as pl
14
+ import matplotlib as mpl
15
+ from streamlit.runtime.uploaded_file_manager import UploadedFile
16
+ from tqdm.auto import tqdm
17
+ import time
18
+ import requests
19
+ from icecream import ic
20
+ from matplotlib import font_manager
21
+ from multi_proc_funcs import (
22
+ COLORS,
23
+ PLOTS_FOLDER,
24
+ RESULTS_FOLDER,
25
+ add_boxes_to_ax,
26
+ add_text_to_ax,
27
+ matplotlib_plot_df,
28
+ save_trial_to_json,
29
+ sigmoid,
30
+ )
31
+ import emreading_funcs as emf
32
+
33
+ ic.configureOutput(includeContext=True)
34
+ TEMP_FIGURE_STIMULUS_PATH = PLOTS_FOLDER / "temp_matplotlib_plot_stimulus.png"
35
+ all_fonts = [x.name for x in font_manager.fontManager.ttflist]
36
+ mpl.use("agg")
37
+
38
+ DIST_MODELS_FOLDER = pl.Path("models")
39
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
40
+ IMAGENET_STD = [0.229, 0.224, 0.225]
41
+ PLOTS_FOLDER = pl.Path("plots")
42
+
43
+ names_dict = {
44
+ "SSACC": {"Descr": "Start of Saccade", "Pattern": "SSACC <eye > <stime>"},
45
+ "ESACC": {
46
+ "Descr": "End of Saccade",
47
+ "Pattern": "ESACC <eye > <stime> <etime > <dur> <sxp > <syp> <exp > <eyp> <ampl > <pv >",
48
+ },
49
+ "SFIX": {"Descr": "Start of Fixation", "Pattern": "SFIX <eye > <stime>"},
50
+ "EFIX": {"Descr": "End of Fixation", "Pattern": "EFIX <eye > <stime> <etime > <dur> <axp > <ayp> <aps >"},
51
+ "SBLINK": {"Descr": "Start of Blink", "Pattern": "SBLINK <eye > <stime>"},
52
+ "EBLINK": {"Descr": "End of Blink", "Pattern": "EBLINK <eye > <stime> <etime > <dur>"},
53
+ "DISPLAY ON": {"Descr": "Actual start of Trial", "Pattern": "DISPLAY ON"},
54
+ }
55
+ metadata_strs = ["DISPLAY COORDS", "GAZE_COORDS", "FRAMERATE"]
56
+
57
+
58
+ POPEYE_FIXATION_COLS_DICT = {
59
+ "start": "start_time",
60
+ "stop": "end_time",
61
+ "xs": "x",
62
+ "ys": "y",
63
+ }
64
+ EMREADING_COLS_DROPLIST = ["hasText", "char_trial"]
65
+ EMREADING_COLS_DICT = {
66
+ "sub": "subject",
67
+ "item": "item",
68
+ "condition": "condition",
69
+ "SFIX": "start_time",
70
+ "EFIX": "end_time",
71
+ "xPos": "x",
72
+ "yPos": "y",
73
+ "fix_number": "fixation_number",
74
+ "fix_dur": "duration",
75
+ "wordID": "on_word_EM",
76
+ "outOfBnds": "out_of_bounds",
77
+ "outsideText": "out_of_text_area",
78
+ }
79
+
80
+
81
+ def download_url(url, target_filename):
82
+ max_retries = 4
83
+ for attempt in range(1, max_retries + 1):
84
+ try:
85
+ r = requests.get(url)
86
+ if r.status_code != 200:
87
+ ic(f"Download failed due to unsuccessful response from server: {r.status_code}")
88
+ return -1
89
+ open(target_filename, "wb").write(r.content)
90
+ return 0
91
+
92
+ except Exception as e:
93
+ if attempt < max_retries:
94
+ time.sleep(2 * attempt)
95
+ ic(f"Download failed due to an error; will try again in {attempt*2} seconds:", e)
96
+ else:
97
+ ic(f"Failed after all attempts ({url}). Error details:\n{e}")
98
+ return -1
99
+
100
+
101
+ def asc_to_trial_ids(
102
+ asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
103
+ ):
104
+ asc_encoding = ["ISO-8859-15", "UTF-8"][0]
105
+ trials_dict, lines = file_to_trials_and_lines(
106
+ asc_file,
107
+ asc_encoding,
108
+ close_gap_between_words=close_gap_between_words,
109
+ paragraph_trials_only=paragraph_trials_only,
110
+ uploaded_ias_files=ias_files,
111
+ trial_start_keyword=trial_start_keyword,
112
+ end_trial_at_keyword=end_trial_at_keyword,
113
+ )
114
+
115
+ enum = (
116
+ trials_dict["paragraph_trials"]
117
+ if paragraph_trials_only and "paragraph_trials" in trials_dict.keys()
118
+ else range(trials_dict["max_trial_idx"])
119
+ )
120
+ trials_by_ids = {trials_dict[idx]["trial_id"]: trials_dict[idx] for idx in enum}
121
+ return trials_by_ids, lines, trials_dict
122
+
123
+
124
+ def get_trials_list(
125
+ asc_file, close_gap_between_words, paragraph_trials_only, ias_files, trial_start_keyword, end_trial_at_keyword
126
+ ):
127
+ if hasattr(asc_file, "name"):
128
+ savename = pl.Path(asc_file.name).stem
129
+ else:
130
+ savename = pl.Path(asc_file).stem
131
+
132
+ trials_by_ids, lines, trials_dict = asc_to_trial_ids(
133
+ asc_file,
134
+ close_gap_between_words=close_gap_between_words,
135
+ paragraph_trials_only=paragraph_trials_only,
136
+ ias_files=ias_files,
137
+ trial_start_keyword=trial_start_keyword,
138
+ end_trial_at_keyword=end_trial_at_keyword,
139
+ )
140
+ trial_keys = list(trials_by_ids.keys())
141
+ savename = RESULTS_FOLDER / f"{savename}_metadata_overview.json"
142
+
143
+ offload_list = [
144
+ "gaze_df",
145
+ "dffix",
146
+ "chars_df",
147
+ "saccade_df",
148
+ "x_char_unique",
149
+ "line_heights",
150
+ "chars_list",
151
+ "words_list",
152
+ "dffix_sacdf_popEye",
153
+ "fixdf_popEye",
154
+ "saccade_df",
155
+ "sacdf_popEye",
156
+ "combined_df",
157
+ "events_df",
158
+ ]
159
+ trials_dict_cut_down = {}
160
+ for k_outer, v_outer in trials_dict.items():
161
+ if isinstance(v_outer, dict):
162
+ trials_dict_cut_down[k_outer] = {}
163
+ for prop, val in v_outer.items():
164
+ if prop not in offload_list:
165
+ trials_dict_cut_down[k_outer][prop] = val
166
+ else:
167
+ trials_dict_cut_down[k_outer] = v_outer
168
+ save_trial_to_json(trials_dict_cut_down, savename=savename)
169
+ return trial_keys, trials_by_ids, lines, asc_file, trials_dict
170
+
171
+
172
+ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, allow_multiple_values=False):
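+ # Estimate the horizontal spacing between character centres (x_diff) and the vertical spacing between text lines (y_diff); if the stimulus has a single text line, the line height is used as y_diff.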
173
+ x_diffs = np.unique(np.diff(line_xcoords_no_pad))
174
+ if len(x_diffs) == 1:
175
+ x_diff = x_diffs[0]
176
+ elif not allow_multiple_values:
177
+ x_diff = np.min(x_diffs)
178
+ else:
179
+ x_diff = x_diffs
180
+
181
+ if np.unique(line_ycoords_no_pad).shape[0] == 1:
182
+ return x_diff, line_heights[0]
183
+ y_diffs = np.unique(np.diff(line_ycoords_no_pad))
184
+ if len(y_diffs) == 1:
185
+ y_diff = y_diffs[0]
186
+ elif len(y_diffs) == 0:
187
+ y_diff = 0
188
+ elif not allow_multiple_values:
189
+ y_diff = np.min(y_diffs)
190
+ else:
191
+ y_diff = y_diffs
192
+ return np.round(x_diff, decimals=2), np.round(y_diff, decimals=2)
193
+
194
+
195
+ def add_words(chars_list):
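+ # Reconstruct word and sentence records from the per-character boxes: a word ends at a space, at a line change, or at the final character; a sentence ends at '.', '!' or '?'. Each character is annotated with the word and sentence it belongs to.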
196
+ chars_list_reconstructed = []
197
+ words_list = []
198
+ sentence_list = []
199
+ sentence_start_idx = 0
200
+ sentence_num = 0
201
+ word_start_idx = 0
202
+ chars_df = pd.DataFrame(chars_list)
203
+ chars_df["char_width"] = chars_df.char_xmax - chars_df.char_xmin
204
+ word_dict = None
205
+ on_line_num = -1
206
+ line_change_on_next_char = False
207
+ num_chars = len(chars_list)
208
+ for idx, char_dict in enumerate(chars_list):
209
+ # check if line change will happen after current char
210
+ on_line_num = char_dict["assigned_line"]
211
+ if idx < num_chars - 1:
212
+ line_change_on_next_char = on_line_num != chars_list[idx + 1]["assigned_line"]
213
+ else:
214
+ line_change_on_next_char = False
215
+ chars_list_reconstructed.append(char_dict)
216
+ if char_dict["char"] in [" "] or len(chars_list_reconstructed) == len(chars_list) or line_change_on_next_char:
217
+ word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
218
+ if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
219
+ word_xmax = chars_list_reconstructed[-2]["char_xmax"]
220
+
221
+ word = "".join(
222
+ [
223
+ chars_list_reconstructed[idx]["char"]
224
+ for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
225
+ ]
226
+ )
227
+ elif len(chars_list_reconstructed) == 1:
228
+ word_xmax = chars_list_reconstructed[-1]["char_xmax"]
229
+ word = " "
230
+ else:
231
+ word = "".join(
232
+ [
233
+ chars_list_reconstructed[idx]["char"]
234
+ for idx in range(word_start_idx, len(chars_list_reconstructed))
235
+ ]
236
+ )
237
+ word_xmax = chars_list_reconstructed[-1]["char_xmax"]
238
+ word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
239
+ word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
240
+ word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
241
+ word_y_center = round((word_ymax - word_ymin) / 2 + word_ymin, ndigits=2)
242
+ word_length = len(word)
243
+ assigned_line = chars_list_reconstructed[word_start_idx]["assigned_line"]
244
+ word_dict = dict(
245
+ word_number=len(words_list),
246
+ word=word,
247
+ word_length=word_length,
248
+ word_xmin=word_xmin,
249
+ word_xmax=word_xmax,
250
+ word_ymin=word_ymin,
251
+ word_ymax=word_ymax,
252
+ word_x_center=word_x_center,
253
+ word_y_center=word_y_center,
254
+ assigned_line=assigned_line,
255
+ )
256
+ if len(word) > 0 and word != " ":
257
+ words_list.append(word_dict)
258
+ for cidx, char_dict in enumerate(chars_list_reconstructed[word_start_idx:]):
259
+ if char_dict["char"] == " ":
260
+ char_dict["in_word_number"] = len(words_list)
261
+ char_dict["in_word"] = " "
262
+ char_dict["num_letters_from_start_of_word"] = 0
263
+ else:
264
+ char_dict["in_word_number"] = len(words_list) - 1
265
+ char_dict["in_word"] = word
266
+ char_dict["num_letters_from_start_of_word"] = cidx
267
+
268
+ word_start_idx = idx + 1
269
+
270
+ if chars_list_reconstructed[-1]["char"] in [".", "!", "?"] or idx == (len(chars_list) - 1):
271
+ if idx != sentence_start_idx:
272
+ chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
273
+ line_texts = []
274
+ for sidx, subdf in chars_df_temp.groupby("assigned_line"):
275
+ line_text = "_".join(subdf.char.values)
276
+ line_text = line_text.replace("_ _", " ")
277
+ line_text = line_text.replace("_", "")
278
+ line_texts.append(line_text.strip())
279
+ sentence_text = " ".join(line_texts)
280
+ sentence_dict = dict(sentence_num=sentence_num, sentence_text=sentence_text)
281
+ sentence_list.append(sentence_dict)
282
+ for c in chars_list_reconstructed[sentence_start_idx:]:
283
+ c["in_sentence_number"] = sentence_num
284
+ c["in_sentence"] = sentence_text
285
+ sentence_start_idx = len(chars_list_reconstructed)
286
+ sentence_num += 1
287
+ else:
288
+ sentence_list[-1]["sentence_text"] += chars_list_reconstructed[sentence_start_idx]["char"]
289
+ chars_list_reconstructed[idx]["in_sentence_number"] = sentence_list[-1]["sentence_num"]
290
+ chars_list_reconstructed[idx]["in_sentence"] = sentence_list[-1]["sentence_text"]
291
+ for cidx, char_dict in enumerate(chars_list_reconstructed):
292
+ if (
293
+ char_dict["char"] == " "
294
+ and (cidx + 1) < len(chars_list_reconstructed)
295
+ and char_dict["assigned_line"] == chars_list_reconstructed[cidx + 1]["assigned_line"]
296
+ ):
297
+ char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
298
+ char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
299
+
300
+ last_letter_in_word = words_list[-1]["word"][-1]
301
+ last_letter_in_chars_list_reconstructed = char_dict["char"]
302
+ if last_letter_in_word != last_letter_in_chars_list_reconstructed:
303
+ if last_letter_in_chars_list_reconstructed in [".", "!", "?"]:
304
+ words_list[-1] = dict(
305
+ word_number=len(words_list),
306
+ word=words_list[-1]["word"] + char_dict["char"],
307
+ word_length=len(words_list[-1]["word"] + char_dict["char"]),
308
+ word_xmin=words_list[-1]["word_xmin"],
309
+ word_xmax=char_dict["char_xmax"],
310
+ word_ymin=words_list[-1]["word_ymin"],
311
+ word_ymax=words_list[-1]["word_ymax"],
312
+ assigned_line=assigned_line,
313
+ )
314
+
315
+ word_x_center = round(
316
+ (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
317
+ )
318
+ word_y_center = round(
319
+ (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"], ndigits=2
320
+ )
321
+ words_list[-1]["word_x_center"] = word_x_center
322
+ words_list[-1]["word_y_center"] = word_y_center
323
+ else:
324
+ word_dict = dict(
325
+ word_number=len(words_list),
326
+ word=char_dict["char"],
327
+ word_length=1,
328
+ word_xmin=char_dict["char_xmin"],
329
+ word_xmax=char_dict["char_xmax"],
330
+ word_ymin=char_dict["char_ymin"],
331
+ word_ymax=char_dict["char_ymax"],
332
+ word_x_center=char_dict["char_x_center"],
333
+ word_y_center=char_dict["char_y_center"],
334
+ assigned_line=assigned_line,
335
+ )
336
+ words_list.append(word_dict)
337
+ chars_list_reconstructed[-1]["in_word_number"] = len(words_list) - 1
338
+ chars_list_reconstructed[-1]["in_word"] = word_dict["word"]
339
+ chars_list_reconstructed[-1]["num_letters_from_start_of_word"] = 0
340
+ if len(sentence_list) > 0:
341
+ chars_list_reconstructed[-1]["in_sentence_number"] = sentence_num - 1
342
+ chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
343
+ else:
344
+ ic(f"Warning Sentence list empty: {sentence_list}")
345
+
346
+ return words_list, chars_list_reconstructed
347
+
348
+
349
+ def read_ias_file(ias_file, prefix):
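+ # Read an EyeLink interest-area (.ias) file into a DataFrame of word boxes; if the areas include the trailing space, the right edge is pulled in by one character width, and centres plus line assignments are added.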
350
+
351
+ if isinstance(ias_file, UploadedFile):
352
+ lines = StringIO(ias_file.getvalue().decode("utf-8")).readlines()
353
+ ias_dicts = []
354
+ for l in lines:
355
+ lsplit = l.strip().split("\t")
356
+ ldict = {
357
+ f"{prefix}_number": float(lsplit[1]),
358
+ f"{prefix}_xmin": float(lsplit[2]),
359
+ f"{prefix}_xmax": float(lsplit[4]),
360
+ f"{prefix}_ymin": float(lsplit[3]),
361
+ f"{prefix}_ymax": float(lsplit[5]),
362
+ prefix: lsplit[6],
363
+ }
364
+ ias_dicts.append(ldict)
365
+ ias_df = pd.DataFrame(ias_dicts)
366
+ else:
367
+ ias_df = pd.read_csv(ias_file, delimiter="\t", header=None)
368
+ ias_df = ias_df.rename(
369
+ {
370
+ 1: f"{prefix}_number",
371
+ 2: f"{prefix}_xmin",
372
+ 4: f"{prefix}_xmax",
373
+ 3: f"{prefix}_ymin",
374
+ 5: f"{prefix}_ymax",
375
+ 6: prefix,
376
+ },
377
+ axis=1,
378
+ )
379
+ first_line_df = ias_df[ias_df[f"{prefix}_ymin"] == ias_df.loc[0, f"{prefix}_ymin"]]
380
+ words_include_spaces = (
381
+ first_line_df[f"{prefix}_xmax"].values == first_line_df[f"{prefix}_xmin"].shift(-1).values
382
+ ).any()
383
+ ias_df[f"{prefix}_width"] = ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_xmin"]
384
+ if words_include_spaces:
385
+ ias_df[f"{prefix}_length"] = ias_df[prefix].map(lambda x: len(x) + 1)
386
+ ias_df[f"{prefix}_width_per_length"] = ias_df[f"{prefix}_width"] / ias_df[f"{prefix}_length"]
387
+ ias_df[f"{prefix}_xmax"] = (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(2)
388
+
389
+ ias_df[f"{prefix}_x_center"] = (
390
+ (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_xmin"]) / 2 + ias_df[f"{prefix}_xmin"]
391
+ ).round(2)
392
+ ias_df[f"{prefix}_y_center"] = (
393
+ (ias_df[f"{prefix}_ymax"] - ias_df[f"{prefix}_ymin"]) / 2 + ias_df[f"{prefix}_ymin"]
394
+ ).round(2)
395
+ unique_midlines = list(np.unique(ias_df[f"{prefix}_y_center"]))
396
+ assigned_lines = [unique_midlines.index(x) for x in ias_df[f"{prefix}_y_center"]]
397
+ ias_df["assigned_line"] = assigned_lines
398
+ ias_df[f"{prefix}_number"] = np.arange(ias_df.shape[0])
399
+ return ias_df
400
+
401
+
402
+ def get_chars_list_from_words_list(ias_df, prefix="word"):
403
+ ias_df.reset_index(inplace=True, drop=True)
404
+ unique_midlines = list(np.unique(ias_df[f"{prefix}_y_center"]))
405
+ chars_list = []
406
+ for (idx, row), (next_idx, next_row) in zip(ias_df.iterrows(), ias_df.shift(-1).iterrows()):
407
+ word = str(row[prefix])
408
+ letter_width = (row[f"{prefix}_xmax"] - row[f"{prefix}_xmin"]) / len(word)
409
+ for i_w, letter in enumerate(word):
410
+ char_dict = dict(
411
+ in_word_number=idx,
412
+ in_word=word,
413
+ char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 2),
414
+ char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 2),
415
+ char_ymin=row[f"{prefix}_ymin"],
416
+ char_ymax=row[f"{prefix}_ymax"],
417
+ char=letter,
418
+ )
419
+
420
+ char_dict["char_x_center"] = round(
421
+ (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
422
+ )
423
+ char_dict["char_y_center"] = round(
424
+ (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
425
+ )
426
+
427
+ if i_w >= len(word) + 1:
428
+ break
429
+ char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
430
+ chars_list.append(char_dict)
431
+ if chars_list[-1]["char"] != " " and row.assigned_line == next_row.assigned_line:
432
+ char_dict = dict(
433
+ char_xmin=chars_list[-1]["char_xmax"],
434
+ char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 2),
435
+ char_ymin=row[f"{prefix}_ymin"],
436
+ char_ymax=row[f"{prefix}_ymax"],
437
+ char=" ",
438
+ )
439
+
440
+ char_dict["char_x_center"] = round(
441
+ (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
442
+ )
443
+ char_dict["char_y_center"] = round(
444
+ (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
445
+ )
446
+
447
+ char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
448
+ chars_list.append(char_dict)
449
+ chars_df = pd.DataFrame(chars_list)
450
+ chars_df.loc[:, ["in_word_number", "in_word"]] = chars_df.loc[:, ["in_word_number", "in_word"]].copy().ffill(axis=0)
451
+ return chars_df.to_dict("records")
452
+
453
+
454
+ def check_values(v1, v2):
455
+ """Function that compares two lists for equality.
456
+
457
+ Returns True if both lists are the same; False if they are not; and None if either is None."""
458
+
459
+ # Return None if either value is missing (None or NaN)
460
+ if v1 is None or v2 is None or pd.isna(v1) or pd.isna(v2):
461
+ return None
462
+
463
+ # Compare the two values
464
+ if v1 != v2:
465
+ return False
468
+ return True
469
+
470
+
471
+ def asc_lines_to_trials_by_trail_id(
472
+ lines: list,
473
+ paragraph_trials_only=True,
474
+ filename: str = "",
475
+ close_gap_between_words=True,
476
+ ias_files=[],
477
+ start_trial_at_keyword="START",
478
+ end_trial_at_keyword="END",
479
+ ) -> dict:
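+ # Walk the ASC message lines once to segment the recording into trials (TRIALID / TRIAL_RESULT), collecting per-trial metadata (item, condition, question responses, calibration), then walk each trial's lines again for stimulus character boxes and start/end timestamps.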
480
+
481
+ if len(ias_files) > 0:
482
+ ias_files_dict = {pl.Path(f.name).stem: f for f in ias_files}
483
+ else:
484
+ ias_files_dict = {}
485
+ if hasattr(filename, "name"):
486
+ filename = filename.name
487
+ subject = pl.Path(filename).stem
488
+ y_px = []
489
+ x_px = []
490
+ calibration_offset = []
491
+ calibration_max_error = []
492
+ calibration_time = []
493
+ calibration_avg_error = []
494
+ trial_var_block_lines = None
495
+ question_answer = None
496
+ question_correct = None
497
+ condition = "UNKNOWN"
498
+ item = "UNKNOWN"
499
+ depend = "UNKNOWN"
500
+ trial_index = None
501
+ fps = None
502
+ display_coords = None
503
+ trial_var_block_idx = -1
504
+ trials_dict = dict(paragraph_trials=[], paragraph_trial_IDs=[])
505
+ trial_idx = -1
506
+ trial_var_block_start_idx = -1
507
+ removed_trial_ids = []
508
+ ias_file = ""
509
+ trial_var_block_lines_list = []
510
+ if "\n".join(map(str.strip, lines)).find("TRIAL_VAR") != -1:
511
+ for idx, l in enumerate(tqdm(lines, desc=f"Checking for TRIAL_VAR lines for {filename}")):
512
+ if trial_var_block_start_idx == -1 and "MSG" not in l:
513
+ continue
514
+ if "TRIAL_VAR" in l:
515
+ if trial_var_block_start_idx == -1:
516
+ trial_var_block_start_idx = idx
517
+ continue
518
+ else:
519
+ if trial_var_block_start_idx != -1:
520
+ trial_var_block_stop_idx = idx
521
+ trial_var_block_lines = [
522
+ x.strip() for x in lines[trial_var_block_start_idx:trial_var_block_stop_idx]
523
+ ]
524
+ trial_var_block_lines_list.append(trial_var_block_lines)
525
+ trial_var_block_start_idx = -1
526
+ has_trial_var_lines = len(trial_var_block_lines_list) > 0
527
+ else:
528
+ has_trial_var_lines = False
529
+
530
+ for idx, l in enumerate(lines):
531
+ if "MSG" not in l:
532
+ continue
533
+ parts = l.strip().split(" ")
534
+ if "TRIALID" in l:
535
+ trial_id = re.split(r"[ :\t]+", l.strip())[-1]
536
+ trial_id_timestamp = parts[1]
537
+ trial_idx += 1
538
+ if trial_id[0] in ["F", "P", "E"]:
539
+
540
+ parse_dict = emf.parse_itemID(trial_id)
541
+ condition = parse_dict["condition"]
542
+ item = parse_dict["item"]
543
+ depend = parse_dict["depend"]
544
+ else:
545
+ parse_dict = {}
546
+ if trial_id[0] == "F":
547
+ trial_is = "question"
548
+ elif trial_id[0] == "P":
549
+ trial_is = "practice"
550
+ else:
551
+ if has_trial_var_lines:
552
+ trial_var_block_idx += 1
553
+ trial_var_block_lines = trial_var_block_lines_list[trial_var_block_idx]
554
+ image_lines = [s for s in trial_var_block_lines if "img" in s]
555
+ if len(image_lines) > 0:
556
+ item = image_lines[0].split(" ")[-1]
557
+ cond_lines = [s for s in trial_var_block_lines if "cond" in s]
558
+ if len(cond_lines) > 0:
559
+ condition = cond_lines[0].split(" ")[-1]
560
+ item_lines = [s for s in trial_var_block_lines if "item" in s]
561
+ if len(item_lines) > 0:
562
+ item = item_lines[0].split(" ")[-1]
563
+ trial_index_lines = [s for s in trial_var_block_lines if "Trial_Index" in s]
564
+ if len(trial_index_lines) > 0:
565
+ trial_index = trial_index_lines[0].split(" ")[-1]
566
+ question_key_lines = [s for s in trial_var_block_lines if "QUESTION_KEY_PRESSED" in s]
567
+ if len(question_key_lines) > 0:
568
+ question_answer = question_key_lines[0].split(" ")[-1]
569
+ question_response_lines = [s for s in trial_var_block_lines if " RESPONSE" in s]
570
+ if len(question_response_lines) > 0:
571
+ question_answer = question_response_lines[0].split(" ")[-1]
572
+ question_correct_lines = [
573
+ s for s in trial_var_block_lines if ("QUESTION_ACCURACY" in s) | (" ACCURACY" in s)
574
+ ]
575
+ if len(question_correct_lines) > 0:
576
+ question_correct = question_correct_lines[0].split(" ")[-1]
577
+ trial_is_lines = [s for s in trial_var_block_lines if "trial" in s]
578
+ if len(trial_is_lines) > 0:
579
+ trial_is_line = trial_is_lines[0].split(" ")[-1]
580
+ if "pract" in trial_is_line or "end" in trial_is_line:
581
+ trial_is = "practice"
582
+ trial_id = f"{trial_is}_{trial_id}"
583
+ else:
584
+ trial_is = "paragraph"
585
+ trial_id = f"{condition}_{trial_is}_{trial_id}"
586
+ trials_dict["paragraph_trials"].append(trial_idx)
587
+ trials_dict["paragraph_trial_IDs"].append(trial_id)
588
+ else:
589
+ trial_is = "paragraph"
590
+ trial_id = f"{condition}_{trial_is}_{trial_id}_{trial_idx}"
591
+ trials_dict["paragraph_trials"].append(trial_idx)
592
+ trials_dict["paragraph_trial_IDs"].append(trial_id)
593
+ else:
594
+ if len(trial_id) > 1:
595
+ condition = trial_id[1]
596
+ trial_is = "paragraph"
597
+ trials_dict["paragraph_trials"].append(trial_idx)
598
+ trials_dict["paragraph_trial_IDs"].append(trial_id)
599
+ trials_dict[trial_idx] = dict(
600
+ subject=subject,
601
+ filename=filename,
602
+ trial_idx=trial_idx,
603
+ trial_id=trial_id,
604
+ trial_id_idx=idx,
605
+ trial_id_timestamp=trial_id_timestamp,
606
+ trial_is=trial_is,
607
+ trial_var_block_lines=trial_var_block_lines,
608
+ seq=trial_idx,
609
+ item=item,
610
+ depend=depend,
611
+ condition=condition,
612
+ parse_dict=parse_dict,
613
+ )
614
+ if question_answer is not None:
615
+ trials_dict[trial_idx]["question_answer"] = question_answer
616
+ if question_correct is not None:
617
+ trials_dict[trial_idx]["question_correct"] = question_correct
618
+ if trial_index is not None:
619
+ trials_dict[trial_idx]["trial_index"] = trial_index
620
+ last_trial_skipped = False
621
+
622
+ elif "TRIAL_RESULT" in l or "stop_trial" in l:
623
+ trials_dict[trial_idx]["trial_result_idx"] = idx
624
+ trials_dict[trial_idx]["trial_result_timestamp"] = int(parts[0].split("\t")[1])
625
+ if len(parts) > 2:
626
+ trials_dict[trial_idx]["trial_result_number"] = int(parts[2])
627
+ elif "QUESTION_ANSWER" in l and not has_trial_var_lines:
628
+ trials_dict[trial_idx]["question_answer_idx"] = idx
629
+ trials_dict[trial_idx]["question_answer_timestamp"] = int(parts[0].split("\t")[1])
630
+ if len(parts) > 2:
631
+ trials_dict[trial_idx]["question_answer_question_trial"] = int(
632
+ pd.to_numeric(l.strip().split(" ")[-1].strip(), errors="coerce")
633
+ )
634
+ elif "KEYBOARD" in l:
635
+ trials_dict[trial_idx]["keyboard_press_idx"] = idx
636
+ trials_dict[trial_idx]["keyboard_press_timestamp"] = int(parts[0].split("\t")[1])
637
+ elif "DISPLAY COORDS" in l and display_coords is None:
638
+ display_coords = (float(parts[-4]), float(parts[-3]), float(parts[-2]), float(parts[-1]))
639
+ elif "GAZE_COORDS" in l and display_coords is None:
640
+ display_coords = (float(parts[-4]), float(parts[-3]), float(parts[-2]), float(parts[-1]))
641
+ elif "FRAMERATE" in l:
642
+ l_idx = parts.index(metadata_strs[2])
643
+ fps = float(parts[l_idx + 1])
644
+ elif "TRIAL ABORTED" in l or "TRIAL REPEATED" in l:
645
+ if not last_trial_skipped:
646
+ if trial_is == "paragraph":
647
+ trials_dict["paragraph_trials"].remove(trial_idx)
648
+ trial_idx -= 1
649
+ removed_trial_ids.append(trial_id)
650
+ last_trial_skipped = True
651
+ elif "IAREA FILE" in l:
652
+ ias_file = parts[-1]
653
+ ias_file_stem = ias_file.split("/")[-1].split("\\")[-1].split(".")[0]
654
+ trials_dict[trial_idx]["ias_file_from_asc"] = ias_file
655
+ trials_dict[trial_idx]["ias_file"] = ias_file_stem
656
+ if item == "UNKNOWN":
657
+ trials_dict[trial_idx]["item"] = ias_file_stem
658
+ if ias_file_stem in ias_files_dict:
659
+ try:
660
+ ias_file = ias_files_dict[ias_file_stem]
661
+ ias_df = read_ias_file(ias_file, prefix="word") # TODO make option if word or chars in ias
662
+ trials_dict[trial_idx]["words_list"] = ias_df.to_dict("records")
663
+ trials_dict[trial_idx]["chars_list"] = get_chars_list_from_words_list(ias_df, prefix="word")
664
+ except Exception as e:
665
+ ic(f"Reading ias file failed")
666
+ ic(e)
667
+ else:
668
+ ic(f"IAS file {ias_file_stem} not found")
669
+ elif "CALIBRATION" in l and "MSG" in l:
670
+ calibration_method = parts[3].strip()
671
+ if trial_idx > -1:
672
+ trials_dict[trial_idx]["calibration_method"] = calibration_method
673
+ elif "VALIDATION" in l and "MSG" in l and "ABORTED" not in l:
674
+ try:
675
+ calibration_time_line_parts = re.split(r"[ :\t]+", l.strip())
676
+ calibration_time.append(float(calibration_time_line_parts[1]))
677
+ calibration_avg_error.append(float(calibration_time_line_parts[9]))
678
+ calibration_max_error.append(float(calibration_time_line_parts[11]))
679
+ calibration_offset.append(float(calibration_time_line_parts[14]))
680
+ x_px.append(float(calibration_time_line_parts[-2].split(",")[0]))
681
+ y_px.append(float(calibration_time_line_parts[-2].split(",")[1]))
682
+ except Exception as e:
683
+ ic(f"parsing VALIDATION failed for line {l}")
684
+ trials_df = pd.DataFrame([trials_dict[i] for i in range(trial_idx) if i in trials_dict])
685
+
686
+ if (
687
+ question_correct is None
688
+ and "trial_result_number" in trials_df.columns
689
+ and "question_answer_question_trial" in trials_df.columns
690
+ ):
691
+ trials_df["question_answer_selection"] = trials_df["trial_result_number"].shift(-1).values
692
+ trials_df["correct_trial_answer_would_be"] = trials_df["question_answer_question_trial"].shift(-1).values
693
+ trials_df["question_correct"] = [
694
+ check_values(a, b)
695
+ for a, b in zip(trials_df["question_answer_selection"], trials_df["correct_trial_answer_would_be"])
696
+ ]
697
+ for pidx, prow in trials_df.loc[trials_df.trial_is == "paragraph", :].iterrows():
698
+ trials_dict[pidx]["question_correct"] = prow["question_correct"]
699
+ if prow["question_correct"] is not None:
700
+ trials_dict[pidx]["question_answer_selection"] = prow["question_answer_selection"]
701
+ trials_dict[pidx]["correct_trial_answer_would_be"] = prow["correct_trial_answer_would_be"]
702
+ else:
703
+ trials_dict[pidx]["question_answer_selection"] = None
704
+ trials_dict[pidx]["correct_trial_answer_would_be"] = None
705
+ if "question_correct" in trials_df.columns:
706
+ paragraph_trials_df = trials_df.loc[trials_df.trial_is == "paragraph", :]
707
+ overall_question_answer_value_counts = (
708
+ paragraph_trials_df["question_correct"].dropna().astype(int).value_counts().to_dict()
709
+ )
710
+ overall_question_answer_value_counts_normed = (
711
+ paragraph_trials_df["question_correct"].dropna().astype(int).value_counts(normalize=True).to_dict()
712
+ )
713
+ else:
714
+ overall_question_answer_value_counts = None
715
+ overall_question_answer_value_counts_normed = None
716
+ if paragraph_trials_only:
717
+ trials_dict_temp = trials_dict.copy()
718
+ for k in trials_dict_temp.keys():
719
+ if k not in ["paragraph_trials"] + trials_dict_temp["paragraph_trials"]:
720
+ trials_dict.pop(k)
721
+ if len(trials_dict_temp["paragraph_trials"]):
722
+ trial_idx = trials_dict_temp["paragraph_trials"][-1]
723
+ else:
724
+ return trials_dict
725
+ trials_dict["display_coords"] = display_coords
726
+ trials_dict["fps"] = fps
727
+ trials_dict["max_trial_idx"] = trial_idx
728
+ trials_dict["overall_question_answer_value_counts"] = overall_question_answer_value_counts
729
+ trials_dict["overall_question_answer_value_counts_normed"] = overall_question_answer_value_counts_normed
730
+ enum = (
731
+ trials_dict["paragraph_trials"]
732
+ if ("paragraph_trials" in trials_dict.keys() and paragraph_trials_only)
733
+ else range(len(trials_dict))
734
+ )
735
+ for trial_idx in enum:
736
+ if trial_idx not in trials_dict.keys():
737
+ continue
738
+ if "chars_list" in trials_dict[trial_idx]:
739
+ chars_list = trials_dict[trial_idx]["chars_list"]
740
+ else:
741
+ chars_list = []
742
+ if "display_coords" not in trials_dict[trial_idx].keys():
743
+ trials_dict[trial_idx]["display_coords"] = trials_dict["display_coords"]
744
+ trials_dict[trial_idx]["overall_question_answer_value_counts"] = trials_dict[
745
+ "overall_question_answer_value_counts"
746
+ ]
747
+ trials_dict[trial_idx]["overall_question_answer_value_counts_normed"] = trials_dict[
748
+ "overall_question_answer_value_counts_normed"
749
+ ]
750
+ trial_start_idx = trials_dict[trial_idx]["trial_id_idx"]
751
+ trial_end_idx = trials_dict[trial_idx]["trial_result_idx"]
752
+ trial_lines = lines[trial_start_idx:trial_end_idx]
753
+ if len(y_px) > 0:
754
+ trials_dict[trial_idx]["y_px"] = y_px
755
+ trials_dict[trial_idx]["x_px"] = x_px
756
+ if "calibration_method" not in trials_dict[trial_idx]:
757
+ trials_dict[trial_idx]["calibration_method"] = calibration_method
758
+ trials_dict[trial_idx]["calibration_offset"] = calibration_offset
759
+ trials_dict[trial_idx]["calibration_max_error"] = calibration_max_error
760
+ trials_dict[trial_idx]["calibration_time"] = calibration_time
761
+ trials_dict[trial_idx]["calibration_avg_error"] = calibration_avg_error
762
+ for idx, l in enumerate(trial_lines):
763
+ parts = l.strip().split(" ")
764
+ if "START" in l and " MSG" not in l:
765
+ trials_dict[trial_idx]["text_end_idx"] = trial_start_idx + idx
766
+ trials_dict[trial_idx]["start_idx"] = trial_start_idx + idx + 7
767
+ trials_dict[trial_idx]["start_time"] = int(parts[0].split("\t")[1])
768
+ elif "END" in l and "ENDBUTTON" not in l and " MSG" not in l:
769
+ trials_dict[trial_idx]["end_idx"] = trial_start_idx + idx - 2
770
+ trials_dict[trial_idx]["end_time"] = int(parts[0].split("\t")[1])
771
+ elif "MSG" not in l:
772
+ continue
773
+ elif "ENDBUTTON" in l:
774
+ trials_dict[trial_idx]["endbutton_idx"] = trial_start_idx + idx
775
+ trials_dict[trial_idx]["endbutton_time"] = int(parts[0].split("\t")[1])
776
+ elif "SYNCTIME" in l:
777
+ trials_dict[trial_idx]["synctime"] = trial_start_idx + idx
778
+ trials_dict[trial_idx]["synctime_time"] = int(parts[0].split("\t")[1])
779
+ elif start_trial_at_keyword in l:
780
+ trials_dict[trial_idx][f"{start_trial_at_keyword}_line_idx"] = trial_start_idx + idx
781
+ trials_dict[trial_idx][f"{start_trial_at_keyword}_time"] = int(parts[0].split("\t")[1])
782
+ elif "GAZE TARGET OFF" in l:
783
+ trials_dict[trial_idx]["gaze_targ_off_time"] = int(parts[0].split("\t")[1])
784
+ elif "GAZE TARGET ON" in l:
785
+ trials_dict[trial_idx]["gaze_targ_on_time"] = int(parts[0].split("\t")[1])
786
+ trials_dict[trial_idx]["gaze_targ_on_time_idx"] = trial_start_idx + idx
787
+ elif "DISPLAY_SENTENCE" in l: # some .asc files seem to use this
788
+ trials_dict[trial_idx]["gaze_targ_on_time"] = int(parts[0].split("\t")[1])
789
+ trials_dict[trial_idx]["gaze_targ_on_time_idx"] = trial_start_idx + idx
790
+ elif "DISPLAY TEXT" in l:
791
+ trials_dict[trial_idx]["text_start_idx"] = trial_start_idx + idx
792
+ elif "REGION CHAR" in l:
793
+ rg_idx = parts.index("CHAR")
794
+ if len(parts[rg_idx:]) > 8:
795
+ char = " "
796
+ idx_correction = 1
797
+ elif len(parts[rg_idx:]) == 3:
798
+ char = " "
799
+ if "REGION CHAR" not in trial_lines[idx + 1]:
800
+ parts = trial_lines[idx + 1].strip().split(" ")
801
+ idx_correction = -rg_idx - 4
802
+ else:
803
+ char = parts[rg_idx + 3]
804
+ idx_correction = 0
805
+ try:
806
+ char_dict = {
807
+ "char": char,
808
+ "char_xmin": float(parts[rg_idx + 4 + idx_correction]),
809
+ "char_ymin": float(parts[rg_idx + 5 + idx_correction]),
810
+ "char_xmax": float(parts[rg_idx + 6 + idx_correction]),
811
+ "char_ymax": float(parts[rg_idx + 7 + idx_correction]),
812
+ }
813
+ char_dict["char_y_center"] = round(
814
+ (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"], ndigits=2
815
+ )
816
+ char_dict["char_x_center"] = round(
817
+ (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
818
+ )
819
+ chars_list.append(char_dict)
820
+ except Exception as e:
821
+ ic(f"char_dict creation failed for parts {parts}")
822
+ ic(e)
823
+
824
+ if start_trial_at_keyword == "SYNCTIME" and "synctime_time" in trials_dict[trial_idx]:
825
+ trials_dict[trial_idx]["trial_start_time"] = trials_dict[trial_idx]["synctime_time"]
826
+ trials_dict[trial_idx]["trial_start_idx"] = trials_dict[trial_idx]["synctime"]
827
+ elif start_trial_at_keyword == "GAZE TARGET ON" and "gaze_targ_on_time" in trials_dict[trial_idx]:
828
+ trials_dict[trial_idx]["trial_start_time"] = trials_dict[trial_idx]["gaze_targ_on_time"]
829
+ trials_dict[trial_idx]["trial_start_idx"] = trials_dict[trial_idx]["gaze_targ_on_time_idx"]
830
+ elif start_trial_at_keyword == "START":
831
+ trials_dict[trial_idx]["trial_start_time"] = trials_dict[trial_idx]["start_time"]
832
+ trials_dict[trial_idx]["trial_start_idx"] = trials_dict[trial_idx]["start_idx"]
833
+ elif f"{start_trial_at_keyword}_time" in trials_dict[trial_idx]:
834
+ trials_dict[trial_idx]["trial_start_time"] = trials_dict[trial_idx][f"{start_trial_at_keyword}_time"]
835
+ trials_dict[trial_idx]["trial_start_idx"] = trials_dict[trial_idx][f"{start_trial_at_keyword}_line_idx"]
836
+ else:
837
+ trials_dict[trial_idx]["trial_start_time"] = trials_dict[trial_idx]["start_time"]
838
+ trials_dict[trial_idx]["trial_start_idx"] = trials_dict[trial_idx]["start_idx"]
839
+ if end_trial_at_keyword == "ENDBUTTON" and "endbutton_time" in trials_dict[trial_idx]:
840
+ trials_dict[trial_idx]["trial_end_time"] = trials_dict[trial_idx]["endbutton_time"]
841
+ trials_dict[trial_idx]["trial_end_idx"] = trials_dict[trial_idx]["endbutton_idx"]
842
+ elif end_trial_at_keyword == "END" and "end_idx" in trials_dict[trial_idx]:
843
+ trials_dict[trial_idx]["trial_end_time"] = trials_dict[trial_idx]["end_time"]
844
+ trials_dict[trial_idx]["trial_end_idx"] = trials_dict[trial_idx]["end_idx"]
845
+ elif end_trial_at_keyword == "KEYBOARD" and "keyboard_press_idx" in trials_dict[trial_idx]:
846
+ trials_dict[trial_idx]["trial_end_idx"] = trials_dict[trial_idx]["keyboard_press_idx"]
847
+ else:
848
+ trials_dict[trial_idx]["trial_end_idx"] = trials_dict[trial_idx]["trial_result_idx"]
849
+ if trials_dict[trial_idx]["trial_end_idx"] < trials_dict[trial_idx]["trial_start_idx"]:
850
+ raise ValueError(f"trial_start_idx is larger than trial_end_idx for trial_idx {trial_idx}")
851
+ if len(chars_list) > 0:
852
+ line_ycoords = []
853
+ for idx in range(len(chars_list)):
854
+ chars_list[idx]["char_y_center"] = round(
855
+ (chars_list[idx]["char_ymax"] - chars_list[idx]["char_ymin"]) / 2 + chars_list[idx]["char_ymin"],
856
+ ndigits=2,
857
+ )
858
+ if chars_list[idx]["char_y_center"] not in line_ycoords:
859
+ line_ycoords.append(chars_list[idx]["char_y_center"])
860
+ for idx in range(len(chars_list)):
861
+ chars_list[idx]["assigned_line"] = line_ycoords.index(chars_list[idx]["char_y_center"])
862
+
863
+ letter_width_avg = np.mean(
864
+ [x["char_xmax"] - x["char_xmin"] for x in chars_list if x["char_xmax"] > x["char_xmin"]]
865
+ )
866
+ line_heights = [round(abs(x["char_ymax"] - x["char_ymin"]), 3) for x in chars_list]
867
+ line_xcoords_all = [x["char_x_center"] for x in chars_list]
868
+ line_xcoords_no_pad = np.unique(line_xcoords_all)
869
+
870
+ line_ycoords_all = [x["char_y_center"] for x in chars_list]
871
+ line_ycoords_no_pad = np.unique(line_ycoords_all)
872
+
873
+ trials_dict[trial_idx]["x_char_unique"] = list(line_xcoords_no_pad)
874
+ trials_dict[trial_idx]["y_char_unique"] = list(line_ycoords_no_pad)
875
+ x_diff, y_diff = calc_xdiff_ydiff(
876
+ line_xcoords_no_pad, line_ycoords_no_pad, line_heights, allow_multiple_values=False
877
+ )
878
+ trials_dict[trial_idx]["x_diff"] = float(x_diff)
879
+ trials_dict[trial_idx]["y_diff"] = float(y_diff)
880
+ trials_dict[trial_idx]["num_char_lines"] = len(line_ycoords_no_pad)
881
+ trials_dict[trial_idx]["letter_width_avg"] = letter_width_avg
882
+ trials_dict[trial_idx]["line_heights"] = line_heights
883
+ words_list_from_func, chars_list_reconstructed = add_words(chars_list)
884
+ words_list = words_list_from_func
885
+
886
+ if close_gap_between_words: # TODO this may need to change the "in_word" col for the chars_df
887
+ for widx in range(1, len(words_list)):
888
+ if words_list[widx]["assigned_line"] == words_list[widx - 1]["assigned_line"]:
889
+ word_sep_half_width = (words_list[widx]["word_xmin"] - words_list[widx - 1]["word_xmax"]) / 2
890
+ words_list[widx - 1]["word_xmax"] = words_list[widx - 1]["word_xmax"] + word_sep_half_width
891
+ words_list[widx]["word_xmin"] = words_list[widx]["word_xmin"] - word_sep_half_width
892
+ else:
893
+ chars_df = pd.DataFrame(chars_list_reconstructed)
894
+ chars_df.loc[
895
+ chars_df["char"] == " ", ["in_word", "in_word_number", "num_letters_from_start_of_word"]
896
+ ] = pd.NA
897
+ chars_list_reconstructed = chars_df.to_dict("records")
898
+ trials_dict[trial_idx]["words_list"] = words_list
899
+ trials_dict[trial_idx]["chars_list"] = chars_list_reconstructed
900
+ return trials_dict
901
+
902
+
903
+ def get_lines_from_file(uploaded_file, asc_encoding="ISO-8859-15"):
904
+ if isinstance(uploaded_file, str) or isinstance(uploaded_file, pl.Path):
905
+ with open(uploaded_file, "r", encoding=asc_encoding) as f:
906
+ lines = f.readlines()
907
+ else:
908
+ stringio = StringIO(uploaded_file.getvalue().decode(asc_encoding))
909
+ loaded_str = stringio.read()
910
+ lines = loaded_str.split("\n")
911
+ return lines
912
+
913
+
914
+ def file_to_trials_and_lines(
915
+ uploaded_file,
916
+ asc_encoding: str = "ISO-8859-15",
917
+ close_gap_between_words=True,
918
+ paragraph_trials_only=True,
919
+ uploaded_ias_files=[],
920
+ trial_start_keyword="START",
921
+ end_trial_at_keyword="END",
922
+ ):
923
+ lines = get_lines_from_file(uploaded_file, asc_encoding=asc_encoding)
924
+ trials_dict = asc_lines_to_trials_by_trail_id(
925
+ lines,
926
+ paragraph_trials_only,
927
+ uploaded_file,
928
+ close_gap_between_words=close_gap_between_words,
929
+ ias_files=uploaded_ias_files,
930
+ start_trial_at_keyword=trial_start_keyword,
931
+ end_trial_at_keyword=end_trial_at_keyword,
932
+ )
933
+
934
+ if "paragraph_trials" not in trials_dict.keys() and "trial_is" in trials_dict[0].keys():
935
+ paragraph_trials = []
936
+ for k in range(trials_dict["max_trial_idx"]):
937
+ if trials_dict[k]["trial_is"] == "paragraph":
938
+ paragraph_trials.append(k)
939
+ trials_dict["paragraph_trials"] = paragraph_trials
940
+
941
+ enum = (
942
+ trials_dict["paragraph_trials"]
943
+ if paragraph_trials_only and "paragraph_trials" in trials_dict.keys()
944
+ else range(trials_dict["max_trial_idx"])
945
+ )
946
+ for k in enum:
947
+ if "chars_list" in trials_dict[k].keys():
948
+ max_line = trials_dict[k]["chars_list"][-1]["assigned_line"]
949
+ words_on_lines = {x: [] for x in range(max_line + 1)}
950
+ [words_on_lines[x["assigned_line"]].append(x["char"]) for x in trials_dict[k]["chars_list"]]
951
+ line_list = ["".join([s for s in v]) for idx, v in words_on_lines.items()]
952
+ sentences_temp = "".join([x["char"] for x in trials_dict[k]["chars_list"]])
953
+ sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<=\.|\?)", sentences_temp)
954
+ text = "\n".join([x for x in line_list])
955
+ trials_dict[k]["sentence_list"] = [s for s in sentences if len(s) > 0]
956
+ trials_dict[k]["line_list"] = line_list
957
+ trials_dict[k]["text"] = text
958
+ trials_dict[k]["max_line"] = max_line
959
+
960
+ return trials_dict, lines
961
+
962
+
963
+ def discard_empty_str_from_list(l):
964
+ return [x for x in l if len(x) > 0]
965
+
966
+
967
+ def make_folders(gradio_temp_folder, gradio_temp_unzipped_folder, PLOTS_FOLDER):
968
+ gradio_temp_folder.mkdir(exist_ok=True)
969
+ gradio_temp_unzipped_folder.mkdir(exist_ok=True)
970
+ PLOTS_FOLDER.mkdir(exist_ok=True)
971
+ return 0
972
+
973
+
974
+ def plotly_plot_with_image(
975
+ dffix,
976
+ trial,
977
+ algo_choice,
978
+ saccade_df=None,
979
+ to_plot_list=["Uncorrected Fixations", "Corrected Fixations", "Word boxes"],
980
+ lines_in_plot="Uncorrected",
981
+ scale_factor=0.5,
982
+ font="DejaVu Sans Mono",
983
+ box_annotations: list = None,
984
+ ):
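+ # Render the stimulus (words/characters and their boxes) to a temporary PNG with matplotlib, then build a plotly figure that uses this image as background and overlays fixations and saccades as interactive traces.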
985
+ mpl_fig, img_width, img_height = matplotlib_plot_df(
986
+ dffix,
987
+ trial,
988
+ algo_choice,
989
+ None,
990
+ desired_dpi=300,
991
+ fix_to_plot=[],
992
+ stim_info_to_plot=to_plot_list,
993
+ font=font,
994
+ box_annotations=box_annotations,
995
+ )
996
+ mpl_fig.savefig(TEMP_FIGURE_STIMULUS_PATH)
997
+ plt.close(mpl_fig)
998
+ if lines_in_plot == "Uncorrected":
999
+ uncorrected_plot_mode = "markers+lines+text"
1000
+ else:
1001
+ uncorrected_plot_mode = "markers+text"
1002
+
1003
+ if lines_in_plot == "Corrected":
1004
+ corrected_plot_mode = "markers+lines+text"
1005
+ else:
1006
+ corrected_plot_mode = "markers+text"
1007
+
1008
+ if lines_in_plot == "Both":
1009
+ uncorrected_plot_mode = "markers+lines+text"
1010
+ corrected_plot_mode = "markers+lines+text"
1011
+
1012
+ fig = go.Figure()
1013
+ fig.add_trace(
1014
+ go.Scatter(
1015
+ x=[0, img_width * scale_factor],
1016
+ y=[img_height * scale_factor, 0],
1017
+ mode="markers",
1018
+ marker_opacity=0,
1019
+ name="scale_helper",
1020
+ )
1021
+ )
1022
+
1023
+ fig.update_xaxes(visible=False, range=[0, img_width * scale_factor])
1024
+
1025
+ fig.update_yaxes(
1026
+ visible=False,
1027
+ range=[img_height * scale_factor, 0],
1028
+ scaleanchor="x",
1029
+ )
1030
+ if (
1031
+ "Words" in to_plot_list
1032
+ or "Word boxes" in to_plot_list
1033
+ or "Character boxes" in to_plot_list
1034
+ or "Characters" in to_plot_list
1035
+ ):
1036
+ imsource = Image.open(str(TEMP_FIGURE_STIMULUS_PATH))
1037
+ fig.add_layout_image(
1038
+ dict(
1039
+ x=0,
1040
+ sizex=img_width * scale_factor,
1041
+ y=0,
1042
+ sizey=img_height * scale_factor,
1043
+ xref="x",
1044
+ yref="y",
1045
+ opacity=1.0,
1046
+ layer="below",
1047
+ sizing="stretch",
1048
+ source=imsource,
1049
+ )
1050
+ )
1051
+
1052
+ duration_scaled = dffix.duration - dffix.duration.min()
1053
+ duration_scaled = ((duration_scaled / duration_scaled.max()) - 0.5) * 3
1054
+ duration = sigmoid(duration_scaled) * 50 * scale_factor
1055
+ if "Uncorrected Fixations" in to_plot_list:
1056
+ fig.add_trace(
1057
+ go.Scatter(
1058
+ x=dffix.x * scale_factor,
1059
+ y=dffix.y * scale_factor,
1060
+ mode=uncorrected_plot_mode,
1061
+ name="Raw fixations",
1062
+ marker=dict(
1063
+ color=COLORS[-1],
1064
+ symbol="arrow",
1065
+ size=duration.values,
1066
+ angleref="previous",
1067
+ ),
1068
+ line=dict(color=COLORS[-1], width=2 * scale_factor),
1069
+ text=np.arange(dffix.shape[0]),
1070
+ textposition="top right",
1071
+ textfont=dict(
1072
+ family="sans serif",
1073
+ size=23 * scale_factor,
1074
+ color=COLORS[-1],
1075
+ ),
1076
+ hovertext=[f"x:{x}, y:{y}, n:{num}" for x, y, num in zip(dffix.x, dffix[f"y"], range(dffix.shape[0]))],
1077
+ opacity=0.9,
1078
+ )
1079
+ )
1080
+
1081
+ if "Corrected Fixations" in to_plot_list:
1082
+ if isinstance(algo_choice, list):
1083
+ algo_choices = algo_choice
1084
+ repeats = range(len(algo_choice))
1085
+ else:
1086
+ algo_choices = [algo_choice]
1087
+ repeats = range(1)
1088
+ for algoIdx in repeats:
1089
+ algo_choice = algo_choices[algoIdx]
1090
+ if f"y_{algo_choice}" in dffix.columns:
1091
+ fig.add_trace(
1092
+ go.Scatter(
1093
+ x=dffix.x * scale_factor,
1094
+ y=dffix.loc[:, f"y_{algo_choice}"] * scale_factor,
1095
+ mode=corrected_plot_mode,
1096
+ name=algo_choice,
1097
+ marker=dict(
1098
+ color=COLORS[algoIdx],
1099
+ symbol="arrow",
1100
+ size=duration.values,
1101
+ angleref="previous",
1102
+ ),
1103
+ line=dict(color=COLORS[algoIdx], width=1.5 * scale_factor),
1104
+ text=np.arange(dffix.shape[0]),
1105
+ textposition="top center",
1106
+ textfont=dict(
1107
+ family="sans serif",
1108
+ size=22 * scale_factor,
1109
+ color=COLORS[algoIdx],
1110
+ ),
1111
+ hovertext=[
1112
+ f"x:{x}, y:{y}, n:{num}"
1113
+ for x, y, num in zip(dffix.x, dffix[f"y_{algo_choice}"], range(dffix.shape[0]))
1114
+ ],
1115
+ opacity=0.9,
1116
+ )
1117
+ )
1118
+ if "Saccades" in to_plot_list:
1119
+
1120
+ duration_scaled = saccade_df.duration - saccade_df.duration.min()
1121
+ duration_scaled = ((duration_scaled / duration_scaled.max()) - 0.5) * 3
1122
+ duration = sigmoid(duration_scaled) * 65 * scale_factor
1123
+ starting_coordinates = [tuple(row * scale_factor) for row in saccade_df.loc[:, ["xs", "ys"]].values]
1124
+ ending_coordinates = [tuple(row * scale_factor) for row in saccade_df.loc[:, ["xe", "ye"]].values]
1125
+ for sidx, (start, end) in enumerate(zip(starting_coordinates, ending_coordinates)):
1126
+ if sidx == 0:
1127
+ show_legend = True
1128
+ else:
1129
+ show_legend = False
1130
+
1131
+ fig.add_trace(
1132
+ go.Scatter(
1133
+ x=[start[0], end[0]],
1134
+ y=[start[1], end[1]],
1135
+ mode="markers+lines+text",
1136
+ line=dict(color=COLORS[-1], width=1.5 * scale_factor, dash="dash"),
1137
+ showlegend=show_legend,
1138
+ legendgroup="1",
1139
+ name="Saccades",
1140
+ text=sidx,
1141
+ textposition="top center",
1142
+ textfont=dict(family="sans serif", size=22 * scale_factor, color=COLORS[-1]),
1143
+ marker=dict(
1144
+ color=COLORS[-1],
1145
+ symbol="arrow",
1146
+ size=duration.values,
1147
+ angleref="previous",
1148
+ ),
1149
+ )
1150
+ )
1151
+ if "Saccades snapped to line" in to_plot_list:
1152
+
1153
+ duration_scaled = saccade_df.duration - saccade_df.duration.min()
1154
+ duration_scaled = ((duration_scaled / duration_scaled.max()) - 0.5) * 3
1155
+ duration = sigmoid(duration_scaled) * 65 * scale_factor
1156
+
1157
+ if isinstance(algo_choice, list):
1158
+ algo_choices = algo_choice
1159
+ repeats = range(len(algo_choice))
1160
+ else:
1161
+ algo_choices = [algo_choice]
1162
+ repeats = range(1)
1163
+ for algoIdx in repeats:
1164
+ algo_choice = algo_choices[algoIdx]
1165
+ if f"ys_{algo_choice}" in saccade_df.columns:
1166
+ starting_coordinates = [
1167
+ tuple(row * scale_factor) for row in saccade_df.loc[:, ["xs", f"ys_{algo_choice}"]].values
1168
+ ]
1169
+ ending_coordinates = [
1170
+ tuple(row * scale_factor) for row in saccade_df.loc[:, ["xe", f"ye_{algo_choice}"]].values
1171
+ ]
1172
+ for sidx, (start, end) in enumerate(zip(starting_coordinates, ending_coordinates)):
1173
+ if sidx == 0:
1174
+ show_legend = True
1175
+ else:
1176
+ show_legend = False
1177
+ fig.add_trace(
1178
+ go.Scatter(
1179
+ x=[start[0], end[0]],
1180
+ y=[start[1], end[1]],
1181
+ mode="markers+lines",
1182
+ line=dict(color=COLORS[algoIdx], width=1.5 * scale_factor, dash="dash"),
1183
+ showlegend=show_legend,
1184
+ legendgroup="2",
1185
+ text=sidx,
1186
+ textposition="top center",
1187
+ textfont=dict(family="sans serif", size=22 * scale_factor, color=COLORS[algoIdx]),
1188
+ name="Saccades snapped to line",
1189
+ marker=dict(
1190
+ color=COLORS[algoIdx],
1191
+ symbol="arrow",
1192
+ size=duration.values,
1193
+ angleref="previous",
1194
+ ),
1195
+ )
1196
+ )
1197
+ fig.update_layout(
1198
+ plot_bgcolor=None,
1199
+ width=img_width * scale_factor,
1200
+ height=img_height * scale_factor,
1201
+ margin={"l": 0, "r": 0, "t": 0, "b": 0},
1202
+ legend=dict(orientation="h", yanchor="bottom", y=-0.1, xanchor="right", x=0.8),
1203
+ )
1204
+
1205
+ for trace in fig["data"]:
1206
+ if trace["name"] == "scale_helper":
1207
+ trace["showlegend"] = False
1208
+ return fig
1209
+
1210
+
1211
+ def plot_fix_measure(
1212
+ dffix,
1213
+ plot_choices,
1214
+ x_axis_selection,
1215
+ margin=dict(t=40, l=10, r=10, b=1),
1216
+ label_start="Fixation",
1217
+ ):
1218
+ y_label = f"{label_start} Feature"
1219
+ if x_axis_selection == "Index":
1220
+ num_datapoints = dffix.shape[0]
1221
+ x_label = f"{label_start} Number"
1222
+ x_nums = np.arange(num_datapoints)
1223
+ elif x_axis_selection == "Start Time":
1224
+ x_label = f"{label_start} Start Time"
1225
+ x_nums = dffix["start_time"]
1226
+
1227
+ layout = dict(
1228
+ plot_bgcolor="white",
1229
+ autosize=True,
1230
+ margin=margin,
1231
+ xaxis=dict(
1232
+ title=x_label,
1233
+ linecolor="black",
1234
+ range=[x_nums.min() - 1, x_nums.max() + 1],
1235
+ showgrid=False,
1236
+ mirror="all",
1237
+ showline=True,
1238
+ ),
1239
+ yaxis=dict(
1240
+ title=y_label,
1241
+ side="left",
1242
+ linecolor="black",
1243
+ showgrid=False,
1244
+ mirror="all",
1245
+ showline=True,
1246
+ ),
1247
+ legend=dict(orientation="v", yanchor="middle", y=0.95, xanchor="left", x=1.05),
1248
+ )
1249
+
1250
+ fig = go.Figure(layout=layout)
1251
+ for pidx, plot_choice in enumerate(plot_choices):
1252
+ fig.add_trace(
1253
+ go.Scatter(
1254
+ x=x_nums,
1255
+ y=dffix.loc[:, plot_choice],
1256
+ mode="markers",
1257
+ name=plot_choice,
1258
+ marker_color=COLORS[pidx],
1259
+ marker_size=3,
1260
+ showlegend=True,
1261
+ )
1262
+ )
1263
+ fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor="black")
1264
+
1265
+ return fig
1266
+
1267
+
1268
+ def plot_y_corr(dffix, algo_choice, margin=dict(t=40, l=10, r=10, b=1)):
1269
+ num_datapoints = len(dffix.x)
1270
+
1271
+ layout = dict(
1272
+ plot_bgcolor="white",
1273
+ autosize=True,
1274
+ margin=margin,
1275
+ xaxis=dict(
1276
+ title="Fixation Index",
1277
+ linecolor="black",
1278
+ range=[-1, num_datapoints + 1],
1279
+ showgrid=False,
1280
+ mirror="all",
1281
+ showline=True,
1282
+ ),
1283
+ yaxis=dict(
1284
+ title="y correction",
1285
+ side="left",
1286
+ linecolor="black",
1287
+ showgrid=False,
1288
+ mirror="all",
1289
+ showline=True,
1290
+ ),
1291
+ legend=dict(orientation="v", yanchor="middle", y=0.95, xanchor="left", x=1.05),
1292
+ )
1293
+ if isinstance(dffix, dict):
1294
+ dffix = dffix["value"]
1295
+ algo_string = algo_choice[0] if isinstance(algo_choice, list) else algo_choice
1296
+ if f"y_{algo_string}_correction" not in dffix.columns:
1297
+ ic("No line-assignment column found in dataframe")
1298
+ return go.Figure(layout=layout)
1301
+
1302
+ fig = go.Figure(layout=layout)
1303
+
1304
+ if isinstance(algo_choice, list):
1305
+ algo_choices = algo_choice
1306
+ repeats = range(len(algo_choice))
1307
+ else:
1308
+ algo_choices = [algo_choice]
1309
+ repeats = range(1)
1310
+ for algoIdx in repeats:
1311
+ algo_choice = algo_choices[algoIdx]
1312
+ fig.add_trace(
1313
+ go.Scatter(
1314
+ x=np.arange(num_datapoints),
1315
+ y=dffix.loc[:, f"y_{algo_choice}_correction"],
1316
+ mode="markers",
1317
+ name=f"{algo_choice} y correction",
1318
+ marker_color=COLORS[algoIdx],
1319
+ marker_size=3,
1320
+ showlegend=True,
1321
+ )
1322
+ )
1323
+ fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor="black")
1324
+
1325
+ return fig
1326
+
1327
+
1328
+ def download_example_ascs(EXAMPLES_FOLDER, EXAMPLES_ASC_ZIP_FILENAME, OSF_DOWNLAOD_LINK, EXAMPLES_FOLDER_PATH):
1329
+ if not os.path.isdir(EXAMPLES_FOLDER):
1330
+ os.mkdir(EXAMPLES_FOLDER)
1331
+
1332
+ if not os.path.exists(EXAMPLES_ASC_ZIP_FILENAME):
1333
+ download_url(OSF_DOWNLAOD_LINK, EXAMPLES_ASC_ZIP_FILENAME)
1334
+
1335
+ if os.path.exists(EXAMPLES_ASC_ZIP_FILENAME):
1336
+ if EXAMPLES_FOLDER_PATH.exists():
1337
+ EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
1338
+ if len(EXAMPLE_ASC_FILES) != 4:
1339
+ try:
1340
+ with zipfile.ZipFile(EXAMPLES_ASC_ZIP_FILENAME, "r") as zip_ref:
1341
+ zip_ref.extractall(EXAMPLES_FOLDER)
1342
+ except Exception as e:
1343
+ ic(e)
1344
+ ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
1345
+
1346
+ EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
1347
+ else:
1348
+ EXAMPLE_ASC_FILES = []
1349
+ return EXAMPLE_ASC_FILES
word_measures.md ADDED
@@ -0,0 +1,58 @@
1
+ #### Column names for Word measures
2
+ Some features were adapted from the popEye R package ([github](https://github.com/sascha2schroeder/popEye)).
3
+ If a column depends on a line assignment, then _ALGORITHM_NAME is appended to the end of the column name.
4
+
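+ As an illustration (not part of the app's code), the algorithm-specific columns can be pulled out of an exported word-measures table with pandas. The file name `word_measures.csv` and the algorithm name `DIST` below are placeholders for your own export and chosen algorithm:
+
+ ```python
+ import pandas as pd
+
+ # Load the exported word measures (placeholder file name)
+ word_measures = pd.read_csv("word_measures.csv")
+
+ # Columns produced for the line-assignment algorithm called "DIST" (placeholder name)
+ dist_cols = [c for c in word_measures.columns if c.endswith("_DIST")]
+
+ # Identifiers plus that algorithm's measures in one frame
+ dist_measures = word_measures[["subject", "trial_id", "word_number", "word"] + dist_cols]
+ print(dist_measures.head())
+ ```
+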
5
+ - subject: Subject name or ID
6
+ - trial_id: Trial ID
7
+ - item: Item ID
8
+ - condition: Condition (if applicable)
9
+ - word_number: Number of word in trial
10
+ - word_length: Number of characters in word
11
+ - word_xmin: x-coordinate of left side of bounding box
12
+ - word_xmax: x-coordinate of right side of bounding box
13
+ - word_ymin: y-coordinate of top of bounding box
14
+ - word_ymax: y-coordinate of bottom of bounding box
15
+ - word_x_center: x-coordinate of center of bounding box
16
+ - word_y_center: y-coordinate of center of bounding box
17
+ - assigned_line: Line number to which the word belongs
18
+ - word: Text of word
19
+ - blink_ALGORITHM_NAME: Variable indicating whether there was a blink directly before, during, or directly after the word was fixated
20
+ - number_of_fixations_ALGORITHM_NAME: Number of fixations on the word during the whole trial
21
+ - initial_fixation_duration_ALGORITHM_NAME: Duration of the initial fixation on that word
22
+ - first_of_many_duration_ALGORITHM_NAME: Duration of the initial fixation on that word, but only if there was more than one fixation on the word
23
+ - total_fixation_duration_ALGORITHM_NAME: Total time the word was read during the trial in ms (total reading time)
24
+ - gaze_duration_ALGORITHM_NAME: The sum duration of all fixations inside a word until the word is exited for the first time
25
+ - go_past_duration_ALGORITHM_NAME: Go-past time is the sum duration of all fixations from when the interest area is first entered until when it is first exited to the right, including any regressions to the left that occur during that time period (see the toy example at the end of this document)
26
+ - second_pass_duration_ALGORITHM_NAME: Second pass duration is the sum duration of all fixations inside an interest area during the second pass over that interest area.
27
+ - initial_landing_position_ALGORITHM_NAME: Landing position of the first fixation on the word (position within the word, counted from its first character)
28
+ - initial_landing_distance_ALGORITHM_NAME: Landing distance of the first fixation on the word, measured from the start of the word
29
+ - landing_distances_ALGORITHM_NAME: Landing distances of all fixations that landed on the word
30
+ - number_of_regressions_in_ALGORITHM_NAME: Number of regressions into the word
31
+ - singlefix_sac_in_ALGORITHM_NAME: Incoming saccade length (in letters) for the first fixation on the word when it was fixated only once during first-pass reading
32
+ - firstrun_nfix_ALGORITHM_NAME: Number of fixations made on the word during first-pass reading
33
+ - singlefix_land_ALGORITHM_NAME: Landing position (letter) of the first fixation on the word when it was fixated only once during first-pass reading
34
+ - firstrun_skip_ALGORITHM_NAME: Variable indicating whether the word was skipped during first-pass reading
35
+ - firstfix_cland_ALGORITHM_NAME: Centered landing position of the first fixation on the word (Vitu et al., 2001: landing position - ((wordlength + 1) / 2))
36
+ - singlefix_dur_ALGORITHM_NAME: Duration of the first fixation on the word when it was fixated only once during first-pass reading
37
+ - firstrun_gopast_sel_ALGORITHM_NAME: Sum of all fixation durations on the word from the time it was entered until it was left to the right (selective go-past time: go-past time minus the time of the regression path)
38
+ - firstfix_land_ALGORITHM_NAME: Landing position (letter) of the first fixation on the word
39
+ - skip_ALGORITHM_NAME: Variable indicating whether the word was fixated in the trial
40
+ - firstrun_refix_ALGORITHM_NAME: Variable indicating whether the word was refixated during first-pass reading
41
+ - firstrun_reg_out_ALGORITHM_NAME: Variable indicating whether there was a regression from the word during first-pass reading
43
+ - firstfix_sac_out_ALGORITHM_NAME: Outgoing saccade length (in letters) for the first fixation on the word
44
+ - reread_ALGORITHM_NAME: Variable indicating whether the word was reread at least once during the trial
45
+ - refix_ALGORITHM_NAME: Variable indicating whether the word has been refixated at least once during a trial
46
+ - reg_in_ALGORITHM_NAME: Variable indicating whether there was at least one regression into the word
47
+ - firstrun_dur_ALGORITHM_NAME: Time the word was read during first-pass reading (gaze duration)
48
+ - firstfix_sac_in_ALGORITHM_NAME: Incoming saccade length (in letters) for the first fixation on the word
49
+ - singlefix_ALGORITHM_NAME: Variable indicating whether the word was fixated only once during first-pass reading
50
+ - firstrun_gopast_ALGORITHM_NAME: Sum of all fixation durations from the time the word was entered until it was left to the right (go-past time/regression path duration)
51
+ - nrun_ALGORITHM_NAME: Number of times the word was reread within the trial ("reread" means that it was read again after it has been left to the left or right)
52
+ - singlefix_cland_ALGORITHM_NAME: Centred landing position of the first fixation on the word when it was fixated only once during first-pass reading
53
+ - reg_out_ALGORITHM_NAME: Variable indicating whether there was at least one regression from the word
54
+ - firstfix_dur_ALGORITHM_NAME: Duration of the first fixation on the word (first fixation duration)
55
+ - firstfix_launch_ALGORITHM_NAME: Launch site distance (incoming saccade length until the space before the word)
56
+ - singlefix_sac_out_ALGORITHM_NAME: Outgoing saccade length (in letters) for the first fixation on the word when it was fixated only once during first-pass reading
57
+ - firstrun_reg_in_ALGORITHM_NAME: Variable indicating whether there was a regression into the word during first-pass reading
58
+ - singlefix_launch_ALGORITHM_NAME: Launch site distance (incoming saccade length until the space before the word) for the first fixation on the word when it was fixated only once during first-pass reading
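+
+ ##### Worked toy example: gaze duration, go-past duration, total reading time
+
+ The following is a self-contained toy sketch (independent of the app's code) that illustrates how these three duration measures differ for one target word, given a made-up fixation sequence of (word index, duration in ms) pairs:
+
+ ```python
+ # Toy fixation sequence: (index of the word fixated, fixation duration in ms)
+ fixations = [(0, 200), (1, 180), (2, 220), (1, 150), (2, 190), (3, 210), (2, 170)]
+ target = 2  # the word we compute measures for
+
+ # Total reading time: all fixations on the word, anywhere in the trial
+ total_fixation_duration = sum(d for w, d in fixations if w == target)
+
+ # Index of the first fixation that enters the word
+ first_entry = next(i for i, (w, _) in enumerate(fixations) if w == target)
+
+ # Gaze duration: fixations on the word until it is left for the first time
+ gaze_duration = 0
+ for w, d in fixations[first_entry:]:
+     if w != target:
+         break
+     gaze_duration += d
+
+ # Go-past duration: everything from first entry until a later word is fixated,
+ # including regressions back to earlier words
+ go_past_duration = 0
+ for w, d in fixations[first_entry:]:
+     if w > target:
+         break
+     go_past_duration += d
+
+ print(total_fixation_duration, gaze_duration, go_past_duration)  # 580 220 560
+ ```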