feat: split_gaze_data by column values #859

Open · wants to merge 32 commits into base: main

Changes from all commits (32 commits)
1c6c769
feat: split_gaze_data into trial
SiQube Oct 23, 2024
976695b
docs: Add missing modules to documentation (#866)
dkrako Oct 23, 2024
953ade3
hotfix: check whether public dataset has gaze files (#872)
SiQube Oct 24, 2024
b842bdb
docs: correctly add EyeTracker class to gaze module (#876)
dkrako Oct 24, 2024
cc1bae1
feat: add support for .ias files in stimulus.text.from_file() (#858)
SiQube Oct 24, 2024
5417804
dataset: beijing sentence corpus (#857)
SiQube Oct 24, 2024
92b49a7
dataset: add InteRead dataset (#862)
SiQube Oct 24, 2024
f0b69a9
fix: copy event resource files instead of moving them to events direc…
SiQube Oct 24, 2024
e6a9ced
hotfix: CopCo dataset precomputed events loading (#873)
SiQube Oct 24, 2024
1b8c4bd
ci: ignore too-many-public-methods (#882)
dkrako Oct 25, 2024
69ef837
ci: pre-commit autoupdate (#889)
pre-commit-ci[bot] Oct 29, 2024
cfbce95
ci: pre-commit autoupdate (#890)
pre-commit-ci[bot] Nov 5, 2024
47e734d
build: add support for python 3.13 (#845)
SiQube Nov 7, 2024
166b076
build: update nbsphinx requirement from <0.9.5,>=0.8.8 to >=0.8.8,<0.…
dependabot[bot] Nov 7, 2024
495e5d9
ci: pre-commit autoupdate (#896)
pre-commit-ci[bot] Nov 12, 2024
b691e6d
build: update setuptools-git-versioning requirement from <2 to <3 (#895)
dependabot[bot] Nov 12, 2024
88113c8
hotfix: download link fakenewsperception dataset (#897)
SiQube Nov 13, 2024
21fd0d2
feat: Store metadata from ASC in experiment metadata (#884)
saeub Nov 14, 2024
0856658
move split method to gaze dataframe
SiQube Nov 17, 2024
4751e41
Merge branch 'main' into split-gaze-files-into-trial-dataframes
SiQube Nov 17, 2024
b47ad31
ci: pre-commit autoupdate (#899)
pre-commit-ci[bot] Nov 18, 2024
5f5525a
ci: pre-commit autoupdate (#900)
pre-commit-ci[bot] Nov 27, 2024
c30bd9e
Add trial_columns argument in from_asc() (#898)
saeub Nov 27, 2024
7a25297
ci: pre-commit autoupdate (#902)
pre-commit-ci[bot] Dec 3, 2024
96141d5
docs: add CITATION.cff (#901)
SiQube Dec 8, 2024
5bf55f1
ci: pre-commit autoupdate (#904)
pre-commit-ci[bot] Dec 10, 2024
e4b3e8f
ci: add dataset section to release drafter (#903)
dkrako Dec 10, 2024
eb8aee5
move split method to gaze dataframe
SiQube Nov 17, 2024
ecd6b5c
add tests for number of split files
SiQube Dec 29, 2024
7229cd9
Merge branch 'main' into split-gaze-files-into-trial-dataframes
SiQube Dec 29, 2024
52902af
Merge branch 'main' into split-gaze-files-into-trial-dataframes
SiQube Dec 29, 2024
8994bb1
Merge branch 'main' into split-gaze-files-into-trial-dataframes
SiQube Jan 8, 2025
24 changes: 24 additions & 0 deletions src/pymovements/dataset/dataset.py
@@ -231,6 +231,30 @@ def load_precomputed_reading_measures(self) -> None:
self.paths,
)

def split_gaze_data(
self,
by: list[str] | str,
Review comment (Contributor): let's use Sequence here from collections, this way it's more in line with the polars signature: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.partition_by.html

) -> None:
"""Split gaze data into separated GazeDataFrame's.

Parameters
----------
by: list[str] | str
Column(s) to split dataframe by.
"""
fileinfo_dicts = self.fileinfo['gaze'].to_dicts()

all_gaze_frames = []
all_fileinfo_rows = []

for frame, fileinfo_row in zip(self.gaze, fileinfo_dicts):
split_frames = frame.split(by=by)
all_gaze_frames.extend(split_frames)
all_fileinfo_rows.extend([fileinfo_row] * len(split_frames))

self.gaze = all_gaze_frames
self.fileinfo['gaze'] = pl.concat([pl.from_dict(row) for row in all_fileinfo_rows])

def split_precomputed_events(
self,
by: list[str] | str,
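As a usage illustration of the new Dataset.split_gaze_data() method above, here is a minimal sketch. The dataset name 'ToyDataset' and the 'trial_id' column are placeholder assumptions, not part of this diff:

import pymovements as pm

# Placeholder dataset definition and path; substitute your own.
dataset = pm.Dataset('ToyDataset', path='data/ToyDataset')
dataset.load()

# Before the split: one GazeDataFrame per gaze file.
n_files = len(dataset.gaze)

# Split every loaded GazeDataFrame by trial. Afterwards, dataset.gaze holds one
# GazeDataFrame per (file, trial) combination and dataset.fileinfo['gaze']
# repeats each file's row once per resulting partition.
dataset.split_gaze_data(by='trial_id')
assert len(dataset.gaze) >= n_files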
28 changes: 28 additions & 0 deletions src/pymovements/gaze/gaze_dataframe.py
@@ -285,6 +285,7 @@ def __init__(

# Remove this attribute once #893 is fixed
self._metadata: dict[str, Any] | None = None
self.auto_column_detect = auto_column_detect

def apply(
self,
@@ -307,6 +308,33 @@ def apply(
else:
raise ValueError(f"unsupported method '{function}'")

def split(self, by: list[str] | str) -> list[GazeDataFrame]:
Review comment (Contributor): let's use Sequence[str]

"""Split the GazeDataFrame into multiple frames based on specified column(s).

Parameters
----------
by: list[str] | str
Column name(s) to split the DataFrame by. If a single string is provided,
it will be used as a single column name. If a list is provided, the DataFrame
will be split by unique combinations of values in all specified columns.

Returns
-------
list[GazeDataFrame]
A list of new GazeDataFrame instances, each containing a partition of the
original data with all metadata and configurations preserved.
"""
return [
GazeDataFrame(
new_frame,
experiment=self.experiment,
trial_columns=self.trial_columns,
time_column='time',
distance_column='distance',
)
for new_frame in self.frame.partition_by(by=by)
]

def transform(
self,
transform_method: str | Callable[..., pl.Expr],
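GazeDataFrame.split() delegates the partitioning to polars' DataFrame.partition_by(). The sketch below shows that underlying behaviour and, as a hypothetical signature only, the collections.abc.Sequence typing suggested in the review comments; neither snippet is part of this diff:

from collections.abc import Sequence

import polars as pl

df = pl.DataFrame({
    'trial_id': [0, 0, 1, 1, 2],
    'x': [0.0, 1.0, 2.0, 3.0, 4.0],
})

# partition_by returns one DataFrame per unique value (or value combination).
partitions = df.partition_by('trial_id')
assert len(partitions) == 3

# Hypothetical signature following the review suggestion; it mirrors the
# polars partition_by signature more closely than `list[str] | str`.
def split(self, by: Sequence[str] | str) -> list['GazeDataFrame']:
    ...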
53 changes: 51 additions & 2 deletions tests/unit/dataset/dataset_test.py
@@ -146,6 +148,8 @@ def mock_toy(
'y_left_pix': np.zeros(1000),
'x_right_pix': np.zeros(1000),
'y_right_pix': np.zeros(1000),
'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
},
schema={
'subject_id': pl.Int64,
@@ -154,6 +156,8 @@
'y_left_pix': pl.Float64,
'x_right_pix': pl.Float64,
'y_right_pix': pl.Float64,
'trial_id_1': pl.Float64,
'trial_id_2': pl.Utf8,
},
)
pixel_columns = ['x_left_pix', 'y_left_pix', 'x_right_pix', 'y_right_pix']
@@ -169,6 +173,8 @@
'y_right_pix': np.zeros(1000),
'x_avg_pix': np.zeros(1000),
'y_avg_pix': np.zeros(1000),
'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
},
schema={
'subject_id': pl.Int64,
@@ -179,6 +185,8 @@
'y_right_pix': pl.Float64,
'x_avg_pix': pl.Float64,
'y_avg_pix': pl.Float64,
'trial_id_1': pl.Float64,
'trial_id_2': pl.Utf8,
},
)
pixel_columns = [
@@ -192,12 +200,16 @@
'time': np.arange(1000),
'x_left_pix': np.zeros(1000),
'y_left_pix': np.zeros(1000),
'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
},
schema={
'subject_id': pl.Int64,
'time': pl.Int64,
'x_left_pix': pl.Float64,
'y_left_pix': pl.Float64,
'trial_id_1': pl.Float64,
'trial_id_2': pl.Utf8,
},
)
pixel_columns = ['x_left_pix', 'y_left_pix']
@@ -208,12 +220,16 @@
'time': np.arange(1000),
'x_right_pix': np.zeros(1000),
'y_right_pix': np.zeros(1000),
'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
},
schema={
'subject_id': pl.Int64,
'time': pl.Int64,
'x_right_pix': pl.Float64,
'y_right_pix': pl.Float64,
'trial_id_1': pl.Float64,
'trial_id_2': pl.Utf8,
},
)
pixel_columns = ['x_right_pix', 'y_right_pix']
@@ -224,12 +240,16 @@
'time': np.arange(1000),
'x_pix': np.zeros(1000),
'y_pix': np.zeros(1000),
'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
},
schema={
'subject_id': pl.Int64,
'time': pl.Int64,
'x_pix': pl.Float64,
'y_pix': pl.Float64,
'trial_id_1': pl.Float64,
'trial_id_2': pl.Utf8,
},
)
pixel_columns = ['x_pix', 'y_pix']
@@ -1000,7 +1020,8 @@ def test_detect_events_attribute_error(gaze_dataset_configuration):
},
(
"Column 'position' not found. Available columns are: "
"['time', 'subject_id', 'pixel', 'custom_position', 'velocity']"
"['time', 'trial_id_1', 'trial_id_2', 'subject_id', "
"'pixel', 'custom_position', 'velocity']"
),
id='no_position',
),
@@ -1012,7 +1033,8 @@ def test_detect_events_attribute_error(gaze_dataset_configuration):
},
(
"Column 'velocity' not found. Available columns are: "
"['time', 'subject_id', 'pixel', 'position', 'custom_velocity']"
"['time', 'trial_id_1', 'trial_id_2', 'subject_id', "
"'pixel', 'position', 'custom_velocity']"
),
id='no_velocity',
),
@@ -1930,3 +1952,30 @@ def test_load_split_precomputed_events(precomputed_dataset_configuration, by, ex
dataset.load()
dataset.split_precomputed_events(by)
assert len(dataset.precomputed_events) == expected_len


@pytest.mark.parametrize(
('by', 'expected_len'),
[
pytest.param(
'trial_id_1',
40,
id='subset_int',
),
pytest.param(
'trial_id_2',
60,
id='subset_str',
),
pytest.param(
['trial_id_1', 'trial_id_2'],
80,
id='subset_int_str',
),
],
)
def test_load_split_gaze(gaze_dataset_configuration, by, expected_len):
dataset = pm.Dataset(**gaze_dataset_configuration['init_kwargs'])
dataset.load()
dataset.split_gaze_data(by)
assert len(dataset.gaze) == expected_len
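For reference, a quick sanity check of the expected lengths above (a sketch; the 20-file count is inferred from the ratios 40/2, 60/3 and 80/4 rather than stated explicitly in the fixture):

import numpy as np
import polars as pl

# Trial columns exactly as added to the mock fixture above.
frame = pl.DataFrame({
    'trial_id_1': np.concatenate([np.zeros(500), np.ones(500)]),
    'trial_id_2': ['a'] * 200 + ['b'] * 200 + ['c'] * 600,
})

n_files = 20  # inferred number of gaze files in the toy dataset
assert len(frame.partition_by('trial_id_1')) * n_files == 40
assert len(frame.partition_by('trial_id_2')) * n_files == 60
# Unique combinations: (0, 'a'), (0, 'b'), (0, 'c'), (1, 'c') -> 4 per file.
assert len(frame.partition_by(['trial_id_1', 'trial_id_2'])) * n_files == 80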
22 changes: 22 additions & 0 deletions tests/unit/gaze/gaze_dataframe_test.py
@@ -205,3 +205,25 @@ def test_gaze_dataframe_copy_no_experiment():

# We want to have separate experiment instances but the same values.
assert gaze.experiment is gaze_copy.experiment


def test_gaze_dataframe_split():
gaze = pm.GazeDataFrame(
pl.DataFrame(
{
'x': [0, 1, 2, 3],
'y': [1, 1, 0, 0],
'trial_id': [0, 1, 1, 2],
},
schema={'x': pl.Float64, 'y': pl.Float64, 'trial_id': pl.Int8},
),
experiment=None,
position_columns=['x', 'y'],
)

split_gaze = gaze.split('trial_id')
assert all(gaze_df.frame.n_unique('trial_id') == 1 for gaze_df in split_gaze)
assert len(split_gaze) == 3
assert_frame_equal(gaze.frame.filter(pl.col('trial_id') == 0), split_gaze[0].frame)
assert_frame_equal(gaze.frame.filter(pl.col('trial_id') == 1), split_gaze[1].frame)
assert_frame_equal(gaze.frame.filter(pl.col('trial_id') == 2), split_gaze[2].frame)
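For readability: on this fixture the split yields one row for trial_id 0 (x=0.0, y=1.0), two rows for trial_id 1 ((1.0, 1.0) and (2.0, 0.0)), and one row for trial_id 2 (3.0, 0.0), which is what the three assert_frame_equal checks verify.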