From 13395f0c151ce151a5963f78202309786a53095b Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 2 Jun 2022 14:25:10 -0500 Subject: [PATCH 01/13] FIX-#4522: Correct multiindex metadata with groupby Signed-off-by: Devin Petersohn --- modin/core/dataframe/algebra/groupby.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index 78bb5dbc9dd..e8fef51f6f1 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -133,6 +133,12 @@ def map( axis=1, ) other = list(other.columns) + # GH#4522: Vile as this may be, it is necessary to avoid the case where we are + # grouping by columns that were recently added to the data via + # `from_labels`. The internal dataframe doesn't know what to do when + # the label matches a column name. + if len(df.columns.intersection(df.index.names)) > 0: + df = df.reset_index(drop=True) by_part = other else: by_part = by From 8040915c29bc19b9c60fc3619fbb99b9f7f46d0b Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 2 Jun 2022 14:32:49 -0500 Subject: [PATCH 02/13] Add test Signed-off-by: Devin Petersohn --- modin/pandas/test/test_groupby.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 6f885199d95..1965d59b098 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2003,3 +2003,25 @@ def test_sum_with_level(): } modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum()) + + +def test_reset_index_groupby(): + frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) + pandas_df = pandas.DataFrame( + frame_data, + index=pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(2**6)] + ), + ).add_prefix("col") + pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))] + # Convert every other column to string + for col in pandas_df.iloc[ + :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0] + ]: + pandas_df[col] = [str(chr(i)) for i in pandas_df[col]] + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), + ) From 6e8466e647ed9c3aca3aff81f727a461fc866fa6 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 2 Jun 2022 14:33:54 -0500 Subject: [PATCH 03/13] Add release note Signed-off-by: Devin Petersohn --- docs/release_notes/release_notes-0.15.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst index 45c9b023b9a..ce22e5dfd1f 100644 --- a/docs/release_notes/release_notes-0.15.0.rst +++ b/docs/release_notes/release_notes-0.15.0.rst @@ -27,6 +27,7 @@ Key Features and Updates * FIX-#4481: Allow clipping with a Modin Series of bounds (#4486) * FIX-#4504: Support na_action in applymap (#4505) * FIX-#4503: Stop the memory logging thread after session exit (#4515) + * FIX-#4522: Correct multiindex metadata with groupby (#4523) * Performance enhancements * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346) * PERF-#4493: Use partition size caches more in Modin dataframe (#4495) From 95f551001e11bd908f40c02f0e83b0337788ed42 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 3 Jun 2022 08:50:38 -0500 Subject: [PATCH 04/13] Fix error checking for groupby Signed-off-by: Devin Petersohn --- modin/pandas/dataframe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 6d3cc6812b4..85ec58d6cdc 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -415,7 +415,6 @@ def groupby( ) else: squeeze = False - axis = self._get_axis_number(axis) idx_name = None # Drop here indicates whether or not to drop the data column before doing the @@ -448,6 +447,14 @@ def groupby( idx_name = by.name by = by._query_compiler elif is_list_like(by): + for k in by: + if k in self.index.names and k in self.axes[axis]: + level_name, index_name = "an index", "a column" + if axis == 1: + level_name, index_name = index_name, level_name + raise ValueError( + f"{k} is both {level_name} level and {index_name} label, which is ambiguous." + ) # fastpath for multi column groupby if axis == 0 and all( ( From d2e8c5f6802d35484f9856753d91e6d484e4f52a Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 7 Jun 2022 11:18:33 -0500 Subject: [PATCH 05/13] Fix error checking Signed-off-by: Devin Petersohn --- modin/pandas/dataframe.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 85ec58d6cdc..7d2c05410ac 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -423,7 +423,16 @@ def groupby( # strings is passed in, the data used for the groupby is dropped before the # groupby takes place. drop = False - + # Check that there is no ambiguity in the parameter we were given. + _by_check = by if is_list_like(by) else [by] + for k in _by_check: + if k in self.index.names and k in self.axes[axis]: + level_name, index_name = "an index", "a column" + if axis == 1: + level_name, index_name = index_name, level_name + raise ValueError( + f"{k} is both {level_name} level and {index_name} label, which is ambiguous." + ) if ( not isinstance(by, (pandas.Series, Series)) and is_list_like(by) @@ -447,14 +456,6 @@ def groupby( idx_name = by.name by = by._query_compiler elif is_list_like(by): - for k in by: - if k in self.index.names and k in self.axes[axis]: - level_name, index_name = "an index", "a column" - if axis == 1: - level_name, index_name = index_name, level_name - raise ValueError( - f"{k} is both {level_name} level and {index_name} label, which is ambiguous." - ) # fastpath for multi column groupby if axis == 0 and all( ( From a5bb81e5c337391a5e9ec838e3aaf6180343ae45 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 11:32:04 -0700 Subject: [PATCH 06/13] Add test for when by is in intersection of index names and columns Signed-off-by: Rehan Durrani --- modin/core/dataframe/algebra/groupby.py | 2 ++ modin/pandas/test/test_groupby.py | 47 ++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index e8fef51f6f1..c2b7e38ad37 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -137,6 +137,8 @@ def map( # grouping by columns that were recently added to the data via # `from_labels`. The internal dataframe doesn't know what to do when # the label matches a column name. + # We ensure that the columns, index, and by don't intersect in the API level, + # so if we hit this if statement, we know its a result of a deferred re-index. if len(df.columns.intersection(df.index.names)) > 0: df = df.reset_index(drop=True) by_part = other diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 1965d59b098..997a953e22b 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1566,7 +1566,7 @@ def test_agg_exceptions(operation): }, ], ) -def test_to_pandas_convertion(kwargs): +def test_to_pandas_conversion(kwargs): data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} by = ["a", "b"] @@ -2025,3 +2025,48 @@ def test_reset_index_groupby(): pandas_df, lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), ) + +def test_by_in_index_and_columns(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c'] + ) + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by='a').count(), + raising_exceptions=True, + check_exception_type=True, + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=['a', 'b']).count(), + raising_exceptions=True, + check_exception_type=True, + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c'] + ) + modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by='a').count(), + raising_exceptions=True, + check_exception_type=True, + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=['a', 'c']).count(), + raising_exceptions=True, + check_exception_type=True, + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=['a', 'b']).count(), + raising_exceptions=True, + check_exception_type=True, + ) From 75c60891ac3ecdd3717cea1f076106056325063c Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 11:41:16 -0700 Subject: [PATCH 07/13] lint Signed-off-by: Rehan Durrani --- modin/pandas/test/test_groupby.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 997a953e22b..f1b8176553f 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2026,47 +2026,48 @@ def test_reset_index_groupby(): lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), ) + def test_by_in_index_and_columns(): pandas_df = pandas.DataFrame( - [[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c'] + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] ) modin_df = from_pandas(pandas_df) eval_general( modin_df, pandas_df, - lambda df: df.groupby(by='a').count(), + lambda df: df.groupby(by="a").count(), raising_exceptions=True, check_exception_type=True, ) eval_general( modin_df, pandas_df, - lambda df: df.groupby(by=['a', 'b']).count(), + lambda df: df.groupby(by=["a", "b"]).count(), raising_exceptions=True, check_exception_type=True, ) pandas_df = pandas.DataFrame( - [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c'] + [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"] ) modin_df = from_pandas(pandas_df) eval_general( modin_df, pandas_df, - lambda df: df.groupby(by='a').count(), + lambda df: df.groupby(by="a").count(), raising_exceptions=True, check_exception_type=True, ) eval_general( modin_df, pandas_df, - lambda df: df.groupby(by=['a', 'c']).count(), + lambda df: df.groupby(by=["a", "c"]).count(), raising_exceptions=True, check_exception_type=True, ) eval_general( modin_df, pandas_df, - lambda df: df.groupby(by=['a', 'b']).count(), + lambda df: df.groupby(by=["a", "b"]).count(), raising_exceptions=True, check_exception_type=True, ) From a547d323bfcf6ae336c1ea5a2740255a5fb2f106 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 11:45:40 -0700 Subject: [PATCH 08/13] remove unused line Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 7d2c05410ac..79bbd5147c6 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -415,6 +415,7 @@ def groupby( ) else: squeeze = False + axis = self._get_axis_number(axis) idx_name = None # Drop here indicates whether or not to drop the data column before doing the From 3ce19eafc0781d96a9110b0e9a3645197e507928 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 12:15:48 -0700 Subject: [PATCH 09/13] add comments to test Signed-off-by: Rehan Durrani --- modin/pandas/test/test_groupby.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index df6f00f2baf..d7f7445e053 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2035,6 +2035,14 @@ def test_sum_with_level(): def test_reset_index_groupby(): + # Due to `reset_index` deferring the actual reindexing of partitions, + # when we call groupby after a `reset_index` with a `by` column name + # that was moved from the index to the columns via `from_labels` the + # algebra layer incorrectly thinks that the `by` key is duplicated + # across both the columns and labels, and fails, when it should + # succeed. We have this test to ensure that that case is correctly + # handled, and passes. For more details, checkout + # https://github.com/modin-project/modin/issues/4522. frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) pandas_df = pandas.DataFrame( frame_data, @@ -2043,11 +2051,14 @@ def test_reset_index_groupby(): ), ).add_prefix("col") pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))] - # Convert every other column to string + # Convert every even column to string for col in pandas_df.iloc[ :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0] ]: pandas_df[col] = [str(chr(i)) for i in pandas_df[col]] + # The `pandas_df` contains a multi-index with 3 levels, named `index_0`, `index_1`, + # and `index_2`, and 16 columns, named `col0` through `col15`. Every even column + # has dtype `str`, while odd columns have dtype `int64`. modin_df = from_pandas(pandas_df) eval_general( modin_df, From 0e4a52101d750717440e987f093c4bc622cb58c3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 14:08:51 -0700 Subject: [PATCH 10/13] fix typo and add solution for series + index Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 29 +++++++++++------ modin/pandas/test/test_groupby.py | 54 +++++++++++++++++++++++++------ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 79bbd5147c6..4fa2ecdfb3b 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -425,17 +425,21 @@ def groupby( # groupby takes place. drop = False # Check that there is no ambiguity in the parameter we were given. - _by_check = by if is_list_like(by) else [by] - for k in _by_check: - if k in self.index.names and k in self.axes[axis]: - level_name, index_name = "an index", "a column" - if axis == 1: - level_name, index_name = index_name, level_name - raise ValueError( - f"{k} is both {level_name} level and {index_name} label, which is ambiguous." - ) + # We don't need to check if `by` is a Series or Index, since those + # won't be referencing labels + if not isinstance(by, (pandas.Series, Series, pandas.Index)): + _by_check = by if is_list_like(by) else [by] + for k in _by_check: + if not isinstance(k, (Series, pandas.Series, pandas.Index)): + if k in self.index.names and k in self.axes[axis ^ 1]: + level_name, index_name = "an index", "a column" + if axis == 1: + level_name, index_name = index_name, level_name + raise ValueError( + f"{k} is both {level_name} level and {index_name} label, which is ambiguous." + ) if ( - not isinstance(by, (pandas.Series, Series)) + not isinstance(by, (pandas.Series, Series, pandas.Index)) and is_list_like(by) and len(by) == 1 ): @@ -452,6 +456,11 @@ def groupby( level, by = by, None elif level is None: by = self.__getitem__(by)._query_compiler + elif isinstance(by, (pandas.Series, pandas.Index)): + if isinstance(by, pandas.Index) and len(by) != len(self.axes[axis]): + raise ValueError("Grouper and axis must be same length") + idx_name = by.name + by = Series(by)._query_compiler elif isinstance(by, Series): drop = by._parent is self idx_name = by.name diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index d7f7445e053..8cc518a3ec3 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2076,15 +2076,16 @@ def test_by_in_index_and_columns(): modin_df, pandas_df, lambda df: df.groupby(by="a").count(), - raising_exceptions=True, - check_exception_type=True, ) eval_general( modin_df, pandas_df, lambda df: df.groupby(by=["a", "b"]).count(), - raising_exceptions=True, - check_exception_type=True, + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=[df["b"], "a"]).count(), ) pandas_df = pandas.DataFrame( [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"] @@ -2094,20 +2095,53 @@ def test_by_in_index_and_columns(): modin_df, pandas_df, lambda df: df.groupby(by="a").count(), - raising_exceptions=True, - check_exception_type=True, ) eval_general( modin_df, pandas_df, lambda df: df.groupby(by=["a", "c"]).count(), - raising_exceptions=True, - check_exception_type=True, ) eval_general( modin_df, pandas_df, lambda df: df.groupby(by=["a", "b"]).count(), - raising_exceptions=True, - check_exception_type=True, + ) + + +def test_by_series(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + + def make_appropriately_typed_series(df, values=["a"]): + """Return a Series from either pandas or modin.pandas depending on type of `df`.""" + if isinstance(df, pd.DataFrame): + return pd.Series(values) + return pandas.Series(values) + + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=make_appropriately_typed_series(df)).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby( + by=make_appropriately_typed_series(df, ["a", "b"]) + ).count(), + ) + + +def test_by_index(): + pandas_df = pandas.DataFrame( + [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] + ) + modin_df = from_pandas(pandas_df) + eval_general(modin_df, pandas_df, lambda df: df.groupby(by=pd.Index(["a"])).count()) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pd.Index(["a", "b"])).count(), ) From 639a201ecaa7bc17454cdab178091f82b46dae20 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 14:12:26 -0700 Subject: [PATCH 11/13] update test Signed-off-by: Rehan Durrani --- modin/pandas/test/test_groupby.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 8cc518a3ec3..5dc438c79d3 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2113,6 +2113,16 @@ def test_by_series(): [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"] ) modin_df = from_pandas(pandas_df) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pandas.Series(["a"])).count(), + ) + eval_general( + modin_df, + pandas_df, + lambda df: df.groupby(by=pandas.Series(["a", "b"])).count(), + ) def make_appropriately_typed_series(df, values=["a"]): """Return a Series from either pandas or modin.pandas depending on type of `df`.""" From d2c729543537c20db0c302b7e2c19826ae7dadfc Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 18:34:25 -0700 Subject: [PATCH 12/13] Fix value counts to call groupby with appropriate arguments Signed-off-by: Rehan Durrani --- modin/pandas/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 30f43189eac..c220b7291cb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3461,7 +3461,9 @@ def value_counts( Count unique values in the `BasePandasDataset`. """ if subset is None: - subset = self._query_compiler.columns + # Need to get column names as array rather than as Index, since `groupby` does not + # treat `Index` arguments to `by` as a list of labels. + subset = self._query_compiler.columns.values counted_values = self.groupby(by=subset, dropna=dropna, observed=True).size() if sort: counted_values.sort_values(ascending=ascending, inplace=True) From 55ec3a1e54590366d6313ecca87087d7ae8ac3eb Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 7 Jun 2022 18:41:53 -0700 Subject: [PATCH 13/13] Fix naming Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 4fa2ecdfb3b..887ac139af3 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -428,8 +428,8 @@ def groupby( # We don't need to check if `by` is a Series or Index, since those # won't be referencing labels if not isinstance(by, (pandas.Series, Series, pandas.Index)): - _by_check = by if is_list_like(by) else [by] - for k in _by_check: + _by_list = by if is_list_like(by) else [by] + for k in _by_list: if not isinstance(k, (Series, pandas.Series, pandas.Index)): if k in self.index.names and k in self.axes[axis ^ 1]: level_name, index_name = "an index", "a column"