diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index 3b370737c77..cead45169ca 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -43,12 +43,7 @@ class BaseTimeGroupBy: def setup(self, shape, ngroups=5, groupby_ncols=1): ngroups = translator_groupby_ngroups(ngroups, shape) self.df, self.groupby_columns = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols, - count_groups=ngroups, + "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups, ) diff --git a/asv_bench/benchmarks/hdk/benchmarks.py b/asv_bench/benchmarks/hdk/benchmarks.py index f98680ff8c9..7252d101f62 100644 --- a/asv_bench/benchmarks/hdk/benchmarks.py +++ b/asv_bench/benchmarks/hdk/benchmarks.py @@ -442,12 +442,7 @@ class BaseTimeGroupBy: def setup(self, shape, ngroups=5, groupby_ncols=1): ngroups = translator_groupby_ngroups(ngroups, shape) self.df, self.groupby_columns = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols, - count_groups=ngroups, + "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups, ) # correct while we use 'col*' like name for non-groupby columns # and 'groupby_col*' like name for groupby columns diff --git a/asv_bench/benchmarks/hdk/io.py b/asv_bench/benchmarks/hdk/io.py index dccb55966ad..e169e6640df 100644 --- a/asv_bench/benchmarks/hdk/io.py +++ b/asv_bench/benchmarks/hdk/io.py @@ -54,10 +54,5 @@ def setup(self, cache, shape): self.filename, self.names, self.dtype = cache[file_id] def time_read_csv_names(self, cache, shape): - df = IMPL.read_csv( - self.filename, - names=self.names, - header=0, - dtype=self.dtype, - ) + df = IMPL.read_csv(self.filename, names=self.names, header=0, dtype=self.dtype,) trigger_import(df) diff --git a/asv_bench/benchmarks/io/parquet.py b/asv_bench/benchmarks/io/parquet.py index 5b2ffc9c470..021ae2a253a 100644 --- a/asv_bench/benchmarks/io/parquet.py +++ b/asv_bench/benchmarks/io/parquet.py @@ -44,8 +44,4 @@ def setup(self, test_filenames, shape): self.shape_id = get_shape_id(shape) def time_read_parquet(self, test_filenames, shape): - execute( - IMPL.read_parquet( - test_filenames[self.shape_id], - ) - ) + execute(IMPL.read_parquet(test_filenames[self.shape_id],)) diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py index 02878c2a651..1a5ca45bcc8 100644 --- a/asv_bench/benchmarks/utils/common.py +++ b/asv_bench/benchmarks/utils/common.py @@ -244,11 +244,7 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high): def gen_data( - data_type: str, - nrows: int, - ncols: int, - rand_low: int, - rand_high: int, + data_type: str, nrows: int, ncols: int, rand_low: int, rand_high: int, ) -> dict: """ Generate data with caching. 
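Note on the benchmark hunks above: they only change call layout, collapsing exploded argument lists onto one line while keeping the trailing comma, so behavior is unchanged. A minimal sketch showing the two spellings are the same call (the `generate` helper below is a stand-in, not Modin's `generate_dataframe`):

```python
def generate(kind, nrows, ncols, low, high, groupby_ncols, count_groups=None):
    """Stand-in for the benchmark helper; just echoes its arguments."""
    return (kind, nrows, ncols, low, high, groupby_ncols, count_groups)

exploded = generate(
    "int",
    5000,
    10,
    0,
    100,
    1,
    count_groups=5,
)
# Same call collapsed onto one line; the trailing comma before ")" is legal Python
# and only influences how auto-formatters lay the call out.
collapsed = generate("int", 5000, 10, 0, 100, 1, count_groups=5,)
assert exploded == collapsed
```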
@@ -501,12 +497,7 @@ def execute( return # compatibility with old Modin versions - all( - map( - lambda partition: partition.drain_call_queue() or True, - partitions, - ) - ) + all(map(lambda partition: partition.drain_call_queue() or True, partitions,)) if ASV_USE_ENGINE == "ray": from ray import wait diff --git a/asv_bench/benchmarks/utils/data_shapes.py b/asv_bench/benchmarks/utils/data_shapes.py index 4012d3468b6..5c7291871af 100644 --- a/asv_bench/benchmarks/utils/data_shapes.py +++ b/asv_bench/benchmarks/utils/data_shapes.py @@ -156,10 +156,7 @@ "hdk.TimeReadCsvNames", ], ), - ( - HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], - ["hdk.TimeMerge", "hdk.TimeAppend"], - ), + (HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], ["hdk.TimeMerge", "hdk.TimeAppend"],), ( HDK_SERIES_DATA_SIZE[ASV_DATASET_SIZE], ["hdk.TimeBinaryOpSeries", "hdk.TimeValueCountsSeries"], diff --git a/modin/_compat/core/py36/base_io.py b/modin/_compat/core/py36/base_io.py index bb153911b61..13d2dc73c54 100644 --- a/modin/_compat/core/py36/base_io.py +++ b/modin/_compat/core/py36/base_io.py @@ -206,16 +206,9 @@ def read_json( returns=_doc_returns_qc, ) def read_feather( - cls, - path, - columns=None, - use_threads=True, + cls, path, columns=None, use_threads=True, ): # noqa: PR01 - return cls._read_feather( - path=path, - columns=columns, - use_threads=use_threads, - ) + return cls._read_feather(path=path, columns=columns, use_threads=use_threads,) @classmethod @_inherit_docstrings(pandas.read_stata, apilink="pandas.read_stata") @@ -258,13 +251,10 @@ def read_stata( returns=_doc_returns_qc, ) def read_pickle( - cls, - filepath_or_buffer, - compression="infer", + cls, filepath_or_buffer, compression="infer", ): # noqa: PR01 return cls._read_pickle( - filepath_or_buffer=filepath_or_buffer, - compression=compression, + filepath_or_buffer=filepath_or_buffer, compression=compression, ) @classmethod @@ -306,8 +296,5 @@ def to_pickle( protocol: int = 4, # older pandas only supports protocol <= 4 ): # noqa: PR01, D200 return cls._to_pickle( - obj, - filepath_or_buffer, - compression=compression, - protocol=protocol, + obj, filepath_or_buffer, compression=compression, protocol=protocol, ) diff --git a/modin/_compat/pandas_api/classes.py b/modin/_compat/pandas_api/classes.py index d704ccc713a..d31c28b0b39 100644 --- a/modin/_compat/pandas_api/classes.py +++ b/modin/_compat/pandas_api/classes.py @@ -16,9 +16,7 @@ from modin._compat import PandasCompatVersion if PandasCompatVersion.CURRENT == PandasCompatVersion.PY36: - from .py36 import ( - Python36CompatibleBasePandasDataset as BasePandasDatasetCompat, - ) + from .py36 import Python36CompatibleBasePandasDataset as BasePandasDatasetCompat from .py36 import Python36CompatibleDataFrame as DataFrameCompat from .py36 import Python36CompatibleSeries as SeriesCompat from .py36 import Python36CompatibleDataFrameGroupBy as DataFrameGroupByCompat @@ -27,9 +25,7 @@ from .py36 import Python36CompatibleRolling as RollingCompat from .py36 import Python36CompatibleResampler as ResamplerCompat elif PandasCompatVersion.CURRENT == PandasCompatVersion.LATEST: - from .latest import ( - LatestCompatibleBasePandasDataset as BasePandasDatasetCompat, - ) + from .latest import LatestCompatibleBasePandasDataset as BasePandasDatasetCompat from .latest import LatestCompatibleDataFrame as DataFrameCompat from .latest import LatestCompatibleSeries as SeriesCompat from .latest import LatestCompatibleDataFrameGroupBy as DataFrameGroupByCompat diff --git a/modin/_compat/pandas_api/latest/base.py 
b/modin/_compat/pandas_api/latest/base.py index 53ce3e0259a..97617b436c0 100644 --- a/modin/_compat/pandas_api/latest/base.py +++ b/modin/_compat/pandas_api/latest/base.py @@ -224,11 +224,7 @@ def rank( ) def reindex( - self, - index=None, - columns=None, - copy=True, - **kwargs, + self, index=None, columns=None, copy=True, **kwargs, ): return self._reindex(index=index, columns=columns, copy=copy, **kwargs) @@ -345,13 +341,7 @@ def set_axis(self, labels, axis=0, inplace=no_default, *, copy=no_default): ) def sem( - self, - axis=None, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs, ): return self._sem( axis=axis, @@ -402,13 +392,7 @@ def skew( ) def std( - self, - axis=None, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs, ): return self._std( axis=axis, diff --git a/modin/_compat/pandas_api/py36/base.py b/modin/_compat/pandas_api/py36/base.py index fb120b25827..3169bad9d09 100644 --- a/modin/_compat/pandas_api/py36/base.py +++ b/modin/_compat/pandas_api/py36/base.py @@ -160,18 +160,9 @@ def rank( ) def reindex( - self, - index=None, - columns=None, - copy=True, - **kwargs, + self, index=None, columns=None, copy=True, **kwargs, ): - return self._reindex( - index=index, - columns=columns, - copy=copy, - **kwargs, - ) + return self._reindex(index=index, columns=columns, copy=copy, **kwargs,) def resample( self, diff --git a/modin/_compat/pandas_api/py36/io.py b/modin/_compat/pandas_api/py36/io.py index 6466473a61c..75232672be8 100644 --- a/modin/_compat/pandas_api/py36/io.py +++ b/modin/_compat/pandas_api/py36/io.py @@ -155,10 +155,7 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): return DataFrame( query_compiler=FactoryDispatcher.read_parquet( - path=path, - engine=engine, - columns=columns, - **kwargs, + path=path, engine=engine, columns=columns, **kwargs, ) ) diff --git a/modin/_compat/pandas_api/py36/series.py b/modin/_compat/pandas_api/py36/series.py index 069603c904f..a2178ba2d35 100644 --- a/modin/_compat/pandas_api/py36/series.py +++ b/modin/_compat/pandas_api/py36/series.py @@ -61,12 +61,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): return self._idxmin(axis=axis, skipna=skipna) def kurt( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs, + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): # noqa: PR01, RT01, D200 if axis not in (None, 0, "index", "rows"): raise ValueError(f"No axis named {axis} for object type Series") diff --git a/modin/_compat/pandas_api/py36/utils.py b/modin/_compat/pandas_api/py36/utils.py index ec27515e048..98dfb50fcba 100644 --- a/modin/_compat/pandas_api/py36/utils.py +++ b/modin/_compat/pandas_api/py36/utils.py @@ -30,12 +30,7 @@ def create_stat_method(name): """ def stat_method( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs, + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): return self._stat_operation(name, axis, skipna, level, numeric_only, **kwargs) diff --git a/modin/_compat/pandas_api/py36/window.py b/modin/_compat/pandas_api/py36/window.py index 82b57b734ec..8417c502840 100644 --- a/modin/_compat/pandas_api/py36/window.py +++ b/modin/_compat/pandas_api/py36/window.py @@ -39,7 +39,6 @@ def __init__( axis, ) - @append_to_docstring("Compatibility layer for 'Python 3.6 pandas' for Rolling.") 
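The `classes.py` hunk above reflows the version-gated imports that pick a compat implementation per supported pandas. A rough sketch of that selection pattern, with hypothetical stand-in classes instead of Modin's `PandasCompatVersion` machinery:

```python
import pandas

# Hypothetical stand-ins for the "latest" and "py36-era" compat implementations.
class _LatestBasePandasDataset:
    flavor = "latest pandas"

class _LegacyBasePandasDataset:
    flavor = "py36-era pandas"

# Pick one implementation once, at import time, as the compat layer does.
_IS_LEGACY_PANDAS = int(pandas.__version__.split(".")[0]) < 1
BasePandasDatasetCompat = (
    _LegacyBasePandasDataset if _IS_LEGACY_PANDAS else _LatestBasePandasDataset
)
print(BasePandasDatasetCompat.flavor)
```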
@_inherit_docstrings(pandas.core.window.rolling.Rolling) class Python36CompatibleRolling(BaseCompatibleRolling): diff --git a/modin/conftest.py b/modin/conftest.py index 4a9687a2f7f..bedb2f465fc 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -372,9 +372,7 @@ def TestReadCSVFixture(): # each xdist worker spawned in separate process with separate namespace and dataset pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids} # test_read_csv_col_handling, test_read_csv_parsing - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_regular"], - ) + _make_csv_file(filenames)(filename=pytest.csvs_names["test_read_csv_regular"],) # test_read_csv_parsing _make_csv_file(filenames)( filename=pytest.csvs_names["test_read_csv_yes_no"], @@ -382,8 +380,7 @@ def TestReadCSVFixture(): ) # test_read_csv_col_handling _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_blank_lines"], - add_blank_lines=True, + filename=pytest.csvs_names["test_read_csv_blank_lines"], add_blank_lines=True, ) # test_read_csv_nans_handling _make_csv_file(filenames)( @@ -393,8 +390,7 @@ def TestReadCSVFixture(): ) # test_read_csv_error_handling _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_bad_lines"], - add_bad_lines=True, + filename=pytest.csvs_names["test_read_csv_bad_lines"], add_bad_lines=True, ) yield diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 2514d9dc31c..835103532f9 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -13,6 +13,7 @@ """Module houses builder class for Binary operator.""" +from typing import Any, Callable import numpy as np import pandas @@ -23,7 +24,9 @@ class Binary(Operator): """Builder class for Binary operator.""" @classmethod - def register(cls, func, join_type="outer", labels="replace"): + def register( + cls, func: Callable, join_type: str = "outer", labels: str = "replace" + ) -> Callable: """ Build template binary operator. @@ -44,8 +47,13 @@ def register(cls, func, join_type="outer", labels="replace"): """ def caller( - query_compiler, other, broadcast=False, *args, dtypes=None, **kwargs - ): + query_compiler: Any, + other: Any, + broadcast: bool = False, + *args: Any, + dtypes: Any | None = None, + **kwargs: Any + ) -> Any: """ Apply binary `func` to passed operands. @@ -112,8 +120,7 @@ def caller( ) else: new_modin_frame = query_compiler._modin_frame.map( - lambda df: func(df, other, *args, **kwargs), - dtypes=dtypes, + lambda df: func(df, other, *args, **kwargs), dtypes=dtypes, ) return query_compiler.__constructor__(new_modin_frame) diff --git a/modin/core/dataframe/algebra/default2pandas/binary.py b/modin/core/dataframe/algebra/default2pandas/binary.py index 9186ec96504..f1fdf9a7876 100644 --- a/modin/core/dataframe/algebra/default2pandas/binary.py +++ b/modin/core/dataframe/algebra/default2pandas/binary.py @@ -13,6 +13,7 @@ """Module houses default binary functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod import pandas @@ -23,7 +24,7 @@ class BinaryDefault(DefaultMethod): """Build default-to-pandas methods which executes binary functions.""" @classmethod - def build_default_to_pandas(cls, fn, fn_name): + def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed binary `fn`. @@ -41,7 +42,7 @@ def build_default_to_pandas(cls, fn, fn_name): to the casted to pandas frame. 
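One caution on the annotations added to `Binary.register`'s `caller` above: the `Any | None` union (PEP 604) only parses on Python 3.10+, so `Optional[Any]` is the spelling that stays importable on the older interpreters this compat-heavy codebase still targets. A simplified sketch of the register/caller builder, applied eagerly to plain pandas objects rather than Modin partitions:

```python
from typing import Any, Callable, Optional
import pandas


class Binary:
    """Toy version of the builder: ``register`` wraps a pandas-level binary
    function into a caller the query compiler can invoke later."""

    @classmethod
    def register(cls, func: Callable, join_type: str = "outer") -> Callable:
        def caller(
            left: pandas.DataFrame,
            other: Any,
            *args: Any,
            dtypes: Optional[Any] = None,
            **kwargs: Any,
        ) -> pandas.DataFrame:
            # Modin maps ``func`` over distributed partitions; here it is just
            # applied directly, with an optional dtype cast like the real caller.
            result = func(left, other, *args, **kwargs)
            return result.astype(dtypes) if dtypes is not None else result

        return caller


add = Binary.register(pandas.DataFrame.add)
print(add(pandas.DataFrame({"a": [1, 2, 3]}), 10))
```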
""" - def bin_ops_wrapper(df, other, *args, **kwargs): + def bin_ops_wrapper(df: Any, other: Any, *args: Any, **kwargs: Any) -> None: """Apply specified binary function to the passed operands.""" squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( "squeeze_other", False diff --git a/modin/core/dataframe/algebra/default2pandas/cat.py b/modin/core/dataframe/algebra/default2pandas/cat.py index 146b48a5cb2..1ed5c898df8 100644 --- a/modin/core/dataframe/algebra/default2pandas/cat.py +++ b/modin/core/dataframe/algebra/default2pandas/cat.py @@ -14,13 +14,16 @@ """Module houses default applied-on-category functions builder class.""" from .series import SeriesDefault +import pandas class CatDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under category accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.arrays.categorical.CategoricalAccessor: """ Get category accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/datetime.py b/modin/core/dataframe/algebra/default2pandas/datetime.py index f29750eba8c..3e54a24dc7d 100644 --- a/modin/core/dataframe/algebra/default2pandas/datetime.py +++ b/modin/core/dataframe/algebra/default2pandas/datetime.py @@ -14,13 +14,16 @@ """Module houses default applied-on-datetime functions builder class.""" from .series import SeriesDefault +import pandas class DateTimeDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under datetime accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.indexes.accessors.DatetimeProperties: """ Get datetime accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/default.py b/modin/core/dataframe/algebra/default2pandas/default.py index 0dc3e6493f9..d254cc83bdb 100644 --- a/modin/core/dataframe/algebra/default2pandas/default.py +++ b/modin/core/dataframe/algebra/default2pandas/default.py @@ -13,6 +13,8 @@ """Module houses default functions builder class.""" +from typing import Any, Callable, Optional, Union +from xmlrpc.client import boolean from modin.core.dataframe.algebra import Operator from modin.utils import try_cast_to_pandas, MODIN_UNNAMED_SERIES_LABEL @@ -28,7 +30,7 @@ class ObjTypeDeterminer: to an object under which `key` function is applied. """ - def __getattr__(self, key): + def __getattr__(self, key: str) -> Callable: """ Build function that executes `key` function over passed frame. @@ -42,7 +44,7 @@ def __getattr__(self, key): Function that takes DataFrame and executes `key` function on it. """ - def func(df, *args, **kwargs): + def func(df, *args: Any, **kwargs : Any) -> Any: """Access specified attribute of the passed object and call it if it's callable.""" prop = getattr(df, key) if callable(prop): @@ -69,7 +71,7 @@ class DefaultMethod(Operator): DEFAULT_OBJECT_TYPE = ObjTypeDeterminer @classmethod - def register(cls, func, obj_type=None, inplace=None, fn_name=None): + def register(cls, func: Union[Callable, str], obj_type: Optional[object] =None, inplace: Optional[boolean]=None, fn_name: Optional[str]=None) -> Callable: """ Build function that do fallback to default pandas implementation for passed `func`. 
@@ -106,7 +108,7 @@ def register(cls, func, obj_type=None, inplace=None, fn_name=None): if type(fn) == property: fn = cls.build_property_wrapper(fn) - def applyier(df, *args, **kwargs): + def applyier(df: pandas.DataFrame, *args: Any, **kwargs: Any) -> (pandas.DataFrame | Any) : """ Apply target function to the casted to pandas frame. @@ -159,7 +161,7 @@ def applyier(df, *args, **kwargs): @classmethod # FIXME: this method is almost a duplicate of `cls.build_default_to_pandas`. # Those two methods should be merged into a single one. - def build_wrapper(cls, fn, fn_name): + def build_wrapper(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed `fn`. @@ -180,7 +182,7 @@ def build_wrapper(cls, fn, fn_name): """ wrapper = cls.build_default_to_pandas(fn, fn_name) - def args_cast(self, *args, **kwargs): + def args_cast(self, *args: Any, **kwargs: Any) -> Any: """ Preprocess `default_to_pandas` function arguments and apply default function. @@ -193,7 +195,7 @@ def args_cast(self, *args, **kwargs): return args_cast @classmethod - def build_property_wrapper(cls, prop): + def build_property_wrapper(cls, prop: str) -> Callable: """ Build function that accesses specified property of the frame. @@ -208,14 +210,14 @@ def build_property_wrapper(cls, prop): Function that takes DataFrame and returns its value of `prop` property. """ - def property_wrapper(df): + def property_wrapper(df: pandas.DataFrame) -> Any: """Get specified property of the passed object.""" return prop.fget(df) return property_wrapper @classmethod - def build_default_to_pandas(cls, fn, fn_name): + def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed `fn`. @@ -233,14 +235,14 @@ def build_default_to_pandas(cls, fn, fn_name): """ fn.__name__ = f"" - def wrapper(self, *args, **kwargs): + def wrapper(self, *args: Any, **kwargs: Any) -> Any: """Do fallback to pandas for the specified function.""" return self.default_to_pandas(fn, *args, **kwargs) return wrapper @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.DataFrame : """ Extract frame property to apply function on. diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py index ed3a2bb81ac..c154ab502f2 100644 --- a/modin/core/dataframe/algebra/default2pandas/groupby.py +++ b/modin/core/dataframe/algebra/default2pandas/groupby.py @@ -13,6 +13,8 @@ """Module houses default GroupBy functions builder class.""" +from gc import callbacks +from typing import Any, Callable, Optional, Union from .default import DefaultMethod import pandas @@ -34,7 +36,9 @@ class GroupBy: ] @classmethod - def validate_by(cls, by): + def validate_by( + cls, by: Union[pandas.DataFrame, pandas.Series, list] + ) -> Any: """ Build valid `by` parameter for `pandas.DataFrame.groupby`. @@ -52,7 +56,7 @@ def validate_by(cls, by): By parameter with all DataFrames casted to Series. """ - def try_cast_series(df): + def try_cast_series(df: pandas.DataFrame): """Cast one-column frame to Series.""" if isinstance(df, pandas.DataFrame): df = df.squeeze(axis=1) @@ -71,7 +75,9 @@ def try_cast_series(df): return by @classmethod - def inplace_applyier_builder(cls, key, func=None): + def inplace_applyier_builder( + cls, key: callable, func: Optional[Union[callable, str]] = None + ) -> Callable: """ Bind actual aggregation function to the GroupBy aggregation method. 
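`GroupBy.validate_by` above relies on the fact that a one-column DataFrame squeezed along `axis=1` becomes a Series that `groupby` accepts directly; a minimal pandas illustration:

```python
import pandas

key_frame = pandas.DataFrame({"key": ["a", "b", "a"]})

# squeeze(axis=1) turns the one-column frame into a Series, which is the cast
# that ``try_cast_series`` performs on each element of ``by``.
by = key_frame.squeeze(axis=1)
assert isinstance(by, pandas.Series)

data = pandas.DataFrame({"x": [1, 2, 3]})
print(data.groupby(by)["x"].sum())  # a -> 4, b -> 2
```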
@@ -96,7 +102,7 @@ def inplace_applyier(grp, *func_args, **func_kwargs): return inplace_applyier @classmethod - def get_func(cls, key, **kwargs): + def get_func(cls, key: Union[str, callable], **kwargs: Any) -> Callable: """ Extract aggregation function from groupby arguments. @@ -129,7 +135,7 @@ def get_func(cls, key, **kwargs): return cls.inplace_applyier_builder(key) @classmethod - def build_aggregate_method(cls, key): + def build_aggregate_method(cls, key: Union[str, callable]) -> Callable: """ Build function for `QueryCompiler.groupby_agg` that can be executed as default-to-pandas. @@ -146,7 +152,7 @@ def build_aggregate_method(cls, key): """ def fn( - df, + df: pandas.DataFrame, by, axis, groupby_kwargs, @@ -167,7 +173,7 @@ def fn( return fn @classmethod - def build_groupby_reduce_method(cls, agg_func): + def build_groupby_reduce_method(cls, agg_func: Union[callable, str]): """ Build function for `QueryCompiler.groupby_*` that can be executed as default-to-pandas. @@ -191,11 +197,7 @@ def fn( by = cls.validate_by(by) grp = df.groupby(by, axis=axis, **groupby_kwargs) grp_agg_func = cls.get_func(agg_func, **kwargs) - return grp_agg_func( - grp, - *agg_args, - **agg_kwargs, - ) + return grp_agg_func(grp, *agg_args, **agg_kwargs,) if isinstance(by, pandas.DataFrame): by = by.squeeze(axis=1) @@ -260,7 +262,7 @@ def is_aggregate(cls, key): # noqa: PR01 return key in cls.agg_aliases @classmethod - def build_groupby(cls, func): + def build_groupby(cls, func: Union[callable, str]) -> Callable: """ Build function that groups DataFrame and applies aggregation function to the every group. @@ -281,15 +283,15 @@ def build_groupby(cls, func): @staticmethod def handle_as_index_for_dataframe( - result, - internal_by_cols, - by_cols_dtypes=None, - by_length=None, - selection=None, - partition_idx=0, - drop=True, - method=None, - inplace=False, + result: pandas.DataFrame, + internal_by_cols: list, + by_cols_dtypes: Optional[list] = None, + by_length: Optional[int] = None, + selection: Optional[Union[str, list]] = None, + partition_idx: int = 0, + drop: bool = True, + method: Optional[str] = None, + inplace: bool = False, ): """ Handle `as_index=False` parameter for the passed GroupBy aggregation result. @@ -353,16 +355,16 @@ def handle_as_index_for_dataframe( @staticmethod def handle_as_index( - result_cols, - result_index_names, - internal_by_cols, - by_cols_dtypes=None, - by_length=None, - selection=None, - partition_idx=0, - drop=True, - method=None, - ): + result_cols: pandas.Index, + result_index_names: list, + internal_by_cols: list, + by_cols_dtypes: Optional[list] = None, + by_length: Optional[int] = None, + selection: Optional[Union[str, list]] = None, + partition_idx: int = 0, + drop: bool = True, + method: Optional[str] = None, + ) -> Union[bool, bool, list(str), list(int)]: """ Compute hints to process ``as_index=False`` parameter for the GroupBy result. @@ -523,7 +525,7 @@ class GroupByDefault(DefaultMethod): OBJECT_TYPE = "GroupBy" @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Union[Callable, str], **kwargs: Any) -> Callable: """ Build default-to-pandas GroupBy aggregation function. @@ -558,7 +560,7 @@ def register(cls, func, **kwargs): } @classmethod - def get_aggregation_method(cls, how): + def get_aggregation_method(cls, how: Any) -> Callable: """ Return `pandas.DataFrameGroupBy` method that implements the passed `how` UDF applying strategy. 
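A side note on the `handle_as_index` signature above: `Union[bool, bool, list(str), list(int)]` is not a valid annotation, since `list(str)` is a call rather than a type; the intent is presumably something like `Tuple[bool, bool, List[str], List[int]]`. The behavior these helpers reconstruct is pandas' `as_index` switch:

```python
import pandas

df = pandas.DataFrame({"key": ["a", "b", "a"], "x": [1, 2, 3]})

# as_index=True (default): group labels become the result index.
print(df.groupby("key").sum())

# as_index=False: group labels stay as a regular column, the layout that
# ``handle_as_index_for_dataframe`` restores after a partitioned aggregation.
print(df.groupby("key", as_index=False).sum())
```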
diff --git a/modin/core/dataframe/algebra/default2pandas/resample.py b/modin/core/dataframe/algebra/default2pandas/resample.py index f3dca22fe55..21f1a6b9a52 100644 --- a/modin/core/dataframe/algebra/default2pandas/resample.py +++ b/modin/core/dataframe/algebra/default2pandas/resample.py @@ -13,6 +13,7 @@ """Module houses default Resamle functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod @@ -22,7 +23,7 @@ class Resampler: """Builder class for resampled aggregation functions.""" @classmethod - def build_resample(cls, func, squeeze_self): + def build_resample(cls, func: Callable, squeeze_self: bool) -> Callable: """ Build function that resamples time-series data and does aggregation. @@ -40,7 +41,7 @@ def build_resample(cls, func, squeeze_self): to resampled time-series data. """ - def fn(df, resample_kwargs, *args, **kwargs): + def fn(df: Any, resample_kwargs: Any, *args: Any, **kwargs: Any) -> Any: """Resample time-series data of the passed frame and apply specified aggregation.""" if squeeze_self: df = df.squeeze(axis=1) @@ -60,7 +61,9 @@ class ResampleDefault(DefaultMethod): OBJECT_TYPE = "Resampler" @classmethod - def register(cls, func, squeeze_self=False, **kwargs): + def register( + cls, func: Callable, squeeze_self: bool = False, **kwargs: Any + ) -> Callable: """ Build function that do fallback to pandas and aggregate resampled data. diff --git a/modin/core/dataframe/algebra/default2pandas/rolling.py b/modin/core/dataframe/algebra/default2pandas/rolling.py index 08c2cbf0093..0bd837a16e4 100644 --- a/modin/core/dataframe/algebra/default2pandas/rolling.py +++ b/modin/core/dataframe/algebra/default2pandas/rolling.py @@ -13,6 +13,7 @@ """Module houses default Rolling functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod @@ -22,7 +23,7 @@ class Rolling: """Builder for aggregation on a rolling window functions.""" @classmethod - def build_rolling(cls, func): + def build_rolling(cls, func: Callable) -> Callable: """ Build function that creates a rolling window and executes `func` on it. @@ -37,7 +38,7 @@ def build_rolling(cls, func): Function that takes pandas DataFrame and applies `func` on a rolling window. """ - def fn(df, rolling_args, *args, **kwargs): + def fn(df: Any, rolling_args: Any, *args: Any, **kwargs: Any) -> Any: """Create rolling window for the passed frame and execute specified `func` on it.""" rolling_args, rolling_kw = rolling_args roller = df.rolling(*rolling_args, **rolling_kw) @@ -56,7 +57,7 @@ class RollingDefault(DefaultMethod): OBJECT_TYPE = "Rolling" @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Callable, **kwargs: Any) -> Callable: """ Build function that do fallback to pandas to apply `func` on a rolling window. diff --git a/modin/core/dataframe/algebra/default2pandas/series.py b/modin/core/dataframe/algebra/default2pandas/series.py index a3d3e84c58f..601cd4b6a61 100644 --- a/modin/core/dataframe/algebra/default2pandas/series.py +++ b/modin/core/dataframe/algebra/default2pandas/series.py @@ -14,6 +14,7 @@ """Module houses default Series functions builder class.""" from .default import DefaultMethod +import pandas class SeriesDefault(DefaultMethod): @@ -22,7 +23,7 @@ class SeriesDefault(DefaultMethod): OBJECT_TYPE = "Series" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.Series: """ Squeeze passed DataFrame to be able to process Series-specific functions on it. 
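The resample and rolling builders above ultimately reconstruct a pandas object from stored constructor arguments and then apply the requested aggregation; a plain pandas sketch of what the generated `fn` executes:

```python
import pandas

idx = pandas.date_range("2023-01-01", periods=6, freq="D")
ser = pandas.Series(range(6), index=idx)

# Resampler: rebuild from the saved arguments, then aggregate.
resample_args, resample_kwargs = ("2D",), {}
print(ser.resample(*resample_args, **resample_kwargs).sum())

# Rolling window: same idea with the saved (args, kwargs) pair.
rolling_args, rolling_kwargs = (3,), {"min_periods": 1}
print(ser.rolling(*rolling_args, **rolling_kwargs).mean())
```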
diff --git a/modin/core/dataframe/algebra/default2pandas/str.py b/modin/core/dataframe/algebra/default2pandas/str.py index 002b7744fdd..cfba6a62162 100644 --- a/modin/core/dataframe/algebra/default2pandas/str.py +++ b/modin/core/dataframe/algebra/default2pandas/str.py @@ -14,13 +14,16 @@ """Module houses default applied-on-str functions builder class.""" from .series import SeriesDefault +import pandas class StrDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under `str` accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.strings.accessor.StringMethods: """ Get `str` accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 419a0b56903..122a489d93c 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,6 +13,7 @@ """Module houses builder class for Fold operator.""" +from typing import Any, Callable, Iterable from .operator import Operator @@ -20,7 +21,7 @@ class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function): + def register(cls, fold_function: Callable) -> Callable: """ Build Fold operator that will be performed across rows/columns. @@ -35,7 +36,12 @@ def register(cls, fold_function): Function that takes query compiler and executes Fold function. """ - def caller(query_compiler, fold_axis=None, *args, **kwargs): + def caller( + query_compiler: Any, + fold_axis: int | None = None, + *args: Iterable, + **kwargs: Any + ) -> Any: """ Execute Fold function against passed query compiler. diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index 0a25c2f2a91..77a0f3850fb 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -14,6 +14,7 @@ """Module houses builder class for GroupByReduce operator.""" from collections.abc import Container +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas from .tree_reduce import TreeReduce @@ -26,7 +27,12 @@ class GroupByReduce(TreeReduce): """Builder class for GroupBy aggregation functions.""" @classmethod - def register(cls, map_func, reduce_func=None, **call_kwds): + def register( + cls, + map_func: Union[str, dict, Callable], + reduce_func: Union[str, dict, Callable] = None, + **call_kwds: Any, + ) -> Callable: """ Build template GroupBy aggregation function. @@ -72,15 +78,15 @@ def build_fn(name): @classmethod def map( cls, - df, - map_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - other=None, - by=None, - ): + df: pandas.DataFrame, + map_func: Union[dict, Callable[pandas.DataFrameGroupBy]], + axis: int, + groupby_kwargs: Dict, + agg_args: List, + agg_kwargs: dict, + other: Optional[pandas.DataFrame] = None, + by: Optional[Union[list, str]] = None, + ) -> pandas.DataFrame: """ Execute Map phase of GroupByReduce. 
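For the `Fold` builder earlier in this hunk: the point of a separate operator is that fold functions need a whole axis at once, unlike element-wise map functions. A small check illustrating the difference (the partition sizes here are arbitrary):

```python
import numpy as np
import pandas

df = pandas.DataFrame({"a": [1, 2, 3, 4]})
chunks = np.array_split(df, 2)

# An element-wise (Map) operation can run on row blocks independently.
assert pandas.concat(c.abs() for c in chunks).equals(df.abs())

# A Fold operation such as cumsum needs the full column: applying it per block
# gives a different (wrong) answer, hence the dedicated builder and fold_axis.
per_chunk = pandas.concat(c.cumsum() for c in chunks)
assert not per_chunk.equals(df.cumsum())
```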
@@ -129,8 +135,7 @@ def map( other = other.squeeze(axis=axis ^ 1) if isinstance(other, pandas.DataFrame): df = pandas.concat( - [df] + [other[[o for o in other if o not in df]]], - axis=1, + [df] + [other[[o for o in other if o not in df]]], axis=1, ) other = list(other.columns) by_part = other @@ -146,15 +151,15 @@ def map( @classmethod def reduce( cls, - df, - reduce_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - partition_idx=0, - drop=False, - method=None, + df: pandas.DataFrame, + reduce_func: Callable, + axis: int, + groupby_kwargs: dict, + agg_args: list, + agg_kwargs: dict, + partition_idx: int = 0, + drop: bool = False, + method: Optional[str] = None, ): """ Execute Reduce phase of GroupByReduce. @@ -237,18 +242,18 @@ def reduce( @classmethod def caller( cls, - query_compiler, - by, - map_func, - reduce_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - method=None, - default_to_pandas_func=None, - ): + query_compiler: Any, + by: Any, + map_func: Union[dict, Callable], + reduce_func: Union[dict, Callable], + axis: int, + groupby_kwargs: dict, + agg_args: list, + agg_kwargs: dict, + drop: bool = False, + method: Optional[str] = None, + default_to_pandas_func: Callable = None, + ) -> Any: """ Execute GroupBy aggregation with TreeReduce approach. @@ -355,7 +360,9 @@ def caller( return result @staticmethod - def try_filter_dict(agg_func, df): + def try_filter_dict( + agg_func: Union[dict, Callable], df: pandas.DataFrame + ) -> Callable: """ Build aggregation function to apply to each group at this particular partition. @@ -382,16 +389,16 @@ def try_filter_dict(agg_func, df): @classmethod def build_map_reduce_functions( cls, - by, - axis, - groupby_kwargs, - map_func, - reduce_func, - agg_args, - agg_kwargs, - drop=False, - method=None, - ): + by: Any, + axis: int, + groupby_kwargs: dict, + map_func: pandas.DataFrame, + reduce_func: pandas.DataFrame, + agg_args: list, + agg_kwargs: dict, + drop: bool = False, + method: Optional[str] = None, + ) -> Tuple(Callable): """ Bind appropriate arguments to map and reduce functions. @@ -428,8 +435,8 @@ def build_map_reduce_functions( if hasattr(by, "_modin_frame"): by = None - def _map(df, other=None, **kwargs): - def wrapper(df, other=None): + def _map(df: pandas.DataFrame, other=None, **kwargs) -> pandas.DataFrame: + def wrapper(df: pandas.DataFrame, other=None) -> pandas.DataFrame: return cls.map( df, other=other, @@ -450,8 +457,8 @@ def wrapper(df, other=None): result = wrapper(df.copy(), other if other is None else other.copy()) return result - def _reduce(df, **call_kwargs): - def wrapper(df): + def _reduce(df: pandas.DataFrame, **call_kwargs: Any) -> pandas.DataFrame: + def wrapper(df: pandas.DataFrame) -> pandas.DataFrame: return cls.reduce( df, axis=axis, @@ -488,7 +495,7 @@ def wrapper(df): } -def _is_reduce_function_with_depth(fn, depth: int = 0): +def _is_reduce_function_with_depth(fn: Any, depth: int = 0) -> bool: """ Check whether all functions defined by `fn` are groupby reductions. @@ -526,7 +533,7 @@ def _is_reduce_function_with_depth(fn, depth: int = 0): return isinstance(fn, str) and fn in groupby_reduce_functions -def is_reduce_function(fn): +def is_reduce_function(fn: Any) -> bool: """ Check whether all functions defined by `fn` are groupby reductions. 
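The map/reduce pair above implements a two-phase groupby: aggregate each partition, then aggregate the partial results. A local sketch of that flow for a count-style reduction (plain pandas, no partition manager):

```python
import pandas

df = pandas.DataFrame({"key": ["a", "b", "a", "b", "a"], "x": [1, 2, 3, 4, 5]})
partitions = [df.iloc[:2], df.iloc[2:]]

# Map phase: aggregate every row partition on its own.
mapped = [part.groupby("key").count() for part in partitions]

# Reduce phase: combine the partial results and aggregate again; for ``count``
# the second step is a sum of the per-partition counts.
reduced = pandas.concat(mapped).groupby(level=0).sum()

assert reduced.equals(df.groupby("key").count())
print(reduced)
```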
diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index f5922617b03..3a45156bbf2 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,6 +13,7 @@ """Module houses builder class for Map operator.""" +from typing import Any, Callable from .operator import Operator @@ -20,7 +21,9 @@ class Map(Operator): """Builder class for Map operator.""" @classmethod - def register(cls, function, *call_args, **call_kwds): + def register( + cls, function: Callable, *call_args: Any, **call_kwds: Any + ) -> Callable: """ Build Map operator that will be performed across each partition. @@ -41,7 +44,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. """ - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute Map function against passed query compiler.""" return query_compiler.__constructor__( query_compiler._modin_frame.map( diff --git a/modin/core/dataframe/algebra/operator.py b/modin/core/dataframe/algebra/operator.py index cc093e6720b..0eaaf15f117 100644 --- a/modin/core/dataframe/algebra/operator.py +++ b/modin/core/dataframe/algebra/operator.py @@ -13,13 +13,13 @@ """Module contains an interface for operator builder classes.""" -from typing import Optional +from typing import Any, Callable, Optional class Operator(object): """Interface for building operators that can execute in parallel across partitions.""" - def __init__(self): + def __init__(self) -> None: raise ValueError( "Please use {}.register instead of the constructor".format( type(self).__name__ @@ -27,7 +27,7 @@ def __init__(self): ) @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Callable, **kwargs: Any) -> Callable: """ Build operator that applies source function across the entire dataset. diff --git a/modin/core/dataframe/algebra/reduce.py b/modin/core/dataframe/algebra/reduce.py index 0f4fbe3667f..2089192a00d 100644 --- a/modin/core/dataframe/algebra/reduce.py +++ b/modin/core/dataframe/algebra/reduce.py @@ -13,6 +13,7 @@ """Module houses builder class for Reduce operator.""" +from typing import Any, Callable, Optional from .operator import Operator @@ -20,7 +21,9 @@ class Reduce(Operator): """Builder class for Reduce operator.""" @classmethod - def register(cls, reduce_function, axis=None): + def register( + cls, reduce_function: Callable, axis: Optional[int] = None + ) -> Callable: """ Build Reduce operator that will be performed across rows/columns. @@ -39,7 +42,7 @@ def register(cls, reduce_function, axis=None): Function that takes query compiler and executes Reduce function. 
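The Map/Reduce operator modules above all follow the same shape: `register` closes over a function (and optionally an axis) and returns a caller. A simplified, pandas-only sketch of the `Reduce` variant, where the axis fixed at registration time takes precedence:

```python
from typing import Any, Callable, Optional
import pandas


class Reduce:
    """Toy version of the Reduce builder above."""

    @classmethod
    def register(cls, reduce_function: Callable, axis: Optional[int] = None) -> Callable:
        def caller(df: pandas.DataFrame, *args: Any, **kwargs: Any) -> pandas.Series:
            # The registration-time axis wins over a caller-supplied one.
            _axis = kwargs.pop("axis", 0) if axis is None else axis
            kwargs.pop("axis", None)
            return reduce_function(df, *args, axis=_axis, **kwargs)

        return caller


col_sum = Reduce.register(pandas.DataFrame.sum, axis=0)
print(col_sum(pandas.DataFrame({"a": [1, 2], "b": [3, 4]})))
```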
""" - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute Reduce function against passed query compiler.""" _axis = kwargs.get("axis") if axis is None else axis return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index 671faa1ea0a..94cb1aaeebf 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -13,6 +13,7 @@ """Module houses builder class for TreeReduce operator.""" +from typing import Any, Callable, Optional from .operator import Operator @@ -20,7 +21,12 @@ class TreeReduce(Operator): """Builder class for TreeReduce operator.""" @classmethod - def register(cls, map_function, reduce_function=None, axis=None): + def register( + cls, + map_function: Callable, + reduce_function: Optional[Callable] = None, + axis: Optional[int] = None, + ) -> Callable: """ Build TreeReduce operator. @@ -42,7 +48,7 @@ def register(cls, map_function, reduce_function=None, axis=None): if reduce_function is None: reduce_function = map_function - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute TreeReduce function against passed query compiler.""" _axis = kwargs.get("axis") if axis is None else axis return query_compiler.__constructor__( diff --git a/modin/core/dataframe/base/dataframe/dataframe.py b/modin/core/dataframe/base/dataframe/dataframe.py index 536d43ceaea..6595a852dbd 100644 --- a/modin/core/dataframe/base/dataframe/dataframe.py +++ b/modin/core/dataframe/base/dataframe/dataframe.py @@ -255,10 +255,7 @@ def groupby( @abstractmethod def reduce( - self, - axis: Union[int, Axis], - function: Callable, - dtypes: Optional[str] = None, + self, axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, ) -> "ModinDataframe": """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 5b60d54191c..a35e2e9604b 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1003,11 +1003,7 @@ def from_labels_executor(df, **kwargs): return result new_parts = self._partition_mgr_cls.apply_func_to_select_indices( - 0, - self._partitions, - from_labels_executor, - [0], - keep_remaining=True, + 0, self._partitions, from_labels_executor, [0], keep_remaining=True, ) new_column_widths = [ self.index.nlevels + self.column_widths[0] @@ -1558,19 +1554,13 @@ def _compute_tree_reduce_metadata(self, axis, new_parts): new_dtypes = None result = self.__constructor__( - new_parts, - *new_axes, - *new_axes_lengths, - new_dtypes, + new_parts, *new_axes, *new_axes_lengths, new_dtypes, ) return result @lazy_metadata_decorator(apply_axis="both") def reduce( - self, - axis: Union[int, Axis], - function: Callable, - dtypes: Optional[str] = None, + self, axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, ) -> "PandasDataframe": """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. Requires knowledge of the full axis for the reduction. 
@@ -2021,12 +2011,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": @lazy_metadata_decorator(apply_axis="both") def apply_full_axis( - self, - axis, - func, - new_index=None, - new_columns=None, - dtypes=None, + self, axis, func, new_index=None, new_columns=None, dtypes=None, ): """ Perform a function across an entire axis. @@ -2112,14 +2097,8 @@ def apply_full_axis_select_indices( # Get the indices for the axis being applied to (it is the opposite of axis # being applied over) dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) - new_partitions = ( - self._partition_mgr_cls.apply_func_to_select_indices_along_full_axis( - axis, - self._partitions, - func, - dict_indices, - keep_remaining=keep_remaining, - ) + new_partitions = self._partition_mgr_cls.apply_func_to_select_indices_along_full_axis( + axis, self._partitions, func, dict_indices, keep_remaining=keep_remaining, ) # TODO Infer columns and index from `keep_remaining` and `apply_indices` if new_index is None: @@ -2698,11 +2677,7 @@ def n_ary_op(self, op, right_frames: list, join_type="outer"): ) return self.__constructor__( - new_frame, - joined_index, - joined_columns, - row_lengths, - column_widths, + new_frame, joined_index, joined_columns, row_lengths, column_widths, ) @lazy_metadata_decorator(apply_axis="both") diff --git a/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py b/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py index e330e78512b..61fc21ebf95 100644 --- a/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py +++ b/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py @@ -70,10 +70,7 @@ class PandasProtocolDataframe(ProtocolDataframe): """ def __init__( - self, - df: PandasDataframe, - nan_as_null: bool = False, - allow_copy: bool = True, + self, df: PandasDataframe, nan_as_null: bool = False, allow_copy: bool = True, ) -> None: self._df = df self._nan_as_null = nan_as_null diff --git a/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py index 2b312bd77fd..35dfb300272 100644 --- a/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py @@ -125,12 +125,7 @@ def unpack_protocol_column( which keeps memory referenced by the column alive. 
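`PandasProtocolDataframe` above is Modin's producer side of the DataFrame interchange protocol; pandas itself ships both sides, which is enough to see the round trip the `from_dataframe` helpers implement (version assumption: pandas >= 1.5):

```python
import pandas

df = pandas.DataFrame({"i": [1, 2, 3], "f": [0.5, 1.5, 2.5]})

# Producer side: the interchange object with the nan_as_null / allow_copy knobs.
interchange_df = df.__dataframe__(allow_copy=True)
print(interchange_df.num_columns(), list(interchange_df.column_names()))

# Consumer side: rebuild a pandas frame from any object exposing __dataframe__.
print(pandas.api.interchange.from_dataframe(df))
```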
""" dtype = col.dtype[0] - if dtype in ( - DTypeKind.INT, - DTypeKind.UINT, - DTypeKind.FLOAT, - DTypeKind.BOOL, - ): + if dtype in (DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL,): return primitive_column_to_ndarray(col) elif dtype == DTypeKind.CATEGORICAL: return categorical_column_to_series(col) diff --git a/modin/core/dataframe/pandas/partitioning/axis_partition.py b/modin/core/dataframe/pandas/partitioning/axis_partition.py index 55a36b28c15..2177fefacb5 100644 --- a/modin/core/dataframe/pandas/partitioning/axis_partition.py +++ b/modin/core/dataframe/pandas/partitioning/axis_partition.py @@ -250,9 +250,7 @@ def deploy_func_between_two_axis_partitions( # reshaping flattened `rt_parts` array into a frame with shape `other_shape` combined_axis = [ pandas.concat( - rt_parts[other_shape[i - 1] : other_shape[i]], - axis=axis, - copy=False, + rt_parts[other_shape[i - 1] : other_shape[i]], axis=axis, copy=False, ) for i in range(1, len(other_shape)) ] diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 9003fc527b0..9abb820c361 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -1396,14 +1396,9 @@ def rebalance_partitions(cls, partitions): new_partitions = np.array( [ cls.column_partitions( - partitions[i : i + chunk_size], - full_axis=False, - ) - for i in range( - 0, - num_existing_partitions, - chunk_size, + partitions[i : i + chunk_size], full_axis=False, ) + for i in range(0, num_existing_partitions, chunk_size,) ] ) return new_partitions, None diff --git a/modin/core/execution/dask/common/engine_wrapper.py b/modin/core/execution/dask/common/engine_wrapper.py index 03e8871b279..72433cf9ca2 100644 --- a/modin/core/execution/dask/common/engine_wrapper.py +++ b/modin/core/execution/dask/common/engine_wrapper.py @@ -21,12 +21,7 @@ class DaskWrapper: @classmethod def deploy( - cls, - func, - f_args=None, - f_kwargs=None, - num_returns=1, - pure=True, + cls, func, f_args=None, f_kwargs=None, num_returns=1, pure=True, ): """ Deploy a function in a worker process. 
diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py index 10ce592766a..2c3e27ad946 100644 --- a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py +++ b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py @@ -201,10 +201,7 @@ def deploy_axis_func( maintain_partitioning, *partitions, ), - f_kwargs={ - "lengths": lengths, - "manual_partition": manual_partition, - }, + f_kwargs={"lengths": lengths, "manual_partition": manual_partition,}, num_returns=result_num_splits * 4, pure=False, ) diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index 9b51e916955..a3285bf299d 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -40,7 +40,7 @@ # This constant should be in sync with the limit in ray, which is private, # not exposed to users, and not documented: # https://github.com/ray-project/ray/blob/4692e8d8023e789120d3f22b41ffb136b50f70ea/python/ray/_private/ray_constants.py#L57-L62 -_MAC_OBJECT_STORE_LIMIT_BYTES = 2 * 2**30 +_MAC_OBJECT_STORE_LIMIT_BYTES = 2 * 2 ** 30 ObjectIDType = ray.ObjectRef if version.parse(ray.__version__) >= version.parse("1.2.0"): diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py index b34e08eeaaa..882f7b2e345 100644 --- a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py @@ -265,11 +265,7 @@ def iloc(df, row_labels, col_labels): iloc = cuDFOnRayDataframePartition.preprocess_func(iloc) return self.gpu_manager.apply.remote( - self.key, - None, - iloc, - col_labels=col_labels, - row_labels=row_labels, + self.key, None, iloc, col_labels=col_labels, row_labels=row_labels, ) def get_gpu_manager(self): @@ -357,8 +353,7 @@ def convert(df): # FIXME: Can't find `gpu_manager.apply_result_not_dataframe` method. return self.gpu_manager.apply_result_not_dataframe.remote( - self.get_key(), - convert, + self.get_key(), convert, ) def free(self): @@ -373,10 +368,7 @@ def copy(self): ------- cuDFOnRayDataframePartition """ - new_key = self.gpu_manager.apply.remote( - self.get_key(), - lambda x: x, - ) + new_key = self.gpu_manager.apply.remote(self.get_key(), lambda x: x,) new_key = RayWrapper.materialize(new_key) return self.__constructor__(self.gpu_manager, new_key) diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py index 6561fcda963..455d8edb4db 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py @@ -178,12 +178,9 @@ def drain_call_queue(self): # this dramatically improves performance. 
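Also in this region, `2 * 2 ** 30` and `2 * 2**30` are the same value; only the formatting changes. The more substantive code is `drain_call_queue`, which flushes lazily recorded work. A local, non-distributed sketch of that call-queue idea (not Modin's remote implementation):

```python
from typing import Any, Callable, List, Tuple
import pandas


class LazyPartition:
    """Record work as (func, args, kwargs) and apply it only when drained."""

    def __init__(self, data: pandas.DataFrame) -> None:
        self._data = data
        self.call_queue: List[Tuple[Callable, tuple, dict]] = []

    def add_to_apply_calls(self, func: Callable, *args: Any, **kwargs: Any) -> "LazyPartition":
        self.call_queue.append((func, args, kwargs))
        return self

    def drain_call_queue(self) -> pandas.DataFrame:
        for func, args, kwargs in self.call_queue:
            self._data = func(self._data, *args, **kwargs)
        self.call_queue = []
        return self._data


part = LazyPartition(pandas.DataFrame({"a": [-1, 2, -3]}))
part.add_to_apply_calls(pandas.DataFrame.abs).add_to_apply_calls(pandas.DataFrame.add, 10)
print(part.drain_call_queue())
```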
func, f_args, f_kwargs = call_queue[0] logger.debug(f"SUBMIT::_apply_func::{self._identity}") - ( - self._data, - new_length, - new_width, - self._ip_cache, - ) = _apply_func.remote(data, func, *f_args, **f_kwargs) + (self._data, new_length, new_width, self._ip_cache,) = _apply_func.remote( + data, func, *f_args, **f_kwargs + ) logger.debug(f"EXIT::Partition.drain_call_queue::{self._identity}") self.call_queue = [] diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py index 91a720bcb3e..9652e128c1f 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py @@ -65,8 +65,7 @@ def magic(*args, **kwargs): ) = inspect.getframeinfo(current_frame) current_frame = current_frame.f_back t = threading.Thread( - target=call_progress_bar, - args=(result_parts, line_number), + target=call_progress_bar, args=(result_parts, line_number), ) t.start() # We need to know whether or not we are in a jupyter notebook diff --git a/modin/core/io/__init__.py b/modin/core/io/__init__.py index 9b7c8fc2d11..a5ffa4f2f1f 100644 --- a/modin/core/io/__init__.py +++ b/modin/core/io/__init__.py @@ -18,9 +18,7 @@ from .text.csv_glob_dispatcher import CSVGlobDispatcher from .text.fwf_dispatcher import FWFDispatcher from .text.json_dispatcher import JSONDispatcher -from .text.custom_text_dispatcher import ( - CustomTextExperimentalDispatcher, -) +from .text.custom_text_dispatcher import CustomTextExperimentalDispatcher from .text.excel_dispatcher import ExcelDispatcher from .file_dispatcher import FileDispatcher from .text.text_file_dispatcher import TextFileDispatcher diff --git a/modin/core/io/column_stores/column_store_dispatcher.py b/modin/core/io/column_stores/column_store_dispatcher.py index a49ce859eb3..a2b4e795f30 100644 --- a/modin/core/io/column_stores/column_store_dispatcher.py +++ b/modin/core/io/column_stores/column_store_dispatcher.py @@ -227,12 +227,7 @@ def build_query_compiler(cls, path, columns, **kwargs): ) new_query_compiler = cls.query_compiler_cls( cls.frame_cls( - remote_parts, - index, - columns, - row_lens, - column_widths, - dtypes=dtypes, + remote_parts, index, columns, row_lens, column_widths, dtypes=dtypes, ) ) return new_query_compiler diff --git a/modin/core/io/column_stores/feather_dispatcher.py b/modin/core/io/column_stores/feather_dispatcher.py index 933e26918e4..15fe8e73b5f 100644 --- a/modin/core/io/column_stores/feather_dispatcher.py +++ b/modin/core/io/column_stores/feather_dispatcher.py @@ -54,10 +54,7 @@ def _read(cls, path, columns=None, **kwargs): ) from pyarrow.feather import read_feather - with OpenFile( - path, - **(kwargs.get("storage_options", None) or {}), - ) as file: + with OpenFile(path, **(kwargs.get("storage_options", None) or {}),) as file: df = read_feather(file) # pyarrow.feather.read_feather doesn't support columns as pandas.Index columns = list(df.columns) diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py index 29f959ab54e..e4da257ad9d 100644 --- a/modin/core/io/column_stores/parquet_dispatcher.py +++ b/modin/core/io/column_stores/parquet_dispatcher.py @@ -218,8 +218,7 @@ def files(self): return self._files def to_pandas_dataframe( - self, - columns, + self, columns, ): from pyarrow.parquet import read_table @@ -372,11 +371,7 @@ def 
call_deploy(cls, dataset, col_partitions, storage_options, **kwargs): parquet_files = dataset.files # step determines how many row groups are going to be in a partition - step = compute_chunksize( - num_row_groups, - NPartitions.get(), - min_block_size=1, - ) + step = compute_chunksize(num_row_groups, NPartitions.get(), min_block_size=1,) current_partition_size = 0 file_index = 0 partition_files = [] # 2D array - each element contains list of chunks to read @@ -469,9 +464,7 @@ def build_partition(cls, partition_ids, column_widths): [ [ cls.frame_partition_cls( - part_id[0], - length=part_id[2], - width=col_width, + part_id[0], length=part_id[2], width=col_width, ) for part_id, col_width in zip(part_ids, column_widths) ] @@ -604,8 +597,7 @@ def _read(cls, path, engine, columns, **kwargs): https://arrow.apache.org/docs/python/parquet.html """ import_optional_dependency( - "pyarrow", - "pyarrow is required to read parquet files.", + "pyarrow", "pyarrow is required to read parquet files.", ) from modin.pandas.io import PQ_INDEX_REGEX diff --git a/modin/core/io/io.py b/modin/core/io/io.py index b388a2f9e6d..54073ac06ae 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -117,11 +117,7 @@ def from_dataframe(cls, df): ) def _read_parquet(cls, **kwargs): # noqa: PR01 ErrorMessage.default_to_pandas("`read_parquet`") - return cls.from_pandas( - pandas.read_parquet( - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_parquet(**kwargs,)) @classmethod @_inherit_docstrings(pandas.read_csv, apilink="pandas.read_csv") @@ -131,9 +127,7 @@ def _read_parquet(cls, **kwargs): # noqa: PR01 returns=_doc_returns_qc_or_parser, ) def _read_csv( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_csv`") return cls._read(filepath_or_buffer=filepath_or_buffer, **kwargs) @@ -173,8 +167,7 @@ def _read(cls, **kwargs): returns=_doc_returns_qc, ) def _read_json( - cls, - **kwargs, + cls, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_json`") return cls.from_pandas(pandas.read_json(**kwargs)) @@ -412,17 +405,10 @@ def read_hdf( returns=_doc_returns_qc, ) def _read_feather( - cls, - path, - **kwargs, + cls, path, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_feather`") - return cls.from_pandas( - pandas.read_feather( - path, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_feather(path, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_stata, apilink="pandas.read_stata") @@ -432,9 +418,7 @@ def _read_feather( returns=_doc_returns_qc, ) def _read_stata( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_stata`") return cls.from_pandas(pandas.read_stata(filepath_or_buffer, **kwargs)) @@ -477,17 +461,10 @@ def read_sas( returns=_doc_returns_qc, ) def _read_pickle( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_pickle`") - return cls.from_pandas( - pandas.read_pickle( - filepath_or_buffer, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_pickle(filepath_or_buffer, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_sql, apilink="pandas.read_sql") @@ -592,19 +569,10 @@ def read_sql_table( returns=_doc_returns_qc, ) def _read_sql_query( - cls, - sql, - con, - **kwargs, + cls, sql, con, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_sql_query`") - return cls.from_pandas( - pandas.read_sql_query( 
- sql, - con, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_sql_query(sql, con, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_spss, apilink="pandas.read_spss") @@ -656,10 +624,7 @@ def to_sql( pandas.DataFrame.to_pickle, apilink="pandas.DataFrame.to_pickle" ) def _to_pickle( - cls, - obj: Any, - filepath_or_buffer, - **kwargs, + cls, obj: Any, filepath_or_buffer, **kwargs, ): # noqa: PR01, D200 """ Pickle (serialize) object to file. @@ -668,11 +633,7 @@ def _to_pickle( if isinstance(obj, BaseQueryCompiler): obj = obj.to_pandas() - return pandas.to_pickle( - obj, - filepath_or_buffer=filepath_or_buffer, - **kwargs, - ) + return pandas.to_pickle(obj, filepath_or_buffer=filepath_or_buffer, **kwargs,) @classmethod @_inherit_docstrings(pandas.DataFrame.to_csv, apilink="pandas.DataFrame.to_csv") diff --git a/modin/core/io/pickle/pickle_dispatcher.py b/modin/core/io/pickle/pickle_dispatcher.py index 32feae81f65..49ec8eb1097 100644 --- a/modin/core/io/pickle/pickle_dispatcher.py +++ b/modin/core/io/pickle/pickle_dispatcher.py @@ -70,12 +70,7 @@ def _read(cls, filepath_or_buffer, **kwargs): for idx, file_name in enumerate(filepath_or_buffer): *partition_ids[idx], lengths_ids[idx], widths_ids[idx] = cls.deploy( - func=cls.parse, - f_kwargs={ - "fname": file_name, - **kwargs, - }, - num_returns=3, + func=cls.parse, f_kwargs={"fname": file_name, **kwargs,}, num_returns=3, ) lengths = cls.materialize(lengths_ids) widths = cls.materialize(widths_ids) diff --git a/modin/core/io/text/csv_glob_dispatcher.py b/modin/core/io/text/csv_glob_dispatcher.py index 5a687da4d6c..109dca3703a 100644 --- a/modin/core/io/text/csv_glob_dispatcher.py +++ b/modin/core/io/text/csv_glob_dispatcher.py @@ -72,9 +72,7 @@ def _read(cls, filepath_or_buffer, **kwargs): filepath_or_buffer = cls.get_path(filepath_or_buffer) elif not cls.pathlib_or_pypath(filepath_or_buffer): return cls.single_worker_read( - filepath_or_buffer, - reason=cls.BUFFER_UNSUPPORTED_MSG, - **kwargs, + filepath_or_buffer, reason=cls.BUFFER_UNSUPPORTED_MSG, **kwargs, ) # We read multiple csv files when the file path is a list of absolute file paths. We assume that all of the files will be essentially replicas of the @@ -161,8 +159,7 @@ def _read(cls, filepath_or_buffer, **kwargs): if usecols is not None and usecols_md[1] != "integer": del kwargs["usecols"] all_cols = pandas.read_csv( - filepath_or_buffer, - **dict(kwargs, nrows=0, skipfooter=0), + filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0), ).columns usecols = all_cols.get_indexer_for(list(usecols_md[0])) parse_dates = kwargs.pop("parse_dates", False) @@ -237,9 +234,7 @@ def _read(cls, filepath_or_buffer, **kwargs): for idx, chunks in enumerate(splits): args.update({"chunks": chunks}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx] = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 2, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) # Compute the index based on a sum of the lengths of each partition (by default) @@ -477,10 +472,7 @@ def partitioned_file( # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df. 
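The experimental pickle dispatcher above fans out one task per input file and stitches the results back together. A self-contained, thread-based sketch of that fan-out (the real code deploys to the configured engine rather than a thread pool):

```python
import concurrent.futures
import os
import tempfile

import pandas

with tempfile.TemporaryDirectory() as tmp:
    # Create a few small pickle files to stand in for the user's inputs.
    paths = []
    for i in range(3):
        path = os.path.join(tmp, f"part{i}.pkl")
        pandas.DataFrame({"x": [i, i + 1]}).to_pickle(path)
        paths.append(path)

    # One read task per file, results concatenated afterwards.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        frames = list(pool.map(pandas.read_pickle, paths))

    print(pandas.concat(frames, ignore_index=True))
```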
outside_quotes, read_rows = cls._read_rows( - f, - nrows=skip_amount, - quotechar=quotechar, - is_quoting=is_quoting, + f, nrows=skip_amount, quotechar=quotechar, is_quoting=is_quoting, ) if skiprows: skiprows -= read_rows @@ -513,10 +505,7 @@ def partitioned_file( read_size = nrows - read_rows_counter outside_quotes, read_rows = cls._read_rows( - f, - nrows=read_size, - quotechar=quotechar, - is_quoting=is_quoting, + f, nrows=read_size, quotechar=quotechar, is_quoting=is_quoting, ) split_size += read_rows read_rows_counter += read_rows diff --git a/modin/core/io/text/excel_dispatcher.py b/modin/core/io/text/excel_dispatcher.py index 4b30d171ee7..9e51e2b3ab9 100644 --- a/modin/core/io/text/excel_dispatcher.py +++ b/modin/core/io/text/excel_dispatcher.py @@ -199,9 +199,7 @@ def _read(cls, io, **kwargs): if b"" not in chunk and b"" in chunk: break remote_results_list = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 2, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) data_ids.append(remote_results_list[:-2]) index_ids.append(remote_results_list[-2]) diff --git a/modin/core/io/text/json_dispatcher.py b/modin/core/io/text/json_dispatcher.py index d8bb3e33026..bd36d7dcf52 100644 --- a/modin/core/io/text/json_dispatcher.py +++ b/modin/core/io/text/json_dispatcher.py @@ -65,19 +65,14 @@ def _read(cls, path_or_buf, **kwargs): with OpenFile(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: column_widths, num_splits = cls._define_metadata(empty_pd_df, columns) args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs} - splits = cls.partitioned_file( - f, - num_partitions=NPartitions.get(), - ) + splits = cls.partitioned_file(f, num_partitions=NPartitions.get(),) partition_ids = [None] * len(splits) index_ids = [None] * len(splits) dtypes_ids = [None] * len(splits) for idx, (start, end) in enumerate(splits): args.update({"start": start, "end": end}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx], _ = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 3, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 3, ) # partition_id[-1] contains the columns for each partition, which will be useful # for implementing when `lines=False`. diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index 89fe154994f..ad7dbf3c4c0 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -535,9 +535,7 @@ def _define_header_size( @classmethod def _define_metadata( - cls, - df: pandas.DataFrame, - column_names: ColumnNamesTypes, + cls, df: pandas.DataFrame, column_names: ColumnNamesTypes, ) -> Tuple[list, int]: """ Define partitioning metadata. @@ -805,9 +803,7 @@ def skiprows_func(x): @classmethod def _define_index( - cls, - index_ids: list, - index_name: str, + cls, index_ids: list, index_name: str, ) -> Tuple[IndexColType, list]: """ Compute the resulting DataFrame index and index lengths for each of partitions. 
@@ -981,10 +977,7 @@ def _read(cls, filepath_or_buffer, **kwargs): # Define header size for further skipping (Header can be skipped because header # information will be obtained further from empty_df, so no need to handle it # by workers) - header_size = cls._define_header_size( - header, - names, - ) + header_size = cls._define_header_size(header, names,) ( skiprows_md, pre_reading, @@ -995,10 +988,7 @@ def _read(cls, filepath_or_buffer, **kwargs): ) (use_modin_impl, fallback_reason) = cls.check_parameters_support( - filepath_or_buffer_md, - kwargs, - skiprows_md, - header_size, + filepath_or_buffer_md, kwargs, skiprows_md, header_size, ) if not use_modin_impl: return cls.single_worker_read( diff --git a/modin/core/storage_formats/base/doc_utils.py b/modin/core/storage_formats/base/doc_utils.py index b538c47c92b..2fe591a1c11 100644 --- a/modin/core/storage_formats/base/doc_utils.py +++ b/modin/core/storage_formats/base/doc_utils.py @@ -272,10 +272,7 @@ def doc_reduce_agg(method, refer_to, params=None, extra_params=None): ] ) return doc_qc_method( - template, - params=params, - method=method, - refer_to=f"DataFrame.{refer_to}", + template, params=params, method=method, refer_to=f"DataFrame.{refer_to}", ) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index a0e05638a8e..46f8988cd49 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -752,11 +752,7 @@ def series_update(self, other, **kwargs): # noqa: PR02 New QueryCompiler with updated values. """ return BinaryDefault.register(pandas.Series.update, inplace=True)( - self, - other=other, - squeeze_self=True, - squeeze_other=True, - **kwargs, + self, other=other, squeeze_self=True, squeeze_other=True, **kwargs, ) @doc_utils.add_refer_to("DataFrame.clip") @@ -1074,10 +1070,7 @@ def prod(self, **kwargs): # noqa: PR02 return DataFrameDefault.register(pandas.DataFrame.prod)(self, **kwargs) @doc_utils.doc_reduce_agg( - method="sum", - refer_to="sum", - extra_params=["**kwargs"], - params="axis : {0, 1}", + method="sum", refer_to="sum", extra_params=["**kwargs"], params="axis : {0, 1}", ) def sum(self, **kwargs): # noqa: PR02 return DataFrameDefault.register(pandas.DataFrame.sum)(self, **kwargs) @@ -2301,10 +2294,7 @@ def apply_on_series(self, func, *args, **kwargs): assert self.is_series_like() return SeriesDefault.register(pandas.Series.apply)( - self, - func=func, - *args, - **kwargs, + self, func=func, *args, **kwargs, ) def explode(self, column): @@ -2342,13 +2332,7 @@ def explode(self, column): refer_to="count", ) def groupby_count( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.count)( self, @@ -2366,13 +2350,7 @@ def groupby_count( refer_to="any", ) def groupby_any( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.any)( self, @@ -2388,13 +2366,7 @@ def groupby_any( action="get the minimum value", result="minimum value", refer_to="min" ) def groupby_min( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return 
GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.min)( self, @@ -2408,13 +2380,7 @@ def groupby_min( @doc_utils.doc_groupby_method(result="product", refer_to="prod") def groupby_prod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.prod)( self, @@ -2430,13 +2396,7 @@ def groupby_prod( action="get the maximum value", result="maximum value", refer_to="max" ) def groupby_max( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.max)( self, @@ -2454,13 +2414,7 @@ def groupby_max( refer_to="all", ) def groupby_all( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.all)( self, @@ -2474,13 +2428,7 @@ def groupby_all( @doc_utils.doc_groupby_method(result="sum", refer_to="sum") def groupby_sum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.sum)( self, @@ -2498,13 +2446,7 @@ def groupby_sum( refer_to="size", ) def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.size)( self, @@ -2583,13 +2525,7 @@ def groupby_agg( action="compute the mean value", result="mean value", refer_to="mean" ) def groupby_mean( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2605,13 +2541,7 @@ def groupby_mean( action="compute unbiased skew", result="unbiased skew", refer_to="skew" ) def groupby_skew( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2629,13 +2559,7 @@ def groupby_skew( refer_to="cumsum", ) def groupby_cumsum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2653,13 +2577,7 @@ def groupby_cumsum( refer_to="cummax", ) def groupby_cummax( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2677,13 +2595,7 @@ def groupby_cummax( refer_to="cummin", ) def groupby_cummin( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2701,13 +2613,7 @@ def groupby_cummin( refer_to="cumprod", ) def groupby_cumprod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2723,13 +2629,7 @@ def groupby_cumprod( action="compute standard deviation", result="standard 
deviation", refer_to="std" ) def groupby_std( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2745,13 +2645,7 @@ def groupby_std( action="compute numerical rank", result="numerical rank", refer_to="rank" ) def groupby_rank( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2767,13 +2661,7 @@ def groupby_rank( action="compute variance", result="variance", refer_to="var" ) def groupby_var( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2791,13 +2679,7 @@ def groupby_var( refer_to="nunique", ) def groupby_nunique( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2813,13 +2695,7 @@ def groupby_nunique( action="get the median value", result="median value", refer_to="median" ) def groupby_median( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2837,13 +2713,7 @@ def groupby_median( refer_to="quantile", ) def groupby_quantile( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2861,13 +2731,7 @@ def groupby_quantile( refer_to="fillna", ) def groupby_fillna( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2883,13 +2747,7 @@ def groupby_fillna( action="get data types", result="data type", refer_to="dtypes" ) def groupby_dtypes( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2907,13 +2765,7 @@ def groupby_dtypes( refer_to="shift", ) def groupby_shift( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -3976,9 +3828,7 @@ def resample_pipe(self, resample_kwargs, func, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="product", - params="min_count : int", - refer_to="prod", + result="product", params="min_count : int", refer_to="prod", ) def resample_prod(self, resample_kwargs, min_count, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.prod)( @@ -3994,8 +3844,7 @@ def resample_quantile(self, resample_kwargs, q, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="standard error of the mean", - refer_to="sem", + result="standard error of the mean", refer_to="sem", ) def resample_sem(self, resample_kwargs, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.sem)( @@ -4019,9 +3868,7 @@ def resample_std(self, resample_kwargs, ddof, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="sum", - params="min_count : int", - refer_to="sum", + result="sum", params="min_count : int", refer_to="sum", ) def 
resample_sum(self, resample_kwargs, min_count, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.sum)( diff --git a/modin/core/storage_formats/cudf/parser.py b/modin/core/storage_formats/cudf/parser.py index 9daac986e13..12bedb2a5fc 100644 --- a/modin/core/storage_formats/cudf/parser.py +++ b/modin/core/storage_formats/cudf/parser.py @@ -76,10 +76,8 @@ def single_worker_read(cls, fname, *, reason, **kwargs): pandas_frame = cls.parse(fname, **kwargs) if isinstance(pandas_frame, pandas.io.parsers.TextFileReader): pd_read = pandas_frame.read - pandas_frame.read = ( - lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( - pd_read(*args, **kwargs), cls.frame_cls - ) + pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( + pd_read(*args, **kwargs), cls.frame_cls ) return pandas_frame elif isinstance(pandas_frame, (OrderedDict, dict)): diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index e08f4da8451..39c473e5fff 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -130,10 +130,7 @@ def find_common_type_cat(types): if all(isinstance(t, pandas.CategoricalDtype) for t in types): if all(t.ordered for t in types): categories = np.sort(np.unique([c for t in types for c in t.categories])) - return pandas.CategoricalDtype( - categories, - ordered=True, - ) + return pandas.CategoricalDtype(categories, ordered=True,) return union_categoricals( [pandas.Categorical([], dtype=t) for t in types], sort_categories=all(t.ordered for t in types), @@ -247,8 +244,7 @@ def get_dtypes(cls, dtypes_ids): # concat all elements of `partitions_dtypes` and find common dtype # for each of the column among all partitions frame_dtypes = combined_part_dtypes.apply( - lambda row: find_common_type_cat(row.values), - axis=1, + lambda row: find_common_type_cat(row.values), axis=1, ).squeeze(axis=0) return frame_dtypes @@ -281,10 +277,8 @@ def single_worker_read(cls, fname, *, reason: str, **kwargs): pandas_frame = cls.parse(fname, **kwargs) if isinstance(pandas_frame, pandas.io.parsers.TextFileReader): pd_read = pandas_frame.read - pandas_frame.read = ( - lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( - pd_read(*args, **kwargs), cls.frame_cls - ) + pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( + pd_read(*args, **kwargs), cls.frame_cls ) return pandas_frame elif isinstance(pandas_frame, (OrderedDict, dict)): @@ -673,10 +667,7 @@ def _read_row_group_chunk( return ( ParquetFile(f) .read_row_groups( - range( - row_group_start, - row_group_end, - ), + range(row_group_start, row_group_end,), columns=columns, use_pandas_metadata=True, ) @@ -762,10 +753,7 @@ def parse(fname, **kwargs): if num_splits is None: return pandas.read_feather(fname, **kwargs) - with OpenFile( - fname, - **(kwargs.pop("storage_options", None) or {}), - ) as file: + with OpenFile(fname, **(kwargs.pop("storage_options", None) or {}),) as file: df = feather.read_feather(file, **kwargs) # Append the length of the index here to build it externally return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes] diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 254e4de9145..50ee9d8a50a 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -440,10 +440,7 @@ def where(self, cond, other, 
**kwargs): # the same row and column labels as `self`. new_modin_frame = self._modin_frame.n_ary_op( lambda df, cond, other: df.where(cond, other, **kwargs), - [ - cond._modin_frame, - other._modin_frame, - ], + [cond._modin_frame, other._modin_frame,], join_type=None, ) # This will be a Series of scalars to be applied based on the condition @@ -805,10 +802,7 @@ def reduce_fn(df, **kwargs): count_cols = count_cols.sum(axis=axis, skipna=False) return sum_cols / count_cols - return TreeReduce.register( - map_fn, - reduce_fn, - )(self, axis=axis, **kwargs) + return TreeReduce.register(map_fn, reduce_fn,)(self, axis=axis, **kwargs) # END TreeReduce operations @@ -1019,11 +1013,7 @@ def resample_ohlc_df(self, resample_kwargs, *args, **kwargs): def resample_prod(self, resample_kwargs, min_count, *args, **kwargs): return self._resample_func( - resample_kwargs, - "prod", - min_count=min_count, - *args, - **kwargs, + resample_kwargs, "prod", min_count=min_count, *args, **kwargs, ) def resample_size(self, resample_kwargs): @@ -1039,11 +1029,7 @@ def resample_std(self, resample_kwargs, ddof, *args, **kwargs): def resample_sum(self, resample_kwargs, min_count, *args, **kwargs): return self._resample_func( - resample_kwargs, - "sum", - min_count=min_count, - *args, - **kwargs, + resample_kwargs, "sum", min_count=min_count, *args, **kwargs, ) def resample_var(self, resample_kwargs, ddof, *args, **kwargs): @@ -1454,9 +1440,7 @@ def stack(self, level, dropna): def unique(self): new_modin_frame = self._modin_frame.apply_full_axis( - 0, - lambda x: x.squeeze(axis=1).unique(), - new_columns=self.columns, + 0, lambda x: x.squeeze(axis=1).unique(), new_columns=self.columns, ) return self.__constructor__(new_modin_frame) @@ -2168,9 +2152,7 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): ) new_index = pandas.RangeIndex(len(self.index) * len(value_vars)) new_modin_frame = self._modin_frame.__constructor__( - new_parts, - index=new_index, - columns=id_vars + [var_name, value_name], + new_parts, index=new_index, columns=id_vars + [var_name, value_name], ) result = self.__constructor__(new_modin_frame) # this assigment needs to propagate correct indices into partitions @@ -2655,9 +2637,7 @@ def _groupby_mean_reduce(dfgb, **kwargs): result = GroupByReduce.register( lambda dfgb, **kwargs: pandas.concat( - [dfgb.sum(**kwargs), dfgb.count()], - axis=1, - copy=False, + [dfgb.sum(**kwargs), dfgb.count()], axis=1, copy=False, ), _groupby_mean_reduce, default_to_pandas_func=lambda dfgb, **kwargs: dfgb.mean(**kwargs), @@ -2676,13 +2656,7 @@ def _groupby_mean_reduce(dfgb, **kwargs): return result def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): result = self._groupby_dict_reduce( by=by, @@ -2792,13 +2766,7 @@ def _groupby_dict_reduce( ) def groupby_dtypes( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2894,9 +2862,7 @@ def groupby_agg_builder(df, by=None, drop=False, partition_idx=None): missed_by_cols = internal_by_df.columns.difference(df.columns) if len(missed_by_cols) > 0: df = pandas.concat( - [df, internal_by_df[missed_by_cols]], - axis=1, - copy=False, + [df, internal_by_df[missed_by_cols]], axis=1, copy=False, ) internal_by_cols = internal_by_df.columns diff --git a/modin/distributed/dataframe/pandas/partitions.py 
b/modin/distributed/dataframe/pandas/partitions.py index 3206c35fa3b..29f7ffd0bac 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -229,11 +229,7 @@ def from_partitions( column_widths = [len(idx) for idx in internal_indices] frame = partition_frame_class( - parts, - index, - columns, - row_lengths=row_lengths, - column_widths=column_widths, + parts, index, columns, row_lengths=row_lengths, column_widths=column_widths, ) if labels_axis_to_sync != -1: diff --git a/modin/experimental/batch/pipeline.py b/modin/experimental/batch/pipeline.py index 1030e88589f..2eb444bdb44 100644 --- a/modin/experimental/batch/pipeline.py +++ b/modin/experimental/batch/pipeline.py @@ -235,8 +235,7 @@ def _complete_nodes(self, list_of_nodes, partitions): for i in range(1, self.num_partitions): new_dfs.append( type(partitions[0])( - partition_list, - full_axis=partitions[0].full_axis, + partition_list, full_axis=partitions[0].full_axis, ).add_to_apply_calls(node.func, i) ) new_dfs[-1].drain_call_queue(num_splits=1) diff --git a/modin/experimental/batch/test/test_pipeline.py b/modin/experimental/batch/test/test_pipeline.py index 3248ca7b94f..2a03da90604 100644 --- a/modin/experimental/batch/test/test_pipeline.py +++ b/modin/experimental/batch/test/test_pipeline.py @@ -24,8 +24,7 @@ @pytest.mark.skipif( - Engine.get() != "Ray", - reason="Only Ray supports the Batch Pipeline API", + Engine.get() != "Ray", reason="Only Ray supports the Batch Pipeline API", ) class TestPipelineRayEngine: def test_warnings(self): @@ -163,8 +162,7 @@ def test_output_id(self): ): pipeline.compute_batch(postprocessor=lambda df: df, pass_output_id=True) with pytest.raises( - ValueError, - match="Output ID cannot be specified for non-output node.", + ValueError, match="Output ID cannot be specified for non-output node.", ): pipeline.add_query(lambda df: df, output_id=22) assert ( @@ -485,8 +483,7 @@ def reducer(dfs): output_id=20, ) pipeline.add_query( - lambda df: pandas.concat([df] * 1000), - repartition_after=True, + lambda df: pandas.concat([df] * 1000), repartition_after=True, ) def to_csv(df, partition_id): @@ -501,9 +498,7 @@ def post_proc(df, o_id, partition_id): return df new_dfs = pipeline.compute_batch( - postprocessor=post_proc, - pass_partition_id=True, - pass_output_id=True, + postprocessor=post_proc, pass_partition_id=True, pass_output_id=True, ) correct_df = pd.DataFrame([[0, 1, 2]]) correct_df["new_col"] = 0 @@ -543,8 +538,7 @@ def post_proc(df, o_id, partition_id): @pytest.mark.skipif( - Engine.get() == "Ray", - reason="Ray supports the Batch Pipeline API", + Engine.get() == "Ray", reason="Ray supports the Batch Pipeline API", ) def test_pipeline_unsupported_engine(): """Ensure that trying to use the Pipeline API with an unsupported Engine raises errors.""" diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index d77c419ba8a..7673048e969 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -641,13 +641,16 @@ def _preprocess_init_args( ): (data,) = conn.deliver((data,), {})[0] - return (), dict( - data=data, - index=index, - columns=columns, - dtype=dtype, - copy=copy, - query_compiler=query_compiler, + return ( + (), + dict( + data=data, + index=index, + columns=columns, + dtype=dtype, + copy=copy, + query_compiler=query_compiler, + ), ) @property @@ -694,9 +697,7 @@ def make_dataframe_groupby_wrapper(DataFrameGroupBy): Look for deatils in make_dataframe_wrapper() 
and _deliveringWrapper(). """ DeliveringDataFrameGroupBy = _deliveringWrapper( - DataFrameGroupBy, - ["agg", "aggregate", "apply"], - target_name="DataFrameGroupBy", + DataFrameGroupBy, ["agg", "aggregate", "apply"], target_name="DataFrameGroupBy", ) return DeliveringDataFrameGroupBy diff --git a/modin/experimental/cloud/test/test_cloud.py b/modin/experimental/cloud/test/test_cloud.py index 08d7ccad192..89844d039c5 100644 --- a/modin/experimental/cloud/test/test_cloud.py +++ b/modin/experimental/cloud/test/test_cloud.py @@ -63,8 +63,7 @@ def ray_cluster(conda_packages=None): make_bootstrap_config_mock, ): ray_cluster = RayCluster( - Provider(name="aws"), - add_conda_packages=conda_packages, + Provider(name="aws"), add_conda_packages=conda_packages, ) return ray_cluster @@ -111,13 +110,10 @@ def test_create_or_update_cluster(make_ray_cluster, make_create_or_update_cluste ], ) @pytest.mark.parametrize( - "user_packages", - [["scikit-learn>=0.23", "modin==0.8.0"], None], + "user_packages", [["scikit-learn>=0.23", "modin==0.8.0"], None], ) def test_update_conda_requirements( - make_ray_cluster, - setup_commands_source, - user_packages, + make_ray_cluster, setup_commands_source, user_packages, ): fake_version = namedtuple("FakeVersion", "major minor micro")(7, 12, 45) with mock.patch("sys.version_info", fake_version): diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py index 095ceac5c44..ba7c384c3eb 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py @@ -206,8 +206,8 @@ def compute_fragment_size(cls, table): cpu_count = os.cpu_count() if cpu_count is not None: fragment_size = table.num_rows // cpu_count - fragment_size = min(fragment_size, 2**25) - fragment_size = max(fragment_size, 2**18) + fragment_size = min(fragment_size, 2 ** 25) + fragment_size = max(fragment_size, 2 ** 18) else: fragment_size = 0 else: diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py index 86d632739d3..110392fc144 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py @@ -873,13 +873,7 @@ def join( condition = self._build_equi_join_condition(other, left_on, right_on) - op = JoinNode( - self, - other, - how=how.value, - exprs=exprs, - condition=condition, - ) + op = JoinNode(self, other, how=how.value, exprs=exprs, condition=condition,) new_columns = Index.__new__(Index, data=new_columns) res = self.__constructor__( @@ -1100,13 +1094,7 @@ def _join_by_index(self, other_modin_frames, how, sort, ignore_index): exprs[new_col_name] = rhs.ref(col) new_columns.append(new_col_name) - op = JoinNode( - lhs, - rhs, - how=how, - exprs=exprs, - condition=condition, - ) + op = JoinNode(lhs, rhs, how=how, exprs=exprs, condition=condition,) new_columns = Index.__new__( Index, data=new_columns, dtype=self.columns.dtype @@ -1121,10 +1109,7 @@ def _join_by_index(self, other_modin_frames, how, sort, ignore_index): if sort: lhs = lhs.sort_rows( - lhs._index_cols, - ascending=True, - ignore_index=False, - na_position="last", + lhs._index_cols, ascending=True, 
ignore_index=False, na_position="last", ) if reset_index_names: diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py index f64627fc79f..d34eacec10c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py @@ -657,12 +657,7 @@ class JoinNode(DFAlgNode): """ def __init__( - self, - left, - right, - how="inner", - exprs=None, - condition=None, + self, left, right, how="inner", exprs=None, condition=None, ): self.input = [left, right] self.how = how @@ -678,11 +673,7 @@ def copy(self): JoinNode """ return JoinNode( - self.input[0], - self.input[1], - self.how, - self.exprs, - self.condition, + self.input[0], self.input[1], self.how, self.exprs, self.condition, ) def _prints(self, prefix): diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py index 35e87bce803..46fe456ab3c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py @@ -27,18 +27,8 @@ arrow_types_map = { DTypeKind.BOOL: {8: pa.bool_()}, - DTypeKind.INT: { - 8: pa.int8(), - 16: pa.int16(), - 32: pa.int32(), - 64: pa.int64(), - }, - DTypeKind.UINT: { - 8: pa.uint8(), - 16: pa.uint16(), - 32: pa.uint32(), - 64: pa.uint64(), - }, + DTypeKind.INT: {8: pa.int8(), 16: pa.int16(), 32: pa.int32(), 64: pa.int64(),}, + DTypeKind.UINT: {8: pa.uint8(), 16: pa.uint16(), 32: pa.uint32(), 64: pa.uint64(),}, DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()}, DTypeKind.STRING: {8: pa.string()}, } diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py index cd4841d5d95..1ab5514c5b8 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py @@ -40,18 +40,7 @@ from pandas.io.common import is_url ReadCsvKwargsType = Dict[ - str, - Union[ - str, - int, - bool, - dict, - object, - Sequence, - Callable, - Dialect, - None, - ], + str, Union[str, int, bool, dict, object, Sequence, Callable, Dialect, None,], ] @@ -235,9 +224,7 @@ def read_csv( return cls._read(**mykwargs) cls._validate_read_csv_kwargs(mykwargs) - use_modin_impl, error_message = cls._read_csv_check_support( - mykwargs, - ) + use_modin_impl, error_message = cls._read_csv_check_support(mykwargs,) if not use_modin_impl: raise ArrowEngineException(error_message) if isinstance(dtype, dict): @@ -401,8 +388,7 @@ def _prepare_pyarrow_usecols(cls, read_csv_kwargs): @classmethod def _read_csv_check_support( - cls, - read_csv_kwargs: ReadCsvKwargsType, + cls, read_csv_kwargs: ReadCsvKwargsType, ) -> Tuple[bool, str]: """ Check if passed parameters are supported by current ``modin.pandas.read_csv`` implementation. 
@@ -548,8 +534,7 @@ def _read_csv_check_support( @classmethod def _validate_read_csv_kwargs( - cls, - read_csv_kwargs: ReadCsvKwargsType, + cls, read_csv_kwargs: ReadCsvKwargsType, ): """ Validate `read_csv` keyword arguments. diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py index 87515318bb7..e73001b3ac9 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py @@ -144,7 +144,5 @@ def put_arrow(cls, obj): The new partition. """ return HdkOnNativeDataframePartition( - arrow_table=obj, - length=len(obj), - width=len(obj.columns), + arrow_table=obj, length=len(obj), width=len(obj.columns), ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py index eb42da07d0e..cf1ae3cde79 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py @@ -286,10 +286,7 @@ def test_float32(self): ) @pytest.mark.parametrize("names", [None, [f"c{x}" for x in range(1, 7)]]) def test_read_csv_datetime( - self, - engine, - parse_dates, - names, + self, engine, parse_dates, names, ): parse_dates_unsupported = isinstance(parse_dates, dict) or ( @@ -333,9 +330,7 @@ def test_read_csv_datetime( ], ) def test_read_csv_col_handling( - self, - engine, - usecols, + self, engine, usecols, ): eval_io( fn_name="read_csv", @@ -660,11 +655,7 @@ def concat(lib, df, join, sort, ignore_index): return lib.concat([df], join=join, sort=sort, ignore_index=ignore_index) run_and_compare( - concat, - data=self.data, - join=join, - sort=sort, - ignore_index=ignore_index, + concat, data=self.data, join=join, sort=sort, ignore_index=ignore_index, ) def test_groupby_concat_single(self): @@ -673,8 +664,7 @@ def concat(lib, df): return df.groupby("a").agg({"b": "min"}) run_and_compare( - concat, - data=self.data, + concat, data=self.data, ) @pytest.mark.parametrize("join", ["inner"]) @@ -1854,9 +1844,7 @@ def sort(df, ascending, **kwargs): return df.sort_values(["a", "b"], ascending=ascending) run_and_compare( - sort, - data=self.data, - ascending=ascending, + sort, data=self.data, ascending=ascending, ) @pytest.mark.parametrize("ascending", ascending_values) @@ -1865,9 +1853,7 @@ def sort(df, ascending, **kwargs): return df.sort_values("d", ascending=ascending) run_and_compare( - sort, - data=self.data, - ascending=ascending, + sort, data=self.data, ascending=ascending, ) @pytest.mark.parametrize("cols", cols_values) @@ -1978,15 +1964,15 @@ def test_uint(self, md_df_constructor): { "uint8_in_int_bounds": np.array([1, 2, 3], dtype="uint8"), "uint8_out-of_int_bounds": np.array( - [(2**8) - 1, (2**8) - 2, (2**8) - 3], dtype="uint8" + [(2 ** 8) - 1, (2 ** 8) - 2, (2 ** 8) - 3], dtype="uint8" ), "uint16_in_int_bounds": np.array([1, 2, 3], dtype="uint16"), "uint16_out-of_int_bounds": np.array( - [(2**16) - 1, (2**16) - 2, (2**16) - 3], dtype="uint16" + [(2 ** 16) - 1, (2 ** 16) - 2, (2 ** 16) - 3], dtype="uint16" ), "uint32_in_int_bounds": np.array([1, 2, 3], dtype="uint32"), "uint32_out-of_int_bounds": np.array( - [(2**32) - 1, (2**32) - 2, (2**32) 
- 3], dtype="uint32" + [(2 ** 32) - 1, (2 ** 32) - 2, (2 ** 32) - 3], dtype="uint32" ), "uint64_in_int_bounds": np.array([1, 2, 3], dtype="uint64"), } @@ -2021,7 +2007,7 @@ def test_uint_overflow(self, md_df_constructor): pandas.DataFrame( { "col": np.array( - [(2**64) - 1, (2**64) - 2, (2**64) - 3], dtype="uint64" + [(2 ** 64) - 1, (2 ** 64) - 2, (2 ** 64) - 3], dtype="uint64" ) } ) diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py index 2336a9d5e41..35221400cde 100644 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py +++ b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py @@ -90,12 +90,7 @@ def apply(self, func, *args, num_splits=None, other_axis_partition=None, **kwarg return [ PyarrowOnRayDataframePartition(obj) for obj in deploy_ray_axis_func.options(num_returns=num_splits).remote( - self.axis, - func, - args, - kwargs, - num_splits, - *self.list_of_blocks, + self.axis, func, args, kwargs, num_splits, *self.list_of_blocks, ) ] @@ -255,13 +250,7 @@ def deploy_ray_axis_func(axis, func, f_args, f_kwargs, num_splits, *partitions): @ray.remote def deploy_ray_func_between_two_axis_partitions( - axis, - func, - f_args, - f_kwargs, - num_splits, - len_of_left, - *partitions, + axis, func, f_args, f_kwargs, num_splits, len_of_left, *partitions, ): """ Deploy a function along a full axis between two data sets in Ray. diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py index 7ac735b4a02..ac33a3200c8 100644 --- a/modin/experimental/core/storage_formats/hdk/query_compiler.py +++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py @@ -133,9 +133,7 @@ def bind_wrappers(cls): if callable(method): setattr( - cls, - name, - build_method_wrapper(name, method), + cls, name, build_method_wrapper(name, method), ) return cls @@ -293,13 +291,7 @@ def take_2d(self, index=None, columns=None): ) def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): # Grouping on empty frame or on index level. 
if len(self.columns) == 0: @@ -529,11 +521,7 @@ def fillna( ): assert not inplace, "inplace=True should be handled on upper level" new_frame = self._modin_frame.fillna( - value=value, - method=method, - axis=axis, - limit=limit, - downcast=downcast, + value=value, method=method, axis=axis, limit=limit, downcast=downcast, ) return self.__constructor__(new_frame, self._shape_hint) diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py index ef265a0ef31..8acb291ce4f 100644 --- a/modin/experimental/pandas/test/test_io_exp.py +++ b/modin/experimental/pandas/test/test_io_exp.py @@ -33,8 +33,7 @@ @pytest.mark.skipif( - Engine.get() == "Dask", - reason="Dask does not have experimental API", + Engine.get() == "Dask", reason="Dask does not have experimental API", ) def test_from_sql_distributed(make_sql_connection): # noqa: F811 if Engine.get() == "Ray": @@ -67,8 +66,7 @@ def test_from_sql_distributed(make_sql_connection): # noqa: F811 @pytest.mark.skipif( - Engine.get() == "Dask", - reason="Dask does not have experimental API", + Engine.get() == "Dask", reason="Dask does not have experimental API", ) def test_from_sql_defaults(make_sql_connection): # noqa: F811 with ensure_clean_dir() as dirname: @@ -215,8 +213,7 @@ def _pandas_read_csv_glob(path, storage_options): pandas_df = pandas.concat( [ pandas.read_csv( - f"{path}test_data{i}.csv", - storage_options=storage_options, + f"{path}test_data{i}.csv", storage_options=storage_options, ) for i in range(2) ], @@ -234,8 +231,7 @@ def _pandas_read_csv_glob(path, storage_options): @pytest.mark.skipif( - not Engine.get() == "Ray", - reason=f"{Engine.get()} does not have experimental API", + not Engine.get() == "Ray", reason=f"{Engine.get()} does not have experimental API", ) @pytest.mark.parametrize("compression", [None, "gzip"]) @pytest.mark.parametrize( @@ -301,8 +297,7 @@ def _custom_parser(io_input, **kwargs): @pytest.mark.skipif( - not Engine.get() == "Ray", - reason=f"{Engine.get()} does not have experimental API", + not Engine.get() == "Ray", reason=f"{Engine.get()} does not have experimental API", ) def test_read_evaluated_dict(): def _generate_evaluated_dict(file_name, nrows, ncols): @@ -338,9 +333,7 @@ def columns_callback(io_input, **kwargs): _generate_evaluated_dict(filename, 64, 8) df1 = pd.read_custom_text( - filename, - columns=["col1", "col2"], - custom_parser=_custom_parser, + filename, columns=["col1", "col2"], custom_parser=_custom_parser, ) assert df1.shape == (64, 2) diff --git a/modin/experimental/xgboost/test/test_default.py b/modin/experimental/xgboost/test/test_default.py index 9504e3680c5..9c2d9d8a6fc 100644 --- a/modin/experimental/xgboost/test/test_default.py +++ b/modin/experimental/xgboost/test/test_default.py @@ -20,8 +20,7 @@ @pytest.mark.skipif( - Engine.get() == "Ray", - reason="This test doesn't make sense on Ray engine.", + Engine.get() == "Ray", reason="This test doesn't make sense on Ray engine.", ) @pytest.mark.skipif( Engine.get() == "Python", diff --git a/modin/experimental/xgboost/test/test_dmatrix.py b/modin/experimental/xgboost/test/test_dmatrix.py index 3f7f7d681c6..695ffec9c71 100644 --- a/modin/experimental/xgboost/test/test_dmatrix.py +++ b/modin/experimental/xgboost/test/test_dmatrix.py @@ -69,8 +69,7 @@ def check_dmatrix(data, label=None, **kwargs): ], ) @pytest.mark.parametrize( - "feature_types", - [None, "q", list("qiqiq")], + "feature_types", [None, "q", list("qiqiq")], ) def test_dmatrix_feature_names_and_feature_types(data, feature_names, 
feature_types): check_dmatrix(data, feature_names=feature_names, feature_types=feature_types) @@ -83,9 +82,7 @@ def test_feature_names(): feature_names = [f"feat{i}" for i in range(X.shape[1])] check_dmatrix( - X, - y, - feature_names=feature_names, + X, y, feature_names=feature_names, ) dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names) diff --git a/modin/experimental/xgboost/test/test_xgboost.py b/modin/experimental/xgboost/test/test_xgboost.py index 1863a268379..2f7008cfd06 100644 --- a/modin/experimental/xgboost/test/test_xgboost.py +++ b/modin/experimental/xgboost/test/test_xgboost.py @@ -40,12 +40,10 @@ @pytest.mark.parametrize( - "modin_type_y", - [pd.DataFrame, pd.Series], + "modin_type_y", [pd.DataFrame, pd.Series], ) @pytest.mark.parametrize( - "num_actors", - [1, num_cpus, None, modin.config.NPartitions.get() + 1], + "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1], ) @pytest.mark.parametrize( "data", @@ -114,28 +112,17 @@ def test_xgb_with_binary_classification_datasets(data, num_actors, modin_type_y) @pytest.mark.parametrize( - "modin_type_y", - [pd.DataFrame, pd.Series], + "modin_type_y", [pd.DataFrame, pd.Series], ) @pytest.mark.parametrize( - "num_actors", - [1, num_cpus, None, modin.config.NPartitions.get() + 1], + "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1], ) @pytest.mark.parametrize( "data", [ - ( - load_iris(), - {"num_class": 3}, - ), - ( - load_digits(), - {"num_class": 10}, - ), - ( - load_wine(), - {"num_class": 3}, - ), + (load_iris(), {"num_class": 3},), + (load_digits(), {"num_class": 10},), + (load_wine(), {"num_class": 3},), ], ids=["load_iris", "load_digits", "load_wine"], ) @@ -199,17 +186,13 @@ def test_xgb_with_multiclass_classification_datasets(data, num_actors, modin_typ @pytest.mark.parametrize( - "modin_type_y", - [pd.DataFrame, pd.Series], + "modin_type_y", [pd.DataFrame, pd.Series], ) @pytest.mark.parametrize( - "num_actors", - [1, num_cpus, None, modin.config.NPartitions.get() + 1], + "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1], ) @pytest.mark.parametrize( - "data", - [(load_diabetes(), {"eta": 0.01})], - ids=["load_diabetes"], + "data", [(load_diabetes(), {"eta": 0.01})], ids=["load_diabetes"], ) def test_xgb_with_regression_datasets(data, num_actors, modin_type_y): dataset, param = data diff --git a/modin/experimental/xgboost/xgboost.py b/modin/experimental/xgboost/xgboost.py index ce940129f54..520bdf497a4 100644 --- a/modin/experimental/xgboost/xgboost.py +++ b/modin/experimental/xgboost/xgboost.py @@ -305,9 +305,7 @@ def __init__(self, params=None, cache=(), model_file=None): # noqa: MD01 super(Booster, self).__init__(params=params, cache=cache, model_file=model_file) def predict( - self, - data: DMatrix, - **kwargs, + self, data: DMatrix, **kwargs, ): """ Run distributed prediction with a trained booster. diff --git a/modin/experimental/xgboost/xgboost_ray.py b/modin/experimental/xgboost/xgboost_ray.py index ddd05232204..d11ef7201fe 100644 --- a/modin/experimental/xgboost/xgboost_ray.py +++ b/modin/experimental/xgboost/xgboost_ray.py @@ -287,10 +287,7 @@ def create_actors(num_actors): def _split_data_across_actors( - actors: List, - set_func, - X_parts, - y_parts, + actors: List, set_func, X_parts, y_parts, ): """ Split row partitions of data between actors. @@ -306,15 +303,10 @@ def _split_data_across_actors( y_parts : list Row partitions of y data. 
""" - X_parts_by_actors = _assign_row_partitions_to_actors( - actors, - X_parts, - ) + X_parts_by_actors = _assign_row_partitions_to_actors(actors, X_parts,) y_parts_by_actors = _assign_row_partitions_to_actors( - actors, - y_parts, - data_for_aligning=X_parts_by_actors, + actors, y_parts, data_for_aligning=X_parts_by_actors, ) for rank, (_, actor) in enumerate(actors): @@ -322,9 +314,7 @@ def _split_data_across_actors( def _assign_row_partitions_to_actors( - actors: List, - row_partitions, - data_for_aligning=None, + actors: List, row_partitions, data_for_aligning=None, ): """ Assign row_partitions to actors. @@ -455,12 +445,7 @@ def _assign_row_partitions_to_actors( def _train( - dtrain, - params: Dict, - *args, - num_actors=None, - evals=(), - **kwargs, + dtrain, params: Dict, *args, num_actors=None, evals=(), **kwargs, ): """ Run distributed training of XGBoost model on Ray engine. @@ -592,17 +577,13 @@ def _map_predict(booster, part, columns, dmatrix_kwargs={}, **kwargs): """ dmatrix = xgb.DMatrix(part, **dmatrix_kwargs) prediction = pandas.DataFrame( - booster.predict(dmatrix, **kwargs), - index=part.index, - columns=columns, + booster.predict(dmatrix, **kwargs), index=part.index, columns=columns, ) return prediction def _predict( - booster, - data, - **kwargs, + booster, data, **kwargs, ): """ Run distributed prediction with a trained booster on Ray engine. diff --git a/modin/pandas/base.py b/modin/pandas/base.py index d318334179e..83f94ef9232 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -210,11 +210,7 @@ def _update_inplace(self, new_query_compiler): old_query_compiler.free() def _validate_other( - self, - other, - axis, - dtype_check=False, - compare_index=False, + self, other, axis, dtype_check=False, compare_index=False, ): """ Help to check validity of other in inter-df operations. @@ -846,12 +842,7 @@ def error_raiser(msg, exception): stacklevel=2, ) query_compiler = self._query_compiler.apply( - func, - axis, - args=args, - raw=raw, - result_type=result_type, - **kwds, + func, axis, args=args, raw=raw, result_type=result_type, **kwds, ) return query_compiler @@ -1805,11 +1796,7 @@ def _stat_operation( else self ) result_qc = getattr(data._query_compiler, op_name)( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs, ) return self._reduce_dimension(result_qc) @@ -1822,12 +1809,7 @@ def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200 ) def _min( - self, - axis, - skipna, - level, - numeric_only, - **kwargs, + self, axis, skipna, level, numeric_only, **kwargs, ): # noqa: PR01, RT01, D200 """ Return the minimum of the values over the requested axis. @@ -2019,13 +2001,7 @@ def check_dtype(t): @_inherit_docstrings(pandas.DataFrame.rank, apilink="pandas.DataFrame.rank") def _rank( - self, - axis, - method, - numeric_only, - na_option, - ascending, - pct, + self, axis, method, numeric_only, na_option, ascending, pct, ): axis = self._get_axis_number(axis) return self.__constructor__( @@ -2068,11 +2044,7 @@ def _ensure_index(self, index_like, axis=0): # noqa: PR01, RT01, D200 return ensure_index(index_like) def _reindex( - self, - index, - columns, - copy, - **kwargs, + self, index, columns, copy, **kwargs, ): # noqa: PR01, RT01, D200 """ Conform `BasePandasDataset` to new index with optional filling logic. 
@@ -2264,6 +2236,7 @@ def _reset_index( raise ValueError("cannot insert level_0, already exists") else: new_query_compiler = self._query_compiler.reset_index( + drop=drop, level=level, col_level=col_level, @@ -2378,14 +2351,7 @@ def rtruediv( rdiv = rtruediv def _sample( - self, - n, - frac, - replace, - weights, - random_state, - axis, - **kwargs, + self, n, frac, replace, weights, random_state, axis, **kwargs, ): # noqa: PR01, RT01, D200 """ Return a random sample of items from an axis of object. @@ -2504,13 +2470,7 @@ def _sample( return self.__constructor__(query_compiler=query_compiler) def _sem( - self, - axis, - skipna, - level, - ddof, - numeric_only, - **kwargs, + self, axis, skipna, level, ddof, numeric_only, **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased standard error of the mean over requested axis. @@ -2611,12 +2571,7 @@ def _shift(self, periods, freq, axis, fill_value): # noqa: PR01, RT01, D200 return self.tshift(periods, freq) def _skew( - self, - axis, - skipna, - level, - numeric_only, - **kwargs, + self, axis, skipna, level, numeric_only, **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased skew over requested axis. @@ -2696,13 +2651,7 @@ def sort_values( return self._create_or_update_from_compiler(result, inplace) def _std( - self, - axis, - skipna, - level, - ddof, - numeric_only, - **kwargs, + self, axis, skipna, level, ddof, numeric_only, **kwargs, ): # noqa: PR01, RT01, D200 """ Return sample standard deviation over requested axis. @@ -2782,11 +2731,7 @@ def to_hdf( ) def to_numpy(self, dtype=None, copy=False, na_value=no_default): - return self._query_compiler.to_numpy( - dtype=dtype, - copy=copy, - na_value=na_value, - ) + return self._query_compiler.to_numpy(dtype=dtype, copy=copy, na_value=na_value,) # TODO(williamma12): When this gets implemented, have the series one call this. def to_period( @@ -3343,19 +3288,13 @@ def __nonzero__(self): __bool__ = __nonzero__ @_doc_binary_op( - operation="disjunction", - bin_op="or", - right="other", - **_doc_binary_op_kwargs, + operation="disjunction", bin_op="or", right="other", **_doc_binary_op_kwargs, ) def __or__(self, other): return self._binary_op("__or__", other, axis=0) @_doc_binary_op( - operation="disjunction", - bin_op="ror", - right="other", - **_doc_binary_op_kwargs, + operation="disjunction", bin_op="ror", right="other", **_doc_binary_op_kwargs, ) def __ror__(self, other): return self._binary_op("__ror__", other, axis=0) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 756e29c5b2b..e28a5e7be9f 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -109,13 +109,7 @@ class DataFrame(DataFrameCompat, BasePandasDataset): @append_to_docstring(__doc__) def _init( - self, - data, - index, - columns, - dtype, - copy, - query_compiler, + self, data, index, columns, dtype, copy, query_compiler, ): # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. @@ -703,8 +697,7 @@ def _corr(self, method, min_periods, numeric_only): # noqa: PR01, RT01, D200 ) return self.__constructor__( query_compiler=self._query_compiler.corr( - method=method, - min_periods=min_periods, + method=method, min_periods=min_periods, ) ) @@ -1677,13 +1670,7 @@ def pow( ) def _prod( - self, - axis, - skipna, - level, - numeric_only, - min_count, - **kwargs, + self, axis, skipna, level, numeric_only, min_count, **kwargs, ): # noqa: PR01, RT01, D200 """ Return the product of the values over the requested axis. 
@@ -1798,13 +1785,7 @@ def _rename( return obj def _replace( - self, - to_replace, - value, - inplace, - limit, - regex, - method, + self, to_replace, value, inplace, limit, regex, method, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -2071,13 +2052,7 @@ def sub( subtract = sub def _sum( - self, - axis, - skipna, - level, - numeric_only, - min_count, - **kwargs, + self, axis, skipna, level, numeric_only, min_count, **kwargs, ): # noqa: PR01, RT01, D200 """ Return the sum of the values over the requested axis. @@ -2515,11 +2490,7 @@ def __setitem__(self, key, value): if len(key) != value.shape[-1]: raise ValueError("Columns must be same length as key") item = broadcast_item( - self, - slice(None), - key, - value, - need_columns_reindex=False, + self, slice(None), key, value, need_columns_reindex=False, ) new_qc = self._query_compiler.write_items( slice(None), self.columns.get_indexer_for(key), item diff --git a/modin/pandas/general.py b/modin/pandas/general.py index a34dac3a2e5..914d8c3d26e 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -378,11 +378,7 @@ def value_counts( Series """ return Series(values).value_counts( - sort=sort, - ascending=ascending, - normalize=normalize, - bins=bins, - dropna=dropna, + sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, ) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index dcd990f13b2..8d30ceb4c97 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -346,8 +346,7 @@ def dtypes(self): raise ValueError("Cannot call dtypes on groupby with axis=1") return self._check_index( self._wrap_aggregation( - type(self._query_compiler).groupby_dtypes, - numeric_only=False, + type(self._query_compiler).groupby_dtypes, numeric_only=False, ) ) @@ -445,11 +444,7 @@ def __getitem__(self, key): ) cols_to_grab = internal_by.union(key) key = [col for col in self._df.columns if col in cols_to_grab] - return DataFrameGroupBy( - self._df[key], - drop=self._drop, - **kwargs, - ) + return DataFrameGroupBy(self._df[key], drop=self._drop, **kwargs,) if ( self._is_multi_by and isinstance(self._by, list) @@ -459,11 +454,7 @@ def __getitem__(self, key): "Column lookups on GroupBy with arbitrary Series in by" + " is not yet supported." 
) - return SeriesGroupBy( - self._df[key], - drop=False, - **kwargs, - ) + return SeriesGroupBy(self._df[key], drop=False, **kwargs,) def cummin(self, axis=0, **kwargs): return self._check_index_name( @@ -675,8 +666,7 @@ def size(self): **self._kwargs, ) result = work_object._wrap_aggregation( - type(work_object._query_compiler).groupby_size, - numeric_only=False, + type(work_object._query_compiler).groupby_size, numeric_only=False, ) if not isinstance(result, Series): result = result.squeeze(axis=1) @@ -749,8 +739,7 @@ def resample(self, rule, *args, **kwargs): def median(self, numeric_only=None): return self._check_index( self._wrap_aggregation( - type(self._query_compiler).groupby_median, - numeric_only=numeric_only, + type(self._query_compiler).groupby_median, numeric_only=numeric_only, ) ) @@ -811,8 +800,7 @@ def fillna(self, *args, **kwargs): def count(self): result = self._wrap_aggregation( - type(self._query_compiler).groupby_count, - numeric_only=False, + type(self._query_compiler).groupby_count, numeric_only=False, ) # pandas do it in case of Series if isinstance(result, Series): @@ -1030,12 +1018,7 @@ def _compute_index_grouped(self, numerical=False): return groupby_obj.indices if numerical else groupby_obj.groups def _wrap_aggregation( - self, - qc_method, - numeric_only=None, - agg_args=None, - agg_kwargs=None, - **kwargs, + self, qc_method, numeric_only=None, agg_args=None, agg_kwargs=None, **kwargs, ): """ Perform common metadata transformations and apply groupby functions. diff --git a/modin/pandas/resample.py b/modin/pandas/resample.py index dbfd01167a8..49fd00cee56 100644 --- a/modin/pandas/resample.py +++ b/modin/pandas/resample.py @@ -92,12 +92,7 @@ def apply(self, func, *args, **kwargs): query_comp_op = self._query_compiler.resample_app_ser dataframe = DataFrame( - query_compiler=query_comp_op( - self.resample_kwargs, - func, - *args, - **kwargs, - ) + query_compiler=query_comp_op(self.resample_kwargs, func, *args, **kwargs,) ) if is_list_like(func) or isinstance(self._dataframe, DataFrame): return dataframe @@ -116,12 +111,7 @@ def aggregate(self, func, *args, **kwargs): query_comp_op = self._query_compiler.resample_agg_ser dataframe = DataFrame( - query_compiler=query_comp_op( - self.resample_kwargs, - func, - *args, - **kwargs, - ) + query_compiler=query_comp_op(self.resample_kwargs, func, *args, **kwargs,) ) if is_list_like(func) or isinstance(self._dataframe, DataFrame): return dataframe @@ -234,54 +224,42 @@ def nunique(self, *args, **kwargs): def first(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_first( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) def last(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_last( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) def max(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_max( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) def mean(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_mean( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) def median(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_median( - self.resample_kwargs, - *args, - **kwargs, + 
self.resample_kwargs, *args, **kwargs, ) ) def min(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_min( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) @@ -291,17 +269,13 @@ def ohlc(self, *args, **kwargs): if isinstance(self._dataframe, DataFrame): return DataFrame( query_compiler=self._query_compiler.resample_ohlc_df( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) else: return DataFrame( query_compiler=self._query_compiler.resample_ohlc_ser( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) @@ -319,9 +293,7 @@ def size(self): def sem(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_sem( - self.resample_kwargs, - *args, - **kwargs, + self.resample_kwargs, *args, **kwargs, ) ) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 0e8ff29a920..ab848153f53 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1196,12 +1196,7 @@ def keys(self): # noqa: RT01, D200 return self.index def _kurt( - self, - axis, - skipna, - level, - numeric_only, - **kwargs, + self, axis, skipna, level, numeric_only, **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased kurtosis over requested axis. @@ -1245,14 +1240,7 @@ def arg(s): @_inherit_docstrings(pandas.Series.mask, apilink="pandas.Series.mask") def _mask( - self, - cond, - other, - inplace, - axis, - level, - errors, - try_cast, + self, cond, other, inplace, axis, level, errors, try_cast, ): return self._default_to_pandas( pandas.Series.mask, @@ -1411,13 +1399,7 @@ def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @_inherit_docstrings(pandas.Series.prod, apilink="pandas.Series.prod") def _prod( - self, - axis, - skipna, - level, - numeric_only, - min_count, - **kwargs, + self, axis, skipna, level, numeric_only, min_count, **kwargs, ): axis = self._get_axis_number(axis) if level is not None: @@ -1672,13 +1654,7 @@ def reorder_levels(self, order): # noqa: PR01, RT01, D200 return super(Series, self).reorder_levels(order) def _replace( - self, - to_replace, - value, - inplace, - limit, - regex, - method, + self, to_replace, value, inplace, limit, regex, method, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -1789,13 +1765,7 @@ def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, subtract = sub def _sum( - self, - axis, - skipna, - level, - numeric_only, - min_count, - **kwargs, + self, axis, skipna, level, numeric_only, min_count, **kwargs, ): # noqa: PR01, RT01, D200 """ Return the sum of the values. @@ -1889,11 +1859,7 @@ def to_numpy( """ return ( super(Series, self) - .to_numpy( - dtype=dtype, - copy=copy, - na_value=na_value, - ) + .to_numpy(dtype=dtype, copy=copy, na_value=na_value,) .flatten() ) @@ -2030,14 +1996,7 @@ def view(self, dtype=None): # noqa: PR01, RT01, D200 ) def _where( - self, - cond, - other, - inplace, - axis, - level, - errors, - try_cast, + self, cond, other, inplace, axis, level, errors, try_cast, ): # noqa: PR01, RT01, D200 """ Replace values where the condition is False. 
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index 25f36f54de8..7e464915c68 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -75,7 +75,7 @@ def test_math_functions(other, axis, op): @pytest.mark.parametrize( "other", - [lambda df: df[: -(2**4)], lambda df: df[df.columns[0]].reset_index(drop=True)], + [lambda df: df[: -(2 ** 4)], lambda df: df[df.columns[0]].reset_index(drop=True)], ids=["check_missing_value", "check_different_index"], ) @pytest.mark.parametrize("fill_value", [None, 3.0]) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index 710eb4b152a..d4ef9de5c76 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -186,8 +186,7 @@ def test_between_time(): pandas_df.between_time("12:00", "17:00"), ) df_equals( - modin_df.between_time("3:00", "4:00"), - pandas_df.between_time("3:00", "4:00"), + modin_df.between_time("3:00", "4:00"), pandas_df.between_time("3:00", "4:00"), ) df_equals( modin_df.T.between_time("12:00", "17:00", axis=1), @@ -431,9 +430,7 @@ def test_kurt_kurtosis_level(level): df_pandas.columns = index eval_general( - df_modin, - df_pandas, - lambda df: df.kurtosis(axis=1, level=level), + df_modin, df_pandas, lambda df: df.kurtosis(axis=1, level=level), ) @@ -470,9 +467,7 @@ def test_mad_level(level): modin_df.columns = index pandas_df.columns = index eval_general( - modin_df, - pandas_df, - lambda df: df.mad(axis=1, level=level), + modin_df, pandas_df, lambda df: df.mad(axis=1, level=level), ) @@ -583,12 +578,7 @@ def test_pivot_table_data(data, index, columns, values): ) @pytest.mark.parametrize("margins_name", ["Custom name", None]) def test_pivot_table_margins( - data, - index, - columns, - values, - aggfunc, - margins_name, + data, index, columns, values, aggfunc, margins_name, ): eval_general( *create_test_dfs(data), @@ -770,18 +760,10 @@ def test_resample_specific(rule, closed, label, on, level): modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") pandas_resampler = pandas_df.resample( - rule, - closed=closed, - label=label, - on=on, - level=level, + rule, closed=closed, label=label, on=on, level=level, ) modin_resampler = modin_df.resample( - rule, - closed=closed, - label=label, - on=on, - level=level, + rule, closed=closed, label=label, on=on, level=level, ) df_equals(modin_resampler.var(0), pandas_resampler.var(0)) if on is None and level is None: @@ -972,8 +954,7 @@ def test_swaplevel(): ), ) df_equals( - modin_df.swaplevel("Number", "Color"), - pandas_df.swaplevel("Number", "Color"), + modin_df.swaplevel("Number", "Color"), pandas_df.swaplevel("Number", "Color"), ) df_equals(modin_df.swaplevel(), pandas_df.swaplevel()) df_equals(modin_df.swaplevel(0, 1), pandas_df.swaplevel(0, 1)) @@ -1007,16 +988,14 @@ def test_take(): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_records(request, data): eval_general( - *create_test_dfs(data), - lambda df: df.dropna().to_records(), + *create_test_dfs(data), lambda df: df.dropna().to_records(), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_string(data): eval_general( - *create_test_dfs(data), - lambda df: df.to_string(), + *create_test_dfs(data), lambda df: df.to_string(), ) diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 6ff03ce684d..7ad5609503b 100644 
--- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -136,8 +136,7 @@ def test_asof_without_nan(dates, subset): @pytest.mark.parametrize( - "lookup", - [[60, 70, 90], [60.5, 70.5, 100]], + "lookup", [[60, 70, 90], [60.5, 70.5, 100]], ) @pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None]) def test_asof_large(lookup, subset): @@ -534,20 +533,15 @@ def test_loc_multi_index(): pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"]) frame_data = np.random.randint(0, 100, size=(16, 100)) modin_df = pd.DataFrame( - frame_data, - index=modin_index, - columns=["col{}".format(i) for i in range(100)], + frame_data, index=modin_index, columns=["col{}".format(i) for i in range(100)], ) pandas_df = pandas.DataFrame( - frame_data, - index=pandas_index, - columns=["col{}".format(i) for i in range(100)], + frame_data, index=pandas_index, columns=["col{}".format(i) for i in range(100)], ) df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"]) assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"] df_equals( - modin_df.loc["bar", ("col1", "col2")], - pandas_df.loc["bar", ("col1", "col2")], + modin_df.loc["bar", ("col1", "col2")], pandas_df.loc["bar", ("col1", "col2")], ) # From issue #1456 @@ -1344,8 +1338,7 @@ def test_reset_index_multiindex_groupby(data): "none_in_index_names", [ pytest.param( - False, - marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"), + False, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"), ), True, "mixed_1st_None", @@ -1488,8 +1481,7 @@ def test_reset_index_with_multi_index_no_drop( "none_in_index_names", [ pytest.param( - False, - marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"), + False, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"), ), True, "mixed_1st_None", @@ -1578,9 +1570,7 @@ def test_sample(data, axis): with pytest.raises(ValueError): modin_df.sample( - frac=0.5, - weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], - axis=1, + frac=0.5, weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], axis=1, ) with pytest.raises(ValueError): @@ -1987,9 +1977,7 @@ def test___setitem__mask(): ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"], ) @pytest.mark.parametrize( - "value", - [[11, 22], [11, 22, 33]], - ids=["2_length_val", "3_length_val"], + "value", [[11, 22], [11, 22, 33]], ids=["2_length_val", "3_length_val"], ) @pytest.mark.parametrize("convert_to_series", [False, True]) @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py index a9978fbc4a0..5d0ba3a66b2 100644 --- a/modin/pandas/test/dataframe/test_iter.py +++ b/modin/pandas/test/dataframe/test_iter.py @@ -379,8 +379,7 @@ def test_constructor_columns_and_index(): pandas.DataFrame(pandas_df, columns=["max_speed", "health"]), ) df_equals( - pd.DataFrame(modin_df, index=[1, 2]), - pandas.DataFrame(pandas_df, index=[1, 2]), + pd.DataFrame(modin_df, index=[1, 2]), pandas.DataFrame(pandas_df, index=[1, 2]), ) df_equals( pd.DataFrame(modin_df, index=[1, 2], columns=["health"]), diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index ea6cf263268..3e510ec7b58 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -67,20 +67,20 @@ def 
test_combine(data): "test_data, test_data2", [ ( - np.random.uniform(0, 100, size=(2**6, 2**6)), - np.random.uniform(0, 100, size=(2**7, 2**6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), ), ( - np.random.uniform(0, 100, size=(2**7, 2**6)), - np.random.uniform(0, 100, size=(2**6, 2**6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), ), ( - np.random.uniform(0, 100, size=(2**6, 2**6)), - np.random.uniform(0, 100, size=(2**6, 2**7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), ), ( - np.random.uniform(0, 100, size=(2**6, 2**7)), - np.random.uniform(0, 100, size=(2**6, 2**6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), ), ], ) @@ -165,20 +165,20 @@ def test_join(test_data, test_data2): "test_data, test_data2", [ ( - np.random.uniform(0, 100, size=(2**6, 2**6)), - np.random.uniform(0, 100, size=(2**7, 2**6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), ), ( - np.random.uniform(0, 100, size=(2**7, 2**6)), - np.random.uniform(0, 100, size=(2**6, 2**6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), ), ( - np.random.uniform(0, 100, size=(2**6, 2**6)), - np.random.uniform(0, 100, size=(2**6, 2**7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), ), ( - np.random.uniform(0, 100, size=(2**6, 2**7)), - np.random.uniform(0, 100, size=(2**6, 2**6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), ), ], ) @@ -218,18 +218,10 @@ def test_merge(test_data, test_data2): df_equals(modin_result, pandas_result) modin_result = modin_df.merge( - modin_df2, - how=hows[i], - left_on="key", - right_on="key", - sort=sorts[j], + modin_df2, how=hows[i], left_on="key", right_on="key", sort=sorts[j], ) pandas_result = pandas_df.merge( - pandas_df2, - how=hows[i], - left_on="key", - right_on="key", - sort=sorts[j], + pandas_df2, how=hows[i], left_on="key", right_on="key", sort=sorts[j], ) df_equals(modin_result, pandas_result) @@ -441,9 +433,7 @@ def test_sort_multiindex(sort_remaining): ) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) @pytest.mark.parametrize( - "ignore_index", - bool_arg_values, - ids=arg_keys("ignore_index", bool_arg_keys), + "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys), ) @pytest.mark.parametrize("key", [None, rotate_decimal_digits_or_symbols]) def test_sort_values( diff --git a/modin/pandas/test/dataframe/test_reduce.py b/modin/pandas/test/dataframe/test_reduce.py index 93f7be95fe4..03cada3f57f 100644 --- a/modin/pandas/test/dataframe/test_reduce.py +++ b/modin/pandas/test/dataframe/test_reduce.py @@ -94,9 +94,7 @@ def test_all_any_level(data, axis, level, method): pandas_df.columns = new_col eval_general( - modin_df, - pandas_df, - lambda df: getattr(df, method)(axis=axis, level=level), + modin_df, pandas_df, lambda df: getattr(df, method)(axis=axis, level=level), ) @@ -106,8 +104,7 @@ def test_all_any_level(data, axis, level, method): ) def test_count(data, axis): eval_general( - *create_test_dfs(data), - lambda df: df.count(axis=axis), + *create_test_dfs(data), lambda df: df.count(axis=axis), ) @@ -135,9 +132,7 @@ def test_count_level(data, axis, level): 
pandas_df.columns = new_col eval_general( - modin_df, - pandas_df, - lambda df: df.count(axis=axis, level=level), + modin_df, pandas_df, lambda df: df.count(axis=axis, level=level), ) @@ -146,9 +141,7 @@ def test_count_dtypes(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_general( - modin_df, - pandas_df, - lambda df: df.isna().count(axis=0), + modin_df, pandas_df, lambda df: df.isna().count(axis=0), ) @@ -156,8 +149,7 @@ def test_count_dtypes(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_describe(data, percentiles): eval_general( - *create_test_dfs(data), - lambda df: df.describe(percentiles=percentiles), + *create_test_dfs(data), lambda df: df.describe(percentiles=percentiles), ) @@ -165,12 +157,12 @@ def test_describe(data, percentiles): @pytest.mark.parametrize("datetime_is_numeric", [True, False, None]) def test_2195(datetime_is_numeric, has_numeric_column): data = { - "categorical": pd.Categorical(["d"] * 10**2), - "date": [np.datetime64("2000-01-01")] * 10**2, + "categorical": pd.Categorical(["d"] * 10 ** 2), + "date": [np.datetime64("2000-01-01")] * 10 ** 2, } if has_numeric_column: - data.update({"numeric": [5] * 10**2}) + data.update({"numeric": [5] * 10 ** 2}) modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) @@ -292,17 +284,12 @@ def test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method): @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_prod( - data, - axis, - skipna, - is_transposed, - method, + data, axis, skipna, is_transposed, method, ): eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)( - axis=axis, - skipna=skipna, + axis=axis, skipna=skipna, ), ) @@ -328,10 +315,7 @@ def test_prod( def test_sum(data, axis, skipna, is_transposed): eval_general( *create_test_dfs(data), - lambda df: (df.T if is_transposed else df).sum( - axis=axis, - skipna=skipna, - ), + lambda df: (df.T if is_transposed else df).sum(axis=axis, skipna=skipna,), ) # test for issue #1953 diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index f1f7e103bbd..30903d2b86e 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -91,8 +91,7 @@ def test_diff(axis, periods): @pytest.mark.parametrize("axis", ["rows", "columns"]) def test_diff_transposed(axis): eval_general( - *create_test_dfs(test_data["int_data"]), - lambda df: df.T.diff(axis=axis), + *create_test_dfs(test_data["int_data"]), lambda df: df.T.diff(axis=axis), ) @@ -356,8 +355,7 @@ def test_fillna_dict_series(): df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})) df_equals( - modin_df.fillna({"a": 0, "b": 5, "d": 7}), - df.fillna({"a": 0, "b": 5, "d": 7}), + modin_df.fillna({"a": 0, "b": 5, "d": 7}), df.fillna({"a": 0, "b": 5, "d": 7}), ) # Series treated same as dict diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 12d23892b2e..56b6d91ed51 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -149,7 +149,7 @@ def test_ignore_index_concat(): def test_concat_non_subscriptable_keys(): - frame_data = np.random.randint(0, 100, size=(2**10, 2**6)) + frame_data = np.random.randint(0, 100, size=(2 ** 10, 2 ** 6)) df = pd.DataFrame(frame_data).add_prefix("col") pdf = pandas.DataFrame(frame_data).add_prefix("col") @@ 
-232,7 +232,6 @@ def test_sort_order(sort, join, axis): pandas_concat = pandas.concat([pandas_df, pandas_df2], join=join, sort=sort) modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort) df_equals( - pandas_concat, - modin_concat, + pandas_concat, modin_concat, ) assert list(pandas_concat.columns) == list(modin_concat.columns) diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index 4b6f3a4acdf..ca2982fcf2b 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -419,11 +419,7 @@ def test_merge_asof_merge_options(): # left_by + right_by with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( - modin_quotes, - modin_trades, - on="time", - left_by="ticker", - right_by="ticker2", + modin_quotes, modin_trades, on="time", left_by="ticker", right_by="ticker2", ) df_equals( pandas.merge_asof( @@ -441,18 +437,10 @@ def test_merge_asof_merge_options(): modin_trades["ticker"] = modin_trades["ticker2"] with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( - modin_quotes, - modin_trades, - on="time", - by="ticker", + modin_quotes, modin_trades, on="time", by="ticker", ) df_equals( - pandas.merge_asof( - pandas_quotes, - pandas_trades, - on="time", - by="ticker", - ), + pandas.merge_asof(pandas_quotes, pandas_trades, on="time", by="ticker",), modin_result, ) @@ -479,19 +467,11 @@ def test_merge_asof_merge_options(): # Direction with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( - modin_quotes, - modin_trades, - on="time", - by="ticker", - direction="forward", + modin_quotes, modin_trades, on="time", by="ticker", direction="forward", ) df_equals( pandas.merge_asof( - pandas_quotes, - pandas_trades, - on="time", - by="ticker", - direction="forward", + pandas_quotes, pandas_trades, on="time", by="ticker", direction="forward", ), modin_result, ) @@ -745,11 +725,7 @@ def test_to_pandas_indices(data): def test_create_categorical_dataframe_with_duplicate_column_name(): # This tests for https://github.com/modin-project/modin/issues/4312 pd_df = pandas.DataFrame( - { - "a": pandas.Categorical([1, 2]), - "b": [4, 5], - "c": pandas.Categorical([7, 8]), - } + {"a": pandas.Categorical([1, 2]), "b": [4, 5], "c": pandas.Categorical([7, 8]),} ) pd_df.columns = ["a", "b", "a"] md_df = pd.DataFrame(pd_df) @@ -778,10 +754,7 @@ def test_create_categorical_dataframe_with_duplicate_column_name(): (lambda df: df.mean(level=0), r"DataFrame\.mean"), (lambda df: df + df, r"DataFrame\.add"), (lambda df: df.index, r"DataFrame\.get_axis\(0\)"), - ( - lambda df: df.drop(columns="col1").squeeze().repeat(2), - r"Series\.repeat", - ), + (lambda df: df.drop(columns="col1").squeeze().repeat(2), r"Series\.repeat",), (lambda df: df.groupby("col1").prod(), r"GroupBy\.prod"), (lambda df: df.rolling(1).count(), r"Rolling\.count"), ], diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index b805c8e4238..83e434e83ce 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -124,7 +124,7 @@ def wrapper(obj1, obj2, *args, **kwargs): @pytest.mark.parametrize("as_index", [True, False]) def test_mixed_dtypes_groupby(as_index): - frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) + frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4)) pandas_df = pandas.DataFrame(frame_data).add_prefix("col") # Convert every other column to string for col in pandas_df.iloc[ @@ -880,13 +880,13 @@ def test_simple_col_groupby(): @pytest.mark.parametrize( - 
"by", [np.random.randint(0, 100, size=2**8), lambda x: x % 3, None] + "by", [np.random.randint(0, 100, size=2 ** 8), lambda x: x % 3, None] ) @pytest.mark.parametrize("as_index_series_or_dataframe", [0, 1, 2]) def test_series_groupby(by, as_index_series_or_dataframe): if as_index_series_or_dataframe <= 1: as_index = as_index_series_or_dataframe == 1 - series_data = np.random.randint(97, 198, size=2**8) + series_data = np.random.randint(97, 198, size=2 ** 8) modin_series = pd.Series(series_data) pandas_series = pandas.Series(series_data) else: @@ -1257,19 +1257,13 @@ def eval_groups(modin_groupby, pandas_groupby): @_copy_pandas_groupby_if_needed def eval_shift(modin_groupby, pandas_groupby): eval_general( - modin_groupby, - pandas_groupby, - lambda groupby: groupby.shift(), + modin_groupby, pandas_groupby, lambda groupby: groupby.shift(), ) eval_general( - modin_groupby, - pandas_groupby, - lambda groupby: groupby.shift(periods=0), + modin_groupby, pandas_groupby, lambda groupby: groupby.shift(periods=0), ) eval_general( - modin_groupby, - pandas_groupby, - lambda groupby: groupby.shift(periods=-3), + modin_groupby, pandas_groupby, lambda groupby: groupby.shift(periods=-3), ) # Disabled for `BaseOnPython` because of the issue with `getitem_array`. @@ -1298,7 +1292,7 @@ def eval_shift(modin_groupby, pandas_groupby): def test_groupby_on_index_values_with_loop(): - length = 2**6 + length = 2 ** 6 data = { "a": np.random.randint(0, 100, size=length), "b": np.random.randint(0, 100, size=length), @@ -1338,7 +1332,7 @@ def test_groupby_on_index_values_with_loop(): ], ) def test_groupby_multiindex(groupby_kwargs): - frame_data = np.random.randint(0, 100, size=(2**6, 2**4)) + frame_data = np.random.randint(0, 100, size=(2 ** 6, 2 ** 4)) modin_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) @@ -1958,8 +1952,7 @@ def test_not_str_by(by, as_index): # 0 and -1 are considered to be the indices of the columns to group on. 
pytest.param({1: "sum", 2: "nunique"}, id="dict_agg_no_intersection_with_by"), pytest.param( - {0: "mean", 1: "sum", 2: "nunique"}, - id="dict_agg_has_intersection_with_by", + {0: "mean", 1: "sum", 2: "nunique"}, id="dict_agg_has_intersection_with_by", ), pytest.param( {1: "sum", 2: "nunique", -1: "nunique"}, diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 768a3ca4ebf..bcb8017556b 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -269,13 +269,7 @@ def test_read_csv_delimiters( ) @pytest.mark.parametrize("skip_blank_lines", [True, False]) def test_read_csv_col_handling( - self, - header, - index_col, - prefix, - names, - usecols, - skip_blank_lines, + self, header, index_col, prefix, names, usecols, skip_blank_lines, ): if names is lib.no_default: pytest.skip("some parameters combiantions fails: issue #2312") @@ -318,11 +312,7 @@ def test_from_csv_with_callable_usecols(self, usecols): ) @pytest.mark.parametrize("skipfooter", [0, 10]) def test_read_csv_parsing_1( - self, - dtype, - engine, - converters, - skipfooter, + self, dtype, engine, converters, skipfooter, ): if dtype: @@ -369,14 +359,7 @@ def test_read_csv_parsing_1( ) @pytest.mark.parametrize("encoding", ["latin1", "windows-1251", None]) def test_read_csv_parsing_2( - self, - make_csv_file, - request, - header, - skiprows, - nrows, - names, - encoding, + self, make_csv_file, request, header, skiprows, nrows, names, encoding, ): xfail_case = ( StorageFormat.get() == "Hdk" @@ -398,8 +381,7 @@ def test_read_csv_parsing_2( with ensure_clean(".csv") as unique_filename: if encoding: make_csv_file( - filename=unique_filename, - encoding=encoding, + filename=unique_filename, encoding=encoding, ) kwargs = { "filepath_or_buffer": unique_filename @@ -433,11 +415,7 @@ def test_read_csv_parsing_2( @pytest.mark.parametrize("skipfooter", [0, 10]) @pytest.mark.parametrize("nrows", [35, None]) def test_read_csv_parsing_3( - self, - true_values, - false_values, - skipfooter, - nrows, + self, true_values, false_values, skipfooter, nrows, ): xfail_case = ( (false_values or true_values) @@ -471,8 +449,7 @@ def test_read_csv_skipinitialspace(self): eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True) @pytest.mark.parametrize( - "test_case", - ["single_element", "single_column", "multiple_columns"], + "test_case", ["single_element", "single_column", "multiple_columns"], ) def test_read_csv_squeeze(self, request, test_case): if request.config.getoption("--simulate-cloud").lower() != "off": @@ -529,12 +506,7 @@ def test_read_csv_does_not_warn_mangle_dupe_cols_kwarg(self): @pytest.mark.parametrize("verbose", [True, False]) @pytest.mark.parametrize("skip_blank_lines", [True, False]) def test_read_csv_nans_handling( - self, - na_values, - keep_default_na, - na_filter, - verbose, - skip_blank_lines, + self, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, ): eval_io( fn_name="read_csv", @@ -718,13 +690,7 @@ def test_read_csv_encoding(self, make_csv_file, encoding): @pytest.mark.parametrize("escapechar", [None, "d", "x"]) @pytest.mark.parametrize("dialect", ["test_csv_dialect", None]) def test_read_csv_file_format( - self, - make_csv_file, - thousands, - decimal, - lineterminator, - escapechar, - dialect, + self, make_csv_file, thousands, decimal, lineterminator, escapechar, dialect, ): if Engine.get() != "Python" and lineterminator == "x": pytest.xfail("read_csv with Ray engine outputs empty frame - issue #2493") @@ -779,12 +745,7 @@ def 
test_read_csv_file_format( @pytest.mark.parametrize("doublequote", [True, False]) @pytest.mark.parametrize("comment", [None, "#", "x"]) def test_read_csv_quoting( - self, - make_csv_file, - quoting, - quotechar, - doublequote, - comment, + self, make_csv_file, quoting, quotechar, doublequote, comment, ): # in these cases escapechar should be set, otherwise error occures # _csv.Error: need to escape, but no escapechar set" @@ -822,10 +783,7 @@ def test_read_csv_quoting( reason="In compat mode, some error handling tests are failing due to https://github.com/modin-project/modin/issues/2845", ) def test_read_csv_error_handling( - self, - warn_bad_lines, - error_bad_lines, - on_bad_lines, + self, warn_bad_lines, error_bad_lines, on_bad_lines, ): # in that case exceptions are raised both by Modin and pandas # and tests pass @@ -909,8 +867,7 @@ def test_read_csv_internal( ) else: make_csv_file( - filename=unique_filename, - delimiter=delimiter, + filename=unique_filename, delimiter=delimiter, ) eval_io( @@ -1327,12 +1284,10 @@ def test_to_csv_with_index(self): ] ).transpose() modin_df = pd.DataFrame( - values, - columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)], + values, columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)], ).set_index("key") pandas_df = pandas.DataFrame( - values, - columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)], + values, columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)], ).set_index("key") eval_to_file(modin_df, pandas_df, "to_csv", "csv") @@ -1915,24 +1870,21 @@ def test_excel_empty_line(self): def test_read_excel_empty_rows(self): # Test parsing empty rows in middle of excel dataframe as NaN values eval_io( - fn_name="read_excel", - io="modin/pandas/test/data/test_empty_rows.xlsx", + fn_name="read_excel", io="modin/pandas/test/data/test_empty_rows.xlsx", ) @check_file_leaks def test_read_excel_border_rows(self): # Test parsing border rows as NaN values in excel dataframe eval_io( - fn_name="read_excel", - io="modin/pandas/test/data/test_border_rows.xlsx", + fn_name="read_excel", io="modin/pandas/test/data/test_border_rows.xlsx", ) @check_file_leaks def test_read_excel_every_other_nan(self): # Test for reading excel dataframe with every other row as a NaN value eval_io( - fn_name="read_excel", - io="modin/pandas/test/data/every_other_row_nan.xlsx", + fn_name="read_excel", io="modin/pandas/test/data/every_other_row_nan.xlsx", ) @pytest.mark.parametrize( @@ -2164,16 +2116,13 @@ def test_read_sql_from_sql_server(self): "mssql+pymssql://sa:Strong.Pwd-123@0.0.0.0:1433/master" ) pandas_df_to_read = pandas.DataFrame( - np.arange( - 1000 * 256, - ).reshape(1000, 256) + np.arange(1000 * 256,).reshape(1000, 256) ).add_prefix("col") pandas_df_to_read.to_sql( table_name, sqlalchemy_connection_string, if_exists="replace" ) modin_df = pd.read_sql( - query, - ModinDatabaseConnection("sqlalchemy", sqlalchemy_connection_string), + query, ModinDatabaseConnection("sqlalchemy", sqlalchemy_connection_string), ) pandas_df = pandas.read_sql(query, sqlalchemy_connection_string) df_equals(modin_df, pandas_df) @@ -2187,15 +2136,10 @@ def test_read_sql_from_postgres(self): query = f"SELECT * FROM {table_name}" connection = "postgresql://sa:Strong.Pwd-123@localhost:2345/postgres" pandas_df_to_read = pandas.DataFrame( - np.arange( - 1000 * 256, - ).reshape(1000, 256) + np.arange(1000 * 256,).reshape(1000, 256) ).add_prefix("col") pandas_df_to_read.to_sql(table_name, connection, if_exists="replace") - modin_df = pd.read_sql( - query, - 
ModinDatabaseConnection("psycopg2", connection), - ) + modin_df = pd.read_sql(query, ModinDatabaseConnection("psycopg2", connection),) pandas_df = pandas.read_sql(query, connection) df_equals(modin_df, pandas_df) @@ -2542,8 +2486,7 @@ def test_read_feather_s3(self, storage_options): def test_read_feather_path_object(self, make_feather_file): eval_io( - fn_name="read_feather", - path=Path(make_feather_file()), + fn_name="read_feather", path=Path(make_feather_file()), ) @pytest.mark.xfail( diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py index 689720ce180..257669777b7 100644 --- a/modin/pandas/test/test_rolling.py +++ b/modin/pandas/test/test_rolling.py @@ -152,8 +152,7 @@ def test_dataframe_dt_index(axis, on, closed, window): df_equals(modin_rolled.count(), pandas_rolled.count()) df_equals(modin_rolled.skew(), pandas_rolled.skew()) df_equals( - modin_rolled.apply(np.sum, raw=True), - pandas_rolled.apply(np.sum, raw=True), + modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True), ) df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) @@ -168,16 +167,10 @@ def test_series(data, window, min_periods, win_type): if window > len(pandas_series): window = len(pandas_series) pandas_rolled = pandas_series.rolling( - window=window, - min_periods=min_periods, - win_type=win_type, - center=True, + window=window, min_periods=min_periods, win_type=win_type, center=True, ) modin_rolled = modin_series.rolling( - window=window, - min_periods=min_periods, - win_type=win_type, - center=True, + window=window, min_periods=min_periods, win_type=win_type, center=True, ) # Testing of Window class if win_type is not None: @@ -196,8 +189,7 @@ def test_series(data, window, min_periods, win_type): df_equals(modin_rolled.min(), pandas_rolled.min()) df_equals(modin_rolled.max(), pandas_rolled.max()) df_equals( - modin_rolled.corr(modin_series), - pandas_rolled.corr(pandas_series), + modin_rolled.corr(modin_series), pandas_rolled.corr(pandas_series), ) df_equals( modin_rolled.cov(modin_series, True), pandas_rolled.cov(pandas_series, True) @@ -211,8 +203,7 @@ def test_series(data, window, min_periods, win_type): df_equals(modin_rolled.apply(np.sum), pandas_rolled.apply(np.sum)) df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) df_equals( - modin_rolled.agg([np.sum, np.mean]), - pandas_rolled.agg([np.sum, np.mean]), + modin_rolled.agg([np.sum, np.mean]), pandas_rolled.agg([np.sum, np.mean]), ) df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 22e551d6a7a..e6b302f72bc 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -503,9 +503,7 @@ def test___pow__(data): "dt_index", [True, False], ids=["dt_index_true", "dt_index_false"] ) @pytest.mark.parametrize( - "data", - [*test_data_values, "empty"], - ids=[*test_data_keys, "empty"], + "data", [*test_data_values, "empty"], ids=[*test_data_keys, "empty"], ) def test___repr__(name, dt_index, data): if data == "empty": @@ -646,8 +644,7 @@ def test_agg(data, func): "Older pandas raises TypeError but Modin conforms to AssertionError" ) eval_general( - *create_test_series(data), - lambda df: df.agg(func), + *create_test_series(data), lambda df: df.agg(func), ) @@ -658,8 +655,7 @@ def test_agg_except(data, func): # See details in pandas issue 36036. 
with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.agg(func), + *create_test_series(data), lambda df: df.agg(func), ) @@ -671,8 +667,7 @@ def test_agg_numeric(request, data, func): ): axis = 0 eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), + *create_test_series(data), lambda df: df.agg(func, axis), ) @@ -687,8 +682,7 @@ def test_agg_numeric_except(request, data, func): # See details in pandas issue 36036. with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), + *create_test_series(data), lambda df: df.agg(func, axis), ) @@ -704,8 +698,7 @@ def test_aggregate(data, func): ) axis = 0 eval_general( - *create_test_series(data), - lambda df: df.aggregate(func, axis), + *create_test_series(data), lambda df: df.aggregate(func, axis), ) @@ -717,8 +710,7 @@ def test_aggregate_except(data, func): # See details in pandas issues 36036. with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.aggregate(func, axis), + *create_test_series(data), lambda df: df.aggregate(func, axis), ) @@ -730,8 +722,7 @@ def test_aggregate_numeric(request, data, func): ): axis = 0 eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), + *create_test_series(data), lambda df: df.agg(func, axis), ) @@ -746,8 +737,7 @@ def test_aggregate_numeric_except(request, data, func): # See details in pandas issues 36036. with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), + *create_test_series(data), lambda df: df.agg(func, axis), ) @@ -871,8 +861,7 @@ def test_append(data): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply(data, func): eval_general( - *create_test_series(data), - lambda df: df.apply(func), + *create_test_series(data), lambda df: df.apply(func), ) @@ -883,8 +872,7 @@ def test_apply_except(data, func): # See details in pandas issues 36036. with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.apply(func), + *create_test_series(data), lambda df: df.apply(func), ) @@ -913,8 +901,7 @@ def test_apply_external_lib(): def test_apply_numeric(request, data, func): if name_contains(request.node.name, numeric_dfs): eval_general( - *create_test_series(data), - lambda df: df.apply(func), + *create_test_series(data), lambda df: df.apply(func), ) @@ -926,8 +913,7 @@ def test_apply_numeric_except(request, data, func): # See details in pandas issues 36036. 
with pytest.raises(SpecificationError): eval_general( - *create_test_series(data), - lambda df: df.apply(func), + *create_test_series(data), lambda df: df.apply(func), ) @@ -1019,8 +1005,7 @@ def test_asof(where): @pytest.mark.parametrize( - "where", - [20, 30, [10.5, 40.5], [10], pandas.Index([20, 30]), pandas.Index([10.5])], + "where", [20, 30, [10.5, 40.5], [10], pandas.Index([20, 30]), pandas.Index([10.5])], ) def test_asof_large(where): values = test_data["float_nan_data"]["col1"] @@ -1193,9 +1178,7 @@ def test_bool(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) def test_clip_scalar(request, data, bound_type): - modin_series, pandas_series = create_test_series( - data, - ) + modin_series, pandas_series = create_test_series(data,) if name_contains(request.node.name, numeric_dfs): # set bounds @@ -1215,9 +1198,7 @@ def test_clip_scalar(request, data, bound_type): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) def test_clip_sequence(request, data, bound_type): - modin_series, pandas_series = create_test_series( - data, - ) + modin_series, pandas_series = create_test_series(data,) if name_contains(request.node.name, numeric_dfs): lower = random_state.random_integers(RAND_LOW, RAND_HIGH, len(pandas_series)) @@ -1691,8 +1672,7 @@ def test_dt(timezone): modin_series.dt.to_pydatetime(), pandas_series.dt.to_pydatetime() ) df_equals( - modin_series.dt.tz_localize(None), - pandas_series.dt.tz_localize(None), + modin_series.dt.tz_localize(None), pandas_series.dt.tz_localize(None), ) if timezone: df_equals( @@ -2138,9 +2118,7 @@ def test_kurtosis_level(level): pandas_s.columns = index eval_general( - modin_s, - pandas_s, - lambda s: s.kurtosis(axis=1, level=level), + modin_s, pandas_s, lambda s: s.kurtosis(axis=1, level=level), ) @@ -2166,8 +2144,7 @@ def test_index_order(func): s_pandas.index = index df_equals( - getattr(s_modin, func)(level=0).index, - getattr(s_pandas, func)(level=0).index, + getattr(s_modin, func)(level=0).index, getattr(s_pandas, func)(level=0).index, ) @@ -2750,9 +2727,7 @@ def test_repeat(data, repeats): ) def test_repeat_lists(data, repeats): eval_general( - pd.Series(data), - pandas.Series(data), - lambda df: df.repeat(repeats), + pd.Series(data), pandas.Series(data), lambda df: df.repeat(repeats), ) @@ -2817,12 +2792,10 @@ def test_resample(closed, label, level): pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), ) df_equals( - modin_resampler.aggregate("max"), - pandas_resampler.aggregate("max"), + modin_resampler.aggregate("max"), pandas_resampler.aggregate("max"), ) df_equals( - modin_resampler.apply("sum"), - pandas_resampler.apply("sum"), + modin_resampler.apply("sum"), pandas_resampler.apply("sum"), ) df_equals( modin_resampler.get_group(name=list(modin_resampler.groups)[0]), @@ -2834,8 +2807,7 @@ def test_resample(closed, label, level): # Upsampling from level= or on= selection is not supported if level is None: df_equals( - modin_resampler.interpolate(), - pandas_resampler.interpolate(), + modin_resampler.interpolate(), pandas_resampler.interpolate(), ) df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) df_equals( @@ -3035,8 +3007,7 @@ def test_sem_float_nan_only(skipna, ddof): @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) def test_sem_int_only(ddof): eval_general( - 
*create_test_series(test_data["int_data"]), - lambda df: df.sem(ddof=ddof), + *create_test_series(test_data["int_data"]), lambda df: df.sem(ddof=ddof), ) @@ -3082,8 +3053,7 @@ def test_shift_slice_shift(data, index, periods): ] df_equals( - modin_series.shift(periods=periods), - pandas_series.shift(periods=periods), + modin_series.shift(periods=periods), pandas_series.shift(periods=periods), ) df_equals( modin_series.shift(periods=periods, fill_value=777), @@ -3114,9 +3084,7 @@ def test_sort_index(data, ascending, sort_remaining, na_position): modin_series, pandas_series, lambda df: df.sort_index( - ascending=ascending, - sort_remaining=sort_remaining, - na_position=na_position, + ascending=ascending, sort_remaining=sort_remaining, na_position=na_position, ), ) @@ -3352,8 +3320,7 @@ def test_series_empty_values(): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_string(request, data): eval_general( - *create_test_series(data), - lambda df: df.to_string(), + *create_test_series(data), lambda df: df.to_string(), ) @@ -3382,8 +3349,7 @@ def test_tolist(data): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_transform(data, func): eval_general( - *create_test_series(data), - lambda df: df.transform(func), + *create_test_series(data), lambda df: df.transform(func), ) @@ -3391,8 +3357,7 @@ def test_transform(data, func): @pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys) def test_transform_except(data, func): eval_general( - *create_test_series(data), - lambda df: df.transform(func), + *create_test_series(data), lambda df: df.transform(func), ) @@ -3475,8 +3440,7 @@ def test_tz_localize(): pandas_series.tz_localize("America/Los_Angeles"), ) df_equals( - modin_series.tz_localize("UTC"), - pandas_series.tz_localize("UTC"), + modin_series.tz_localize("UTC"), pandas_series.tz_localize("UTC"), ) @@ -3582,7 +3546,7 @@ def sort_sensitive_comparator(df1, df2): ) # from issue #2365 - arr = np.random.rand(2**6) + arr = np.random.rand(2 ** 6) arr[::10] = np.nan eval_general( *create_test_series(arr), @@ -3608,8 +3572,7 @@ def test_value_counts_categorical(): random_state.shuffle(data) eval_general( - *create_test_series(data, dtype="category"), - lambda df: df.value_counts(), + *create_test_series(data, dtype="category"), lambda df: df.value_counts(), ) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 21565286309..1bc1d926f3c 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -42,9 +42,9 @@ random_state = np.random.RandomState(seed=42) DATASET_SIZE_DICT = { - "Small": (2**2, 2**3), - "Normal": (2**6, 2**8), - "Big": (2**7, 2**12), + "Small": (2 ** 2, 2 ** 3), + "Normal": (2 ** 6, 2 ** 8), + "Big": (2 ** 7, 2 ** 12), } # Size of test dataframes @@ -852,9 +852,7 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): f.write(csv_str) eval_io( - filepath_or_buffer=unique_filename, - fn_name="read_csv", - **kwargs, + filepath_or_buffer=unique_filename, fn_name="read_csv", **kwargs, ) finally: diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 7564732c9bd..5d5de728041 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -255,11 +255,7 @@ def check_both_not_none(option1, option2): def broadcast_item( - obj, - row_lookup, - col_lookup, - item, - need_columns_reindex=True, + obj, row_lookup, col_lookup, item, need_columns_reindex=True, ): """ Use NumPy to broadcast or reshape item with reindexing. 
diff --git a/modin/pandas/window.py b/modin/pandas/window.py index fa7922df5c8..f14ca384935 100644 --- a/modin/pandas/window.py +++ b/modin/pandas/window.py @@ -192,20 +192,13 @@ def apply( ) def aggregate( - self, - func, - *args, - **kwargs, + self, func, *args, **kwargs, ): from .dataframe import DataFrame dataframe = DataFrame( query_compiler=self._query_compiler.rolling_aggregate( - self.axis, - self.rolling_args, - func, - *args, - **kwargs, + self.axis, self.rolling_args, func, *args, **kwargs, ) ) if isinstance(self._dataframe, DataFrame): diff --git a/modin/test/interchange/dataframe_protocol/base/test_utils.py b/modin/test/interchange/dataframe_protocol/base/test_utils.py index 5fdc803e331..a5de4f5edd3 100644 --- a/modin/test/interchange/dataframe_protocol/base/test_utils.py +++ b/modin/test/interchange/dataframe_protocol/base/test_utils.py @@ -42,10 +42,7 @@ (np.dtype("float32"), "f"), (np.dtype("float64"), "g"), (pandas.Series(["a"]).dtype, "u"), - ( - pandas.Series([0]).astype("datetime64[ns]").dtype, - "tsn:", - ), + (pandas.Series([0]).astype("datetime64[ns]").dtype, "tsn:",), ], ) def test_dtype_to_arrow_c(pandas_dtype, c_string): # noqa PR01 diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index 92c91027cdd..43718943efe 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -166,16 +166,10 @@ def func_to_apply(partition, row_internal_indices, col_internal_indices, item): ) @pytest.mark.parametrize( "test_type", - [ - "many_small_dfs", - "concatted_df_with_small_dfs", - "large_df_plus_small_dfs", - ], + ["many_small_dfs", "concatted_df_with_small_dfs", "large_df_plus_small_dfs",], ) @pytest.mark.parametrize( - "set_num_partitions", - [1, 4], - indirect=True, + "set_num_partitions", [1, 4], indirect=True, ) def test_rebalance_partitions(test_type, set_num_partitions): num_partitions = NPartitions.get() @@ -300,9 +294,7 @@ class TestDrainVirtualPartitionCallQueue: """ def test_from_virtual_partitions_with_call_queues( - self, - axis, - virtual_partition_class, + self, axis, virtual_partition_class, ): # reverse the dataframe along the virtual partition axis. 
def reverse(df): @@ -333,8 +325,7 @@ def reverse(df): else: expected_df = pandas.DataFrame([[1, 0, 3, 2]], columns=[0, 0, 0, 0]) df_equals( - level_two_virtual.to_pandas(), - expected_df, + level_two_virtual.to_pandas(), expected_df, ) def test_from_block_and_virtual_partition_with_call_queues( diff --git a/modin/test/test_envvar_npartitions.py b/modin/test/test_envvar_npartitions.py index fa23bbef957..e2d0db8570a 100644 --- a/modin/test/test_envvar_npartitions.py +++ b/modin/test/test_envvar_npartitions.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize("num_partitions", [2, 4, 6, 8, 10]) def test_set_npartitions(num_partitions): NPartitions.put(num_partitions) - data = np.random.randint(0, 100, size=(2**16, 2**8)) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) df = pd.DataFrame(data) part_shape = df._query_compiler._modin_frame._partitions.shape assert part_shape[0] == num_partitions and part_shape[1] == min(num_partitions, 8) @@ -31,7 +31,7 @@ def test_set_npartitions(num_partitions): @pytest.mark.parametrize("right_num_partitions", [2, 4, 6, 8, 10]) def test_runtime_change_npartitions(left_num_partitions, right_num_partitions): NPartitions.put(left_num_partitions) - data = np.random.randint(0, 100, size=(2**16, 2**8)) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) left_df = pd.DataFrame(data) part_shape = left_df._query_compiler._modin_frame._partitions.shape assert part_shape[0] == left_num_partitions and part_shape[1] == min( diff --git a/modin/test/test_partition_api.py b/modin/test/test_partition_api.py index 5f4d8a44bf6..b85d5309c8c 100644 --- a/modin/test/test_partition_api.py +++ b/modin/test/test_partition_api.py @@ -84,10 +84,8 @@ def get_df(lib, data): get_func(actual_partitions[row_idx][col_idx]), ) else: - expected_axis_partitions = ( - expected_df._query_compiler._modin_frame._partition_mgr_cls.axis_partition( - expected_partitions, axis ^ 1 - ) + expected_axis_partitions = expected_df._query_compiler._modin_frame._partition_mgr_cls.axis_partition( + expected_partitions, axis ^ 1 ) expected_axis_partitions = [ axis_partition.force_materialization().unwrap(squeeze=True) diff --git a/modin/test/test_utils.py b/modin/test/test_utils.py index 98c2439e208..01eb6673163 100644 --- a/modin/test/test_utils.py +++ b/modin/test/test_utils.py @@ -138,11 +138,7 @@ class Child(Parent): @pytest.mark.parametrize( "source_doc,to_append,expected", [ - ( - "One-line doc.", - "One-line message.", - "One-line doc.One-line message.", - ), + ("One-line doc.", "One-line message.", "One-line doc.One-line message.",), ( """ Regular doc-string diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py index 5a525637136..66f2955a11a 100644 --- a/scripts/doc_checker.py +++ b/scripts/doc_checker.py @@ -128,8 +128,7 @@ def check_optional_args(doc: Docstring) -> list: ( "MD01", MODIN_ERROR_CODES["MD01"].format( - parameter=parameter, - found=type_line, + parameter=parameter, found=type_line, ), ) )