diff --git a/.github/workflows/ci-pre-commit.yml b/.github/workflows/ci-pre-commit.yml
index 7c45cdcb..8cf7c5ec 100644
--- a/.github/workflows/ci-pre-commit.yml
+++ b/.github/workflows/ci-pre-commit.yml
@@ -6,6 +6,8 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-      - uses: pre-commit/action@v2.0.3
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2dca9160..a78f50b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,16 +24,16 @@ jobs:
       matrix:
         numfocus_nightly: [false]
         os: ["ubuntu-latest"]
-        pyarrow: ["0.17.1", "1.0.1", "2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "nightly"]
-        python: ["3.7", "3.8"]
+        pyarrow: ["2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "6.0.2", "nightly"]
+        python: ["3.8"]
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pyarrow: "2.0.0"
+            pyarrow: "6.0.2"
             python: "3.8"
          - numfocus_nightly: false
            os: "macos-latest"
-            pyarrow: "0.17.1"
+            pyarrow: "5.0.0"
            python: "3.8"

    continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index b0dedf99..81000099 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -9,11 +9,12 @@ jobs:
       - name: Checkout source
         uses: actions/checkout@v2

-      - name: Setup Conda Environment
-        uses: conda-incubator/setup-miniconda@v2.1.1
+      - name: Mamba Docs environment
+        uses: mamba-org/provision-with-micromamba@main
         with:
           environment-file: docs/environment-docs.yml
-          activate-environment: kartothek-docs
+          environment-name: kartothek-docs
+          cache-downloads: true

       - name: List conda
         shell: bash -l {0}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index deea8b57..2709e77e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
-  - repo: https://github.com/ambv/black
-    rev: 19.10b0
+  - repo: https://github.com/psf/black
+    rev: 22.10.0
     hooks:
       - id: black
         args:
          - --safe
-          - --target-version=py36
+          - --target-version=py38

   - repo: https://github.com/asottile/blacken-docs
     rev: v1.7.0
@@ -13,15 +13,15 @@ repos:
       - id: blacken-docs
         additional_dependencies: [black==19.10b0]
         args:
-          - --target-version=py36
+          - --target-version=py38

-  - repo: https://gitlab.com/pycqa/flake8
+  - repo: https://github.com/PyCQA/flake8
     rev: 3.8.3
     hooks:
       - id: flake8

   - repo: https://github.com/pre-commit/mirrors-isort
-    rev: v4.3.21
+    rev: v5.10.1
     hooks:
       - id: isort
         additional_dependencies: [toml]
diff --git a/asv_bench/benchmarks/filter.py b/asv_bench/benchmarks/filter.py
index e514f804..118a6f2a 100644
--- a/asv_bench/benchmarks/filter.py
+++ b/asv_bench/benchmarks/filter.py
@@ -26,7 +26,7 @@ def setup(self, predicate):
             [("int32", "<", 321)],
             [("int32", "<", 321)],
         ]
-        self.df = get_dataframe_not_nested(10 ** 5)
+        self.df = get_dataframe_not_nested(10**5)

     def time_filter_df_from_predicates(self, predicate):
         filter_df_from_predicates(self.df, self.predicate)
@@ -48,7 +48,7 @@ def setup(self, column):
         if column == "null":
             raise NotImplementedError()
         self.arr = (
-            get_dataframe_not_nested(10 ** 5)
+            get_dataframe_not_nested(10**5)
             .sample(frac=1.0)
             .reset_index(drop=True)[column]
             .values
@@ -69,7 +69,7 @@ class TimeFilterArrayIn:
     params = (
         cols_to_filter,
         [10, 100, 1000],
-        [10 ** 4, 10 ** 5, 10 ** 6],
+        [10**4, 10**5, 10**6],
     )
     param_names = ["column", "filter_size", "array_size", "enabled"]
diff --git a/asv_bench/benchmarks/index.py b/asv_bench/benchmarks/index.py
index 75dae7c0..09703223 100644
--- a/asv_bench/benchmarks/index.py
+++ b/asv_bench/benchmarks/index.py
@@ -60,8 +60,8 @@ def teardown(self, number_values, number_partitions, dtype):

 class Index(IndexBase):
     params = (
-        [10 * 1, 10 ** 3],  # values
-        [10 * 1, 10 ** 3],  # partitions
+        [10 * 1, 10**3],  # values
+        [10 * 1, 10**3],  # partitions
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values", "number_partitions", "dtype"]
@@ -93,7 +93,7 @@ def time_observed_values(self, number_values, number_partitions, arrow_type):
 class SerializeIndex(IndexBase):
     timeout = 180
     params = (
-        [(10 ** 3, 10), (10 ** 4, 100)],  # (values, partitions)
+        [(10**3, 10), (10**4, 100)],  # (values, partitions)
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values__number_partitions", "dtype"]
@@ -117,7 +117,7 @@ def time_serialization(self, number_values__number_partitions, arrow_type):


 class BuildIndex(AsvBenchmarkConfig):
-    params = ([-1, 1], [10 ** 3, 10 ** 4], [10, 100])
+    params = ([-1, 1], [10**3, 10**4], [10, 100])
     param_names = ["cardinality", "num_values", "partitions_to_merge"]

     def setup(self, cardinality, num_values, partitions_to_merge):
diff --git a/asv_bench/benchmarks/metapartition.py b/asv_bench/benchmarks/metapartition.py
index 65e427dc..c9d67c7a 100644
--- a/asv_bench/benchmarks/metapartition.py
+++ b/asv_bench/benchmarks/metapartition.py
@@ -16,7 +16,7 @@ class TimeMetaPartition(AsvBenchmarkConfig):
     params = (
-        [10 ** 5, 10 ** 6],
+        [10**5, 10**6],
         [
             (np.int64, 123456789),
             (str, "abcdefgh"),
diff --git a/asv_bench/benchmarks/predicate_pushdown.py b/asv_bench/benchmarks/predicate_pushdown.py
index ee876b2e..54af3ee6 100644
--- a/asv_bench/benchmarks/predicate_pushdown.py
+++ b/asv_bench/benchmarks/predicate_pushdown.py
@@ -10,7 +10,7 @@ class TimeRestore:
     of iterating over dictionaries in Python.
     """

-    params = [(10 ** 3, 10 ** 4), (10, 10 ** 2, 10 ** 3)]
+    params = [(10**3, 10**4), (10, 10**2, 10**3)]
     param_names = ["num_rows", "chunk_size"]

     def setup(self, num_rows, chunk_size):
diff --git a/asv_bench/benchmarks/schema.py b/asv_bench/benchmarks/schema.py
index c4f16ce7..5a03eb8e 100644
--- a/asv_bench/benchmarks/schema.py
+++ b/asv_bench/benchmarks/schema.py
@@ -23,7 +23,7 @@ def time_make_meta(self):


 class TimeValidateCompatible(AsvBenchmarkConfig):
-    params = ([2, 10 ** 2, 10 ** 3, 10 ** 4], [True, False])
+    params = ([2, 10**2, 10**3, 10**4], [True, False])
     timeout = 120.0
     param_names = ["num_schemas", "has_na"]

@@ -50,7 +50,7 @@ def time_validate_compatible(self, num_schemas, has_na):


 class TimeValidateSharedColumns(AsvBenchmarkConfig):
-    params = [2, 10 ** 2]
+    params = [2, 10**2]
     timeout = 120.0
     param_names = ["num_schemas"]
diff --git a/asv_bench/benchmarks/write.py b/asv_bench/benchmarks/write.py
index ce42b40d..eba544cb 100644
--- a/asv_bench/benchmarks/write.py
+++ b/asv_bench/benchmarks/write.py
@@ -55,7 +55,7 @@ def generate_metadata(max_depth=7, num_leafs=5):

 class TimeStoreDataset(AsvBenchmarkConfig):
     timeout = 120
-    params = ([10, 10 ** 2, 10 ** 3], [4], [2, 4])
+    params = ([10, 10**2, 10**3], [4], [2, 4])
     param_names = ["num_partitions", "max_depth", "num_leafs"]

     def setup(self, num_partitions, max_depth, num_leafs):
@@ -76,7 +76,7 @@ def time_store_dataset_from_partitions(self, num_partitions, max_depth, num_leaf

 class TimePersistMetadata(AsvBenchmarkConfig):
     timeout = 240
-    params = [1, 10 ** 2, 10 ** 3]
+    params = [1, 10**2, 10**3]

     def setup(self, num_partitions):
         self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))
diff --git a/docs/environment-docs.yml b/docs/environment-docs.yml
index f298efd9..0403d3c9 100644
--- a/docs/environment-docs.yml
+++ b/docs/environment-docs.yml
@@ -2,6 +2,7 @@ name: kartothek-docs
 channels:
   - conda-forge
 dependencies:
+  - python=3.8
   - dask[dataframe]
   - decorator
   - msgpack-python>=0.5.2
@@ -10,6 +11,7 @@ dependencies:
   - pandas>=0.23.0, !=1.0.0
   - pyarrow>=0.17.1,!=1.0.0, <4
   - simplejson
+  - jinja2<3.1
   - simplekv
   - storefact
   - toolz
diff --git a/kartothek/api/discover.py b/kartothek/api/discover.py
index b1cd21e7..7f15ef84 100644
--- a/kartothek/api/discover.py
+++ b/kartothek/api/discover.py
@@ -275,7 +275,8 @@ def discover_cube(
     if len(partition_keys) == 0:
         raise ValueError(
             'Seed dataset ("{seed_dataset}") has no partition keys.'.format(  # type: ignore # noqa
-                seed_dataset=seed_dataset, partition_keys=", ".join(partition_keys),
+                seed_dataset=seed_dataset,
+                partition_keys=", ".join(partition_keys),
             )
         )
     elif len(partition_keys) < 2:
diff --git a/kartothek/core/dataset.py b/kartothek/core/dataset.py
index b398b20c..002199ff 100644
--- a/kartothek/core/dataset.py
+++ b/kartothek/core/dataset.py
@@ -280,7 +280,8 @@ def load_index(self: T, column: str, store: StoreInput) -> T:
         return self.copy(indices=indices)

     @deprecate_parameters_if_set(
-        DEPRECATION_WARNING_REMOVE_PARAMETER, "load_partition_indices",
+        DEPRECATION_WARNING_REMOVE_PARAMETER,
+        "load_partition_indices",
     )
     def load_all_indices(
         self: T, store: StoreInput, load_partition_indices: bool = True
@@ -446,7 +447,9 @@ def get_indices_as_dataframe(
             )
         else:
             df = dm._evaluate_conjunction(
-                columns=columns, predicates=None, date_as_object=date_as_object,
+                columns=columns,
+                predicates=None,
+                date_as_object=date_as_object,
             )
         return df
diff --git a/kartothek/core/factory.py b/kartothek/core/factory.py
index a3590628..1e365f63 100644
--- a/kartothek/core/factory.py
+++ b/kartothek/core/factory.py
@@ -165,7 +165,9 @@ def load_index(self: T, column, store=None) -> T:
         "load_partition_indices",
     )
     def load_all_indices(
-        self: T, store: Any = None, load_partition_indices: bool = True,
+        self: T,
+        store: Any = None,
+        load_partition_indices: bool = True,
     ) -> T:
         self._cache_metadata = self.dataset_metadata.load_all_indices(
             self.store, load_partition_indices=load_partition_indices
diff --git a/kartothek/io/dask/_sizeof.py b/kartothek/io/dask/_sizeof.py
index 0ceaf9f9..1ffc763d 100644
--- a/kartothek/io/dask/_sizeof.py
+++ b/kartothek/io/dask/_sizeof.py
@@ -6,12 +6,12 @@ def _dct_sizeof(obj):


 def register_sizeof_ktk_classes():
+    from kartothek.core.common_metadata import SchemaWrapper
     from kartothek.core.dataset import DatasetMetadata
     from kartothek.core.factory import DatasetFactory
-    from kartothek.io_components.metapartition import MetaPartition
     from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex
     from kartothek.core.partition import Partition
-    from kartothek.core.common_metadata import SchemaWrapper
+    from kartothek.io_components.metapartition import MetaPartition

     dask_sizeof.register(DatasetMetadata, _dct_sizeof)
     dask_sizeof.register(DatasetFactory, _dct_sizeof)
diff --git a/kartothek/io/dask/compression.py b/kartothek/io/dask/compression.py
index 19dc7c9f..ab3cda42 100644
--- a/kartothek/io/dask/compression.py
+++ b/kartothek/io/dask/compression.py
@@ -10,8 +10,7 @@

 try:
     # Technically distributed is an optional dependency
-    from distributed.protocol import serialize_bytes
-    from distributed.protocol import deserialize_bytes
+    from distributed.protocol import deserialize_bytes, serialize_bytes

     HAS_DISTRIBUTED = True
 except ImportError:
diff --git a/kartothek/io/dask/dataframe.py b/kartothek/io/dask/dataframe.py
index e5167124..4b6ff762 100644
--- a/kartothek/io/dask/dataframe.py
+++ b/kartothek/io/dask/dataframe.py
@@ -281,7 +281,8 @@ def _shuffle_docs(func):
 @default_docs
 @_shuffle_docs
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "delete_scope",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "delete_scope",
 )
 def store_dataset_from_ddf(
     ddf: dd.DataFrame,
diff --git a/kartothek/io/eager.py b/kartothek/io/eager.py
index f7f480e3..b61efc75 100644
--- a/kartothek/io/eager.py
+++ b/kartothek/io/eager.py
@@ -428,7 +428,9 @@ def read_table(
 @default_docs
 @normalize_args
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "output_dataset_uuid", "df_serializer",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "output_dataset_uuid",
+    "df_serializer",
 )
 def commit_dataset(
     store: Optional[StoreInput] = None,
@@ -712,7 +714,10 @@ def create_empty_dataset_header(
     "df_serializer",
 )
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "metadata", "overwrite", "metadata_merger",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "metadata",
+    "overwrite",
+    "metadata_merger",
 )
 def write_single_partition(
     store: Optional[KeyValueStore] = None,
diff --git a/kartothek/io/testing/append_cube.py b/kartothek/io/testing/append_cube.py
index d4023443..af69767e 100644
--- a/kartothek/io/testing/append_cube.py
+++ b/kartothek/io/testing/append_cube.py
@@ -116,7 +116,10 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -127,7 +130,8 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},
@@ -157,17 +161,27 @@ def test_single_rowgroup_when_df_serializer_is_not_passed_to_append_cube(
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
-        data=df, cube=cube, store=function_store,
+        data=df,
+        cube=cube,
+        store=function_store,
     )

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
+    )
+    result = driver(
+        data={"seed": df_append},
+        cube=cube,
+        store=function_store,
     )
-    result = driver(data={"seed": df_append}, cube=cube, store=function_store,)
     dataset = result["seed"].load_all_indices(function_store())

     part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
@@ -187,7 +201,10 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):
     unnecessarily.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -198,7 +215,8 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},
diff --git a/kartothek/io/testing/build_cube.py b/kartothek/io/testing/build_cube.py
index 26b68e5e..0b11e313 100644
--- a/kartothek/io/testing/build_cube.py
+++ b/kartothek/io/testing/build_cube.py
@@ -280,7 +280,10 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_build_cube(
     """
     Test that the dataset is split into row groups depending on the chunk size
     """
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")

     result = driver(
@@ -304,10 +307,17 @@ def test_single_rowgroup_when_df_serializer_is_not_passed_to_build_cube(
     """
     Test that the dataset has a single row group as default path
     """
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")

-    result = driver(data=df, cube=cube, store=function_store,)
+    result = driver(
+        data=df,
+        cube=cube,
+        store=function_store,
+    )
     dataset = result["seed"].load_all_indices(function_store())

     part_num_rows = {0: 1, 1: 3}
diff --git a/kartothek/io/testing/extend_cube.py b/kartothek/io/testing/extend_cube.py
index abedf734..a3e436a5 100644
--- a/kartothek/io/testing/extend_cube.py
+++ b/kartothek/io/testing/extend_cube.py
@@ -92,7 +92,8 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_extend_cube(
     Test that the dataset is split into row groups depending on the chunk size
     """
     df_extra = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"extra": df_extra},
@@ -116,9 +117,14 @@ def test_single_rowgroup_when_df_serializer_is_not_passed_to_extend_cube(
     Test that the dataset has a single row group as default path
     """
     df_extra = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]},
+        columns=["x", "p"],
+    )
+    result = driver(
+        data={"extra": df_extra},
+        cube=existing_cube,
+        store=function_store,
     )
-    result = driver(data={"extra": df_extra}, cube=existing_cube, store=function_store,)
     dataset = result["extra"].load_all_indices(function_store())

     part_num_rows = {0: 1, 1: 3}
@@ -138,7 +144,10 @@ def test_compression_is_compatible_on_extend_cube(driver, function_store):
     unnecessarily.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -148,7 +157,8 @@ def test_compression_is_compatible_on_extend_cube(driver, function_store):
     )

     df_extra = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [0, 1, 1, 1]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"extra": df_extra},
diff --git a/kartothek/io/testing/update_cube.py b/kartothek/io/testing/update_cube.py
index ff1bff1a..babae0c9 100644
--- a/kartothek/io/testing/update_cube.py
+++ b/kartothek/io/testing/update_cube.py
@@ -133,7 +133,10 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
     ``chunk_size>0`` should be split into row groups accordingly.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1], "p": [0, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -144,7 +147,8 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(

     # Update cube - replace p=1 and append p=2 partitions
     df_update = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_update},
@@ -173,15 +177,21 @@ def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
     Test that the dataset has a single row group as default path
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1], "p": [0, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
-        data=df, cube=cube, store=function_store,
+        data=df,
+        cube=cube,
+        store=function_store,
     )

     # Update cube - replace p=1 and append p=2 partitions
     df_update = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_update},
@@ -208,7 +218,10 @@ def test_compression_is_compatible_on_update_cube(driver, function_store):
     unnecessarily.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1], "p": [0, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -219,7 +232,8 @@ def test_compression_is_compatible_on_update_cube(driver, function_store):

     # Update cube - replace p=1 and append p=2 partitions
     df_update = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_update},
diff --git a/kartothek/io_components/cube/query/__init__.py b/kartothek/io_components/cube/query/__init__.py
index f81ae1fa..f3add9a1 100644
--- a/kartothek/io_components/cube/query/__init__.py
+++ b/kartothek/io_components/cube/query/__init__.py
@@ -256,7 +256,13 @@ def _reduce_dtype(dtype):


 def plan_query(
-    conditions, cube, datasets, dimension_columns, partition_by, payload_columns, store,
+    conditions,
+    cube,
+    datasets,
+    dimension_columns,
+    partition_by,
+    payload_columns,
+    store,
 ):
     """
     Plan cube query execution.
diff --git a/kartothek/io_components/metapartition.py b/kartothek/io_components/metapartition.py
index 0fab39fe..0103b4e4 100644
--- a/kartothek/io_components/metapartition.py
+++ b/kartothek/io_components/metapartition.py
@@ -1285,8 +1285,7 @@ def apply(
         return self.copy(data=new_data, table_meta=new_table_meta)

     def as_sentinel(self):
-        """
-        """
+        """ """
         return MetaPartition(
             None,
             metadata_version=self.metadata_version,
@@ -1798,7 +1797,8 @@ def partition_labels_from_mps(mps: List[MetaPartition]) -> List[str]:


 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "expected_secondary_indices",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "expected_secondary_indices",
 )
 def parse_input_to_metapartition(
     obj: MetaPartitionInput,
diff --git a/kartothek/io_components/utils.py b/kartothek/io_components/utils.py
index 187f58e2..a6031915 100644
--- a/kartothek/io_components/utils.py
+++ b/kartothek/io_components/utils.py
@@ -119,7 +119,8 @@ def _combine_metadata(dataset_metadata, append_to_list):


 def _ensure_compatible_indices(
-    dataset: Optional[DatasetMetadataBase], secondary_indices: Optional[Iterable[str]],
+    dataset: Optional[DatasetMetadataBase],
+    secondary_indices: Optional[Iterable[str]],
 ) -> InferredIndices:
     if dataset:
         ds_secondary_indices = list(dataset.secondary_indices.keys())
diff --git a/kartothek/io_components/write.py b/kartothek/io_components/write.py
index eb630549..8c6b44f9 100644
--- a/kartothek/io_components/write.py
+++ b/kartothek/io_components/write.py
@@ -244,7 +244,8 @@ def store_dataset_from_partitions(


 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "add_partitions",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "add_partitions",
 )
 def update_metadata(dataset_builder, metadata_merger, add_partitions, dataset_metadata):
diff --git a/kartothek/serialization/_generic.py b/kartothek/serialization/_generic.py
index 931df72b..a58e842a 100644
--- a/kartothek/serialization/_generic.py
+++ b/kartothek/serialization/_generic.py
@@ -321,7 +321,7 @@ def _handle_categorical_data(array_like, require_ordered):
         else:
             categorical = array_like.cat
         array_value_type = categorical.categories.dtype
-        if categorical.categories.is_monotonic:
+        if categorical.categories.is_monotonic_increasing:
             array_like = categorical.as_ordered()
         else:
             array_like = categorical.reorder_categories(
@@ -392,6 +392,7 @@ def _ensure_type_stability(
             ("i", "u"),
             # various string kinds
             ("O", "S"),
+            ("S", "O"),
             ("O", "U"),
             # bool w/ Nones
             ("b", "O"),
@@ -521,7 +522,9 @@ def filter_array_like(
                 matching_idx |= pd.isnull(array_like)

             np.logical_and(
-                matching_idx, mask, out=out,
+                matching_idx,
+                mask,
+                out=out,
             )
         else:
             raise NotImplementedError("op not supported")
diff --git a/kartothek/serialization/_parquet.py b/kartothek/serialization/_parquet.py
index 41c550a5..3c808d43 100644
--- a/kartothek/serialization/_parquet.py
+++ b/kartothek/serialization/_parquet.py
@@ -62,7 +62,10 @@ def _reset_dictionary_columns(table, exclude=None):
             continue
         if pa.types.is_dictionary(field.type):
             new_field = pa.field(
-                field.name, field.type.value_type, field.nullable, field.metadata,
+                field.name,
+                field.type.value_type,
+                field.nullable,
+                field.metadata,
             )
             schema = schema.remove(i).insert(i, new_field)

@@ -316,7 +319,7 @@ def restore_dataframe(
                 )
                 # we don't sleep when we're done with the last attempt
                 if nb_retry < (MAX_NB_RETRIES - 1):
-                    time.sleep(BACKOFF_TIME * 2 ** nb_retry)
+                    time.sleep(BACKOFF_TIME * 2**nb_retry)

         raise ParquetReadError(
             f"Failed to restore dataframe after {MAX_NB_RETRIES} attempts. Parameters: "
@@ -632,4 +635,4 @@ def _epsilon(num):
     if epsilon_position < 0:
         epsilon_position += 1

-    return 10 ** epsilon_position
+    return 10**epsilon_position
diff --git a/kartothek/utils/migration_helpers.py b/kartothek/utils/migration_helpers.py
index 82e97f66..2498c6c4 100644
--- a/kartothek/utils/migration_helpers.py
+++ b/kartothek/utils/migration_helpers.py
@@ -142,7 +142,9 @@ def get_specific_function_deprecation_warning(


 def get_specific_function_deprecation_warning_multi_table(
-    function_name: str, deprecated_in: str, removed_in: Optional[str] = None,
+    function_name: str,
+    deprecated_in: str,
+    removed_in: Optional[str] = None,
 ):
     return get_specific_function_deprecation_warning(
         function_name=function_name,
@@ -233,7 +235,9 @@ def raise_warning(
     # gets original trace message if deprecators have been stacked
     warnings.warn(
         _assemble_warning_message(
-            parameter=parameter, message=warning, func_name=func_name,
+            parameter=parameter,
+            message=warning,
+            func_name=func_name,
         ),
         DeprecationWarning,
         stacklevel=stacklevel,
@@ -241,7 +245,9 @@ def raise_warning(


 def _make_decorator_stackable(
-    wrapper_func: Callable, base_func: Callable, exclude_parameters: Tuple[str],
+    wrapper_func: Callable,
+    base_func: Callable,
+    exclude_parameters: Tuple[str],
 ) -> Callable:
     """
     Attaches neccessary meta info directly to the decorator function's objects making multiple instance of these
@@ -539,9 +545,11 @@ def warn_logic() -> None:
             else list(inspect.signature(func).parameters.keys())
         )

-        parameters_mapping = {  # required for resolving optional parameters being passed as non-kwargs
-            parameter: func_args.index(parameter) for parameter in parameters
-        }
+        parameters_mapping = (
+            {  # required for resolving optional parameters being passed as non-kwargs
+                parameter: func_args.index(parameter) for parameter in parameters
+            }
+        )

         # _make_decorator_stackable required in order to be able to stack multiple of these decorators.
         return _make_decorator_stackable(
             wrapper_func=wraps_func, base_func=func, exclude_parameters=parameters
diff --git a/kartothek/utils/store.py b/kartothek/utils/store.py
index d58eec48..3f4350c5 100644
--- a/kartothek/utils/store.py
+++ b/kartothek/utils/store.py
@@ -13,10 +13,10 @@

 try:
     # azure-storage-blob < 12
-    from azure.storage.blob import BlockBlobService as _BlockBlobService
     from azure.common import (
         AzureMissingResourceHttpError as _AzureMissingResourceHttpError,
     )
+    from azure.storage.blob import BlockBlobService as _BlockBlobService
 except ImportError:

     class _BlockBlobService:  # type: ignore
@@ -32,8 +32,8 @@ class _AzureMissingResourceHttpError:  # type: ignore

 try:
     # azure-storage-blob >= 12
-    from azure.storage.blob import ContainerClient as _ContainerClient
     from azure.core.exceptions import ResourceNotFoundError as _ResourceNotFoundError
+    from azure.storage.blob import ContainerClient as _ContainerClient
 except ImportError:

     class _ContainerClient:  # type: ignore
diff --git a/pyproject.toml b/pyproject.toml
index 91fc8674..c0f47fbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ exclude = '''
 '''

 [tool.isort]
+profile = "black"
 multi_line_output = 3
 include_trailing_comma = true
 line_length = 88
diff --git a/reference-data/arrow-compat/6.0.2.parquet b/reference-data/arrow-compat/6.0.2.parquet
new file mode 100644
index 00000000..69dcf769
Binary files /dev/null and b/reference-data/arrow-compat/6.0.2.parquet differ
diff --git a/reference-data/arrow-compat/batch_generate_references.sh b/reference-data/arrow-compat/batch_generate_references.sh
index 3333ebc0..d55a0a5a 100755
--- a/reference-data/arrow-compat/batch_generate_references.sh
+++ b/reference-data/arrow-compat/batch_generate_references.sh
@@ -2,7 +2,7 @@

 # Note: this assumes you have kartothek installed in your current environment and you are using conda

-PYARROW_VERSIONS="0.14.1 0.15.0 0.16.0 0.17.1 1.0.1 2.0.0 3.0.0 4.0.1 5.0.0 6.0.1"
+PYARROW_VERSIONS="0.14.1 0.15.0 0.16.0 0.17.1 1.0.1 2.0.0 3.0.0 4.0.1 5.0.0 6.0.1 6.0.2"

 for pyarrow_version in $PYARROW_VERSIONS; do
     echo $pyarrow_version
diff --git a/tests/conftest.py b/tests/conftest.py
index 06bd5721..e86a6c31 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -300,7 +300,11 @@ def _get_meta_partitions_with_dataframe(
         table_name: df,
     }

-    mp = MetaPartition(label="cluster_1", data=data, metadata_version=metadata_version,)
+    mp = MetaPartition(
+        label="cluster_1",
+        data=data,
+        metadata_version=metadata_version,
+    )
     df_3 = pd.DataFrame(
         OrderedDict(
             [
@@ -320,7 +324,9 @@ def _get_meta_partitions_with_dataframe(
     }

     mp2 = MetaPartition(
-        label="cluster_2", data=data, metadata_version=metadata_version,
+        label="cluster_2",
+        data=data,
+        metadata_version=metadata_version,
     )
     return [mp, mp2]

@@ -478,7 +484,9 @@ def meta_partitions_dataframe_alternative_table_name(
     """
     with cm_frozen_time(TIME_TO_FREEZE):
         return _get_meta_partitions_with_dataframe(
-            metadata_version, table_name=alternative_table_name, table_name_2=None,
+            metadata_version,
+            table_name=alternative_table_name,
+            table_name_2=None,
         )
diff --git a/tests/core/test_dataset_dyn_part.py b/tests/core/test_dataset_dyn_part.py
index c33e395e..e81a4cca 100644
--- a/tests/core/test_dataset_dyn_part.py
+++ b/tests/core/test_dataset_dyn_part.py
@@ -338,6 +338,7 @@ def test_dask_partitions(metadata_version):
     """
     import dask.dataframe

+    partition_suffix = "suffix"
     bucket_dir = tempfile.mkdtemp()
     dataset_uuid = "uuid+namespace-attribute12_underscored"
     os.mkdir("{}/{}".format(bucket_dir, dataset_uuid))
@@ -384,6 +385,15 @@ def test_dask_partitions(metadata_version):
     metadata.update(expected_partitions)
     metadata.update(expected_tables)

+    store_schema_metadata(
+        make_meta(
+            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
+            origin="stored",
+        ),
+        dataset_uuid,
+        store,
+        "core",
+    )
     dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
     actual_partitions = dmd.to_dict()["partitions"]
     # we partition on location ID which has two values
diff --git a/tests/io/cube/test_append.py b/tests/io/cube/test_append.py
index ab94e80f..6edb5516 100644
--- a/tests/io/cube/test_append.py
+++ b/tests/io/cube/test_append.py
@@ -1,10 +1,10 @@
 import pytest
-from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write

 from kartothek.io.dask.bag_cube import append_to_cube_from_bag
 from kartothek.io.dask.dataframe_cube import append_to_cube_from_dataframe
 from kartothek.io.eager_cube import append_to_cube
 from kartothek.io.testing.append_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write


 @pytest.fixture
diff --git a/tests/io/cube/test_build.py b/tests/io/cube/test_build.py
index a62a2938..6a27f181 100644
--- a/tests/io/cube/test_build.py
+++ b/tests/io/cube/test_build.py
@@ -5,13 +5,13 @@
 import dask.core
 import pandas as pd
 import pytest
-from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write

 from kartothek.core.cube.cube import Cube
 from kartothek.io.dask.bag_cube import build_cube_from_bag
 from kartothek.io.dask.dataframe_cube import build_cube_from_dataframe
 from kartothek.io.eager_cube import build_cube
 from kartothek.io.testing.build_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write


 @pytest.fixture
diff --git a/tests/io/cube/test_cleanup.py b/tests/io/cube/test_cleanup.py
index 67ce89ed..7d84fb53 100644
--- a/tests/io/cube/test_cleanup.py
+++ b/tests/io/cube/test_cleanup.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 import pytest
-from tests.io.cube.utils import wrap_bag_delete

 from kartothek.io.dask.bag_cube import cleanup_cube_bag
 from kartothek.io.eager_cube import cleanup_cube
 from kartothek.io.testing.cleanup_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_delete


 @pytest.fixture
diff --git a/tests/io/cube/test_copy.py b/tests/io/cube/test_copy.py
index 5c2f2fcf..01b898c5 100644
--- a/tests/io/cube/test_copy.py
+++ b/tests/io/cube/test_copy.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 import pytest
-from tests.io.cube.utils import wrap_bag_copy

 from kartothek.io.dask.bag_cube import copy_cube_bag
 from kartothek.io.eager_cube import copy_cube
 from kartothek.io.testing.copy_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_copy


 @pytest.fixture
diff --git a/tests/io/cube/test_delete.py b/tests/io/cube/test_delete.py
index 558366a7..cae37b6b 100644
--- a/tests/io/cube/test_delete.py
+++ b/tests/io/cube/test_delete.py
@@ -1,9 +1,9 @@
 import pytest
-from tests.io.cube.utils import wrap_bag_delete

 from kartothek.io.dask.bag_cube import delete_cube_bag
 from kartothek.io.eager_cube import delete_cube
 from kartothek.io.testing.delete_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_delete


 @pytest.fixture
diff --git a/tests/io/cube/test_extend.py b/tests/io/cube/test_extend.py
index 7ff863ca..34e464ab 100644
--- a/tests/io/cube/test_extend.py
+++ b/tests/io/cube/test_extend.py
@@ -5,13 +5,13 @@
 import dask.core
 import pandas as pd
 import pytest
-from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write

 from kartothek.core.cube.cube import Cube
 from kartothek.io.dask.bag_cube import extend_cube_from_bag
 from kartothek.io.dask.dataframe_cube import extend_cube_from_dataframe
 from kartothek.io.eager_cube import extend_cube
 from kartothek.io.testing.extend_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_write, wrap_ddf_write


 @pytest.fixture
diff --git a/tests/io/cube/test_query.py b/tests/io/cube/test_query.py
index 512ee649..071c7fbb 100644
--- a/tests/io/cube/test_query.py
+++ b/tests/io/cube/test_query.py
@@ -1,10 +1,10 @@
 import pytest
-from tests.io.cube.utils import wrap_bag_read, wrap_ddf_read

 from kartothek.io.dask.bag_cube import query_cube_bag
 from kartothek.io.dask.dataframe_cube import query_cube_dataframe
 from kartothek.io.eager_cube import query_cube
 from kartothek.io.testing.query_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_read, wrap_ddf_read


 @pytest.fixture(scope="session")
diff --git a/tests/io/cube/test_stats.py b/tests/io/cube/test_stats.py
index 218f36a9..4088d09d 100644
--- a/tests/io/cube/test_stats.py
+++ b/tests/io/cube/test_stats.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 import pytest
-from tests.io.cube.utils import wrap_bag_stats

 from kartothek.io.dask.bag_cube import collect_stats_bag
 from kartothek.io.eager_cube import collect_stats
 from kartothek.io.testing.stats_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_stats


 @pytest.fixture
diff --git a/tests/io/cube/test_update.py b/tests/io/cube/test_update.py
index 10bcb790..0612dc47 100644
--- a/tests/io/cube/test_update.py
+++ b/tests/io/cube/test_update.py
@@ -1,8 +1,8 @@
 import pytest
-from tests.io.cube.utils import wrap_bag_write

 from kartothek.io.dask.bag_cube import update_cube_from_bag
 from kartothek.io.testing.update_cube import *  # noqa
+from tests.io.cube.utils import wrap_bag_write


 @pytest.fixture
diff --git a/tests/io/dask/dataframe/test_shuffle.py b/tests/io/dask/dataframe/test_shuffle.py
index 2b281722..b409ded7 100644
--- a/tests/io/dask/dataframe/test_shuffle.py
+++ b/tests/io/dask/dataframe/test_shuffle.py
@@ -202,4 +202,4 @@ def test_update_shuffle_buckets(
     ):
         df = data_dct["core"]
         assert len(df.primary.unique()) == 1
-        assert df.sorted_column.is_monotonic
+        assert df.sorted_column.is_monotonic_increasing
diff --git a/tests/io/dask/dataframe/test_stats.py b/tests/io/dask/dataframe/test_stats.py
index 0a69044e..47bf7b16 100644
--- a/tests/io/dask/dataframe/test_stats.py
+++ b/tests/io/dask/dataframe/test_stats.py
@@ -83,7 +83,10 @@ def test_collect_dataset_metadata_predicates_on_index(store_factory):
         data={"P": range(10), "L": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"]}
     )
     store_dataframes_as_dataset(
-        store=store_factory, dataset_uuid="dataset_uuid", partition_on=["L"], dfs=[df],
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        partition_on=["L"],
+        dfs=[df],
     )
     predicates = [[("L", "==", "b")]]

@@ -211,7 +214,9 @@ def test_collect_dataset_metadata_empty_dataset(store_factory):
         store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"]
     )
     df_stats = collect_dataset_metadata(
-        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        table_name="table",
     ).compute()
     expected = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
     expected = expected.astype(_METADATA_SCHEMA)
@@ -225,7 +230,9 @@ def test_collect_dataset_metadata_concat(store_factory):
         store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"]
     )
     df_stats1 = collect_dataset_metadata(
-        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        table_name="table",
     ).compute()

     # Remove all partitions of the dataset
@@ -234,7 +241,9 @@ def test_collect_dataset_metadata_concat(store_factory):
     )

     df_stats2 = collect_dataset_metadata(
-        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        table_name="table",
     ).compute()

     pd.concat([df_stats1, df_stats2])
@@ -250,7 +259,9 @@ def test_collect_dataset_metadata_delete_dataset(store_factory):
     )

     df_stats = collect_dataset_metadata(
-        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        table_name="table",
     ).compute()
     expected = pd.DataFrame(columns=_METADATA_SCHEMA)
     expected = expected.astype(_METADATA_SCHEMA)
@@ -261,7 +272,10 @@ def test_collect_dataset_metadata_fraction_precision(store_factory):
     df = pd.DataFrame(data={"A": range(100), "B": range(100)})

     store_dataframes_as_dataset(
-        store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"],
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        dfs=[df],
+        partition_on=["A"],
     )  # Creates 100 partitions
     df_stats = collect_dataset_metadata(
@@ -277,7 +291,10 @@ def test_collect_dataset_metadata_at_least_one_partition(store_factory):
     df = pd.DataFrame(data={"A": range(100), "B": range(100)})

     store_dataframes_as_dataset(
-        store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"],
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        dfs=[df],
+        partition_on=["A"],
     )  # Creates 100 partitions
     df_stats = collect_dataset_metadata(
@@ -302,7 +319,9 @@ def test_collect_dataset_metadata_table_without_partition(store_factory):
     )

     df_stats = collect_dataset_metadata(
-        store=store_factory, dataset_uuid="dataset_uuid", table_name="table2",
+        store=store_factory,
+        dataset_uuid="dataset_uuid",
+        table_name="table2",
     ).compute()
     actual = df_stats.drop(
         columns=[
diff --git a/tests/io/eager/test_copy.py b/tests/io/eager/test_copy.py
index 53be64fd..ddd7b287 100644
--- a/tests/io/eager/test_copy.py
+++ b/tests/io/eager/test_copy.py
@@ -26,7 +26,10 @@ def assert_target_ktk_readable(tgt_store, tgt_ds):
     """
     Try to read the target dataset using high level KTK functionality
     """
-    df_result = read_table(store=tgt_store, dataset_uuid=tgt_ds,)
+    df_result = read_table(
+        store=tgt_store,
+        dataset_uuid=tgt_ds,
+    )
     assert df_result is not None
     assert len(df_result) == 10
     df_result = read_table(
@@ -45,11 +48,13 @@ def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
     values are equal to the source data set (or modified as expected)
     """
     df_source = DatasetFactory(
-        dataset_uuid=src_uuid, store_factory=lazy_store(src_store),
+        dataset_uuid=src_uuid,
+        store_factory=lazy_store(src_store),
     )
     src_keys = get_dataset_keys(df_source.dataset_metadata)
     df_target = DatasetFactory(
-        dataset_uuid=tgt_uuid, store_factory=lazy_store(tgt_store),
+        dataset_uuid=tgt_uuid,
+        store_factory=lazy_store(tgt_store),
     )
     tgt_keys = get_dataset_keys(df_target.dataset_metadata)
diff --git a/tests/io_components/test_metapartition.py b/tests/io_components/test_metapartition.py
index ae4ae1ff..402a90b1 100644
--- a/tests/io_components/test_metapartition.py
+++ b/tests/io_components/test_metapartition.py
@@ -1164,7 +1164,7 @@ def test_partition_on_stable_order():
     new_mp = mp.partition_on("partition_key")
     for sub_mp in new_mp:
         sub_df = sub_mp.data["table"]
-        assert sub_df.sorted_col.is_monotonic
+        assert sub_df.sorted_col.is_monotonic_increasing


 def test_table_meta(store):
@@ -1676,8 +1676,14 @@ def test_parse_input_schema_formats():
 def test_get_parquet_metadata(store):
     df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
-    mp = MetaPartition(label="test_label", data={"core": df},)
-    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",)
+    mp = MetaPartition(
+        label="test_label",
+        data={"core": df},
+    )
+    meta_partition = mp.store_dataframes(
+        store=store,
+        dataset_uuid="dataset_uuid",
+    )

     actual = meta_partition.get_parquet_metadata(store=store, table_name="core")
     actual.drop(labels="serialized_size", axis=1, inplace=True)
@@ -1698,8 +1704,14 @@ def test_get_parquet_metadata(store):
 def test_get_parquet_metadata_empty_df(store):
     df = pd.DataFrame()
-    mp = MetaPartition(label="test_label", data={"core": df},)
-    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",)
+    mp = MetaPartition(
+        label="test_label",
+        data={"core": df},
+    )
+    meta_partition = mp.store_dataframes(
+        store=store,
+        dataset_uuid="dataset_uuid",
+    )

     actual = meta_partition.get_parquet_metadata(store=store, table_name="core")
     actual.drop(
@@ -1727,7 +1739,10 @@ def test_get_parquet_metadata_empty_df(store):
 def test_get_parquet_metadata_row_group_size(store):
     df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
-    mp = MetaPartition(label="test_label", data={"core": df},)
+    mp = MetaPartition(
+        label="test_label",
+        data={"core": df},
+    )
     ps = ParquetSerializer(chunk_size=5)

     meta_partition = mp.store_dataframes(
@@ -1758,8 +1773,14 @@ def test_get_parquet_metadata_row_group_size(store):
 def test_get_parquet_metadata_table_name_not_str(store):
     df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
-    mp = MetaPartition(label="test_label", data={"core": df, "another_table": df},)
-    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid",)
+    mp = MetaPartition(
+        label="test_label",
+        data={"core": df, "another_table": df},
+    )
+    meta_partition = mp.store_dataframes(
+        store=store,
+        dataset_uuid="dataset_uuid",
+    )

     with pytest.raises(TypeError):
         meta_partition.get_parquet_metadata(
diff --git a/tests/io_components/test_utils.py b/tests/io_components/test_utils.py
index f17fc5b6..11980857 100644
--- a/tests/io_components/test_utils.py
+++ b/tests/io_components/test_utils.py
@@ -228,7 +228,7 @@ def test_sort_cateogrical():
     expected_values = sorted(values)

     assert all(sorted_df["cat"].values == expected_values)
-    assert sorted_df["cat"].is_monotonic
+    assert sorted_df["cat"].is_monotonic_increasing
     assert sorted_df["cat"].cat.ordered
     assert all(sorted_df["cat"].cat.categories == sorted(categories))
diff --git a/tests/serialization/test_arrow_compat.py b/tests/serialization/test_arrow_compat.py
index 55725a21..86105495 100644
--- a/tests/serialization/test_arrow_compat.py
+++ b/tests/serialization/test_arrow_compat.py
@@ -25,6 +25,7 @@
     "4.0.1",
     "5.0.0",
     "6.0.1",
+    "6.0.2",
 ]
diff --git a/tests/serialization/test_dataframe.py b/tests/serialization/test_dataframe.py
index ed45953e..2f9c1fcb 100644
--- a/tests/serialization/test_dataframe.py
+++ b/tests/serialization/test_dataframe.py
@@ -236,7 +236,7 @@ def test_predicate_pushdown(
     """
     # All test dataframes need to have the same length
     assert len(df) == 4
-    assert df[df.columns[0]].is_monotonic and df.iloc[0, 0] < df.iloc[-1, 0]
+    assert df[df.columns[0]].is_monotonic_increasing and df.iloc[0, 0] < df.iloc[-1, 0]

     # This is due to the limitation that dates cannot be expressed in
     # Pandas' query() method.
diff --git a/tests/serialization/test_io_buffer.py b/tests/serialization/test_io_buffer.py
index e710af5d..35cae6f7 100644
--- a/tests/serialization/test_io_buffer.py
+++ b/tests/serialization/test_io_buffer.py
@@ -280,10 +280,10 @@ def test_empty(blocksize):


 def test_giga():
-    raw_size = 100 * 1024 ** 3  # 100 GB
+    raw_size = 100 * 1024**3  # 100 GB
     raw_inner = _ZeroFile(raw_size)
     raw = _ReadRecordWrapper(raw_inner)
-    blocksize = 4 * 1024 ** 2  # 4MB
+    blocksize = 4 * 1024**2  # 4MB
     b = BlockBuffer(raw, blocksize)

     assert b.size == raw_size
diff --git a/tests/serialization/test_parquet.py b/tests/serialization/test_parquet.py
index cad38c8d..9977fc83 100644
--- a/tests/serialization/test_parquet.py
+++ b/tests/serialization/test_parquet.py
@@ -388,7 +388,7 @@ def test_int64_statistics_overflow(reference_store, predicate_pushdown_to_io):
         ([0, 4, 1], True),
         ([-2, 44], False),
         ([-3, 0], True),
-        ([-1, 10 ** 4], False),
+        ([-1, 10**4], False),
         ([2, 3], True),
         ([-1, 20], True),
         ([-30, -5, 50, 10], True),
diff --git a/tests/utils/test_ktk_adapters.py b/tests/utils/test_ktk_adapters.py
index 16cd6867..07d791d0 100644
--- a/tests/utils/test_ktk_adapters.py
+++ b/tests/utils/test_ktk_adapters.py
@@ -30,7 +30,11 @@ def cube_has_ts_col(request):

 @pytest.fixture
 def cube():
-    return Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube",)
+    return Cube(
+        dimension_columns=["x"],
+        partition_columns=["p"],
+        uuid_prefix="cube",
+    )


 @pytest.fixture
diff --git a/tests/utils/test_migration_helpers.py b/tests/utils/test_migration_helpers.py
index 3d443545..58b1951e 100644
--- a/tests/utils/test_migration_helpers.py
+++ b/tests/utils/test_migration_helpers.py
@@ -38,7 +38,8 @@ def default_parameter_deprecation_texts_generic(request) -> str:


 @pytest.fixture(
-    scope="class", params=[DEPRECATION_WARNING_REMOVE_FUNCTION_GENERIC_VERSION],
+    scope="class",
+    params=[DEPRECATION_WARNING_REMOVE_FUNCTION_GENERIC_VERSION],
 )
 def default_function_deprecation_texts_generic(request) -> str:
     return request.param
@@ -279,7 +280,9 @@ def test_deprecate_parameter_stacked_nested(
     # check: Only the first stacked deprecator construct in the callstack should raise warnings.
     with pytest.warns(DeprecationWarning) as warn_record:
         result = func_non_optional_params_multiple_params_stacked_nested(
-            func_non_optional_params_multiple_params_stacked_nested_counterparts, 0, 1,
+            func_non_optional_params_multiple_params_stacked_nested_counterparts,
+            0,
+            1,
         )

     # ensures, that the second - nested - deprecator construct does not raise warnings
@@ -298,7 +301,9 @@ def test_deprecate_parameter_stacked_nested_inverse(
     # check: Only the first stacked deprecator construct in the callstack should raise warnings.
     with pytest.warns(DeprecationWarning) as warn_record:
         result = func_non_optional_params_multiple_params_stacked_nested_inverse(
-            func_non_optional_params_multiple_params_stacked_nested_counterparts, 0, 1,
+            func_non_optional_params_multiple_params_stacked_nested_counterparts,
+            0,
+            1,
         )

     # ensures, that the second - nested - deprecator construct does not raise warnings
@@ -422,7 +427,8 @@ def test_generic_deprecation_warning(default_deprecation_texts_generic: str):


 @pytest.fixture(
-    scope="class", params=[False, True],
+    scope="class",
+    params=[False, True],
 )
 def is_test_optional_parameters(request) -> bool:
     return request.param
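
Note on the pandas API change behind the `is_monotonic` replacements above: `Series.is_monotonic` is an alias of `Series.is_monotonic_increasing`, deprecated as of pandas 1.5, so only the explicit name keeps working on newer releases. A minimal sketch of the equivalence (illustrative only, not part of the patch; the series values are made up):

    import pandas as pd

    s = pd.Series([1, 2, 2, 3])

    # Preferred spelling, available on both old and new pandas:
    # True for a non-strictly increasing series.
    assert s.is_monotonic_increasing

    # Deprecated alias: warns on pandas 1.5 and is removed in pandas 2.x.
    # assert s.is_monotonic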