Fix GitHub actions #532

Open · wants to merge 3 commits into master

Changes from all commits
8 changes: 5 additions & 3 deletions .github/workflows/ci-pre-commit.yml
@@ -6,6 +6,8 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-      - uses: pre-commit/[email protected]
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - uses: pre-commit/[email protected]

8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -24,16 +24,16 @@ jobs:
       matrix:
         numfocus_nightly: [false]
         os: ["ubuntu-latest"]
-        pyarrow: ["0.17.1", "1.0.1", "2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "nightly"]
-        python: ["3.7", "3.8"]
+        pyarrow: ["2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "6.0.2", "nightly"]
+        python: ["3.8"]
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pyarrow: "2.0.0"
+            pyarrow: "6.0.2"
             python: "3.8"
           - numfocus_nightly: false
             os: "macos-latest"
-            pyarrow: "0.17.1"
+            pyarrow: "5.0.0"
             python: "3.8"
     continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}

7 changes: 4 additions & 3 deletions .github/workflows/docs.yml
@@ -9,11 +9,12 @@ jobs:
       - name: Checkout source
         uses: actions/checkout@v2

-      - name: Setup Conda Environment
-        uses: conda-incubator/[email protected]
+      - name: Mamba Docs environment
+        uses: mamba-org/provision-with-micromamba@main
         with:
           environment-file: docs/environment-docs.yml
-          activate-environment: kartothek-docs
+          environment-name: kartothek-docs
+          cache-downloads: true

       - name: List conda
         shell: bash -l {0}

12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -1,27 +1,27 @@
 repos:
-  - repo: https://github.com/ambv/black
-    rev: 19.10b0
+  - repo: https://github.com/psf/black
+    rev: 22.10.0
     hooks:
       - id: black
         args:
          - --safe
-          - --target-version=py36
+          - --target-version=py38

   - repo: https://github.com/asottile/blacken-docs
     rev: v1.7.0
     hooks:
       - id: blacken-docs
         additional_dependencies: [black==19.10b0]
         args:
-          - --target-version=py36
+          - --target-version=py38

-  - repo: https://gitlab.com/pycqa/flake8
+  - repo: https://github.com/PyCQA/flake8
     rev: 3.8.3
     hooks:
       - id: flake8

   - repo: https://github.com/pre-commit/mirrors-isort
-    rev: v4.3.21
+    rev: v5.10.1
     hooks:
       - id: isort
         additional_dependencies: [toml]

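Most of the Python churn in the benchmark and library files below is mechanical fallout from this Black bump (19.10b0 to 22.10.0); the hooks otherwise behave as before. Two Black 22.x rules account for it, illustrated on a hypothetical snippet that is not taken from the repository: simple operands now hug the power operator, and a call that ends in a magic trailing comma is exploded onto one argument per line.

import pandas as pd

# black 19.10b0 left this untouched:
#     n = 10 ** 5
#     df = pd.DataFrame(data={"x": [0, 1]}, columns=["x"],)
# black 22.10.0 reformats the same code to:
n = 10**5  # simple operands hug **
df = pd.DataFrame(  # the magic trailing comma forces one argument per line
    data={"x": [0, 1]},
    columns=["x"],
)
print(len(df), n)
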
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/filter.py
@@ -26,7 +26,7 @@ def setup(self, predicate):
             [("int32", "<", 321)],
             [("int32", "<", 321)],
         ]
-        self.df = get_dataframe_not_nested(10 ** 5)
+        self.df = get_dataframe_not_nested(10**5)

     def time_filter_df_from_predicates(self, predicate):
         filter_df_from_predicates(self.df, self.predicate)
@@ -48,7 +48,7 @@ def setup(self, column):
         if column == "null":
             raise NotImplementedError()
         self.arr = (
-            get_dataframe_not_nested(10 ** 5)
+            get_dataframe_not_nested(10**5)
             .sample(frac=1.0)
             .reset_index(drop=True)[column]
             .values
@@ -69,7 +69,7 @@ class TimeFilterArrayIn:
     params = (
         cols_to_filter,
         [10, 100, 1000],
-        [10 ** 4, 10 ** 5, 10 ** 6],
+        [10**4, 10**5, 10**6],
     )
     param_names = ["column", "filter_size", "array_size", "enabled"]

8 changes: 4 additions & 4 deletions asv_bench/benchmarks/index.py
@@ -60,8 +60,8 @@ def teardown(self, number_values, number_partitions, dtype):

 class Index(IndexBase):
     params = (
-        [10 * 1, 10 ** 3],  # values
-        [10 * 1, 10 ** 3],  # partitions
+        [10 * 1, 10**3],  # values
+        [10 * 1, 10**3],  # partitions
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values", "number_partitions", "dtype"]
@@ -93,7 +93,7 @@ def time_observed_values(self, number_values, number_partitions, arrow_type):
 class SerializeIndex(IndexBase):
     timeout = 180
     params = (
-        [(10 ** 3, 10), (10 ** 4, 100)],  # (values, partitions)
+        [(10**3, 10), (10**4, 100)],  # (values, partitions)
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values__number_partitions", "dtype"]
@@ -117,7 +117,7 @@ def time_serialization(self, number_values__number_partitions, arrow_type):


 class BuildIndex(AsvBenchmarkConfig):
-    params = ([-1, 1], [10 ** 3, 10 ** 4], [10, 100])
+    params = ([-1, 1], [10**3, 10**4], [10, 100])
     param_names = ["cardinality", "num_values", "partitions_to_merge"]

     def setup(self, cardinality, num_values, partitions_to_merge):

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/metapartition.py
@@ -16,7 +16,7 @@

 class TimeMetaPartition(AsvBenchmarkConfig):
     params = (
-        [10 ** 5, 10 ** 6],
+        [10**5, 10**6],
         [
             (np.int64, 123456789),
             (str, "abcdefgh"),

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/predicate_pushdown.py
@@ -10,7 +10,7 @@ class TimeRestore:
     of iterating over dictionaries in Python.
     """

-    params = [(10 ** 3, 10 ** 4), (10, 10 ** 2, 10 ** 3)]
+    params = [(10**3, 10**4), (10, 10**2, 10**3)]
     param_names = ["num_rows", "chunk_size"]

     def setup(self, num_rows, chunk_size):

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/schema.py
@@ -23,7 +23,7 @@ def time_make_meta(self):

 class TimeValidateCompatible(AsvBenchmarkConfig):

-    params = ([2, 10 ** 2, 10 ** 3, 10 ** 4], [True, False])
+    params = ([2, 10**2, 10**3, 10**4], [True, False])
     timeout = 120.0

     param_names = ["num_schemas", "has_na"]
@@ -50,7 +50,7 @@ def time_validate_compatible(self, num_schemas, has_na):


 class TimeValidateSharedColumns(AsvBenchmarkConfig):
-    params = [2, 10 ** 2]
+    params = [2, 10**2]
     timeout = 120.0

     param_names = ["num_schemas"]

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/write.py
@@ -55,7 +55,7 @@ def generate_metadata(max_depth=7, num_leafs=5):

 class TimeStoreDataset(AsvBenchmarkConfig):
     timeout = 120
-    params = ([10, 10 ** 2, 10 ** 3], [4], [2, 4])
+    params = ([10, 10**2, 10**3], [4], [2, 4])
     param_names = ["num_partitions", "max_depth", "num_leafs"]

     def setup(self, num_partitions, max_depth, num_leafs):
@@ -76,7 +76,7 @@ def time_store_dataset_from_partitions(self, num_partitions, max_depth, num_leaf

 class TimePersistMetadata(AsvBenchmarkConfig):
     timeout = 240
-    params = [1, 10 ** 2, 10 ** 3]
+    params = [1, 10**2, 10**3]

     def setup(self, num_partitions):
         self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))

2 changes: 2 additions & 0 deletions docs/environment-docs.yml
@@ -2,6 +2,7 @@ name: kartothek-docs
 channels:
   - conda-forge
 dependencies:
+  - python=3.8
   - dask[dataframe]
   - decorator
   - msgpack-python>=0.5.2
@@ -10,6 +11,7 @@ dependencies:
   - pandas>=0.23.0, !=1.0.0
   - pyarrow>=0.17.1,!=1.0.0, <4
   - simplejson
+  - jinja2<3.1
   - simplekv
   - storefact
   - toolz

3 changes: 2 additions & 1 deletion kartothek/api/discover.py
@@ -275,7 +275,8 @@ def discover_cube(
     if len(partition_keys) == 0:
         raise ValueError(
             'Seed dataset ("{seed_dataset}") has no partition keys.'.format(  # type: ignore # noqa
-                seed_dataset=seed_dataset, partition_keys=", ".join(partition_keys),
+                seed_dataset=seed_dataset,
+                partition_keys=", ".join(partition_keys),
             )
         )
     elif len(partition_keys) < 2:

7 changes: 5 additions & 2 deletions kartothek/core/dataset.py
@@ -280,7 +280,8 @@ def load_index(self: T, column: str, store: StoreInput) -> T:
         return self.copy(indices=indices)

     @deprecate_parameters_if_set(
-        DEPRECATION_WARNING_REMOVE_PARAMETER, "load_partition_indices",
+        DEPRECATION_WARNING_REMOVE_PARAMETER,
+        "load_partition_indices",
     )
     def load_all_indices(
         self: T, store: StoreInput, load_partition_indices: bool = True
@@ -446,7 +447,9 @@ def get_indices_as_dataframe(
             )
         else:
             df = dm._evaluate_conjunction(
-                columns=columns, predicates=None, date_as_object=date_as_object,
+                columns=columns,
+                predicates=None,
+                date_as_object=date_as_object,
             )
         return df

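The decorator whose calls are being reflowed here and in kartothek/core/factory.py, kartothek/io/dask/dataframe.py, and kartothek/io/eager.py is deprecate_parameters_if_set(warning_template, *parameter_names). As a minimal sketch of how a decorator with that call shape can be implemented; this is a hypothetical re-implementation for illustration, not kartothek's actual helper, and the {parameter} placeholder in the template is an assumption:

import functools
import inspect
import warnings


def deprecate_parameters_if_set(warning_template, *parameters):
    """Warn if a caller explicitly passes one of the named parameters."""

    def decorator(func):
        signature = inspect.signature(func)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # bind_partial reveals which parameters the caller actually supplied
            passed = signature.bind_partial(*args, **kwargs).arguments
            for name in parameters:
                if name in passed:
                    warnings.warn(
                        warning_template.format(parameter=name),
                        DeprecationWarning,
                        stacklevel=2,
                    )
            return func(*args, **kwargs)

        return wrapper

    return decorator

With this shape, a decorated load_all_indices stays silent unless the caller passes load_partition_indices explicitly, which is exactly when a deprecation message is useful.
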
4 changes: 3 additions & 1 deletion kartothek/core/factory.py
@@ -165,7 +165,9 @@ def load_index(self: T, column, store=None) -> T:
         "load_partition_indices",
     )
     def load_all_indices(
-        self: T, store: Any = None, load_partition_indices: bool = True,
+        self: T,
+        store: Any = None,
+        load_partition_indices: bool = True,
     ) -> T:
         self._cache_metadata = self.dataset_metadata.load_all_indices(
             self.store, load_partition_indices=load_partition_indices

4 changes: 2 additions & 2 deletions kartothek/io/dask/_sizeof.py
@@ -6,12 +6,12 @@ def _dct_sizeof(obj):


 def register_sizeof_ktk_classes():
+    from kartothek.core.common_metadata import SchemaWrapper
     from kartothek.core.dataset import DatasetMetadata
     from kartothek.core.factory import DatasetFactory
-    from kartothek.io_components.metapartition import MetaPartition
     from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex
     from kartothek.core.partition import Partition
-    from kartothek.core.common_metadata import SchemaWrapper
+    from kartothek.io_components.metapartition import MetaPartition

     dask_sizeof.register(DatasetMetadata, _dct_sizeof)
     dask_sizeof.register(DatasetFactory, _dct_sizeof)

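For context on what register_sizeof_ktk_classes wires up: dask's sizeof is a single-dispatch registry, and a plausible reading of _dct_sizeof is that it sizes an object via its attribute dictionary. A minimal sketch with an assumed stand-in class, not kartothek's actual types or implementation:

from dask.sizeof import sizeof as dask_sizeof


def _dct_sizeof(obj):
    # approximate the object's footprint by the size of its __dict__
    return dask_sizeof(obj.__dict__)


class PartitionLike:
    def __init__(self, label, files):
        self.label = label
        self.files = files


dask_sizeof.register(PartitionLike, _dct_sizeof)

print(dask_sizeof(PartitionLike("p=0/part", {"table": "uuid/table/part.parquet"})))
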
3 changes: 1 addition & 2 deletions kartothek/io/dask/compression.py
@@ -10,8 +10,7 @@

 try:
     # Technically distributed is an optional dependency
-    from distributed.protocol import serialize_bytes
-    from distributed.protocol import deserialize_bytes
+    from distributed.protocol import deserialize_bytes, serialize_bytes

     HAS_DISTRIBUTED = True
 except ImportError:

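The consolidated import above is isort 5's doing; the module keeps its optional-dependency guard around distributed. A minimal sketch of how such a guard is typically consumed, with illustrative helper names rather than kartothek's actual API (which operates on dask DataFrames):

try:
    # distributed is an optional dependency
    from distributed.protocol import deserialize_bytes, serialize_bytes

    HAS_DISTRIBUTED = True
except ImportError:
    HAS_DISTRIBUTED = False


def pack(obj):
    """Serialize with distributed when it is installed, otherwise pass through."""
    return serialize_bytes(obj) if HAS_DISTRIBUTED else obj


def unpack(blob):
    """Inverse of pack."""
    return deserialize_bytes(blob) if HAS_DISTRIBUTED else blob
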
3 changes: 2 additions & 1 deletion kartothek/io/dask/dataframe.py
@@ -281,7 +281,8 @@ def _shuffle_docs(func):
 @default_docs
 @_shuffle_docs
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "delete_scope",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "delete_scope",
 )
 def store_dataset_from_ddf(
     ddf: dd.DataFrame,

9 changes: 7 additions & 2 deletions kartothek/io/eager.py
@@ -428,7 +428,9 @@ def read_table(
 @default_docs
 @normalize_args
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "output_dataset_uuid", "df_serializer",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "output_dataset_uuid",
+    "df_serializer",
 )
 def commit_dataset(
     store: Optional[StoreInput] = None,
@@ -712,7 +714,10 @@ def create_empty_dataset_header(
     "df_serializer",
 )
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "metadata", "overwrite", "metadata_merger",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "metadata",
+    "overwrite",
+    "metadata_merger",
 )
 def write_single_partition(
     store: Optional[KeyValueStore] = None,

34 changes: 26 additions & 8 deletions kartothek/io/testing/append_cube.py
@@ -116,7 +116,10 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
     """

     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -127,7 +130,8 @@

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},
@@ -157,17 +161,27 @@
     """

     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
-        data=df, cube=cube, store=function_store,
+        data=df,
+        cube=cube,
+        store=function_store,
     )

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
-    result = driver(data={"seed": df_append}, cube=cube, store=function_store,)
+    result = driver(
+        data={"seed": df_append},
+        cube=cube,
+        store=function_store,
+    )
     dataset = result["seed"].load_all_indices(function_store())

     part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
@@ -187,7 +201,10 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):
     unnecessarily.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -198,7 +215,8 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},

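The tests reflowed above exercise passing a df_serializer so that appended partitions are written with bounded Parquet row groups. A short sketch of the serializer such tests hand to the driver, assuming kartothek.serialization.ParquetSerializer and its chunk_size argument; treat the exact signature as an assumption to verify against the installed version:

from kartothek.serialization import ParquetSerializer

# cap Parquet row groups at two rows each, as the row-group tests above expect
serializer = ParquetSerializer(chunk_size=2)

# the tests then pass it along, e.g. driver(..., df_serializer=serializer)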