Fix GitHub actions #532

Open · wants to merge 3 commits into master

Changes from all commits
8 changes: 5 additions & 3 deletions .github/workflows/ci-pre-commit.yml
@@ -6,6 +6,8 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-      - uses: pre-commit/[email protected]
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - uses: pre-commit/[email protected]

8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
@@ -24,16 +24,16 @@ jobs:
       matrix:
         numfocus_nightly: [false]
         os: ["ubuntu-latest"]
-        pyarrow: ["0.17.1", "1.0.1", "2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "nightly"]
-        python: ["3.7", "3.8"]
+        pyarrow: ["2.0.0", "3.0.0", "4.0.1", "5.0.0", "6.0.1", "6.0.2", "nightly"]
+        python: ["3.8"]
         include:
           - numfocus_nightly: true
             os: "ubuntu-latest"
-            pyarrow: "2.0.0"
+            pyarrow: "6.0.2"
             python: "3.8"
           - numfocus_nightly: false
             os: "macos-latest"
-            pyarrow: "0.17.1"
+            pyarrow: "5.0.0"
             python: "3.8"
     continue-on-error: ${{ matrix.numfocus_nightly || matrix.pyarrow == 'nightly' }}

7 changes: 4 additions & 3 deletions .github/workflows/docs.yml
@@ -9,11 +9,12 @@ jobs:
       - name: Checkout source
         uses: actions/checkout@v2

-      - name: Setup Conda Environment
-        uses: conda-incubator/[email protected]
+      - name: Mamba Docs environment
+        uses: mamba-org/provision-with-micromamba@main
         with:
           environment-file: docs/environment-docs.yml
-          activate-environment: kartothek-docs
+          environment-name: kartothek-docs
+          cache-downloads: true

       - name: List conda
         shell: bash -l {0}

12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -1,27 +1,27 @@
 repos:
-  - repo: https://github.com/ambv/black
-    rev: 19.10b0
+  - repo: https://github.com/psf/black
+    rev: 22.10.0
     hooks:
       - id: black
         args:
          - --safe
-          - --target-version=py36
+          - --target-version=py38

   - repo: https://github.com/asottile/blacken-docs
     rev: v1.7.0
     hooks:
       - id: blacken-docs
         additional_dependencies: [black==19.10b0]
         args:
-          - --target-version=py36
+          - --target-version=py38

-  - repo: https://gitlab.com/pycqa/flake8
+  - repo: https://github.com/PyCQA/flake8
     rev: 3.8.3
     hooks:
       - id: flake8

   - repo: https://github.com/pre-commit/mirrors-isort
-    rev: v4.3.21
+    rev: v5.10.1
     hooks:
       - id: isort
         additional_dependencies: [toml]

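Most of the Python churn in the benchmark and library files below is mechanical fallout from this Black bump (19.10b0 to 22.10.0); the hooks otherwise behave as before. Two Black 22.x rules account for it, illustrated on a hypothetical snippet that is not taken from the repository: simple operands now hug the power operator, and a call that ends in a magic trailing comma is exploded onto one argument per line.

import pandas as pd

# black 19.10b0 left this untouched:
#     n = 10 ** 5
#     df = pd.DataFrame(data={"x": [0, 1]}, columns=["x"],)
# black 22.10.0 reformats the same code to:
n = 10**5  # simple operands hug **
df = pd.DataFrame(  # the magic trailing comma forces one argument per line
    data={"x": [0, 1]},
    columns=["x"],
)
print(len(df), n)
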
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/filter.py
@@ -26,7 +26,7 @@ def setup(self, predicate):
             [("int32", "<", 321)],
             [("int32", "<", 321)],
         ]
-        self.df = get_dataframe_not_nested(10 ** 5)
+        self.df = get_dataframe_not_nested(10**5)

     def time_filter_df_from_predicates(self, predicate):
         filter_df_from_predicates(self.df, self.predicate)
@@ -48,7 +48,7 @@ def setup(self, column):
         if column == "null":
             raise NotImplementedError()
         self.arr = (
-            get_dataframe_not_nested(10 ** 5)
+            get_dataframe_not_nested(10**5)
             .sample(frac=1.0)
             .reset_index(drop=True)[column]
             .values
@@ -69,7 +69,7 @@ class TimeFilterArrayIn:
     params = (
         cols_to_filter,
         [10, 100, 1000],
-        [10 ** 4, 10 ** 5, 10 ** 6],
+        [10**4, 10**5, 10**6],
     )
     param_names = ["column", "filter_size", "array_size", "enabled"]

8 changes: 4 additions & 4 deletions asv_bench/benchmarks/index.py
@@ -60,8 +60,8 @@ def teardown(self, number_values, number_partitions, dtype):

 class Index(IndexBase):
     params = (
-        [10 * 1, 10 ** 3],  # values
-        [10 * 1, 10 ** 3],  # partitions
+        [10 * 1, 10**3],  # values
+        [10 * 1, 10**3],  # partitions
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values", "number_partitions", "dtype"]
@@ -93,7 +93,7 @@ def time_observed_values(self, number_values, number_partitions, arrow_type):
 class SerializeIndex(IndexBase):
     timeout = 180
     params = (
-        [(10 ** 3, 10), (10 ** 4, 100)],  # (values, partitions)
+        [(10**3, 10), (10**4, 100)],  # (values, partitions)
         [(int, pa.int64())],  # types
     )
     param_names = ["number_values__number_partitions", "dtype"]
@@ -117,7 +117,7 @@ def time_serialization(self, number_values__number_partitions, arrow_type):


 class BuildIndex(AsvBenchmarkConfig):
-    params = ([-1, 1], [10 ** 3, 10 ** 4], [10, 100])
+    params = ([-1, 1], [10**3, 10**4], [10, 100])
     param_names = ["cardinality", "num_values", "partitions_to_merge"]

     def setup(self, cardinality, num_values, partitions_to_merge):

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/metapartition.py
@@ -16,7 +16,7 @@

 class TimeMetaPartition(AsvBenchmarkConfig):
     params = (
-        [10 ** 5, 10 ** 6],
+        [10**5, 10**6],
         [
             (np.int64, 123456789),
             (str, "abcdefgh"),

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/predicate_pushdown.py
@@ -10,7 +10,7 @@ class TimeRestore:
     of iterating over dictionaries in Python.
     """

-    params = [(10 ** 3, 10 ** 4), (10, 10 ** 2, 10 ** 3)]
+    params = [(10**3, 10**4), (10, 10**2, 10**3)]
     param_names = ["num_rows", "chunk_size"]

     def setup(self, num_rows, chunk_size):

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/schema.py
@@ -23,7 +23,7 @@ def time_make_meta(self):

 class TimeValidateCompatible(AsvBenchmarkConfig):

-    params = ([2, 10 ** 2, 10 ** 3, 10 ** 4], [True, False])
+    params = ([2, 10**2, 10**3, 10**4], [True, False])
     timeout = 120.0

     param_names = ["num_schemas", "has_na"]
@@ -50,7 +50,7 @@ def time_validate_compatible(self, num_schemas, has_na):


 class TimeValidateSharedColumns(AsvBenchmarkConfig):
-    params = [2, 10 ** 2]
+    params = [2, 10**2]
     timeout = 120.0

     param_names = ["num_schemas"]

4 changes: 2 additions & 2 deletions asv_bench/benchmarks/write.py
@@ -55,7 +55,7 @@ def generate_metadata(max_depth=7, num_leafs=5):

 class TimeStoreDataset(AsvBenchmarkConfig):
     timeout = 120
-    params = ([10, 10 ** 2, 10 ** 3], [4], [2, 4])
+    params = ([10, 10**2, 10**3], [4], [2, 4])
     param_names = ["num_partitions", "max_depth", "num_leafs"]

     def setup(self, num_partitions, max_depth, num_leafs):
@@ -76,7 +76,7 @@ def time_store_dataset_from_partitions(self, num_partitions, max_depth, num_leaf

 class TimePersistMetadata(AsvBenchmarkConfig):
     timeout = 240
-    params = [1, 10 ** 2, 10 ** 3]
+    params = [1, 10**2, 10**3]

     def setup(self, num_partitions):
         self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))

2 changes: 2 additions & 0 deletions docs/environment-docs.yml
@@ -2,6 +2,7 @@ name: kartothek-docs
 channels:
   - conda-forge
 dependencies:
+  - python=3.8
   - dask[dataframe]
   - decorator
   - msgpack-python>=0.5.2
@@ -10,6 +11,7 @@ dependencies:
   - pandas>=0.23.0, !=1.0.0
   - pyarrow>=0.17.1,!=1.0.0, <4
   - simplejson
+  - jinja2<3.1
   - simplekv
   - storefact
   - toolz

3 changes: 2 additions & 1 deletion kartothek/api/discover.py
@@ -275,7 +275,8 @@ def discover_cube(
     if len(partition_keys) == 0:
         raise ValueError(
             'Seed dataset ("{seed_dataset}") has no partition keys.'.format(  # type: ignore # noqa
-                seed_dataset=seed_dataset, partition_keys=", ".join(partition_keys),
+                seed_dataset=seed_dataset,
+                partition_keys=", ".join(partition_keys),
             )
         )
     elif len(partition_keys) < 2:

7 changes: 5 additions & 2 deletions kartothek/core/dataset.py
@@ -280,7 +280,8 @@ def load_index(self: T, column: str, store: StoreInput) -> T:
         return self.copy(indices=indices)

     @deprecate_parameters_if_set(
-        DEPRECATION_WARNING_REMOVE_PARAMETER, "load_partition_indices",
+        DEPRECATION_WARNING_REMOVE_PARAMETER,
+        "load_partition_indices",
     )
     def load_all_indices(
         self: T, store: StoreInput, load_partition_indices: bool = True
@@ -446,7 +447,9 @@ def get_indices_as_dataframe(
             )
         else:
             df = dm._evaluate_conjunction(
-                columns=columns, predicates=None, date_as_object=date_as_object,
+                columns=columns,
+                predicates=None,
+                date_as_object=date_as_object,
             )
         return df

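The decorator whose calls are being reflowed here and in kartothek/core/factory.py, kartothek/io/dask/dataframe.py, and kartothek/io/eager.py is deprecate_parameters_if_set(warning_template, *parameter_names). As a minimal sketch of how a decorator with that call shape can be implemented; this is a hypothetical re-implementation for illustration, not kartothek's actual helper, and the {parameter} placeholder in the template is an assumption:

import functools
import inspect
import warnings


def deprecate_parameters_if_set(warning_template, *parameters):
    """Warn if a caller explicitly passes one of the named parameters."""

    def decorator(func):
        signature = inspect.signature(func)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # bind_partial reveals which parameters the caller actually supplied
            passed = signature.bind_partial(*args, **kwargs).arguments
            for name in parameters:
                if name in passed:
                    warnings.warn(
                        warning_template.format(parameter=name),
                        DeprecationWarning,
                        stacklevel=2,
                    )
            return func(*args, **kwargs)

        return wrapper

    return decorator

With this shape, a decorated load_all_indices stays silent unless the caller passes load_partition_indices explicitly, which is exactly when a deprecation message is useful.
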
4 changes: 3 additions & 1 deletion kartothek/core/factory.py
@@ -165,7 +165,9 @@ def load_index(self: T, column, store=None) -> T:
         "load_partition_indices",
     )
     def load_all_indices(
-        self: T, store: Any = None, load_partition_indices: bool = True,
+        self: T,
+        store: Any = None,
+        load_partition_indices: bool = True,
     ) -> T:
         self._cache_metadata = self.dataset_metadata.load_all_indices(
             self.store, load_partition_indices=load_partition_indices

4 changes: 2 additions & 2 deletions kartothek/io/dask/_sizeof.py
@@ -6,12 +6,12 @@ def _dct_sizeof(obj):


 def register_sizeof_ktk_classes():
+    from kartothek.core.common_metadata import SchemaWrapper
     from kartothek.core.dataset import DatasetMetadata
     from kartothek.core.factory import DatasetFactory
-    from kartothek.io_components.metapartition import MetaPartition
     from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex
     from kartothek.core.partition import Partition
-    from kartothek.core.common_metadata import SchemaWrapper
+    from kartothek.io_components.metapartition import MetaPartition

     dask_sizeof.register(DatasetMetadata, _dct_sizeof)
     dask_sizeof.register(DatasetFactory, _dct_sizeof)

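For context on what register_sizeof_ktk_classes wires up: dask's sizeof is a single-dispatch registry, and a plausible reading of _dct_sizeof is that it sizes an object via its attribute dictionary. A minimal sketch with an assumed stand-in class, not kartothek's actual types or implementation:

from dask.sizeof import sizeof as dask_sizeof


def _dct_sizeof(obj):
    # approximate the object's footprint by the size of its __dict__
    return dask_sizeof(obj.__dict__)


class PartitionLike:
    def __init__(self, label, files):
        self.label = label
        self.files = files


dask_sizeof.register(PartitionLike, _dct_sizeof)

print(dask_sizeof(PartitionLike("p=0/part", {"table": "uuid/table/part.parquet"})))
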
3 changes: 1 addition & 2 deletions kartothek/io/dask/compression.py
@@ -10,8 +10,7 @@

 try:
     # Technically distributed is an optional dependency
-    from distributed.protocol import serialize_bytes
-    from distributed.protocol import deserialize_bytes
+    from distributed.protocol import deserialize_bytes, serialize_bytes

     HAS_DISTRIBUTED = True
 except ImportError:

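The consolidated import above is isort 5's doing; the module keeps its optional-dependency guard around distributed. A minimal sketch of how such a guard is typically consumed, with illustrative helper names rather than kartothek's actual API (which operates on dask DataFrames):

try:
    # distributed is an optional dependency
    from distributed.protocol import deserialize_bytes, serialize_bytes

    HAS_DISTRIBUTED = True
except ImportError:
    HAS_DISTRIBUTED = False


def pack(obj):
    """Serialize with distributed when it is installed, otherwise pass through."""
    return serialize_bytes(obj) if HAS_DISTRIBUTED else obj


def unpack(blob):
    """Inverse of pack."""
    return deserialize_bytes(blob) if HAS_DISTRIBUTED else blob
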
3 changes: 2 additions & 1 deletion kartothek/io/dask/dataframe.py
@@ -281,7 +281,8 @@ def _shuffle_docs(func):
 @default_docs
 @_shuffle_docs
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "delete_scope",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "delete_scope",
 )
 def store_dataset_from_ddf(
     ddf: dd.DataFrame,

9 changes: 7 additions & 2 deletions kartothek/io/eager.py
@@ -428,7 +428,9 @@ def read_table(
 @default_docs
 @normalize_args
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "output_dataset_uuid", "df_serializer",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "output_dataset_uuid",
+    "df_serializer",
 )
 def commit_dataset(
     store: Optional[StoreInput] = None,
@@ -712,7 +714,10 @@ def create_empty_dataset_header(
     "df_serializer",
 )
 @deprecate_parameters_if_set(
-    DEPRECATION_WARNING_REMOVE_PARAMETER, "metadata", "overwrite", "metadata_merger",
+    DEPRECATION_WARNING_REMOVE_PARAMETER,
+    "metadata",
+    "overwrite",
+    "metadata_merger",
 )
 def write_single_partition(
     store: Optional[KeyValueStore] = None,

34 changes: 26 additions & 8 deletions kartothek/io/testing/append_cube.py
@@ -116,7 +116,10 @@ def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
     """

     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -127,7 +130,8 @@

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},
@@ -157,17 +161,27 @@
     """

     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
-        data=df, cube=cube, store=function_store,
+        data=df,
+        cube=cube,
+        store=function_store,
     )

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
-    result = driver(data={"seed": df_append}, cube=cube, store=function_store,)
+    result = driver(
+        data={"seed": df_append},
+        cube=cube,
+        store=function_store,
+    )
     dataset = result["seed"].load_all_indices(function_store())

     part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
@@ -187,7 +201,10 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):
     unnecessarily.
     """
     # Build cube
-    df = pd.DataFrame(data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],)
+    df = pd.DataFrame(
+        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]},
+        columns=["x", "p"],
+    )
     cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
     build_cube(
         data=df,
@@ -198,7 +215,8 @@ def test_compression_is_compatible_on_append_cube(driver, function_store):

     # Append to cube
     df_append = pd.DataFrame(
-        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
+        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]},
+        columns=["x", "p"],
     )
     result = driver(
         data={"seed": df_append},

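The tests reflowed above exercise passing a df_serializer so that appended partitions are written with bounded Parquet row groups. A short sketch of the serializer such tests hand to the driver, assuming kartothek.serialization.ParquetSerializer and its chunk_size argument; treat the exact signature as an assumption to verify against the installed version:

from kartothek.serialization import ParquetSerializer

# cap Parquet row groups at two rows each, as the row-group tests above expect
serializer = ParquetSerializer(chunk_size=2)

# the tests then pass it along, e.g. driver(..., df_serializer=serializer)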