Minor fixes (#37)
* Fix #34, #35, #36
* Try/except for new numba import
* Modify test_parquet to use str rather than Path
* Fix collections.abc warning
* mkdir for tempdir, reduce retry wait for test
* Skip unsupported pandas extension tests
* Pin pyarrow, update changelog
brl0 authored Jul 26, 2020
1 parent efdabe5 commit 550ccf9
Showing 11 changed files with 81 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -12,7 +12,7 @@ install:
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION pandas dask numba numpy "pyarrow>=0.15" pytest hypothesis scipy shapely geopandas param hilbertcurve
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION pandas dask numba numpy "pyarrow<1" pytest hypothesis scipy shapely geopandas param hilbertcurve
- conda activate test-environment
- python setup.py install

24 changes: 20 additions & 4 deletions CHANGELOG.md
@@ -1,9 +1,25 @@
Version 0.3.6
=============

### Added
- More intuitive error when lsuffix == rsuffix on sjoin ([#35](https://github.com/holoviz/spatialpandas/issues/35))

### Fixed
- `read_parquet_dask` fails to read from s3 glob ([#34](https://github.com/holoviz/spatialpandas/issues/34))
- Tests failing with `ValueError: Cannot mask with a boolean indexer containing NA values` ([#41](https://github.com/holoviz/spatialpandas/issues/41))
- Tests in `test_parquet` failing with `TypeError: argument of type 'PosixPath' is not iterable` ([#42](https://github.com/holoviz/spatialpandas/issues/42))
- Create temp directory for partitions explicitly, fixing the failure of `test_pack_partitions_to_parquet`

### Updated
- Numba import updated to address deprecation warning ([#36](https://github.com/holoviz/spatialpandas/issues/36))


Version 0.3.5
=============

### Fixed
- Fixed `GeoDataFrame` constructor exception when GeoPandas is not installed.

Version 0.3.4
=============

@@ -12,15 +28,15 @@ Version 0.3.4
- Fixed `abstract://` protocol error in `pack_partitions_to_parquet` when run on
local filesystem.
- Preserve active geometry column when importing GeoPandas `GeoDataFrame`.
- Always load index columns when the `columns` argument is passed to `read_parquet`.

### Updated
- Added support for pandas 1.0.
- Added support for pyarrow 0.16. When 0.16 is available, the performance of
`read_parquet` and `read_parquet_dask` is significantly improved.


Version 0.3.2 / 0.3.3
=====================

### Fixed
@@ -59,7 +75,7 @@ Version 0.1.1
- Documented dependencies required for the Overview notebook ([#18](https://github.com/holoviz/spatialpandas/pull/18))

### Fixed
- Fixed Ring.to_shapely error ([#17](https://github.com/holoviz/spatialpandas/pull/17))

Version 0.1.0
=============
2 changes: 1 addition & 1 deletion setup.py
@@ -24,7 +24,7 @@
'dask>=2.0',
'numba',
'numpy',
'pyarrow>=0.15',
'pyarrow>=0.15,<1',
'param',
'fsspec',
'retrying',
2 changes: 2 additions & 0 deletions spatialpandas/dask.py
@@ -313,6 +313,8 @@ def move_retry(p1, p2):
for out_partition in out_partitions:
part_dir = os.path.join(path, "part.%d.parquet" % out_partition)
mkdirs_retry(part_dir)
tmp_part_dir = tempdir_format.format(partition=out_partition, uuid=dataset_uuid)
mkdirs_retry(tmp_part_dir)

# Shuffle and write a parquet dataset for each output partition
@retryit
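Note on the hunk above: the scratch directory for each output partition is now created up front, next to the final `part.N.parquet` directory, instead of assuming that later writes will create it. A standalone sketch of the pattern, with illustrative paths and uuid (not taken from the library):

import os

path = "/tmp/ddf.parq"                                        # hypothetical output dataset
tempdir_format = "/tmp/scratch/part-{uuid}-{partition:03d}"   # hypothetical scratch template
dataset_uuid = "3f2c"                                         # hypothetical run id

for out_partition in range(3):
    # Final location of the packed partition.
    part_dir = os.path.join(path, "part.%d.parquet" % out_partition)
    os.makedirs(part_dir, exist_ok=True)
    # Scratch location used during the shuffle; created explicitly so the
    # later writes and moves do not fail on a missing directory.
    tmp_part_dir = tempdir_format.format(partition=out_partition, uuid=dataset_uuid)
    os.makedirs(tmp_part_dir, exist_ok=True)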
2 changes: 1 addition & 1 deletion spatialpandas/geometry/base.py
@@ -1,5 +1,5 @@
from numbers import Integral
from collections import Iterable
from collections.abc import Iterable

import numpy as np
import pandas as pd
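Background for the one-line change above: abstract base classes such as `Iterable` have lived in `collections.abc` since Python 3.3; importing them from the `collections` top level emits a DeprecationWarning and stops working on Python 3.10+. A small self-contained illustration of the corrected import:

from collections.abc import Iterable

def flatten(values):
    # Recursively flatten nested iterables, treating strings/bytes as scalars.
    for v in values:
        if isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
            yield from flatten(v)
        else:
            yield v

print(list(flatten([1, [2, [3, 4]], "ab"])))  # [1, 2, 3, 4, 'ab']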
6 changes: 4 additions & 2 deletions spatialpandas/io/parquet.py
@@ -238,10 +238,12 @@ def read_parquet_dask(


def _maybe_prepend_protocol(paths, filesystem):
if filesystem.protocol not in ("file", "abstract"):
protocol = filesystem.protocol if isinstance(
filesystem.protocol, str) else filesystem.protocol[0]
if protocol not in ("file", "abstract"):
# Add back prefix (e.g. s3://)
paths = [
"{proto}://{p}".format(proto=filesystem.protocol, p=p) for p in paths
"{proto}://{p}".format(proto=protocol, p=p) for p in paths
]
return paths

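Note on `_maybe_prepend_protocol`: fsspec filesystems can report `protocol` either as a string or as a tuple of aliases (s3fs, for example, exposes something like `('s3', 's3a')`), so the fix normalizes to the first alias before deciding whether to prepend it. A hedged standalone sketch of the same normalization (function name and sample values are hypothetical):

def prepend_protocol(paths, protocol):
    # fsspec's AbstractFileSystem.protocol may be a str or a tuple of
    # aliases; use the first alias when it is a tuple.
    proto = protocol if isinstance(protocol, str) else protocol[0]
    if proto in ("file", "abstract"):
        return list(paths)  # local/abstract paths need no prefix
    return ["{proto}://{p}".format(proto=proto, p=p) for p in paths]

print(prepend_protocol(["bucket/ddf.parq"], ("s3", "s3a")))  # ['s3://bucket/ddf.parq']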
6 changes: 5 additions & 1 deletion spatialpandas/spatialindex/rtree.py
@@ -1,7 +1,11 @@
import numpy as np
from numba import jitclass
from numba import int64, float64

try:
from numba.experimental import jitclass
except ImportError:
from numba import jitclass

from spatialpandas.spatialindex.hilbert_curve import (
distances_from_coordinates
)
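Background for the import shim above: numba 0.49 moved `jitclass` to `numba.experimental` and deprecated the top-level import, so the try/except keeps the module working on both older and newer numba releases. A minimal sketch of the same pattern in isolation (the `Interval` class is purely illustrative):

try:
    from numba.experimental import jitclass  # numba >= 0.49
except ImportError:
    from numba import jitclass               # older numba

from numba import float64

@jitclass([('lo', float64), ('hi', float64)])
class Interval:
    def __init__(self, lo, hi):
        self.lo = lo
        self.hi = hi

    def width(self):
        return self.hi - self.lo

print(Interval(1.0, 3.5).width())  # 2.5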
6 changes: 6 additions & 0 deletions spatialpandas/tools/sjoin.py
@@ -77,6 +77,12 @@ def sjoin(
" Received: {val}".format(val=repr(how), valid_how=valid_how)
)

# Validate suffixes
if lsuffix == rsuffix:
raise ValueError(
"`lsuffix` and `rsuffix` must not be equal"
)

# Perform sjoin
if isinstance(left_df, GeoDataFrame):
return _sjoin_pandas_pandas(
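The new guard above turns a silent column-name collision into an immediate error. A short illustration of why equal suffixes are ambiguous (column names and suffix values are hypothetical):

lsuffix = rsuffix = "_x"         # hypothetical input that the check now rejects
overlapping = ["population"]     # a non-geometry column present in both frames
left_out = [c + lsuffix for c in overlapping]    # ['population_x']
right_out = [c + rsuffix for c in overlapping]   # ['population_x'] -- same name
assert left_out == right_out  # both sides would map to one column, hence the ValueError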
5 changes: 4 additions & 1 deletion tests/test_fixedextensionarray.py
@@ -146,6 +146,10 @@ def test_take_non_na_fill_value(self):
def test_reindex_non_na_fill_value(self, data_missing):
pass

@pytest.mark.skip("Cannot mask with a boolean indexer containing NA values")
def test_getitem_boolean_na_treated_as_false(self, data):
pass


class TestGeometryGroupby(eb.BaseGroupbyTests):
@pytest.mark.skip(
@@ -204,4 +208,3 @@ class TestGeometryReshaping(eb.BaseReshapingTests):
@pytest.mark.skip(reason="__setitem__ not supported")
def test_ravel(self):
pass
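The skipped `test_getitem_boolean_na_treated_as_false` masks an array with a boolean indexer that contains missing values. A hedged illustration of the kind of operation involved; whether it raises or treats NA as False depends on the pandas version:

import pandas as pd

s = pd.Series([10, 20, 30])
mask = pd.array([True, pd.NA, False], dtype="boolean")
try:
    print(s[mask])        # some pandas versions treat NA as False -> selects only 10
except Exception as err:  # others raise, e.g. the ValueError quoted in the skip reason
    print(err)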

5 changes: 4 additions & 1 deletion tests/test_listextensionarray.py
@@ -146,6 +146,10 @@ def test_take_non_na_fill_value(self):
def test_reindex_non_na_fill_value(self, data_missing):
pass

@pytest.mark.skip("Cannot mask with a boolean indexer containing NA values")
def test_getitem_boolean_na_treated_as_false(self, data):
pass


class TestGeometryGroupby(eb.BaseGroupbyTests):
@pytest.mark.skip(
@@ -204,4 +208,3 @@ class TestGeometryReshaping(eb.BaseReshapingTests):
@pytest.mark.skip(reason="__setitem__ not supported")
def test_ravel(self):
pass

46 changes: 33 additions & 13 deletions tests/test_parquet.py
@@ -1,4 +1,4 @@
from hypothesis import given, settings, HealthCheck
from hypothesis import given, settings, HealthCheck, Phase, Verbosity
import hypothesis.strategies as hs
import dask
import dask.dataframe as dd
@@ -40,7 +40,7 @@ def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):

path = tmp_path / 'df.parq'
to_parquet(df, path)
df_read = read_parquet(path, columns=['point', 'multipoint', 'multiline', 'a'])
df_read = read_parquet(str(path), columns=['point', 'multipoint', 'multiline', 'a'])
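# Note: paths are passed as str rather than pathlib.Path in these tests,
# sidestepping the "TypeError: argument of type 'PosixPath' is not iterable"
# described in the changelog (#42), which presumably comes from substring
# checks on the path inside the reader.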
assert isinstance(df_read, GeoDataFrame)
assert all(df == df_read)
assert df_read.index.name == df.index.name
@@ -65,7 +65,7 @@ def test_parquet_columns(gp_point, gp_multipoint, gp_multiline, tmp_path):
path = tmp_path / 'df.parq'
to_parquet(df, path)
columns = ['a', 'multiline']
df_read = read_parquet(path, columns=columns)
df_read = read_parquet(str(path), columns=columns)
assert isinstance(df_read, GeoDataFrame)
assert all(df[columns] == df_read)

@@ -86,8 +86,8 @@ def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path):
ddf = dd.from_pandas(df, npartitions=3)

path = tmp_path / 'ddf.parq'
ddf.to_parquet(path)
ddf_read = read_parquet_dask(path)
ddf.to_parquet(str(path))
ddf_read = read_parquet_dask(str(path))

# Check type
assert isinstance(ddf_read, DaskGeoDataFrame)
@@ -158,7 +158,18 @@ def test_pack_partitions(gp_multipoint, gp_multiline):
gp_multiline=st_multiline_array(min_size=60, max_size=100, geoseries=True),
use_temp_format=hs.booleans()
)
@settings(deadline=None, max_examples=30, suppress_health_check=[HealthCheck.too_slow])
@settings(
deadline=None,
max_examples=30,
suppress_health_check=[HealthCheck.too_slow],
phases=[
Phase.explicit,
Phase.reuse,
Phase.generate,
Phase.target
],
verbosity=Verbosity.verbose,
)
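# Note: Phase.shrink is not included in `phases` above, so failing examples
# are reported without Hypothesis spending time minimizing them, and
# Verbosity.verbose echoes each generated example -- presumably to keep this
# slow packing test fast and debuggable on CI.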
def test_pack_partitions_to_parquet(
gp_multipoint, gp_multiline, use_temp_format, tmp_path
):
@@ -173,13 +184,22 @@ def test_pack_partitions_to_parquet(

path = tmp_path / 'ddf.parq'
if use_temp_format:
(tmp_path / 'scratch').mkdir(parents=True, exist_ok=True)
tempdir_format = str(tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
else:
tempdir_format = None

_retry_args = dict(
wait_exponential_multiplier=10,
wait_exponential_max=20000,
stop_max_attempt_number=4
)
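# The shortened retrying waits above (10 ms exponential base, 20 s cap,
# 4 attempts) are passed through to pack_partitions_to_parquet below,
# matching the "reduce retry wait for test" item in the commit message.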

ddf_packed = ddf.pack_partitions_to_parquet(
path, npartitions=12,
tempdir_format=tempdir_format
str(path),
npartitions=12,
tempdir_format=tempdir_format,
_retry_args=_retry_args,
)

# Check the number of partitions (< 4 can happen in the case of empty partitions)
@@ -228,7 +248,7 @@ def test_pack_partitions_to_parquet_glob(
}).set_geometry('lines')
ddf1 = dd.from_pandas(df1, npartitions=3)
path1 = tmp_path / 'ddf1.parq'
ddf_packed1 = ddf1.pack_partitions_to_parquet(path1, npartitions=3)
ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

# Build dataframe2
n = min(len(gp_multipoint2), len(gp_multiline2))
@@ -239,7 +259,7 @@
}).set_geometry('lines')
ddf2 = dd.from_pandas(df2, npartitions=3)
path2 = tmp_path / 'ddf2.parq'
ddf_packed2 = ddf2.pack_partitions_to_parquet(path2, npartitions=4)
ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

# Load both packed datasets with glob
ddf_globbed = read_parquet_dask(tmp_path / "ddf*.parq", geometry="lines")
@@ -298,7 +318,7 @@ def test_pack_partitions_to_parquet_list_bounds(
}).set_geometry('lines')
ddf1 = dd.from_pandas(df1, npartitions=3)
path1 = tmp_path / 'ddf1.parq'
ddf_packed1 = ddf1.pack_partitions_to_parquet(path1, npartitions=3)
ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

# Build dataframe2
n = min(len(gp_multipoint2), len(gp_multiline2))
@@ -309,11 +329,11 @@
}).set_geometry('lines')
ddf2 = dd.from_pandas(df2, npartitions=3)
path2 = tmp_path / 'ddf2.parq'
ddf_packed2 = ddf2.pack_partitions_to_parquet(path2, npartitions=4)
ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

# Load both packed datasets with glob
ddf_read = read_parquet_dask(
[tmp_path / "ddf1.parq", tmp_path / "ddf2.parq"],
[str(tmp_path / "ddf1.parq"), str(tmp_path / "ddf2.parq")],
geometry="points", bounds=bounds
)

