Minor fixes (#37)
* Fix #34, #35, #36
* Try/except for new numba import
* Modify test_parquet to use str rather than Path
* Fix collections.abc warning
* mkdir for tempdir, reduce retry wait for test
* Skip unsupported pandas extension tests
* Pin pyarrow, update changelog
brl0 authored Jul 26, 2020
1 parent efdabe5 commit 550ccf9
Showing 11 changed files with 81 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -12,7 +12,7 @@ install:
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION pandas dask numba numpy "pyarrow>=0.15" pytest hypothesis scipy shapely geopandas param hilbertcurve
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION pandas dask numba numpy "pyarrow<1" pytest hypothesis scipy shapely geopandas param hilbertcurve
- conda activate test-environment
- python setup.py install

24 changes: 20 additions & 4 deletions CHANGELOG.md
@@ -1,9 +1,25 @@
Version 0.3.6
=============

### Added
- More intuitive error when lsuffix == rsuffix on sjoin ([#35](https://github.com/holoviz/spatialpandas/issues/35))

### Fixed
- `read_parquet_dask` fails to read from s3 glob ([#34](https://github.com/holoviz/spatialpandas/issues/34))
- Tests failing with `ValueError: Cannot mask with a boolean indexer containing NA values` ([#41](https://github.com/holoviz/spatialpandas/issues/41))
- Tests in `test_parquet` failing with `TypeError: argument of type 'PosixPath' is not iterable` ([#42](https://github.com/holoviz/spatialpandas/issues/42))
- Create temp directory for partitions explicitly, fixing the failure of `test_pack_partitions_to_parquet`

### Updated
- Numba import updated to address deprecation warning ([#36](https://github.com/holoviz/spatialpandas/issues/36))


Version 0.3.5
=============

### Fixed
- Fixed `GeoDataFrame` constructor exception when GeoPandas is not installed.

Version 0.3.4
=============

@@ -12,15 +28,15 @@ Version 0.3.4
- Fixed `abstract://` protocol error in `pack_partitions_to_parquet` when run on
local filesystem.
- Preserve active geometry column when importing GeoPandas `GeoDataFrame`.
- Always load index columns when the `columns` argument is passed to `read_parquet`.

### Updated
- Added support for pandas 1.0.
- Added support for pyarrow 0.16. When 0.16 is available, the performance of
`read_parquet` and `read_parquet_dask` is significantly improved.


Version 0.3.2 / 0.3.3
=====================

### Fixed
@@ -59,7 +75,7 @@ Version 0.1.1
- Documented dependencies required for the Overview notebook ([#18](https://github.com/holoviz/spatialpandas/pull/18))

### Fixed
- Fixed Ring.to_shapely error ([#17](https://github.com/holoviz/spatialpandas/pull/17))

Version 0.1.0
=============
2 changes: 1 addition & 1 deletion setup.py
@@ -24,7 +24,7 @@
'dask>=2.0',
'numba',
'numpy',
'pyarrow>=0.15',
'pyarrow>=0.15,<1',
'param',
'fsspec',
'retrying',
2 changes: 2 additions & 0 deletions spatialpandas/dask.py
@@ -313,6 +313,8 @@ def move_retry(p1, p2):
for out_partition in out_partitions:
part_dir = os.path.join(path, "part.%d.parquet" % out_partition)
mkdirs_retry(part_dir)
tmp_part_dir = tempdir_format.format(partition=out_partition, uuid=dataset_uuid)
mkdirs_retry(tmp_part_dir)

# Shuffle and write a parquet dataset for each output partition
@retryit
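Note on the hunk above: the scratch directory for each output partition is now created up front, next to the final `part.N.parquet` directory, instead of assuming that later writes will create it. A standalone sketch of the pattern, with illustrative paths and uuid (not taken from the library):

import os

path = "/tmp/ddf.parq"                                        # hypothetical output dataset
tempdir_format = "/tmp/scratch/part-{uuid}-{partition:03d}"   # hypothetical scratch template
dataset_uuid = "3f2c"                                         # hypothetical run id

for out_partition in range(3):
    # Final location of the packed partition.
    part_dir = os.path.join(path, "part.%d.parquet" % out_partition)
    os.makedirs(part_dir, exist_ok=True)
    # Scratch location used during the shuffle; created explicitly so the
    # later writes and moves do not fail on a missing directory.
    tmp_part_dir = tempdir_format.format(partition=out_partition, uuid=dataset_uuid)
    os.makedirs(tmp_part_dir, exist_ok=True)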
2 changes: 1 addition & 1 deletion spatialpandas/geometry/base.py
@@ -1,5 +1,5 @@
from numbers import Integral
from collections import Iterable
from collections.abc import Iterable

import numpy as np
import pandas as pd
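Background for the one-line change above: abstract base classes such as `Iterable` have lived in `collections.abc` since Python 3.3; importing them from the `collections` top level emits a DeprecationWarning and stops working on Python 3.10+. A small self-contained illustration of the corrected import:

from collections.abc import Iterable

def flatten(values):
    # Recursively flatten nested iterables, treating strings/bytes as scalars.
    for v in values:
        if isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
            yield from flatten(v)
        else:
            yield v

print(list(flatten([1, [2, [3, 4]], "ab"])))  # [1, 2, 3, 4, 'ab']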
6 changes: 4 additions & 2 deletions spatialpandas/io/parquet.py
@@ -238,10 +238,12 @@ def read_parquet_dask(


def _maybe_prepend_protocol(paths, filesystem):
if filesystem.protocol not in ("file", "abstract"):
protocol = filesystem.protocol if isinstance(
filesystem.protocol, str) else filesystem.protocol[0]
if protocol not in ("file", "abstract"):
# Add back prefix (e.g. s3://)
paths = [
"{proto}://{p}".format(proto=filesystem.protocol, p=p) for p in paths
"{proto}://{p}".format(proto=protocol, p=p) for p in paths
]
return paths

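Note on `_maybe_prepend_protocol`: fsspec filesystems can report `protocol` either as a string or as a tuple of aliases (s3fs, for example, exposes something like `('s3', 's3a')`), so the fix normalizes to the first alias before deciding whether to prepend it. A hedged standalone sketch of the same normalization (function name and sample values are hypothetical):

def prepend_protocol(paths, protocol):
    # fsspec's AbstractFileSystem.protocol may be a str or a tuple of
    # aliases; use the first alias when it is a tuple.
    proto = protocol if isinstance(protocol, str) else protocol[0]
    if proto in ("file", "abstract"):
        return list(paths)  # local/abstract paths need no prefix
    return ["{proto}://{p}".format(proto=proto, p=p) for p in paths]

print(prepend_protocol(["bucket/ddf.parq"], ("s3", "s3a")))  # ['s3://bucket/ddf.parq']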
6 changes: 5 additions & 1 deletion spatialpandas/spatialindex/rtree.py
@@ -1,7 +1,11 @@
import numpy as np
from numba import jitclass
from numba import int64, float64

try:
from numba.experimental import jitclass
except ImportError:
from numba import jitclass

from spatialpandas.spatialindex.hilbert_curve import (
distances_from_coordinates
)
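Background for the import shim above: numba 0.49 moved `jitclass` to `numba.experimental` and deprecated the top-level import, so the try/except keeps the module working on both older and newer numba releases. A minimal sketch of the same pattern in isolation (the `Interval` class is purely illustrative):

try:
    from numba.experimental import jitclass  # numba >= 0.49
except ImportError:
    from numba import jitclass               # older numba

from numba import float64

@jitclass([('lo', float64), ('hi', float64)])
class Interval:
    def __init__(self, lo, hi):
        self.lo = lo
        self.hi = hi

    def width(self):
        return self.hi - self.lo

print(Interval(1.0, 3.5).width())  # 2.5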
6 changes: 6 additions & 0 deletions spatialpandas/tools/sjoin.py
@@ -77,6 +77,12 @@ def sjoin(
" Received: {val}".format(val=repr(how), valid_how=valid_how)
)

# Validate suffixes
if lsuffix == rsuffix:
raise ValueError(
"`lsuffix` and `rsuffix` must not be equal"
)

# Perform sjoin
if isinstance(left_df, GeoDataFrame):
return _sjoin_pandas_pandas(
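The new guard above turns a silent column-name collision into an immediate error. A short illustration of why equal suffixes are ambiguous (column names and suffix values are hypothetical):

lsuffix = rsuffix = "_x"         # hypothetical input that the check now rejects
overlapping = ["population"]     # a non-geometry column present in both frames
left_out = [c + lsuffix for c in overlapping]    # ['population_x']
right_out = [c + rsuffix for c in overlapping]   # ['population_x'] -- same name
assert left_out == right_out  # both sides would map to one column, hence the ValueError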
5 changes: 4 additions & 1 deletion tests/test_fixedextensionarray.py
@@ -146,6 +146,10 @@ def test_take_non_na_fill_value(self):
def test_reindex_non_na_fill_value(self, data_missing):
pass

@pytest.mark.skip("Cannot mask with a boolean indexer containing NA values")
def test_getitem_boolean_na_treated_as_false(self, data):
pass


class TestGeometryGroupby(eb.BaseGroupbyTests):
@pytest.mark.skip(
@@ -204,4 +208,3 @@ class TestGeometryReshaping(eb.BaseReshapingTests):
@pytest.mark.skip(reason="__setitem__ not supported")
def test_ravel(self):
pass
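The skipped `test_getitem_boolean_na_treated_as_false` masks an array with a boolean indexer that contains missing values. A hedged illustration of the kind of operation involved; whether it raises or treats NA as False depends on the pandas version:

import pandas as pd

s = pd.Series([10, 20, 30])
mask = pd.array([True, pd.NA, False], dtype="boolean")
try:
    print(s[mask])        # some pandas versions treat NA as False -> selects only 10
except Exception as err:  # others raise, e.g. the ValueError quoted in the skip reason
    print(err)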

5 changes: 4 additions & 1 deletion tests/test_listextensionarray.py
@@ -146,6 +146,10 @@ def test_take_non_na_fill_value(self):
def test_reindex_non_na_fill_value(self, data_missing):
pass

@pytest.mark.skip("Cannot mask with a boolean indexer containing NA values")
def test_getitem_boolean_na_treated_as_false(self, data):
pass


class TestGeometryGroupby(eb.BaseGroupbyTests):
@pytest.mark.skip(
@@ -204,4 +208,3 @@ class TestGeometryReshaping(eb.BaseReshapingTests):
@pytest.mark.skip(reason="__setitem__ not supported")
def test_ravel(self):
pass

46 changes: 33 additions & 13 deletions tests/test_parquet.py
@@ -1,4 +1,4 @@
from hypothesis import given, settings, HealthCheck
from hypothesis import given, settings, HealthCheck, Phase, Verbosity
import hypothesis.strategies as hs
import dask
import dask.dataframe as dd
@@ -40,7 +40,7 @@ def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):

path = tmp_path / 'df.parq'
to_parquet(df, path)
df_read = read_parquet(path, columns=['point', 'multipoint', 'multiline', 'a'])
df_read = read_parquet(str(path), columns=['point', 'multipoint', 'multiline', 'a'])
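# Note: paths are passed as str rather than pathlib.Path in these tests,
# sidestepping the "TypeError: argument of type 'PosixPath' is not iterable"
# described in the changelog (#42), which presumably comes from substring
# checks on the path inside the reader.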
assert isinstance(df_read, GeoDataFrame)
assert all(df == df_read)
assert df_read.index.name == df.index.name
@@ -65,7 +65,7 @@ def test_parquet_columns(gp_point, gp_multipoint, gp_multiline, tmp_path):
path = tmp_path / 'df.parq'
to_parquet(df, path)
columns = ['a', 'multiline']
df_read = read_parquet(path, columns=columns)
df_read = read_parquet(str(path), columns=columns)
assert isinstance(df_read, GeoDataFrame)
assert all(df[columns] == df_read)

@@ -86,8 +86,8 @@ def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path):
ddf = dd.from_pandas(df, npartitions=3)

path = tmp_path / 'ddf.parq'
ddf.to_parquet(path)
ddf_read = read_parquet_dask(path)
ddf.to_parquet(str(path))
ddf_read = read_parquet_dask(str(path))

# Check type
assert isinstance(ddf_read, DaskGeoDataFrame)
@@ -158,7 +158,18 @@ def test_pack_partitions(gp_multipoint, gp_multiline):
gp_multiline=st_multiline_array(min_size=60, max_size=100, geoseries=True),
use_temp_format=hs.booleans()
)
@settings(deadline=None, max_examples=30, suppress_health_check=[HealthCheck.too_slow])
@settings(
deadline=None,
max_examples=30,
suppress_health_check=[HealthCheck.too_slow],
phases=[
Phase.explicit,
Phase.reuse,
Phase.generate,
Phase.target
],
verbosity=Verbosity.verbose,
)
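# Note: Phase.shrink is not included in `phases` above, so failing examples
# are reported without Hypothesis spending time minimizing them, and
# Verbosity.verbose echoes each generated example -- presumably to keep this
# slow packing test fast and debuggable on CI.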
def test_pack_partitions_to_parquet(
gp_multipoint, gp_multiline, use_temp_format, tmp_path
):
@@ -173,13 +184,22 @@ def test_pack_partitions_to_parquet(

path = tmp_path / 'ddf.parq'
if use_temp_format:
(tmp_path / 'scratch').mkdir(parents=True, exist_ok=True)
tempdir_format = str(tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
else:
tempdir_format = None

_retry_args = dict(
wait_exponential_multiplier=10,
wait_exponential_max=20000,
stop_max_attempt_number=4
)
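# The shortened retrying waits above (10 ms exponential base, 20 s cap,
# 4 attempts) are passed through to pack_partitions_to_parquet below,
# matching the "reduce retry wait for test" item in the commit message.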

ddf_packed = ddf.pack_partitions_to_parquet(
path, npartitions=12,
tempdir_format=tempdir_format
str(path),
npartitions=12,
tempdir_format=tempdir_format,
_retry_args=_retry_args,
)

# Check the number of partitions (< 4 can happen in the case of empty partitions)
@@ -228,7 +248,7 @@ def test_pack_partitions_to_parquet_glob(
}).set_geometry('lines')
ddf1 = dd.from_pandas(df1, npartitions=3)
path1 = tmp_path / 'ddf1.parq'
ddf_packed1 = ddf1.pack_partitions_to_parquet(path1, npartitions=3)
ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

# Build dataframe2
n = min(len(gp_multipoint2), len(gp_multiline2))
@@ -239,7 +259,7 @@
}).set_geometry('lines')
ddf2 = dd.from_pandas(df2, npartitions=3)
path2 = tmp_path / 'ddf2.parq'
ddf_packed2 = ddf2.pack_partitions_to_parquet(path2, npartitions=4)
ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

# Load both packed datasets with glob
ddf_globbed = read_parquet_dask(tmp_path / "ddf*.parq", geometry="lines")
@@ -298,7 +318,7 @@ def test_pack_partitions_to_parquet_list_bounds(
}).set_geometry('lines')
ddf1 = dd.from_pandas(df1, npartitions=3)
path1 = tmp_path / 'ddf1.parq'
ddf_packed1 = ddf1.pack_partitions_to_parquet(path1, npartitions=3)
ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

# Build dataframe2
n = min(len(gp_multipoint2), len(gp_multiline2))
@@ -309,11 +329,11 @@
}).set_geometry('lines')
ddf2 = dd.from_pandas(df2, npartitions=3)
path2 = tmp_path / 'ddf2.parq'
ddf_packed2 = ddf2.pack_partitions_to_parquet(path2, npartitions=4)
ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

# Load both packed datasets with glob
ddf_read = read_parquet_dask(
[tmp_path / "ddf1.parq", tmp_path / "ddf2.parq"],
[str(tmp_path / "ddf1.parq"), str(tmp_path / "ddf2.parq")],
geometry="points", bounds=bounds
)

