Skip to content

Commit

Permalink
filter bunch by geometry type (#11) (#12)
Browse files Browse the repository at this point in the history
* filter bunch by geometry type (#11)

* add method: filter_by_geometry and enum: GeometryType

* use xyzservices filter solution

---------

Co-authored-by: Martin Fleischmann <[email protected]>
  • Loading branch information
sdp5 and martinfleis authored Sep 7, 2023
1 parent f310f6d commit 21cc3ef
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 4 deletions.
2 changes: 1 addition & 1 deletion doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ The database of dataset metadata is handled via custom dict-based classes.

.. autoclass:: Bunch
:exclude-members: clear, copy, fromkeys, get, items, keys, pop, popitem, setdefault, update, values
:members: flatten, query_name
:members: filter, flatten, query_name
137 changes: 135 additions & 2 deletions geodatasets/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import json
import uuid
from typing import Callable

GEOMETRY_TYPES = ["POINT", "LINESTRING", "POLYGON", "MIXED"]
QUERY_NAME_TRANSLATION = str.maketrans({x: "" for x in "., -_/"})


Expand All @@ -28,7 +30,6 @@ def __dir__(self):
return self.keys()

def _repr_html_(self, inside=False):

children = ""
for key in self.keys():
if isinstance(self[key], Dataset):
Expand Down Expand Up @@ -118,6 +119,139 @@ def query_name(self, name: str) -> Dataset:

raise ValueError(f"No matching item found for the query '{name}'.")

def filter(
    self,
    keyword: str | None = None,
    name: str | None = None,
    geometry_type: str | None = None,
    function: Callable[[Dataset], bool] | None = None,
) -> Bunch:
    """Return a subset of the :class:`Bunch` matching the filter conditions.

    Each :class:`Dataset` within a :class:`Bunch` is checked against one or
    more specified conditions and kept if they are satisfied or removed if at
    least one condition is not met.

    Parameters
    ----------
    keyword : str (optional)
        Condition returns ``True`` if ``keyword`` string is present in any string
        value in a :class:`Dataset` object.
        The comparison is not case sensitive.
    name : str (optional)
        Condition returns ``True`` if ``name`` string is present in
        the name attribute of :class:`Dataset` object.
        The comparison is not case sensitive.
    geometry_type : str (optional)
        Condition returns ``True`` if the ``geometry_type`` attribute of the
        :class:`Dataset` matches the ``geometry_type``.
        Possible options are ``["Point", "LineString", "Polygon", "Mixed"]``.
        The comparison is not case sensitive.
    function : callable (optional)
        Custom function taking :class:`Dataset` as an argument and returns
        bool. If ``function`` is given, other parameters are ignored.

    Returns
    -------
    filtered : Bunch

    Examples
    --------
    >>> from geodatasets import data

    You can filter all Point datasets:

    >>> points = data.filter(geometry_type="Point")

    Or all datasets with ``chicago`` in the name:

    >>> chicago_datasets = data.filter(name="chicago")

    You can use keyword search to find all datasets in a CSV format:

    >>> csv_datasets = data.filter(keyword="csv")

    You can combine multiple conditions to find datasets based with ``chicago`` in
    name of Polygon geometry type:

    >>> chicago_polygons = data.filter(name="chicago", geometry_type="Polygon")

    You can also pass custom function that takes :class:`Dataset` and returns
    boolean value. You can then find all datasets with ``nrows`` smaller than
    100:

    >>> def small_data(dataset):
    ...     if hasattr(dataset, "nrows") and dataset.nrows < 100:
    ...         return True
    ...     return False
    >>> small = data.filter(function=small_data)
    """

    def _validate(dataset, keyword, name, geometry_type):
        # Evaluate each requested condition; a dataset is kept only when
        # every requested condition holds. With no conditions given,
        # all([]) is True, so everything passes.
        cond = []

        if keyword is not None:
            # Case-insensitive substring search across all string values
            # of the dataset (url, attribution, filename, ...).
            keyword_match = False
            for v in dataset.values():
                if isinstance(v, str) and keyword.lower() in v.lower():
                    keyword_match = True
                    break
            cond.append(keyword_match)

        if name is not None:
            # Case-insensitive substring match against the name attribute only.
            name_match = False
            if name.lower() in dataset.name.lower():
                name_match = True
            cond.append(name_match)

        if geometry_type is not None:
            # Normalize the query (strip separators such as "-", "_", spaces)
            # before a case-insensitive comparison, so e.g. "line string"
            # matches "LineString".
            geom_type_match = False
            if (
                dataset.geometry_type.upper()
                == geometry_type.translate(QUERY_NAME_TRANSLATION).upper()
            ):
                geom_type_match = True
            cond.append(geom_type_match)

        return all(cond)

    def _filter_bunch(bunch, keyword, name, geometry_type, function):
        # Recursively walk nested Bunch objects, rebuilding the tree with
        # only the matching Dataset leaves; empty sub-branches are dropped.
        new = Bunch()
        for key, value in bunch.items():
            if isinstance(value, Dataset):
                if function is None:
                    if _validate(
                        value,
                        keyword=keyword,
                        name=name,
                        geometry_type=geometry_type,
                    ):
                        new[key] = value
                else:
                    # A custom predicate takes precedence over the
                    # built-in conditions.
                    if function(value):
                        new[key] = value

            else:
                filtered = _filter_bunch(
                    value,
                    keyword=keyword,
                    name=name,
                    geometry_type=geometry_type,
                    function=function,
                )
                if filtered:
                    new[key] = filtered

        return new

    return _filter_bunch(
        self,
        keyword=keyword,
        name=name,
        geometry_type=geometry_type,
        function=function,
    )


class Dataset(Bunch):
"""
Expand Down Expand Up @@ -178,7 +312,6 @@ def _repr_html_(self, inside=False):


def _load_json(f):

data = json.loads(f)

items = Bunch()
Expand Down
24 changes: 23 additions & 1 deletion geodatasets/tests/test_lib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from geodatasets import Bunch, Dataset, data
from geodatasets.lib import GEOMETRY_TYPES


@pytest.fixture
Expand All @@ -10,6 +11,7 @@ def data1():
attribution="(C) geodatasets",
name="my_public_data",
filename="data.zip",
geometry_type="Polygon",
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
)

Expand All @@ -21,6 +23,7 @@ def data2():
attribution="(C) geodatasets",
name="my_public_data2",
filename="data2.json",
geometry_type="Point",
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
)

Expand All @@ -37,7 +40,9 @@ def test_bunch(


def test_dir(data1):
assert dir(data1) == sorted(["url", "attribution", "name", "filename", "hash"])
assert dir(data1) == sorted(
["url", "attribution", "name", "filename", "geometry_type", "hash"]
)


def test_expect_name_url_attribution():
Expand Down Expand Up @@ -134,3 +139,20 @@ def test_query_name():

with pytest.raises(ValueError, match="No matching item found"):
data.query_name("i don't exist")


def test_filter(test_bunch):
    # Single-condition filters: each should keep exactly one dataset.
    assert len(test_bunch.filter(name="data2").flatten()) == 1
    assert len(test_bunch.filter(keyword="json").flatten()) == 1
    assert len(test_bunch.filter(geometry_type="Point").flatten()) == 1

    # Combined conditions must all hold; a miss on either empties the result.
    combined = test_bunch.filter(keyword="json", geometry_type="Polygon")
    assert len(combined.flatten()) == 0
    assert len(test_bunch.filter(name="nonsense").flatten()) == 0

    # A custom predicate takes precedence over the built-in conditions.
    def is_zip(dataset):
        return getattr(dataset, "filename", None) == "data.zip"

    assert len(test_bunch.filter(function=is_zip).flatten()) == 1

0 comments on commit 21cc3ef

Please sign in to comment.