Skip to content

Commit

Permalink
Add fallback mechanism for data sources (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
rth authored Oct 23, 2020
1 parent e4aeceb commit 46ad398
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 28 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ jobs:
pip install flake8
- name: Test with pytest
run: |
pip install pytest
pip install pytest pytest-httpserver
pytest
21 changes: 21 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,27 @@ Defaults to ``~/pgeocode_data``, it is the directory where data is downloaded
for later consumption. It can be changed using the environment variable
``PGEOCODE_DATA_DIR``, i.e. ``export PGEOCODE_DATA_DIR=/tmp/pgeocode_data``.

**Data sources**

The data sources are provided as a list in the ``pgeocode.DOWNLOAD_URL`` variable.
The default value is:

.. code::
DOWNLOAD_URL = [
"https://download.geonames.org/export/zip/{country}.zip",
"https://symerio.github.io/postal-codes-data/data/geonames/{country}.txt",
]
Data sources are tried from first to last until one works. Here the second link is a mirror
of the first.

It is also possible to extend this variable with third-party data sources, as
long as they follow the same format. See for instance the
`postal-codes-data <https://github.com/symerio/postal-codes-data/tree/master/data/geonames>`_
repository for examples of data files.


License
-------

Expand Down
4 changes: 4 additions & 0 deletions doc/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ Testing

Unit tests can be run with,

.. code::
pip install pytest pytest-httpserver
.. code::
pytest
Expand Down
83 changes: 65 additions & 18 deletions pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
#
# Authors: Roman Yurchak <[email protected]>

import contextlib
import os
import urllib.request
import warnings
from io import BytesIO
from typing import Any, Tuple
from typing import Any, Tuple, List
from zipfile import ZipFile

import numpy as np
import pandas as pd
Expand All @@ -17,7 +19,13 @@
"PGEOCODE_DATA_DIR", os.path.join(os.path.expanduser("~"), "pgeocode_data")
)

DOWNLOAD_URL = "https://download.geonames.org/export/zip/{country}.zip"
# A list of download locations. If the first URL fails, following ones will
# be used.
DOWNLOAD_URL = [
"https://download.geonames.org/export/zip/{country}.zip",
"https://symerio.github.io/postal-codes-data/data/geonames/{country}.txt",
]


DATA_FIELDS = [
"country_code",
Expand Down Expand Up @@ -121,11 +129,51 @@
]


def _open_url(url: str) -> Tuple[BytesIO, Any]:
"""Download contents for a URL"""
@contextlib.contextmanager
def _open_extract_url(url: str, country: str) -> Any:
"""Download contents for a URL
If the file has a .zip extension, open it and extract the country
Returns the opened file object.
"""
with urllib.request.urlopen(url) as res:
reader = BytesIO(res.read())
return reader, res.headers
with BytesIO(res.read()) as reader:
if url.endswith(".zip"):
with ZipFile(reader) as fh_zip:
with fh_zip.open(country.upper() + ".txt") as fh:
yield fh
else:
yield reader


@contextlib.contextmanager
def _open_extract_cycle_url(urls: List[str], country: str) -> Any:
"""Same as _open_extract_url but cycle through URLs until one works
We start by opening the first URL in the list, and if fails
move to the next, until one works or the end of list is reached.
"""
if not isinstance(urls, list) or not len(urls):
raise ValueError(f"urls={urls} must be a list with at least one URL")

err_msg = f"Provided download URLs failed {{err}}: {urls}"
for idx, val in enumerate(urls):
try:
with _open_extract_url(val, country) as fh:
yield fh
# Found a working URL, exit the loop.
break
except urllib.error.HTTPError as err: # type: ignore
if idx == len(urls) - 1:
raise
warnings.warn(
f"Download from {val} failed with: {err}. "
"Trying next URL in DOWNLOAD_URL list.",
UserWarning,
)
else:
raise ValueError(err_msg)


class Nominatim:
Expand Down Expand Up @@ -168,23 +216,22 @@ def __init__(self, country: str = "fr", unique: bool = True):
@staticmethod
def _get_data(country: str) -> Tuple[str, pd.DataFrame]:
"""Load the data from disk; otherwise download and save it"""
from zipfile import ZipFile

data_path = os.path.join(STORAGE_DIR, country.upper() + ".txt")
if os.path.exists(data_path):
data = pd.read_csv(data_path, dtype={"postal_code": str})
else:
url = DOWNLOAD_URL.format(country=country)
reader, headers = _open_url(url)
with ZipFile(reader) as fh_zip:
with fh_zip.open(country.upper() + ".txt") as fh:
data = pd.read_csv(
fh,
sep="\t",
header=None,
names=DATA_FIELDS,
dtype={"postal_code": str},
)
download_urls = [
val.format(country=country) for val in DOWNLOAD_URL
]
with _open_extract_cycle_url(download_urls, country) as fh:
data = pd.read_csv(
fh,
sep="\t",
header=None,
names=DATA_FIELDS,
dtype={"postal_code": str},
)
if not os.path.exists(STORAGE_DIR):
os.mkdir(STORAGE_DIR)
data.to_csv(data_path, index=None)
Expand Down
96 changes: 87 additions & 9 deletions test_pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
#
# Authors: Roman Yurchak <[email protected]>
import os
import shutil
import tempfile
import urllib
import json
from zipfile import ZipFile
from io import BytesIO

import numpy as np
import pandas as pd
Expand All @@ -12,16 +14,13 @@

import pgeocode
from pgeocode import GeoDistance, Nominatim, haversine_distance
from pgeocode import _open_extract_url


@pytest.fixture
def temp_dir():
path_save = pgeocode.STORAGE_DIR
path = tempfile.mkdtemp()
pgeocode.STORAGE_DIR = path
yield path
pgeocode.STORAGE_DIR = path_save
shutil.rmtree(path)
def temp_dir(tmpdir, monkeypatch):
monkeypatch.setattr(pgeocode, "STORAGE_DIR", str(tmpdir))
yield str(tmpdir)


def _normalize_str(x):
Expand Down Expand Up @@ -179,3 +178,82 @@ def test_haversine_distance():
d_pred = haversine_distance(x, y)
# same distance +/- 3 km
assert_allclose(d_ref, d_pred, atol=3)


def test_open_extract_url(httpserver):
    """Check _open_extract_url on a plain file and on a zipped file."""
    # 1. Uncompressed payload: the response body is returned as is.
    plain_path = "/fr.txt"
    payload = {"a": 1}
    httpserver.expect_oneshot_request(plain_path).respond_with_json(payload)
    plain_url = httpserver.url_for(plain_path)
    with _open_extract_url(plain_url, "fr") as stream:
        assert json.loads(stream.read()) == payload
    httpserver.check_assertions()

    # 2. Zipped payload: build an in-memory archive with a single FR.txt
    #    member, which is the one expected to be extracted for "fr".
    answer = b"a=1"
    with BytesIO() as buffer:
        with ZipFile(buffer, "w") as archive:
            with archive.open("FR.txt", "w") as member:
                member.write(answer)
        zipped = buffer.getvalue()

    zip_path = "/fr.zip"
    httpserver.expect_oneshot_request(zip_path).respond_with_data(zipped)
    zip_url = httpserver.url_for(zip_path)
    with _open_extract_url(zip_url, "fr") as stream:
        assert stream.read() == answer


@pytest.mark.parametrize(
    "download_url",
    [
        "https://download.geonames.org/export/zip/{country}.zip",
        "https://symerio.github.io/postal-codes-data/data/"
        "geonames/{country}.txt",
    ],
    ids=["geonames", "gitlab-pages"],
)
def test_cdn(temp_dir, monkeypatch, download_url):
    """Check that each download location, used alone, provides the data."""
    data_file = os.path.join(temp_dir, "IE.txt")
    # Restrict the list to a single URL, so that no fallback can happen.
    monkeypatch.setattr(pgeocode, "DOWNLOAD_URL", [download_url])

    assert not os.path.exists(data_file)
    Nominatim("IE")
    # Creating the Nominatim object downloaded and stored the data file.
    assert os.path.exists(data_file)


def test_url_returns_404(httpserver, monkeypatch, temp_dir):
    """Check that the HTTP error is raised when the only URL returns 404."""
    missing_path = "/fr.gzip"
    request = httpserver.expect_oneshot_request(missing_path)
    request.respond_with_data("", status=404)

    # A single URL in the list: there is nothing to fall back to.
    url_list = [httpserver.url_for(missing_path)]
    monkeypatch.setattr(pgeocode, "DOWNLOAD_URL", url_list)

    with pytest.raises(urllib.error.HTTPError, match="HTTP Error 404"):
        Nominatim("fr")
    httpserver.check_assertions()


def test_first_url_fails(httpserver, monkeypatch, temp_dir):
    """Check that a warning is emitted when the first URL returns 404.

    The second entry of the list is the remote mirror, which is expected
    to be used once the local test server has answered with an error.
    """
    broken_path = "/IE.txt"
    mirror_url = (
        "https://symerio.github.io/postal-codes-data/data/"
        "geonames/{country}.txt"
    )
    request = httpserver.expect_oneshot_request(broken_path)
    request.respond_with_data("", status=404)

    url_list = [httpserver.url_for(broken_path), mirror_url]
    monkeypatch.setattr(pgeocode, "DOWNLOAD_URL", url_list)

    expected_warning = "IE.txt failed with: HTTP Error 404.*Trying next URL"
    with pytest.warns(UserWarning, match=expected_warning):
        Nominatim("ie")
    httpserver.check_assertions()

0 comments on commit 46ad398

Please sign in to comment.