-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add fallback mechanism for data sources (#46)
- Loading branch information
Showing
5 changed files
with
178 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,10 @@ Testing | |
|
||
Unit tests can be run with, | ||
|
||
.. code:: | ||
pip install pytest pytest-httpserver | ||
.. code:: | ||
pytest | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,13 @@ | |
# | ||
# Authors: Roman Yurchak <[email protected]> | ||
|
||
import contextlib | ||
import os | ||
import urllib.request | ||
import warnings | ||
from io import BytesIO | ||
from typing import Any, Tuple | ||
from typing import Any, Tuple, List | ||
from zipfile import ZipFile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
@@ -17,7 +19,13 @@ | |
"PGEOCODE_DATA_DIR", os.path.join(os.path.expanduser("~"), "pgeocode_data") | ||
) | ||
|
||
DOWNLOAD_URL = "https://download.geonames.org/export/zip/{country}.zip" | ||
# A list of download locations, tried in order. If a URL fails, the next one | ||
# in the list is used as a fallback. | ||
DOWNLOAD_URL = [ | ||
"https://download.geonames.org/export/zip/{country}.zip", | ||
"https://symerio.github.io/postal-codes-data/data/geonames/{country}.txt", | ||
] | ||
|
||
|
||
DATA_FIELDS = [ | ||
"country_code", | ||
|
@@ -121,11 +129,51 @@ | |
] | ||
|
||
|
||
def _open_url(url: str) -> Tuple[BytesIO, Any]: | ||
"""Download contents for a URL""" | ||
@contextlib.contextmanager
def _open_extract_url(url: str, country: str) -> Any:
    """Download the contents of a URL and yield them as a binary file object

    If the URL ends with ``.zip``, the downloaded archive is opened and the
    member named ``<COUNTRY>.txt`` (country upper-cased) is yielded.
    Otherwise the downloaded bytes are yielded directly as an in-memory
    buffer.

    This is a context manager: the yielded file object is only valid inside
    the ``with`` block and is closed on exit.
    """
    # Read the whole payload first so that the HTTP response is released
    # before control is handed to the caller.
    with urllib.request.urlopen(url) as res:
        payload = res.read()

    with BytesIO(payload) as reader:
        if not url.endswith(".zip"):
            yield reader
            return

        with ZipFile(reader) as fh_zip:
            with fh_zip.open(country.upper() + ".txt") as fh:
                yield fh
|
||
|
||
@contextlib.contextmanager
def _open_extract_cycle_url(urls: List[str], country: str) -> Any:
    """Same as _open_extract_url but cycle through URLs until one works

    We start by opening the first URL in the list, and if it fails with an
    HTTP error, move to the next one, until one works or the end of the
    list is reached. A ``UserWarning`` is emitted for each URL that is
    skipped; the HTTP error of the last URL is re-raised.

    Raises ``ValueError`` if ``urls`` is not a non-empty list.
    """
    if not isinstance(urls, list) or not urls:
        raise ValueError(f"urls={urls} must be a list with at least one URL")

    last_idx = len(urls) - 1
    # Only *opening* a URL is guarded by the try block. The yield happens
    # outside of it, so an HTTPError raised by the caller's ``with`` body is
    # never mistaken for a download failure (which would make the generator
    # attempt to yield a second time).
    with contextlib.ExitStack() as stack:
        for idx, val in enumerate(urls):
            try:
                fh = stack.enter_context(_open_extract_url(val, country))
            except urllib.error.HTTPError as err:  # type: ignore
                if idx == last_idx:
                    raise
                warnings.warn(
                    f"Download from {val} failed with: {err}. "
                    "Trying next URL in DOWNLOAD_URL list.",
                    UserWarning,
                )
            else:
                # Found a working URL, exit the loop.
                break
        # The loop can only end through ``break`` or by re-raising the last
        # error, so ``fh`` is always bound here.
        yield fh
|
||
|
||
class Nominatim: | ||
|
@@ -168,23 +216,22 @@ def __init__(self, country: str = "fr", unique: bool = True): | |
@staticmethod | ||
def _get_data(country: str) -> Tuple[str, pd.DataFrame]: | ||
"""Load the data from disk; otherwise download and save it""" | ||
from zipfile import ZipFile | ||
|
||
data_path = os.path.join(STORAGE_DIR, country.upper() + ".txt") | ||
if os.path.exists(data_path): | ||
data = pd.read_csv(data_path, dtype={"postal_code": str}) | ||
else: | ||
url = DOWNLOAD_URL.format(country=country) | ||
reader, headers = _open_url(url) | ||
with ZipFile(reader) as fh_zip: | ||
with fh_zip.open(country.upper() + ".txt") as fh: | ||
data = pd.read_csv( | ||
fh, | ||
sep="\t", | ||
header=None, | ||
names=DATA_FIELDS, | ||
dtype={"postal_code": str}, | ||
) | ||
download_urls = [ | ||
val.format(country=country) for val in DOWNLOAD_URL | ||
] | ||
with _open_extract_cycle_url(download_urls, country) as fh: | ||
data = pd.read_csv( | ||
fh, | ||
sep="\t", | ||
header=None, | ||
names=DATA_FIELDS, | ||
dtype={"postal_code": str}, | ||
) | ||
if not os.path.exists(STORAGE_DIR): | ||
os.mkdir(STORAGE_DIR) | ||
data.to_csv(data_path, index=None) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,10 @@ | |
# | ||
# Authors: Roman Yurchak <[email protected]> | ||
import os | ||
import shutil | ||
import tempfile | ||
import urllib | ||
import json | ||
from zipfile import ZipFile | ||
from io import BytesIO | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
@@ -12,16 +14,13 @@ | |
|
||
import pgeocode | ||
from pgeocode import GeoDistance, Nominatim, haversine_distance | ||
from pgeocode import _open_extract_url | ||
|
||
|
||
@pytest.fixture
def temp_dir(tmpdir, monkeypatch):
    """Point ``pgeocode.STORAGE_DIR`` at a per-test temporary directory

    The attribute is set through ``monkeypatch`` and the directory comes from
    the ``tmpdir`` fixture, so no manual restore or cleanup is done here.
    Yields the directory path as a string.
    """
    monkeypatch.setattr(pgeocode, "STORAGE_DIR", str(tmpdir))
    yield str(tmpdir)
|
||
|
||
def _normalize_str(x): | ||
|
@@ -179,3 +178,82 @@ def test_haversine_distance(): | |
d_pred = haversine_distance(x, y) | ||
# same distance +/- 3 km | ||
assert_allclose(d_ref, d_pred, atol=3) | ||
|
||
|
||
def test_open_extract_url(httpserver):
    """Check _open_extract_url on both a plain and a zipped download

    Both payloads are served by the local ``httpserver`` fixture, so no
    external network access is needed.
    """
    download_url = "/fr.txt"

    # check download of uncompressed files
    httpserver.expect_oneshot_request(download_url).respond_with_json({"a": 1})
    with _open_extract_url(httpserver.url_for(download_url), "fr") as fh:
        assert json.loads(fh.read()) == {"a": 1}
    httpserver.check_assertions()

    # check download of zipped files
    # Create an in-memory zip file with a single FR.txt member
    answer = b"a=1"
    with BytesIO() as fh:
        with ZipFile(fh, "w") as fh_zip:
            fh_zip.writestr("FR.txt", answer)
        res = fh.getvalue()

    download_url = "/fr.zip"
    httpserver.expect_oneshot_request(download_url).respond_with_data(res)

    with _open_extract_url(httpserver.url_for(download_url), "fr") as fh:
        assert fh.read() == answer
    # Same server-side check as for the uncompressed case above.
    httpserver.check_assertions()
|
||
|
||
@pytest.mark.parametrize(
    "download_url",
    [
        "https://download.geonames.org/export/zip/{country}.zip",
        "https://symerio.github.io/postal-codes-data/data/"
        "geonames/{country}.txt",
    ],
    # The second location is a github.io address, hence "github-pages".
    ids=["geonames", "github-pages"],
)
def test_cdn(temp_dir, monkeypatch, download_url):
    """Check that each download location alone can provide the IE data

    ``DOWNLOAD_URL`` is restricted to a single entry so that no fallback can
    hide a failing location. Note that this test hits the real URLs.
    """
    monkeypatch.setattr(pgeocode, "DOWNLOAD_URL", [download_url])
    data_path = os.path.join(temp_dir, "IE.txt")
    assert not os.path.exists(data_path)
    Nominatim("IE")
    # the data file was downloaded
    assert os.path.exists(data_path)
|
||
|
||
def test_url_returns_404(httpserver, monkeypatch, temp_dir):
    """Check that a 404 on the only configured URL raises an HTTPError"""
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.error`` has been loaded.
    import urllib.error

    download_url = "/fr.gzip"
    httpserver.expect_oneshot_request(download_url).respond_with_data(
        "", status=404
    )

    monkeypatch.setattr(
        pgeocode, "DOWNLOAD_URL", [httpserver.url_for(download_url)]
    )
    with pytest.raises(urllib.error.HTTPError, match="HTTP Error 404"):
        Nominatim("fr")
    httpserver.check_assertions()
|
||
|
||
def test_first_url_fails(httpserver, monkeypatch, temp_dir):
    """Check the fallback: a 404 on the first URL warns and tries the next

    The first location is a local server path answering 404; the second one
    is the github.io location.
    """
    failing_path = "/IE.txt"
    request = httpserver.expect_oneshot_request(failing_path)
    request.respond_with_data("", status=404)

    fallback_urls = [
        httpserver.url_for(failing_path),
        "https://symerio.github.io/postal-codes-data/data/"
        "geonames/{country}.txt",
    ]
    monkeypatch.setattr(pgeocode, "DOWNLOAD_URL", fallback_urls)

    expected_warning = "IE.txt failed with: HTTP Error 404.*Trying next URL"
    with pytest.warns(UserWarning, match=expected_warning):
        Nominatim("ie")
    httpserver.check_assertions()