Skip to content

Commit

Permalink
Merge pull request #1279 from gboeing/cache
Browse files Browse the repository at this point in the history
refactor http caching and request pausing
  • Loading branch information
gboeing authored Feb 15, 2025
2 parents f9c6790 + 60f395f commit 29c06c0
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 76 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

## 2.0.2 (TBD)

- fix bug in parsing time when calculating pause duration between requests (#1277)
- fix bug where consolidate_intersections function would mutate the passed-in graph (#1273)
- fix bug in parsing time when calculating pause duration between requests (#1277)
- refactor internals of caching and pausing between requests (#1279)
- provide user-friendly error message if consolidate_intersections is run more than once (#1273)
- improve docstrings (#1272 #1274)

Expand Down
73 changes: 40 additions & 33 deletions osmnx/_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,22 @@ def _save_to_cache(
"""
Save a HTTP response JSON object to a file in the cache folder.
This calculates the checksum of `url` to generate the cache file name. If
the request was sent to server via POST instead of GET, then `url` should
If the request was sent to the server via POST instead of GET, then `url` should
be a GET-style representation of the request. Response is only saved to a
cache file if `settings.use_cache` is True, `response_json` is not None,
and `ok` is True.
cache file if `settings.use_cache` is True, `ok` is True, `response_json`
is not None, and `response_json` does not contain a server "remark."
Users should always pass OrderedDicts instead of dicts of parameters into
request functions, so the parameters remain in the same order each time,
producing the same URL string, and thus the same hash. Otherwise the cache
will eventually contain multiple saved responses for the same request
because the URL's parameters appeared in a different order each time.
producing the same URL string, and thus the same hash. Otherwise you will
get a cache miss when the URL's parameters appear in a different order.
Parameters
----------
url
The URL of the request.
response_json
The JSON response from the server.
The JSON HTTP response.
ok
A `requests.response.ok` value.
"""
Expand All @@ -59,51 +57,61 @@ def _save_to_cache(
msg = f"Did not save to cache because response contains remark: {response_json['remark']!r}"
utils.log(msg, lg.WARNING)
else:
# create the folder on the disk if it doesn't already exist
cache_folder = Path(settings.cache_folder)
cache_folder.mkdir(parents=True, exist_ok=True)

# hash the url to make the filename succinct but unique
# sha1 digest is 160 bits = 20 bytes = 40 hexadecimal characters
checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324
cache_filepath = cache_folder / f"{checksum}.json"

# dump to json, and save to file
# create cache folder on disk if it doesn't already exist
cache_filepath = _get_cache_filepath(url)
cache_filepath.parent.mkdir(parents=True, exist_ok=True)
cache_filepath.write_text(json.dumps(response_json), encoding="utf-8")
msg = f"Saved response to cache file {str(cache_filepath)!r}"
utils.log(msg, level=lg.INFO)


def _url_in_cache(url: str) -> Path | None:
def _get_cache_filepath(key: str, extension: str = "json") -> Path:
"""
Determine if a URL's response exists in the cache.
Determine a cache filepath for a key.
Calculates the checksum of `url` to determine the cache file's name.
Returns None if it cannot be found in the cache.
This uses the configured `settings.cache_folder` and calculates the 160-bit
SHA-1 hash digest (40 hexadecimal characters) of `key` to generate a
succinct but unique cache filename.
Parameters
----------
url
The URL to look for in the cache.
key
The key for which to generate a cache filepath, for example, a URL.
extension
The desired cache file extension.
Returns
-------
cache_filepath
Path to cached response for `url` if it exists, otherwise None.
Cache filepath corresponding to `key`.
"""
# hash the url to generate the cache filename
checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324
cache_filepath = Path(settings.cache_folder) / f"{checksum}.json"
digest = sha1(key.encode("utf-8")).hexdigest() # noqa: S324
return Path(settings.cache_folder) / f"{digest}.{extension}"


# if this file exists in the cache, return its full path
def _check_cache(key: str) -> Path | None:
"""
Check if a key exists in the cache, and return its cache filepath if so.
Parameters
----------
key
The key to look for in the cache.
Returns
-------
cache_filepath
Filepath to cached data for `key` if it exists, otherwise None.
"""
cache_filepath = _get_cache_filepath(key)
return cache_filepath if cache_filepath.is_file() else None


def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | None:
"""
Retrieve a HTTP response JSON object from the cache if it exists.
Returns None if there is a server remark in the cached response.
A cache hit returns the data. A cache miss returns None.
Parameters
----------
Expand All @@ -113,13 +121,12 @@ def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | No
Returns
-------
response_json
Cached response for `url` if it exists in the cache and does not
contain a server remark, otherwise None.
The cached response for `url` if it exists, otherwise None.
"""
# if the tool is configured to use the cache
if settings.use_cache:
# return cached response for this url if exists, otherwise return None
cache_filepath = _url_in_cache(url)
cache_filepath = _check_cache(url)
if cache_filepath is not None:
response_json: dict[str, Any] | list[dict[str, Any]]
response_json = json.loads(cache_filepath.read_text(encoding="utf-8"))
Expand Down
16 changes: 6 additions & 10 deletions osmnx/_nominatim.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def _nominatim_request(
params: OrderedDict[str, int | str],
*,
request_type: str = "search",
pause: float = 1,
error_pause: float = 60,
) -> list[dict[str, Any]]:
"""
Send a HTTP GET request to the Nominatim API and return response.
Expand All @@ -92,13 +90,8 @@ def _nominatim_request(
params
Key-value pairs of parameters.
request_type
{"search", "reverse", "lookup"}
Which Nominatim API endpoint to query.
pause
How long to pause before request, in seconds. Per the Nominatim usage
policy: "an absolute maximum of 1 request per second" is allowed.
error_pause
How long to pause in seconds before re-trying request if error.
Which Nominatim API endpoint to query, one of {"search", "reverse",
"lookup"}.
Returns
-------
Expand All @@ -120,7 +113,9 @@ def _nominatim_request(
if isinstance(cached_response_json, list):
return cached_response_json

# pause then request this URL
# how long to pause before request, in seconds. Per the Nominatim usage
# policy: "an absolute maximum of 1 request per second" is allowed.
pause = 1
hostname = _http._hostname_from_url(url)
msg = f"Pausing {pause} second(s) before making HTTP GET request to {hostname!r}"
utils.log(msg, level=lg.INFO)
Expand All @@ -139,6 +134,7 @@ def _nominatim_request(

# handle 429 and 504 errors by pausing then recursively re-trying request
if response.status_code in {429, 504}: # pragma: no cover
error_pause = 55
msg = (
f"{hostname!r} responded {response.status_code} {response.reason}: "
f"we'll retry in {error_pause} secs"
Expand Down
39 changes: 14 additions & 25 deletions osmnx/_overpass.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ def _get_network_filter(network_type: str) -> str:
def _get_overpass_pause(
base_endpoint: str,
*,
recursive_delay: float = 4,
default_duration: float = 50,
recursion_pause: float = 5,
default_pause: float = 60,
) -> float:
"""
Retrieve a pause duration from the Overpass API status endpoint.
Expand All @@ -155,19 +155,19 @@ def _get_overpass_pause(
----------
base_endpoint
Base Overpass API URL (without "/status" at the end).
recursive_delay
recursion_pause
How long to wait between recursive calls if the server is currently
running a query.
default_duration
If a fatal error occurs, fall back on returning this value.
default_pause
If a fatal error occurs, fall back on this liberal pause duration.
Returns
-------
pause
The current pause duration specified by the Overpass status endpoint.
"""
# if overpass rate limiting is False, then there is zero pause
if not settings.overpass_rate_limit:
# if overpass rate limiting is False, then there is zero pause
return 0

url = base_endpoint.rstrip("/") + "/status"
Expand All @@ -185,7 +185,7 @@ def _get_overpass_pause(
# cannot reach status endpoint: log error and return default duration
msg = f"Unable to reach {url}, {e}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

# try to parse the output
try:
Expand All @@ -195,7 +195,7 @@ def _get_overpass_pause(
# cannot parse output: log error and return default duration
msg = f"Unable to parse {url} response: {response_text}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

# determine the current status of the server
try:
Expand All @@ -215,16 +215,16 @@ def _get_overpass_pause(
pause = max(seconds, 1)

# if first token is 'Currently', it is currently running a query so
# check back in recursive_delay seconds
# check back in recursion_pause seconds
elif status_first_part == "Currently":
time.sleep(recursive_delay)
time.sleep(recursion_pause)
pause = _get_overpass_pause(base_endpoint)

# any other status is unrecognized: log error, return default duration
else:
msg = f"Unrecognized server status: {status!r}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

return pause

Expand Down Expand Up @@ -428,25 +428,14 @@ def _download_overpass_features(
yield _overpass_request(OrderedDict(data=query_str))


def _overpass_request(
data: OrderedDict[str, Any],
*,
pause: float | None = None,
error_pause: float = 55,
) -> dict[str, Any]:
def _overpass_request(data: OrderedDict[str, Any]) -> dict[str, Any]:
"""
Send a HTTP POST request to the Overpass API and return response.
Parameters
----------
data
Key-value pairs of parameters.
pause
How long to pause in seconds before request. If None, will query API
status endpoint to find when next slot is available.
error_pause
How long to pause in seconds (in addition to `pause`) before re-trying
request if error.
Returns
-------
Expand All @@ -464,8 +453,7 @@ def _overpass_request(
return cached_response_json

# pause then request this URL
if pause is None:
pause = _get_overpass_pause(settings.overpass_url)
pause = _get_overpass_pause(settings.overpass_url)
hostname = _http._hostname_from_url(url)
msg = f"Pausing {pause} second(s) before making HTTP POST request to {hostname!r}"
utils.log(msg, level=lg.INFO)
Expand All @@ -484,6 +472,7 @@ def _overpass_request(

# handle 429 and 504 errors by pausing then recursively re-trying request
if response.status_code in {429, 504}: # pragma: no cover
error_pause = 55
msg = (
f"{hostname!r} responded {response.status_code} {response.reason}: "
f"we'll retry in {error_pause} secs"
Expand Down
6 changes: 2 additions & 4 deletions osmnx/elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging as lg
import multiprocessing as mp
import time
from hashlib import sha1
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
Expand Down Expand Up @@ -128,10 +127,9 @@ def _build_vrt_file(raster_paths: Iterable[str | Path]) -> Path:
msg = "rio-vrt must be installed as an optional dependency to build VRTs."
raise ImportError(msg)

# use the sha1 hash of the sorted filepaths as the VRT filename
# determine VRT cache filepath, from stringified sorted raster filepaths
raster_paths = sorted(raster_paths)
checksum = sha1(str(raster_paths).encode("utf-8")).hexdigest() # noqa: S324
vrt_path = Path(settings.cache_folder) / f"{checksum}.vrt"
vrt_path = _http._get_cache_filepath(str(raster_paths), "vrt")

# build the VRT file if it doesn't already exist in the cache
if not vrt_path.is_file():
Expand Down
3 changes: 0 additions & 3 deletions tests/lint_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ set -euo pipefail
# delete temp files and folders
rm -r -f ./.coverage* ./.pytest_cache ./.temp ./dist ./docs/build ./*/__pycache__

# create all the configured environment/requirements files
python ./environments/make-env-files.py

# run the pre-commit hooks for linting/formatting
SKIP=no-commit-to-branch pre-commit run --all-files

Expand Down

0 comments on commit 29c06c0

Please sign in to comment.