Skip to content

Commit

Permalink
Merge pull request #1279 from gboeing/cache
Browse files Browse the repository at this point in the history
refactor http caching and request pausing
  • Loading branch information
gboeing authored Feb 15, 2025
2 parents f9c6790 + 60f395f commit 29c06c0
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 76 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

## 2.0.2 (TBD)

- fix bug in parsing time when calculating pause duration between requests (#1277)
- fix bug where consolidate_intersections function would mutate the passed-in graph (#1273)
- fix bug in parsing time when calculating pause duration between requests (#1277)
- refactor internals of caching and pausing between requests (#1279)
- provide user-friendly error message if consolidate_intersections is run more than once (#1273)
- improve docstrings (#1272 #1274)

Expand Down
73 changes: 40 additions & 33 deletions osmnx/_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,22 @@ def _save_to_cache(
"""
Save a HTTP response JSON object to a file in the cache folder.
This calculates the checksum of `url` to generate the cache file name. If
the request was sent to server via POST instead of GET, then `url` should
If the request was sent to the server via POST instead of GET, then `url` should
be a GET-style representation of the request. Response is only saved to a
cache file if `settings.use_cache` is True, `response_json` is not None,
and `ok` is True.
cache file if `settings.use_cache` is True, `ok` is True, `response_json`
is not None, and `response_json` does not contain a server "remark."
Users should always pass OrderedDicts instead of dicts of parameters into
request functions, so the parameters remain in the same order each time,
producing the same URL string, and thus the same hash. Otherwise the cache
will eventually contain multiple saved responses for the same request
because the URL's parameters appeared in a different order each time.
producing the same URL string, and thus the same hash. Otherwise you will
get a cache miss when the URL's parameters appear in a different order.
Parameters
----------
url
The URL of the request.
response_json
The JSON response from the server.
The JSON HTTP response.
ok
A `requests.response.ok` value.
"""
Expand All @@ -59,51 +57,61 @@ def _save_to_cache(
msg = f"Did not save to cache because response contains remark: {response_json['remark']!r}"
utils.log(msg, lg.WARNING)
else:
# create the folder on the disk if it doesn't already exist
cache_folder = Path(settings.cache_folder)
cache_folder.mkdir(parents=True, exist_ok=True)

# hash the url to make the filename succinct but unique
# sha1 digest is 160 bits = 20 bytes = 40 hexadecimal characters
checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324
cache_filepath = cache_folder / f"{checksum}.json"

# dump to json, and save to file
# create cache folder on disk if it doesn't already exist
cache_filepath = _get_cache_filepath(url)
cache_filepath.parent.mkdir(parents=True, exist_ok=True)
cache_filepath.write_text(json.dumps(response_json), encoding="utf-8")
msg = f"Saved response to cache file {str(cache_filepath)!r}"
utils.log(msg, level=lg.INFO)


def _url_in_cache(url: str) -> Path | None:
def _get_cache_filepath(key: str, extension: str = "json") -> Path:
"""
Determine if a URL's response exists in the cache.
Determine a cache filepath for a key.
Calculates the checksum of `url` to determine the cache file's name.
Returns None if it cannot be found in the cache.
This uses the configured `settings.cache_folder` and calculates the 160-bit
SHA-1 hash digest (40 hexadecimal characters) of `key` to generate a
succinct but unique cache filename.
Parameters
----------
url
The URL to look for in the cache.
key
The key for which to generate a cache filepath, for example, a URL.
extension
The desired cache file extension.
Returns
-------
cache_filepath
Path to cached response for `url` if it exists, otherwise None.
Cache filepath corresponding to `key`.
"""
# hash the url to generate the cache filename
checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324
cache_filepath = Path(settings.cache_folder) / f"{checksum}.json"
digest = sha1(key.encode("utf-8")).hexdigest() # noqa: S324
return Path(settings.cache_folder) / f"{digest}.{extension}"


# if this file exists in the cache, return its full path
def _check_cache(key: str) -> Path | None:
"""
Check if a key exists in the cache, and return its cache filepath if so.
Parameters
----------
key
The key to look for in the cache.
Returns
-------
cache_filepath
Filepath to cached data for `key` if it exists, otherwise None.
"""
cache_filepath = _get_cache_filepath(key)
return cache_filepath if cache_filepath.is_file() else None


def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | None:
"""
Retrieve a HTTP response JSON object from the cache if it exists.
Returns None if there is a server remark in the cached response.
A cache hit returns the data. A cache miss returns None.
Parameters
----------
Expand All @@ -113,13 +121,12 @@ def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | No
Returns
-------
response_json
Cached response for `url` if it exists in the cache and does not
contain a server remark, otherwise None.
The cached response for `url` if it exists, otherwise None.
"""
# if the tool is configured to use the cache
if settings.use_cache:
# return cached response for this url if exists, otherwise return None
cache_filepath = _url_in_cache(url)
cache_filepath = _check_cache(url)
if cache_filepath is not None:
response_json: dict[str, Any] | list[dict[str, Any]]
response_json = json.loads(cache_filepath.read_text(encoding="utf-8"))
Expand Down
16 changes: 6 additions & 10 deletions osmnx/_nominatim.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def _nominatim_request(
params: OrderedDict[str, int | str],
*,
request_type: str = "search",
pause: float = 1,
error_pause: float = 60,
) -> list[dict[str, Any]]:
"""
Send a HTTP GET request to the Nominatim API and return response.
Expand All @@ -92,13 +90,8 @@ def _nominatim_request(
params
Key-value pairs of parameters.
request_type
{"search", "reverse", "lookup"}
Which Nominatim API endpoint to query.
pause
How long to pause before request, in seconds. Per the Nominatim usage
policy: "an absolute maximum of 1 request per second" is allowed.
error_pause
How long to pause in seconds before re-trying request if error.
Which Nominatim API endpoint to query, one of {"search", "reverse",
"lookup"}.
Returns
-------
Expand All @@ -120,7 +113,9 @@ def _nominatim_request(
if isinstance(cached_response_json, list):
return cached_response_json

# pause then request this URL
# how long to pause before request, in seconds. Per the Nominatim usage
# policy: "an absolute maximum of 1 request per second" is allowed.
pause = 1
hostname = _http._hostname_from_url(url)
msg = f"Pausing {pause} second(s) before making HTTP GET request to {hostname!r}"
utils.log(msg, level=lg.INFO)
Expand All @@ -139,6 +134,7 @@ def _nominatim_request(

# handle 429 and 504 errors by pausing then recursively re-trying request
if response.status_code in {429, 504}: # pragma: no cover
error_pause = 55
msg = (
f"{hostname!r} responded {response.status_code} {response.reason}: "
f"we'll retry in {error_pause} secs"
Expand Down
39 changes: 14 additions & 25 deletions osmnx/_overpass.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ def _get_network_filter(network_type: str) -> str:
def _get_overpass_pause(
base_endpoint: str,
*,
recursive_delay: float = 4,
default_duration: float = 50,
recursion_pause: float = 5,
default_pause: float = 60,
) -> float:
"""
Retrieve a pause duration from the Overpass API status endpoint.
Expand All @@ -155,19 +155,19 @@ def _get_overpass_pause(
----------
base_endpoint
Base Overpass API URL (without "/status" at the end).
recursive_delay
recursion_pause
How long to wait between recursive calls if the server is currently
running a query.
default_duration
If a fatal error occurs, fall back on returning this value.
default_pause
If a fatal error occurs, fall back on this liberal pause duration.
Returns
-------
pause
The current pause duration specified by the Overpass status endpoint.
"""
# if overpass rate limiting is False, then there is zero pause
if not settings.overpass_rate_limit:
# if overpass rate limiting is False, then there is zero pause
return 0

url = base_endpoint.rstrip("/") + "/status"
Expand All @@ -185,7 +185,7 @@ def _get_overpass_pause(
# cannot reach status endpoint: log error and return default duration
msg = f"Unable to reach {url}, {e}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

# try to parse the output
try:
Expand All @@ -195,7 +195,7 @@ def _get_overpass_pause(
# cannot parse output: log error and return default duration
msg = f"Unable to parse {url} response: {response_text}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

# determine the current status of the server
try:
Expand All @@ -215,16 +215,16 @@ def _get_overpass_pause(
pause = max(seconds, 1)

# if first token is 'Currently', it is currently running a query so
# check back in recursive_delay seconds
# check back in recursion_pause seconds
elif status_first_part == "Currently":
time.sleep(recursive_delay)
time.sleep(recursion_pause)
pause = _get_overpass_pause(base_endpoint)

# any other status is unrecognized: log error, return default duration
else:
msg = f"Unrecognized server status: {status!r}"
utils.log(msg, level=lg.ERROR)
return default_duration
return default_pause

return pause

Expand Down Expand Up @@ -428,25 +428,14 @@ def _download_overpass_features(
yield _overpass_request(OrderedDict(data=query_str))


def _overpass_request(
data: OrderedDict[str, Any],
*,
pause: float | None = None,
error_pause: float = 55,
) -> dict[str, Any]:
def _overpass_request(data: OrderedDict[str, Any]) -> dict[str, Any]:
"""
Send a HTTP POST request to the Overpass API and return response.
Parameters
----------
data
Key-value pairs of parameters.
pause
How long to pause in seconds before request. If None, will query API
status endpoint to find when next slot is available.
error_pause
How long to pause in seconds (in addition to `pause`) before re-trying
request if error.
Returns
-------
Expand All @@ -464,8 +453,7 @@ def _overpass_request(
return cached_response_json

# pause then request this URL
if pause is None:
pause = _get_overpass_pause(settings.overpass_url)
pause = _get_overpass_pause(settings.overpass_url)
hostname = _http._hostname_from_url(url)
msg = f"Pausing {pause} second(s) before making HTTP POST request to {hostname!r}"
utils.log(msg, level=lg.INFO)
Expand All @@ -484,6 +472,7 @@ def _overpass_request(

# handle 429 and 504 errors by pausing then recursively re-trying request
if response.status_code in {429, 504}: # pragma: no cover
error_pause = 55
msg = (
f"{hostname!r} responded {response.status_code} {response.reason}: "
f"we'll retry in {error_pause} secs"
Expand Down
6 changes: 2 additions & 4 deletions osmnx/elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging as lg
import multiprocessing as mp
import time
from hashlib import sha1
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
Expand Down Expand Up @@ -128,10 +127,9 @@ def _build_vrt_file(raster_paths: Iterable[str | Path]) -> Path:
msg = "rio-vrt must be installed as an optional dependency to build VRTs."
raise ImportError(msg)

# use the sha1 hash of the sorted filepaths as the VRT filename
# determine VRT cache filepath, from stringified sorted raster filepaths
raster_paths = sorted(raster_paths)
checksum = sha1(str(raster_paths).encode("utf-8")).hexdigest() # noqa: S324
vrt_path = Path(settings.cache_folder) / f"{checksum}.vrt"
vrt_path = _http._get_cache_filepath(str(raster_paths), "vrt")

# build the VRT file if it doesn't already exist in the cache
if not vrt_path.is_file():
Expand Down
3 changes: 0 additions & 3 deletions tests/lint_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@ set -euo pipefail
# delete temp files and folders
rm -r -f ./.coverage* ./.pytest_cache ./.temp ./dist ./docs/build ./*/__pycache__

# create all the configured environment/requirements files
python ./environments/make-env-files.py

# run the pre-commit hooks for linting/formatting
SKIP=no-commit-to-branch pre-commit run --all-files

Expand Down

0 comments on commit 29c06c0

Please sign in to comment.