From 1456727abc66fe51bf9f9627d6af69ad17c33e2e Mon Sep 17 00:00:00 2001 From: Panos Mavrogiorgos <pmav99@gmail.com> Date: Wed, 26 Jun 2024 13:22:39 +0300 Subject: [PATCH 01/15] chore: Update pre-commit --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fdaf062..053a4d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,7 +41,7 @@ repos: - id: "shellcheck" - repo: "https://github.com/python-jsonschema/check-jsonschema" - rev: "0.28.5" + rev: "0.28.6" hooks: - id: "check-github-workflows" - id: "check-readthedocs" @@ -60,7 +60,7 @@ repos: - repo: "https://github.com/charliermarsh/ruff-pre-commit" # Ruff version. - rev: 'v0.4.9' + rev: 'v0.4.10' hooks: - id: "ruff" From 0209bcdf7d78a3b066aa0eff2c9e09de7f61ddea Mon Sep 17 00:00:00 2001 From: Panos Mavrogiorgos <pmav99@gmail.com> Date: Wed, 26 Jun 2024 13:37:29 +0300 Subject: [PATCH 02/15] ci: Use release/v1 branch for build.yml Fixes #106 [skip ci] --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 41d02e4..b3e65c7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,13 +13,13 @@ jobs: - uses: actions/checkout@main - uses: actions/setup-python@main with: - python-version: '3.x' + python-version: "3.x" - uses: actions/cache@main with: path: ${{ env.pythonLocation }} key: build-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml', 'setup.*') }} - run: pip wheel . --no-deps -w dist - - uses: pypa/gh-action-pypi-publish@master + - uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} From 958ae13ab2678c3fab454b6fd1a99ce9e8261390 Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 21:56:34 +0100 Subject: [PATCH 03/15] Add fetch_usace_station function to init --- searvey/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/searvey/__init__.py b/searvey/__init__.py index 5d621e9..8e2c628 100644 --- a/searvey/__init__.py +++ b/searvey/__init__.py @@ -10,7 +10,7 @@ from searvey.stations import get_stations from searvey.stations import Provider from searvey.usgs import get_usgs_stations - +from searvey._usace_api import fetch_usace_station __version__ = importlib.metadata.version(__name__) @@ -24,4 +24,5 @@ "get_usgs_stations", "Provider", "__version__", + "fetch_usace_station", ] From d2d9edaeeb12aed9eb00355755ef7a51fd6bd499 Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 21:58:43 +0100 Subject: [PATCH 04/15] Add usace api and fetch station method --- searvey/_usace_api.py | 61 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 searvey/_usace_api.py diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py new file mode 100644 index 0000000..d5694f0 --- /dev/null +++ b/searvey/_usace_api.py @@ -0,0 +1,61 @@ +import logging +import xml.etree.ElementTree as ET +from datetime import datetime +from collections import abc + +import httpx +import multifutures +import pandas as pd + +from ._common import _fetch_url, _resolve_end_date, _resolve_http_client, _resolve_rate_limit, _resolve_start_date, _to_utc +from .custom_types import DatetimeLike + +logger = logging.getLogger(__name__) + +BASE_URL = 
"https://rivergages.mvr.usace.army.mil/watercontrol/webservices/rest/webserviceWaterML.cfc?method=RGWML&meth=getValues&location={location}&site={site}&variable={variable}&beginDate={begin_date}&endDate={end_date}&authToken=RiverGages" + +def fetch_usace_station( + station_id: str, + start_date: DatetimeLike | None = None, + end_date: DatetimeLike | None = None, + *, + rate_limit: multifutures.RateLimit | None = None, + http_client: httpx.Client | None = None, + multiprocessing_executor: multifutures.ExecutorProtocol | None = None, + multithreading_executor: multifutures.ExecutorProtocol | None = None, +) -> pd.DataFrame: + """ + Make a query to the USACE API for river gauge data for ``station_id`` + and return the results as a ``pandas.DataFrame``. + + :param station_id: The station identifier. + :param start_date: The starting date of the query. Defaults to 7 days ago. + :param end_date: The finishing date of the query. Defaults to "now". + :param variable: The variable to fetch. Defaults to "HG" (gauge height). + :param rate_limit: The rate limit for making requests to the USACE servers. + :param http_client: The ``httpx.Client``. + :param multiprocessing_executor: An instance of a class implementing the ``concurrent.futures.Executor`` API. + :param multithreading_executor: An instance of a class implementing the ``concurrent.futures.Executor`` API. + """ + logger.info("USACE-%s: Starting scraping: %s - %s", station_id, start_date, end_date) + now = pd.Timestamp.now("utc") + try: + df = _fetch_usace( + station_ids=[station_id], + start_dates=_resolve_start_date(now, start_date), + end_dates=_resolve_end_date(now, end_date), + rate_limit=rate_limit, + http_client=http_client, + multiprocessing_executor=multiprocessing_executor, + multithreading_executor=multithreading_executor, + ).get(station_id, pd.DataFrame()) + except Exception as e: + logger.error(f"USACE-{station_id}: An error occurred while fetching data: {str(e)}") + df = pd.DataFrame() + + if df.empty: + logger.warning(f"USACE-{station_id}: No data retrieved for the specified period.") + else: + logger.info("USACE-%s: Finished scraping: %s - %s", station_id, start_date, end_date) + + return df \ No newline at end of file From c26f1078adcf63b06630241b3a23525ce0932c81 Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 22:00:35 +0100 Subject: [PATCH 05/15] Add _fetch_usace function for retrieving USACE data --- searvey/_usace_api.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py index d5694f0..adf0e84 100644 --- a/searvey/_usace_api.py +++ b/searvey/_usace_api.py @@ -14,6 +14,44 @@ BASE_URL = "https://rivergages.mvr.usace.army.mil/watercontrol/webservices/rest/webserviceWaterML.cfc?method=RGWML&meth=getValues&location={location}&site={site}&variable={variable}&beginDate={begin_date}&endDate={end_date}&authToken=RiverGages" +def _fetch_usace( + station_ids: abc.Collection[str], + start_dates: pd.DatetimeIndex, + end_dates: pd.DatetimeIndex, + *, + rate_limit: multifutures.RateLimit | None, + http_client: httpx.Client | None, + multiprocessing_executor: multifutures.ExecutorProtocol | None, + multithreading_executor: multifutures.ExecutorProtocol | None, +) -> dict[str, pd.DataFrame]: + rate_limit = _resolve_rate_limit(rate_limit) + http_client = _resolve_http_client(http_client) + start_dates = _to_utc(start_dates) + end_dates = _to_utc(end_dates) + + usace_responses = _retrieve_usace_data( + 
station_ids=station_ids, + start_dates=start_dates, + end_dates=end_dates, + rate_limit=rate_limit, + http_client=http_client, + executor=multithreading_executor, + ) + + dataframes = {} + for response in usace_responses: + station_id = response.kwargs["station_id"] + if response.exception: + logger.error(f"USACE-{station_id}: Failed to retrieve data. Error: {response.exception}") + continue + df = _parse_xml_data(response.result, station_id) + if not df.empty: + dataframes[station_id] = df + else: + logger.warning(f"USACE-{station_id}: No data retrieved or parsed.") + + return dataframes + def fetch_usace_station( station_id: str, start_date: DatetimeLike | None = None, From 660ab6dc45410d437c248afcec5b68f0bdc69bdf Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 22:01:36 +0100 Subject: [PATCH 06/15] Add _retrieve_usace_data and url generation functions for retrieving USACE data --- searvey/_usace_api.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py index adf0e84..0975869 100644 --- a/searvey/_usace_api.py +++ b/searvey/_usace_api.py @@ -14,6 +14,56 @@ BASE_URL = "https://rivergages.mvr.usace.army.mil/watercontrol/webservices/rest/webserviceWaterML.cfc?method=RGWML&meth=getValues&location={location}&site={site}&variable={variable}&beginDate={begin_date}&endDate={end_date}&authToken=RiverGages" + +def _generate_urls( + station_id: str, + start_date: pd.Timestamp, + end_date: pd.Timestamp, +) -> list[str]: + if end_date < start_date: + raise ValueError(f"'end_date' must be after 'start_date': {end_date} vs {start_date}") + if end_date == start_date: + return [] + + url = BASE_URL.format( + location=station_id, + site=station_id, + variable="HG", + begin_date=start_date.strftime("%Y-%m-%dT%H:%M"), + end_date=end_date.strftime("%Y-%m-%dT%H:%M") + ) + print(url) + return [url] + +def _retrieve_usace_data( + station_ids: abc.Collection[str], + start_dates: abc.Collection[pd.Timestamp], + end_dates: abc.Collection[pd.Timestamp], + rate_limit: multifutures.RateLimit, + http_client: httpx.Client, + executor: multifutures.ExecutorProtocol | None, +) -> list[multifutures.FutureResult]: + kwargs = [] + for station_id, start_date, end_date in zip(station_ids, start_dates, end_dates): + for url in _generate_urls(station_id=station_id, start_date=start_date, end_date=end_date): + if url: + kwargs.append( + dict( + station_id=station_id, + url=url, + client=http_client, + rate_limit=rate_limit, + ), + ) + with http_client: + logger.debug("Starting data retrieval") + results = multifutures.multithread( + func=_fetch_url, func_kwargs=kwargs, check=False, executor=executor + ) + logger.debug("Finished data retrieval") + return results + + def _fetch_usace( station_ids: abc.Collection[str], start_dates: pd.DatetimeIndex, From 49d4db5df1dd7a269418e7624a20579e493edc6a Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 22:02:09 +0100 Subject: [PATCH 07/15] Add XML data parsing function for USACE API --- searvey/_usace_api.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py index 0975869..e77d56c 100644 --- a/searvey/_usace_api.py +++ b/searvey/_usace_api.py @@ -14,6 +14,31 @@ BASE_URL = 
"https://rivergages.mvr.usace.army.mil/watercontrol/webservices/rest/webserviceWaterML.cfc?method=RGWML&meth=getValues&location={location}&site={site}&variable={variable}&beginDate={begin_date}&endDate={end_date}&authToken=RiverGages" +def _parse_xml_data(content: str, station_id: str) -> pd.DataFrame: + try: + namespace = {'wml': 'http://www.cuahsi.org/waterML/1.0/'} + root = ET.fromstring(content) + values_element = root.find(".//wml:values", namespaces=namespace) + + if values_element is None: + logger.warning(f"{station_id}: No 'values' element found in the XML.") + return pd.DataFrame() + + data = [] + for value_element in values_element.findall("wml:value", namespaces=namespace): + date_time = value_element.get("dateTime") + value = value_element.text + date_time_obj = datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S") + data.append({'time': date_time_obj, 'value': float(value)}) + + df = pd.DataFrame(data) + df.set_index('time', inplace=True) + df.index = pd.to_datetime(df.index, utc=True) + df.attrs["station_id"] = f"USACE-{station_id}" + return df + except ET.ParseError: + logger.error(f"{station_id}: Failed to parse XML data.") + return pd.DataFrame() def _generate_urls( station_id: str, From e78ca7c09c2bd723c6da030c0ad9e8814d7e182d Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 22:11:19 +0100 Subject: [PATCH 08/15] Add example code for Army Corps WL --- USACE_data.ipynb | 125 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 USACE_data.ipynb diff --git a/USACE_data.ipynb b/USACE_data.ipynb new file mode 100644 index 0000000..68f62f0 --- /dev/null +++ b/USACE_data.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up for Army Corps WL data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import httpx\n", + "from searvey._usace_api import fetch_usace_station\n", + "\n", + "logging.basicConfig(\n", + " level=20,\n", + " style=\"{\",\n", + " format=\"{asctime:s}; {levelname:8s}; {threadName:23s}; {name:<25s} {lineno:5d}; {message:s}\",\n", + ")\n", + "\n", + "logging.getLogger(\"urllib3\").setLevel(30)\n", + "logging.getLogger(\"parso\").setLevel(30)\n", + "\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch WL data from a single station" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Define start and end dates for data retrieval\n", + "\n", + "df = fetch_usace_station(\"01300\", start_date=\"2020-04-05\", end_date=\"2020-04-10\",http_client=httpx.Client(verify=False))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fetch Army Corps Water Level Data from multiple station" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from searvey._usace_api import _fetch_usace\n", + "import httpx\n", + "import pandas as pd\n", + "# df = _fetch_usace(station_ids=[\"01300\"], start_dates=\"2020-04-05\", end_dates=\"2020-04-10\",http_client=httpx.Client(verify=False))\n", + "\n", + "\n", + "df = _fetch_usace(\n", + " station_ids=[\"01300\"],\n", + " start_dates=pd.DatetimeIndex([\"2020-04-05\"]),\n", + " 
end_dates=pd.DatetimeIndex([\"2020-04-10\"]),\n", + " rate_limit=None,\n", + " http_client=httpx.Client(verify=False),\n", + " multiprocessing_executor=None,\n", + " multithreading_executor=None\n", + ")\n", + "df['01300']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import hvplot.pandas\n", + "\n", + "df[\"01300\"].hvplot(title=\"Army Corps WL values\", xlabel=\"Index\", ylabel=\"Value\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 5f7e7d866c88009de556c580add7e410e9e3bbac Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Sun, 18 Aug 2024 22:37:37 +0100 Subject: [PATCH 09/15] Move USACE example file into example folder --- USACE_data.ipynb => examples/USACE_data.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename USACE_data.ipynb => examples/USACE_data.ipynb (100%) diff --git a/USACE_data.ipynb b/examples/USACE_data.ipynb similarity index 100% rename from USACE_data.ipynb rename to examples/USACE_data.ipynb From 61b209210f09dcbe3de9a80793ccd02b9836f265 Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Tue, 20 Aug 2024 00:14:44 +0100 Subject: [PATCH 10/15] Update USACE examples and comments --- examples/USACE_data.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/USACE_data.ipynb b/examples/USACE_data.ipynb index 68f62f0..8ca959c 100644 --- a/examples/USACE_data.ipynb +++ b/examples/USACE_data.ipynb @@ -46,9 +46,11 @@ "outputs": [], "source": [ "\n", - "# Define start and end dates for data retrieval\n", + "# Define start and end dates for data retrieval, you can use either datetime.date or string for the dates\n", + "import datetime\n", + "df = fetch_usace_station(\"01300\", datetime.date(2020, 4, 5), end_date=\"2020-04-10\",http_client=httpx.Client(verify=False))\n", "\n", - "df = fetch_usace_station(\"01300\", start_date=\"2020-04-05\", end_date=\"2020-04-10\",http_client=httpx.Client(verify=False))\n" + "df" ] }, { @@ -67,19 +69,17 @@ "from searvey._usace_api import _fetch_usace\n", "import httpx\n", "import pandas as pd\n", - "# df = _fetch_usace(station_ids=[\"01300\"], start_dates=\"2020-04-05\", end_dates=\"2020-04-10\",http_client=httpx.Client(verify=False))\n", - "\n", "\n", "df = _fetch_usace(\n", " station_ids=[\"01300\"],\n", - " start_dates=pd.DatetimeIndex([\"2020-04-05\"]),\n", - " end_dates=pd.DatetimeIndex([\"2020-04-10\"]),\n", + " start_dates=[\"2020-04-05\"],\n", + " end_dates=[\"2020-04-10\"],\n", " rate_limit=None,\n", " http_client=httpx.Client(verify=False),\n", " multiprocessing_executor=None,\n", " multithreading_executor=None\n", ")\n", - "df['01300']" + "df['01300']\n" ] }, { From 85dfc4610511514a8b5a818bd21b2f054450bd8d Mon Sep 17 00:00:00 2001 From: abdu558 <arefabdu55@gmail.com> Date: Tue, 20 Aug 2024 00:39:17 +0100 Subject: [PATCH 11/15] update usace to handle multiple start and end dates --- searvey/_usace_api.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git 
a/searvey/_usace_api.py b/searvey/_usace_api.py
index e77d56c..9aaa18f 100644
--- a/searvey/_usace_api.py
+++ b/searvey/_usace_api.py
@@ -2,7 +2,9 @@
 import xml.etree.ElementTree as ET
 from datetime import datetime
 from collections import abc
-
+from typing import List
+from typing import Union
+from searvey.custom_types import DatetimeLike
 import httpx
 import multifutures
 import pandas as pd
@@ -57,7 +59,6 @@ def _generate_urls(
         begin_date=start_date.strftime("%Y-%m-%dT%H:%M"),
         end_date=end_date.strftime("%Y-%m-%dT%H:%M")
     )
-    print(url)
     return [url]

 def _retrieve_usace_data(
@@ -71,6 +72,7 @@ def _retrieve_usace_data(
     kwargs = []
     for station_id, start_date, end_date in zip(station_ids, start_dates, end_dates):
         for url in _generate_urls(station_id=station_id, start_date=start_date, end_date=end_date):
+            logger.info("USACE-%s: Starting scraping: %s - %s", station_id, start_date, end_date)
             if url:
                 kwargs.append(
                     dict(
@@ -91,8 +93,8 @@ def _retrieve_usace_data(

 def _fetch_usace(
     station_ids: abc.Collection[str],
-    start_dates: pd.DatetimeIndex,
-    end_dates: pd.DatetimeIndex,
+    start_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
+    end_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
     *,
     rate_limit: multifutures.RateLimit | None,
     http_client: httpx.Client | None,
@@ -101,8 +103,15 @@ def _fetch_usace(
 ) -> dict[str, pd.DataFrame]:
     rate_limit = _resolve_rate_limit(rate_limit)
     http_client = _resolve_http_client(http_client)
-    start_dates = _to_utc(start_dates)
-    end_dates = _to_utc(end_dates)
+
+    now = pd.Timestamp.now("utc")
+
+    start_dates = [start_dates] if not isinstance(start_dates, list) else start_dates
+    end_dates = [end_dates] if not isinstance(end_dates, list) else end_dates
+
+    #we get the first index because the output is (DatetimeIndex(['2020-04-05'], dtype='datetime64[ns]', freq=None)
+    start_dates = [_resolve_start_date(now, date)[0] for date in start_dates]
+    end_dates = [_resolve_end_date(now, date)[0] for date in end_dates]

     usace_responses = _retrieve_usace_data(
         station_ids=station_ids,
@@ -142,21 +151,19 @@ def fetch_usace_station(
     and return the results as a ``pandas.DataFrame``.

     :param station_id: The station identifier.
-    :param start_date: The starting date of the query. Defaults to 7 days ago.
-    :param end_date: The finishing date of the query. Defaults to "now".
-    :param variable: The variable to fetch. Defaults to "HG" (gauge height).
+    :param start_date: The starting date of the query.
+    :param end_date: The finishing date of the query.
     :param rate_limit: The rate limit for making requests to the USACE servers.
-    :param http_client: The ``httpx.Client``.
+    :param http_client: The ``httpx.Client``. It should be created with ``verify=False``.
     :param multiprocessing_executor: An instance of a class implementing the ``concurrent.futures.Executor`` API.
     :param multithreading_executor: An instance of a class implementing the ``concurrent.futures.Executor`` API.
     """
     logger.info("USACE-%s: Starting scraping: %s - %s", station_id, start_date, end_date)
-    now = pd.Timestamp.now("utc")
     try:
         df = _fetch_usace(
             station_ids=[station_id],
-            start_dates=_resolve_start_date(now, start_date),
-            end_dates=_resolve_end_date(now, end_date),
+            start_dates=start_date,
+            end_dates=end_date,
             rate_limit=rate_limit,
             http_client=http_client,
             multiprocessing_executor=multiprocessing_executor,

From 7ec7723aa1c6138d432c2604afe02ab68c1ba1a8 Mon Sep 17 00:00:00 2001
From: abdu558 <arefabdu55@gmail.com>
Date: Tue, 20 Aug 2024 00:39:40 +0100
Subject: [PATCH 12/15] Add tests for USACE API functions

---
 tests/usace_test.py | 70 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 tests/usace_test.py

diff --git a/tests/usace_test.py b/tests/usace_test.py
new file mode 100644
index 0000000..e45c35a
--- /dev/null
+++ b/tests/usace_test.py
@@ -0,0 +1,70 @@
+import pandas as pd
+import pytest
+import httpx
+from unittest.mock import patch, MagicMock
+
+from searvey._usace_api import fetch_usace_station, _fetch_usace, _generate_urls
+
+
+def test_generate_urls():
+    station_id = "01300"
+    start_date = pd.Timestamp("2020-04-05")
+    end_date = pd.Timestamp("2020-04-10")
+
+    urls = _generate_urls(station_id, start_date, end_date)
+
+    assert len(urls) == 1
+    assert station_id in urls[0]
+    assert "2020-04-05" in urls[0]
+    assert "2020-04-10" in urls[0]
+
+def test_fetch_usace():
+    result = _fetch_usace(
+        station_ids=["01300"],
+        start_dates=["2020-04-05"],
+        end_dates=["2020-04-10"],
+        rate_limit=None,
+        http_client=httpx.Client(verify=False),
+        multiprocessing_executor=None,
+        multithreading_executor=None
+    )
+    assert "01300" in result
+    assert isinstance(result["01300"], pd.DataFrame)
+    assert len(result) == 1
+
+@patch('searvey._usace_api._fetch_usace')
+def test_fetch_usace_station(mock_fetch):
+    mock_df = pd.DataFrame({
+        'value': [10.5, 11.2, 10.8]
+    }, index=pd.date_range("2020-04-05", periods=3, freq='D'))
+    mock_df.index.name = 'time'
+    mock_df.attrs["station_id"] = "USACE-01300"
+
+    mock_fetch.return_value = {"01300": mock_df}
+
+    result = fetch_usace_station(
+        "01300",
+        start_date="2020-04-05",
+        end_date="2020-04-10",
+        http_client=httpx.Client(verify=False)
+    )
+
+    assert isinstance(result, pd.DataFrame)
+    assert len(result) == 3
+    assert result.index.name == 'time'
+    assert 'value' in result.columns
+    assert result.attrs["station_id"] == "USACE-01300"
+
+def test_fetch_usace_station_error_handling():
+    with patch('searvey._usace_api._fetch_usace', side_effect=Exception("API Error")):
+        result = fetch_usace_station(
+            "01300",
+            start_date="2020-04-05",
+            end_date="2020-04-10",
+            http_client=httpx.Client(verify=False)
+        )
+        assert result.empty
+
+if __name__ == "__main__":
+    pytest.main()
+

From 9a187d3a6a586c26b42e8c33e8e46eb94b0dd935 Mon Sep 17 00:00:00 2001
From: abdu558 <arefabdu55@gmail.com>
Date: Tue, 20 Aug 2024 00:51:14 +0100
Subject: [PATCH 13/15] Apply pre-commit changes

---
 searvey/__init__.py   |  3 ++-
 searvey/_usace_api.py | 26 +++++++++++++++++---------
 tests/usace_test.py   | 42 +++++++++++++++++++++---------------------
 3 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/searvey/__init__.py b/searvey/__init__.py
index 8e2c628..0aded10 100644
--- a/searvey/__init__.py
+++ b/searvey/__init__.py
@@ -4,13 +4,14 @@
 from searvey._coops_api import fetch_coops_station
 from searvey._ioc_api import fetch_ioc_station
+from searvey._usace_api import 
fetch_usace_station from searvey.coops import get_coops_stations from searvey.ioc import get_ioc_data from searvey.ioc import get_ioc_stations from searvey.stations import get_stations from searvey.stations import Provider from searvey.usgs import get_usgs_stations -from searvey._usace_api import fetch_usace_station + __version__ = importlib.metadata.version(__name__) diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py index 9aaa18f..d07aa43 100644 --- a/searvey/_usace_api.py +++ b/searvey/_usace_api.py @@ -1,24 +1,29 @@ import logging import xml.etree.ElementTree as ET -from datetime import datetime from collections import abc +from datetime import datetime from typing import List from typing import Union -from searvey.custom_types import DatetimeLike + import httpx import multifutures import pandas as pd -from ._common import _fetch_url, _resolve_end_date, _resolve_http_client, _resolve_rate_limit, _resolve_start_date, _to_utc +from ._common import _fetch_url +from ._common import _resolve_end_date +from ._common import _resolve_http_client +from ._common import _resolve_rate_limit +from ._common import _resolve_start_date from .custom_types import DatetimeLike logger = logging.getLogger(__name__) BASE_URL = "https://rivergages.mvr.usace.army.mil/watercontrol/webservices/rest/webserviceWaterML.cfc?method=RGWML&meth=getValues&location={location}&site={site}&variable={variable}&beginDate={begin_date}&endDate={end_date}&authToken=RiverGages" + def _parse_xml_data(content: str, station_id: str) -> pd.DataFrame: try: - namespace = {'wml': 'http://www.cuahsi.org/waterML/1.0/'} + namespace = {"wml": "http://www.cuahsi.org/waterML/1.0/"} root = ET.fromstring(content) values_element = root.find(".//wml:values", namespaces=namespace) @@ -31,10 +36,10 @@ def _parse_xml_data(content: str, station_id: str) -> pd.DataFrame: date_time = value_element.get("dateTime") value = value_element.text date_time_obj = datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S") - data.append({'time': date_time_obj, 'value': float(value)}) + data.append({"time": date_time_obj, "value": float(value)}) df = pd.DataFrame(data) - df.set_index('time', inplace=True) + df.set_index("time", inplace=True) df.index = pd.to_datetime(df.index, utc=True) df.attrs["station_id"] = f"USACE-{station_id}" return df @@ -42,6 +47,7 @@ def _parse_xml_data(content: str, station_id: str) -> pd.DataFrame: logger.error(f"{station_id}: Failed to parse XML data.") return pd.DataFrame() + def _generate_urls( station_id: str, start_date: pd.Timestamp, @@ -57,10 +63,11 @@ def _generate_urls( site=station_id, variable="HG", begin_date=start_date.strftime("%Y-%m-%dT%H:%M"), - end_date=end_date.strftime("%Y-%m-%dT%H:%M") + end_date=end_date.strftime("%Y-%m-%dT%H:%M"), ) return [url] + def _retrieve_usace_data( station_ids: abc.Collection[str], start_dates: abc.Collection[pd.Timestamp], @@ -109,7 +116,7 @@ def _fetch_usace( start_dates = [start_dates] if not isinstance(start_dates, list) else start_dates end_dates = [end_dates] if not isinstance(end_dates, list) else end_dates - #we get the first index because the output is (DatetimeIndex(['2020-04-05'], dtype='datetime64[ns]', freq=None) + # we get the first index because the output is (DatetimeIndex(['2020-04-05'], dtype='datetime64[ns]', freq=None) start_dates = [_resolve_start_date(now, date)[0] for date in start_dates] end_dates = [_resolve_end_date(now, date)[0] for date in end_dates] @@ -136,6 +143,7 @@ def _fetch_usace( return dataframes + def fetch_usace_station( station_id: str, start_date: 
DatetimeLike | None = None,
@@ -178,4 +186,4 @@ def fetch_usace_station(
     else:
         logger.info("USACE-%s: Finished scraping: %s - %s", station_id, start_date, end_date)

-    return df
\ No newline at end of file
+    return df
diff --git a/tests/usace_test.py b/tests/usace_test.py
index e45c35a..0e79fb2 100644
--- a/tests/usace_test.py
+++ b/tests/usace_test.py
@@ -1,9 +1,12 @@
+from unittest.mock import patch
+
+import httpx
 import pandas as pd
 import pytest
-import httpx
-from unittest.mock import patch, MagicMock

-from searvey._usace_api import fetch_usace_station, _fetch_usace, _generate_urls
+from searvey._usace_api import _fetch_usace
+from searvey._usace_api import _generate_urls
+from searvey._usace_api import fetch_usace_station


 def test_generate_urls():
@@ -18,6 +21,7 @@ def test_generate_urls():
     assert "2020-04-05" in urls[0]
     assert "2020-04-10" in urls[0]

+
 def test_fetch_usace():
     result = _fetch_usace(
         station_ids=["01300"],
@@ -26,45 +30,41 @@ def test_fetch_usace():
         rate_limit=None,
         http_client=httpx.Client(verify=False),
         multiprocessing_executor=None,
-        multithreading_executor=None
+        multithreading_executor=None,
     )
     assert "01300" in result
     assert isinstance(result["01300"], pd.DataFrame)
     assert len(result) == 1

-@patch('searvey._usace_api._fetch_usace')
+
+@patch("searvey._usace_api._fetch_usace")
 def test_fetch_usace_station(mock_fetch):
-    mock_df = pd.DataFrame({
-        'value': [10.5, 11.2, 10.8]
-    }, index=pd.date_range("2020-04-05", periods=3, freq='D'))
-    mock_df.index.name = 'time'
+    mock_df = pd.DataFrame(
+        {"value": [10.5, 11.2, 10.8]}, index=pd.date_range("2020-04-05", periods=3, freq="D")
+    )
+    mock_df.index.name = "time"
     mock_df.attrs["station_id"] = "USACE-01300"

     mock_fetch.return_value = {"01300": mock_df}

     result = fetch_usace_station(
-        "01300",
-        start_date="2020-04-05",
-        end_date="2020-04-10",
-        http_client=httpx.Client(verify=False)
+        "01300", start_date="2020-04-05", end_date="2020-04-10", http_client=httpx.Client(verify=False)
     )

     assert isinstance(result, pd.DataFrame)
     assert len(result) == 3
-    assert result.index.name == 'time'
-    assert 'value' in result.columns
+    assert result.index.name == "time"
+    assert "value" in result.columns
     assert result.attrs["station_id"] == "USACE-01300"

+
 def test_fetch_usace_station_error_handling():
-    with patch('searvey._usace_api._fetch_usace', side_effect=Exception("API Error")):
+    with patch("searvey._usace_api._fetch_usace", side_effect=Exception("API Error")):
         result = fetch_usace_station(
-            "01300",
-            start_date="2020-04-05",
-            end_date="2020-04-10",
-            http_client=httpx.Client(verify=False)
+            "01300", start_date="2020-04-05", end_date="2020-04-10", http_client=httpx.Client(verify=False)
         )
         assert result.empty

+
 if __name__ == "__main__":
     pytest.main()
-

From 4cbff3386ac961e1e80749e9a3ebd809099be609 Mon Sep 17 00:00:00 2001
From: abdu558 <arefabdu55@gmail.com>
Date: Tue, 20 Aug 2024 01:04:39 +0100
Subject: [PATCH 14/15] Update documentation for Army Corps WL

---
 README.md             |  1 +
 docs/source/usace.rst | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 docs/source/usace.rst

diff --git a/README.md b/README.md
index 14123a6..8842a7b 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Searvey aims to provide the following functionality:
 - U.S. Center for Operational Oceanographic Products and Services (CO-OPS)
 - Flanders Marine Institute (VLIZ); Intergovernmental Oceanographic Commission (IOC)
 - U.S. 
Geological Survey (USGS)
+- U.S. Army Corps of Engineers (USACE) water levels

 ## Installation

diff --git a/docs/source/usace.rst b/docs/source/usace.rst
new file mode 100644
index 0000000..c94147c
--- /dev/null
+++ b/docs/source/usace.rst
@@ -0,0 +1,17 @@
+USACE RiverGages
+================
+The U.S. Army Corps of Engineers `RiverGages <https://rivergages.mvr.usace.army.mil/>`_
+system provides water level data for rivers and waterways across the United States.
+searvey uses the RiverGages REST API to access this data. Currently, only water level
+data is exposed in searvey.
+
+The data from an individual station can be retrieved with:
+
+.. autofunction:: searvey.fetch_usace_station
+
+You can fetch data from multiple stations and multiple date ranges with:
+
+.. autofunction:: searvey._usace_api._fetch_usace
+
+Note: The ``verify=False`` parameter passed to ``httpx.Client()`` is used to bypass
+SSL verification, which is currently the only way to access the USACE RiverGages API.
\ No newline at end of file

From 95331b9d82f60df950bc244544c8909518fbc9a1 Mon Sep 17 00:00:00 2001
From: abdu558 <arefabdu55@gmail.com>
Date: Tue, 20 Aug 2024 08:16:21 +0100
Subject: [PATCH 15/15] Improve compatibility by adding Optional

---
 searvey/_usace_api.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/searvey/_usace_api.py b/searvey/_usace_api.py
index d07aa43..23f12fb 100644
--- a/searvey/_usace_api.py
+++ b/searvey/_usace_api.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from typing import List
 from typing import Union
+from typing import Optional

 import httpx
 import multifutures
@@ -74,7 +75,7 @@ def _retrieve_usace_data(
     end_dates: abc.Collection[pd.Timestamp],
     rate_limit: multifutures.RateLimit,
     http_client: httpx.Client,
-    executor: multifutures.ExecutorProtocol | None,
+    executor: Optional[multifutures.ExecutorProtocol] = None,
 ) -> list[multifutures.FutureResult]:
     kwargs = []
     for station_id, start_date, end_date in zip(station_ids, start_dates, end_dates):
@@ -103,10 +104,10 @@ def _fetch_usace(
     start_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
     end_dates: Union[DatetimeLike, List[DatetimeLike]] = None,
     *,
-    rate_limit: multifutures.RateLimit | None,
-    http_client: httpx.Client | None,
-    multiprocessing_executor: multifutures.ExecutorProtocol | None,
-    multithreading_executor: multifutures.ExecutorProtocol | None,
+    rate_limit: Optional[multifutures.RateLimit] = None,
+    http_client: Optional[httpx.Client] = None,
+    multiprocessing_executor: Optional[multifutures.ExecutorProtocol] = None,
+    multithreading_executor: Optional[multifutures.ExecutorProtocol] = None,
 ) -> dict[str, pd.DataFrame]:
     rate_limit = _resolve_rate_limit(rate_limit)
     http_client = _resolve_http_client(http_client)
@@ -145,13 +146,13 @@ def _fetch_usace(

 def fetch_usace_station(
     station_id: str,
-    start_date: DatetimeLike | None = None,
-    end_date: DatetimeLike | None = None,
+    start_date: Optional[DatetimeLike] = None,
+    end_date: Optional[DatetimeLike] = None,
     *,
-    rate_limit: multifutures.RateLimit | None = None,
-    http_client: httpx.Client | None = None,
-    multiprocessing_executor: multifutures.ExecutorProtocol | None = None,
-    multithreading_executor: multifutures.ExecutorProtocol | None = None,
+    rate_limit: Optional[multifutures.RateLimit] = None,
+    http_client: Optional[httpx.Client] = None,
+    multiprocessing_executor: Optional[multifutures.ExecutorProtocol] = None,
+    multithreading_executor: Optional[multifutures.ExecutorProtocol] = None,
 ) -> pd.DataFrame:
     """
     Make a query to the USACE API for river gauge data for 
``station_id``