diff --git a/.config/pre-commit-config.yaml b/.config/pre-commit-config.yaml index ba658ce..177b3e1 100644 --- a/.config/pre-commit-config.yaml +++ b/.config/pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: - id: end-of-file-fixer - id: check-ast - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.8.6 hooks: # Run the linter. - id: ruff @@ -17,7 +17,7 @@ repos: - id: ruff-format args: [--config, .config/ruff.toml] - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.4.24 + rev: 0.5.15 hooks: # Run the pip compile - id: pip-compile diff --git a/pyproject.toml b/pyproject.toml index fb6243b..a7896a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "defopt>=6.4.0", "email_validator", "hdx-python-country>=3.8.3", - "hdx-python-utilities>=3.7.4", + "hdx-python-utilities@git+ssh://git@github.com/OCHA-DAP/hdx-python-utilities@error_handler#egg=hdx-python-utilities", "libhxl>=5.2.2", "makefun", "quantulum3", @@ -95,7 +95,7 @@ run = """ """ [tool.hatch.envs.hatch-static-analysis] -dependencies = ["ruff==0.7.0"] +dependencies = ["ruff==0.8.6"] [tool.hatch.envs.hatch-static-analysis.scripts] format-check = ["ruff format --config .config/ruff.toml --check --diff {args:.}",] diff --git a/requirements.txt b/requirements.txt index fce3967..82b9158 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # uv pip compile pyproject.toml --resolver=backtracking --all-extras -o requirements.txt annotated-types==0.7.0 # via pydantic -attrs==24.2.0 +attrs==24.3.0 # via # frictionless # jsonlines @@ -10,19 +10,19 @@ attrs==24.2.0 # referencing cachetools==5.5.0 # via google-auth -certifi==2024.8.30 +certifi==2024.12.14 # via requests cfgv==3.4.0 # via pre-commit chardet==5.2.0 # via frictionless -charset-normalizer==3.4.0 +charset-normalizer==3.4.1 # via requests ckanapi==4.8 # via hdx-python-api (pyproject.toml) -click==8.1.7 +click==8.1.8 # via typer -coverage==7.6.4 +coverage==7.6.10 # via pytest-cov 
defopt==6.4.0 # via hdx-python-api (pyproject.toml) @@ -44,7 +44,7 @@ filelock==3.16.1 # via virtualenv frictionless==5.18.0 # via hdx-python-utilities -google-auth==2.36.0 +google-auth==2.37.0 # via # google-auth-oauthlib # gspread @@ -52,15 +52,15 @@ google-auth-oauthlib==1.2.1 # via gspread gspread==6.1.4 # via hdx-python-api (pyproject.toml) -hdx-python-country==3.8.3 +hdx-python-country==3.8.5 # via hdx-python-api (pyproject.toml) -hdx-python-utilities==3.7.4 +hdx-python-utilities @ git+ssh://git@github.com/OCHA-DAP/hdx-python-utilities@212e95dc21a48e5985237fc66af8ef3bb0ddbf5e#egg=hdx-python-utilities # via # hdx-python-api (pyproject.toml) # hdx-python-country humanize==4.11.0 # via frictionless -identify==2.6.1 +identify==2.6.5 # via pre-commit idna==3.10 # via @@ -68,13 +68,13 @@ idna==3.10 # requests ijson==3.3.0 # via hdx-python-utilities -inflect==7.4.0 +inflect==7.5.0 # via quantulum3 iniconfig==2.0.0 # via pytest isodate==0.7.2 # via frictionless -jinja2==3.1.4 +jinja2==3.1.5 # via frictionless jsonlines==4.0.0 # via hdx-python-utilities @@ -90,7 +90,7 @@ libhxl==5.2.2 # via # hdx-python-api (pyproject.toml) # hdx-python-country -loguru==0.7.2 +loguru==0.7.3 # via hdx-python-utilities makefun==1.15.6 # via hdx-python-api (pyproject.toml) @@ -106,13 +106,13 @@ more-itertools==10.5.0 # via inflect nodeenv==1.9.1 # via pre-commit -num2words==0.5.13 +num2words==0.5.14 # via quantulum3 oauthlib==3.2.2 # via requests-oauthlib openpyxl==3.1.5 # via hdx-python-utilities -packaging==24.1 +packaging==24.2 # via pytest petl==1.7.15 # via frictionless @@ -134,15 +134,17 @@ pyasn1==0.6.1 # rsa pyasn1-modules==0.4.1 # via google-auth -pydantic==2.9.2 +pydantic==2.10.5 # via frictionless -pydantic-core==2.23.4 +pydantic-core==2.27.2 # via pydantic -pygments==2.18.0 +pygments==2.19.1 # via rich pyphonetics==0.5.3 - # via hdx-python-country -pytest==8.3.3 + # via + # hdx-python-country + # hdx-python-utilities +pytest==8.3.4 # via # hdx-python-api (pyproject.toml) # 
pytest-cov @@ -188,17 +190,17 @@ rfc3986==2.0.0 # via frictionless rich==13.9.4 # via typer -rpds-py==0.21.0 +rpds-py==0.22.3 # via # jsonschema # referencing rsa==4.9 # via google-auth -ruamel-yaml==0.18.6 +ruamel-yaml==0.18.10 # via hdx-python-utilities ruamel-yaml-clib==0.2.12 # via ruamel-yaml -setuptools==75.3.0 +setuptools==75.8.0 # via ckanapi shellingham==1.5.4 # via typer @@ -206,7 +208,7 @@ simpleeval==1.0.3 # via frictionless simplejson==3.19.3 # via ckanapi -six==1.16.0 +six==1.17.0 # via # ckanapi # pockets @@ -228,7 +230,7 @@ text-unidecode==1.3 # via python-slugify typeguard==4.4.1 # via inflect -typer==0.13.0 +typer==0.15.1 # via frictionless typing-extensions==4.12.2 # via @@ -241,21 +243,21 @@ unidecode==1.3.8 # via # libhxl # pyphonetics -urllib3==2.2.3 +urllib3==2.3.0 # via # libhxl # requests validators==0.34.0 # via frictionless -virtualenv==20.27.1 +virtualenv==20.28.1 # via pre-commit -wheel==0.44.0 +wheel==0.45.1 # via libhxl xlrd==2.0.1 # via hdx-python-utilities xlrd3==1.1.0 # via libhxl -xlsx2csv==0.8.3 +xlsx2csv==0.8.4 # via hdx-python-utilities xlsxwriter==3.2.0 # via tableschema-to-template diff --git a/src/hdx/api/utilities/__init__.py b/src/hdx/api/utilities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/hdx/data/dataset_title_helper.py b/src/hdx/api/utilities/dataset_title_helper.py similarity index 100% rename from src/hdx/data/dataset_title_helper.py rename to src/hdx/api/utilities/dataset_title_helper.py diff --git a/src/hdx/data/date_helper.py b/src/hdx/api/utilities/date_helper.py similarity index 100% rename from src/hdx/data/date_helper.py rename to src/hdx/api/utilities/date_helper.py diff --git a/src/hdx/data/filestore_helper.py b/src/hdx/api/utilities/filestore_helper.py similarity index 100% rename from src/hdx/data/filestore_helper.py rename to src/hdx/api/utilities/filestore_helper.py diff --git a/src/hdx/api/utilities/hdx_error_handler.py b/src/hdx/api/utilities/hdx_error_handler.py new file 
mode 100644 index 0000000..b4c6e4d --- /dev/null +++ b/src/hdx/api/utilities/hdx_error_handler.py @@ -0,0 +1,208 @@ +import logging +from typing import Any, Tuple + +from hdx.data.dataset import Dataset +from hdx.data.hdxobject import HDXError +from hdx.utilities.dictandlist import dict_of_sets_add +from hdx.utilities.error_handler import ErrorHandler +from hdx.utilities.typehint import ListTuple + +logger = logging.getLogger(__name__) + + +class HDXErrorHandler(ErrorHandler): + """Class that enables recording of errors and warnings. + + Errors and warnings can be logged by calling the `output` method or + automatically logged on exit. Messages are output grouped by category and + sorted. + + Args: + should_exit_on_error (bool): Whether to exit with a 1 code if there are errors. Default is True. + + """ + + def __init__( + self, + should_exit_on_error: bool = True, + ): + super().__init__(should_exit_on_error) + self.shared_errors["hdx_error"] = {} + + @staticmethod + def get_category(pipeline: str, identifier: str) -> str: + """ + Get category from pipeline and identifier + + Args: + pipeline (str): Name of the pipeline originating the error + identifier (str): Identifier e.g. dataset name + Returns: + str: Category + """ + return f"{pipeline} - {identifier}" + + def errors_to_hdx( + self, + pipeline: str, + identifier: str, + text: str, + resource_name: str = "", + err_to_hdx: bool = False, + ) -> None: + """ + Add a new message to the hdx_error type + + Args: + pipeline (str): Name of the pipeline originating the error + identifier (str): Identifier e.g. dataset name + text (str): Text to use e.g. "sector CSS not found in table" + resource_name (str): The resource name that the message applies to. Only needed if writing errors to HDX + message_type (str): The type of message (error or warning). Default is "error" + err_to_hdx (bool): Flag indicating if the message should be added to HDX metadata. 
Default is False + Returns: + None + """ + if err_to_hdx: + category = (pipeline, identifier, resource_name) + dict_of_sets_add(self.shared_errors["hdx_error"], category, text) + + def add_message( + self, + pipeline: str, + identifier: str, + text: str, + resource_name: str = "", + message_type: str = "error", + err_to_hdx: bool = False, + ) -> None: + """ + Add a new message (typically a warning or error) to a dictionary of messages in a + fixed format: + pipeline - identifier - {text} + identifier is usually a dataset name. + Args: + pipeline (str): Name of the pipeline originating the error + identifier (str): Identifier e.g. dataset name + text (str): Text to use e.g. "sector CSS not found in table" + resource_name (str): The resource name that the message applies to. Only needed if writing errors to HDX + message_type (str): The type of message (error or warning). Default is "error" + err_to_hdx (bool): Flag indicating if the message should be added to HDX metadata. Default is False + Returns: + None + """ + self.add(text, self.get_category(pipeline, identifier), message_type) + self.errors_to_hdx( + pipeline, identifier, text, resource_name, err_to_hdx + ) + + def add_missing_value_message( + self, + pipeline: str, + identifier: str, + value_type: str, + value: Any, + resource_name: str = "", + message_type: str = "error", + err_to_hdx: bool = False, + ) -> None: + """ + Add a new message (typically a warning or error) concerning a missing value + to a dictionary of messages in a fixed format: + pipeline - identifier - {text} + identifier is usually a dataset name. + Args: + pipeline (str): Name of the pipeline originating the error + identifier (str): Identifier e.g. dataset name + value_type (str): Type of value e.g. "sector" + value (Any): Missing value + resource_name (str): The resource name that the message applies to. Only needed if writing errors to HDX + message_type (str): The type of message (error or warning).
Default is "error" + err_to_hdx (bool): Flag indicating if the message should be added to HDX metadata. Default is False + Returns: + None + """ + text = self.missing_value_message(value_type, value) + self.add_message( + pipeline, + identifier, + text, + resource_name, + message_type, + err_to_hdx, + ) + + def add_multi_valued_message( + self, + pipeline: str, + identifier: str, + text: str, + values: ListTuple, + resource_name: str = "", + message_type: str = "error", + err_to_hdx: bool = False, + ) -> bool: + """ + Add a new message (typically a warning or error) concerning a list of + values to a set of messages in a fixed format: + pipeline - identifier - n {text}. First 10 values: n1,n2,n3... + If less than 10 values, ". First 10 values" is omitted. identifier is usually + a dataset name. + Args: + pipeline (str): Name of the pipeline originating the error + identifier (str): Identifier e.g. dataset name + text (str): Text to use e.g. "negative values removed" + values (ListTuple): The list of related values of concern + resource_name (str): The resource name that the message applies to. Only needed if writing errors to HDX + message_type (str): The type of message (error or warning). Default is "error" + err_to_hdx (bool): Flag indicating if the message should be added to HDX metadata. Default is False + Returns: + bool: True if a message was added, False if not + """ + text = self.multi_valued_message(text, values) + if text is None: + return False + self.add_message( + pipeline, + identifier, + text, + resource_name, + message_type, + err_to_hdx, + ) + return True + + def output_errors(self, err_to_hdx: bool) -> None: + self.log() + if err_to_hdx: + logger.info("Writing errors to HDX") + for identifier, errors in self.shared_errors["hdx_error"].items(): + write_error_to_resource(identifier, errors) + + +def write_error_to_resource( + identifier: Tuple[str, str, str], errors: set[str] +) -> bool: + """ + Writes error messages to a resource on HDX.
If the resource already has an + error message, it is only overwritten if the two messages are different. + Args: + identifier (Tuple[str, str, str]): The scraper, dataset, and resource names that the message applies to + errors (set[str]): Set of errors to use e.g. "negative values removed" + Returns: + bool: True if a message was added, False if not + """ + # We are using the names here because errors may be specified in the YAML by us + _, dataset_name, resource_name = identifier + error_text = ", ".join(sorted(errors)) + dataset = Dataset.read_from_hdx(dataset_name) + try: + success = dataset.add_hapi_error( + error_text, resource_name=resource_name + ) + except (HDXError, AttributeError): + logger.error(f"Could not write error to {dataset_name}") + return False + if success: + logger.info(f"Wrote error message to {dataset_name}") + return success diff --git a/src/hdx/data/dataset.py b/src/hdx/data/dataset.py index aa74ce9..0960429 100755 --- a/src/hdx/data/dataset.py +++ b/src/hdx/data/dataset.py @@ -22,7 +22,6 @@ from hxl.input import InputOptions, munge_url -import hdx.data.filestore_helper as filestore_helper import hdx.data.organization as org_module import hdx.data.resource as res_module import hdx.data.resource_view as resource_view @@ -31,8 +30,9 @@ import hdx.data.vocabulary as vocabulary from hdx.api.configuration import Configuration from hdx.api.locations import Locations -from hdx.data.dataset_title_helper import DatasetTitleHelper -from hdx.data.date_helper import DateHelper +from hdx.api.utilities.dataset_title_helper import DatasetTitleHelper +from hdx.api.utilities.date_helper import DateHelper +from hdx.api.utilities.filestore_helper import FilestoreHelper from hdx.data.hdxobject import HDXError, HDXObject from hdx.data.resource_matcher import ResourceMatcher from hdx.location.country import Country @@ -530,7 +530,7 @@ def check_required_fields( "There are no resources! Please add at least one resource!" 
) for resource in self.resources: - filestore_helper.FilestoreHelper.resource_check_required_fields( + FilestoreHelper.resource_check_required_fields( resource, ignore_fields=ignore_fields ) @@ -869,7 +869,7 @@ def _dataset_update_resources( logger.warning( f"Resource exists. Updating {resource['name']}" ) - filestore_helper.FilestoreHelper.dataset_update_filestore_resource( + FilestoreHelper.dataset_update_filestore_resource( resource_data_to_update, filestore_resources, i, @@ -881,7 +881,7 @@ def _dataset_update_resources( resource_data_to_update = resources_metadata_to_update[ updated_resource_index ] - filestore_helper.FilestoreHelper.check_filestore_resource( + FilestoreHelper.check_filestore_resource( resource_data_to_update, filestore_resources, resource_index, @@ -910,13 +910,13 @@ def _dataset_update_resources( logger.warning( f"Changing resource name to: {updated_resource_name}" ) - filestore_helper.FilestoreHelper.dataset_update_filestore_resource( + FilestoreHelper.dataset_update_filestore_resource( resource_data_to_update, filestore_resources, i, ) else: - filestore_helper.FilestoreHelper.check_filestore_resource( + FilestoreHelper.check_filestore_resource( resource_data_to_update, filestore_resources, i, @@ -1133,7 +1133,7 @@ def create_in_hdx( filestore_resources = {} if self.resources: for i, resource in enumerate(self.resources): - filestore_helper.FilestoreHelper.check_filestore_resource( + FilestoreHelper.check_filestore_resource( resource, filestore_resources, i, **kwargs ) self.unseparate_resources() diff --git a/src/hdx/data/resource.py b/src/hdx/data/resource.py index 9e77d19..c8a92f4 100755 --- a/src/hdx/data/resource.py +++ b/src/hdx/data/resource.py @@ -8,9 +8,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union import hdx.data.dataset -import hdx.data.filestore_helper as filestore_helper from hdx.api.configuration import Configuration -from hdx.data.date_helper import DateHelper +from hdx.api.utilities.date_helper import 
DateHelper +from hdx.api.utilities.filestore_helper import FilestoreHelper from hdx.data.hdxobject import HDXError, HDXObject from hdx.data.resource_view import ResourceView from hdx.utilities.dateparse import now_utc, now_utc_notz, parse_date @@ -336,10 +336,7 @@ def check_url_filetoupload(self) -> None: ) else: if "url" in self.data: - if ( - self.data["url"] - != filestore_helper.FilestoreHelper.temporary_url - ): + if self.data["url"] != FilestoreHelper.temporary_url: raise HDXError( "Either a url or a file to upload must be supplied not both!" ) diff --git a/tests/hdx/data/test_update_logic.py b/tests/hdx/data/test_update_logic.py index 5812623..9feac26 100644 --- a/tests/hdx/data/test_update_logic.py +++ b/tests/hdx/data/test_update_logic.py @@ -10,7 +10,7 @@ from hdx.data.vocabulary import Vocabulary from hdx.location.country import Country from hdx.utilities.loader import load_yaml -from hdx.utilities.text import multiple_replace +from hdx.utilities.matching import multiple_replace class TestUpdateLogic: diff --git a/tests/hdx/data/test_dataset_title_helper.py b/tests/hdx/utilities/test_dataset_title_helper.py similarity index 99% rename from tests/hdx/data/test_dataset_title_helper.py rename to tests/hdx/utilities/test_dataset_title_helper.py index 8f396e3..45d936e 100755 --- a/tests/hdx/data/test_dataset_title_helper.py +++ b/tests/hdx/utilities/test_dataset_title_helper.py @@ -4,7 +4,7 @@ import pytest -from hdx.data.dataset_title_helper import DatasetTitleHelper +from hdx.api.utilities.dataset_title_helper import DatasetTitleHelper class TestDatasetTitleHelper: diff --git a/tests/hdx/data/test_filestore_helper.py b/tests/hdx/utilities/test_filestore_helper.py similarity index 94% rename from tests/hdx/data/test_filestore_helper.py rename to tests/hdx/utilities/test_filestore_helper.py index 735bb86..26bff43 100644 --- a/tests/hdx/data/test_filestore_helper.py +++ b/tests/hdx/utilities/test_filestore_helper.py @@ -1,8 +1,9 @@ import copy import re -from 
. import resource_data -from hdx.data.filestore_helper import FilestoreHelper +from tests.hdx.data import resource_data + +from hdx.api.utilities.filestore_helper import FilestoreHelper from hdx.data.resource import Resource diff --git a/tests/hdx/utilities/test_hdx_error_handler.py b/tests/hdx/utilities/test_hdx_error_handler.py new file mode 100644 index 0000000..a0fb9c9 --- /dev/null +++ b/tests/hdx/utilities/test_hdx_error_handler.py @@ -0,0 +1,58 @@ +"""Errors on exit Tests""" + +import logging + +import pytest + +from hdx.api.utilities.hdx_error_handler import HDXErrorHandler +from hdx.utilities.easy_logging import setup_logging + +setup_logging() + + +class TestHDXErrorHandler: + def test_hdx_error_handler(self, caplog): + with pytest.raises(SystemExit): + with caplog.at_level(logging.ERROR): + with HDXErrorHandler() as errors: + errors.add_message( + "pipeline1", + "dataset1", + "error message", + "resource1", + err_to_hdx=True, + ) + errors.add_missing_value_message( + "pipeline1", + "dataset1", + "field1", + 123, + "resource1", + err_to_hdx=True, + ) + errors.add_multi_valued_message( + "pipeline1", + "dataset1", + "following values changed", + [1, 2, 3, 4], + "resource1", + err_to_hdx=True, + ) + assert ( + len( + errors.shared_errors["error"][ + "pipeline1 - dataset1" + ] + ) + == 3 + ) + assert ( + len( + errors.shared_errors["hdx_error"][ + ("pipeline1", "dataset1", "resource1") + ] + ) + == 3 + ) + errors.output_errors(False) + assert "following values changed" in caplog.text