diff --git a/admin/run_environment/constraints.txt b/admin/run_environment/constraints.txt
index 24259b167..7b8608b65 100644
--- a/admin/run_environment/constraints.txt
+++ b/admin/run_environment/constraints.txt
@@ -336,7 +336,9 @@ mypy==1.14.1
 mypy-boto3-s3==1.35.93
     # via boto3-stubs
 mypy-extensions==1.0.0
-    # via mypy
+    # via
+    #   mypy
+    #   typing-inspect
 nest-asyncio==1.6.0
     # via ipykernel
 networkx==3.4.2
@@ -359,6 +361,7 @@ numpy==2.2.1
     #   matplotlib
     #   pandas
     #   pandas-stubs
+    #   pandera
     #   pydeck
     #   pyogrio
     #   rasterio
@@ -381,6 +384,7 @@ packaging==24.2
     #   geopandas
     #   ipykernel
     #   matplotlib
+    #   pandera
     #   plotly
     #   pyogrio
     #   pytest
@@ -393,10 +397,13 @@ pandas==2.2.3
     #   geopandas
     #   leafmap
     #   mapclassify
+    #   pandera
     #   streamlit
     #   streamlit-aggrid
 pandas-stubs==2.2.3.241126
     # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
+pandera==0.22.1
+    # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
 parsedatetime==2.6
     # via agate
 parso==0.8.4
@@ -467,6 +474,7 @@ pydantic==2.10.5
     # via
     #   -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
     #   dbt-semantic-interfaces
+    #   pandera
 pydantic-core==2.27.2
     # via pydantic
 pydeck==0.9.1
@@ -675,6 +683,8 @@ traittypes==0.2.1
     # via
     #   bqplot
     #   ipyleaflet
+typeguard==4.4.1
+    # via pandera
 typer==0.15.1
     # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
 types-awscrt==0.23.6
@@ -719,7 +729,11 @@ typing-extensions==4.12.2
     #   sqlalchemy
     #   sqlalchemy-stubs
     #   streamlit
+    #   typeguard
     #   typer
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via pandera
 tzdata==2024.2
     # via pandas
 urllib3==2.3.0
diff --git a/admin/run_environment/requirements.in b/admin/run_environment/requirements.in
index edfe994a7..998c92a08 100644
--- a/admin/run_environment/requirements.in
+++ b/admin/run_environment/requirements.in
@@ -27,6 +27,7 @@ openpyxl
 openpyxl-stubs
 pandas
 pandas-stubs
+pandera
 plotly
 pre-commit
 psycopg2-binary
diff --git a/admin/run_environment/requirements.txt b/admin/run_environment/requirements.txt
index 7651e5c87..ad21e17d2 100644
--- a/admin/run_environment/requirements.txt
+++ b/admin/run_environment/requirements.txt
@@ -336,7 +336,9 @@ mypy==1.14.1
 mypy-boto3-s3==1.35.93
     # via boto3-stubs
 mypy-extensions==1.0.0
-    # via mypy
+    # via
+    #   mypy
+    #   typing-inspect
 nest-asyncio==1.6.0
     # via ipykernel
 networkx==3.4.2
@@ -359,6 +361,7 @@ numpy==2.2.1
     #   matplotlib
     #   pandas
     #   pandas-stubs
+    #   pandera
     #   pydeck
     #   pyogrio
     #   rasterio
@@ -381,6 +384,7 @@ packaging==24.2
     #   geopandas
     #   ipykernel
     #   matplotlib
+    #   pandera
     #   plotly
     #   pyogrio
     #   pytest
@@ -393,10 +397,13 @@ pandas==2.2.3
     #   geopandas
     #   leafmap
     #   mapclassify
+    #   pandera
     #   streamlit
     #   streamlit-aggrid
 pandas-stubs==2.2.3.241126
     # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
+pandera==0.22.1
+    # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
 parsedatetime==2.6
     # via agate
 parso==0.8.4
@@ -467,6 +474,7 @@ pydantic==2.10.5
     # via
     #   -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
     #   dbt-semantic-interfaces
+    #   pandera
 pydantic-core==2.27.2
     # via pydantic
 pydeck==0.9.1
@@ -675,6 +683,8 @@ traittypes==0.2.1
     # via
     #   bqplot
     #   ipyleaflet
+typeguard==4.4.1
+    # via pandera
 typer==0.15.1
     # via -r /__w/data-engineering/data-engineering/admin/ops/../run_environment/requirements.in
 types-awscrt==0.23.6
@@ -719,7 +729,11 @@ typing-extensions==4.12.2
     #   sqlalchemy
     #   sqlalchemy-stubs
     #   streamlit
+    #   typeguard
     #   typer
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via pandera
 tzdata==2024.2
     # via pandas
 urllib3==2.3.0
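The pandera pin above pulls typeguard and typing-inspect into both lockfiles as transitive dependencies. A minimal smoke test of the resolved environment (a sketch, not part of the patch; the version literal simply mirrors the pin):

import pandera
import typeguard  # noqa: F401  # transitive dependency of pandera
import typing_inspect  # noqa: F401  # transitive dependency of pandera

assert pandera.__version__ == "0.22.1"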
diff --git a/dcpy/connectors/socrata/publish.py b/dcpy/connectors/socrata/publish.py
index cbf768464..7b84e4356 100644
--- a/dcpy/connectors/socrata/publish.py
+++ b/dcpy/connectors/socrata/publish.py
@@ -23,6 +23,7 @@ from dcpy.utils.logging import logger
 
 import dcpy.models.product.dataset.metadata as md
+import dcpy.models.dataset as dataset
 from .utils import SOCRATA_USER, SOCRATA_PASSWORD, _socrata_request
 
 SOCRATA_REVISION_APPLY_TIMEOUT_SECS = 10 * 60  # Ten Mins
@@ -176,7 +177,9 @@ def __init__(self, col: md.DatasetColumn):
         self.display_name = col.name
         self.description = col.description
         self.is_primary_key = (
-            bool(col.checks.is_primary_key) if col.checks else False
+            bool(col.checks.is_primary_key)
+            if isinstance(col.checks, dataset.Checks)
+            else False
         )
 
 class Attachment(TypedDict):
@@ -298,7 +301,9 @@ def calculate_pushed_col_metadata(self, our_columns: list[md.DatasetColumn]):
             new_col["initial_output_column_id"] = new_col["id"]
 
             new_col["is_primary_key"] = (
-                True if (our_col.checks and our_col.checks.is_primary_key) else False
+                bool(our_col.checks.is_primary_key)
+                if isinstance(our_col.checks, dataset.Checks)
+                else False
             )
 
             new_col["display_name"] = our_col.name
diff --git a/dcpy/lifecycle/package/validate.py b/dcpy/lifecycle/package/validate.py
index e5f83391a..aa85009ac 100644
--- a/dcpy/lifecycle/package/validate.py
+++ b/dcpy/lifecycle/package/validate.py
@@ -218,6 +218,8 @@ def validate_df(
                 )
             )
 
+        if isinstance(col.checks, list):  # TODO: delete after refactoring
+            raise NotImplementedError("col.checks must use the old dataset.Checks format to run checks")
         # Check Nulls
         if col.checks and col.checks.non_nullable:
             if not df_only_col_nulls.empty:
diff --git a/dcpy/lifecycle/validate/__init__.py b/dcpy/lifecycle/validate/__init__.py
new file mode 100644
index 000000000..8ff3aa2e9
--- /dev/null
+++ b/dcpy/lifecycle/validate/__init__.py
@@ -0,0 +1,3 @@
+from . import pandera_custom_checks
+
+# from .data import run
diff --git a/dcpy/lifecycle/validate/data.py b/dcpy/lifecycle/validate/data.py
new file mode 100644
index 000000000..2852ebe04
--- /dev/null
+++ b/dcpy/lifecycle/validate/data.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+
+def run(
+    dataset_id: str,
+    input_path: Path,
+):
+    # TODO: read in data from input_path to pandas dataframe
+
+    # TODO: get dataset template
+
+    # TODO: run data checks
+
+    raise NotImplementedError
diff --git a/dcpy/lifecycle/validate/pandera_custom_checks.py b/dcpy/lifecycle/validate/pandera_custom_checks.py
new file mode 100644
index 000000000..22996b72c
--- /dev/null
+++ b/dcpy/lifecycle/validate/pandera_custom_checks.py
@@ -0,0 +1,9 @@
+from pandera import extensions
+
+
+@extensions.register_check_method(check_type="element_wise")
+def is_geom_point(s):
+    try:
+        return s.geom_type == "Point"
+    except ValueError:
+        return False
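A usage sketch for the custom check registered above (not part of the patch; assumes geopandas is installed and that `dcpy.lifecycle.validate` has been imported, which triggers the registration via its `__init__`):

import geopandas as gpd
import pandera as pa

import dcpy.lifecycle.validate  # noqa: F401  # registers is_geom_point with pandera

points = gpd.GeoSeries.from_wkt(["POINT (0 0)", "POINT (1 2)"])
# register_check_method exposes the check by name on pa.Check:
pa.SeriesSchema(checks=pa.Check.is_geom_point()).validate(points)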
diff --git a/dcpy/lifecycle/validate/pandera_utils.py b/dcpy/lifecycle/validate/pandera_utils.py
new file mode 100644
index 000000000..c8e1c6766
--- /dev/null
+++ b/dcpy/lifecycle/validate/pandera_utils.py
@@ -0,0 +1,132 @@
+import pandera as pa
+import pandas as pd
+import geopandas as gpd
+from inspect import (
+    signature,
+)  # used for checking expected attributes in a class signature
+
+from dcpy.models.dataset import Column, CheckAttributes, Checks
+
+
+def create_check(check: str | dict[str, CheckAttributes]) -> pa.Check:
+    """
+    Creates a Pandera `Check` object from a given check definition.
+
+    Args:
+        check:
+            A string representing the name of the check or a dictionary with the
+            check name as the key and check attributes as the value.
+    Returns:
+        pa.Check:
+            A Pandera `Check` object constructed with the specified parameters.
+    Raises:
+        AssertionError:
+            If the `check` dictionary does not contain exactly one key-value pair.
+        ValueError:
+            If the check name is not registered or if attributes cannot be parsed
+            or used to create a valid `Check`.
+    """
+    allowed_check_names = {
+        **pa.Check.CHECK_FUNCTION_REGISTRY,
+        **pa.Check.REGISTERED_CUSTOM_CHECKS,
+    }
+
+    if isinstance(check, str):
+        check_name = check
+        check_args = None
+    elif isinstance(check, dict):
+        assert len(check) == 1, (
+            "`pandera_utils.create_check` expects exactly 1 key-value pair in `check` param."
+        )
+        check_name, check_args = next(iter(check.items()))
+
+    if check_name not in allowed_check_names:
+        raise ValueError(f"Unregistered check name: '{check_name}'.")
+
+    # Retrieve constructor for the specified check name from pandera.
+    # The constructor requires check-specific parameters and also accepts **kwargs
+    # for generic parameters shared across all Check objects like "description" attribute
+    check_constructor = getattr(pa.Check, check_name)
+
+    if check_args:
+        check_expected_params = signature(check_constructor).parameters
+        invalid_check_keys = set(check_args.args.keys()) - set(
+            check_expected_params.keys()
+        )
+        if invalid_check_keys:
+            raise ValueError(
+                f"Invalid argument keys found for check '{check_name}': {invalid_check_keys}. "
+                f"Valid argument keys are: {sorted(check_expected_params.keys())}."
+            )
+
+    try:
+        check_obj = (
+            check_constructor(
+                **check_args.args,
+                raise_warning=check_args.warn_only,
+                description=check_args.description,
+                name=check_args.name,
+                title=check_args.title,
+                n_failure_cases=check_args.n_failure_cases,
+                groups=check_args.groups,
+                groupby=check_args.groupby,
+                ignore_na=check_args.ignore_na,
+            )
+            if check_args
+            else check_constructor()
+        )
+    except Exception as e:
+        raise ValueError(
+            f"Check '{check_name}' couldn't be created. Error message: {e}"
+        )
+
+    return check_obj
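# Usage sketch for create_check (annotation, not part of the patch; values are
# hypothetical and mirror the YAML fixtures later in this diff):
import pandera as pa
from dcpy.models.dataset import CheckAttributes
from dcpy.lifecycle.validate.pandera_utils import create_check

# Dict form: {check name: CheckAttributes}; `args` carries the check-specific
# parameters, while the remaining fields map onto generic pa.Check kwargs
# (e.g. warn_only -> raise_warning).
check = create_check(
    {"in_range": CheckAttributes(args={"min_value": 0, "max_value": 10}, warn_only=True)}
)
assert check == pa.Check.in_range(min_value=0, max_value=10, raise_warning=True)
# The string form suits checks that take no arguments, e.g. the custom
# "is_geom_point" registered above.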
+
+
+def create_checks(checks: list[str | dict[str, CheckAttributes]]) -> list[pa.Check]:
+    """Create Pandera checks."""
+    pandera_checks = [create_check(check) for check in checks]
+    return pandera_checks
+
+
+def create_column_with_checks(column: Column) -> pa.Column:
+    """Create Pandera column validator object."""
+    if isinstance(column.checks, Checks):
+        raise NotImplementedError(
+            "Pandera checks are not implemented for old Column.checks format"
+        )
+    data_checks = create_checks(column.checks) if column.checks else None
+    return pa.Column(
+        # TODO: implement `dtype` param
+        coerce=True,  # coerce column to defined data type. This decision is up for debate
+        checks=data_checks,
+        required=column.is_required,
+        description=column.description,
+        nullable=True,  # TODO: temp solution. Need to figure out what to do with this (equivalent to can be null)
+    )
+
+
+def run_data_checks(
+    df: pd.DataFrame | gpd.GeoDataFrame, columns: list[Column]
+) -> pd.DataFrame | gpd.GeoDataFrame:
+    """
+    Validate a DataFrame or GeoDataFrame against a schema defined by a list of columns with Pandera.
+
+    Args:
+        df (pd.DataFrame | gpd.GeoDataFrame): The input DataFrame to validate.
+        columns (list[Column]): List of column definitions specifying validation rules.
+
+    Raises:
+        AssertionError: If column names in `columns` are not unique.
+    """
+
+    column_names = [column.id for column in columns]
+    assert len(column_names) == len(set(column_names)), (
+        "Columns should have unique names"
+    )
+
+    dataframe_checks = {}
+    for column in columns:
+        dataframe_checks[column.id] = create_column_with_checks(column)
+
+    return pa.DataFrameSchema(dataframe_checks).validate(df)
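An end-to-end sketch of `run_data_checks` with the new `Column.checks` format (hypothetical column and data; the plain dict is coerced to `CheckAttributes` by pydantic):

import pandas as pd

from dcpy.models.dataset import Column
from dcpy.lifecycle.validate import pandera_utils

columns = [
    Column(id="custom_value", checks=[{"greater_than": {"args": {"min_value": 0}}}])
]
df = pd.DataFrame({"custom_value": [1, 2, 3]})
# Returns the (coerced) dataframe; raises pa.errors.SchemaError on failure.
validated = pandera_utils.run_data_checks(df=df, columns=columns)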
+ """ + + args: dict[str, Any] + description: str | None = None + warn_only: bool = False + name: str | None = None + title: str | None = None + n_failure_cases: int | None = None + groups: str | list[str] | None = None + groupby: str | list[str] | Callable | None = None + ignore_na: bool = True + + class Column(SortedSerializedBase, extra="forbid"): """ An extensible base class for defining column metadata in ingest and product templates. @@ -29,4 +53,15 @@ class Column(SortedSerializedBase, extra="forbid"): data_type: COLUMN_TYPES | None = None description: str | None = None is_required: bool = True - checks: Checks | None = None + checks: Checks | list[str | dict[str, CheckAttributes]] | None = ( + None # TODO: delete Checks after refactoring metadata + ) + + @field_validator("checks", mode="after") + @classmethod + def check_checks(cls, checks: list[str | dict[str, CheckAttributes]]): + if isinstance(checks, list): + for check in checks: + if isinstance(check, dict) and len(check) != 1: + raise ValueError(f"{check} must contain exactly one key-value pair") + return checks diff --git a/dcpy/models/lifecycle/ingest.py b/dcpy/models/lifecycle/ingest.py index 0f4a84dc2..a054eed8d 100644 --- a/dcpy/models/lifecycle/ingest.py +++ b/dcpy/models/lifecycle/ingest.py @@ -93,6 +93,7 @@ class Template(BaseModel, extra="forbid"): attributes: DatasetAttributes ingestion: Ingestion columns: list[Column] = [] + checks: list[str | dict[str, Any]] | None = None @property def has_geom(self): diff --git a/dcpy/test/lifecycle/package/test_column_validation.py b/dcpy/test/lifecycle/package/test_column_validation.py index d67fa8428..871baad3d 100644 --- a/dcpy/test/lifecycle/package/test_column_validation.py +++ b/dcpy/test/lifecycle/package/test_column_validation.py @@ -8,6 +8,7 @@ from dcpy.test.lifecycle.package.conftest import TEST_METADATA_YAML_PATH import dcpy.models.product.dataset.metadata as md +import dcpy.models.dataset as dataset from dcpy.lifecycle.package import validate rd = random.Random() @@ -82,12 +83,13 @@ def _fake_row(columns: list[md.DatasetColumn]): ) for c in columns: - if c.checks and not c.checks.non_nullable and random.choice([True, False]): - # adding some extra chaos - if random.choice([True, False]): - del row[c.name] - else: - row[c.name] = "" + if isinstance(c.checks, dataset.Checks): + if not c.checks.non_nullable and random.choice([True, False]): + # adding some extra chaos + if random.choice([True, False]): + del row[c.name] + else: + row[c.name] = "" return row diff --git a/dcpy/test/lifecycle/validate/__init__.py b/dcpy/test/lifecycle/validate/__init__.py new file mode 100644 index 000000000..325b569b2 --- /dev/null +++ b/dcpy/test/lifecycle/validate/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +RESOURCES = Path(__file__).parent / "resources" diff --git a/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml b/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml new file mode 100644 index 000000000..b454764b6 --- /dev/null +++ b/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml @@ -0,0 +1,9 @@ +- unregistered_func_with_args: + args: + min_value: 100 +- unregistered_func_without_args +- greater_than # missing args +- greater_than: + args: + min_value: 5 + unexpected_key: value diff --git a/dcpy/test/lifecycle/validate/resources/valid_columns_with_checks.yml b/dcpy/test/lifecycle/validate/resources/valid_columns_with_checks.yml new file mode 100644 index 000000000..4cb5e7602 --- /dev/null +++ 
diff --git a/dcpy/models/lifecycle/ingest.py b/dcpy/models/lifecycle/ingest.py
index 0f4a84dc2..a054eed8d 100644
--- a/dcpy/models/lifecycle/ingest.py
+++ b/dcpy/models/lifecycle/ingest.py
@@ -93,6 +93,7 @@ class Template(BaseModel, extra="forbid"):
     attributes: DatasetAttributes
     ingestion: Ingestion
     columns: list[Column] = []
+    checks: list[str | dict[str, Any]] | None = None
 
     @property
     def has_geom(self):
diff --git a/dcpy/test/lifecycle/package/test_column_validation.py b/dcpy/test/lifecycle/package/test_column_validation.py
index d67fa8428..871baad3d 100644
--- a/dcpy/test/lifecycle/package/test_column_validation.py
+++ b/dcpy/test/lifecycle/package/test_column_validation.py
@@ -8,6 +8,7 @@ from dcpy.test.lifecycle.package.conftest import TEST_METADATA_YAML_PATH
 
 import dcpy.models.product.dataset.metadata as md
+import dcpy.models.dataset as dataset
 from dcpy.lifecycle.package import validate
 
 rd = random.Random()
@@ -82,12 +83,13 @@ def _fake_row(columns: list[md.DatasetColumn]):
     )
 
     for c in columns:
-        if c.checks and not c.checks.non_nullable and random.choice([True, False]):
-            # adding some extra chaos
-            if random.choice([True, False]):
-                del row[c.name]
-            else:
-                row[c.name] = ""
+        if isinstance(c.checks, dataset.Checks):
+            if not c.checks.non_nullable and random.choice([True, False]):
+                # adding some extra chaos
+                if random.choice([True, False]):
+                    del row[c.name]
+                else:
+                    row[c.name] = ""
 
     return row
diff --git a/dcpy/test/lifecycle/validate/__init__.py b/dcpy/test/lifecycle/validate/__init__.py
new file mode 100644
index 000000000..325b569b2
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/__init__.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+RESOURCES = Path(__file__).parent / "resources"
diff --git a/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml b/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml
new file mode 100644
index 000000000..b454764b6
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/resources/invalid_data_checks.yml
@@ -0,0 +1,9 @@
+- unregistered_func_with_args:
+    args:
+      min_value: 100
+- unregistered_func_without_args
+- greater_than # missing args
+- greater_than:
+    args:
+      min_value: 5
+      unexpected_key: value
diff --git a/dcpy/test/lifecycle/validate/resources/valid_columns_with_checks.yml b/dcpy/test/lifecycle/validate/resources/valid_columns_with_checks.yml
new file mode 100644
index 000000000..4cb5e7602
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/resources/valid_columns_with_checks.yml
@@ -0,0 +1,22 @@
+- id: bbl
+  data_type: text
+  is_required: True
+  checks:
+    - str_contains:
+        args:
+          pattern: "75"
+        warn_only: False
+
+- id: custom_value
+  data_type: integer
+  is_required: False
+  checks:
+    - greater_than:
+        description: This column must be greater than zero
+        args:
+          min_value: 0
+    - unique_values_eq:
+        description: This column must contain at least these values.
+        warn_only: True
+        args:
+          values: [5, 10]
diff --git a/dcpy/test/lifecycle/validate/resources/valid_data_checks.yml b/dcpy/test/lifecycle/validate/resources/valid_data_checks.yml
new file mode 100644
index 000000000..5c52b967b
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/resources/valid_data_checks.yml
@@ -0,0 +1,28 @@
+- in_range:
+    description: My custom check description.
+    args:
+      min_value: 100
+      max_value: 200
+    warn_only: true
+
+- greater_than:
+    args:
+      min_value: 5
+    warn_only: false
+
+- greater_than:
+    args:
+      min_value: abc # adding this for visibility. Pandera sees this as a valid check
+    warn_only: false
+
+# case for default properties
+- greater_than:
+    args:
+      min_value: 1
+    warn_only: true
+    name: greater than
+    title: My greater than check
+    n_failure_cases: 1
+    ignore_na: false
+    groups: col_a
+    groupby: ["col_a", "col_b"]
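For context on `warn_only` in these fixtures: it maps to pandera's `raise_warning` flag, which downgrades a failed check from a `SchemaError` to a warning. A sketch with hypothetical data:

import warnings

import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema(
    {"col_a": pa.Column(checks=pa.Check.greater_than(min_value=5, raise_warning=True))}
)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    schema.validate(pd.DataFrame({"col_a": [1, 10]}))  # does not raise
assert caught, "the failed check surfaces as a warning instead"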
diff --git a/dcpy/test/lifecycle/validate/test_pandera_custom_checks.py b/dcpy/test/lifecycle/validate/test_pandera_custom_checks.py
new file mode 100644
index 000000000..dcf804a2c
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/test_pandera_custom_checks.py
@@ -0,0 +1,42 @@
+import pytest
+import pandera as pa
+import geopandas as gpd
+
+from dcpy.models.dataset import Column
+from dcpy.lifecycle.validate import pandera_utils
+
+
+def test_is_geom_point_valid_points():
+    gdf = gpd.GeoDataFrame(
+        {
+            "geometry": gpd.GeoSeries.from_wkt(
+                [
+                    "POINT (1 1)",
+                    "POINT (2 3)",
+                ]
+            )
+        }
+    )
+    pandera_utils.run_data_checks(
+        df=gdf,
+        columns=[Column(id="geometry", checks=["is_geom_point"])],
+    )
+
+
+def test_is_geom_point_invalid_geoms():
+    gdf = gpd.GeoDataFrame(
+        {
+            "geometry": gpd.GeoSeries.from_wkt(
+                [
+                    "POINT (1 1)",
+                    "LINESTRING (0 0, 1 1)",
+                    "POLYGON ((0 0, 1 1, 1 0, 0 0))",
+                ]
+            )
+        }
+    )
+    with pytest.raises(pa.errors.SchemaError):
+        pandera_utils.run_data_checks(
+            df=gdf,
+            columns=[Column(id="geometry", checks=["is_geom_point"])],
+        )
diff --git a/dcpy/test/lifecycle/validate/test_pandera_utils.py b/dcpy/test/lifecycle/validate/test_pandera_utils.py
new file mode 100644
index 000000000..3fbd49548
--- /dev/null
+++ b/dcpy/test/lifecycle/validate/test_pandera_utils.py
@@ -0,0 +1,197 @@
+import pytest
+import pandera as pa
+import pandas as pd
+import yaml
+from pydantic import TypeAdapter
+from inspect import (
+    signature,
+)  # used for checking expected attributes in a class signature
+
+from dcpy.models.dataset import Column, CheckAttributes
+from dcpy.lifecycle.validate import pandera_utils
+
+from . import RESOURCES
+
+
+def get_valid_checks():
+    """A list of test models.dataset.Column.checks objects."""
+    with open(RESOURCES / "valid_data_checks.yml") as f:
+        data_checks = TypeAdapter(
+            list[str | dict[str, CheckAttributes]]
+        ).validate_python(yaml.safe_load(f))
+    return data_checks
+
+
+def get_invalid_checks():
+    """A list of test models.dataset.Column.checks objects."""
+    with open(RESOURCES / "invalid_data_checks.yml") as f:
+        data_checks = TypeAdapter(
+            list[str | dict[str, CheckAttributes]]
+        ).validate_python(yaml.safe_load(f))
+    return data_checks
+
+
+valid_data_checks = get_valid_checks()
+invalid_data_checks = get_invalid_checks()
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_pa_check",
+    [
+        (
+            valid_data_checks[0],
+            pa.Check.in_range(
+                min_value=100,
+                max_value=200,
+                raise_warning=True,
+                description="My custom check description.",
+            ),
+        ),
+        (
+            valid_data_checks[1],
+            pa.Check.greater_than(min_value=5, raise_warning=False),
+        ),
+        (
+            valid_data_checks[2],
+            pa.Check.greater_than(min_value="abc", raise_warning=False),
+        ),
+        (
+            valid_data_checks[3],
+            pa.Check.greater_than(
+                min_value=1,
+                raise_warning=True,
+                name="greater than",
+                title="My greater than check",
+                n_failure_cases=1,
+                ignore_na=False,
+                groups="col_a",
+                groupby=["col_a", "col_b"],
+            ),
+        ),
+        # TODO: add custom registered check
+    ],
+)
+def test_create_check_success(input_data, expected_pa_check):
+    actual_check = pandera_utils.create_check(input_data)
+    assert actual_check == expected_pa_check
+
+
+@pytest.mark.parametrize(
+    "input_data, error_message_substring",
+    [
+        (
+            invalid_data_checks[0],
+            "Unregistered check name",
+        ),
+        (
+            invalid_data_checks[1],
+            "Unregistered check name",
+        ),
+        (
+            invalid_data_checks[2],
+            "couldn't be created",
+        ),
+        (
+            invalid_data_checks[3],
+            "Invalid argument keys found for check",
+        ),
+    ],
+)
+def test_create_check_failure(input_data, error_message_substring):
+    with pytest.raises(ValueError, match=error_message_substring):
+        pandera_utils.create_check(input_data)
+
+
+def test_check_attributes_consistency_with_pa_check():
+    """
+    Ensures that all attributes in CheckAttributes.model_fields (after mapping
+    "args" and "warn_only" to pandera's parameter names) are valid parameters
+    for the pa.Check class.
+
+    This test is necessary because feeding invalid attributes to pa.Check() does
+    not raise an error, which could lead to silent failures. It is particularly
+    important for long-term stability, as Pandera's API may change before it
+    reaches version 1.0, including potential renaming of pa.Check parameters.
+ """ + check_attributes = pandera_utils.CheckAttributes.model_fields + + # rename check_attributes keys to match with pandera keys + check_attributes["check_kwargs"] = check_attributes.pop("args") + check_attributes["raise_warning"] = check_attributes.pop("warn_only") + + check_expected_params = signature(pa.Check).parameters + + invalid_check_keys = set(check_attributes.keys()) - set( + check_expected_params.keys() + ) + assert len(invalid_check_keys) == 0 + + +def get_valid_columns(): + """A list of test models.dataset.Column objects.""" + with open(RESOURCES / "valid_columns_with_checks.yml") as f: + columns = TypeAdapter(list[Column]).validate_python(yaml.safe_load(f)) + return columns + + +@pytest.mark.parametrize( + "test_df", + [ + pd.DataFrame( + { + "bbl": ["1000157502", "1000157501", None], + "custom_value": [1, 5, 10], + } + ), + pd.DataFrame( + { + "bbl": ["1000157502", "1000157501", None], + "custom_value": [1, 5, 10], + "extra_column": ["a", "b", None], + } + ), # df with a column that doesn't have data checks + pd.DataFrame( + { + "bbl": ["1000157502", "1000157501", None, None], + "custom_value": [None, 1, 5, 10], + } + ), # df with warning only data check + ], +) +def test_run_data_checks_success(test_df): + """ + Test the `run_data_checks` function for passing, warning, and expected failing scenarios. + + Verifies: + - Valid data passes checks. + - Data with extra columns passes checks. + - Data with warnings still passes checks. + """ + columns = get_valid_columns() + pandera_utils.run_data_checks(df=test_df, columns=columns) + + +def test_run_data_checks_fail(): + """Test that data fails data checks as expected.""" + columns = get_valid_columns() + data_checks_fail = pd.DataFrame( + {"bbl": ["1000150002", "1000157501"], "custom_value": [0, 1]} + ) + with pytest.raises( + pa.errors.SchemaError, match="failed element-wise validator number 0" + ): + pandera_utils.run_data_checks(df=data_checks_fail, columns=columns) + + +def test_run_data_checks_duplicate_columns_error(): + """Test the `run_data_checks` function for handling duplicate column names.""" + + columns = get_valid_columns() + df = pd.DataFrame( + {"bbl": ["1000157502", "1000157501", None], "custom_value": [1, 5, 10]} + ) + # create a fail test case with same column names + for col in columns: + col.id = "duplicate_name" + + with pytest.raises(AssertionError, match="Columns should have unique names"): + pandera_utils.run_data_checks(df=df, columns=columns)