From 14759b843942292b9775239d513b457a04a5c566 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Mon, 2 Dec 2024 18:55:06 -0500 Subject: [PATCH] Add FERC1 validation tests (#3860) * Load dagster assets for FERC1 validation tests This lets us test things that aren't in `PudlTabl`. * Improve performance by caching the asset loader. * Add minmax tests for all FERC1 assets. * Update release notes * move release notes docs --------- Co-authored-by: Christina Gosnell --- docs/release_notes.rst | 2 + test/conftest.py | 13 +++++ test/validate/ferc1_test.py | 112 ++++++++++++++++++++++-------------- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index f1e6585a65..ea5857714b 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -22,6 +22,8 @@ Bug Fixes :ref:`core_ferc1__yearly_other_regulatory_liabilities_sched278`. See issue :issue:`3952` and PRs :pr:`3969,3979`. Thanks to :user:`yolandazzz13` for making this fix. +* Added preliminary data validation checks for several FERC 1 tables that were + missing them. See PR :pr:`3860`. Major Dependency Updates ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/test/conftest.py b/test/conftest.py index ef23a27a3e..55994116db 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -11,6 +11,7 @@ import pytest import sqlalchemy as sa from dagster import ( + AssetValueLoader, build_init_resource_context, graph, materialize_to_memory, @@ -18,6 +19,7 @@ import pudl from pudl import resources +from pudl.etl import defs from pudl.etl.cli import pudl_etl_job_factory from pudl.extract.ferc1 import Ferc1DbfExtractor, raw_ferc1_xbrl__metadata_json from pudl.extract.ferc714 import raw_ferc714_xbrl__metadata_json @@ -105,6 +107,17 @@ def live_databases(request) -> bool: return request.config.getoption("--live-dbs") +@pytest.fixture(scope="session") +def asset_value_loader() -> AssetValueLoader: + """Fixture that initializes an asset value loader. 
+ + Use this as ``asset_value_loader.load_asset_value`` instead + of ``defs.load_asset_value`` to not reinitialize the asset + value loader over and over again. + """ + return defs.get_asset_value_loader() + + @pytest.fixture(scope="session", name="save_unmapped_ids") def save_unmapped_ids(request) -> bool: """Fixture that tells whether to use existing live FERC1/PUDL DBs).""" diff --git a/test/validate/ferc1_test.py b/test/validate/ferc1_test.py index 34e43fd67c..92991ece17 100644 --- a/test/validate/ferc1_test.py +++ b/test/validate/ferc1_test.py @@ -50,9 +50,9 @@ @pytest.mark.parametrize("table_name", unique_record_tables) -def test_record_id_dupes(pudl_engine, table_name): +def test_record_id_dupes(table_name, asset_value_loader): """Verify that the generated ferc1 record_ids are unique.""" - table = pd.read_sql(table_name, pudl_engine) + table = asset_value_loader.load_asset_value(table_name) n_dupes = table.record_id.duplicated().to_numpy().sum() if n_dupes: @@ -63,76 +63,102 @@ def test_record_id_dupes(pudl_engine, table_name): @pytest.mark.parametrize( - "df_name,cols", + "asset_key,cols", [ - ("fbp_ferc1", "all"), - ("fuel_ferc1", "all"), - ("plant_in_service_ferc1", "all"), - ("plants_all_ferc1", "all"), - ("plants_hydro_ferc1", "all"), - ("plants_pumped_storage_ferc1", "all"), - ("plants_small_ferc1", "all"), - ("plants_steam_ferc1", "all"), - ("pu_ferc1", "all"), - ("purchased_power_ferc1", "all"), + ("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", "all"), + ("out_ferc1__yearly_steam_plants_fuel_sched402", "all"), + ("out_ferc1__yearly_plant_in_service_sched204", "all"), + ("out_ferc1__yearly_all_plants", "all"), + ("out_ferc1__yearly_hydroelectric_plants_sched406", "all"), + ("out_ferc1__yearly_pumped_storage_plants_sched408", "all"), + ("out_ferc1__yearly_small_plants_sched410", "all"), + ("out_ferc1__yearly_steam_plants_sched402", "all"), + ("_out_ferc1__yearly_plants_utilities", "all"), + 
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", "all"), ], ) -def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name): +def test_no_null_cols_ferc1(live_dbs, asset_value_loader, cols, asset_key): """Verify that output DataFrames have no entirely NULL columns.""" if not live_dbs: pytest.skip("Data validation only works with a live PUDL DB.") pv.no_null_cols( - pudl_out_ferc1.__getattribute__(df_name)(), cols=cols, df_name=df_name + asset_value_loader.load_asset_value(asset_key), cols=cols, df_name=asset_key ) @pytest.mark.parametrize( - "df_name,expected_rows", + "asset_key,expected_rows", [ - ("fbp_ferc1", 26_947), - ("fuel_ferc1", 51_238), - ("plant_in_service_ferc1", 355_918), - ("plants_all_ferc1", 58_520), - ("plants_hydro_ferc1", 7_202), - ("plants_pumped_storage_ferc1", 580), - ("plants_small_ferc1", 17_763), - ("plants_steam_ferc1", 32_975), - ("pu_ferc1", 7_887), - ("purchased_power_ferc1", 211_794), + ("_out_ferc1__yearly_plants_utilities", 7_887), + ("out_ferc1__yearly_all_plants", 58_520), + ("out_ferc1__yearly_balance_sheet_assets_sched110", 278_789), + ("out_ferc1__yearly_balance_sheet_liabilities_sched110", 233_383), + ("out_ferc1__yearly_cash_flows_sched120", 306_837), + ("out_ferc1__yearly_depreciation_by_function_sched219", 148_352), + ("out_ferc1__yearly_depreciation_changes_sched219", 263_942), + ("out_ferc1__yearly_depreciation_summary_sched336", 216_710), + ("out_ferc1__yearly_energy_dispositions_sched401", 25_954), + ("out_ferc1__yearly_energy_sources_sched401", 38_315), + ("out_ferc1__yearly_hydroelectric_plants_sched406", 7_202), + ("out_ferc1__yearly_income_statements_sched114", 347_394), + ("out_ferc1__yearly_operating_expenses_sched320", 618_518), + ("out_ferc1__yearly_operating_revenues_sched300", 77_646), + ("out_ferc1__yearly_other_regulatory_liabilities_sched278", 53015), + ("out_ferc1__yearly_plant_in_service_sched204", 355_918), + ("out_ferc1__yearly_pumped_storage_plants_sched408", 580), + 
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", 211_794), + ("out_ferc1__yearly_retained_earnings_sched118", 105_585), + ("out_ferc1__yearly_sales_by_rate_schedules_sched304", 303_909), + ("out_ferc1__yearly_small_plants_sched410", 17763), + ("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", 26_947), + ("out_ferc1__yearly_steam_plants_fuel_sched402", 51_238), + ("out_ferc1__yearly_steam_plants_sched402", 32_975), + ("out_ferc1__yearly_transmission_lines_sched422", 640_619), + ("out_ferc1__yearly_utility_plant_summary_sched200", 198_769), + ("out_ferc1__yearly_small_plants_sched410", 17_763), ], ) -def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name): +def test_minmax_rows(live_dbs, asset_value_loader, expected_rows, asset_key): """Verify that output DataFrames don't have too many or too few rows. Args: - pudl_out_ferc1: A PudlTabl output object. live_dbs: Boolean (wether we're using a live or testing DB). expected_rows (int): Expected number of rows that the dataframe should contain when all data is loaded and is output without aggregation. - df_name (str): Shorthand name identifying the dataframe, corresponding - to the name of the function used to pull it from the PudlTabl - output object. + asset_key (str): The name of the asset. 
""" if not live_dbs: pytest.skip("Data validation only works with a live PUDL DB.") _ = ( - pudl_out_ferc1.__getattribute__(df_name)() + asset_value_loader.load_asset_value(asset_key) .pipe( - pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name + pv.check_min_rows, + expected_rows=expected_rows, + margin=0.0, + df_name=asset_key, ) .pipe( - pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name + pv.check_max_rows, + expected_rows=expected_rows, + margin=0.0, + df_name=asset_key, ) ) @pytest.mark.parametrize( - "df_name,unique_subset", + "asset_key,unique_subset", [ - ("pu_ferc1", ["utility_id_ferc1", "plant_name_ferc1"]), - ("fbp_ferc1", ["report_year", "utility_id_ferc1", "plant_name_ferc1"]), ( - "plants_hydro_ferc1", + "_out_ferc1__yearly_plants_utilities", + ["utility_id_ferc1", "plant_name_ferc1"], + ), + ( + "out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", + ["report_year", "utility_id_ferc1", "plant_name_ferc1"], + ), + ( + "out_ferc1__yearly_hydroelectric_plants_sched406", [ "report_year", "utility_id_ferc1", @@ -141,7 +167,7 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name): ], ), ( - "plants_pumped_storage_ferc1", + "out_ferc1__yearly_pumped_storage_plants_sched408", [ "report_year", "utility_id_ferc1", @@ -150,19 +176,19 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name): ], ), ( - "plant_in_service_ferc1", + "out_ferc1__yearly_plant_in_service_sched204", ["report_year", "utility_id_ferc1", "ferc_account_label"], ), ], ) -def test_unique_rows_ferc1(pudl_out_ferc1, live_dbs, df_name, unique_subset): +def test_unique_rows_ferc1(live_dbs, asset_value_loader, asset_key, unique_subset): """Test whether dataframe has unique records within a subset of columns.""" if not live_dbs: pytest.skip("Data validation only works with a live PUDL DB.") pv.check_unique_rows( - pudl_out_ferc1.__getattribute__(df_name)(), + asset_value_loader.load_asset_value(asset_key), 
subset=unique_subset, - df_name=df_name, + df_name=asset_key, )