Skip to content

Commit

Permalink
Add FERC1 validation tests (#3860)
Browse files Browse the repository at this point in the history
* Load dagster assets for FERC1 validation tests

This lets us test things that aren't in `PudlTabl`.

* Improve performance by caching the asset loader.

* Add minmax tests for all FERC1 assets.

* Update release notes

* move release notes docs

---------

Co-authored-by: Christina Gosnell <[email protected]>
  • Loading branch information
jdangerx and cmgosnell authored Dec 2, 2024
1 parent c616d6d commit 14759b8
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 43 deletions.
2 changes: 2 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ Bug Fixes
:ref:`core_ferc1__yearly_other_regulatory_liabilities_sched278`. See issue
:issue:`3952` and PRs :pr:`3969,3979`. Thanks to :user:`yolandazzz13` for making
this fix.
* Added preliminary data validation checks for several FERC 1 tables that were
missing them. See PR :pr:`3860`.

Major Dependency Updates
^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
13 changes: 13 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
import pytest
import sqlalchemy as sa
from dagster import (
AssetValueLoader,
build_init_resource_context,
graph,
materialize_to_memory,
)

import pudl
from pudl import resources
from pudl.etl import defs
from pudl.etl.cli import pudl_etl_job_factory
from pudl.extract.ferc1 import Ferc1DbfExtractor, raw_ferc1_xbrl__metadata_json
from pudl.extract.ferc714 import raw_ferc714_xbrl__metadata_json
Expand Down Expand Up @@ -105,6 +107,17 @@ def live_databases(request) -> bool:
return request.config.getoption("--live-dbs")


@pytest.fixture(scope="session")
def asset_value_loader() -> AssetValueLoader:
    """Provide one asset value loader shared by the whole test session.

    Tests should call ``asset_value_loader.load_asset_value`` rather than
    ``defs.load_asset_value`` so that the asset value loader is not
    reinitialized on every load.
    """
    # Built once per session (see the fixture scope) and handed to every test.
    loader = defs.get_asset_value_loader()
    return loader


@pytest.fixture(scope="session", name="save_unmapped_ids")
def save_unmapped_ids(request) -> bool:
"""Fixture that tells whether to use existing live FERC1/PUDL DBs."""
Expand Down
112 changes: 69 additions & 43 deletions test/validate/ferc1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@


@pytest.mark.parametrize("table_name", unique_record_tables)
def test_record_id_dupes(pudl_engine, table_name):
def test_record_id_dupes(table_name, asset_value_loader):
"""Verify that the generated ferc1 record_ids are unique."""
table = pd.read_sql(table_name, pudl_engine)
table = asset_value_loader.load_asset_value(table_name)
n_dupes = table.record_id.duplicated().to_numpy().sum()

if n_dupes:
Expand All @@ -63,76 +63,102 @@ def test_record_id_dupes(pudl_engine, table_name):


@pytest.mark.parametrize(
"df_name,cols",
"asset_key,cols",
[
("fbp_ferc1", "all"),
("fuel_ferc1", "all"),
("plant_in_service_ferc1", "all"),
("plants_all_ferc1", "all"),
("plants_hydro_ferc1", "all"),
("plants_pumped_storage_ferc1", "all"),
("plants_small_ferc1", "all"),
("plants_steam_ferc1", "all"),
("pu_ferc1", "all"),
("purchased_power_ferc1", "all"),
("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", "all"),
("out_ferc1__yearly_steam_plants_fuel_sched402", "all"),
("out_ferc1__yearly_plant_in_service_sched204", "all"),
("out_ferc1__yearly_all_plants", "all"),
("out_ferc1__yearly_hydroelectric_plants_sched406", "all"),
("out_ferc1__yearly_pumped_storage_plants_sched408", "all"),
("out_ferc1__yearly_small_plants_sched410", "all"),
("out_ferc1__yearly_steam_plants_sched402", "all"),
("_out_ferc1__yearly_plants_utilities", "all"),
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", "all"),
],
)
def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name):
def test_no_null_cols_ferc1(live_dbs, asset_value_loader, cols, asset_key):
"""Verify that output DataFrames have no entirely NULL columns."""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
pv.no_null_cols(
pudl_out_ferc1.__getattribute__(df_name)(), cols=cols, df_name=df_name
asset_value_loader.load_asset_value(asset_key), cols=cols, df_name=asset_key
)


@pytest.mark.parametrize(
"df_name,expected_rows",
"asset_key,expected_rows",
[
("fbp_ferc1", 26_947),
("fuel_ferc1", 51_238),
("plant_in_service_ferc1", 355_918),
("plants_all_ferc1", 58_520),
("plants_hydro_ferc1", 7_202),
("plants_pumped_storage_ferc1", 580),
("plants_small_ferc1", 17_763),
("plants_steam_ferc1", 32_975),
("pu_ferc1", 7_887),
("purchased_power_ferc1", 211_794),
("_out_ferc1__yearly_plants_utilities", 7_887),
("out_ferc1__yearly_all_plants", 58_520),
("out_ferc1__yearly_balance_sheet_assets_sched110", 278_789),
("out_ferc1__yearly_balance_sheet_liabilities_sched110", 233_383),
("out_ferc1__yearly_cash_flows_sched120", 306_837),
("out_ferc1__yearly_depreciation_by_function_sched219", 148_352),
("out_ferc1__yearly_depreciation_changes_sched219", 263_942),
("out_ferc1__yearly_depreciation_summary_sched336", 216_710),
("out_ferc1__yearly_energy_dispositions_sched401", 25_954),
("out_ferc1__yearly_energy_sources_sched401", 38_315),
("out_ferc1__yearly_hydroelectric_plants_sched406", 7_202),
("out_ferc1__yearly_income_statements_sched114", 347_394),
("out_ferc1__yearly_operating_expenses_sched320", 618_518),
("out_ferc1__yearly_operating_revenues_sched300", 77_646),
("out_ferc1__yearly_other_regulatory_liabilities_sched278", 53015),
("out_ferc1__yearly_plant_in_service_sched204", 355_918),
("out_ferc1__yearly_pumped_storage_plants_sched408", 580),
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", 211_794),
("out_ferc1__yearly_retained_earnings_sched118", 105_585),
("out_ferc1__yearly_sales_by_rate_schedules_sched304", 303_909),
("out_ferc1__yearly_small_plants_sched410", 17763),
("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", 26_947),
("out_ferc1__yearly_steam_plants_fuel_sched402", 51_238),
("out_ferc1__yearly_steam_plants_sched402", 32_975),
("out_ferc1__yearly_transmission_lines_sched422", 640_619),
("out_ferc1__yearly_utility_plant_summary_sched200", 198_769),
("out_ferc1__yearly_small_plants_sched410", 17_763),
],
)
def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
def test_minmax_rows(live_dbs, asset_value_loader, expected_rows, asset_key):
"""Verify that output DataFrames don't have too many or too few rows.
Args:
pudl_out_ferc1: A PudlTabl output object.
live_dbs: Boolean (whether we're using a live or testing DB).
expected_rows (int): Expected number of rows that the dataframe should
contain when all data is loaded and is output without aggregation.
df_name (str): Shorthand name identifying the dataframe, corresponding
to the name of the function used to pull it from the PudlTabl
output object.
asset_key (str): The name of the asset.
"""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
_ = (
pudl_out_ferc1.__getattribute__(df_name)()
asset_value_loader.load_asset_value(asset_key)
.pipe(
pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
pv.check_min_rows,
expected_rows=expected_rows,
margin=0.0,
df_name=asset_key,
)
.pipe(
pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
pv.check_max_rows,
expected_rows=expected_rows,
margin=0.0,
df_name=asset_key,
)
)


@pytest.mark.parametrize(
"df_name,unique_subset",
"asset_key,unique_subset",
[
("pu_ferc1", ["utility_id_ferc1", "plant_name_ferc1"]),
("fbp_ferc1", ["report_year", "utility_id_ferc1", "plant_name_ferc1"]),
(
"plants_hydro_ferc1",
"_out_ferc1__yearly_plants_utilities",
["utility_id_ferc1", "plant_name_ferc1"],
),
(
"out_ferc1__yearly_steam_plants_fuel_by_plant_sched402",
["report_year", "utility_id_ferc1", "plant_name_ferc1"],
),
(
"out_ferc1__yearly_hydroelectric_plants_sched406",
[
"report_year",
"utility_id_ferc1",
Expand All @@ -141,7 +167,7 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
],
),
(
"plants_pumped_storage_ferc1",
"out_ferc1__yearly_pumped_storage_plants_sched408",
[
"report_year",
"utility_id_ferc1",
Expand All @@ -150,19 +176,19 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
],
),
(
"plant_in_service_ferc1",
"out_ferc1__yearly_plant_in_service_sched204",
["report_year", "utility_id_ferc1", "ferc_account_label"],
),
],
)
def test_unique_rows_ferc1(pudl_out_ferc1, live_dbs, df_name, unique_subset):
def test_unique_rows_ferc1(live_dbs, asset_value_loader, asset_key, unique_subset):
"""Test whether dataframe has unique records within a subset of columns."""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
pv.check_unique_rows(
pudl_out_ferc1.__getattribute__(df_name)(),
asset_value_loader.load_asset_value(asset_key),
subset=unique_subset,
df_name=df_name,
df_name=asset_key,
)


Expand Down

0 comments on commit 14759b8

Please sign in to comment.