From 14759b843942292b9775239d513b457a04a5c566 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Mon, 2 Dec 2024 18:55:06 -0500
Subject: [PATCH] Add FERC1 validation tests (#3860)

* Load dagster assets for FERC1 validation tests

This lets us test things that aren't in `PudlTabl`.

* Improve performance by caching the asset loader.

* Add minmax tests for all FERC1 assets.

* Update release notes

* move release notes docs

---------

Co-authored-by: Christina Gosnell <cgosnell@catalyst.coop>
---
 docs/release_notes.rst      |   2 +
 test/conftest.py            |  13 +++++
 test/validate/ferc1_test.py | 112 ++++++++++++++++++++++--------------
 3 files changed, 84 insertions(+), 43 deletions(-)

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index f1e6585a65..ea5857714b 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -22,6 +22,8 @@ Bug Fixes
   :ref:`core_ferc1__yearly_other_regulatory_liabilities_sched278`. See issue
   :issue:`3952` and PRs :pr:`3969,3979`. Thanks to :user:`yolandazzz13` for making
   this fix.
+* Added preliminary data validation checks for several FERC 1 tables that were
+  missing them. See PR :pr:`3860`.
 
 Major Dependency Updates
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/test/conftest.py b/test/conftest.py
index ef23a27a3e..55994116db 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -11,6 +11,7 @@
 import pytest
 import sqlalchemy as sa
 from dagster import (
+    AssetValueLoader,
     build_init_resource_context,
     graph,
     materialize_to_memory,
@@ -18,6 +19,7 @@
 
 import pudl
 from pudl import resources
+from pudl.etl import defs
 from pudl.etl.cli import pudl_etl_job_factory
 from pudl.extract.ferc1 import Ferc1DbfExtractor, raw_ferc1_xbrl__metadata_json
 from pudl.extract.ferc714 import raw_ferc714_xbrl__metadata_json
@@ -105,6 +107,17 @@ def live_databases(request) -> bool:
     return request.config.getoption("--live-dbs")
 
 
+@pytest.fixture(scope="session")
+def asset_value_loader() -> AssetValueLoader:
+    """Fixture that initializes an asset value loader.
+
+    Use ``asset_value_loader.load_asset_value`` instead of
+    ``defs.load_asset_value`` so that the asset value loader is
+    initialized once per session rather than for every asset load.
+    """
+    return defs.get_asset_value_loader()
+
+
 @pytest.fixture(scope="session", name="save_unmapped_ids")
 def save_unmapped_ids(request) -> bool:
     """Fixture that tells whether to use existing live FERC1/PUDL DBs)."""
diff --git a/test/validate/ferc1_test.py b/test/validate/ferc1_test.py
index 34e43fd67c..92991ece17 100644
--- a/test/validate/ferc1_test.py
+++ b/test/validate/ferc1_test.py
@@ -50,9 +50,9 @@
 
 
 @pytest.mark.parametrize("table_name", unique_record_tables)
-def test_record_id_dupes(pudl_engine, table_name):
+def test_record_id_dupes(table_name, asset_value_loader):
     """Verify that the generated ferc1 record_ids are unique."""
-    table = pd.read_sql(table_name, pudl_engine)
+    table = asset_value_loader.load_asset_value(table_name)
     n_dupes = table.record_id.duplicated().to_numpy().sum()
 
     if n_dupes:
@@ -63,76 +63,101 @@ def test_record_id_dupes(pudl_engine, table_name):
 
 
 @pytest.mark.parametrize(
-    "df_name,cols",
+    "asset_key,cols",
     [
-        ("fbp_ferc1", "all"),
-        ("fuel_ferc1", "all"),
-        ("plant_in_service_ferc1", "all"),
-        ("plants_all_ferc1", "all"),
-        ("plants_hydro_ferc1", "all"),
-        ("plants_pumped_storage_ferc1", "all"),
-        ("plants_small_ferc1", "all"),
-        ("plants_steam_ferc1", "all"),
-        ("pu_ferc1", "all"),
-        ("purchased_power_ferc1", "all"),
+        ("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", "all"),
+        ("out_ferc1__yearly_steam_plants_fuel_sched402", "all"),
+        ("out_ferc1__yearly_plant_in_service_sched204", "all"),
+        ("out_ferc1__yearly_all_plants", "all"),
+        ("out_ferc1__yearly_hydroelectric_plants_sched406", "all"),
+        ("out_ferc1__yearly_pumped_storage_plants_sched408", "all"),
+        ("out_ferc1__yearly_small_plants_sched410", "all"),
+        ("out_ferc1__yearly_steam_plants_sched402", "all"),
+        ("_out_ferc1__yearly_plants_utilities", "all"),
+        ("out_ferc1__yearly_purchased_power_and_exchanges_sched326", "all"),
     ],
 )
-def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name):
+def test_no_null_cols_ferc1(live_dbs, asset_value_loader, cols, asset_key):
     """Verify that output DataFrames have no entirely NULL columns."""
     if not live_dbs:
         pytest.skip("Data validation only works with a live PUDL DB.")
     pv.no_null_cols(
-        pudl_out_ferc1.__getattribute__(df_name)(), cols=cols, df_name=df_name
+        asset_value_loader.load_asset_value(asset_key), cols=cols, df_name=asset_key
     )
 
 
 @pytest.mark.parametrize(
-    "df_name,expected_rows",
+    "asset_key,expected_rows",
     [
-        ("fbp_ferc1", 26_947),
-        ("fuel_ferc1", 51_238),
-        ("plant_in_service_ferc1", 355_918),
-        ("plants_all_ferc1", 58_520),
-        ("plants_hydro_ferc1", 7_202),
-        ("plants_pumped_storage_ferc1", 580),
-        ("plants_small_ferc1", 17_763),
-        ("plants_steam_ferc1", 32_975),
-        ("pu_ferc1", 7_887),
-        ("purchased_power_ferc1", 211_794),
+        ("_out_ferc1__yearly_plants_utilities", 7_887),
+        ("out_ferc1__yearly_all_plants", 58_520),
+        ("out_ferc1__yearly_balance_sheet_assets_sched110", 278_789),
+        ("out_ferc1__yearly_balance_sheet_liabilities_sched110", 233_383),
+        ("out_ferc1__yearly_cash_flows_sched120", 306_837),
+        ("out_ferc1__yearly_depreciation_by_function_sched219", 148_352),
+        ("out_ferc1__yearly_depreciation_changes_sched219", 263_942),
+        ("out_ferc1__yearly_depreciation_summary_sched336", 216_710),
+        ("out_ferc1__yearly_energy_dispositions_sched401", 25_954),
+        ("out_ferc1__yearly_energy_sources_sched401", 38_315),
+        ("out_ferc1__yearly_hydroelectric_plants_sched406", 7_202),
+        ("out_ferc1__yearly_income_statements_sched114", 347_394),
+        ("out_ferc1__yearly_operating_expenses_sched320", 618_518),
+        ("out_ferc1__yearly_operating_revenues_sched300", 77_646),
+        ("out_ferc1__yearly_other_regulatory_liabilities_sched278", 53_015),
+        ("out_ferc1__yearly_plant_in_service_sched204", 355_918),
+        ("out_ferc1__yearly_pumped_storage_plants_sched408", 580),
+        ("out_ferc1__yearly_purchased_power_and_exchanges_sched326", 211_794),
+        ("out_ferc1__yearly_retained_earnings_sched118", 105_585),
+        ("out_ferc1__yearly_sales_by_rate_schedules_sched304", 303_909),
+        ("out_ferc1__yearly_small_plants_sched410", 17_763),
+        ("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", 26_947),
+        ("out_ferc1__yearly_steam_plants_fuel_sched402", 51_238),
+        ("out_ferc1__yearly_steam_plants_sched402", 32_975),
+        ("out_ferc1__yearly_transmission_lines_sched422", 640_619),
+        ("out_ferc1__yearly_utility_plant_summary_sched200", 198_769),
     ],
 )
-def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
+def test_minmax_rows(live_dbs, asset_value_loader, expected_rows, asset_key):
     """Verify that output DataFrames don't have too many or too few rows.
 
     Args:
-        pudl_out_ferc1: A PudlTabl output object.
         live_dbs: Boolean (wether we're using a live or testing DB).
         expected_rows (int): Expected number of rows that the dataframe should
             contain when all data is loaded and is output without aggregation.
-        df_name (str): Shorthand name identifying the dataframe, corresponding
-            to the name of the function used to pull it from the PudlTabl
-            output object.
+        asset_key (str): Key of the Dagster asset whose row count is checked.
     """
     if not live_dbs:
         pytest.skip("Data validation only works with a live PUDL DB.")
     _ = (
-        pudl_out_ferc1.__getattribute__(df_name)()
+        asset_value_loader.load_asset_value(asset_key)
         .pipe(
-            pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
+            pv.check_min_rows,
+            expected_rows=expected_rows,
+            margin=0.0,
+            df_name=asset_key,
         )
         .pipe(
-            pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
+            pv.check_max_rows,
+            expected_rows=expected_rows,
+            margin=0.0,
+            df_name=asset_key,
         )
     )
 
 
 @pytest.mark.parametrize(
-    "df_name,unique_subset",
+    "asset_key,unique_subset",
     [
-        ("pu_ferc1", ["utility_id_ferc1", "plant_name_ferc1"]),
-        ("fbp_ferc1", ["report_year", "utility_id_ferc1", "plant_name_ferc1"]),
         (
-            "plants_hydro_ferc1",
+            "_out_ferc1__yearly_plants_utilities",
+            ["utility_id_ferc1", "plant_name_ferc1"],
+        ),
+        (
+            "out_ferc1__yearly_steam_plants_fuel_by_plant_sched402",
+            ["report_year", "utility_id_ferc1", "plant_name_ferc1"],
+        ),
+        (
+            "out_ferc1__yearly_hydroelectric_plants_sched406",
             [
                 "report_year",
                 "utility_id_ferc1",
@@ -141,7 +166,7 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
             ],
         ),
         (
-            "plants_pumped_storage_ferc1",
+            "out_ferc1__yearly_pumped_storage_plants_sched408",
             [
                 "report_year",
                 "utility_id_ferc1",
@@ -150,19 +175,19 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
             ],
         ),
         (
-            "plant_in_service_ferc1",
+            "out_ferc1__yearly_plant_in_service_sched204",
             ["report_year", "utility_id_ferc1", "ferc_account_label"],
         ),
     ],
 )
-def test_unique_rows_ferc1(pudl_out_ferc1, live_dbs, df_name, unique_subset):
+def test_unique_rows_ferc1(live_dbs, asset_value_loader, asset_key, unique_subset):
     """Test whether dataframe has unique records within a subset of columns."""
     if not live_dbs:
         pytest.skip("Data validation only works with a live PUDL DB.")
     pv.check_unique_rows(
-        pudl_out_ferc1.__getattribute__(df_name)(),
+        asset_value_loader.load_asset_value(asset_key),
         subset=unique_subset,
-        df_name=df_name,
+        df_name=asset_key,
     )