diff --git a/src/pudl/output/ferc714.py b/src/pudl/output/ferc714.py index 8049536b46..7d027f413a 100644 --- a/src/pudl/output/ferc714.py +++ b/src/pudl/output/ferc714.py @@ -652,18 +652,20 @@ def summarized_demand_ferc714( demand_hourly_pa_ferc714.loc[ :, ["report_date", "respondent_id_ferc714", "demand_mwh"] ], + on=["report_date", "respondent_id_ferc714"], how="left", ) - .groupby(["report_date", "respondent_id_ferc714"]) - .agg({"demand_mwh": sum}) + .groupby(["report_date", "respondent_id_ferc714"], as_index=False)[ + ["demand_mwh"] + ] + .sum(min_count=1) .rename(columns={"demand_mwh": "demand_annual_mwh"}) - .reset_index() .merge( georeferenced_counties_ferc714.groupby( - ["report_date", "respondent_id_ferc714"] - ) - .agg({"population": sum, "area_km2": sum}) - .reset_index() + ["report_date", "respondent_id_ferc714"], as_index=False + )[["population", "area_km2"]].sum(min_count=1), + on=["report_date", "respondent_id_ferc714"], + how="left", ) .assign( population_density_km2=lambda x: x.population / x.area_km2, diff --git a/test/validate/service_territory_test.py b/test/validate/service_territory_test.py index 4dbee9a988..203f40ff0e 100644 --- a/test/validate/service_territory_test.py +++ b/test/validate/service_territory_test.py @@ -13,7 +13,7 @@ "df_name,expected_rows", [ ("summarized_demand_ferc714", 3_195), - ("fipsified_respondents_ferc714", 135_627), + ("fipsified_respondents_ferc714", 135_537), ("compiled_geometry_balancing_authority_eia861", 112_507), ("compiled_geometry_utility_eia861", 247_705), ], @@ -46,3 +46,43 @@ def test_minmax_rows( pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name ) ) + + +@pytest.mark.parametrize( + "df_name,expected_rows", + [("demand_hourly_pa_ferc714", 15_608_154)], +) +def test_minmax_rows_and_year_in_demand_hourly_pa_ferc714( + pudl_out_orig: "pudl.output.pudltabl.PudlTabl", + live_dbs: bool, + expected_rows: int, + df_name: str, +): + """Test if the majority of the years in the two date columns line up & min/max rows. + + We are parameterizing this test even though it only has one input because the + test_minmax_rows is a common test across many tables and we wanted to preserve the + format. + """ + if not live_dbs: + pytest.skip("Data validation only works with a live PUDL DB.") + demand_hourly_pa_ferc714 = pudl_out_orig.__getattribute__(df_name)() + _ = demand_hourly_pa_ferc714.pipe( + pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name + ).pipe(pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name) + + logger.info("Checking the consistency of the year in the multiple date columns.") + mismatched_report_years = demand_hourly_pa_ferc714[ + ( + demand_hourly_pa_ferc714.utc_datetime.dt.year + != demand_hourly_pa_ferc714.report_date.dt.year + ) + ] + if ( + off_ratio := len(mismatched_report_years) / len(demand_hourly_pa_ferc714) + ) > 0.001: + raise AssertionError( + f"Found more ({off_ratio:.2%}) than expected (>.1%) FERC714 records" + " where the report year from the utc_datetime differs from the " + "report_date column." + )