From 095d31bdf34ca12ca86f5f5b69f3a4b0479968d3 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 7 Nov 2023 16:55:58 -0500 Subject: [PATCH 1/3] idk exactly why the "nan"s began existing but this fixes it --- src/pudl/analysis/classify_plants_ferc1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl/analysis/classify_plants_ferc1.py b/src/pudl/analysis/classify_plants_ferc1.py index 1a0f2835f6..68393c0699 100644 --- a/src/pudl/analysis/classify_plants_ferc1.py +++ b/src/pudl/analysis/classify_plants_ferc1.py @@ -546,7 +546,7 @@ def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame: if col in df.columns: # the replace to_replace={column_name: {"", pd.NA}} mysteriously doesn't work. df[col] = df[col].replace( - to_replace=[""], + to_replace=["", "nan"], value=pd.NA, ) return df From 14e6bc8e6909e5f52a0cbd27bba2bc54f875afec Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 7 Nov 2023 18:00:58 -0500 Subject: [PATCH 2/3] revert the replace of "nan" by stopping introducing them! plus some light clean up --- src/pudl/analysis/classify_plants_ferc1.py | 14 +++++++++----- src/pudl/output/ferc1.py | 4 ---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/pudl/analysis/classify_plants_ferc1.py b/src/pudl/analysis/classify_plants_ferc1.py index 68393c0699..ef6f800aa6 100644 --- a/src/pudl/analysis/classify_plants_ferc1.py +++ b/src/pudl/analysis/classify_plants_ferc1.py @@ -546,7 +546,7 @@ def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame: if col in df.columns: # the replace to_replace={column_name: {"", pd.NA}} mysteriously doesn't work. df[col] = df[col].replace( - to_replace=["", "nan"], + to_replace=[""], value=pd.NA, ) return df @@ -654,9 +654,11 @@ def fuel_by_plant_ferc1( ] # Ensure that the dataframe we've gotten has all the information we need: - for col in keep_cols: - if col not in fuel_df.columns: - raise AssertionError(f"Required column {col} not found in input fuel_df.") + missing_cols = [col for col in keep_cols if col not in fuel_df.columns] + if missing_cols: + raise AssertionError( + f"Required columns not found in input fuel_df: {missing_cols}" + ) # Calculate per-fuel derived values and add them to the DataFrame df = ( @@ -679,7 +681,8 @@ def fuel_by_plant_ferc1( "plant_name_ferc1", "report_year", "fuel_type_code_pudl", - ] + ], + observed=True, ) .sum() .reset_index() @@ -732,6 +735,7 @@ def fuel_by_plant_ferc1( ).reset_index() # Label each plant-year record by primary fuel: + df.loc[:, ["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = pd.NA for fuel_str in fuel_categories: try: mmbtu_mask = df[f"{fuel_str}_fraction_mmbtu"] > thresh diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index fe11acb346..5ed3551f3e 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -810,10 +810,6 @@ def drop_other_fuel_types(df): return df[df.fuel_type_code_pudl != "other"].copy() thresh = context.op_config["thresh"] - # The existing function expects `fuel_type_code_pudl` to be an object, rather than - # a category. This is a legacy of pre-dagster code, and we convert here to prevent - # further retooling in the code-base. - fuel_ferc1["fuel_type_code_pudl"] = fuel_ferc1["fuel_type_code_pudl"].astype(str) fuel_categories = list( pudl.transform.ferc1.FuelFerc1TableTransformer() From b7533cfbff5ff8538cab5208e7fde1b90c52132e Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 8 Nov 2023 10:23:14 -0500 Subject: [PATCH 3/3] REALLY REALLY its a nullable string --- src/pudl/analysis/classify_plants_ferc1.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pudl/analysis/classify_plants_ferc1.py b/src/pudl/analysis/classify_plants_ferc1.py index ef6f800aa6..9a8e6a1096 100644 --- a/src/pudl/analysis/classify_plants_ferc1.py +++ b/src/pudl/analysis/classify_plants_ferc1.py @@ -736,6 +736,12 @@ def fuel_by_plant_ferc1( # Label each plant-year record by primary fuel: df.loc[:, ["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = pd.NA + df = df.astype( + { + "primary_fuel_by_cost": pd.StringDtype(), + "primary_fuel_by_mmbtu": pd.StringDtype(), + } + ) for fuel_str in fuel_categories: try: mmbtu_mask = df[f"{fuel_str}_fraction_mmbtu"] > thresh