Skip to content

Commit

Permalink
Fix column mapping errors (#3820)
Browse files Browse the repository at this point in the history
* Flag missing/extra columns as extraction errors.

* add all early release columns into column maps

* update phmsa extractor & column map to squash column extraction errors

* remove straggler unstructured column in map

* fix inline comment

* Add unit tests to exercise new column mapping validation.

* Update comment to reflect current validations.

---------

Co-authored-by: Zane Selvans <[email protected]>
  • Loading branch information
cmgosnell and zaneselvans authored Aug 29, 2024
1 parent 23a982c commit d033574
Show file tree
Hide file tree
Showing 20 changed files with 223 additions and 34 deletions.
1 change: 0 additions & 1 deletion src/pudl/extract/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def process_raw(self, df, page, **partition):
f"column with {partition['year']}"
)
df.loc[mask, "report_year"] = partition["year"]
df = self.add_data_maturity(df, page, **partition)
return df

@staticmethod
Expand Down
6 changes: 3 additions & 3 deletions src/pudl/extract/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,16 +183,16 @@ def validate(self, df: pd.DataFrame, page: str, **partition: PartitionSelection)
page_cols = self.get_page_cols(page, partition_selection)
expected_cols = page_cols.union(self.cols_added)
if set(df.columns) != set(expected_cols):
# TODO (bendnorman): Enforce canonical fields for all raw fields?
# Ensure that expected and actually extracted columns match
extra_raw_cols = set(df.columns).difference(expected_cols)
missing_raw_cols = set(expected_cols).difference(df.columns)
if extra_raw_cols:
logger.warning(
raise ValueError(
f"{page}/{partition_selection}: Extra columns found in extracted table:"
f"\n{extra_raw_cols}"
)
if missing_raw_cols:
logger.warning(
raise ValueError(
f"{page}/{partition_selection}: Expected columns not found in extracted table:"
f"\n{missing_raw_cols}"
)
Expand Down
33 changes: 32 additions & 1 deletion src/pudl/extract/phmsagas.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def process_renamed(self, newdata: pd.DataFrame, page: str, **partition):
"""Drop columns that get mapped to other assets.
"""Drop columns that get mapped to other assets and columns with unstructured data.
Older years of PHMSA data have one Excel tab in the raw data, while newer data
has multiple tabs. To extract data into tables that follow the newer data format
Expand All @@ -52,6 +52,37 @@ def process_renamed(self, newdata: pd.DataFrame, page: str, **partition):
f"\n{to_drop}"
)
newdata = newdata.drop(columns=to_drop, errors="ignore")
# There is an annoying, middling number of columns in the PHMSA raw data that are unnamed
# and contain a smattering of random values. We want to drop them, but we explicitly
# enumerate the pages/years where we expect to drop them, so that if lots of new unmapped
# columns show up unexpectedly a warning is logged here (and an extraction validation error
# is raised).
# FYI: In 2009 there were a ton of extra columns, seemingly from two records that had a
# multi-line comment that shifted the rest of the cells over. Presumably we could map
# them all and manually shift the data, but it's only 2 records so for now we are just
# dropping them.
unnamed_page_years = {
"yearly_distribution": [
2000,
2001,
2002,
2003,
2004,
2005,
2006,
2007,
2009,
]
}
unnamed_columns = newdata.filter(like="unnamed").columns
if (page in unnamed_page_years) and (
int(partition["year"]) in unnamed_page_years[page]
):
newdata = newdata.drop(columns=unnamed_columns)
elif not unnamed_columns.empty:
logger.warning(
"We found some unnamed columns that are probably not expected. "
f"Consider dropping them. Columns found: {unnamed_columns}"
)
return newdata


Expand Down
2 changes: 1 addition & 1 deletion src/pudl/package_data/eia923/column_maps/boiler_fuel.csv
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@ ash_content_pct_december,ash_content_december,ash_content_december,ash_content_d
total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity,total_fuel_consumption_quantity
balancing_authority_code_eia,,,,,,,,,,,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,ba_code
report_year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year
early_release,,,,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ stored_onsite_1000_tons,stored_onsite,stored_onsite,stored_onsite,stored_onsite,
stored_offsite_1000_tons,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite_thousand_tons,stored_offsite_thousand_tons,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite,stored_offsite
total_disposal_1000_tons,total,total,total,total,total,total_thousand_tons_or_for_steam_mmbtu,total_thousand_tons_or_for_steam_mmbtu,total,total,total,total,total,total,total,total,total
byproducts_to_report,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts,no_byproducts
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ revenues_fly_bottom_ash_intermingled_1000_dollars,fly_bottom_ash_intermingled_re
revenues_fgd_byproducts_1000_dollars,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues_thousand_dollars,fgd_byproducts_revenues_thousand_dollars,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues,fgd_byproducts_revenues
revenues_other_byproducts_1000_dollars,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues_thousand_dollars,other_byproducts_revenues_thousand_dollars,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues,other_byproducts_revenues
revenues_total_byproduct_1000_dollars,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues_thousand_dollars,total_revenues_thousand_dollars,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues,total_revenues
early_release,,unnamed_30,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ annual_maximum_intake_winter_temperature_fahrenheit,intake_peak_winter_temperatu
annual_maximum_intake_summer_temperature_fahrenheit,intake_peak_summer_temperature,intake_peak_summer_temperature,,,,,,,,,,,,,,
annual_maximum_outlet_winter_temperature_fahrenheit,outlet_peak_winter_temperature,outlet_peak_winter_temperature,,,,,,,,,,,,,,
annual_maximum_outlet_summer_temperature_fahrenheit,outlet_peak_summer_temperature,outlet_peak_summer_temperature,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ fgd_electricity_consumption_mwh,fgd_electricity_consumption_megawatthours,fgd_el
mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency,mercury_removal_efficiency
mercury_emission_rate_lb_per_trillion_btu,,,,,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate,mercury_emission_rate
acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency,acid_gas_removal_efficiency
early_release,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ total_fuel_consumed_for_electricity_units,electric_fuel_consumption_quantity,ele
total_gross_generation_mwh,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours,gross_generation_megawatthours
total_net_generation_mwh,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours
report_year,year,year,year,year,year,year,year,year,year,year,year
early_release,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ fgd_electricity_consumption_mwh,electric_energy_consumption,electric_energy_cons
so2_removal_efficiency_annual,efficiency_at_annual_operating_factor,efficiency_at_annual_operating_factor,efficiency_at_annual_operating_factor,efficiency_at_annual_operating_factor,,,,,,,,,,,,
so2_removal_efficiency_tested,tested_efficiency_at_100_load,tested_efficiency_at_100_load,tested_efficiency_at_100_load,tested_efficiency_at_100_load,,,,,,,,,,,,
so2_test_date,date_of_latest_efficiency_test,date_of_latest_efficiency_test,date_of_latest_efficiency_test,date_of_latest_efficiency_test,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ natural_gas_delivery_contract_type_code,,,,,,,natural_gas_delivery_contract_type
moisture_content_pct,,,,,,,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content,moisture_content
chlorine_content_ppm,,,,,,,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content,chlorine_content
balancing_authority_code_eia,,,,,,,,,,,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,ba_code
early_release,,,,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,4 @@ total_fuel_consumption_mmbtu,total_fuel_consumption_mmbtus,total_fuel_consumptio
elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtus,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu
total_net_generation_mwh,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours,net_generation_megawatthours
report_year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year
early_release,,,,,,,,,,,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
2 changes: 1 addition & 1 deletion src/pudl/package_data/eia923/column_maps/generator.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ net_generation_mwh_december,net_generation_december,net_generation_december,net_
net_generation_mwh_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date,net_generation_year_to_date
balancing_authority_code_eia,,,,,,,,,,,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,balancing_authority_code,ba_code
report_year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year,year
early_release,,,,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
2 changes: 1 addition & 1 deletion src/pudl/package_data/eia923/column_maps/plant_frame.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ plant_name_eia,plant_name,plant_name,plant_name,plant_name,plant_name,plant_name
associated_combined_heat_power,combined_heat_and_power_status_y_chp_n_non_chp,combined_heat_and_power_status,combined_heat_and_power_status_y_chp_n_non_chp,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status,combined_heat_and_power_status
reporting_frequency_code,reporting_frequency_annual_or_monthly,reporting_frequency,reporting_frequency_annual_or_monthly,reporting_frequency,reporting_frequency,reporting_frequency,reporting_frequency,reporting_frequency,reporting_frequency,respondent_frequency,respondent_frequency,respondent_frequency,respondent_frequency,respondent_frequency
nameplate_capacity_mw,nameplate_capacity_mw,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
2 changes: 1 addition & 1 deletion src/pudl/package_data/eia923/column_maps/stocks.csv
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ petcoke_september,petcoke_sep,petcoke_sep,petcoke_sep,petcoke_sep,petcoke_sep,pe
petcoke_october,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_oct,petcoke_october,petcoke_oct,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october,petcoke_october
petcoke_november,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_nov,petcoke_november,petcoke_nov,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november,petcoke_november
petcoke_december,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_dec,petcoke_december,petcoke_dec,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december,petcoke_december
early_release,,,,,,,,,,,,,,,,,,,,,,,,
early_release,,,,,,,,,,,,,,,,,,,,,,,early_release_data_july_2024_not_fully_edited_use_with_caution_do_not_aggregate_to_state_regional_or_national_totals,
Loading

0 comments on commit d033574

Please sign in to comment.