Skip to content

Commit

Permalink
Merge pull request #402 from singularity-energy/greg/missing_pjm
Browse files: browse the repository at this point in the history
Strip leading zeros from CEMS `emissions_unit_id_epa`
  • Loading branch information
grgmiller authored Dec 17, 2024
2 parents b9e9a83 + b241630 commit 7d80e74
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 3 deletions.
40 changes: 39 additions & 1 deletion notebooks/work_in_progress/sandbox.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,47 @@
"logger = get_logger(\"test\")\n",
"\n",
"\n",
"year = 2022\n",
"year = 2023\n",
"path_prefix = f\"{year}/\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# load intermediate output data as needed\n",
"cems = pd.read_csv(\n",
" outputs_folder(f\"{year}/cems_subplant_{year}.csv.zip\"),\n",
" compression=\"zip\",\n",
" parse_dates=[\"datetime_utc\", \"report_date\"],\n",
")\n",
"partial_cems_subplant = pd.read_csv(\n",
" outputs_folder(f\"{year}/partial_cems_subplant_{year}.csv.zip\"),\n",
" compression=\"zip\",\n",
" parse_dates=[\"datetime_utc\", \"report_date\"],\n",
")\n",
"partial_cems_plant = pd.read_csv(\n",
" outputs_folder(f\"{year}/partial_cems_plant_{year}.csv.zip\"),\n",
" compression=\"zip\",\n",
" parse_dates=[\"datetime_utc\", \"report_date\"],\n",
")\n",
"eia923_allocated = pd.read_csv(\n",
" outputs_folder(f\"{year}/eia923_allocated_{year}.csv.zip\"),\n",
" compression=\"zip\",\n",
" parse_dates=[\"report_date\"],\n",
")\n",
"plant_attributes = pd.read_csv(\n",
" outputs_folder(f\"{year}/plant_static_attributes_{year}.csv.zip\"), compression=\"zip\"\n",
")\n",
"primary_fuel_table = pd.read_csv(\n",
" outputs_folder(f\"{year}/primary_fuel_table_{year}.csv.zip\"), compression=\"zip\"\n",
")\n",
"monthly_eia_data_to_shape = eia923_allocated[\n",
" (eia923_allocated[\"hourly_data_source\"] == \"eia\")\n",
"]"
]
}
],
"metadata": {
Expand Down
4 changes: 2 additions & 2 deletions src/oge/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,14 +445,14 @@ def main(args):
if year < earliest_hourly_data_year:
# export plant static attributes to csv
output_data.output_intermediate_data(
plant_attributes.assign(shaped_plant_id=pd.NA),
plant_attributes,
"plant_static_attributes",
path_prefix,
year,
args.skip_outputs,
)
if not args.skip_outputs:
plant_attributes.assign(shaped_plant_id=pd.NA).to_csv(
plant_attributes.to_csv(
results_folder(f"{path_prefix}plant_data/plant_static_attributes.csv"),
index=False,
)
Expand Down
11 changes: 11 additions & 0 deletions src/oge/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ def load_cems_data(year: int) -> pd.DataFrame:
# convert to tz-naive datetime to allow for dtype application
cems = apply_dtypes(cems)

# strip leading zeros from emissions unit id
cems["emissions_unit_id_epa"] = cems["emissions_unit_id_epa"].str.lstrip("0")

# update the plant_id_eia column using manual matches
cems = update_epa_to_eia_map(cems, year)

Expand Down Expand Up @@ -144,6 +147,10 @@ def load_cems_ids() -> pd.DataFrame:
columns=["plant_id_epa", "plant_id_eia", "emissions_unit_id_epa"],
).drop_duplicates()
cems_id_year = apply_dtypes(cems_id_year)
# strip leading zeros from emissions unit id
cems_id_year["emissions_unit_id_epa"] = cems_id_year[
"emissions_unit_id_epa"
].str.lstrip("0")
# update the plant_id_eia column using manual matches
cems_id_year = update_epa_to_eia_map(cems_id_year, year)
cems_ids.append(cems_id_year)
Expand Down Expand Up @@ -707,6 +714,7 @@ def load_epa_eia_crosswalk_from_raw(year: int) -> pd.DataFrame:

# remove leading zeros from the generator id and emissions_unit_id_epa
crosswalk["EIA_GENERATOR_ID"] = crosswalk["EIA_GENERATOR_ID"].str.lstrip("0")
crosswalk["CAMD_UNIT_ID"] = crosswalk["CAMD_UNIT_ID"].str.lstrip("0")

# some eia plant ids are missing. Let us assume that the EIA and EPA plant ids
# match in this case
Expand Down Expand Up @@ -810,6 +818,9 @@ def load_epa_eia_crosswalk(year: int) -> pd.DataFrame:
"""

crosswalk = load_pudl_table("core_epa__assn_eia_epacamd")
crosswalk["emissions_unit_id_epa"] = crosswalk["emissions_unit_id_epa"].str.lstrip(
"0"
)

# load manually inputted data
crosswalk_manual = pd.read_csv(
Expand Down
8 changes: 8 additions & 0 deletions src/oge/reference_tables/epa_eia_crosswalk_manual.csv
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,11 @@ plant_id_epa,emissions_unit_id_epa,plant_id_eia,generator_id,start_year,end_year
55494,CT4,55494,CT2,,,CAMPD Facility database mapping
55494,CT5,55494,CT5,,,CAMPD Facility database mapping
55494,CT6,55494,CT6,,,CAMPD Facility database mapping
55481,1,55481,GT1,,2012,Changes plant ID in 2013
55481,1,55481,ST1,,2012,Changes plant ID in 2013
55481,2,55481,GT2,,2012,Changes plant ID in 2013
55481,2,55481,ST1,,2012,Changes plant ID in 2013
55481,1,58557,GT1,2013,,Changes plant ID in 2013
55481,1,58557,ST1,2013,,Changes plant ID in 2013
55481,2,58557,GT2,2013,,Changes plant ID in 2013
55481,2,58557,ST1,2013,,Changes plant ID in 2013

0 comments on commit 7d80e74

Please sign in to comment.