Skip to content

Commit

Permalink
NREL ATB extraction (#3498)
Browse files Browse the repository at this point in the history
* eia860 solar: extract step wahoo

* tweak column names

* first pass of extracting 860 wind

* first pass at extracting nrelatb

* fix lil unit test bbs

* actually test rename in unit test

* fix suffix number in column map

* add definitions pages into data source docs

---------

Co-authored-by: E. Belfer <[email protected]>
  • Loading branch information
cmgosnell and e-belfer authored Mar 26, 2024
1 parent 1c005ce commit b9dc700
Show file tree
Hide file tree
Showing 17 changed files with 2,036 additions and 15 deletions.
518 changes: 518 additions & 0 deletions docs/data_sources/nrelatb/nrelatb_definitions_2020.html

Large diffs are not rendered by default.

442 changes: 442 additions & 0 deletions docs/data_sources/nrelatb/nrelatb_definitions_2021.html

Large diffs are not rendered by default.

442 changes: 442 additions & 0 deletions docs/data_sources/nrelatb/nrelatb_definitions_2022.html

Large diffs are not rendered by default.

482 changes: 482 additions & 0 deletions docs/data_sources/nrelatb/nrelatb_definitions_2023.html

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/pudl/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
*load_assets_from_modules([pudl.extract.eia191], group_name="raw_eia191"),
*load_assets_from_modules([pudl.extract.eia757a], group_name="raw_eia757a"),
*load_assets_from_modules([pudl.extract.phmsagas], group_name="raw_phmsagas"),
*load_assets_from_modules([pudl.extract.nrelatb], group_name="raw_nrelatb"),
*load_assets_from_modules([pudl.extract.eia860m], group_name="raw_eia860m"),
*load_assets_from_modules([pudl.extract.eia860], group_name="raw_eia860"),
*load_assets_from_modules([pudl.transform.eia860], group_name="_core_eia860"),
Expand Down
1 change: 1 addition & 0 deletions src/pudl/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
excel,
ferc1,
ferc714,
nrelatb,
phmsagas,
xbrl,
)
11 changes: 0 additions & 11 deletions src/pudl/extract/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,6 @@ def get_file_name(self, page, **partition):
"""Returns file name of given partition and page."""
return self._file_name.loc[page, str(self._get_partition_selection(partition))]

def get_column_map(self, page, **partition):
    """Return dictionary for renaming columns in a given partition and page.

    The stored column map for ``page`` maps pudl column name -> raw column
    name, one column per partition. This inverts the mapping for the selected
    partition (raw -> pudl) so it can be fed to ``DataFrame.rename``. Entries
    whose raw value is -1 are excluded — presumably -1 marks a column that is
    absent from this partition; confirm against the column_maps CSVs.
    """
    return {
        v: k
        for k, v in self._column_map[page]
        .T.loc[str(self._get_partition_selection(partition))]
        .to_dict()
        .items()
        if v != -1
    }

def get_form(self, page) -> str:
"""Returns the form name for a given page."""
return self._page_part_map.loc[page, "form"]
Expand Down
13 changes: 12 additions & 1 deletion src/pudl/extract/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,17 @@ def get_all_columns(self, page) -> list[str]:
"""Returns list of all pudl columns for a given page across all partitions."""
return sorted(self._column_map[page].T.columns)

def get_column_map(self, page, **partition):
    """Return dictionary for renaming columns in a given partition and page.

    Inverts the stored pudl -> raw column mapping for the selected partition
    so the result (raw -> pudl) can be passed directly to
    ``DataFrame.rename``. Entries mapped to -1 are dropped — presumably -1
    marks a column absent from this partition.
    """
    selection = str(self._get_partition_selection(partition))
    pudl_to_raw = self._column_map[page].T.loc[selection].to_dict()
    return {raw: pudl for pudl, raw in pudl_to_raw.items() if raw != -1}


class GenericExtractor(ABC):
"""Generic extractor base class."""
Expand Down Expand Up @@ -145,7 +156,7 @@ def process_raw(
self, df: pd.DataFrame, page: str, **partition: PartitionSelection
) -> pd.DataFrame:
"""Takes any special steps for processing raw data and renaming columns."""
return df
return df.rename(columns=self._metadata.get_column_map(page, **partition))

def process_renamed(
self, df: pd.DataFrame, page: str, **partition: PartitionSelection
Expand Down
34 changes: 34 additions & 0 deletions src/pudl/extract/nrelatb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Routines used for extracting the raw NREL ATB data."""

from dagster import Output, asset

from pudl.extract.extractor import GenericMetadata, raw_df_factory
from pudl.extract.parquet import ParquetExtractor


class Extractor(ParquetExtractor):
    """Extractor for NREL ATB."""

    def __init__(self, *args, **kwargs):
        """Initialize the extractor.

        Sets ``METADATA`` to the :class:`GenericMetadata` for the ``nrelatb``
        dataset before delegating to :class:`ParquetExtractor`.

        Args:
            args: positional arguments forwarded to the parent class —
                presumably an initialized datastore; confirm against
                :class:`ParquetExtractor` / ``GenericExtractor``.
            kwargs: keyword arguments forwarded to the parent class.
        """
        self.METADATA = GenericMetadata("nrelatb")
        super().__init__(*args, **kwargs)


# Upstream extraction asset: runs the NREL ATB Extractor across all configured
# partitions and collects the results (subscripted by page name downstream).
raw_nrelatb__all_dfs = raw_df_factory(Extractor, name="nrelatb")


@asset(
    required_resource_keys={"datastore", "dataset_settings"},
)
def raw_nrelatb__data(raw_nrelatb__all_dfs):
    """Extract raw NREL ATB data from annual parquet files to one dataframe.

    Args:
        raw_nrelatb__all_dfs: output of ``raw_df_factory`` — a mapping
            subscriptable by page name; only the ``"data"`` page is used here.

    Returns:
        An extracted NREL ATB dataframe.
    """
    return Output(value=raw_nrelatb__all_dfs["data"])
51 changes: 51 additions & 0 deletions src/pudl/extract/parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Extractor for Parquet data."""

import io

import pandas as pd

import pudl.logging_helpers
from pudl.extract.extractor import GenericExtractor, PartitionSelection

logger = pudl.logging_helpers.get_logger(__name__)


class ParquetExtractor(GenericExtractor):
    """Class for extracting dataframes from parquet files.

    The extraction logic is invoked by calling the extract() method of this
    class.
    """

    def source_filename(self, page: str, **partition: PartitionSelection) -> str:
        """Produce the source Parquet file name as it will appear in the archive.

        Args:
            page: pudl name for the dataset contents, eg "boiler_generator_assn"
                or "data"
            partition: partition to load. Examples:
                {'year': 2009}

        Returns:
            string name of the parquet file
        """
        partition_selection = self._metadata._get_partition_selection(partition)
        return f"{self._dataset_name}-{partition_selection}.parquet"

    def load_source(self, page: str, **partition: PartitionSelection) -> pd.DataFrame:
        """Produce the dataframe object for the given partition.

        This method assumes that the archive includes one unzipped file per
        partition.

        Args:
            page: pudl name for the dataset contents, eg "boiler_generator_assn"
                or "data"
            partition: partition to load. Examples:
                {'year': 2009}
                {'year_month': '2020-08'}

        Returns:
            pd.DataFrame instance containing the parquet file's data
        """
        res = self.ds.get_unique_resource(self._dataset_name, **partition)
        # Wrap the raw bytes so pandas can read the parquet payload in memory.
        return pd.read_parquet(io.BytesIO(res))
2 changes: 1 addition & 1 deletion src/pudl/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@
"source_format": "Parquet",
},
"working_partitions": {
"years": [2023],
"years": sorted(set(range(2019, 2024))),
},
"contributors": [
CONTRIBUTORS["catalyst-cooperative"],
Expand Down
21 changes: 21 additions & 0 deletions src/pudl/package_data/nrelatb/column_maps/data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
year_index,2019,2020,2021,2022,2023
report_year,atb_year,atb_year,atb_year,atb_year,atb_year
core_metric_key,core_metric_key,core_metric_key,core_metric_key,core_metric_key,core_metric_key
core_metric_parameter,core_metric_parameter,core_metric_parameter,core_metric_parameter,core_metric_parameter,core_metric_parameter
core_metric_case,core_metric_case,core_metric_case,core_metric_case,core_metric_case,core_metric_case
cost_recovery_period_years,crpyears,crpyears,crpyears,crpyears,crpyears
technology_description,technology,technology,technology,technology,technology
technology_alias,,,technology_alias,technology_alias,technology_alias
display_name,,,display_name,display_name,display_name
is_default,,,default,default,default
technology_description_detail_1,techdetail,techdetail,techdetail,techdetail,techdetail
technology_description_detail_2,,,,,techdetail2
resource_description,,,,,resourcedetail
is_technology_mature,,,,,maturity
technology_scale,,,,,scale
scenario_atb,scenario,scenario,scenario,scenario,scenario
core_metric_variable_year,core_metric_variable,core_metric_variable,core_metric_variable,core_metric_variable,core_metric_variable
units,units,units,units,units,units
value,value,value,value,value,value
update_date,update_date,,,,
revision_num,revision,revision,,,
2 changes: 2 additions & 0 deletions src/pudl/package_data/settings/etl_fast.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,5 @@ datasets:
year_quarters: ["2022q1"]
phmsagas:
years: [2022]
nrelatb:
years: [2023]
2 changes: 2 additions & 0 deletions src/pudl/package_data/settings/etl_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,5 @@ datasets:
2021,
2022,
]
nrelatb:
years: [2019, 2020, 2021, 2022, 2023]
14 changes: 14 additions & 0 deletions src/pudl/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,18 @@ class PhmsaGasSettings(GenericDatasetSettings):
years: list[int] = data_source.working_partitions["years"]


class NrelAtbSettings(GenericDatasetSettings):
    """An immutable pydantic model to validate NREL ATB settings.

    Args:
        data_source: DataSource metadata object
        years: list of years to validate.
    """

    # Working years come from the nrelatb entry in pudl.metadata.sources.
    data_source: ClassVar[DataSource] = DataSource.from_id("nrelatb")
    years: list[int] = data_source.working_partitions["years"]


class Eia923Settings(GenericDatasetSettings):
"""An immutable pydantic model to validate EIA 923 settings.
Expand Down Expand Up @@ -389,6 +401,7 @@ class DatasetsSettings(FrozenBaseModel):
ferc714: Ferc714Settings | None = None
glue: GlueSettings | None = None
phmsagas: PhmsaGasSettings | None = None
nrelatb: NrelAtbSettings | None = None

@model_validator(mode="before")
@classmethod
Expand All @@ -408,6 +421,7 @@ def default_load_all(cls, data: dict[str, Any]) -> dict[str, Any]:
data["ferc714"] = Ferc714Settings()
data["glue"] = GlueSettings()
data["phmsagas"] = PhmsaGasSettings()
data["nrelatb"] = NrelAtbSettings()

return data

Expand Down
1 change: 1 addition & 0 deletions src/pudl/workspace/datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ class ZenodoDoiSettings(BaseSettings):
ferc714: ZenodoDoi = "10.5281/zenodo.8326694"
gridpathratk: ZenodoDoi = "10.5281/zenodo.10844662"
phmsagas: ZenodoDoi = "10.5281/zenodo.10493790"
nrelatb: ZenodoDoi = "10.5281/zenodo.10839268"

model_config = SettingsConfigDict(env_prefix="pudl_zenodo_doi_", env_file=".env")

Expand Down
14 changes: 12 additions & 2 deletions test/unit/extract/csv_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,17 @@ def test_extract():
company_data = "Total of All Companies"
df = pd.DataFrame([company_data])
df.columns = [company_field]

# TODO: Once FakeExtractor is independent of eia176, mock out populating _column_map for PARTITION_SELECTION;
# Also include negative tests, i.e., for partition selections not in the _column_map
with (
patch.object(CsvExtractor, "load_source", return_value=df),
patch.object(
# Testing the rename
GenericMetadata,
"get_column_map",
return_value={company_field: "company_rename"},
),
patch.object(
# Transposing the df here to get the orientation we expect get_page_cols to return
CsvExtractor,
Expand All @@ -73,5 +80,8 @@ def test_extract():
),
):
res = extractor.extract(**PARTITION)
assert len(res) == 1
assert res[PAGE][company_field][0] == company_data
assert len(res) == 1 # Assert only one page extracted
assert list(res.keys()) == [PAGE] # Assert it is named correctly
assert (
res[PAGE]["company_rename"][0] == company_data
) # Assert that column correctly renamed and data is there.

0 comments on commit b9dc700

Please sign in to comment.