Skip to content

Commit

Permalink
Merge pull request #2994 from catalyst-cooperative/nightly-build-2023-31
Browse files Browse the repository at this point in the history
Merge dev into main for 2023-10-31
  • Loading branch information
zaneselvans authored Oct 31, 2023
2 parents 9280bc8 + bbd82ba commit 0d59c3b
Show file tree
Hide file tree
Showing 7 changed files with 984 additions and 239 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ repos:
# Formatters: hooks that re-write Python & documentation files
####################################################################################
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.1
rev: v0.1.3
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

# Format the code
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.10.0
rev: 23.10.1
hooks:
- id: black
language_version: python3.11
Expand Down
2 changes: 1 addition & 1 deletion devtools/datasette/publish.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ datasette_metadata_to_yml -o "metadata.yml"

datasette publish cloudrun \
--service catalyst-datasette \
--memory 4Gi \
--memory 32Gi \
--install datasette-cluster-map \
--install datasette-vega \
--install datasette-block-robots \
Expand Down
28 changes: 15 additions & 13 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ dependencies = [
"coloredlogs>=14.0,<15.1", # Dagster requires 14.0
"dagster-webserver>=1.4,<1.6",
"dagster>=1.4,<1.6",
"dask>=2022.5,<2023.10.1",
"dask>=2022.5,<2023.10.2",
"datapackage>=1.11,<1.16", # Transition datastore to use frictionless.
"email-validator>=1.0.3", # pydantic[email]
"fsspec>=2022.5,<2023.10.1",
"gcsfs>=2022.5,<2023.9.3",
"gcsfs>=2022.5,<2023.10.1",
"geopandas>=0.13,<0.15",
"grpcio==1.57.0", # Required by dagster. Version works with MacOS
"grpcio-health-checking==1.57.0", # Required by dagster. Version works with MacOS
Expand Down Expand Up @@ -193,19 +193,21 @@ select = [
"W", # pycodestyle warnings
]
ignore = [
"D401", # Require imperative mood in docstrings.
"D401", # Require imperative mood in docstrings.
"D417",
"E501", # Overlong lines.
"E203", # Space before ':' (black recommends to ignore)
"PD003", # Use of isna rather than isnull
"PD004", # Use of notna rather than notnull
"PD008", # Use of df.at[] rather than df.loc[]
"PD010", # Use of df.stack()
"PD013", # Use of df.unstack()
"PD015", # Use of pd.merge() rather than df.merge()
"PD901", # df as variable name
"E501", # Overlong lines.
"E203", # Space before ':' (black recommends to ignore)
"E226", # Missing whitespace around arithmetic operator
"E266", # Too many leading `#` before block comment
"PD003", # Use of isna rather than isnull
"PD004", # Use of notna rather than notnull
"PD008", # Use of df.at[] rather than df.loc[]
"PD010", # Use of df.stack()
"PD013", # Use of df.unstack()
"PD015", # Use of pd.merge() rather than df.merge()
"PD901", # df as variable name
"RET504", # Ignore unnecessary assignment before return
"S101", # Use of assert
"S101", # Use of assert
]

# Assume Python 3.11
Expand Down
166 changes: 116 additions & 50 deletions src/pudl/output/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,42 +12,108 @@
from matplotlib import pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from pandas._libs.missing import NAType as pandas_NAType
from pydantic import BaseModel, confloat, validator
from pydantic import BaseModel, validator

import pudl
from pudl.transform.ferc1 import (
GroupMetricChecks,
GroupMetricTolerances,
MetricTolerances,
)

logger = pudl.logging_helpers.get_logger(__name__)


class CalculationToleranceFerc1(BaseModel):
"""Data quality expectations related to FERC 1 calculations.
We are doing a lot of comparisons between calculated and reported values to identify
reporting errors in the data, errors in FERC's metadata, and bugs in our own code.
This class provides a structure for encoding our expectations about the level of
acceptable (or at least expected) errors, and allows us to pass them around.
In the future we might also want to specify much more granular expectations,
pertaining to individual tables, years, utilities, or facts to ensure that we don't
have low overall error rates, but a problem with the way the data or metadata is
reported in a particular year. We could also define per-filing and per-table error
tolerances to help us identify individual utilities that have e.g. used an outdated
version of Form 1 when filing.
"""

intertable_calculation_errors: confloat(ge=0.0, le=1.0) = 0.05
"""Fraction of interatble calculations that are allowed to not match exactly."""


EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationToleranceFerc1] = {
"income_statement_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.20,
EXPLOSION_CALCULATION_TOLERANCES: dict[str, GroupMetricChecks] = {
"income_statement_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.02,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.036,
relative_error_magnitude=0.048,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.35,
relative_error_magnitude=0.17,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.13,
relative_error_magnitude=0.42,
null_calculated_value_frequency=1.0,
),
),
),
"balance_sheet_assets_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.65,
"balance_sheet_assets_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.014,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.12,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.37,
relative_error_magnitude=0.22,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.21,
relative_error_magnitude=0.26,
null_calculated_value_frequency=1.0,
),
),
),
"balance_sheet_liabilities_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.07,
"balance_sheet_liabilities_ferc1": GroupMetricChecks(
groups_to_check=[
"ungrouped",
"report_year",
"xbrl_factoid",
"utility_id_ferc1",
],
group_metric_tolerances=GroupMetricTolerances(
ungrouped=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.019,
null_calculated_value_frequency=1.0,
),
report_year=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
xbrl_factoid=MetricTolerances(
error_frequency=0.028,
relative_error_magnitude=0.019,
null_calculated_value_frequency=1.0,
),
utility_id_ferc1=MetricTolerances(
error_frequency=0.063,
relative_error_magnitude=0.04,
null_calculated_value_frequency=1.0,
),
),
),
}

Expand Down Expand Up @@ -980,7 +1046,7 @@ def exploded_table_asset_factory(
root_table: str,
table_names_to_explode: list[str],
seed_nodes: list[NodeId],
calculation_tolerance: CalculationToleranceFerc1,
group_metric_checks: GroupMetricChecks,
io_manager_key: str | None = None,
) -> AssetsDefinition:
"""Create an exploded table based on a set of related input tables."""
Expand Down Expand Up @@ -1017,7 +1083,7 @@ def exploded_tables_asset(
calculation_components_xbrl_ferc1=calculation_components_xbrl_ferc1,
seed_nodes=seed_nodes,
tags=tags,
calculation_tolerance=calculation_tolerance,
group_metric_checks=group_metric_checks,
).boom(tables_to_explode=tables_to_explode)

return exploded_tables_asset
Expand All @@ -1039,7 +1105,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"electric_operating_expenses_ferc1",
"electric_operating_revenues_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"income_statement_ferc1"
],
"seed_nodes": [
Expand All @@ -1060,7 +1126,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"plant_in_service_ferc1",
"electric_plant_depreciation_functional_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"balance_sheet_assets_ferc1"
],
"seed_nodes": [
Expand All @@ -1079,7 +1145,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]:
"balance_sheet_liabilities_ferc1",
"retained_earnings_ferc1",
],
"calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[
"group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
"balance_sheet_liabilities_ferc1"
],
"seed_nodes": [
Expand Down Expand Up @@ -1110,7 +1176,7 @@ def __init__(
calculation_components_xbrl_ferc1: pd.DataFrame,
seed_nodes: list[NodeId],
tags: pd.DataFrame = pd.DataFrame(),
calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1(),
group_metric_checks: GroupMetricChecks = GroupMetricChecks(),
):
"""Instantiate an Exploder class.
Expand All @@ -1124,7 +1190,7 @@ def __init__(
"""
self.table_names: list[str] = table_names
self.root_table: str = root_table
self.calculation_tolerance = calculation_tolerance
self.group_metric_checks = group_metric_checks
self.metadata_xbrl_ferc1 = metadata_xbrl_ferc1
self.calculation_components_xbrl_ferc1 = calculation_components_xbrl_ferc1
self.seed_nodes = seed_nodes
Expand Down Expand Up @@ -1302,7 +1368,7 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1":
exploded_meta=self.exploded_meta,
seeds=self.seed_nodes,
tags=self.tags,
calculation_tolerance=self.calculation_tolerance,
group_metric_checks=self.group_metric_checks,
)

@cached_property
Expand Down Expand Up @@ -1366,8 +1432,6 @@ def boom(self: Self, tables_to_explode: dict[str, pd.DataFrame]) -> pd.DataFrame
Args:
tables_to_explode: dictionary of table name (key) to transformed table (value).
calculation_tolerance: What proportion (0-1) of calculated values are
allowed to be incorrect without raising an AssertionError.
"""
exploded = (
self.initial_explosion_concatenation(tables_to_explode)
Expand Down Expand Up @@ -1454,7 +1518,7 @@ def reconcile_intertable_calculations(
components originate entirely or partially outside of the table. It also
accounts for components that only sum to a factoid within a particular dimension
(e.g., for an electric utility or for plants whose plant_function is
"in_service"). This returns a dataframe with a "calculated_amount" column.
"in_service"). This returns a dataframe with a "calculated_value" column.
Args:
exploded: concatenated tables for table explosion.
Expand All @@ -1479,11 +1543,13 @@ def reconcile_intertable_calculations(
value_col=self.value_col,
)
calculated_df = pudl.transform.ferc1.check_calculation_metrics(
calculated_df=calculated_df, group_metric_checks=self.group_metric_checks
)
calculated_df = pudl.transform.ferc1.add_corrections(
calculated_df=calculated_df,
value_col=self.value_col,
calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors,
is_close_tolerance=pudl.transform.ferc1.IsCloseTolerance(),
table_name=self.root_table,
add_corrections=True,
)
logger.info("Checking sub-total calcs.")
subtotal_calcs = pudl.transform.ferc1.calculate_values_from_components(
Expand All @@ -1496,10 +1562,7 @@ def reconcile_intertable_calculations(
)
subtotal_calcs = pudl.transform.ferc1.check_calculation_metrics(
calculated_df=subtotal_calcs,
value_col=self.value_col,
calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors,
table_name=self.root_table,
add_corrections=True,
group_metric_checks=self.group_metric_checks,
)
return calculated_df

Expand Down Expand Up @@ -1551,7 +1614,7 @@ class XbrlCalculationForestFerc1(BaseModel):
exploded_calcs: pd.DataFrame = pd.DataFrame()
seeds: list[NodeId] = []
tags: pd.DataFrame = pd.DataFrame()
calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1()
group_metric_checks: GroupMetricChecks = GroupMetricChecks()

class Config:
"""Allow the class to store a dataframe."""
Expand Down Expand Up @@ -1785,8 +1848,8 @@ def check_conflicting_tags(annotated_forest: nx.DiGraph) -> None:
nodes = annotated_forest.nodes
for ancestor in nodes:
for descendant in nx.descendants(annotated_forest, ancestor):
for tag in nodes[ancestor]["tags"]:
if tag in nodes[descendant]["tags"]:
for tag in nodes[ancestor].get("tags", {}):
if tag in nodes[descendant].get("tags", {}):
ancestor_tag_value = nodes[ancestor]["tags"][tag]
descendant_tag_value = nodes[descendant]["tags"][tag]
if ancestor_tag_value != descendant_tag_value:
Expand Down Expand Up @@ -2069,7 +2132,7 @@ def leafy_meta(self: Self) -> pd.DataFrame:
leaf_tags = {}
ancestors = list(nx.ancestors(self.annotated_forest, leaf)) + [leaf]
for node in ancestors:
leaf_tags |= self.annotated_forest.nodes[node]["tags"]
leaf_tags |= self.annotated_forest.nodes[node].get("tags", {})
all_leaf_weights = {
self._get_path_weight(path, self.annotated_forest)
for path in nx.all_simple_paths(
Expand Down Expand Up @@ -2273,5 +2336,8 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame:
}
index = pd.DataFrame(node_dict.keys()).astype("string")
data = pd.DataFrame(node_dict.values())
tags = pd.json_normalize(data.tags).astype("string")
try:
tags = pd.json_normalize(data.tags).astype("string")
except AttributeError:
tags = pd.DataFrame()
return pd.concat([index, tags], axis="columns")
Loading

0 comments on commit 0d59c3b

Please sign in to comment.