
Hide _out tables and restore metadata in Datasette #3226

Merged: 10 commits, Jan 29, 2024
86 changes: 76 additions & 10 deletions devtools/datasette/publish.py
@@ -25,6 +25,7 @@
from subprocess import check_call, check_output

import click
import sqlalchemy as sa

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths
@@ -82,9 +83,75 @@ def inspect_data(datasets: list[str], pudl_out: Path) -> str:
    return inspect_output


def metadata(pudl_out: Path) -> str:
def check_tables_have_metadata(
    metadata: DatasetteMetadata,
    pudl_out: Path,
    databases: list[str],
) -> None:
    """Check to make sure all tables in the databases have metadata.

    This function fails if there are tables present in a database
    that do not have structure metadata.

    Args:
        metadata: The structure metadata for the datasette deployment.
        pudl_out: The directory that contains the pudl outputs.
        databases: The list of databases to test.
    """
    resources = {}
    resources |= metadata.xbrl_resources
    resources["pudl"] = metadata.resources

    database_table_exceptions = {"pudl": {"alembic_version"}}

    tables_missing_metadata_results = {}

    for database in databases:
        database_path = pudl_out / database
        database_name = database_path.stem

        # Grab all tables in the database
        engine = sa.create_engine(f"sqlite:///{str(database_path)}")
        inspector = sa.inspect(engine)
        tables_in_database = set(inspector.get_table_names())

        # There are some tables that we don't expect to have metadata,
        # like alembic_version in pudl.sqlite.
        table_exceptions = database_table_exceptions.get(database_name)

        if table_exceptions:
            tables_in_database = tables_in_database - table_exceptions
        tables_with_metadata = {resource.name for resource in resources[database_name]}

        # Find the tables that exist in the database that we don't have metadata for
        tables_missing_metadata = tables_in_database - tables_with_metadata

        tables_missing_metadata_results[database_name] = tables_missing_metadata

    no_tables_missing_metadata = all(
        not missing for missing in tables_missing_metadata_results.values()
    )

    assert (
        no_tables_missing_metadata
    ), f"These tables are missing datasette metadata: {tables_missing_metadata_results}"


def metadata(
    pudl_out: Path,
    databases: list[str],
) -> str:
    """Return human-readable metadata for Datasette."""
    return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()
    metadata = DatasetteMetadata.from_data_source_ids(pudl_out)

    # DBF databases do not have any metadata
    databases_with_metadata = (
        dataset
        for dataset in databases
        if dataset == "pudl.sqlite" or dataset.endswith("xbrl.sqlite")
    )
    check_tables_have_metadata(metadata, pudl_out, databases_with_metadata)
    return metadata.to_yaml()


@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@@ -124,13 +191,12 @@ def deploy_datasette(deploy: str, fly_args: tuple[str]) -> int:
        python publish.py --fly -- --build-only
    """
    pudl_out = PudlPaths().pudl_output
    metadata_yml = metadata(pudl_out)
    # Order the databases to highlight PUDL
    datasets = (
    databases = (
        ["pudl.sqlite"]
        + sorted(str(p.name) for p in pudl_out.glob("ferc*.sqlite"))
        + ["censusdp1tract.sqlite"]
    )
    metadata_yml = metadata(pudl_out, databases)

    if deploy == "fly":
        logging.info("Deploying to fly.io...")
@@ -139,8 +205,8 @@ def deploy_datasette(deploy: str, fly_args: tuple[str]) -> int:
        inspect_path = fly_dir / "inspect-data.json"
        metadata_path = fly_dir / "metadata.yml"

        logging.info(f"Inspecting DBs for datasette: {datasets}...")
        inspect_output = inspect_data(datasets, pudl_out)
        logging.info(f"Inspecting DBs for datasette: {databases}...")
        inspect_output = inspect_data(databases, pudl_out)
        with inspect_path.open("w") as f:
            f.write(json.dumps(inspect_output))
@@ -152,9 +218,9 @@ def deploy_datasette(deploy: str, fly_args: tuple[str]) -> int:
        with docker_path.open("w") as f:
            f.write(make_dockerfile())

        logging.info(f"Compressing {datasets} and putting into docker context...")
        logging.info(f"Compressing {databases} and putting into docker context...")
        check_call(
            ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
            ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + databases,  # noqa: S603
            cwd=pudl_out,
        )

@@ -173,7 +239,7 @@ def deploy_datasette(deploy: str, fly_args: tuple[str]) -> int:
            f.write(metadata_yml)

        check_call(
            ["/usr/bin/env", "datasette", "serve", "-m", "metadata.yml"] + datasets,  # noqa: S603
            ["/usr/bin/env", "datasette", "serve", "-m", "metadata.yml"] + databases,  # noqa: S603
            cwd=pudl_out,
        )

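The guardrail added in this file boils down to a set difference: the tables that actually exist in each SQLite file, minus known exceptions, minus the resource names we have metadata for. A minimal, self-contained sketch of that idea (not from the PR; the in-memory database, table names, and metadata set are invented for illustration):

```python
import sqlalchemy as sa

# Throwaway in-memory database with one documented and one undocumented table.
engine = sa.create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE out_demo (id INTEGER)"))
    conn.execute(sa.text("CREATE TABLE alembic_version (version_num TEXT)"))

tables_in_database = set(sa.inspect(engine).get_table_names())
table_exceptions = {"alembic_version"}  # tables we never expect to document
tables_with_metadata = {"out_demo"}     # stand-in for documented resource names

missing = tables_in_database - table_exceptions - tables_with_metadata
assert not missing, f"These tables are missing datasette metadata: {missing}"
```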
40 changes: 16 additions & 24 deletions src/pudl/metadata/classes.py
@@ -1873,14 +1873,24 @@ def get_sorted_resources(self) -> StrictList(Resource):
        on their name which results in the following order to promote output
        tables to users and push intermediate tables to the bottom of the
        docs: output, core, intermediate.

        In the future we might want to have more fine-grained control over how
        Resources are sorted.

        Returns:
            A sorted list of resources.
        """
        return sorted(self.resources, key=lambda r: r.name, reverse=True)
        resources = self.resources

        def sort_resource_names(resource: Resource):
            pattern = re.compile(r"(_out_|out_|core_)")

            matches = pattern.findall(resource.name)
            prefix = matches[0] if matches else ""

            prefix_order = {"out_": 1, "core_": 2, "_out_": 3}
            return prefix_order.get(prefix, float("inf"))

        return sorted(resources, key=sort_resource_names, reverse=False)
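To make the ordering concrete, here is a quick sanity check of the same key logic applied to bare table names (illustrative only, not part of the diff):

```python
import re

def sort_key(name: str) -> float:
    # Mirrors sort_resource_names above: rank by the first prefix found.
    matches = re.findall(r"(_out_|out_|core_)", name)
    prefix = matches[0] if matches else ""
    return {"out_": 1, "core_": 2, "_out_": 3}.get(prefix, float("inf"))

names = [
    "_out_eia__plants_utilities",
    "core_eia__entity_boilers",
    "out_eia__yearly_boilers",
]
print(sorted(names, key=sort_key))
# ['out_eia__yearly_boilers', 'core_eia__entity_boilers', '_out_eia__plants_utilities']
```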


class CodeMetadata(PudlMeta):
@@ -1925,7 +1935,7 @@ class DatasetteMetadata(PudlMeta):
"""

data_sources: list[DataSource]
resources: list[Resource] = Package.from_resource_ids().get_sorted_resources()
resources: list[Resource] = Package.from_resource_ids()
xbrl_resources: dict[str, list[Resource]] = {}
label_columns: dict[str, str] = {
"core_eia__entity_plants": "plant_name_eia",
@@ -1959,12 +1969,6 @@ def from_data_source_ids(
"ferc60_xbrl",
"ferc714_xbrl",
],
extra_etl_groups: list[str] = [
"entity_eia",
"glue",
"static_eia",
"static_ferc1",
],
) -> "DatasetteMetadata":
"""Construct a dictionary of DataSources from data source names.

@@ -1974,19 +1978,12 @@
            output_path: PUDL_OUTPUT path.
            data_source_ids: ids of data sources currently included in Datasette
            xbrl_ids: ids of converted XBRL data to be included in Datasette
            extra_etl_groups: ETL groups with resources that should be included
        """
        # Compile a list of DataSource objects for use in the template
        data_sources = [DataSource.from_id(ds_id) for ds_id in data_source_ids]

        # Instantiate all possible resources in a Package:
        pkg = Package.from_resource_ids()
        # Grab a list of just the resources we want to output:
        resources = [
            res
            for res in pkg.resources
            if res.etl_group in data_source_ids + extra_etl_groups
        ]
        resources = Package.from_resource_ids().resources

        # Get XBRL based resources
        xbrl_resources = {}
@@ -2007,15 +2004,10 @@
            xbrl_resources=xbrl_resources,
        )

    def to_yaml(self, exclude_intermediate_resources: bool = False) -> None:
    def to_yaml(self) -> None:
        """Output database, table, and column metadata to YAML file."""
        template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
        if exclude_intermediate_resources:
            [
                resource
                for resource in self.resources
                if not resource.name.startswith("_")
            ]

        rendered = template.render(
            license=LICENSES["cc-by-4.0"],
            data_sources=self.data_sources,
2 changes: 1 addition & 1 deletion src/pudl/metadata/sources.py
@@ -299,7 +299,7 @@
        "license_raw": LICENSES["us-govt"],
        "license_pudl": LICENSES["cc-by-4.0"],
    },
    "core_epa__assn_eia_epacamd": {
zaneselvans (Member, Author) commented: This was an oopsie from #2995

"epacamd_eia": {
"title": "EPA CAMD to EIA Power Sector Data Crosswalk",
"path": "https://github.com/USEPA/camd-eia-crosswalk",
"description": (
3 changes: 3 additions & 0 deletions src/pudl/metadata/templates/datasette-metadata.yml.jinja
@@ -68,6 +68,9 @@ databases:
        description_html: |
          <p>{{ resource.description | wordwrap(80) | indent(10) }}</p>
      {%- endif %}
      {%- if resource.name.startswith("_") %}
        hidden: true
      {%- endif %}
      {%- if resource.name in label_columns %}
        label_column: {{ label_columns[resource.name] }}
      {%- endif %}
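With the template change above, any resource whose name starts with an underscore picks up Datasette's `hidden: true` flag, which keeps the table queryable but drops it from the database's table listing. The rendered metadata would look roughly like this (an assumed output shape, for illustration only):

```yaml
databases:
  pudl:
    tables:
      _out_eia__plants_utilities:
        hidden: true
```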
21 changes: 21 additions & 0 deletions test/unit/metadata_test.py
@@ -99,3 +99,24 @@ def test_resources_have_descriptions():
        f"Found {len(resources_without_description)} resources without descriptions: "
        f"{resources_without_description}"
    )


def test_get_sorted_resources() -> None:
    """Test that resources are returned in this order (out, core, _out)."""
    resource_ids = (
        "_out_eia__plants_utilities",
        "core_eia__entity_boilers",
        "out_eia__yearly_boilers",
    )
    resources = Package.from_resource_ids(
        resource_ids=resource_ids, resolve_foreign_keys=True
    ).get_sorted_resources()

    first_resource_name = resources[0].name
    last_resource_name = resources[-1].name
    assert first_resource_name.startswith(
        "out"
    ), f"{first_resource_name} is the first resource. Expected a resource with the prefix 'out'"
    assert last_resource_name.startswith(
        "_out"
    ), f"{last_resource_name} is the last resource. Expected a resource with the prefix '_out'"