Skip to content

Commit

Permalink
Merge pull request #125 from statisticsnorway/124-hente-metadata-fra-…
Browse files Browse the repository at this point in the history
…offentlig-statistikkbank

functions for getting and filtering metadata from the statbank public api
  • Loading branch information
aecorn authored May 2, 2024
2 parents 755b5f2 + f9ae2d4 commit 39af75c
Show file tree
Hide file tree
Showing 7 changed files with 1,397 additions and 946 deletions.
28 changes: 12 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,18 @@ df_folkemengde = apidata("https://i.ssb.no/pxwebi/api/v0/no/prod_24v_intern/STAR
)
```

`apimetadata` gets metadata from the *public* api, like apidata does.
```python
meta = apimetadata("05300")
```

`apicodelist` gets a specific codelist out of the metadata, or all the codelists.
```python
all_codelists = apicodelist("05300")
avstand_codelist = apicodelist("05300", "Avstand1")
```


`apidata_rotate` is a thin wrapper around pivot_table. Stolen from: https://github.com/sehyoun/SSB_API_helper/blob/master/src/ssb_api_helper.py
```python
df_folkemengde_rotert = apidata_rotate(df_folkemengde, 'tidskolonne', "verdikolonne")
Expand Down Expand Up @@ -173,22 +185,6 @@ import logging
statbank.logger.setLevel(logging.WARNING)
```

## Version history
- 1.1.0 Migrating to "new template" for Pypi-packages at SSB, with full typing support, a reference website, logging instead of print etc.
- 1.0.6 fixing new functionality on "IRkodelister"
- 1.0.5 Making transferdata_template smarter, were it can take a bunch of dataframes and incorporate them in the returned dict. Trying to support columntype "internasjonal rapportering".
- 1.0.4 Fixing bug where empty codelists stops description initialization, Updating pyjstat to 2.4.0, changing imports to absolute from package root
- 1.0.2 Doc-string style cleanup, a check on username and password on client init, changes to time and display of time, demo notebooks cleaned
- 1.0.0 Finished going through initial issues, less complaining from verify on floats
- 0.0.11 Statbank people wanted a user-agent-requesst-header to differentiate test from prod
- 0.0.9 After further user-testing and requests
- 0.0.5 Still some parameter issues
- 0.0.4 More test coverage, some bugs fixed in rounding checks and parameter-passing
- 0.0.3 Removed batches, stripping uttrekk from transfer, rounding function on uttrekk, data required in as a dict of dataframes, with "deltabell-navn". Tableid now works to transfer to instead of only "hovedtabellnavn"
- 0.0.2 Starting alpha, fine-tuning release to Pypi on github-release
- 0.0.1 Client, transfer, description, apidata. Quite a lot of work done already. Pre-alpha.


## License

Distributed under the terms of the [MIT license][license],
Expand Down
2,146 changes: 1,230 additions & 916 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dapla-statbank-client"
version = "1.2.1"
version = "1.2.2"
description = "Handles data transfer Statbank <-> Dapla for Statistics Norway"
authors = ["Statistics Norway", "Carl F. Corneil <[email protected]>"]
license = "MIT"
Expand Down
11 changes: 10 additions & 1 deletion src/statbank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,22 @@

import toml

from statbank.apidata import apicodelist
from statbank.apidata import apidata
from statbank.apidata import apidata_all
from statbank.apidata import apidata_rotate
from statbank.apidata import apimetadata
from statbank.client import StatbankClient
from statbank.statbank_logger import logger

__all__ = ["StatbankClient", "apidata", "apidata_all", "apidata_rotate"]
__all__ = [
"StatbankClient",
"apidata",
"apidata_all",
"apidata_rotate",
"apimetadata",
"apicodelist",
]


# Split into function for testing
Expand Down
56 changes: 51 additions & 5 deletions src/statbank/apidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import urllib
from typing import TYPE_CHECKING
from typing import Any

import requests as r
from pyjstat import pyjstat
Expand Down Expand Up @@ -97,17 +98,17 @@ def apidata_all(id_or_url: str = "", include_id: bool = False) -> pd.DataFrame:
return apidata(id_or_url, apidata_query_all(id_or_url), include_id=include_id)


def apidata_query_all(id_or_url: str = "") -> QueryWholeType:
"""Builds a query for ALL THE DATA in a table based on a request for metadata on the table.
def apimetadata(id_or_url: str = "") -> dict[str, Any]:
"""Get the metadata of a published statbank-table as a dict.
Args:
id_or_url (str): The id of the STATBANK-table to get the total query for, or supply the total url, if the table is "internal".
Returns:
QueryWholeType: The prepared query based on all the codes in the table.
dict[str, Any]: The metadata of the table as the json returned from the API-get-request.
Raises:
ValueError: If the parameter is not a valid statbank ID or a direct url.
ValueError: If the first parameter is not recognized as a statbank ID or a direct url.
"""
if len(id_or_url) == STATBANK_TABLE_ID_LENGTH and id_or_url.isdigit():
url = f"https://data.ssb.no/api/v0/no/table/{id_or_url}/"
Expand All @@ -121,7 +122,52 @@ def apidata_query_all(id_or_url: str = "") -> QueryWholeType:
url = id_or_url
res = r.get(url, timeout=5)
res.raise_for_status()
meta = res.json()["variables"]
meta: dict[str, Any] = res.json()
return meta


def apicodelist(
    id_or_url: str = "",
    codelist_name: str = "",
) -> dict[str, str] | dict[str, dict[str, str]]:
    """Return one codelist, or every codelist, of a published statbank-table.

    The table metadata is fetched with ``apimetadata``. Each entry under its
    "variables" key becomes a codelist that maps the entry's "values" to the
    matching "valueTexts".

    Args:
        id_or_url (str): The id of the STATBANK-table, or the full url if the table is "internal".
        codelist_name (str): The codelist to pick out. It is compared with the
            "code" of each variable first and with its "text" second. The
            empty string (default) selects all codelists.

    Returns:
        dict[str, str] | dict[str, dict[str, str]]: A single codelist when
        codelist_name matched, otherwise a dict of all codelists keyed by
        variable code.

    Raises:
        ValueError: If codelist_name matches neither the code nor the text of any variable.
    """
    variables = apimetadata(id_or_url)["variables"]

    # Build every codelist up front, keyed by the variable's code.
    codelists: dict[str, dict[str, str]] = {}
    for variable in variables:
        mapping = dict(zip(variable["values"], variable["valueTexts"]))
        codelists[variable["code"]] = mapping

    if codelist_name == "":
        return codelists
    if codelist_name in codelists:
        return codelists[codelist_name]

    # No code matched; fall back to the first variable whose text matches.
    text_match = next(
        (variable for variable in variables if codelist_name == variable["text"]),
        None,
    )
    if text_match is not None:
        return dict(zip(text_match["values"], text_match["valueTexts"]))

    col_names = ", ".join(variable["code"] for variable in variables)
    error_msg = f"Cant find {codelist_name} among the available names: {col_names}"
    raise ValueError(error_msg)


def apidata_query_all(id_or_url: str = "") -> QueryWholeType:
"""Builds a query for ALL THE DATA in a table based on a request for metadata on the table.
Args:
id_or_url (str): The id of the STATBANK-table to get the total query for, or supply the total url, if the table is "internal".
Returns:
QueryWholeType: The prepared query based on all the codes in the table.
"""
meta = apimetadata(id_or_url)["variables"]
code_list: list[QueryPartType] = []
for code in meta:
tmp: QueryPartType = {"code": "", "selection": {"filter": "", "values": []}}
Expand Down
46 changes: 39 additions & 7 deletions src/statbank/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Any

if TYPE_CHECKING:
import pandas as pd
Expand All @@ -16,9 +17,11 @@

if TYPE_CHECKING:
from statbank.api_types import QueryWholeType
from statbank.apidata import apicodelist
from statbank.apidata import apidata
from statbank.apidata import apidata_all
from statbank.apidata import apidata_rotate
from statbank.apidata import apimetadata
from statbank.auth import StatbankAuth
from statbank.globals import APPROVE_DEFAULT_JIT
from statbank.globals import OSLO_TIMEZONE
def __repr__(self) -> str:
    """Represent the class with the necessary argument to replicate.

    Only the settings that pass the checks below are rendered, so a client
    for which none of them pass is shown as ``StatbankClient()``.

    Returns:
        str: A constructor-style string of the form
        ``StatbankClient(name = value, ...)``.
    """
    # Collect the "name = value" fragments and join them once at the end.
    # Piecewise concatenation previously left a doubled comma before "bcc"
    # and a stray ")" after the "overwrite" value.
    arguments: list[str] = []
    if self.date != TOMORROW:
        arguments.append(f'date = "{self.date.isoformat("T", "seconds")}"')
    if self.shortuser:
        arguments.append(f'shortuser = "{self.shortuser}"')
    if self.cc:
        arguments.append(f'cc = "{self.cc}"')
    if self.bcc:
        arguments.append(f'bcc = "{self.bcc}"')
    if not self.overwrite:
        arguments.append(f"overwrite = {self.overwrite}")
    if self.approve != APPROVE_DEFAULT_JIT:
        arguments.append(f"approve = {self.approve}")
    if self.check_username_password:
        arguments.append(
            f"check_username_password = {self.check_username_password}",
        )
    return "StatbankClient(" + ", ".join(arguments) + ")"

Expand Down Expand Up @@ -392,6 +396,34 @@ def apidata_all(id_or_url: str = "", include_id: bool = False) -> pd.DataFrame:
"""
return apidata_all(id_or_url=id_or_url, include_id=include_id)

@staticmethod
def apimetadata(id_or_url: str = "") -> dict[str, Any]:
    """Fetch the metadata of a published statbank-table as a dict.

    Convenience wrapper on the client that forwards to the ``apimetadata``
    function imported from ``statbank.apidata``.

    Args:
        id_or_url (str): The id of the STATBANK-table, or the full url if the table is "internal".

    Returns:
        dict[str, Any]: Whatever the underlying ``apimetadata`` call returns for the table.
    """
    metadata = apimetadata(id_or_url=id_or_url)
    return metadata

@staticmethod
def apicodelist(
    id_or_url: str = "",
    codelist_name: str = "",
) -> dict[str, str] | dict[str, dict[str, str]]:
    """Fetch one specific codelist, or all codelists, of a published statbank-table.

    Convenience wrapper on the client that forwards both arguments to the
    ``apicodelist`` function imported from ``statbank.apidata``.

    Args:
        id_or_url (str): The id of the STATBANK-table, or the full url if the table is "internal".
        codelist_name (str): The name of the specific codelist to get.

    Returns:
        dict[str, str] | dict[str, dict[str, str]]: The codelist as a dict, or a nested dict of codelists.
    """
    codelist = apicodelist(id_or_url=id_or_url, codelist_name=codelist_name)
    return codelist

@staticmethod
def apidata_rotate(
df: pd.DataFrame,
Expand Down
54 changes: 54 additions & 0 deletions tests/test_apidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
from requests.exceptions import HTTPError

from statbank import StatbankClient
from statbank.apidata import apicodelist
from statbank.apidata import apidata
from statbank.apidata import apidata_all
from statbank.apidata import apidata_query_all
from statbank.apidata import apidata_rotate
from statbank.apidata import apimetadata

load_dotenv()

Expand Down Expand Up @@ -40,6 +42,9 @@ def fake_post_response_key_service() -> requests.Response:
return response


VAR_NUM = 4


def fake_get_table_meta() -> requests.Response:
response = requests.Response()
response.status_code = 200
Expand Down Expand Up @@ -77,6 +82,43 @@ def client_fake(
return StatbankClient(check_username_password=False)


@mock.patch.object(requests, "get")
def test_apimetadata(fake_get: Callable) -> None:
    """Metadata fetched for table 05300 carries a non-empty title."""
    fake_get.return_value = fake_get_table_meta()
    metadata = apimetadata("05300")
    assert len(metadata.get("title"))


@mock.patch.object(requests, "get")
def test_apicodelist_all(fake_get: Callable) -> None:
    """Without a codelist name, one codelist per variable (VAR_NUM) is returned."""
    fake_get.return_value = fake_get_table_meta()
    codelists = apicodelist("05300")
    assert len(codelists) == VAR_NUM


@mock.patch.object(requests, "get")
def test_apicodelist_specific(fake_get: Callable) -> None:
    """Asking for "Avstand1" yields a non-empty dict whose values are all strings."""
    fake_get.return_value = fake_get_table_meta()
    codelist = apicodelist("05300", "Avstand1")
    assert len(codelist)
    assert isinstance(codelist, dict)
    assert all(isinstance(label, str) for label in codelist.values())


@mock.patch.object(requests, "get")
def test_apicodelist_specific_text(fake_get: Callable) -> None:
    """Asking for "avstand" yields a non-empty dict whose values are all strings.

    NOTE(review): the test name indicates "avstand" is a variable's text rather
    than its code; confirm against the fake metadata fixture.
    """
    fake_get.return_value = fake_get_table_meta()
    codelist = apicodelist("05300", "avstand")
    assert len(codelist)
    assert isinstance(codelist, dict)
    assert all(isinstance(label, str) for label in codelist.values())


@mock.patch.object(requests, "get")
def test_apicodelist_specific_missing_raises(fake_get: Callable) -> None:
    """An unknown codelist name raises ValueError with a "Cant find" message."""
    fake_get.return_value = fake_get_table_meta()
    with pytest.raises(ValueError, match="Cant find"):
        apicodelist("05300", "missing")


@pytest.fixture()
@mock.patch.object(requests, "get")
def query_all_05300(fake_get: Callable) -> pd.DataFrame:
Expand Down Expand Up @@ -121,6 +163,18 @@ def test_apidata_rotate_05300(
assert ind.isdigit()


def test_client_apimetadata(client_fake: Callable) -> None:
    """The client's apimetadata returns metadata with a non-empty title."""
    table_metadata = client_fake.apimetadata("05300")
    assert len(table_metadata.get("title"))


def test_client_apicodelist(client_fake: Callable) -> None:
    """The client's apicodelist returns a non-empty dict of strings for "Avstand1"."""
    codelist = client_fake.apicodelist("05300", "Avstand1")
    assert len(codelist)
    assert isinstance(codelist, dict)
    assert all(isinstance(label, str) for label in codelist.values())


def test_client_apidata(client_fake: Callable, query_all_05300: pd.DataFrame) -> None:
tabledata = client_fake.apidata("05300", query_all_05300)
assert isinstance(tabledata, pd.DataFrame)
Expand Down

0 comments on commit 39af75c

Please sign in to comment.