Skip to content

Commit

Permalink
Add support for data dictionary properties
Browse files Browse the repository at this point in the history
Supported at both the dataset and distribution levels. These are implemented
as (single) repeating subfields, with the url, format and license properties
supported.

Potential future improvements: fall back to the dataset's license when none is
provided, and link to the DataStore data dictionary automatically
amercader committed Oct 17, 2024
1 parent 44a11fd commit bd346a0
Showing 6 changed files with 413 additions and 23 deletions.
114 changes: 104 additions & 10 deletions ckanext/dcat/profiles/dcat_us_3.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from decimal import Decimal, DecimalException

from rdflib import Literal, BNode
from rdflib import Literal, BNode, URIRef

from ckanext.dcat.profiles import (
DCAT,
DCATUS,
DCT,
FOAF,
RDF,
RDFS,
SKOS,
XSD,
)
@@ -59,6 +60,84 @@ def graph_from_catalog(self, catalog_dict, catalog_ref):

self._graph_from_catalog_base(catalog_dict, catalog_ref)

def _data_dictionary_parse(self, data_dict, subject):
    """
    Reads the dcat-us:describedBy objects of ``subject`` from the graph
    and stores them in the ``data_dictionary`` key of ``data_dict``.

    Literal objects are stored as a plain string. Any other node is
    stored as a dict with the ``url``, ``format`` and ``license`` values
    found on it, appended to a list.

    Returns the same ``data_dict`` that was passed in.
    """
    graph = self.g

    for node in graph.objects(subject, DCATUS.describedBy):
        if isinstance(node, Literal):
            data_dict["data_dictionary"] = str(node)
            continue

        # Start a fresh list unless a previous node already created one
        if not isinstance(data_dict.get("data_dictionary"), list):
            data_dict["data_dictionary"] = []

        entry = {}
        for predicate, key in (
            (DCAT.accessURL, "url"),
            (DCT["format"], "format"),
            (DCT.license, "license"),
        ):
            value = self._object_value(node, predicate)
            if value:
                entry[key] = value

        # Nodes exposing none of the supported properties are not added
        if entry:
            data_dict["data_dictionary"].append(entry)

    return data_dict

def _data_dictionary_graph(self, data_dict, subject):
    """
    Adds the triples for the data dictionary property of a Dataset or a
    Distribution, linked to ``subject`` via dcat-us:describedBy.

    A string value is added as a literal. For a list value whose first
    item is a dict, only that first item is used: if it has a ``url``, a
    blank node typed as dcat:Distribution is created, carrying the access
    URL plus the format and license when these are present.

    TODO: Link somehow to the DataStore data dictionary if that exists
    and is public
    """
    graph = self.g

    value = self._get_dict_value(data_dict, "data_dictionary")

    if isinstance(value, str):
        graph.add((subject, DCATUS.describedBy, Literal(value)))
        return

    if (
        not isinstance(value, list)
        or not value
        or not isinstance(value[0], dict)
    ):
        return

    entry = value[0]
    if not entry.get("url"):
        # Without a URL there is nothing to point to
        return

    node = BNode()
    graph.add((node, RDF.type, DCAT.Distribution))
    self._add_triple_from_dict(
        entry,
        node,
        DCAT.accessURL,
        "url",
        _type=URIRef,
        _class=RDFS.Resource,
    )

    # TODO: license should fall back to the dataset / distribution one
    optional_properties = (
        ("format", DCT["format"], DCT.MediaTypeOrExtent),
        ("license", DCT.license, DCT.LicenseDocument),
    )
    for key, predicate, rdf_class in optional_properties:
        if entry.get(key):
            self._add_triple_from_dict(
                entry,
                node,
                predicate,
                key,
                _type=URIRefOrLiteral,
                _class=rdf_class,
            )

    graph.add((subject, DCATUS.describedBy, node))

def _parse_dataset_v3_us(self, dataset_dict, dataset_ref):

g = self.g
@@ -76,19 +155,28 @@ def _parse_dataset_v3_us(self, dataset_dict, dataset_ref):
}
)

# Data dictionary
self._data_dictionary_parse(dataset_dict, dataset_ref)

for distribution_ref in self._distributions(dataset_ref):

# Distribution identifier
value = self._object_value(distribution_ref, DCT.identifier)
if value:
for resource_dict in dataset_dict.get("resources", []):
if resource_dict["distribution_ref"] == str(distribution_ref):
for resource_dict in dataset_dict.get("resources", []):
if resource_dict["distribution_ref"] == str(distribution_ref):

# Distribution identifier
value = self._object_value(distribution_ref, DCT.identifier)
if value:
resource_dict["identifier"] = value

# Temporal resolution
value = self._object_value(distribution_ref, DCAT.temporalResolution)
if value:
resource_dict["temporal_resolution"] = value
# Temporal resolution
value = self._object_value(
distribution_ref, DCAT.temporalResolution
)
if value:
resource_dict["temporal_resolution"] = value

# Data dictionary
self._data_dictionary_parse(resource_dict, distribution_ref)

def _graph_from_dataset_v3_us(self, dataset_dict, dataset_ref):

@@ -144,6 +232,9 @@ def add_bounding(predicate, value):
):
add_bounding(item[0], item[1])

# Data dictionary
self._data_dictionary_graph(dataset_dict, dataset_ref)

for resource_dict in dataset_dict.get("resources", []):

distribution_ref = CleanedURIRef(resource_uri(resource_dict))
@@ -166,3 +257,6 @@ def add_bounding(predicate, value):
"temporal_resolution",
_datatype=XSD.duration,
)

# Data dictionary
self._data_dictionary_graph(resource_dict, distribution_ref)
35 changes: 30 additions & 5 deletions ckanext/dcat/schemas/dcat_us_full.yaml
Original file line number Diff line number Diff line change
@@ -270,11 +270,20 @@ dataset_fields:
validators: ignore_missing scheming_multiple_text
help_text: The legislation that mandates the creation or management of the dataset.

#- field_name: hvd_category
# label: HVD Category
# preset: multiple_text
# validators: ignore_missing scheming_multiple_text
# TODO: implement separately as part of wider HVD support
- field_name: data_dictionary
label: Data dictionary
repeating_label: Data dictionary
repeating_once: true
repeating_subfields:

- field_name: url
label: URL

- field_name: format
label: Format

- field_name: license
label: License

# Note: if not provided, this will be autogenerated
- field_name: uri
@@ -438,6 +447,22 @@ resource_fields:

help_text: A data service that gives access to the resource.

- field_name: data_dictionary
label: Data dictionary
repeating_label: Data dictionary
repeating_once: true
repeating_subfields:

- field_name: url
label: URL

- field_name: format
label: Format

- field_name: license
label: License


# Note: if not provided, this will be autogenerated
- field_name: uri
label: URI
127 changes: 127 additions & 0 deletions ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py
Original file line number Diff line number Diff line change
@@ -153,6 +153,48 @@ def test_e2e_dcat_to_ckan(self):
"http://publications.europa.eu/webapi/rdf/sparql"
]

def test_two_distributions(self):
# Parses a dataset with two distributions and checks that each resource
# gets the title and dcterms:identifier of its own distribution.

data = """
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix gsp: <http://www.opengis.net/ont/geosparql#> .
<https://example.com/dataset1>
a dcat:Dataset ;
dcterms:title "Dataset 1" ;
dcterms:description "This is a dataset" ;
dcterms:publisher <https://example.com/publisher1> ;
dcat:distribution <http://test.ckan.net/dataset/xxx/resource/yyy> ;
dcat:distribution <http://test.ckan.net/dataset/xxx/resource/zzz>
.
<http://test.ckan.net/dataset/xxx/resource/yyy> a dcat:Distribution ;
dcterms:title "Resource 1" ;
dcterms:identifier "id1"
.
<http://test.ckan.net/dataset/xxx/resource/zzz> a dcat:Distribution ;
dcterms:title "Resource 2" ;
dcterms:identifier "id2"
.
"""
p = RDFParser()

p.parse(data, _format="ttl")

datasets = [d for d in p.datasets()]

# Both distributions must be turned into resources
assert len(datasets[0]["resources"]) == 2

# Identifiers must not be mixed up between the two resources
assert datasets[0]["resources"][0]["name"] == "Resource 1"
assert datasets[0]["resources"][0]["identifier"] == "id1"
assert datasets[0]["resources"][1]["name"] == "Resource 2"
assert datasets[0]["resources"][1]["identifier"] == "id2"

def test_bbox(self):

data = """
@@ -188,3 +230,88 @@ def test_bbox(self):
assert dataset["bbox"][0]["east"] == "10.3"
assert dataset["bbox"][0]["north"] == "50.2"
assert dataset["bbox"][0]["south"] == "20.2"

def test_data_dictionary_dataset(self):
# A dcat-us:describedBy blank node on the dataset should be parsed into
# the first "data_dictionary" entry of the dataset dict, exposing the
# url, format and license values.

data = """
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix gsp: <http://www.opengis.net/ont/geosparql#> .
<https://example.com/dataset1>
a dcat:Dataset ;
dcterms:title "Dataset 1" ;
dcterms:description "This is a dataset" ;
dcterms:publisher <https://example.com/publisher1> ;
dcat-us:describedBy [ a dcat:Distribution ;
dcterms:format <https://resources.data.gov/vocab/file-type/TODO/JSON> ;
dcterms:license <https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0> ;
dcat:accessURL <https://example.org/some-data-dictionary> ]
.
"""
p = RDFParser()

p.parse(data, _format="ttl")

datasets = [d for d in p.datasets()]

dataset = datasets[0]
# dcat:accessURL -> url
assert (
dataset["data_dictionary"][0]["url"]
== "https://example.org/some-data-dictionary"
)
# dcterms:format -> format
assert (
dataset["data_dictionary"][0]["format"]
== "https://resources.data.gov/vocab/file-type/TODO/JSON"
)
# dcterms:license -> license
assert (
dataset["data_dictionary"][0]["license"]
== "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
)

def test_data_dictionary_distribution(self):
# A dcat-us:describedBy blank node on a distribution should be parsed
# into the first "data_dictionary" entry of the matching resource dict,
# exposing the url, format and license values.

data = """
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix gsp: <http://www.opengis.net/ont/geosparql#> .
<https://example.com/dataset1>
a dcat:Dataset ;
dcterms:title "Dataset 1" ;
dcterms:description "This is a dataset" ;
dcterms:publisher <https://example.com/publisher1> ;
dcat:distribution <http://test.ckan.net/dataset/xxx/resource/yyy>
.
<http://test.ckan.net/dataset/xxx/resource/yyy> a dcat:Distribution ;
dcat-us:describedBy [ a dcat:Distribution ;
dcterms:format <https://resources.data.gov/vocab/file-type/TODO/JSON> ;
dcterms:license <https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0> ;
dcat:accessURL <https://example.org/some-data-dictionary> ]
.
"""
p = RDFParser()

p.parse(data, _format="ttl")

datasets = [d for d in p.datasets()]

resource = datasets[0]["resources"][0]
# dcat:accessURL -> url
assert (
resource["data_dictionary"][0]["url"]
== "https://example.org/some-data-dictionary"
)
# dcterms:format -> format
assert (
resource["data_dictionary"][0]["format"]
== "https://resources.data.gov/vocab/file-type/TODO/JSON"
)
# dcterms:license -> license
assert (
resource["data_dictionary"][0]["license"]
== "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
)
Original file line number Diff line number Diff line change
@@ -26,8 +26,6 @@
RDFS,
)

DCAT_AP_PROFILES = ["dcat_us_3"]


@pytest.mark.usefixtures("with_plugins", "clean_db")
@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets")
@@ -224,15 +222,17 @@ def test_e2e_ckan_to_dcat(self):
wkt_geom = wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4)
assert self._triple(g, spatial[0][2], LOCN.Geometry, wkt_geom, GSP.wktLiteral)

distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
resource = dataset_dict["resources"][0]

# Alternate identifiers
ids = []
for subject in [t[2] for t in g.triples((dataset_ref, ADMS.identifier, None))]:
ids.append(str(g.value(subject, SKOS.notation)))
assert ids == dataset["alternate_identifier"]

# Resources

distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
resource = dataset_dict["resources"][0]

# Resources: core fields

assert self._triple(g, distribution_ref, DCT.title, resource["name"])
@@ -340,7 +340,7 @@ def test_distribution_identifier(self):
],
}

s = RDFSerializer(profiles=DCAT_AP_PROFILES)
s = RDFSerializer()
g = s.g

dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -365,7 +365,7 @@ def test_distribution_identifier_falls_back_to_id(self):
],
}

s = RDFSerializer(profiles=DCAT_AP_PROFILES)
s = RDFSerializer()
g = s.g

dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -384,7 +384,7 @@ def test_bbox(self):
],
}

s = RDFSerializer(profiles=DCAT_AP_PROFILES)
s = RDFSerializer()
g = s.g

dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -418,3 +418,131 @@ def test_bbox(self):
dataset_dict["bbox"][0]["south"],
data_type=XSD.decimal,
)

def test_data_dictionary_dataset(self):
    """
    A dataset level data dictionary given as a list of dicts is
    serialized as a dcat:Distribution node linked to the dataset via
    dcat-us:describedBy.
    """
    expected = {
        "url": "https://example.org/some-data-dictionary",
        "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
        "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0",
    }

    serializer = RDFSerializer()
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(
        {
            "name": "test-dcat-us",
            "description": "Test",
            "data_dictionary": [expected],
        }
    )

    data_dictionary_ref = list(
        graph.objects(dataset_ref, DCATUS.describedBy)
    )[0]

    assert self._triple(
        graph, data_dictionary_ref, RDF.type, DCAT.Distribution
    )

    # Every property of the data dictionary is expected as a URIRef
    for predicate, key in (
        (DCAT.accessURL, "url"),
        (DCT["format"], "format"),
        (DCT.license, "license"),
    ):
        assert self._triple(
            graph, data_dictionary_ref, predicate, URIRef(expected[key])
        )

def test_data_dictionary_distribution(self):
    """
    A resource level data dictionary given as a list of dicts is
    serialized as a dcat:Distribution node linked to the distribution
    via dcat-us:describedBy.
    """
    expected = {
        "url": "https://example.org/some-data-dictionary",
        "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
        "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0",
    }

    resource_dict = {
        "id": "2607a002-142a-40b1-8026-96457b70c01d",
        "name": "test",
        "data_dictionary": [expected],
    }

    serializer = RDFSerializer()
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(
        {
            "name": "test-dcat-us",
            "description": "Test",
            "resources": [resource_dict],
        }
    )

    distribution_ref = list(graph.objects(dataset_ref, DCAT.distribution))[0]

    data_dictionary_ref = list(
        graph.objects(distribution_ref, DCATUS.describedBy)
    )[0]

    assert self._triple(
        graph, data_dictionary_ref, RDF.type, DCAT.Distribution
    )

    # Every property of the data dictionary is expected as a URIRef
    for predicate, key in (
        (DCAT.accessURL, "url"),
        (DCT["format"], "format"),
        (DCT.license, "license"),
    ):
        assert self._triple(
            graph, data_dictionary_ref, predicate, URIRef(expected[key])
        )

def test_data_dictionary_dataset_string(self):
    """
    A data dictionary given as a plain string is serialized directly as
    the dcat-us:describedBy value of the dataset.
    """
    data_dictionary_value = "https://example.org/some-data-dictionary"

    serializer = RDFSerializer()
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(
        {
            "name": "test-dcat-us",
            "description": "Test",
            "data_dictionary": data_dictionary_value,
        }
    )

    assert self._triple(
        graph, dataset_ref, DCATUS.describedBy, data_dictionary_value
    )
2 changes: 2 additions & 0 deletions ckanext/dcat/tests/shacl/test_shacl.py
Original file line number Diff line number Diff line change
@@ -216,6 +216,8 @@ def test_validate_dcat_us_3_graph():
graph = graph_from_dataset("ckan_full_dataset_dcat_us_vocabularies.json")

graph.serialize(destination="graph.ttl")

graph.serialize(destination="graph.xml")
path = _get_shacl_file_path("dcat-us_3.0_shacl_shapes.ttl")
r = validate(graph, shacl_graph=path)
conforms, results_graph, results_text = r
14 changes: 14 additions & 0 deletions examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json
Original file line number Diff line number Diff line change
@@ -160,6 +160,13 @@
}
],
"spatial_resolution_in_meters": 1.5,
"data_dictionary": [
{
"url": "https://example.org/some-data-dictionary",
"format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
"license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
}
],
"resources": [
{
"name": "Resource 1",
@@ -187,6 +194,13 @@
"http://id.loc.gov/vocabulary/iso639-1/en",
"http://id.loc.gov/vocabulary/iso639-1/es",
"http://id.loc.gov/vocabulary/iso639-1/ca"
],
"data_dictionary": [
{
"url": "https://example.org/some-data-dictionary",
"format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
"license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
}
]
}
]

0 comments on commit bd346a0

Please sign in to comment.