Add support for data dictionary properties

Both at the dataset and distribution level. These are implemented as (single) repeating subfields, with the url, format and license properties supported. Potential future improvements: falling back to the dataset's license, and linking to the DataStore data dictionary automatically.
ckan · Oct 17, 2024 · bd346a0 · bd346a0
1 parent 44a11fd
commit bd346a0
Showing 6 changed files with 413 additions and 23 deletions.
diff --git a/ckanext/dcat/profiles/dcat_us_3.py b/ckanext/dcat/profiles/dcat_us_3.py
@@ -1,13 +1,14 @@
 from decimal import Decimal, DecimalException
 
-from rdflib import Literal, BNode
+from rdflib import Literal, BNode, URIRef
 
 from ckanext.dcat.profiles import (
     DCAT,
     DCATUS,
     DCT,
     FOAF,
     RDF,
+    RDFS,
     SKOS,
     XSD,
 )
@@ -59,6 +60,84 @@ def graph_from_catalog(self, catalog_dict, catalog_ref):
 
         self._graph_from_catalog_base(catalog_dict, catalog_ref)
 
+    def _data_dictionary_parse(self, data_dict, subject):
+
+        g = self.g
+
+        for data_dictionary_ref in g.objects(subject, DCATUS.describedBy):
+            if isinstance(data_dictionary_ref, Literal):
+                data_dict["data_dictionary"] = str(data_dictionary_ref)
+            else:
+                if not isinstance(data_dict.get("data_dictionary"), list):
+                    data_dict["data_dictionary"] = []
+                data_dictionary_dict = {}
+                for item in [
+                    (DCAT.accessURL, "url"),
+                    (DCT["format"], "format"),
+                    (DCT.license, "license"),
+                ]:
+                    predicate, key = item
+                    value = self._object_value(data_dictionary_ref, predicate)
+                    if value:
+                        data_dictionary_dict[key] = value
+                if data_dictionary_dict:
+                    data_dict["data_dictionary"].append(data_dictionary_dict)
+
+        return data_dict
+
+    def _data_dictionary_graph(self, data_dict, subject):
+        """
+        Adds triples related to the data dictionary property of a Dataset
+        or a Distribution
+
+        TODO: Link somehow to the DataStore data dictionary if that exists
+        and is public
+        """
+
+        g = self.g
+
+        data_dictionary = self._get_dict_value(data_dict, "data_dictionary")
+        if isinstance(data_dictionary, str):
+            g.add((subject, DCATUS.describedBy, Literal(data_dictionary)))
+        elif (
+            isinstance(data_dictionary, list)
+            and len(data_dictionary)
+            and isinstance(data_dictionary[0], dict)
+        ):
+            data_dictionary = data_dictionary[0]
+            url = data_dictionary.get("url")
+            if url:
+                data_dictionary_ref = BNode()
+                g.add((data_dictionary_ref, RDF.type, DCAT.Distribution))
+                self._add_triple_from_dict(
+                    data_dictionary,
+                    data_dictionary_ref,
+                    DCAT.accessURL,
+                    "url",
+                    _type=URIRef,
+                    _class=RDFS.Resource,
+                )
+                if data_dictionary.get("format"):
+                    self._add_triple_from_dict(
+                        data_dictionary,
+                        data_dictionary_ref,
+                        DCT["format"],
+                        "format",
+                        _type=URIRefOrLiteral,
+                        _class=DCT.MediaTypeOrExtent,
+                    )
+                # TODO: fallback to dataset / distribution one
+                if data_dictionary.get("license"):
+                    self._add_triple_from_dict(
+                        data_dictionary,
+                        data_dictionary_ref,
+                        DCT.license,
+                        "license",
+                        _type=URIRefOrLiteral,
+                        _class=DCT.LicenseDocument,
+                    )
+                g.add((subject, DCATUS.describedBy, data_dictionary_ref))
+
     def _parse_dataset_v3_us(self, dataset_dict, dataset_ref):
 
         g = self.g
@@ -76,19 +155,28 @@ def _parse_dataset_v3_us(self, dataset_dict, dataset_ref):
                 }
             )
 
+        # Data dictionary
+        self._data_dictionary_parse(dataset_dict, dataset_ref)
+
         for distribution_ref in self._distributions(dataset_ref):
 
-            # Distribution identifier
-            value = self._object_value(distribution_ref, DCT.identifier)
-            if value:
-                for resource_dict in dataset_dict.get("resources", []):
-                    if resource_dict["distribution_ref"] == str(distribution_ref):
+            for resource_dict in dataset_dict.get("resources", []):
+                if resource_dict["distribution_ref"] == str(distribution_ref):
+
+                    # Distribution identifier
+                    value = self._object_value(distribution_ref, DCT.identifier)
+                    if value:
                         resource_dict["identifier"] = value
 
-            # Temporal resolution
-            value = self._object_value(distribution_ref, DCAT.temporalResolution)
-            if value:
-                resource_dict["temporal_resolution"] = value
+                    # Temporal resolution
+                    value = self._object_value(
+                        distribution_ref, DCAT.temporalResolution
+                    )
+                    if value:
+                        resource_dict["temporal_resolution"] = value
+
+                    # Data dictionary
+                    self._data_dictionary_parse(resource_dict, distribution_ref)
 
     def _graph_from_dataset_v3_us(self, dataset_dict, dataset_ref):
 
@@ -144,6 +232,9 @@ def add_bounding(predicate, value):
                 ):
                     add_bounding(item[0], item[1])
 
+        # Data dictionary
+        self._data_dictionary_graph(dataset_dict, dataset_ref)
+
         for resource_dict in dataset_dict.get("resources", []):
 
             distribution_ref = CleanedURIRef(resource_uri(resource_dict))
@@ -166,3 +257,6 @@ def add_bounding(predicate, value):
                 "temporal_resolution",
                 _datatype=XSD.duration,
             )
+
+            # Data dictionary
+            self._data_dictionary_graph(resource_dict, distribution_ref)
diff --git a/ckanext/dcat/schemas/dcat_us_full.yaml b/ckanext/dcat/schemas/dcat_us_full.yaml
@@ -270,11 +270,20 @@ dataset_fields:
   validators: ignore_missing scheming_multiple_text
   help_text: The legislation that mandates the creation or management of the dataset.
 
-#- field_name: hvd_category
-#  label: HVD Category
-#  preset: multiple_text
-#  validators: ignore_missing scheming_multiple_text
-# TODO: implement separately as part of wider HVD support
+- field_name: data_dictionary
+  label: Data dictionary
+  repeating_label: Data dictionary
+  repeating_once: true
+  repeating_subfields:
+
+    - field_name: url
+      label: URL
+
+    - field_name: format
+      label: Format
+
+    - field_name: license
+      label: License
 
 # Note: if not provided, this will be autogenerated
 - field_name: uri
@@ -438,6 +447,22 @@ resource_fields:
 
   help_text: A data service that gives access to the resource.
 
+- field_name: data_dictionary
+  label: Data dictionary
+  repeating_label: Data dictionary
+  repeating_once: true
+  repeating_subfields:
+
+    - field_name: url
+      label: URL
+
+    - field_name: format
+      label: Format
+
+    - field_name: license
+      label: License
+
+
   # Note: if not provided, this will be autogenerated
 - field_name: uri
   label: URI

diff --git a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_parse.py
@@ -153,6 +153,48 @@ def test_e2e_dcat_to_ckan(self):
             "http://publications.europa.eu/webapi/rdf/sparql"
         ]
 
+    def test_two_distributions(self):
+
+        data = """
+        @prefix dcat: <http://www.w3.org/ns/dcat#> .
+        @prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
+        @prefix dcterms: <http://purl.org/dc/terms/> .
+        @prefix locn: <http://www.w3.org/ns/locn#> .
+        @prefix gsp: <http://www.opengis.net/ont/geosparql#> .
+
+        <https://example.com/dataset1>
+          a dcat:Dataset ;
+          dcterms:title "Dataset 1" ;
+          dcterms:description "This is a dataset" ;
+          dcterms:publisher <https://example.com/publisher1> ;
+          dcat:distribution <http://test.ckan.net/dataset/xxx/resource/yyy> ;
+          dcat:distribution <http://test.ckan.net/dataset/xxx/resource/zzz>
+        .
+
+        <http://test.ckan.net/dataset/xxx/resource/yyy> a dcat:Distribution ;
+            dcterms:title "Resource 1" ;
+            dcterms:identifier "id1"
+        .
+
+        <http://test.ckan.net/dataset/xxx/resource/zzz> a dcat:Distribution ;
+            dcterms:title "Resource 2" ;
+            dcterms:identifier "id2"
+        .
+
+        """
+        p = RDFParser()
+
+        p.parse(data, _format="ttl")
+
+        datasets = [d for d in p.datasets()]
+
+        assert len(datasets[0]["resources"]) == 2
+
+        assert datasets[0]["resources"][0]["name"] == "Resource 1"
+        assert datasets[0]["resources"][0]["identifier"] == "id1"
+        assert datasets[0]["resources"][1]["name"] == "Resource 2"
+        assert datasets[0]["resources"][1]["identifier"] == "id2"
+
     def test_bbox(self):
 
         data = """
@@ -188,3 +230,88 @@ def test_bbox(self):
         assert dataset["bbox"][0]["east"] == "10.3"
         assert dataset["bbox"][0]["north"] == "50.2"
         assert dataset["bbox"][0]["south"] == "20.2"
+
+    def test_data_dictionary_dataset(self):
+
+        data = """
+        @prefix dcat: <http://www.w3.org/ns/dcat#> .
+        @prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
+        @prefix dcterms: <http://purl.org/dc/terms/> .
+        @prefix locn: <http://www.w3.org/ns/locn#> .
+        @prefix gsp: <http://www.opengis.net/ont/geosparql#> .
+
+        <https://example.com/dataset1>
+          a dcat:Dataset ;
+          dcterms:title "Dataset 1" ;
+          dcterms:description "This is a dataset" ;
+          dcterms:publisher <https://example.com/publisher1> ;
+          dcat-us:describedBy [ a dcat:Distribution ;
+                  dcterms:format <https://resources.data.gov/vocab/file-type/TODO/JSON> ;
+                  dcterms:license <https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0> ;
+                  dcat:accessURL <https://example.org/some-data-dictionary> ]
+        .
+        """
+        p = RDFParser()
+
+        p.parse(data, _format="ttl")
+
+        datasets = [d for d in p.datasets()]
+
+        dataset = datasets[0]
+        assert (
+            dataset["data_dictionary"][0]["url"]
+            == "https://example.org/some-data-dictionary"
+        )
+        assert (
+            dataset["data_dictionary"][0]["format"]
+            == "https://resources.data.gov/vocab/file-type/TODO/JSON"
+        )
+        assert (
+            dataset["data_dictionary"][0]["license"]
+            == "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
+        )
+
+    def test_data_dictionary_distribution(self):
+
+        data = """
+        @prefix dcat: <http://www.w3.org/ns/dcat#> .
+        @prefix dcat-us: <http://resources.data.gov/ontology/dcat-us#> .
+        @prefix dcterms: <http://purl.org/dc/terms/> .
+        @prefix locn: <http://www.w3.org/ns/locn#> .
+        @prefix gsp: <http://www.opengis.net/ont/geosparql#> .
+
+        <https://example.com/dataset1>
+          a dcat:Dataset ;
+          dcterms:title "Dataset 1" ;
+          dcterms:description "This is a dataset" ;
+          dcterms:publisher <https://example.com/publisher1> ;
+          dcat:distribution <http://test.ckan.net/dataset/xxx/resource/yyy>
+        .
+
+
+        <http://test.ckan.net/dataset/xxx/resource/yyy> a dcat:Distribution ;
+          dcat-us:describedBy [ a dcat:Distribution ;
+                  dcterms:format <https://resources.data.gov/vocab/file-type/TODO/JSON> ;
+                  dcterms:license <https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0> ;
+                  dcat:accessURL <https://example.org/some-data-dictionary> ]
+        .
+        """
+        p = RDFParser()
+
+        p.parse(data, _format="ttl")
+
+        datasets = [d for d in p.datasets()]
+
+        resource = datasets[0]["resources"][0]
+        assert (
+            resource["data_dictionary"][0]["url"]
+            == "https://example.org/some-data-dictionary"
+        )
+        assert (
+            resource["data_dictionary"][0]["format"]
+            == "https://resources.data.gov/vocab/file-type/TODO/JSON"
+        )
+        assert (
+            resource["data_dictionary"][0]["license"]
+            == "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
+        )
diff --git a/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_us_3/test_dcat_us_3_profile_serialize.py
@@ -26,8 +26,6 @@
     RDFS,
 )
 
-DCAT_AP_PROFILES = ["dcat_us_3"]
-
 
 @pytest.mark.usefixtures("with_plugins", "clean_db")
 @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets")
@@ -224,15 +222,17 @@ def test_e2e_ckan_to_dcat(self):
         wkt_geom = wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4)
         assert self._triple(g, spatial[0][2], LOCN.Geometry, wkt_geom, GSP.wktLiteral)
 
-        distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
-        resource = dataset_dict["resources"][0]
-
         # Alternate identifiers
         ids = []
         for subject in [t[2] for t in g.triples((dataset_ref, ADMS.identifier, None))]:
             ids.append(str(g.value(subject, SKOS.notation)))
         assert ids == dataset["alternate_identifier"]
 
+        # Resources
+
+        distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2]
+        resource = dataset_dict["resources"][0]
+
         # Resources: core fields
 
         assert self._triple(g, distribution_ref, DCT.title, resource["name"])
@@ -340,7 +340,7 @@ def test_distribution_identifier(self):
             ],
         }
 
-        s = RDFSerializer(profiles=DCAT_AP_PROFILES)
+        s = RDFSerializer()
         g = s.g
 
         dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -365,7 +365,7 @@ def test_distribution_identifier_falls_back_to_id(self):
             ],
         }
 
-        s = RDFSerializer(profiles=DCAT_AP_PROFILES)
+        s = RDFSerializer()
         g = s.g
 
         dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -384,7 +384,7 @@ def test_bbox(self):
             ],
         }
 
-        s = RDFSerializer(profiles=DCAT_AP_PROFILES)
+        s = RDFSerializer()
         g = s.g
 
         dataset_ref = s.graph_from_dataset(dataset_dict)
@@ -418,3 +418,131 @@ def test_bbox(self):
             dataset_dict["bbox"][0]["south"],
             data_type=XSD.decimal,
         )
+
+    def test_data_dictionary_dataset(self):
+
+        data_dictionary_dict = {
+            "url": "https://example.org/some-data-dictionary",
+            "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
+            "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0",
+        }
+
+        dataset_dict = {
+            "name": "test-dcat-us",
+            "description": "Test",
+            "data_dictionary": [data_dictionary_dict],
+        }
+
+        s = RDFSerializer()
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset_dict)
+
+        data_dictionary_ref = [s for s in g.objects(dataset_ref, DCATUS.describedBy)][0]
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            RDF.type,
+            DCAT.Distribution,
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCAT.accessURL,
+            URIRef(data_dictionary_dict["url"]),
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCT["format"],
+            URIRef(data_dictionary_dict["format"]),
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCT.license,
+            URIRef(data_dictionary_dict["license"]),
+        )
+
+    def test_data_dictionary_distribution(self):
+
+        data_dictionary_dict = {
+            "url": "https://example.org/some-data-dictionary",
+            "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
+            "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0",
+        }
+
+        dataset_dict = {
+            "name": "test-dcat-us",
+            "description": "Test",
+            "resources": [
+                {
+                    "id": "2607a002-142a-40b1-8026-96457b70c01d",
+                    "name": "test",
+                    "data_dictionary": [data_dictionary_dict],
+                }
+            ],
+        }
+
+        s = RDFSerializer()
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset_dict)
+
+        distribution_ref = [s for s in g.objects(dataset_ref, DCAT.distribution)][0]
+
+        data_dictionary_ref = [
+            s for s in g.objects(distribution_ref, DCATUS.describedBy)
+        ][0]
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            RDF.type,
+            DCAT.Distribution,
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCAT.accessURL,
+            URIRef(data_dictionary_dict["url"]),
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCT["format"],
+            URIRef(data_dictionary_dict["format"]),
+        )
+
+        assert self._triple(
+            g,
+            data_dictionary_ref,
+            DCT.license,
+            URIRef(data_dictionary_dict["license"]),
+        )
+
+    def test_data_dictionary_dataset_string(self):
+
+        dataset_dict = {
+            "name": "test-dcat-us",
+            "description": "Test",
+            "data_dictionary": "https://example.org/some-data-dictionary",
+        }
+
+        s = RDFSerializer()
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset_dict)
+
+        assert self._triple(
+            g,
+            dataset_ref,
+            DCATUS.describedBy,
+            dataset_dict["data_dictionary"],
+        )
diff --git a/ckanext/dcat/tests/shacl/test_shacl.py b/ckanext/dcat/tests/shacl/test_shacl.py
@@ -216,6 +216,8 @@ def test_validate_dcat_us_3_graph():
     graph = graph_from_dataset("ckan_full_dataset_dcat_us_vocabularies.json")
 
     graph.serialize(destination="graph.ttl")
+
+    graph.serialize(destination="graph.xml")
     path = _get_shacl_file_path("dcat-us_3.0_shacl_shapes.ttl")
     r = validate(graph, shacl_graph=path)
     conforms, results_graph, results_text = r

diff --git a/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json b/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json
@@ -160,6 +160,13 @@
         }
     ],
     "spatial_resolution_in_meters": 1.5,
+    "data_dictionary": [
+        {
+            "url": "https://example.org/some-data-dictionary",
+            "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
+            "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
+        }
+    ],
     "resources": [
         {
             "name": "Resource 1",
@@ -187,6 +194,13 @@
                 "http://id.loc.gov/vocabulary/iso639-1/en",
                 "http://id.loc.gov/vocabulary/iso639-1/es",
                 "http://id.loc.gov/vocabulary/iso639-1/ca"
+            ],
+            "data_dictionary": [
+                {
+                    "url": "https://example.org/some-data-dictionary",
+                    "format": "https://resources.data.gov/vocab/file-type/TODO/JSON",
+                    "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0"
+                }
             ]
         }
     ]