Merge pull request #201 from GovDataOfficial/publisher-uri-handling

Fix behavior if publisher_uri is not available
ckan · Oct 22, 2021 · b50bd24 · b50bd24
2 parents 27b66c9 + 4eec7d1
commit b50bd24
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 33 deletions.
diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py
@@ -22,7 +22,7 @@
 from ckan.lib.munge import munge_tag
 from ckan.lib.helpers import url_for
 
-from ckanext.dcat.utils import resource_uri, publisher_uri_from_dataset_dict, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS
+from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS
 
 DCT = Namespace("http://purl.org/dc/terms/")
 DCAT = Namespace("http://www.w3.org/ns/dcat#")
@@ -1135,18 +1135,25 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             dataset_dict.get('organization'),
         ]):
 
-            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
+            publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
+            publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
+            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
             if publisher_uri:
                 publisher_details = CleanedURIRef(publisher_uri)
+            elif not publisher_name and publisher_uri_fallback:
+                # neither URI nor name are available, use organization as fallback
+                publisher_details = CleanedURIRef(publisher_uri_fallback)
             else:
-                # No organization nor publisher_uri
+                # No publisher_uri
                 publisher_details = BNode()
 
             g.add((publisher_details, RDF.type, FOAF.Organization))
             g.add((dataset_ref, DCT.publisher, publisher_details))
 
-            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
-            if not publisher_name and dataset_dict.get('organization'):
+            # In case no name and URI are available, again fall back to organization.
+            # If no name but a URI is available, the name literal remains empty to
+            # avoid mixing organization and dataset values.
+            if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
                 publisher_name = dataset_dict['organization']['title']
 
             g.add((publisher_details, FOAF.name, Literal(publisher_name)))
@@ -1481,19 +1488,25 @@ def _publisher_graph(self, dataset_ref, dataset_dict):
             dataset_dict.get('organization'),
         ]):
 
-            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
+            publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
+            publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
+            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
             if publisher_uri:
-                publisher_details = URIRef(publisher_uri)
+                publisher_details = CleanedURIRef(publisher_uri)
+            elif not publisher_name and publisher_uri_fallback:
+                # neither URI nor name are available, use organization as fallback
+                publisher_details = CleanedURIRef(publisher_uri_fallback)
             else:
-                # No organization nor publisher_uri
+                # No publisher_uri
                 publisher_details = BNode()
 
             self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
             self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))
 
-
-            publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
-            if not publisher_name and dataset_dict.get('organization'):
+            # In case no name and URI are available, again fall back to organization.
+            # If no name but a URI is available, the name literal remains empty to
+            # avoid mixing organization and dataset values.
+            if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
                 publisher_name = dataset_dict['organization']['title']
             self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))
 

diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
@@ -409,6 +409,39 @@ def test_publisher_no_uri(self):
         assert self._triple(g, publisher, RDF.type, FOAF.Organization)
         assert self._triple(g, publisher, FOAF.name, extras['publisher_name'])
 
+    def test_publisher_org_no_uri(self):
+        dataset = {
+            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
+            'name': 'test-dataset',
+            'organization': {
+                'id': '',
+                'name': 'publisher1',
+                'title': 'Example Publisher from Org',
+            },
+            'extras': [
+                {'key': 'publisher_name', 'value': 'Example Publisher'},
+                {'key': 'publisher_email', 'value': 'publisher@example.com'},
+                {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
+                {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
+            ]
+        }
+        extras = self._extras(dataset)
+
+        s = RDFSerializer()
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset)
+
+        publisher = self._triple(g, dataset_ref, DCT.publisher, None)[2]
+        assert publisher
+        assert isinstance(publisher, BNode)
+
+        assert self._triple(g, publisher, RDF.type, FOAF.Organization)
+        assert self._triple(g, publisher, FOAF.name, extras['publisher_name'])
+        assert self._triple(g, publisher, FOAF.mbox, extras['publisher_email'])
+        assert self._triple(g, publisher, FOAF.homepage, URIRef(extras['publisher_url']))
+        assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type']))
+
     def test_temporal(self):
         dataset = {
             'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',

diff --git a/ckanext/dcat/tests/test_schemaorg_profile_serialize.py b/ckanext/dcat/tests/test_schemaorg_profile_serialize.py
@@ -130,6 +130,43 @@ def test_publisher_extras(self):
         assert self._triple(g, contact_point, SCHEMA.url, extras['publisher_url'])
         assert self._triple(g, contact_point, SCHEMA.contactType, 'customer service')
 
+    def test_publisher_no_uri(self):
+        dataset = {
+            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
+            'name': 'test-dataset',
+            'organization': {
+                'id': '',
+                'name': 'publisher1',
+                'title': 'Example Publisher from Org',
+            },
+            'extras': [
+                {'key': 'publisher_name', 'value': 'Example Publisher'},
+                {'key': 'publisher_email', 'value': 'publisher@example.com'},
+                {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
+                {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
+            ]
+        }
+        extras = self._extras(dataset)
+
+        s = RDFSerializer(profiles=['schemaorg'])
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset)
+
+        publisher = self._triple(g, dataset_ref, SCHEMA.publisher, None)[2]
+        assert publisher
+        assert isinstance(publisher, BNode)
+        assert self._triple(g, publisher, RDF.type, SCHEMA.Organization)
+        assert self._triple(g, publisher, SCHEMA.name, extras['publisher_name'])
+
+        contact_point = self._triple(g, publisher, SCHEMA.contactPoint, None)[2]
+        assert contact_point
+        assert self._triple(g, contact_point, RDF.type, SCHEMA.ContactPoint)
+        assert self._triple(g, contact_point, SCHEMA.name, extras['publisher_name'])
+        assert self._triple(g, contact_point, SCHEMA.email, extras['publisher_email'])
+        assert self._triple(g, contact_point, SCHEMA.url, extras['publisher_url'])
+        assert self._triple(g, contact_point, SCHEMA.contactType, 'customer service')
+
     def test_publisher_org(self):
         dataset = {
             'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',

diff --git a/ckanext/dcat/utils.py b/ckanext/dcat/utils.py
@@ -241,38 +241,22 @@ def resource_uri(resource_dict):
     return uri
 
 
-def publisher_uri_from_dataset_dict(dataset_dict):
+def publisher_uri_organization_fallback(dataset_dict):
     '''
-    Returns an URI for a dataset's publisher
-
-    This will be used to uniquely reference the publisher on the RDF
-    serializations.
-
-    The value will be the first found of:
-
-        1. The value of the `publisher_uri` field
-        2. The value of an extra with key `publisher_uri`
-        3. `catalog_uri()` + '/organization/' + `organization id` field
+    Builds a fallback publisher URI of the form
+    `catalog_uri()` + '/organization/' + `organization id` field
 
     Check the documentation for `catalog_uri()` for the recommended ways of
     setting it.
 
     Returns a string with the publisher URI, or None if no URI could be
     generated.
     '''
-
-    uri = dataset_dict.get('publisher_uri')
-    if not uri:
-        for extra in dataset_dict.get('extras', []):
-            if extra['key'] == 'publisher_uri':
-                uri = extra['value']
-                break
-    if not uri and dataset_dict.get('organization'):
-        uri = '{0}/organization/{1}'.format(catalog_uri().rstrip('/'),
+    if dataset_dict.get('organization'):
+        return '{0}/organization/{1}'.format(catalog_uri().rstrip('/'),
                                             dataset_dict['organization']['id'])
 
-    return uri
-
+    return None
 
 def dataset_id_from_resource(resource_dict):
     '''