diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 69553480..d1479717 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -22,7 +22,7 @@ from ckan.lib.munge import munge_tag from ckan.lib.helpers import url_for -from ckanext.dcat.utils import resource_uri, publisher_uri_from_dataset_dict, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS +from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS DCT = Namespace("http://purl.org/dc/terms/") DCAT = Namespace("http://www.w3.org/ns/dcat#") @@ -1135,18 +1135,25 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): dataset_dict.get('organization'), ]): - publisher_uri = publisher_uri_from_dataset_dict(dataset_dict) + publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') + publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) + publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') if publisher_uri: publisher_details = CleanedURIRef(publisher_uri) + elif not publisher_name and publisher_uri_fallback: + # neither URI nor name are available, use organization as fallback + publisher_details = CleanedURIRef(publisher_uri_fallback) else: - # No organization nor publisher_uri + # No publisher_uri publisher_details = BNode() g.add((publisher_details, RDF.type, FOAF.Organization)) g.add((dataset_ref, DCT.publisher, publisher_details)) - publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') - if not publisher_name and dataset_dict.get('organization'): + # In case no name and URI are available, again fall back to organization. + # If no name but an URI is available, the name literal remains empty to + # avoid mixing organization and dataset values. 
+ if not publisher_name and not publisher_uri and dataset_dict.get('organization'): publisher_name = dataset_dict['organization']['title'] g.add((publisher_details, FOAF.name, Literal(publisher_name))) @@ -1481,19 +1488,25 @@ def _publisher_graph(self, dataset_ref, dataset_dict): dataset_dict.get('organization'), ]): - publisher_uri = publisher_uri_from_dataset_dict(dataset_dict) + publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri') + publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict) + publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') if publisher_uri: - publisher_details = URIRef(publisher_uri) + publisher_details = CleanedURIRef(publisher_uri) + elif not publisher_name and publisher_uri_fallback: + # neither URI nor name are available, use organization as fallback + publisher_details = CleanedURIRef(publisher_uri_fallback) else: - # No organization nor publisher_uri + # No publisher_uri publisher_details = BNode() self.g.add((publisher_details, RDF.type, SCHEMA.Organization)) self.g.add((dataset_ref, SCHEMA.publisher, publisher_details)) - - publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name') - if not publisher_name and dataset_dict.get('organization'): + # In case no name and URI are available, again fall back to organization. + # If no name but an URI is available, the name literal remains empty to + # avoid mixing organization and dataset values. 
def test_publisher_org_no_uri(self):
    # The dataset carries an organization plus publisher_* extras but no
    # publisher_uri. The publisher must be serialized as a blank node and
    # described with the values taken from the extras.
    organization = {
        'id': '',
        'name': 'publisher1',
        'title': 'Example Publisher from Org',
    }
    publisher_extras = [
        {'key': 'publisher_name', 'value': 'Example Publisher'},
        {'key': 'publisher_email', 'value': 'publisher@example.com'},
        {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
        {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
    ]
    dataset_dict = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'organization': organization,
        'extras': publisher_extras,
    }
    expected = self._extras(dataset_dict)

    serializer = RDFSerializer()
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset_dict)

    publisher_triple = self._triple(graph, dataset_ref, DCT.publisher, None)
    publisher = publisher_triple[2]
    assert publisher
    assert isinstance(publisher, BNode)

    # Every publisher property is expected on the blank node itself.
    publisher_properties = (
        (RDF.type, FOAF.Organization),
        (FOAF.name, expected['publisher_name']),
        (FOAF.mbox, expected['publisher_email']),
        (FOAF.homepage, URIRef(expected['publisher_url'])),
        (DCT.type, URIRef(expected['publisher_type'])),
    )
    for predicate, value in publisher_properties:
        assert self._triple(graph, publisher, predicate, value)
def test_publisher_no_uri(self):
    # schema.org profile: the dataset carries an organization plus
    # publisher_* extras but no publisher_uri. The publisher must be a
    # blank node named after the extras, with a matching contact point.
    organization = {
        'id': '',
        'name': 'publisher1',
        'title': 'Example Publisher from Org',
    }
    publisher_extras = [
        {'key': 'publisher_name', 'value': 'Example Publisher'},
        {'key': 'publisher_email', 'value': 'publisher@example.com'},
        {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
        {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
    ]
    dataset_dict = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'organization': organization,
        'extras': publisher_extras,
    }
    expected = self._extras(dataset_dict)

    serializer = RDFSerializer(profiles=['schemaorg'])
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset_dict)

    publisher_triple = self._triple(graph, dataset_ref, SCHEMA.publisher, None)
    publisher = publisher_triple[2]
    assert publisher
    assert isinstance(publisher, BNode)
    assert self._triple(graph, publisher, RDF.type, SCHEMA.Organization)
    assert self._triple(graph, publisher, SCHEMA.name, expected['publisher_name'])

    contact_triple = self._triple(graph, publisher, SCHEMA.contactPoint, None)
    contact_point = contact_triple[2]
    assert contact_point

    # The contact point repeats the publisher name and carries the
    # remaining contact details.
    contact_properties = (
        (RDF.type, SCHEMA.ContactPoint),
        (SCHEMA.name, expected['publisher_name']),
        (SCHEMA.email, expected['publisher_email']),
        (SCHEMA.url, expected['publisher_url']),
        (SCHEMA.contactType, 'customer service'),
    )
    for predicate, value in contact_properties:
        assert self._triple(graph, contact_point, predicate, value)
def publisher_uri_organization_fallback(dataset_dict):
    '''
    Builds a fallback publisher URI from the dataset's organization

    The URI has the form
    `catalog_uri()` + '/organization/' + `organization id` field

    Check the documentation for `catalog_uri()` for the recommended ways of
    setting it.

    Returns a string with the publisher URI, or None if no URI could be
    generated (the dataset has no organization, or the organization has no
    usable `id`).
    '''
    organization = dataset_dict.get('organization')
    if not organization:
        return None

    organization_id = organization.get('id')
    if not organization_id:
        # Without an id the result would be `<catalog>/organization/` for
        # every such organization, i.e. one shared URI for unrelated
        # publishers. Report "no URI" instead so callers can pick another
        # way of identifying the publisher.
        return None

    return '{0}/organization/{1}'.format(catalog_uri().rstrip('/'),
                                         organization_id)