Skip to content

Commit

Permalink
Merge pull request #201 from GovDataOfficial/publisher-uri-handling
Browse files Browse the repository at this point in the history
 Fix behavior if publisher_uri is not available
  • Loading branch information
amercader authored Oct 22, 2021
2 parents 27b66c9 + 4eec7d1 commit b50bd24
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 33 deletions.
35 changes: 24 additions & 11 deletions ckanext/dcat/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ckan.lib.munge import munge_tag
from ckan.lib.helpers import url_for

from ckanext.dcat.utils import resource_uri, publisher_uri_from_dataset_dict, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS
from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS

DCT = Namespace("http://purl.org/dc/terms/")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
Expand Down Expand Up @@ -1135,18 +1135,25 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
dataset_dict.get('organization'),
]):

publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
if publisher_uri:
publisher_details = CleanedURIRef(publisher_uri)
elif not publisher_name and publisher_uri_fallback:
# neither URI nor name are available, use organization as fallback
publisher_details = CleanedURIRef(publisher_uri_fallback)
else:
# No organization nor publisher_uri
# No publisher_uri
publisher_details = BNode()

g.add((publisher_details, RDF.type, FOAF.Organization))
g.add((dataset_ref, DCT.publisher, publisher_details))

publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
if not publisher_name and dataset_dict.get('organization'):
# If neither a name nor a URI is available, fall back to the organization title.
# If a URI is available but no name, the name literal remains empty to
# avoid mixing organization and dataset values.
if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
publisher_name = dataset_dict['organization']['title']

g.add((publisher_details, FOAF.name, Literal(publisher_name)))
Expand Down Expand Up @@ -1481,19 +1488,25 @@ def _publisher_graph(self, dataset_ref, dataset_dict):
dataset_dict.get('organization'),
]):

publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
if publisher_uri:
publisher_details = URIRef(publisher_uri)
publisher_details = CleanedURIRef(publisher_uri)
elif not publisher_name and publisher_uri_fallback:
# neither URI nor name are available, use organization as fallback
publisher_details = CleanedURIRef(publisher_uri_fallback)
else:
# No organization nor publisher_uri
# No publisher_uri
publisher_details = BNode()

self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))


publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
if not publisher_name and dataset_dict.get('organization'):
# If neither a name nor a URI is available, fall back to the organization title.
# If a URI is available but no name, the name literal remains empty to
# avoid mixing organization and dataset values.
if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
publisher_name = dataset_dict['organization']['title']
self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))

Expand Down
33 changes: 33 additions & 0 deletions ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,39 @@ def test_publisher_no_uri(self):
assert self._triple(g, publisher, RDF.type, FOAF.Organization)
assert self._triple(g, publisher, FOAF.name, extras['publisher_name'])

def test_publisher_org_no_uri(self):
    '''
    A dataset that has an organization and publisher extras, but no
    `publisher_uri`, is expected to get a blank node as publisher, and
    that node must carry the values taken from the extras.
    '''
    organization = {
        'id': '',
        'name': 'publisher1',
        'title': 'Example Publisher from Org',
    }
    publisher_extras = [
        {'key': 'publisher_name', 'value': 'Example Publisher'},
        # NOTE(review): this value looks like a scraper-obfuscated e-mail
        # placeholder -- confirm the intended address.
        {'key': 'publisher_email', 'value': '[email protected]'},
        {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
        {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'organization': organization,
        'extras': publisher_extras,
    }
    extras = self._extras(dataset)

    serializer = RDFSerializer()
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset)

    # The publisher is the object of the dct:publisher triple.
    publisher = self._triple(graph, dataset_ref, DCT.publisher, None)[2]
    assert publisher
    assert isinstance(publisher, BNode)

    # (predicate, expected object) pairs that must hang off the publisher node.
    expected_objects = [
        (RDF.type, FOAF.Organization),
        (FOAF.name, extras['publisher_name']),
        (FOAF.mbox, extras['publisher_email']),
        (FOAF.homepage, URIRef(extras['publisher_url'])),
        (DCT.type, URIRef(extras['publisher_type'])),
    ]
    for predicate, expected in expected_objects:
        assert self._triple(graph, publisher, predicate, expected)

def test_temporal(self):
dataset = {
'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
Expand Down
37 changes: 37 additions & 0 deletions ckanext/dcat/tests/test_schemaorg_profile_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,43 @@ def test_publisher_extras(self):
assert self._triple(g, contact_point, SCHEMA.url, extras['publisher_url'])
assert self._triple(g, contact_point, SCHEMA.contactType, 'customer service')

def test_publisher_no_uri(self):
    '''
    With the `schemaorg` profile, a dataset that has an organization and
    publisher extras, but no `publisher_uri`, is expected to get a blank
    node as publisher, with a contact point built from the extras.
    '''
    organization = {
        'id': '',
        'name': 'publisher1',
        'title': 'Example Publisher from Org',
    }
    publisher_extras = [
        {'key': 'publisher_name', 'value': 'Example Publisher'},
        # NOTE(review): this value looks like a scraper-obfuscated e-mail
        # placeholder -- confirm the intended address.
        {'key': 'publisher_email', 'value': '[email protected]'},
        {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'},
        {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'},
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'organization': organization,
        'extras': publisher_extras,
    }
    extras = self._extras(dataset)

    serializer = RDFSerializer(profiles=['schemaorg'])
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset)

    # The publisher is the object of the schema:publisher triple.
    publisher = self._triple(graph, dataset_ref, SCHEMA.publisher, None)[2]
    assert publisher
    assert isinstance(publisher, BNode)
    for predicate, expected in [
        (RDF.type, SCHEMA.Organization),
        (SCHEMA.name, extras['publisher_name']),
    ]:
        assert self._triple(graph, publisher, predicate, expected)

    # The contact point is a separate node linked from the publisher.
    contact_point = self._triple(graph, publisher, SCHEMA.contactPoint, None)[2]
    assert contact_point
    for predicate, expected in [
        (RDF.type, SCHEMA.ContactPoint),
        (SCHEMA.name, extras['publisher_name']),
        (SCHEMA.email, extras['publisher_email']),
        (SCHEMA.url, extras['publisher_url']),
        (SCHEMA.contactType, 'customer service'),
    ]:
        assert self._triple(graph, contact_point, predicate, expected)

def test_publisher_org(self):
dataset = {
'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
Expand Down
28 changes: 6 additions & 22 deletions ckanext/dcat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,38 +241,22 @@ def resource_uri(resource_dict):
return uri


def publisher_uri_from_dataset_dict(dataset_dict):
def publisher_uri_organization_fallback(dataset_dict):
'''
Returns an URI for a dataset's publisher
This will be used to uniquely reference the publisher on the RDF
serializations.
The value will be the first found of:
1. The value of the `publisher_uri` field
2. The value of an extra with key `publisher_uri`
3. `catalog_uri()` + '/organization/' + `organization id` field
Builds a fallback publisher URI of the form
`catalog_uri()` + '/organization/' + `organization id` field
Check the documentation for `catalog_uri()` for the recommended ways of
setting it.
Returns a string with the publisher URI, or None if no URI could be
generated.
'''

uri = dataset_dict.get('publisher_uri')
if not uri:
for extra in dataset_dict.get('extras', []):
if extra['key'] == 'publisher_uri':
uri = extra['value']
break
if not uri and dataset_dict.get('organization'):
uri = '{0}/organization/{1}'.format(catalog_uri().rstrip('/'),
if dataset_dict.get('organization'):
return '{0}/organization/{1}'.format(catalog_uri().rstrip('/'),
dataset_dict['organization']['id'])

return uri

return None

def dataset_id_from_resource(resource_dict):
'''
Expand Down

0 comments on commit b50bd24

Please sign in to comment.