diff --git a/CHANGES.rst b/CHANGES.rst index cc10d5fc..5c6cb3ad 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,9 @@ New features processing the input in custom versions of :class:`icat.ingest.IngestReader`. ++ `#148`_, `#149`_: Inject an additional element with environment + information into the input data in :class:`icat.ingest.IngestReader`. + + `#146`_, `#147`_: Better error handling in :class:`icat.ingest.IngestReader`. @@ -40,6 +43,8 @@ Bug fixes and minor changes .. _#145: https://github.com/icatproject/python-icat/pull/145 .. _#146: https://github.com/icatproject/python-icat/issues/146 .. _#147: https://github.com/icatproject/python-icat/pull/147 +.. _#148: https://github.com/icatproject/python-icat/issues/148 +.. _#149: https://github.com/icatproject/python-icat/pull/149 .. _changes-1_2_0: diff --git a/MANIFEST.in b/MANIFEST.in index 655665c1..1e4d72c2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,6 +17,7 @@ include doc/tutorial/*.py include etc/ingest-*.xsd include etc/ingest.xslt include tests/conftest.py +include tests/data/ingest-env.xslt include tests/data/legacy-icatdump-*.xml include tests/data/legacy-icatdump-*.yaml include tests/data/metadata-*.xml diff --git a/doc/src/ingest.rst b/doc/src/ingest.rst index 9ab94740..8245e0fd 100644 --- a/doc/src/ingest.rst +++ b/doc/src/ingest.rst @@ -44,6 +44,57 @@ objects read from the input file in ICAT. :show-inheritance: +.. _ingest-process: + +Ingest process +-------------- + +The processing of ingest files during the instantiation of an +:class:`~icat.ingest.IngestReader` object may be summarized with the +following steps: + +1. Read the metadata and parse it into an :class:`lxml.etree._ElementTree`. + +2. Call :meth:`~icat.ingest.IngestReader.get_xsd` to get the + appropriate XSD file and validate the metadata against that schema. + +3. Inject an ``_environment`` element as first child of the ``data`` + element, see below. + +4. 
Call :meth:`~icat.ingest.IngestReader.get_xslt` to get the + appropriate XSLT file and transform the metadata into the generic ICAT + data XML file format. + +5. Feed the result of the transformation into the parent class + :class:`~icat.dumpfile_xml.XMLDumpFileReader`. + +Once this initialization is done, +:meth:`~icat.ingest.IngestReader.ingest` may be called to read the +individual objects defined in the metadata. + + +.. _ingest-environment: + +The environment element +----------------------- + +During the processing of ingest files, an ``_environment`` element +will be injected as the first child of the ``data`` element. In the +current version of python-icat, this ``_environment`` element has the +following attributes: + + `icat_version` + Version of the ICAT server this client connects to, i.e. the + :attr:`icat.client.Client.apiversion` attribute of the `client` + object being used by this :class:`~icat.ingest.IngestReader`. + +More attributes may be added in future versions. This +``_environment`` element may be used by the XSLT in order to adapt the +result of the transformation to the environment, in particular to +adapt the output to the ICAT schema version it is supposed to conform +to. + + .. _ingest-example: Ingest example diff --git a/etc/ingest.xslt b/etc/ingest.xslt index 6e1e5cee..ad14d715 100644 --- a/etc/ingest.xslt +++ b/etc/ingest.xslt @@ -10,6 +10,8 @@ + + diff --git a/src/icat/ingest.py b/src/icat/ingest.py index 8b5c7fdb..0b8f2e8f 100644 --- a/src/icat/ingest.py +++ b/src/icat/ingest.py @@ -73,6 +73,10 @@ class IngestReader(XMLDumpFileReader): .. versionchanged:: 1.3.0 drop class attribute :attr:`~icat.ingest.IngestReader.XSLT_name` in favour of :attr:`~icat.ingest.IngestReader.XSLT_Map`. + + .. versionchanged:: 1.3.0 + inject an element `_environment` as first child of the root + element into the input data. 
""" SchemaDir = Path("/usr/share/icat") @@ -110,6 +114,7 @@ def __init__(self, client, metadata, investigation): schema = etree.XMLSchema(etree.parse(f)) if not schema.validate(ingest_data): raise InvalidIngestFileError("validation failed") + self.add_environment(client, ingest_data) with self.get_xslt(ingest_data).open("rb") as f: xslt = etree.XSLT(etree.parse(f)) super().__init__(client, xslt(ingest_data)) @@ -180,6 +185,34 @@ def get_xslt(self, ingest_data): raise InvalidIngestFileError("unknown format") return self.SchemaDir / xslt + def get_environment(self, client): + """Get the environment to be injected as an element into the input. + + :param client: the client object being used by this + IngestReader. + :type client: :class:`icat.client.Client` + :return: the attributes to be set on the `_environment` element. + :rtype: :class:`dict` + + .. versionadded:: 1.3.0 + """ + return dict(icat_version=str(client.apiversion)) + + def add_environment(self, client, ingest_data): + """Insert an `_environment` element as first child of the input root. + + :param client: the client object being used by this + IngestReader. + :type client: :class:`icat.client.Client` + :param ingest_data: input data, modified in place. + :type ingest_data: :class:`lxml.etree._ElementTree` + + .. 
versionadded:: 1.3.0 + """ + env = self.get_environment(client) + env_elem = etree.Element("_environment", **env) + ingest_data.getroot().insert(0, env_elem) + def getobjs_from_data(self, data, objindex): typed_objindex = set() for key, obj in super().getobjs_from_data(data, objindex): diff --git a/tests/conftest.py b/tests/conftest.py index 25c01dcb..104901da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -159,6 +159,20 @@ def require_dumpfile_backend(backend): _skip("need %s backend for icat.dumpfile" % (backend)) +def get_icatdata_schema(): + if icat_version < "4.4": + fname = "icatdata-4.3.xsd" + elif icat_version < "4.7": + fname = "icatdata-4.4.xsd" + elif icat_version < "4.10": + fname = "icatdata-4.7.xsd" + elif icat_version < "5.0": + fname = "icatdata-4.10.xsd" + else: + fname = "icatdata-5.0.xsd" + return gettestdata(fname) + + def get_reference_dumpfile(ext = "yaml"): require_icat_version("4.4.0", "oldest available set of test data") if icat_version < "4.7": diff --git a/tests/data/ingest-env.xslt b/tests/data/ingest-env.xslt new file mode 100644 index 00000000..8e0eb4e7 --- /dev/null +++ b/tests/data/ingest-env.xslt @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + 2024-01-22T14:30:51+01:00 + + + + ingest-env.xslt + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + _Investigation + + + + + + + + + diff --git a/tests/data/myingest.xslt b/tests/data/myingest.xslt index 4016a60e..7b7f591b 100644 --- a/tests/data/myingest.xslt +++ b/tests/data/myingest.xslt @@ -10,6 +10,8 @@ + + diff --git a/tests/test_06_ingest.py b/tests/test_06_ingest.py index b9c560f3..e0456d67 100644 --- a/tests/test_06_ingest.py +++ b/tests/test_06_ingest.py @@ -11,7 +11,8 @@ import icat.config from icat.ingest import IngestReader from icat.query import Query -from conftest import getConfig, gettestdata, icat_version, testdatadir +from conftest import (getConfig, gettestdata, icat_version, + get_icatdata_schema, testdatadir) def get_test_investigation(client): 
@@ -80,14 +81,13 @@ class MyIngestReader(IngestReader): cet = datetime.timezone(datetime.timedelta(hours=1)) cest = datetime.timezone(datetime.timedelta(hours=2)) -Case = namedtuple('Case', ['data', 'metadata', 'schema', 'checks', 'marks']) +Case = namedtuple('Case', ['data', 'metadata', 'checks', 'marks']) # Try out different variants for the metadata input file cases = [ Case( data = ["testingest_inl_1", "testingest_inl_2"], metadata = gettestdata("metadata-4.4-inl.xml"), - schema = gettestdata("icatdata-4.4.xsd"), checks = { "testingest_inl_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -127,7 +127,6 @@ class MyIngestReader(IngestReader): Case( data = ["testingest_inl5_1", "testingest_inl5_2"], metadata = gettestdata("metadata-5.0-inl.xml"), - schema = gettestdata("icatdata-5.0.xsd"), checks = { "testingest_inl5_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -186,7 +185,6 @@ class MyIngestReader(IngestReader): Case( data = ["testingest_sep_1", "testingest_sep_2"], metadata = gettestdata("metadata-4.4-sep.xml"), - schema = gettestdata("icatdata-4.4.xsd"), checks = { "testingest_sep_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -226,7 +224,6 @@ class MyIngestReader(IngestReader): Case( data = ["testingest_sep5_1", "testingest_sep5_2"], metadata = gettestdata("metadata-5.0-sep.xml"), - schema = gettestdata("icatdata-5.0.xsd"), checks = { "testingest_sep5_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -286,7 +283,6 @@ class MyIngestReader(IngestReader): data = [ "testingest_sample_1", "testingest_sample_2", "testingest_sample_3", "testingest_sample_4" ], metadata = gettestdata("metadata-sample.xml"), - schema = gettestdata("icatdata-4.4.xsd"), checks = { "testingest_sample_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -357,7 +353,7 @@ def test_ingest_schema(client, investigation, schemadir, case): for name in case.data: datasets.append(client.new("Dataset", 
name=name)) reader = IngestReader(client, case.metadata, investigation) - with case.schema.open("rb") as f: + with get_icatdata_schema().open("rb") as f: schema = etree.XMLSchema(etree.parse(f)) assert schema.validate(reader.infile) @@ -406,7 +402,6 @@ def test_ingest(client, investigation, samples, schemadir, case): Case( data = ["testingest_io_1"], metadata = io_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = { "testingest_io_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -529,28 +524,24 @@ def test_ingest_fileobj(client, investigation, samples, schemadir, case): Case( data = [], metadata = invalid_root_metadata, - schema = None, checks = {}, marks = (), ), Case( data = [], metadata = invalid_ver_metadata, - schema = None, checks = {}, marks = (), ), Case( data = ["testingest_err_invalid_ref"], metadata = invalid_ref_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = {}, marks = (), ), Case( data = ["testingest_err_invalid_dup"], metadata = invalid_dup_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = {}, marks = (), ), @@ -558,7 +549,6 @@ def test_ingest_fileobj(client, investigation, samples, schemadir, case): data = ["testingest_err_invalid_dup_id_1", "testingest_err_invalid_dup_id_2"], metadata = invalid_dup_id_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = {}, marks = (), ), @@ -614,14 +604,12 @@ def test_ingest_error_invalid(client, investigation, schemadir, case): Case( data = ["testingest_err_search_attr"], metadata = searcherr_attr_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = {}, marks = (), ), Case( data = ["testingest_err_search_ref"], metadata = searcherr_ref_metadata, - schema = gettestdata("icatdata-4.4.xsd"), checks = {}, marks = (), ), @@ -642,7 +630,6 @@ def test_ingest_error_searcherr(client, investigation, schemadir, case): Case( data = ["testingest_custom_icatingest_1"], metadata = gettestdata("metadata-custom-icatingest.xml"), - schema = 
gettestdata("icatdata-4.4.xsd"), checks = { "testingest_custom_icatingest_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -662,7 +649,6 @@ def test_ingest_error_searcherr(client, investigation, schemadir, case): Case( data = ["testingest_custom_myingest_1"], metadata = gettestdata("metadata-custom-myingest.xml"), - schema = gettestdata("icatdata-4.4.xsd"), checks = { "testingest_custom_myingest_1": [ ("SELECT ds.description FROM Dataset ds WHERE ds.id = %d", @@ -714,3 +700,38 @@ def test_custom_ingest(client, investigation, samples, schemadir, case): ds = client.assertedSearch(query)[0] for query, res in case.checks[name]: assert client.assertedSearch(query % ds.id)[0] == res + + +env_cases = [ + Case( + data = ["testingest_inl_1", "testingest_inl_2"], + metadata = gettestdata("metadata-4.4-inl.xml"), + checks = {}, + marks = (), + ), +] +@pytest.mark.parametrize("case", [ + pytest.param(c, id=c.metadata.name, marks=c.marks) for c in env_cases +]) +def test_ingest_env(monkeypatch, client, investigation, schemadir, case): + """Test using the _environment element. + + Apply a custom XSLT that extracts an attribute from the + _environment element that is injected by IngestReader into the + input data and puts that value into the head element of the + transformed input. This is to test that adding the _environment + element works and that it is in principle possible to make use of + the values in the XSLT. + """ + monkeypatch.setattr(IngestReader, + "XSLT_Map", dict(icatingest="ingest-env.xslt")) + datasets = [] + for name in case.data: + datasets.append(client.new("Dataset", name=name)) + reader = IngestReader(client, case.metadata, investigation) + with get_icatdata_schema().open("rb") as f: + schema = etree.XMLSchema(etree.parse(f)) + assert schema.validate(reader.infile) + version_elem = reader.infile.xpath("/icatdata/head/apiversion") + assert version_elem + assert version_elem[0].text == str(client.apiversion)