From b8120d4823bfc19e1ba7f3c8077d1c1db5340e3d Mon Sep 17 00:00:00 2001 From: Rolf Krahl Date: Fri, 6 Sep 2024 15:17:44 +0200 Subject: [PATCH 1/3] Review dumpinvestigation.py example script: - Move creation of search expressions to select the ICAT objects to be written into helper functions, - Get rid of legacy search expressions, use Query objects instead, - Use DISTINCT aggregator where appropriate to avoid including objects multiple times, - Various clean up. --- doc/examples/dumpinvestigation.py | 264 ++++++++++++++++++------------ 1 file changed, 163 insertions(+), 101 deletions(-) diff --git a/doc/examples/dumpinvestigation.py b/doc/examples/dumpinvestigation.py index d415d0c9..6dcbe574 100644 --- a/doc/examples/dumpinvestigation.py +++ b/doc/examples/dumpinvestigation.py @@ -17,130 +17,192 @@ logging.basicConfig(level=logging.INFO) -formats = icat.dumpfile.Backends.keys() -config = icat.config.Config() -config.add_variable('file', ("-o", "--outputfile"), - dict(help="output file name or '-' for stdout"), - default='-') -config.add_variable('format', ("-f", "--format"), - dict(help="output file format", choices=formats), - default='YAML') -config.add_variable('investigation', ("investigation",), - dict(help="name and optionally visit id " - "(separated by a colon) of the investigation")) -client, conf = config.getconfig() - -if client.apiversion < '4.4': - raise RuntimeError("Sorry, ICAT version %s is too old, need 4.4.0 or newer." 
- % client.apiversion) -client.login(conf.auth, conf.credentials) - - # ------------------------------------------------------------ # helper # ------------------------------------------------------------ -def getinvestigation(invid): +def get_investigation_id(client, invid): """Search the investigation id from name and optionally visitid.""" + query = Query(client, "Investigation", attributes=["id"]) l = invid.split(':') - if len(l) == 1: - # No colon, invid == name - searchexp = "Investigation.id [name='%s']" % tuple(l) - elif len(l) == 2: + query.addConditions({"name": "= '%s'" % l[0]}) + if len(l) == 2: # one colon, invid == name:visitId - searchexp = "Investigation.id [name='%s' AND visitId='%s']" % tuple(l) + query.addConditions({"visitId": "= '%s'" % l[1]}) - else: + elif len(l) > 2: # too many colons raise RuntimeError("Invalid investigation identifier '%s'" % invid) - return (client.assertedSearch(searchexp)[0]) + return client.assertedSearch(query)[0] -def mergesearch(sexps): +def mergesearch(client, queries): """Do many searches and merge the results in one list excluding dups.""" objs = set() - for se in sexps: + for se in queries: objs.update(client.search(se)) return list(objs) +# The following helper functions control what ICAT objects are written +# in each of the dumpfile chunks. There are three options for the +# items in each list: either queries expressed as Query objects, or +# queries expressed as string expressions, or lists of objects. In +# the first two cases, the search results will be written, in the last +# case, the objects are written as provided. + +def get_auth_types(client, invid): + """Users and groups related to the investigation. + """ + # We need the users related to our investigation via + # InvestigationUser, the users member of one of the groups related + # via InvestigationGroup, and the instrument scientists from the + # instruments related to the investigations. These are + # independent searches, but the results are likely to overlap. 
So + # we need to search and merge results first. + usersearch = [ + Query(client, "User", conditions={ + "investigationUsers." + "investigation.id": "= %d" % invid, + }), + Query(client, "User", conditions={ + "userGroups.grouping.investigationGroups." + "investigation.id": "= %d" % invid, + }), + Query(client, "User", conditions={ + "instrumentScientists.instrument.investigationInstruments." + "investigation.id": "= %d" % invid, + }), + ] + return [ + mergesearch(client, usersearch), + Query(client, "Grouping", conditions={ + "investigationGroups.investigation.id": "= %d" % invid, + }, includes=["userGroups.user"], aggregate="DISTINCT", order=True), + ] + +def get_static_types(client, invid): + """Static stuff that exists independently of the investigation in ICAT. + """ + # Similar situation for ParameterType as for User: need to merge + # ParameterType used for InvestigationParameter, SampleParameter, + # DatasetParameter, and DatafileParameter. + ptsearch = [ + Query(client, "ParameterType", conditions={ + "investigationParameters." + "investigation.id": "= %d" % invid, + }, includes=["facility", "permissibleStringValues"]), + Query(client, "ParameterType", conditions={ + "sampleParameters.sample." + "investigation.id": "= %d" % invid, + }, includes=["facility", "permissibleStringValues"]), + Query(client, "ParameterType", conditions={ + "datasetParameters.dataset." + "investigation.id": "= %d" % invid, + }, includes=["facility", "permissibleStringValues"]), + Query(client, "ParameterType", conditions={ + "datafileParameters.datafile.dataset." 
+ "investigation.id": "= %d" % invid, + }, includes=["facility", "permissibleStringValues"]), + ] + return [ + Query(client, "Facility", + conditions={ + "investigations.id": "= %d" % invid, + }, + order=True), + Query(client, "Instrument", + conditions={ + "investigationInstruments.investigation.id": "= %d" % invid, + }, + includes=["facility", "instrumentScientists.user"], + order=True), + mergesearch(client, ptsearch), + Query(client, "InvestigationType", + conditions={ + "investigations.id": "= %d" % invid, + }, + includes=["facility"], + order=True), + Query(client, "SampleType", + conditions={ + "samples.investigation.id": "= %d" % invid, + }, + includes=["facility"], + aggregate="DISTINCT", + order=True), + Query(client, "DatasetType", + conditions={ + "datasets.investigation.id": "= %d" % invid, + }, + includes=["facility"], + aggregate="DISTINCT", + order=True), + Query(client, "DatafileFormat", + conditions={ + "datafiles.dataset.investigation.id": "= %d" % invid, + }, + includes=["facility"], + aggregate="DISTINCT", + order=True), + ] + +def get_investigation_types(client, invid): + """The investigation and all the stuff that belongs to it. + """ + # The set of objects to be included in the Investigation. 
+ inv_includes = { + "facility", "type.facility", "investigationInstruments", + "investigationInstruments.instrument.facility", "shifts", + "keywords", "publications", "investigationUsers", + "investigationUsers.user", "investigationGroups", + "investigationGroups.grouping", "parameters", + "parameters.type.facility" + } + return [ + Query(client, "Investigation", + conditions={"id":"in (%d)" % invid}, + includes=inv_includes), + Query(client, "Sample", + conditions={"investigation.id":"= %d" % invid}, + includes={"investigation", "type.facility", + "parameters", "parameters.type.facility"}, + order=True), + Query(client, "Dataset", + conditions={"investigation.id":"= %d" % invid}, + includes={"investigation", "type.facility", "sample", + "parameters", "parameters.type.facility"}, + order=True), + Query(client, "Datafile", + conditions={"dataset.investigation.id":"= %d" % invid}, + includes={"dataset", "datafileFormat.facility", + "parameters", "parameters.type.facility"}, + order=True) + ] # ------------------------------------------------------------ # Do it # ------------------------------------------------------------ -invid = getinvestigation(conf.investigation) - +formats = icat.dumpfile.Backends.keys() +config = icat.config.Config() +config.add_variable('file', ("-o", "--outputfile"), + dict(help="output file name or '-' for stdout"), + default='-') +config.add_variable('format', ("-f", "--format"), + dict(help="output file format", choices=formats), + default='YAML') +config.add_variable('investigation', ("investigation",), + dict(help="name and optionally visit id " + "(separated by a colon) of the investigation")) +client, conf = config.getconfig() -# We need the users related to our investigation via -# InvestigationUser, the users member of one of the groups related via -# InvestigationGroup, and the instrument scientists from the -# instruments related to the investigations. These are independent -# searches, but the results are likely to overlap. 
So we need to -# search and merge results first. Similar situation for ParameterType. -usersearch = [("User <-> InvestigationUser <-> Investigation [id=%d]"), - ("User <-> UserGroup <-> Grouping <-> InvestigationGroup " - "<-> Investigation [id=%d]"), - ("User <-> InstrumentScientist <-> Instrument " - "<-> InvestigationInstrument <-> Investigation [id=%d]")] -ptsearch = [("ParameterType INCLUDE Facility, PermissibleStringValue " - "<-> InvestigationParameter <-> Investigation [id=%d]"), - ("ParameterType INCLUDE Facility, PermissibleStringValue " - "<-> SampleParameter <-> Sample <-> Investigation [id=%d]"), - ("ParameterType INCLUDE Facility, PermissibleStringValue " - "<-> DatasetParameter <-> Dataset <-> Investigation [id=%d]"), - ("ParameterType INCLUDE Facility, PermissibleStringValue " - "<-> DatafileParameter <-> Datafile <-> Dataset " - "<-> Investigation [id=%d]"), ] +if client.apiversion < '4.4': + raise RuntimeError("Sorry, ICAT version %s is too old, need 4.4.0 or newer." + % client.apiversion) +client.login(conf.auth, conf.credentials) -# The set of objects to be included in the Investigation. -inv_includes = { "facility", "type.facility", "investigationInstruments", - "investigationInstruments.instrument.facility", "shifts", - "keywords", "publications", "investigationUsers", - "investigationUsers.user", "investigationGroups", - "investigationGroups.grouping", "parameters", - "parameters.type.facility" } -# The following lists control what ICAT objects are written in each of -# the dumpfile chunks. There are three options for the items in each -# list: either queries expressed as Query objects, or queries -# expressed as string expressions, or lists of objects. In the first -# two cases, the seacrh results will be written, in the last case, the -# objects are written as provided. We assume that there is only one -# relevant facility, e.g. that all objects related to the -# investigation are related to the same facility. 
We may thus ommit -# the facility from the ORDER BY clauses. -authtypes = [mergesearch([s % invid for s in usersearch]), - ("Grouping ORDER BY name INCLUDE UserGroup, User " - "<-> InvestigationGroup <-> Investigation [id=%d]" % invid)] -statictypes = [("Facility ORDER BY name"), - ("Instrument ORDER BY name " - "INCLUDE Facility, InstrumentScientist, User " - "<-> InvestigationInstrument <-> Investigation [id=%d]" - % invid), - (mergesearch([s % invid for s in ptsearch])), - ("InvestigationType ORDER BY name INCLUDE Facility " - "<-> Investigation [id=%d]" % invid), - ("SampleType ORDER BY name, molecularFormula INCLUDE Facility " - "<-> Sample <-> Investigation [id=%d]" % invid), - ("DatasetType ORDER BY name INCLUDE Facility " - "<-> Dataset <-> Investigation [id=%d]" % invid), - ("DatafileFormat ORDER BY name, version INCLUDE Facility " - "<-> Datafile <-> Dataset <-> Investigation [id=%d]" % invid)] -investtypes = [Query(client, "Investigation", - conditions={"id":"in (%d)" % invid}, - includes=inv_includes), - Query(client, "Sample", order=["name"], - conditions={"investigation.id":"= %d" % invid}, - includes={"investigation", "type.facility", - "parameters", "parameters.type.facility"}), - Query(client, "Dataset", order=["name"], - conditions={"investigation.id":"= %d" % invid}, - includes={"investigation", "type.facility", "sample", - "parameters", "parameters.type.facility"}), - Query(client, "Datafile", order=["dataset.name", "name"], - conditions={"dataset.investigation.id":"= %d" % invid}, - includes={"dataset", "datafileFormat.facility", - "parameters", "parameters.type.facility"})] +invid = get_investigation_id(client, conf.investigation) with open_dumpfile(client, conf.file, conf.format, 'w') as dumpfile: - dumpfile.writedata(authtypes) - dumpfile.writedata(statictypes) - dumpfile.writedata(investtypes) + dumpfile.writedata(get_auth_types(client, invid)) + dumpfile.writedata(get_static_types(client, invid)) + 
dumpfile.writedata(get_investigation_types(client, invid)) From 4bed5ef9035d7744b389231c4107178f93a4ef57 Mon Sep 17 00:00:00 2001 From: Rolf Krahl Date: Wed, 18 Sep 2024 14:38:26 +0200 Subject: [PATCH 2/3] Typo in docstring in example script --- doc/examples/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/examples/ingest.py b/doc/examples/ingest.py index 83a2333d..82880807 100644 --- a/doc/examples/ingest.py +++ b/doc/examples/ingest.py @@ -24,7 +24,7 @@ The script takes the name of an investigation as argument. The investigation MUST exist in ICAT beforehand and all datasets in the input directory MUST belong to this investigation. The script will -create tha datasets in ICAT, e.g. they MUST NOT exist in ICAT +create the datasets in ICAT, e.g. they MUST NOT exist in ICAT beforehand. The metadata input file may contain attributes and related objects (datasetInstrument, datasetTechnique, datasetParameter) for the datasets provided in the input directory. From cc9573de4b415356dd1a7a465b97c52e74599926 Mon Sep 17 00:00:00 2001 From: Rolf Krahl Date: Fri, 11 Oct 2024 14:33:30 +0200 Subject: [PATCH 3/3] Update changelog --- CHANGES.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index d879da34..5ed42d76 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -18,11 +18,13 @@ Bug fixes and minor changes --------------------------- + `#162`_: Minor updates in the tool chain ++ `#164`_: Fix `dumpinvestigation.py` example script .. _#160: https://github.com/icatproject/python-icat/issues/160 .. _#161: https://github.com/icatproject/python-icat/pull/161 .. _#162: https://github.com/icatproject/python-icat/pull/162 .. _#163: https://github.com/icatproject/python-icat/pull/163 +.. _#164: https://github.com/icatproject/python-icat/pull/164 .. _changes-1_4_0: