From 9a1e70864c85b686854309c54aa6d410ca05bcd4 Mon Sep 17 00:00:00 2001 From: Ross Spencer Date: Tue, 22 Sep 2020 11:25:56 -0400 Subject: [PATCH] Improve retrieval of original name Having used AIPscan a little more, leaving the name blank is strange to look at, but we also cannot always retrieve the original name. We do our best to a) retrieve the name in this commit, as well as b) populate the name with the transfer UUID as an alternative. --- AIPscan/Aggregator/mets_parse_helpers.py | 31 +- AIPscan/Aggregator/tasks.py | 2 +- .../original_name_mets/dataverse_example.xml | 2688 +++++++++++++++++ .../document-empty-dirs.xml | 859 ++++++ AIPscan/Aggregator/tests/test_mets.py | 21 +- 5 files changed, 3584 insertions(+), 17 deletions(-) create mode 100644 AIPscan/Aggregator/tests/fixtures/original_name_mets/dataverse_example.xml create mode 100644 AIPscan/Aggregator/tests/fixtures/original_name_mets/document-empty-dirs.xml diff --git a/AIPscan/Aggregator/mets_parse_helpers.py b/AIPscan/Aggregator/mets_parse_helpers.py index 4c81d01e..f1d4fefb 100644 --- a/AIPscan/Aggregator/mets_parse_helpers.py +++ b/AIPscan/Aggregator/mets_parse_helpers.py @@ -44,36 +44,43 @@ def parse_mets_with_metsrw(mets_file): def get_aip_original_name(mets): - """Retrieve PREMIS original name from a METSDocument object.""" + """Retrieve PREMIS original name from a METSDocument object. + + If the original name cannot be reliably retrieved from the METS file + a METSError exception is raised to be handled by the caller as + desired. + """ # Negated as we're going to want to remove this length of values. NAMESUFFIX = -len("-00000000-0000-0000-0000-000000000000") + # The transfer directory prefix is a directory prefix that can also + # exist in a dmdSec intellectual entity and we want to identify and + # ignore those. 
+ TRANSFER_DIR_PREFIX = "%transferDirectory%" + NAMESPACES = {u"premis": u"http://www.loc.gov/premis/v3"} ELEM_ORIGINAL_NAME_PATTERN = ".//premis:originalName" - FIRST_DMDSEC = "dmdSec_1" - original_name = "" for fsentry in mets.all_files(): - try: - dmdsec = fsentry.dmdsecs[0] - if dmdsec.id_string != FIRST_DMDSEC: - continue + for dmdsec in fsentry.dmdsecs: dmd_element = dmdsec.serialize() full_name = dmd_element.find( ELEM_ORIGINAL_NAME_PATTERN, namespaces=NAMESPACES ) + if full_name is not None and full_name.text.startswith(TRANSFER_DIR_PREFIX): + # We don't want this value, it will usually represent a + # directory entity. + continue try: original_name = full_name.text[:NAMESUFFIX] except AttributeError: - pass - break - except IndexError: - pass + continue + # There should be a transfer name in every METS. if original_name == "": - raise METSError() + raise METSError("Cannot locate transfer name in METS") return original_name diff --git a/AIPscan/Aggregator/tasks.py b/AIPscan/Aggregator/tasks.py index 4f9ab83d..4950ef4a 100644 --- a/AIPscan/Aggregator/tasks.py +++ b/AIPscan/Aggregator/tasks.py @@ -294,7 +294,7 @@ def get_mets( except METSError: # Some other error with the METS file that we might want to # log and act upon. 
- originalName = "" + originalName = packageUUID aip = create_aip_object( package_uuid=packageUUID, diff --git a/AIPscan/Aggregator/tests/fixtures/original_name_mets/dataverse_example.xml b/AIPscan/Aggregator/tests/fixtures/original_name_mets/dataverse_example.xml new file mode 100644 index 00000000..92f7364a --- /dev/null +++ b/AIPscan/Aggregator/tests/fixtures/original_name_mets/dataverse_example.xml @@ -0,0 +1,2688 @@ + + + + + + + + + + + A study of my afternoon snacks + https://doi.org/10.5072/FK2/QAWS8O + + + Noel Fielding + Mary Berry + Hollywood, Paul + + + Scholars Portal Dataverse + + + 6.0 + + + + + CC0 Waiver + + + + + + + + + + + + + + + + UUID + e10369a1-a485-4001-8967-33c18e1ab142 + + dataverse-e10369a1-a485-4001-8967-33c18e1ab142 + + + + + + + + + + + + + + UUID + 9bbb67b9-cad2-4b6c-ab8e-a91229f2506e + + + 0 + + sha256 + 51a28297c482b67b4164a4c5c06b8cc15be3cbf79a619ff39a4b581c649d9104 + + 1945 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1-ddi.xml + + + + + + + + + + UUID + f5a8d63e-e9c3-4f14-9b04-a40e7800289e + + ingestion + 2020-09-22T15:09:17.361175+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 7aca39d2-6ae4-486b-83de-e2d680bf2229 + + message digest calculation + 2020-09-22T15:09:17.753016+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 51a28297c482b67b4164a4c5c06b8cc15be3cbf79a619ff39a4b581c649d9104 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 714f9f37-c0dc-49d5-95e8-354f82ea04a0 + + virus check + 2020-09-22T15:09:21.147870+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation 
system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 5701888a-898b-4faf-bf54-b768cdfd83e2 + + + 0 + + sha256 + 913b55eaa1cc0643cd0fe23eeb8f08c6a4441f2a79395b132be74e6886866645 + + 445 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1.RData + + derivation + has source + + UUID + d71a73f5-a5a4-41c8-8586-cdf35a758f64 + + + UUID + 7bb87306-4917-4368-8cd8-c9d718aa1f32 + + + + + + + + + + + + UUID + b0357f12-0138-4601-945b-85c542c7b7f4 + + ingestion + 2020-09-22T15:09:17.409962+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 2e4959a7-7ba1-4d33-b7a8-c0825d208a5d + + message digest calculation + 2020-09-22T15:09:17.655737+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 913b55eaa1cc0643cd0fe23eeb8f08c6a4441f2a79395b132be74e6886866645 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 8d216bc8-b839-4027-8c24-e9830187d58b + + virus check + 2020-09-22T15:09:21.172868+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + d71a73f5-a5a4-41c8-8586-cdf35a758f64 + + + 0 + + sha256 + 
455db507707fc489cc655ea8cfec6af5f1fa05d9d022cbe0d34bd95dc7c435e7 + + 273 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1.csv + + derivation + is source of + + UUID + 5701888a-898b-4faf-bf54-b768cdfd83e2 + + + UUID + 7bb87306-4917-4368-8cd8-c9d718aa1f32 + + + + derivation + is source of + + UUID + 8d41b701-49a4-4693-8052-fe6b3f032921 + + + UUID + 447430bb-32ad-4120-81a2-81f11612d18c + + + + + + + + + + + + UUID + 59153d1d-ae40-4232-990d-6f0f8ddf723a + + ingestion + 2020-09-22T15:09:17.456222+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 6d05feaf-ca3e-4069-9cce-bf8b18c90ca8 + + message digest calculation + 2020-09-22T15:09:17.824435+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 455db507707fc489cc655ea8cfec6af5f1fa05d9d022cbe0d34bd95dc7c435e7 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 61210a7b-6782-43f1-aeaa-feeaff646f0d + + virus check + 2020-09-22T15:09:21.199489+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 7bb87306-4917-4368-8cd8-c9d718aa1f32 + + derivation + 2020-09-22T15:09:31.613047+00:00 + + + + + + + %transferDirectory%objects/afternoon-snacks-1.RData + + + + URI + https://dataverse.scholarsportal.info + + + + + + + + + + + UUID + 447430bb-32ad-4120-81a2-81f11612d18c + + derivation + 2020-09-22T15:09:31.626165+00:00 + + + + + + + %transferDirectory%objects/afternoon-snacks-1.tab + + + + URI + https://dataverse.scholarsportal.info + + + + + + + + + + + UUID + fde75dfe-4fe3-4364-99ec-957a18083c35 + + fixity check + 2020-09-22T15:09:31.672963+00:00 + + program="python"; 
module="hashlib.md5()" + + + Pass + + Dataverse checksum 84bca41c1557b4aecd3923f9969aeed0 verified + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + URI + https://dataverse.scholarsportal.info + + Archivematica Dataverse Transfer + Dataverse + + + + + + + + + + + + UUID + 8d41b701-49a4-4693-8052-fe6b3f032921 + + + 0 + + sha256 + 740a4459f05185250920ac2f872891d6d81fafe64773ad06686202e8d6b796f6 + + 265 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1.tab + + derivation + has source + + UUID + d71a73f5-a5a4-41c8-8586-cdf35a758f64 + + + UUID + 447430bb-32ad-4120-81a2-81f11612d18c + + + + + + + + + + + + UUID + 0c805f50-28fe-4de8-8785-2a7e5ee4ce7f + + ingestion + 2020-09-22T15:09:17.327088+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + fc884ffd-cfdc-4193-afa3-b82985c2b486 + + message digest calculation + 2020-09-22T15:09:17.705939+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 740a4459f05185250920ac2f872891d6d81fafe64773ad06686202e8d6b796f6 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 4032d673-2b56-4566-87bc-726f15d2a85a + + virus check + 2020-09-22T15:09:21.290860+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + 
Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 5cf6ce6c-324a-4ea7-ae94-0a7bc913be0c + + + 0 + + sha256 + feabd3fd18603bb7c7431e2128b82583b79843562cdb7c87965c585e048f8b09 + + 321 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1citation-bib.bib + + + + + + + + + + UUID + f7988539-bc6a-48b7-b8f0-9d51c41c244b + + ingestion + 2020-09-22T15:09:17.387115+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 2ffa4653-83f1-4b1f-9739-2eab627e211e + + message digest calculation + 2020-09-22T15:09:17.728713+00:00 + + program="python"; module="hashlib.sha256()" + + + + + feabd3fd18603bb7c7431e2128b82583b79843562cdb7c87965c585e048f8b09 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 4a78a713-8a38-4dc1-88d8-c6ffe939cb5e + + virus check + 2020-09-22T15:09:21.599825+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 7226a3c7-a83b-44eb-ad0e-76bf706bf569 + + + 0 + + sha256 + 5655c06f492a46fd9a4d22031849728f5be7eb5685689297886c18d23f5490b3 + + 808 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1citation-endnote.xml + + + + + + + + + + UUID + 
5cee4599-0521-4459-949e-4c2b0484614d + + ingestion + 2020-09-22T15:09:17.502903+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 8ed91e3c-a872-4025-9997-42c7c2df5169 + + message digest calculation + 2020-09-22T15:09:17.775993+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 5655c06f492a46fd9a4d22031849728f5be7eb5685689297886c18d23f5490b3 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 0a9202fd-498e-470b-9c55-b9a4a863083a + + virus check + 2020-09-22T15:09:20.920262+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 45d6528a-65a3-4c9d-b710-298410c21b84 + + + 0 + + sha256 + 9bea8cc461f0b145ceef4c8e732244c0c974217ec6a34df1fcf2628f889367c9 + + 473 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/afternoon-snacks-1citation-ris.ris + + + + + + + + + + UUID + c2e64b19-3b51-41ac-870b-fe3c5563244b + + ingestion + 2020-09-22T15:09:17.479558+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 9a7cb6eb-ac5e-4e46-83d0-f2324f419a0a + + message digest calculation + 2020-09-22T15:09:17.680923+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 9bea8cc461f0b145ceef4c8e732244c0c974217ec6a34df1fcf2628f889367c9 + + + + preservation system + 
Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + aa441498-d2da-4a99-8e00-c787dce07d84 + + virus check + 2020-09-22T15:09:21.558983+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + d736f5c9-07c0-4a66-aa09-d8fccabf3f14 + + + 0 + + sha256 + 410f5b304ecf53175196780ea26bfc3d80dcff1a54d3adbad28f4dbb1905d548 + + 15 + + + Unknown + + + + 2020-01-15 + + + %transferDirectory%objects/cake-descriptions.txt + + + + + + + + + + UUID + 645b0bc2-4e4c-43f2-8efc-bfecc4774a86 + + ingestion + 2020-09-22T15:09:17.433133+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 1a4677ff-f13a-4265-8bcb-139775d56a1e + + message digest calculation + 2020-09-22T15:09:17.800532+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 410f5b304ecf53175196780ea26bfc3d80dcff1a54d3adbad28f4dbb1905d548 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 3cf641d4-6ce6-439b-abea-ef2736e9cdf6 + + virus check + 2020-09-22T15:09:21.098716+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + c7db47ca-e8c1-4289-8d92-c28f79d462a1 + + fixity check + 
2020-09-22T15:09:31.650865+00:00 + + program="python"; module="hashlib.md5()" + + + Pass + + Dataverse checksum 9672a5adc5bf80ddb69fc404203f4dae verified + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 1404b577-e179-4b2f-91c2-e59b048f64ec + + + 0 + + sha256 + 472485cb54e78e720c07147a1d07a344936d0eda65cd3792569f398234f0a61d + + 7681 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/metadata/transfers/dataverse-c5fcf270-6835-433a-a2b9-010ea363f467/METS.xml + + + + + + + + + + UUID + 31c6d09e-44b9-4f67-b5af-04b846fb5f01 + + ingestion + 2020-09-22T15:09:49.471931+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 41d0de8e-8bdd-4bfc-a61d-eb53bb66023b + + message digest calculation + 2020-09-22T15:09:49.616845+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 472485cb54e78e720c07147a1d07a344936d0eda65cd3792569f398234f0a61d + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 255ca1d6-3745-4acf-8d43-c6f80da4cc1b + + virus check + 2020-09-22T15:09:52.757949+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + 
username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 76cbddce-342d-4abb-9e75-ed431c812c43 + + + 0 + + sha256 + 3cd63459daf926b17d66855b9969b9579fe57933724899557f6956e388bcb975 + + 216 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/metadata/transfers/dataverse-c5fcf270-6835-433a-a2b9-010ea363f467/agents.json + + + + + + + + + + UUID + 09919f58-3e52-450e-b79e-f5d4d5abce5c + + ingestion + 2020-09-22T15:09:49.439867+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 30f57e68-76ca-4cf7-9361-a1300e634320 + + message digest calculation + 2020-09-22T15:09:49.636800+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 3cd63459daf926b17d66855b9969b9579fe57933724899557f6956e388bcb975 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + c65702ab-e420-4f13-ba12-ddcad03fb1ac + + virus check + 2020-09-22T15:09:52.736443+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 866064b8-4299-48d0-b063-ebb731b59b02 + + + 0 + + sha256 + 19be6192867efc7fe7425429f062a90931f99431a6b8a309fef7aaaced03b787 + + 10578 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/metadata/transfers/dataverse-c5fcf270-6835-433a-a2b9-010ea363f467/dataset.json + + + + + + + + + + UUID + 2f73049c-5501-46af-8be4-d2d97940f00b + + ingestion + 
2020-09-22T15:09:49.425868+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + c49ac659-ffdb-4160-b98f-693f7e8e869c + + message digest calculation + 2020-09-22T15:09:49.598856+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 19be6192867efc7fe7425429f062a90931f99431a6b8a309fef7aaaced03b787 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + e4328e5b-d869-4bd2-b00a-73aba042cf82 + + virus check + 2020-09-22T15:09:52.788281+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + d7fa6c04-7a7b-4f96-9643-6571480e5e2b + + + 0 + + sha256 + 53a190e1e89cc2952cd0dfa642c8d38495fe1d9393c49b5240f4f2290f076815 + + 444 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/metadata/transfers/dataverse-c5fcf270-6835-433a-a2b9-010ea363f467/directory_tree.txt + + + + + + + + + + UUID + 4c512d4d-418a-4068-ac96-0f0eccd16e03 + + ingestion + 2020-09-22T15:09:49.455711+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 45188549-f2e1-40dd-be3b-474876135599 + + message digest calculation + 2020-09-22T15:09:49.654973+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 53a190e1e89cc2952cd0dfa642c8d38495fe1d9393c49b5240f4f2290f076815 + + + + preservation system + Archivematica-1.12 + + + 
repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + d38b4581-66f0-4606-95bb-b8cbcc217435 + + virus check + 2020-09-22T15:09:52.772356+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 02893fc3-97c8-4a8e-89d3-b12f4a78e35c + + + 0 + + sha256 + c4f07b87073efc7527bb5618cb40857147036b4b4a07e0f0003cf8e380e4ce0f + + 76927 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/submissionDocumentation/transfer-dataverse-c5fcf270-6835-433a-a2b9-010ea363f467/METS.xml + + + + + + + + + + UUID + 89a703ce-789b-4721-b5a3-1539c0efc59d + + ingestion + 2020-09-22T15:09:38.792184+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 7340f8f5-601a-4c8d-be9e-ba66a68acff1 + + message digest calculation + 2020-09-22T15:09:38.918917+00:00 + + program="python"; module="hashlib.sha256()" + + + + + c4f07b87073efc7527bb5618cb40857147036b4b4a07e0f0003cf8e380e4ce0f + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + c36ec20d-c712-4d78-b9be-3813229d3b00 + + virus check + 2020-09-22T15:09:41.507438+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + 
Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/AIPscan/Aggregator/tests/fixtures/original_name_mets/document-empty-dirs.xml b/AIPscan/Aggregator/tests/fixtures/original_name_mets/document-empty-dirs.xml new file mode 100644 index 00000000..c22fac3d --- /dev/null +++ b/AIPscan/Aggregator/tests/fixtures/original_name_mets/document-empty-dirs.xml @@ -0,0 +1,859 @@ + + + + + + + + + UUID + bdcc9374-1ed8-40c5-a75d-69515c5680bb + + %transferDirectory%objects/E1/ + + + + + + + + + + UUID + 595128b8-fd1e-48d5-90fd-d5a202618888 + + %transferDirectory%objects/0B/E1/ + + + + + + + + + + UUID + 97f78cc1-e6fd-483d-a139-18f633bc865c + + empty-dirs-97f78cc1-e6fd-483d-a139-18f633bc865c + + + + + + + + + + UUID + 6c7c9e42-7d63-4e40-b8aa-b8e0cad32f10 + + %transferDirectory%objects/0A/ + + + + + + + + + + UUID + d68b2903-1a72-4a0e-80b2-b0ed309b4547 + + %transferDirectory%objects/0B/ + + + + + + + + + + + UUID + 503bd90a-e48a-454d-b7e3-b572682213e4 + + + 0 + + sha256 + 555da75a404558aea84bf31c812185a57cb529bc99a9600219fe558dfc7e5915 + + 131362 + + + Unknown + + + + 2018-08-02 + + + %transferDirectory%objects/0A/Cv7a7v7WAAADJTO.jpg + + + + + + + + + + UUID + 99104dc7-ab69-45dc-96e0-b43c7bf04d0f + + ingestion + 2020-09-22T14:18:34.183845+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + a6f6e9d7-bf51-43d9-875d-1a67950b4498 + + message digest calculation + 2020-09-22T14:18:34.391548+00:00 + + program="python"; 
module="hashlib.sha256()" + + + + + 555da75a404558aea84bf31c812185a57cb529bc99a9600219fe558dfc7e5915 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + e586a1fc-21fa-4996-85ef-c72a70eea36b + + virus check + 2020-09-22T14:18:38.095057+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 7bdebeb1-333b-4ab3-8d5c-58d78ded8fb1 + + + 0 + + sha256 + 555da75a404558aea84bf31c812185a57cb529bc99a9600219fe558dfc7e5915 + + 131362 + + + Unknown + + + + 2018-08-02 + + + %transferDirectory%objects/0B/Cv7a7v7WAAADJTO.jpg + + + + + + + + + + UUID + 33a4dc8c-25a6-4dc4-ab95-ccb556aa3e7f + + ingestion + 2020-09-22T14:18:34.155799+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + db2ceb45-3d82-41fc-9fef-d47317d7e883 + + message digest calculation + 2020-09-22T14:18:34.361410+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 555da75a404558aea84bf31c812185a57cb529bc99a9600219fe558dfc7e5915 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 7e17718d-4c4e-46e1-8a99-2eea0135c233 + + virus check + 2020-09-22T14:18:38.139152+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + 
test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + 8f3d2d50-d108-4f0a-ba64-77ad1c684f2a + + + 0 + + sha256 + 59de494b66789afceab7b2d9e3d134df69c800027f2f217d63193d743358d52a + + 272 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/metadata/transfers/empty-dirs-48724172-6d8e-4e12-bc2b-be03956dc952/directory_tree.txt + + + + + + + + + + UUID + 82fb085a-db84-4ab7-b400-8b27f6d80b56 + + ingestion + 2020-09-22T14:19:08.889276+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 5e25e3b7-3407-4638-babf-f08b7267617f + + message digest calculation + 2020-09-22T14:19:09.008634+00:00 + + program="python"; module="hashlib.sha256()" + + + + + 59de494b66789afceab7b2d9e3d134df69c800027f2f217d63193d743358d52a + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 520bb5cc-2687-48f2-8f15-4e6fd5f4626c + + virus check + 2020-09-22T14:19:12.060040+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + UUID + fa89820a-02c3-411c-92cc-f3b6fa68ef6a + + + 0 + + sha256 + c60c883f81d12c8512f973ebff94788075336ebb031b64d014fa68aab8f3c717 
+ + 26708 + + + Unknown + + + + 2020-09-22 + + + %SIPDirectory%objects/submissionDocumentation/transfer-empty-dirs-48724172-6d8e-4e12-bc2b-be03956dc952/METS.xml + + + + + + + + + + UUID + 504fda12-b9ee-4e1c-9e33-dda7240608d6 + + ingestion + 2020-09-22T14:18:58.431054+00:00 + + + + + + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 377a99a6-175b-49e8-8b0a-d605b4ac3fbe + + message digest calculation + 2020-09-22T14:18:58.498914+00:00 + + program="python"; module="hashlib.sha256()" + + + + + c60c883f81d12c8512f973ebff94788075336ebb031b64d014fa68aab8f3c717 + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + UUID + 2ad3dfdc-927d-4a8c-96dc-b416740d064d + + virus check + 2020-09-22T14:19:01.779518+00:00 + + program="ClamAV (clamd)"; version="ClamAV 0.99.2"; virusDefinitions="25934/Mon Sep 21 13:52:04 2020" + + + Pass + + + + + + preservation system + Archivematica-1.12 + + + repository code + test + + + Archivematica user pk + 1 + + + + + + + + + + + preservation system + Archivematica-1.12 + + Archivematica + software + + + + + + + + + + repository code + test + + test + organization + + + + + + + + + + Archivematica user pk + 1 + + username="test", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/AIPscan/Aggregator/tests/test_mets.py b/AIPscan/Aggregator/tests/test_mets.py index f912f309..89602583 100644 --- a/AIPscan/Aggregator/tests/test_mets.py +++ b/AIPscan/Aggregator/tests/test_mets.py @@ -13,19 +13,32 @@ @pytest.mark.parametrize( - "fixture_path, transfer_name", + "fixture_path, transfer_name, mets_error", [ - (os.path.join("features_mets", "features-mets.xml"), "myTransfer"), - (os.path.join("iso_mets", 
"iso_mets.xml"), "iso"), + (os.path.join("features_mets", "features-mets.xml"), "myTransfer", False), + (os.path.join("iso_mets", "iso_mets.xml"), "iso", False), + ( + os.path.join("original_name_mets", "document-empty-dirs.xml"), + "empty-dirs", + False, + ), + # Exception: Cannot disambiguate dmdSec_1 in the METS using + # mets-reader-writer and so we cannot retrieve the originalName. + (os.path.join("original_name_mets", "dataverse_example.xml"), "", True), ], ) -def test_get_aip_original_name(fixture_path, transfer_name): +def test_get_aip_original_name(fixture_path, transfer_name, mets_error): """Make sure that we can reliably get original name from the METS file given we haven't any mets-reader-writer helpers. """ script_dir = os.path.dirname(os.path.realpath(__file__)) mets_file = os.path.join(script_dir, FIXTURES_DIR, fixture_path) mets = metsrw.METSDocument.fromfile(mets_file) + if mets_error: + # Function should raise an error to work with. + with pytest.raises(mets_parse_helpers.METSError): + _ = mets_parse_helpers.get_aip_original_name(mets) + return assert mets_parse_helpers.get_aip_original_name(mets) == transfer_name # Test the same works with a string. with open(mets_file, "rb") as mets_stream: