From ffad65607e46ac5157ff3f1e3b5326da10080736 Mon Sep 17 00:00:00 2001 From: RayPlante Date: Thu, 2 Feb 2023 17:19:58 -0500 Subject: [PATCH 1/7] rmm ingest: add skeleton for init_metrics_for() --- python/nistoar/rmm/mongo/loader.py | 4 ++- python/nistoar/rmm/mongo/nerdm.py | 32 +++++++++++++++++++- python/tests/nistoar/rmm/mongo/test_nerdm.py | 17 +++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/python/nistoar/rmm/mongo/loader.py b/python/nistoar/rmm/mongo/loader.py index c131ed8..8ceff5c 100644 --- a/python/nistoar/rmm/mongo/loader.py +++ b/python/nistoar/rmm/mongo/loader.py @@ -109,7 +109,9 @@ def load_data(self, data, key=None, onupdate='quiet'): warning will be issued. If set to 'fail', an exception will be raised. If it is a function, it will be executed before loading the new data. It - should take data and key as arguments; it should + should take data and key as arguments, where data will be the + previously saved record and key will be the MongoDB key + that was used to select it; this function should return True if the new data should then be loaded or False if it should not. 
""" diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index a1a97cf..00c9bcc 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -1,7 +1,7 @@ """ load NERDm records into the RMM's MongoDB database """ -import json, os, sys +import json, os, sys, warnings from collections import Mapping from .loader import (Loader, RecordIngestError, JSONEncodingError, @@ -99,6 +99,21 @@ class LatestLoader(_NERDmRenditionLoader): def __init__(self, dburl, schemadir, log=None): super(NERDmLoader.LatestLoader, self).__init__(LATEST_COLLECTION_NAME, dburl, schemadir, log) + def load_data(self, data, key=None, onupdate='quiet'): + added = super().load_data(data, key, onupdate) + if added: + # initialize the metrics collections as needed + try: + init_metrics_for(self._db, data) + except Exception as ex: + msg = "Failure detected while initializing Metric data for %s: %s" % \ + (data.get("@id", "unknown record"), str(ex)) + if self.log: + self.log.warning(msg) + else: + warnings.warn(msg, UpdateWarning) + return added + class ReleaseSetLoader(_NERDmRenditionLoader): def __init__(self, dburl, schemadir, log=None): super(NERDmLoader.ReleaseSetLoader, self).__init__(RELEASES_COLLECTION_NAME, dburl, schemadir, log) @@ -266,3 +281,18 @@ def load_from_dir(self, dirpath, validate=True, results=None): return results +def init_metrics_for(db, nerdm): + """ + initialize the metrics-related collections for the dataset described in the given NERDm record + as needed. + + This function assumes that the given NERDm record is the latest description of the dataset. + It should not be called with NERDm records describing earlier versions of the dataset. + + :param Database db: the MongoDB Database instance that contains the metrics collections. + This instance will have come from a MongoDB client that is already + connected to a backend server. + :param dict nerdm: the NERDm record to initialize for. 
+ """ + # replace this with the actual implementation + raise NotImplementedError("init_metrics_for() not implemented") diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py b/python/tests/nistoar/rmm/mongo/test_nerdm.py index c1409b7..381a680 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -11,6 +11,7 @@ schemadir = os.path.join(basedir, "model") exdir = os.path.join(schemadir, "examples") janaffile = os.path.join(exdir, "janaf.json") +pdrfile = os.path.join(exdir, "mds2-2106.json") dburl = None if os.environ.get('MONGO_TESTDB_URL'): @@ -177,6 +178,22 @@ def test_load_from_file(self): self.assertEqual(self.ldr._client.get_database().versions.count_documents({}), 1) self.assertEqual(self.ldr._client.get_database().releasesets.count_documents({}), 1) + def test_init_metrics_for(self): + with open(pdrfile) as fd: + rec = json.load(fd) + + # this record has files in it + self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])])) + + self.ldr.connect() + database = self.ldr._db + + nerdm.init_metrics_for(database, rec) + + # replace this with checks of successful loading into the database + self.fail("Tests not implemented") + + if __name__ == '__main__': From bdbe4022ee105c78ce2b9b7308bff7f3b59e446a Mon Sep 17 00:00:00 2001 From: deoyani Date: Wed, 26 Apr 2023 11:38:04 -0400 Subject: [PATCH 2/7] Added new function to handle insert skeleton metrics for newly released data publication. 
--- python/nistoar/rmm/mongo/nerdm.py | 72 +++++++++++++++++++- python/tests/nistoar/rmm/mongo/test_nerdm.py | 15 +++- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index 00c9bcc..84cb1e0 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -1,6 +1,7 @@ """ load NERDm records into the RMM's MongoDB database """ +# import pandas as pd import json, os, sys, warnings from collections import Mapping @@ -294,5 +295,72 @@ def init_metrics_for(db, nerdm): connected to a backend server. :param dict nerdm: the NERDm record to initialize for. """ - # replace this with the actual implementation - raise NotImplementedError("init_metrics_for() not implemented") + #Convert nderm dict to an array of dict + #nerdm_use = [nerdm] + + record_collection_fields = { + "first_time_logged": None, + "last_time_logged": None, + "total_size_download":0, + "success_get":0, + "number_users":0, + "record_download":0, + "ip_list":[]} + + #Record fields to be copied + record_fields = ['pdrid', 'ediid'] + + files_collection_fields = { + "success_get" : 0, + "failure_get" : 0, + "datacart_or_client" : 0, + "download_size" : 0, + "number_users" : 0, + "ip_list": [], + "first_time_logged" : None, + "last_time_logged" : None, + } + + nerdm['pdrid'] = nerdm.pop('@id') + records = {} + #Copy fields + for field in record_fields: + records[field] = nerdm[field] + + #Initialize record fields + for col in record_collection_fields.keys(): + if col not in records.keys(): + records[col] = record_collection_fields[col] + print("RecordsMetrics Creation") + db["recordMetrics"].insert_one(records) + + + #Get files from record components + files = flatten_records(nerdm, record_fields, files_collection_fields) + print("FileMetrics Creation") + db["fileMetrics"].insert_many(files) + + +def flatten_records(record, record_fields, initialize_fields): + files = [] + keys_to_keep = ['filepath', 
'size'] + + for component in record['components']: + #Get file information + file_dict = {} + if 'filepath' in component.keys(): + for key in keys_to_keep: + if key in component.keys(): + file_dict[key] = component[key] + if 'size' in file_dict.keys(): + file_dict['filesize'] = file_dict.pop('size') + else: + file_dict['filesize'] = 0 + #Get record information + for key in record_fields: + file_dict[key] = record[key] + #Initialize other fields + for key in initialize_fields.keys(): + file_dict[key] = initialize_fields[key] + files.append(file_dict) + return files \ No newline at end of file diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py b/python/tests/nistoar/rmm/mongo/test_nerdm.py index 381a680..102a5ab 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -1,3 +1,4 @@ + import pdb, os, json, urllib.parse, warnings, logging import unittest as test from pymongo import MongoClient @@ -33,6 +34,12 @@ def tearDown(self): if not hasattr(client, 'get_database'): client.get_database = client.get_default_database db = client.get_database() + if "recordMetrics" in db.list_collection_names(): + db.drop_collection("recordMetrics") + if "fileMetrics" in db.list_collection_names(): + db.drop_collection("fileMetrics") + db.create_collection("recordMetrics") + db.create_collection("fileMetrics") if "record" in db.list_collection_names(): db.drop_collection("record") if "versions" in db.list_collection_names(): @@ -187,11 +194,13 @@ def test_init_metrics_for(self): self.ldr.connect() database = self.ldr._db - nerdm.init_metrics_for(database, rec) - + c = self.ldr._client.get_database().recordMetrics.find() + self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') + c = self.ldr._client.get_database().fileMetrics.find() + self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') # replace this with checks of successful loading into the database - self.fail("Tests not implemented") + #self.fail("Tests not 
implemented") From 09a2ac4af9e909b8c99f753ec448671a36eaa9ed Mon Sep 17 00:00:00 2001 From: deoyani Date: Wed, 6 Sep 2023 11:21:15 -0400 Subject: [PATCH 3/7] Updated the metrics ingestion part --- python/nistoar/rmm/mongo/nerdm.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index 84cb1e0..60bade6 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -299,6 +299,8 @@ def init_metrics_for(db, nerdm): #nerdm_use = [nerdm] record_collection_fields = { + "pdrid": None, + "ediid":None, "first_time_logged": None, "last_time_logged": None, "total_size_download":0, @@ -311,14 +313,18 @@ def init_metrics_for(db, nerdm): record_fields = ['pdrid', 'ediid'] files_collection_fields = { + "pdrid": None, + "ediid":None, + "filesize": 0, "success_get" : 0, "failure_get" : 0, "datacart_or_client" : 0, - "download_size" : 0, + "total_size_download": 0, "number_users" : 0, "ip_list": [], "first_time_logged" : None, "last_time_logged" : None, + "downloadURL": None } nerdm['pdrid'] = nerdm.pop('@id') @@ -343,7 +349,7 @@ def init_metrics_for(db, nerdm): def flatten_records(record, record_fields, initialize_fields): files = [] - keys_to_keep = ['filepath', 'size'] + keys_to_keep = ['filepath', 'size', 'downloadURL'] for component in record['components']: #Get file information @@ -356,6 +362,9 @@ def flatten_records(record, record_fields, initialize_fields): file_dict['filesize'] = file_dict.pop('size') else: file_dict['filesize'] = 0 + + if 'downloadURL' not in component.keys(): + file_dict['downloadURL'] = '' #Get record information for key in record_fields: file_dict[key] = record[key] @@ -363,4 +372,5 @@ def flatten_records(record, record_fields, initialize_fields): for key in initialize_fields.keys(): file_dict[key] = initialize_fields[key] files.append(file_dict) - return files \ No newline at end of file + return files + From 
7713bf93e6effce51221d0b6ae67a472282b6326 Mon Sep 17 00:00:00 2001 From: deoyani Date: Fri, 8 Dec 2023 14:55:29 -0500 Subject: [PATCH 4/7] Updated the issue with unit test failure for metrics skeleton creation --- python/nistoar/rmm/mongo/nerdm.py | 12 +++++------- python/tests/nistoar/rmm/mongo/test_nerdm.py | 4 +++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index 60bade6..d3f88c4 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -297,7 +297,6 @@ def init_metrics_for(db, nerdm): """ #Convert nderm dict to an array of dict #nerdm_use = [nerdm] - record_collection_fields = { "pdrid": None, "ediid":None, @@ -313,7 +312,7 @@ def init_metrics_for(db, nerdm): record_fields = ['pdrid', 'ediid'] files_collection_fields = { - "pdrid": None, + "pdrid": None, "ediid":None, "filesize": 0, "success_get" : 0, @@ -326,8 +325,8 @@ def init_metrics_for(db, nerdm): "last_time_logged" : None, "downloadURL": None } - nerdm['pdrid'] = nerdm.pop('@id') + records = {} #Copy fields for field in record_fields: @@ -337,13 +336,11 @@ def init_metrics_for(db, nerdm): for col in record_collection_fields.keys(): if col not in records.keys(): records[col] = record_collection_fields[col] - print("RecordsMetrics Creation") db["recordMetrics"].insert_one(records) #Get files from record components files = flatten_records(nerdm, record_fields, files_collection_fields) - print("FileMetrics Creation") db["fileMetrics"].insert_many(files) @@ -370,7 +367,8 @@ def flatten_records(record, record_fields, initialize_fields): file_dict[key] = record[key] #Initialize other fields for key in initialize_fields.keys(): - file_dict[key] = initialize_fields[key] - files.append(file_dict) + if(key not in file_dict.keys()): + file_dict[key] = initialize_fields[key] + files.append(file_dict) return files diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py 
b/python/tests/nistoar/rmm/mongo/test_nerdm.py index 102a5ab..f390bee 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -188,7 +188,8 @@ def test_load_from_file(self): def test_init_metrics_for(self): with open(pdrfile) as fd: rec = json.load(fd) - + + # print(" **** TEST ****", rec) # this record has files in it self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])])) @@ -198,6 +199,7 @@ def test_init_metrics_for(self): c = self.ldr._client.get_database().recordMetrics.find() self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') c = self.ldr._client.get_database().fileMetrics.find() + # print(" TEST ****", c[0]) self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') # replace this with checks of successful loading into the database #self.fail("Tests not implemented") From 4bd71d26dc422c07fb873cf89afb49b6128b2084 Mon Sep 17 00:00:00 2001 From: deoyani Date: Wed, 13 Dec 2023 20:06:16 -0500 Subject: [PATCH 5/7] updated code --- python/nistoar/rmm/mongo/nerdm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index d3f88c4..7376adf 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -10,6 +10,7 @@ from .loader import ValidationError, SchemaError, RefResolutionError from nistoar.nerdm import utils from nistoar.nerdm.convert.rmm import NERDmForRMM +import pdb DEF_BASE_SCHEMA = "https://data.nist.gov/od/dm/nerdm-schema/v0.5#" DEF_SCHEMA = DEF_BASE_SCHEMA + "/definitions/Resource" @@ -336,11 +337,14 @@ def init_metrics_for(db, nerdm): for col in record_collection_fields.keys(): if col not in records.keys(): records[col] = record_collection_fields[col] - db["recordMetrics"].insert_one(records) - + if(db["recordMetrics"].find_one({"ediid": nerdm["ediid"]}) is None): + db["recordMetrics"].insert_one(records) #Get files from record components 
files = flatten_records(nerdm, record_fields, files_collection_fields) + current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]}) + pdb.set_trace() + db["fileMetrics"].insert_many(files) From 3c6621c6f4d1c08e277766271cec9669c38e8532 Mon Sep 17 00:00:00 2001 From: deoyani Date: Mon, 18 Dec 2023 07:29:46 -0500 Subject: [PATCH 6/7] Fixing issues and conflicts --- python/nistoar/rmm/mongo/nerdm.py | 21 +++++++++++--------- python/tests/nistoar/rmm/mongo/test_nerdm.py | 4 +--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index 7376adf..616a1a1 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -10,7 +10,7 @@ from .loader import ValidationError, SchemaError, RefResolutionError from nistoar.nerdm import utils from nistoar.nerdm.convert.rmm import NERDmForRMM -import pdb + DEF_BASE_SCHEMA = "https://data.nist.gov/od/dm/nerdm-schema/v0.5#" DEF_SCHEMA = DEF_BASE_SCHEMA + "/definitions/Resource" @@ -298,6 +298,7 @@ def init_metrics_for(db, nerdm): """ #Convert nderm dict to an array of dict #nerdm_use = [nerdm] + record_collection_fields = { "pdrid": None, "ediid":None, @@ -313,7 +314,7 @@ def init_metrics_for(db, nerdm): record_fields = ['pdrid', 'ediid'] files_collection_fields = { - "pdrid": None, + "pdrid": None, "ediid":None, "filesize": 0, "success_get" : 0, @@ -326,8 +327,8 @@ def init_metrics_for(db, nerdm): "last_time_logged" : None, "downloadURL": None } + nerdm['pdrid'] = nerdm.pop('@id') - records = {} #Copy fields for field in record_fields: @@ -342,10 +343,13 @@ def init_metrics_for(db, nerdm): #Get files from record components files = flatten_records(nerdm, record_fields, files_collection_fields) + files_to_update = [] current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]}) - pdb.set_trace() - - db["fileMetrics"].insert_many(files) + current_files_filepaths = [x["filepath"] for x in current_files] + for file_item in 
files: + if file_item['filepath'] not in current_files_filepaths: + files_to_update.append(file_item) + db["fileMetrics"].insert_many(files_to_update) def flatten_records(record, record_fields, initialize_fields): @@ -371,8 +375,7 @@ def flatten_records(record, record_fields, initialize_fields): file_dict[key] = record[key] #Initialize other fields for key in initialize_fields.keys(): - if(key not in file_dict.keys()): - file_dict[key] = initialize_fields[key] - files.append(file_dict) + file_dict[key] = initialize_fields[key] + files.append(file_dict) return files diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py b/python/tests/nistoar/rmm/mongo/test_nerdm.py index f390bee..102a5ab 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -188,8 +188,7 @@ def test_load_from_file(self): def test_init_metrics_for(self): with open(pdrfile) as fd: rec = json.load(fd) - - # print(" **** TEST ****", rec) + # this record has files in it self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])])) @@ -199,7 +198,6 @@ def test_init_metrics_for(self): c = self.ldr._client.get_database().recordMetrics.find() self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') c = self.ldr._client.get_database().fileMetrics.find() - # print(" TEST ****", c[0]) self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') # replace this with checks of successful loading into the database #self.fail("Tests not implemented") From 654313de9a98d27cbd893616c5c4908a2e99b210 Mon Sep 17 00:00:00 2001 From: deoyani Date: Mon, 18 Dec 2023 14:48:52 -0500 Subject: [PATCH 7/7] Updated code and added a test. 
--- python/nistoar/rmm/mongo/nerdm.py | 41 ++++++++++++-------- python/tests/nistoar/rmm/mongo/test_nerdm.py | 1 + 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index 616a1a1..c6d6f99 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -338,31 +338,42 @@ def init_metrics_for(db, nerdm): for col in record_collection_fields.keys(): if col not in records.keys(): records[col] = record_collection_fields[col] + if(db["recordMetrics"].find_one({"ediid": nerdm["ediid"]}) is None): db["recordMetrics"].insert_one(records) #Get files from record components - files = flatten_records(nerdm, record_fields, files_collection_fields) + files = flatten_records(nerdm, files_collection_fields) files_to_update = [] + current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]}) current_files_filepaths = [x["filepath"] for x in current_files] for file_item in files: - if file_item['filepath'] not in current_files_filepaths: - files_to_update.append(file_item) - db["fileMetrics"].insert_many(files_to_update) + if 'filepath' in file_item.keys(): + if file_item['filepath'] not in current_files_filepaths: + files_to_update.append(file_item) + + if len(files_to_update)>0: + db["fileMetrics"].insert_many(files_to_update) - -def flatten_records(record, record_fields, initialize_fields): +# This takes a nerdm record and collects the file-related data from components. 
+# Inputs are record=nerdm to be updated +# initialize fields=fileMetrics fields to be updated +def flatten_records(record, initialize_fields): files = [] - keys_to_keep = ['filepath', 'size', 'downloadURL'] - + keys_to_keep = ['filepath', 'size', 'downloadURL', 'ediid', '@id'] for component in record['components']: - #Get file information file_dict = {} - if 'filepath' in component.keys(): + #Initialize fields + for key in initialize_fields.keys(): + file_dict[key] = initialize_fields[key] + #Get file information + + if 'filepath' in component.keys(): for key in keys_to_keep: if key in component.keys(): file_dict[key] = component[key] + if 'size' in file_dict.keys(): file_dict['filesize'] = file_dict.pop('size') else: @@ -370,12 +381,10 @@ def flatten_records(record, record_fields, initialize_fields): if 'downloadURL' not in component.keys(): file_dict['downloadURL'] = '' - #Get record information - for key in record_fields: - file_dict[key] = record[key] - #Initialize other fields - for key in initialize_fields.keys(): - file_dict[key] = initialize_fields[key] + + file_dict['pdrid'] = record['pdrid'] + file_dict['ediid'] = record['ediid'] + files.append(file_dict) return files diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py b/python/tests/nistoar/rmm/mongo/test_nerdm.py index 102a5ab..010e814 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -199,6 +199,7 @@ def test_init_metrics_for(self): self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') c = self.ldr._client.get_database().fileMetrics.find() self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') + self.assertEqual(c[0]['filepath'], "NIST_NPL_InterlabData2019.csv.sha256") # replace this with checks of successful loading into the database #self.fail("Tests not implemented")