diff --git a/python/nistoar/rmm/mongo/loader.py b/python/nistoar/rmm/mongo/loader.py index c131ed8..8ceff5c 100644 --- a/python/nistoar/rmm/mongo/loader.py +++ b/python/nistoar/rmm/mongo/loader.py @@ -109,7 +109,9 @@ def load_data(self, data, key=None, onupdate='quiet'): warning will be issued. If set to 'fail', an exception will be raised. If it is a function, it will be executed before loading the new data. It - should take data and key as arguments; it should + should take data and key as arguments where data will + be the previously saved record and key will be the MongoDB + key that was used to select it; this function should return True if the new data should then be loaded or False if it should not. """ diff --git a/python/nistoar/rmm/mongo/nerdm.py b/python/nistoar/rmm/mongo/nerdm.py index a1a97cf..c6d6f99 100644 --- a/python/nistoar/rmm/mongo/nerdm.py +++ b/python/nistoar/rmm/mongo/nerdm.py @@ -1,7 +1,8 @@ """ load NERDm records into the RMM's MongoDB database """ -import json, os, sys +# import pandas as pd +import json, os, sys, warnings from collections import Mapping from .loader import (Loader, RecordIngestError, JSONEncodingError, @@ -10,6 +11,7 @@ from nistoar.nerdm import utils from nistoar.nerdm.convert.rmm import NERDmForRMM + DEF_BASE_SCHEMA = "https://data.nist.gov/od/dm/nerdm-schema/v0.5#" DEF_SCHEMA = DEF_BASE_SCHEMA + "/definitions/Resource" @@ -99,6 +101,21 @@ class LatestLoader(_NERDmRenditionLoader): def __init__(self, dburl, schemadir, log=None): super(NERDmLoader.LatestLoader, self).__init__(LATEST_COLLECTION_NAME, dburl, schemadir, log) + def load_data(self, data, key=None, onupdate='quiet'): + added = super().load_data(data, key, onupdate) + if added: + # initialize the metrics collections as needed + try: + init_metrics_for(self._db, data) + except Exception as ex: + msg = "Failure detected while initializing Metric data for %s: %s" % \ + (data.get("@id", "unknown record"), str(ex)) + if self.log: + self.log.warning(msg) + else: + 
warnings.warn(msg, UpdateWarning) + return added + class ReleaseSetLoader(_NERDmRenditionLoader): def __init__(self, dburl, schemadir, log=None): super(NERDmLoader.ReleaseSetLoader, self).__init__(RELEASES_COLLECTION_NAME, dburl, schemadir, log) @@ -266,3 +283,108 @@ def load_from_dir(self, dirpath, validate=True, results=None): return results +def init_metrics_for(db, nerdm): + """ + initialize the metrics-related collections for the dataset described in the given NERDm record + as needed. + + This function assumes that the given NERDm record is the latest description of the dataset. + It should not be called with NERDm records describing earlier versions of the dataset. + + :param Database db: the MongoDB Database instance that contains the metrics collections. + This instance will have come from a MongoDB client that is already + connected to a backend server. + :param dict nerdm: the NERDm record to initialize for. + """ + #Convert nerdm dict to an array of dict + #nerdm_use = [nerdm] + + record_collection_fields = { + "pdrid": None, + "ediid":None, + "first_time_logged": None, + "last_time_logged": None, + "total_size_download":0, + "success_get":0, + "number_users":0, + "record_download":0, + "ip_list":[]} + + #Record fields to be copied + record_fields = ['pdrid', 'ediid'] + + files_collection_fields = { + "pdrid": None, + "ediid":None, + "filesize": 0, + "success_get" : 0, + "failure_get" : 0, + "datacart_or_client" : 0, + "total_size_download": 0, + "number_users" : 0, + "ip_list": [], + "first_time_logged" : None, + "last_time_logged" : None, + "downloadURL": None + } + + nerdm['pdrid'] = nerdm.pop('@id') + records = {} + #Copy fields + for field in record_fields: + records[field] = nerdm[field] + + #Initialize record fields + for col in record_collection_fields.keys(): + if col not in records.keys(): + records[col] = record_collection_fields[col] + + if(db["recordMetrics"].find_one({"ediid": nerdm["ediid"]}) is None): + db["recordMetrics"].insert_one(records) 
+ + #Get files from record components + files = flatten_records(nerdm, files_collection_fields) + files_to_update = [] + + current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]}) + current_files_filepaths = [x["filepath"] for x in current_files] + for file_item in files: + if 'filepath' in file_item.keys(): + if file_item['filepath'] not in current_files_filepaths: + files_to_update.append(file_item) + + if len(files_to_update)>0: + db["fileMetrics"].insert_many(files_to_update) + +# This takes a nerdm record and collects the file-related data from its components. +# Inputs are record=the nerdm record to be updated +# initialize_fields=the fileMetrics fields to be initialized +def flatten_records(record, initialize_fields): + files = [] + keys_to_keep = ['filepath', 'size', 'downloadURL', 'ediid', '@id'] + for component in record['components']: + file_dict = {} + #Initialize fields + for key in initialize_fields.keys(): + file_dict[key] = initialize_fields[key] + #Get file information + + if 'filepath' in component.keys(): + for key in keys_to_keep: + if key in component.keys(): + file_dict[key] = component[key] + + if 'size' in file_dict.keys(): + file_dict['filesize'] = file_dict.pop('size') + else: + file_dict['filesize'] = 0 + + if 'downloadURL' not in component.keys(): + file_dict['downloadURL'] = '' + + file_dict['pdrid'] = record['pdrid'] + file_dict['ediid'] = record['ediid'] + + files.append(file_dict) + return files + diff --git a/python/tests/nistoar/rmm/mongo/test_nerdm.py b/python/tests/nistoar/rmm/mongo/test_nerdm.py index c1409b7..010e814 100644 --- a/python/tests/nistoar/rmm/mongo/test_nerdm.py +++ b/python/tests/nistoar/rmm/mongo/test_nerdm.py @@ -1,3 +1,4 @@ + import pdb, os, json, urllib.parse, warnings, logging import unittest as test from pymongo import MongoClient @@ -11,6 +12,7 @@ schemadir = os.path.join(basedir, "model") exdir = os.path.join(schemadir, "examples") janaffile = os.path.join(exdir, "janaf.json") +pdrfile = os.path.join(exdir, 
"mds2-2106.json") dburl = None if os.environ.get('MONGO_TESTDB_URL'): @@ -32,6 +34,12 @@ def tearDown(self): if not hasattr(client, 'get_database'): client.get_database = client.get_default_database db = client.get_database() + if "recordMetrics" in db.list_collection_names(): + db.drop_collection("recordMetrics") + if "fileMetrics" in db.list_collection_names(): + db.drop_collection("fileMetrics") + db.create_collection("recordMetrics") + db.create_collection("fileMetrics") if "record" in db.list_collection_names(): db.drop_collection("record") if "versions" in db.list_collection_names(): @@ -177,6 +185,25 @@ def test_load_from_file(self): self.assertEqual(self.ldr._client.get_database().versions.count_documents({}), 1) self.assertEqual(self.ldr._client.get_database().releasesets.count_documents({}), 1) + def test_init_metrics_for(self): + with open(pdrfile) as fd: + rec = json.load(fd) + + # this record has files in it + self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])])) + + self.ldr.connect() + database = self.ldr._db + nerdm.init_metrics_for(database, rec) + c = self.ldr._client.get_database().recordMetrics.find() + self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') + c = self.ldr._client.get_database().fileMetrics.find() + self.assertEqual(c[0]['pdrid'], 'ark:/88434/mds2-2106') + self.assertEqual(c[0]['filepath'], "NIST_NPL_InterlabData2019.csv.sha256") + # replace this with checks of successful loading into the database + #self.fail("Tests not implemented") + + if __name__ == '__main__':