Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/ingest2metrics #58

Merged
merged 8 commits into from
Jan 11, 2024
Merged
4 changes: 3 additions & 1 deletion python/nistoar/rmm/mongo/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ def load_data(self, data, key=None, onupdate='quiet'):
warning will be issued. If set to 'fail', an
exception will be raised. If it is a function, it
will be executed before loading the new data. It
should take data and key as arguments; it should
should take data and key as arguments, where data will be the
previously saved record and key will be the MongoDB key
that was used to select it; this function should
return True if the new data should then be loaded
or False if it should not.
"""
Expand Down
124 changes: 123 additions & 1 deletion python/nistoar/rmm/mongo/nerdm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
load NERDm records into the RMM's MongoDB database
"""
import json, os, sys
# import pandas as pd
import json, os, sys, warnings
from collections import Mapping

from .loader import (Loader, RecordIngestError, JSONEncodingError,
Expand All @@ -10,6 +11,7 @@
from nistoar.nerdm import utils
from nistoar.nerdm.convert.rmm import NERDmForRMM


DEF_BASE_SCHEMA = "https://data.nist.gov/od/dm/nerdm-schema/v0.5#"
DEF_SCHEMA = DEF_BASE_SCHEMA + "/definitions/Resource"

Expand Down Expand Up @@ -99,6 +101,21 @@ class LatestLoader(_NERDmRenditionLoader):
def __init__(self, dburl, schemadir, log=None):
super(NERDmLoader.LatestLoader, self).__init__(LATEST_COLLECTION_NAME, dburl, schemadir, log)

def load_data(self, data, key=None, onupdate='quiet'):
    """
    load the given NERDm record into the latest collection and, if it was
    actually added, make sure the metrics collections are initialized for it.

    A failure while initializing the metrics data is reported (via the
    configured log or a warning) but does not undo or fail the load itself.

    :return: the result of the superclass's load_data() call
    """
    loaded = super().load_data(data, key, onupdate)
    if not loaded:
        return loaded

    # the record went in; set up the metrics collections as needed
    try:
        init_metrics_for(self._db, data)
    except Exception as ex:
        msg = "Failure detected while initializing Metric data for %s: %s" % \
              (data.get("@id", "unknown record"), str(ex))
        if self.log:
            self.log.warning(msg)
        else:
            warnings.warn(msg, UpdateWarning)
    return loaded

class ReleaseSetLoader(_NERDmRenditionLoader):
    """
    a loader that stores NERDm records into the release-set collection
    (named by RELEASES_COLLECTION_NAME).
    """
    def __init__(self, dburl, schemadir, log=None):
        # delegate to the rendition loader, fixing the target collection name
        super(NERDmLoader.ReleaseSetLoader, self).__init__(RELEASES_COLLECTION_NAME, dburl, schemadir, log)
Expand Down Expand Up @@ -266,3 +283,108 @@ def load_from_dir(self, dirpath, validate=True, results=None):

return results

def init_metrics_for(db, nerdm):
    """
    initialize the metrics-related collections for the dataset described in the given
    NERDm record as needed.

    This function assumes that the given NERDm record is the latest description of the
    dataset.  It should not be called with NERDm records describing earlier versions of
    the dataset.

    :param Database db: the MongoDB Database instance that contains the metrics
                        collections.  This instance will have come from a MongoDB
                        client that is already connected to a backend server.
    :param dict nerdm:  the NERDm record to initialize for.  It must contain "@id"
                        and "ediid" properties; a KeyError is raised otherwise.
    """
    # the zeroed-out starting state for a new recordMetrics document
    record_collection_fields = {
        "pdrid": None,
        "ediid": None,
        "first_time_logged": None,
        "last_time_logged": None,
        "total_size_download": 0,
        "success_get": 0,
        "number_users": 0,
        "record_download": 0,
        "ip_list": []
    }

    # record properties to carry over into the recordMetrics document
    record_fields = ['pdrid', 'ediid']

    # the zeroed-out starting state for a new fileMetrics document
    files_collection_fields = {
        "pdrid": None,
        "ediid": None,
        "filesize": 0,
        "success_get": 0,
        "failure_get": 0,
        "datacart_or_client": 0,
        "total_size_download": 0,
        "number_users": 0,
        "ip_list": [],
        "first_time_logged": None,
        "last_time_logged": None,
        "downloadURL": None
    }

    # work on a shallow copy so the caller's record is not modified
    # (previously, "@id" was popped out of the caller's dict as a side effect)
    nerdm = dict(nerdm)
    nerdm['pdrid'] = nerdm.pop('@id')   # KeyError if "@id" is missing

    # assemble the recordMetrics document: zeroed defaults plus copied identifiers
    records = dict(record_collection_fields)
    for field in record_fields:
        records[field] = nerdm[field]

    # create the metrics record only the first time this dataset is seen
    if db["recordMetrics"].find_one({"ediid": nerdm["ediid"]}) is None:
        db["recordMetrics"].insert_one(records)

    # pull per-file metrics documents out of the record's components
    files = flatten_records(nerdm, files_collection_fields)

    # add only files that are not already being tracked; use a set for O(1) lookups
    current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]})
    known_filepaths = set(f["filepath"] for f in current_files if "filepath" in f)
    files_to_update = [f for f in files
                       if 'filepath' in f and f['filepath'] not in known_filepaths]

    if files_to_update:
        db["fileMetrics"].insert_many(files_to_update)

def flatten_records(record, initialize_fields):
    """
    collect per-file metrics data from the components of a NERDm record.

    One output dict is produced for every component.  Each starts as a copy of the
    given default fileMetrics fields; for components that represent files (i.e. that
    have a "filepath" property), the file's identifying properties are copied in,
    the "size" property is renamed to "filesize", and a missing downloadURL is set
    to an empty string.  Every output dict is stamped with the record's identifiers.

    :param dict record: the NERDm record whose components should be flattened; it
                        must contain "pdrid" and "ediid" properties (as set up by
                        init_metrics_for()).  A record with no "components" yields
                        an empty list.
    :param dict initialize_fields: the fileMetrics fields and their default values
    :return list: a list of dicts, one per component in the record
    """
    keys_to_keep = ['filepath', 'size', 'downloadURL', 'ediid', '@id']
    files = []
    for component in record.get('components', []):
        # start from a copy of the zeroed defaults
        file_dict = dict(initialize_fields)

        if 'filepath' in component:
            # copy over the file's identifying properties
            for key in keys_to_keep:
                if key in component:
                    file_dict[key] = component[key]

            # the metrics collections use "filesize" rather than NERDm's "size"
            file_dict['filesize'] = file_dict.pop('size', 0)
            if 'downloadURL' not in component:
                file_dict['downloadURL'] = ''

        # stamp every entry with the record's identifiers
        file_dict['pdrid'] = record['pdrid']
        file_dict['ediid'] = record['ediid']
        files.append(file_dict)

    return files

27 changes: 27 additions & 0 deletions python/tests/nistoar/rmm/mongo/test_nerdm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

import pdb, os, json, urllib.parse, warnings, logging
import unittest as test
from pymongo import MongoClient
Expand All @@ -11,6 +12,7 @@
schemadir = os.path.join(basedir, "model")
exdir = os.path.join(schemadir, "examples")
janaffile = os.path.join(exdir, "janaf.json")
pdrfile = os.path.join(exdir, "mds2-2106.json")

dburl = None
if os.environ.get('MONGO_TESTDB_URL'):
Expand All @@ -32,6 +34,12 @@ def tearDown(self):
if not hasattr(client, 'get_database'):
client.get_database = client.get_default_database
db = client.get_database()
if "recordMetrics" in db.list_collection_names():
db.drop_collection("recordMetrics")
if "fileMetrics" in db.list_collection_names():
db.drop_collection("fileMetrics")
db.create_collection("recordMetrics")
db.create_collection("fileMetrics")
if "record" in db.list_collection_names():
db.drop_collection("record")
if "versions" in db.list_collection_names():
Expand Down Expand Up @@ -177,6 +185,25 @@ def test_load_from_file(self):
self.assertEqual(self.ldr._client.get_database().versions.count_documents({}), 1)
self.assertEqual(self.ldr._client.get_database().releasesets.count_documents({}), 1)

def test_init_metrics_for(self):
    with open(pdrfile) as fd:
        rec = json.load(fd)

    # sanity check on the fixture: this record should include downloadable files
    self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])]))

    self.ldr.connect()
    nerdm.init_metrics_for(self.ldr._db, rec)

    # both metrics collections should now hold documents for this dataset
    db = self.ldr._client.get_database()
    recmetrics = db.recordMetrics.find()
    self.assertEqual(recmetrics[0]['pdrid'], 'ark:/88434/mds2-2106')
    filemetrics = db.fileMetrics.find()
    self.assertEqual(filemetrics[0]['pdrid'], 'ark:/88434/mds2-2106')
    self.assertEqual(filemetrics[0]['filepath'], "NIST_NPL_InterlabData2019.csv.sha256")




if __name__ == '__main__':
Expand Down
Loading