Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/ingest2metrics #58

Merged
merged 8 commits into from
Jan 11, 2024
Merged
4 changes: 3 additions & 1 deletion python/nistoar/rmm/mongo/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ def load_data(self, data, key=None, onupdate='quiet'):
warning will be issued. If set to 'fail', an
exception will be raised. If it is a function, it
will be executed before loading the new data. It
should take data and key as arguments; it should
should take data and key as arguments, where data will be the
previously saved record and key will be the MongoDB key
that was used to select it; this function should
return True if the new data should then be loaded
or False if it should not.
"""
Expand Down
124 changes: 123 additions & 1 deletion python/nistoar/rmm/mongo/nerdm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
load NERDm records into the RMM's MongoDB database
"""
import json, os, sys
# import pandas as pd
import json, os, sys, warnings
from collections import Mapping

from .loader import (Loader, RecordIngestError, JSONEncodingError,
Expand All @@ -10,6 +11,7 @@
from nistoar.nerdm import utils
from nistoar.nerdm.convert.rmm import NERDmForRMM


DEF_BASE_SCHEMA = "https://data.nist.gov/od/dm/nerdm-schema/v0.5#"
DEF_SCHEMA = DEF_BASE_SCHEMA + "/definitions/Resource"

Expand Down Expand Up @@ -99,6 +101,21 @@ class LatestLoader(_NERDmRenditionLoader):
def __init__(self, dburl, schemadir, log=None):
super(NERDmLoader.LatestLoader, self).__init__(LATEST_COLLECTION_NAME, dburl, schemadir, log)

def load_data(self, data, key=None, onupdate='quiet'):
    """
    load the given NERDm record into the latest collection and, if it was
    actually added, make sure the metrics collections are initialized for it.

    A failure while initializing the metrics data is reported (via the
    configured log or a warning) but does not undo or fail the load itself.

    :return: the result of the superclass's load_data() call
    """
    loaded = super().load_data(data, key, onupdate)
    if not loaded:
        return loaded

    # the record went in; set up the metrics collections as needed
    try:
        init_metrics_for(self._db, data)
    except Exception as ex:
        msg = "Failure detected while initializing Metric data for %s: %s" % \
              (data.get("@id", "unknown record"), str(ex))
        if self.log:
            self.log.warning(msg)
        else:
            warnings.warn(msg, UpdateWarning)
    return loaded

class ReleaseSetLoader(_NERDmRenditionLoader):
    """
    a loader that stores NERDm records into the release-set collection
    (named by RELEASES_COLLECTION_NAME).
    """
    def __init__(self, dburl, schemadir, log=None):
        # delegate to the rendition loader, fixing the target collection name
        super(NERDmLoader.ReleaseSetLoader, self).__init__(RELEASES_COLLECTION_NAME, dburl, schemadir, log)
Expand Down Expand Up @@ -266,3 +283,108 @@ def load_from_dir(self, dirpath, validate=True, results=None):

return results

def init_metrics_for(db, nerdm):
    """
    initialize the metrics-related collections for the dataset described in the given
    NERDm record as needed.

    This function assumes that the given NERDm record is the latest description of the
    dataset.  It should not be called with NERDm records describing earlier versions of
    the dataset.

    :param Database db: the MongoDB Database instance that contains the metrics
                        collections.  This instance will have come from a MongoDB
                        client that is already connected to a backend server.
    :param dict nerdm:  the NERDm record to initialize for.  It must contain "@id"
                        and "ediid" properties; a KeyError is raised otherwise.
    """
    # the zeroed-out starting state for a new recordMetrics document
    record_collection_fields = {
        "pdrid": None,
        "ediid": None,
        "first_time_logged": None,
        "last_time_logged": None,
        "total_size_download": 0,
        "success_get": 0,
        "number_users": 0,
        "record_download": 0,
        "ip_list": []
    }

    # record properties to carry over into the recordMetrics document
    record_fields = ['pdrid', 'ediid']

    # the zeroed-out starting state for a new fileMetrics document
    files_collection_fields = {
        "pdrid": None,
        "ediid": None,
        "filesize": 0,
        "success_get": 0,
        "failure_get": 0,
        "datacart_or_client": 0,
        "total_size_download": 0,
        "number_users": 0,
        "ip_list": [],
        "first_time_logged": None,
        "last_time_logged": None,
        "downloadURL": None
    }

    # work on a shallow copy so the caller's record is not modified
    # (previously, "@id" was popped out of the caller's dict as a side effect)
    nerdm = dict(nerdm)
    nerdm['pdrid'] = nerdm.pop('@id')   # KeyError if "@id" is missing

    # assemble the recordMetrics document: zeroed defaults plus copied identifiers
    records = dict(record_collection_fields)
    for field in record_fields:
        records[field] = nerdm[field]

    # create the metrics record only the first time this dataset is seen
    if db["recordMetrics"].find_one({"ediid": nerdm["ediid"]}) is None:
        db["recordMetrics"].insert_one(records)

    # pull per-file metrics documents out of the record's components
    files = flatten_records(nerdm, files_collection_fields)

    # add only files that are not already being tracked; use a set for O(1) lookups
    current_files = db["fileMetrics"].find({"ediid": nerdm["ediid"]})
    known_filepaths = set(f["filepath"] for f in current_files if "filepath" in f)
    files_to_update = [f for f in files
                       if 'filepath' in f and f['filepath'] not in known_filepaths]

    if files_to_update:
        db["fileMetrics"].insert_many(files_to_update)

def flatten_records(record, initialize_fields):
    """
    collect per-file metrics data from the components of a NERDm record.

    One output dict is produced for every component.  Each starts as a copy of the
    given default fileMetrics fields; for components that represent files (i.e. that
    have a "filepath" property), the file's identifying properties are copied in,
    the "size" property is renamed to "filesize", and a missing downloadURL is set
    to an empty string.  Every output dict is stamped with the record's identifiers.

    :param dict record: the NERDm record whose components should be flattened; it
                        must contain "pdrid" and "ediid" properties (as set up by
                        init_metrics_for()).  A record with no "components" yields
                        an empty list.
    :param dict initialize_fields: the fileMetrics fields and their default values
    :return list: a list of dicts, one per component in the record
    """
    keys_to_keep = ['filepath', 'size', 'downloadURL', 'ediid', '@id']
    files = []
    for component in record.get('components', []):
        # start from a copy of the zeroed defaults
        file_dict = dict(initialize_fields)

        if 'filepath' in component:
            # copy over the file's identifying properties
            for key in keys_to_keep:
                if key in component:
                    file_dict[key] = component[key]

            # the metrics collections use "filesize" rather than NERDm's "size"
            file_dict['filesize'] = file_dict.pop('size', 0)
            if 'downloadURL' not in component:
                file_dict['downloadURL'] = ''

        # stamp every entry with the record's identifiers
        file_dict['pdrid'] = record['pdrid']
        file_dict['ediid'] = record['ediid']
        files.append(file_dict)

    return files

27 changes: 27 additions & 0 deletions python/tests/nistoar/rmm/mongo/test_nerdm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

import pdb, os, json, urllib.parse, warnings, logging
import unittest as test
from pymongo import MongoClient
Expand All @@ -11,6 +12,7 @@
schemadir = os.path.join(basedir, "model")
exdir = os.path.join(schemadir, "examples")
janaffile = os.path.join(exdir, "janaf.json")
pdrfile = os.path.join(exdir, "mds2-2106.json")

dburl = None
if os.environ.get('MONGO_TESTDB_URL'):
Expand All @@ -32,6 +34,12 @@ def tearDown(self):
if not hasattr(client, 'get_database'):
client.get_database = client.get_default_database
db = client.get_database()
if "recordMetrics" in db.list_collection_names():
db.drop_collection("recordMetrics")
if "fileMetrics" in db.list_collection_names():
db.drop_collection("fileMetrics")
db.create_collection("recordMetrics")
db.create_collection("fileMetrics")
if "record" in db.list_collection_names():
db.drop_collection("record")
if "versions" in db.list_collection_names():
Expand Down Expand Up @@ -177,6 +185,25 @@ def test_load_from_file(self):
self.assertEqual(self.ldr._client.get_database().versions.count_documents({}), 1)
self.assertEqual(self.ldr._client.get_database().releasesets.count_documents({}), 1)

def test_init_metrics_for(self):
    with open(pdrfile) as fd:
        rec = json.load(fd)

    # sanity check on the fixture: this record should include downloadable files
    self.assertTrue(any(['/od/ds/' in f.get('downloadURL','') for f in rec.get('components',[])]))

    self.ldr.connect()
    nerdm.init_metrics_for(self.ldr._db, rec)

    # both metrics collections should now hold documents for this dataset
    db = self.ldr._client.get_database()
    recmetrics = db.recordMetrics.find()
    self.assertEqual(recmetrics[0]['pdrid'], 'ark:/88434/mds2-2106')
    filemetrics = db.fileMetrics.find()
    self.assertEqual(filemetrics[0]['pdrid'], 'ark:/88434/mds2-2106')
    self.assertEqual(filemetrics[0]['filepath'], "NIST_NPL_InterlabData2019.csv.sha256")




if __name__ == '__main__':
Expand Down
Loading