
Commit 79809ad

Merge pull request #5 from BAMresearch/main
Fixes to the models; adds RawDataset and DerivedDataset models and a derived-dataset upload mechanism
dylanmcreynolds authored Jan 26, 2022
2 parents 37fdcb6 + 548ca5d commit 79809ad
Showing 2 changed files with 192 additions and 26 deletions.
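The headline change is a new upload_dataset method that routes a dataset to the matching endpoint based on its type. Below is a minimal usage sketch, assuming a reachable SciCat instance; the URL, credentials, and field values are placeholders, and the client class name (ScicatClient, as in later pyscicat releases) and its constructor signature may differ slightly at this commit:

from pyscicat.client import ScicatClient
from pyscicat.model import DerivedDataset

# Placeholder URL and credentials for a local SciCat instance.
client = ScicatClient(
    base_url="http://localhost:3000/api/v3",
    username="ingestor",
    password="secret",
)

# A derived dataset per the new model; inputDatasets and usedSoftware are required.
dataset = DerivedDataset(
    owner="ingestor",
    contactEmail="ingestor@example.com",
    creationTime="2022-01-26T00:00:00Z",
    sourceFolder="/data/derived/run42",
    type="derived",
    investigator="A. Scientist",
    inputDatasets=["20.500.12269/raw-123"],  # placeholder pids of the raw inputs
    usedSoftware=["pyscicat"],
    ownerGroup="ingestors",  # Ownable fields
    accessGroups=["public"],
)

# upload_dataset inspects the instance and picks the RawDataSets
# or DerivedDatasets endpoint accordingly.
pid = client.upload_dataset(dataset)
print(f"created dataset {pid}")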
68 changes: 65 additions & 3 deletions pyscicat/client.py
@@ -10,7 +10,7 @@

import requests

from pyscicat.model import Attachment, Datablock, Dataset, RawDataset, DerivedDataset

logger = logging.getLogger("splash_ingest")
can_debug = logger.isEnabledFor(logging.DEBUG)
@@ -136,6 +136,40 @@ def _send_to_scicat(self, url, dataDict=None, cmd="post"):
# err = resp.json()["error"]
# raise ScicatCommError(f"Error creating Sample {err}")

def upload_dataset(self, dataset: Dataset) -> str:
"""Upload a raw or derived dataset (method is autosensing)
Parameters
----------
dataset : Dataset
Dataset to upload
Returns
-------
str
pid (or unique identifier) of the newly created dataset
Raises
------
ScicatCommError
Raises if a non-2xx response is returned
"""
if isinstance(dataset, RawDataset):
dataset_url = self._base_url + "RawDataSets/replaceOrCreate"
elif isinstance(dataset, DerivedDataset):
dataset_url = self._base_url + "DerivedDatasets/replaceOrCreate"
else:
raise TypeError(
"Dataset type not recognized (expected a RawDataset or DerivedDataset instance)"
)
resp = self._send_to_scicat(dataset_url, dataset.dict(exclude_none=True))
if not resp.ok:
err = resp.json()["error"]
raise ScicatCommError(f"Error creating dataset {err}")
new_pid = resp.json().get("pid")
logger.info(f"new dataset created {new_pid}")
return new_pid

def upload_raw_dataset(self, dataset: Dataset) -> str:
"""Upload a raw dataset
@@ -163,7 +197,36 @@ def upload_raw_dataset(self, dataset: Dataset) -> str:
logger.info(f"new dataset created {new_pid}")
return new_pid

def upload_derived_dataset(self, dataset: Dataset) -> str:
"""Upload a derived dataset
Parameters
----------
dataset : Dataset
Dataset to upload
Returns
-------
str
pid (or unique identifier) of the newly created dataset
Raises
------
ScicatCommError
Raises if a non-2xx response is returned
"""
derived_dataset_url = self._base_url + "DerivedDataSets/replaceOrCreate"
resp = self._send_to_scicat(
derived_dataset_url, dataset.dict(exclude_none=True)
)
if not resp.ok:
err = resp.json()["error"]
raise ScicatCommError(f"Error creating raw dataset {err}")
new_pid = resp.json().get("pid")
logger.info(f"new dataset created {new_pid}")
return new_pid

def upload_datablock(self, datablock: Datablock, datasetType: str = "RawDatasets"):
"""Upload a Datablock
Parameters
@@ -176,7 +239,6 @@ def upload_datablock(self, datablock: Datablock):
ScicatCommError
Raises if a non-2xx response is returned
"""
datasetType = "RawDatasets"

url = (
self._base_url
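The upload_datablock change above adds a datasetType parameter that defaults to "RawDatasets", so existing callers keep the old behavior while datablocks can now also be attached to derived datasets. A sketch continuing the example above; the Datablock field values are illustrative, and which fields are required depends on the model definitions below:

from pyscicat.model import Datablock, DataFile

# One file entry; path and size are the essential DataFile fields.
datafiles = [DataFile(path="derived/run42_result.h5", size=2048)]

block = Datablock(
    archiveId="placeholder-archive-id",  # illustrative value
    size=2048,
    version="1.0",
    dataFileList=datafiles,
    datasetId=pid,  # pid returned by upload_dataset above
    ownerGroup="ingestors",
    accessGroups=["public"],
)

# The default is "RawDatasets"; pass "DerivedDatasets" for a derived dataset.
client.upload_datablock(block, datasetType="DerivedDatasets")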
150 changes: 127 additions & 23 deletions pyscicat/model.py
@@ -1,4 +1,6 @@
import enum

from typing import List, Dict, Optional

from pydantic import BaseModel
@@ -27,41 +29,143 @@ class MongoQueryable(BaseModel):
createdAt: Optional[str]


class User(BaseModel):
"""Base user."""

# TODO: find out which of these are not optional and update
realm: str
username: str
email: str
emailVerified: bool = False
id: str


class Proposal(Ownable, MongoQueryable):
"""
Defines the purpose of an experiment and links an experiment to its principal investigator and main proposer.
"""

# TODO: find out which of these are not optional and update
proposalId: Optional[str]
pi_email: Optional[str]
pi_firstname: Optional[str]
pi_lastname: Optional[str]
email: Optional[str]
firstname: Optional[str]
lastname: Optional[str]
title: Optional[str]
abstract: Optional[str]
startTime: Optional[str]
endTime: Optional[str]
MeasurementPeriodList: Optional[List[dict]] # may need updating with the measurement period model


class Sample(Ownable, MongoQueryable):
"""
Models describing the characteristics of the samples to be investigated.
Raw datasets should be linked to such sample definitions.
"""

# TODO: find out which of these are not optional and update
sampleId: Optional[str]
owner: Optional[str]
description: Optional[str]
sampleCharacteristics: Optional[dict]
isPublished: bool = False


class Job(MongoQueryable):
"""
This collection keeps information about jobs to be executed in external systems.
In particular it keeps information about the jobs submitted for archiving or
retrieving datasets stored inside an archive system. It can also be used to keep
track of analysis jobs, e.g. for automated analysis workflows.
"""

id: Optional[str]
emailJobInitiator: str
type: str
creationTime: Optional[str] # not yet sure which of these are optional
executionTime: Optional[str]
jobParams: Optional[dict]
jobStatusMessage: Optional[str]
datasetList: Optional[dict] # documentation says dict, but should maybe be list?
jobResultObject: Optional[dict] # ibid.


class Instrument(MongoQueryable):
"""
Instrument class; most instrument properties are flexibly definable in customMetadata
"""

pid: Optional[str]
name: str
customMetadata: Optional[dict]


class Dataset(Ownable, MongoQueryable):
"""
A dataset in SciCat, base class for derived and raw datasets
"""

pid: Optional[str]
contactEmail: str
creationTime: str # datetime
description: Optional[str]
history: Optional[List[dict]] # list of foreign key ids to the Messages table
instrumentId: Optional[str]
numberOfFiles: Optional[int]
numberOfFilesArchived: Optional[int]
orcidOfOwner: Optional[str]
packedSize: Optional[int]
owner: str
ownerEmail: Optional[str]
sharedWith: Optional[List[str]]
size: Optional[int]
sourceFolder: str
sourceFolderHost: Optional[str]
techniques: Optional[List[dict]] # with {'pid':pid, 'name': name} as entries
type: DatasetType
validationStatus: Optional[str]
keywords: Optional[List[str]]
datasetName: Optional[str]
classification: Optional[str]
license: Optional[str]
version: Optional[str]
isPublished: Optional[bool] = False


class RawDataset(Dataset):
"""
Raw datasets from which derived datasets are... derived.
"""

principalInvestigator: Optional[str]
creationLocation: Optional[str]
type: DatasetType = "raw"
createdAt: Optional[str] # datetime
updatedAt: Optional[str] # datetime
dataFormat: Optional[str]
endTime: Optional[str] # datetime
sampleId: Optional[str]
proposalId: Optional[str]
scientificMetadata: Optional[Dict]


class DerivedDataset(Dataset):
"""
Derived datasets which have been generated based on one or more raw datasets
"""

investigator: Optional[str]
inputDatasets: List[str]
usedSoftware: List[str] # not optional!
jobParameters: Optional[dict]
jobLogData: Optional[str]
scientificMetadata: Optional[Dict]


class DataFile(MongoQueryable):
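To round out the model changes, here is a sketch constructing a RawDataset under the same assumptions as the earlier examples (client is the instance created there, and all values are placeholders). Because type defaults to "raw", upload_dataset routes it to the RawDataSets endpoint:

from pyscicat.model import RawDataset

raw = RawDataset(
    owner="ingestor",
    contactEmail="ingestor@example.com",
    creationTime="2022-01-26T00:00:00Z",
    sourceFolder="/data/raw/run42",
    ownerGroup="ingestors",
    accessGroups=["public"],
    creationLocation="/BAM/instrument1",  # optional on RawDataset
    scientificMetadata={"beam_energy_keV": 8.04},  # optional on RawDataset
)

pid = client.upload_dataset(raw)  # dispatches to RawDataSets/replaceOrCreate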
