Skip to content

Commit

Permalink
Merge pull request #229 from broadinstitute/development
Browse files — browse the repository at this point in the history
Release 1.13.0
  • Loading branch information
jlchang authored Feb 1, 2022
2 parents 62c5b3b + 0c287e5 commit e5625e2
Show file tree
Hide file tree
Showing 10 changed files with 345 additions and 97 deletions.
63 changes: 52 additions & 11 deletions ingest/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ def __init__(
self.study_file_id = ObjectId(study_file_id)
# lambda below initializes new key with nested dictionary as value and avoids KeyError
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
# collect error info for Mixpanel reporting
self.props = {
"errorTypes": [],
"errors": [],
"warningTypes": [],
"warnings": [],
}
csv_file, self.file_handle = self.open_file(self.file_path)
# Remove white spaces, quotes (only lowercase annot_types)
self.headers = [header.strip().strip('"') for header in next(csv_file)]
Expand Down Expand Up @@ -201,18 +208,44 @@ def create_data_frame(self):
)[0]
self.file = Annotations.convert_header_to_multi_index(df, column_names)

def store_validation_issue(
    self, type, msg, issue_name, issue_type=None, associated_info=None
):
    """Store a validation issue for later reporting.

    Issues are accumulated in self.issues (keyed type -> category -> msg)
    and, except for runtime issues, also mirrored into self.props for
    Mixpanel reporting.

    :param type: kind of issue, "error" or "warn"
    :param msg: human-readable issue message
    :param issue_name: CSFV-style errorType name (e.g. "format:cap:name");
        its prefix before the first colon becomes the issue category
        unless issue_type overrides it
    :param issue_type: optional explicit category (e.g. "runtime") that
        overrides the category derived from issue_name
    :param associated_info: optional list of IDs associated with the issue
    """
    # category aligns with CSFV errorType names: it is the errorType
    # prefix unless a special issue_type is supplied
    category = issue_type if issue_type else issue_name.split(":")[0]

    if associated_info:
        self.issues[type][category][msg].extend(associated_info)
    else:
        self.issues[type][category][msg] = None

    # propagate detected warnings and errors to Mixpanel; runtime errors
    # are intentionally not logged as file-validation failure errorTypes
    if category != "runtime" and issue_name:
        if type == "error":
            if issue_name not in self.props["errorTypes"]:
                self.props["errorTypes"].append(issue_name)
            if msg not in self.props["errors"]:
                self.props["errors"].append(msg)
        elif type == "warn":
            if issue_name not in self.props["warningTypes"]:
                self.props["warningTypes"].append(issue_name)
            if msg not in self.props["warnings"]:
                self.props["warnings"].append(msg)

def validate_header_keyword(self):
"""Check header row starts with NAME (case-insensitive).
Expand All @@ -224,10 +257,10 @@ def validate_header_keyword(self):
valid = True
if self.headers[0] != "NAME":
msg = f'File keyword "NAME" provided as {self.headers[0]}'
self.store_validation_issue("warn", "format", msg)
self.store_validation_issue("warn", msg, "format:cap:name")
else:
msg = "Malformed file header row, missing NAME keyword. (Case Sensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:name")
return valid

def validate_unique_header(self):
Expand All @@ -246,12 +279,12 @@ def validate_unique_header(self):
duplicate_headers.add(x)
msg = f"Duplicated header names are not allowed: {duplicate_headers}"
log_exception(Annotations.dev_logger, Annotations.user_logger, msg)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:unique")
valid = False
if any("Unnamed" in s for s in list(unique_headers)):
msg = "Headers cannot contain empty values"
log_exception(Annotations.dev_logger, Annotations.user_logger, msg)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:no-empty")
valid = False
return valid

Expand All @@ -264,10 +297,10 @@ def validate_type_keyword(self):
valid = True
if self.annot_types[0] != "TYPE":
msg = f'File keyword "TYPE" provided as {self.annot_types[0]}'
self.store_validation_issue("warn", "format", msg)
self.store_validation_issue("warn", msg, "format:cap:type")
else:
msg = "Malformed TYPE row, missing TYPE. (Case Sensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:type")
return valid

def validate_type_annotations(self):
Expand Down Expand Up @@ -295,7 +328,12 @@ def validate_type_annotations(self):
invalid_types.append(t)
if invalid_types:
msg = 'TYPE row annotations should be "group" or "numeric"'
self.store_validation_issue("error", "format", msg, invalid_types)
self.store_validation_issue(
"error",
msg,
"format:cap:group-or-numeric",
associated_info=invalid_types,
)
else:
valid = True
return valid
Expand All @@ -320,7 +358,7 @@ def validate_against_header_count(self):
f"Header mismatch: {len_annot_type} TYPE declarations "
f"for {len_headers} column headers"
)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:count")
else:
valid = True
return valid
Expand Down Expand Up @@ -349,5 +387,8 @@ def validate_numeric_annots(self):
if annot_type == "numeric" and column_dtype == "object":
valid = False
msg = f"Numeric annotation, {annot_name}, contains non-numeric data (or unidentified NA values)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "content:invalid-type:not-numeric"
)
return valid

6 changes: 3 additions & 3 deletions ingest/cell_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ def __init__(
self, file_path, self.ALLOWED_FILE_TYPES, study_id, study_file_id
)
self.cell_names = []
# lambda below initializes new key with nested dictionary as value and avoids KeyError
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
self.ontology = defaultdict(lambda: defaultdict(list))
self.ordered_ontology = defaultdict(list)
self.ordered_labels = defaultdict(list)
Expand Down Expand Up @@ -112,7 +110,9 @@ def validate_header_for_coordinate_values(self):
return True
else:
msg = "Header names can not be coordinate values x, y, or z (case insensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "format:cap:metadata-no-coordinates"
)
return False

def conforms_to_metadata_convention(self):
Expand Down
8 changes: 6 additions & 2 deletions ingest/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ def require_X_Y_not_nan(self):
if self.file[annot_name].isna().any().bool():
is_valid = False
msg = f"Missing coordinate values in {annot_name} column"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "content:cluster:missing-coordinates-values"
)
return is_valid

def is_valid_format(self):
Expand All @@ -112,7 +114,9 @@ def validate_header_for_coordinate_values(self):
msg = (
"Header must have coordinate values 'x' and 'y' (case insensitive)"
)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "format:cap:cluster-coordinates"
)
return False
return True

Expand Down
79 changes: 57 additions & 22 deletions ingest/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os

# File is responsible for defining globals and initializing them
try:

from mongo_connection import MongoConnection
except ImportError:
from .mongo_connection import MongoConnection
Expand Down Expand Up @@ -46,6 +47,8 @@ def __init__(self, study, study_file, user_uuid=None):
"fileName": study_file.file_name,
"fileType": study_file.file_type,
"fileSize": study_file.file_size,
"trigger": study_file.trigger,
"logger": "ingest-pipeline",
"appId": "single-cell-portal",
}

Expand All @@ -57,6 +60,20 @@ def update(self, props):
self.__properties = {**self.__properties, **props}


def bypass_mongo_writes():
    """Return True when the developer has opted to skip MongoDB writes.

    Controlled by the environment variable BYPASS_MONGO_WRITES: the exact
    value 'yes' enables the bypass; any other value, or the variable being
    unset, returns False.
    """
    # os.environ.get returns None when unset, so a single equality check
    # replaces the original nested-if ladder with identical behavior
    return os.environ.get("BYPASS_MONGO_WRITES") == "yes"


class Study:
"""Provides attributes for a given study
"""
Expand All @@ -70,22 +87,29 @@ def study(self):

@study.setter
def study(self, study_id: str):
# Annotation Object expects a proper BSON ID
# even when the input validation is bypassed here
try:
study_id = ObjectId(study_id)
except Exception:
raise ValueError("Must pass in valid object ID for study ID")
study = list(
MONGO_CONNECTION._client["study_accessions"].find(
{"study_id": study_id}, {"_id": 0}
)
)
if not study:
raise ValueError(
"Study ID is not registered with a study. Please provide a valid study ID"
)
# set dummy accession if running in developer mode
if bypass_mongo_writes():
self.accession = "SCPdev"
else:
self.__study = study.pop()
self.accession = self.__study["accession"]

study = list(
MONGO_CONNECTION._client["study_accessions"].find(
{"study_id": study_id}, {"_id": 0}
)
)
if not study:
raise ValueError(
"Study ID is not registered with a study. Please provide a valid study ID"
)
else:
self.__study = study.pop()
self.accession = self.__study["accession"]


class StudyFile:
Expand All @@ -104,14 +128,25 @@ def study_file(self, study_file_id):
study_file_id = ObjectId(study_file_id)
except Exception:
raise ValueError("Must pass in valid object ID for study file ID")
query = MONGO_CONNECTION._client["study_files"].find({"_id": study_file_id})
query_results = list(query)
if not query_results:
raise ValueError(
"Study file ID is not registered with a study. Please provide a valid study file ID."
)
if bypass_mongo_writes():
# set dummy values if running in developer mode
self.file_type = "input_validation_bypassed"
self.file_size = 1
self.file_name = str(study_file_id)
self.trigger = 'dev_mode'
else:
self.__study_file = query_results.pop()
self.file_type = self.study_file["file_type"]
self.file_size = self.study_file["upload_file_size"]
self.file_name = self.study_file["name"]
query = MONGO_CONNECTION._client["study_files"].find({"_id": study_file_id})
query_results = list(query)
if not query_results:
raise ValueError(
"Study file ID is not registered with a study. Please provide a valid study file ID."
)
else:
self.__study_file = query_results.pop()
self.file_type = self.study_file["file_type"]
self.file_size = self.study_file["upload_file_size"]
self.file_name = self.study_file["name"]
if self.study_file.get("remote_location") is not None:
self.trigger = 'sync'
else:
self.trigger = 'upload'
Loading

0 comments on commit e5625e2

Please sign in to comment.