Skip to content

Commit

Permalink
Merge pull request #229 from broadinstitute/development
Browse files — browse the repository at this point in the history
Release 1.13.0
  • Loading branch information
jlchang authored Feb 1, 2022
2 parents 62c5b3b + 0c287e5 commit e5625e2
Show file tree
Hide file tree
Showing 10 changed files with 345 additions and 97 deletions.
63 changes: 52 additions & 11 deletions ingest/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ def __init__(
self.study_file_id = ObjectId(study_file_id)
# lambda below initializes new key with nested dictionary as value and avoids KeyError
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
# collect error info for Mixpanel reporting
self.props = {
"errorTypes": [],
"errors": [],
"warningTypes": [],
"warnings": [],
}
csv_file, self.file_handle = self.open_file(self.file_path)
# Remove white spaces, quotes (only lowercase annot_types)
self.headers = [header.strip().strip('"') for header in next(csv_file)]
Expand Down Expand Up @@ -201,18 +208,44 @@ def create_data_frame(self):
)[0]
self.file = Annotations.convert_header_to_multi_index(df, column_names)

def store_validation_issue(
    self, type, msg, issue_name, issue_type=None, associated_info=None
):
    """Store a validation issue for later reporting.

    Issues are accumulated in self.issues (keyed type -> category -> msg)
    and, except for runtime issues, also mirrored into self.props for
    Mixpanel reporting.

    :param type: kind of issue, "error" or "warn"
    :param msg: human-readable issue message
    :param issue_name: CSFV-style errorType name (e.g. "format:cap:name");
        its prefix before the first colon becomes the issue category
        unless issue_type overrides it
    :param issue_type: optional explicit category (e.g. "runtime") that
        overrides the category derived from issue_name
    :param associated_info: optional list of IDs associated with the issue
    """
    # category aligns with CSFV errorType names: it is the errorType
    # prefix unless a special issue_type is supplied
    category = issue_type if issue_type else issue_name.split(":")[0]

    if associated_info:
        self.issues[type][category][msg].extend(associated_info)
    else:
        self.issues[type][category][msg] = None

    # propagate detected warnings and errors to Mixpanel; runtime errors
    # are intentionally not logged as file-validation failure errorTypes
    if category != "runtime" and issue_name:
        if type == "error":
            if issue_name not in self.props["errorTypes"]:
                self.props["errorTypes"].append(issue_name)
            if msg not in self.props["errors"]:
                self.props["errors"].append(msg)
        elif type == "warn":
            if issue_name not in self.props["warningTypes"]:
                self.props["warningTypes"].append(issue_name)
            if msg not in self.props["warnings"]:
                self.props["warnings"].append(msg)

def validate_header_keyword(self):
"""Check header row starts with NAME (case-insensitive).
Expand All @@ -224,10 +257,10 @@ def validate_header_keyword(self):
valid = True
if self.headers[0] != "NAME":
msg = f'File keyword "NAME" provided as {self.headers[0]}'
self.store_validation_issue("warn", "format", msg)
self.store_validation_issue("warn", msg, "format:cap:name")
else:
msg = "Malformed file header row, missing NAME keyword. (Case Sensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:name")
return valid

def validate_unique_header(self):
Expand All @@ -246,12 +279,12 @@ def validate_unique_header(self):
duplicate_headers.add(x)
msg = f"Duplicated header names are not allowed: {duplicate_headers}"
log_exception(Annotations.dev_logger, Annotations.user_logger, msg)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:unique")
valid = False
if any("Unnamed" in s for s in list(unique_headers)):
msg = "Headers cannot contain empty values"
log_exception(Annotations.dev_logger, Annotations.user_logger, msg)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:no-empty")
valid = False
return valid

Expand All @@ -264,10 +297,10 @@ def validate_type_keyword(self):
valid = True
if self.annot_types[0] != "TYPE":
msg = f'File keyword "TYPE" provided as {self.annot_types[0]}'
self.store_validation_issue("warn", "format", msg)
self.store_validation_issue("warn", msg, "format:cap:type")
else:
msg = "Malformed TYPE row, missing TYPE. (Case Sensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:type")
return valid

def validate_type_annotations(self):
Expand Down Expand Up @@ -295,7 +328,12 @@ def validate_type_annotations(self):
invalid_types.append(t)
if invalid_types:
msg = 'TYPE row annotations should be "group" or "numeric"'
self.store_validation_issue("error", "format", msg, invalid_types)
self.store_validation_issue(
"error",
msg,
"format:cap:group-or-numeric",
associated_info=invalid_types,
)
else:
valid = True
return valid
Expand All @@ -320,7 +358,7 @@ def validate_against_header_count(self):
f"Header mismatch: {len_annot_type} TYPE declarations "
f"for {len_headers} column headers"
)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue("error", msg, "format:cap:count")
else:
valid = True
return valid
Expand Down Expand Up @@ -349,5 +387,8 @@ def validate_numeric_annots(self):
if annot_type == "numeric" and column_dtype == "object":
valid = False
msg = f"Numeric annotation, {annot_name}, contains non-numeric data (or unidentified NA values)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "content:invalid-type:not-numeric"
)
return valid

6 changes: 3 additions & 3 deletions ingest/cell_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ def __init__(
self, file_path, self.ALLOWED_FILE_TYPES, study_id, study_file_id
)
self.cell_names = []
# lambda below initializes new key with nested dictionary as value and avoids KeyError
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
self.ontology = defaultdict(lambda: defaultdict(list))
self.ordered_ontology = defaultdict(list)
self.ordered_labels = defaultdict(list)
Expand Down Expand Up @@ -112,7 +110,9 @@ def validate_header_for_coordinate_values(self):
return True
else:
msg = "Header names can not be coordinate values x, y, or z (case insensitive)"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "format:cap:metadata-no-coordinates"
)
return False

def conforms_to_metadata_convention(self):
Expand Down
8 changes: 6 additions & 2 deletions ingest/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ def require_X_Y_not_nan(self):
if self.file[annot_name].isna().any().bool():
is_valid = False
msg = f"Missing coordinate values in {annot_name} column"
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "content:cluster:missing-coordinates-values"
)
return is_valid

def is_valid_format(self):
Expand All @@ -112,7 +114,9 @@ def validate_header_for_coordinate_values(self):
msg = (
"Header must have coordinate values 'x' and 'y' (case insensitive)"
)
self.store_validation_issue("error", "format", msg)
self.store_validation_issue(
"error", msg, "format:cap:cluster-coordinates"
)
return False
return True

Expand Down
79 changes: 57 additions & 22 deletions ingest/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os

# File is responsible for defining globals and initializing them
try:

from mongo_connection import MongoConnection
except ImportError:
from .mongo_connection import MongoConnection
Expand Down Expand Up @@ -46,6 +47,8 @@ def __init__(self, study, study_file, user_uuid=None):
"fileName": study_file.file_name,
"fileType": study_file.file_type,
"fileSize": study_file.file_size,
"trigger": study_file.trigger,
"logger": "ingest-pipeline",
"appId": "single-cell-portal",
}

Expand All @@ -57,6 +60,20 @@ def update(self, props):
self.__properties = {**self.__properties, **props}


def bypass_mongo_writes():
    """Return True when the developer has opted to skip MongoDB writes.

    Controlled by the environment variable BYPASS_MONGO_WRITES: the exact
    value 'yes' enables the bypass; any other value, or the variable being
    unset, returns False.
    """
    # os.environ.get returns None when unset, so a single equality check
    # replaces the original nested-if ladder with identical behavior
    return os.environ.get("BYPASS_MONGO_WRITES") == "yes"


class Study:
"""Provides attributes for a given study
"""
Expand All @@ -70,22 +87,29 @@ def study(self):

@study.setter
def study(self, study_id: str):
# Annotation Object expects a proper BSON ID
# even when the input validation is bypassed here
try:
study_id = ObjectId(study_id)
except Exception:
raise ValueError("Must pass in valid object ID for study ID")
study = list(
MONGO_CONNECTION._client["study_accessions"].find(
{"study_id": study_id}, {"_id": 0}
)
)
if not study:
raise ValueError(
"Study ID is not registered with a study. Please provide a valid study ID"
)
# set dummy accession if running in developer mode
if bypass_mongo_writes():
self.accession = "SCPdev"
else:
self.__study = study.pop()
self.accession = self.__study["accession"]

study = list(
MONGO_CONNECTION._client["study_accessions"].find(
{"study_id": study_id}, {"_id": 0}
)
)
if not study:
raise ValueError(
"Study ID is not registered with a study. Please provide a valid study ID"
)
else:
self.__study = study.pop()
self.accession = self.__study["accession"]


class StudyFile:
Expand All @@ -104,14 +128,25 @@ def study_file(self, study_file_id):
study_file_id = ObjectId(study_file_id)
except Exception:
raise ValueError("Must pass in valid object ID for study file ID")
query = MONGO_CONNECTION._client["study_files"].find({"_id": study_file_id})
query_results = list(query)
if not query_results:
raise ValueError(
"Study file ID is not registered with a study. Please provide a valid study file ID."
)
if bypass_mongo_writes():
# set dummy values if running in developer mode
self.file_type = "input_validation_bypassed"
self.file_size = 1
self.file_name = str(study_file_id)
self.trigger = 'dev_mode'
else:
self.__study_file = query_results.pop()
self.file_type = self.study_file["file_type"]
self.file_size = self.study_file["upload_file_size"]
self.file_name = self.study_file["name"]
query = MONGO_CONNECTION._client["study_files"].find({"_id": study_file_id})
query_results = list(query)
if not query_results:
raise ValueError(
"Study file ID is not registered with a study. Please provide a valid study file ID."
)
else:
self.__study_file = query_results.pop()
self.file_type = self.study_file["file_type"]
self.file_size = self.study_file["upload_file_size"]
self.file_name = self.study_file["name"]
if self.study_file.get("remote_location") is not None:
self.trigger = 'sync'
else:
self.trigger = 'upload'
Loading

0 comments on commit e5625e2

Please sign in to comment.