Merge pull request #227 from broadinstitute/development
Release 1.12.4
jlchang authored Dec 8, 2021
2 parents 37136b5 + 994b53c commit 62c5b3b
Showing 20 changed files with 600 additions and 653 deletions.
20 changes: 8 additions & 12 deletions ingest/cell_metadata.py
@@ -9,12 +9,11 @@
"""
import collections
import ntpath
from collections import defaultdict
from collections import defaultdict, OrderedDict
from dataclasses import dataclass
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
import copy
import json
import logging

from bson.objectid import ObjectId
from mypy_extensions import TypedDict
@@ -28,18 +27,10 @@
validate_input_metadata,
write_metadata_to_bq,
)
from monitor import setup_logger
except ImportError:
# Used when importing as external package, e.g. imports in single_cell_portal code
from .annotations import Annotations
from .ingest_files import DataArray, IngestFiles
from ..monitor import setup_logger

dev_logger = setup_logger(__name__, "log.txt", format="support_configs")

user_logger = setup_logger(
__name__ + ".user_logger", "user_log.txt", level=logging.ERROR
)


class CellMetadata(Annotations):
@@ -67,7 +58,8 @@ def __init__(
        # lambda below initializes a new key with a nested dictionary as its value, avoiding KeyError (see the sketch after this file's diff)
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
self.ontology = defaultdict(lambda: defaultdict(list))
self.ontology_label = dict()
self.ordered_ontology = defaultdict(list)
self.ordered_labels = defaultdict(list)
self.cells = []
self.numeric_array_columns = {}
self.kwargs = kwargs
@@ -151,7 +143,11 @@ def transform(self):
else "group"
)

group = True if annot_type == "group" else False
        group = annot_type == "group" or stored_mongo_annot_type == "group"
# should not store annotations with >200 unique values for viz
# annot_header is the column of data, which includes name and type
# large is any annotation with more than 200 + 2 unique values
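The nested defaultdict pattern noted in __init__ above lets validation issues be filed three levels deep without pre-creating any keys. A minimal standalone sketch of the behavior (not repository code):

    from collections import defaultdict

    # three-level structure: issue type -> category -> message -> list of cells
    issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    # no KeyError, even though none of these keys exist yet;
    # each missing level is created on first access
    issues["error"]["ontology"]["bad term"].append("cell_0001")
    issues["warn"]["format"]["odd header"].extend(["cell_0002", "cell_0003"])

    print(issues["error"]["ontology"]["bad term"])  # ['cell_0001']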
23 changes: 22 additions & 1 deletion ingest/clusters.py
@@ -1,3 +1,4 @@
import numpy
from dataclasses import dataclass
from typing import Dict, Generator, List, Tuple, Union # noqa: F401

@@ -73,7 +74,27 @@ def __init__(
def validate(self):
""" Runs all validation checks
"""
return all([self.is_valid_format(), self.validate_numeric_annots()])
return all(
[
self.is_valid_format(),
self.validate_numeric_annots(),
self.require_X_Y_not_nan(),
]
)

def require_X_Y_not_nan(self):
"""
        X, Y, and Z coordinates are expected to be fully populated in cluster files.
        Fail coordinate data columns that contain NaN values
        (see the sketch after this file's diff).
"""
is_valid = True
for annot_name in self.headers:
if annot_name.lower() in ("x", "y", "z"):
if self.file[annot_name].isna().any().bool():
is_valid = False
msg = f"Missing coordinate values in {annot_name} column"
self.store_validation_issue("error", "format", msg)
return is_valid

def is_valid_format(self):
"""Validates format by calling all format validation methods"""
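A hedged illustration of what require_X_Y_not_nan guards against, using a toy pandas DataFrame in place of the ingest file wrapper (column names and the store_validation_issue plumbing are simplified away):

    import numpy as np
    import pandas as pd

    # toy cluster file: one missing Y coordinate
    cluster = pd.DataFrame({
        "NAME": ["cell_1", "cell_2", "cell_3"],
        "X": [0.1, 0.2, 0.3],
        "Y": [1.0, np.nan, 3.0],
    })

    for col in cluster.columns:
        if col.lower() in ("x", "y", "z") and cluster[col].isna().any():
            print(f"Missing coordinate values in {col} column")  # fires for Y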
159 changes: 146 additions & 13 deletions ingest/validation/validate_metadata.py
@@ -220,13 +220,6 @@ def retrieve_ols_term(self, ontology_urls, term, property_name, attribute_type):
ontology_shortname, term_id = re.split("[_:]", term)
except (ValueError, TypeError):
msg = f'{property_name}: Could not parse provided ontology id, "{term}".'
if attribute_type == "array":
if "|" not in term:
msg += (
f" There is only one array value, for ontology id, '{term}.' "
"If multiple values are expected, use a pipe ('|') to separate values."
)

raise ValueError(msg)
# check if we have already retrieved this ontology reference
if ontology_shortname not in self.cached_ontologies:
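The re.split("[_:]") call in retrieve_ols_term above accepts both OBO-style (UBERON_0000955) and CURIE-style (MONDO:0005109) identifiers; anything that does not split into exactly two parts raises and is reported as unparseable. A standalone illustration (the example terms are hypothetical inputs, not from the repository's tests):

    import re

    for term in ["UBERON_0000955", "MONDO:0005109", "brain"]:
        try:
            shortname, term_id = re.split("[_:]", term)
            print(shortname, term_id)  # e.g. "UBERON 0000955"
        except (ValueError, TypeError):
            # "brain" yields a single element, so tuple unpacking raises ValueError
            print(f'Could not parse provided ontology id, "{term}"')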
@@ -344,7 +337,7 @@ def parse_organ_region_ontology_id(term):
except (TypeError, ValueError):
# when term value is empty string -> TypeError, convert this to a value error
raise ValueError(
f'organ_region: Could not parse provided ontology id, "{term}"'
f'organ_region: Could not parse provided ontology ID, "{term}".'
)


@@ -512,8 +505,12 @@ def insert_array_ontology_label_row_data(
error_msg = f"{property_name}: mismatched # of {property_name} and {ontology_label} values"
metadata.store_validation_issue("error", "ontology", error_msg, [cell_id])
return row

if not row[ontology_label]:
array_label_for_bq = []
# track original labels, including blanks, in the ordered ontology structure
metadata.ordered_ontology[property_name].extend(row[property_name])
        # note: extend('') would add nothing, since it iterates an empty string;
        # one blank per ID keeps ordered_labels aligned with ordered_ontology
        # (demonstrated after this hunk)
        metadata.ordered_labels[property_name].extend([''] * len(row[property_name]))
for id in row[property_name]:
label_lookup = ""
try:
@@ -545,7 +542,9 @@ def insert_array_ontology_label_row_data(
)
array_label_for_bq.append(label_lookup)
row[ontology_label] = array_label_for_bq

else:
metadata.ordered_ontology[property_name].extend(row[property_name])
metadata.ordered_labels[property_name].extend(row[ontology_label])
for id, label in zip(row[property_name], row[ontology_label]):
metadata.ontology[property_name][(id, label)].append(cell_id)
return row
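Why the blank-label bookkeeping matters: list.extend iterates its argument, so extending with an empty string is a silent no-op that would let ordered_labels drift out of step with ordered_ontology. A standalone sketch of the pitfall and the fix:

    ids = ["MONDO_0005109", "MONDO_0005110"]

    labels = []
    labels.extend('')  # no-op: '' is an empty iterable
    print(len(labels))  # 0 -- out of step with the two IDs

    labels.extend([''] * len(ids))  # one placeholder blank per ID
    print(len(labels))  # 2 -- aligned with ids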
@@ -558,6 +557,9 @@ def insert_ontology_label_row_data(
cell_id = row["CellID"]

if not row[ontology_label]:
# track original labels, including blanks, in the ordered ontology structure
        # append the row's ontology ID; a bare 'id' here would be the Python builtin
        metadata.ordered_ontology[property_name].append(row[property_name])
metadata.ordered_labels[property_name].append('')
# for optional columns, try to fill it in
property_type = convention["properties"][property_name]["type"]
try:
@@ -580,6 +582,9 @@
print(e)
error_msg = f"Optional column {ontology_label} empty and could not be resolved from {property_name} column value {row[property_name]}"
metadata.store_validation_issue("warn", "ontology", error_msg, [cell_id])
else:
        metadata.ordered_ontology[property_name].append(row[property_name])
metadata.ordered_labels[property_name].append(row[ontology_label])

metadata.ontology[property_name][(row[property_name], row[ontology_label])].append(
cell_id
@@ -827,7 +832,8 @@ def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metada
try:
if "|" not in value:
msg = (
f"There is only one array value, for {metadatum}: {value}. "
f"{metadatum}: accepts an array of values.\n"
"Lines detected with single values instead of arrays.\n"
"If multiple values are expected, use a pipe ('|') to separate values."
)
metadata.store_validation_issue(
@@ -915,8 +921,28 @@ def process_metadata_row(metadata, convention, line):
row_info = dict(itertools.zip_longest(metadata_names, line))
processed_row = {}
for k, v in row_info.items():
# for metadata not in convention, no need to process
# for metadata not in convention, check numeric metadata for valid values
if k not in convention["properties"].keys():
type_index = metadata.headers.index(k)
k_type = metadata.annot_types[type_index]
if k_type == "numeric":
k_numeric = type(v) in [int, float]
if not k_numeric:
                    # pandas coercion stringifies numbers: one non-numeric value in a
                    # column makes every value in that column a string, so error messages
                    # that include v are more confusing than helpful, e.g.
                    # "percent_mt: supplied value 0.0825991189427313 is not numeric"
                    # when 0.0825991189427313 was validly supplied as a number
                    # (see the standalone demonstration after this hunk)
try:
float(v)
msg = f"{k}: one or more values in data column are not numeric"
# only true non-numerics should be reported in detail
except ValueError:
msg = f"{k}: supplied value {v} is not numeric"
metadata.store_validation_issue(
"error", "type", msg, {row_info["CellID"]}
)
dev_logger.error(msg)
continue
# for optional metadata, do not pass empty cells (nan)
if k not in convention["required"]:
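The stringification the comment above refers to: when a CSV column contains even one non-numeric token, pandas falls back to object dtype and every value in the column arrives as a string, so valid numbers fail the type(v) in [int, float] check. A standalone demonstration (the column name is illustrative):

    import pandas as pd
    from io import StringIO

    csv = "CellID,percent_mt\ncell_1,0.0825991189427313\ncell_2,not_a_number\n"
    df = pd.read_csv(StringIO(csv))

    v = df["percent_mt"][0]
    print(type(v))   # <class 'str'> -- stringified, though validly supplied as a number
    print(float(v))  # 0.0825991189427313 -- float() succeeds, so the generic
                     # "values in data column are not numeric" message is used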
Expand Down Expand Up @@ -1227,13 +1253,120 @@ def review_metadata_names(metadata):
metadata.store_validation_issue("error", "metadata_name", error_msg)


def identify_multiply_assigned(id_label_pairs):
    """Given a collection of (ontology ID, label) pairs,
    return a sorted list of unique multiply-assigned labels
    """
ontology_tracker = defaultdict(lambda: defaultdict(int))
multiply_assigned = []
    for element in id_label_pairs:
id, label = element
ontology_tracker[label][id] += 1
for label in ontology_tracker:
if len(ontology_tracker[label].keys()) > 1:
multiply_assigned.append(label)
multiply_assigned.sort()
return multiply_assigned
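A quick check of the behavior, assuming the function as defined above (the IDs and labels are synthetic):

    pairs = [
        ("MONDO_0005109", "disease A"),
        ("MONDO_0005110", "disease A"),  # same label assigned to a second ID
        ("MONDO_0005111", "disease B"),
    ]
    print(identify_multiply_assigned(pairs))  # ['disease A']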


def assess_ontology_ids(ids, property_name, metadata):
"""
Check ordered collection of ontology IDs for increasing numeric values
"""
evidence_of_excel_drag = False
evidence_of_excel_drag_threshold = 25
binned_ids = defaultdict(list)
for id in ids:
        # The binning avoids any spurious numeric contiguity
        # between IDs that actually have different shortnames;
        # because the detection threshold is fairly generous,
        # the binning is probably unneeded
try:
ontology_shortname, term_id = re.split("[_:]", id)
binned_ids[ontology_shortname].append(term_id)
except (ValueError, TypeError):
            # an invalid ontology ID will already be flagged as a convention error
# no need to also mark as ontology error as part of Excel drag detection
pass
for ontology in binned_ids.keys():
id_numerics = []
incrementation_count = 0
for term in binned_ids[ontology]:
            # The regex extracts the numeric part of the term
            # from the constant, text portion of the ontology ID;
            # some term IDs have a text component on the term side,
            # so a regex is needed instead of a simple split
            term_numeric = re.search(r'(\d)*$', term)
id_numerics.append(int(term_numeric.group()))
for x, y in zip(id_numerics, id_numerics[1:]):
if y - x == 1:
incrementation_count += 1
            else:
                incrementation_count = 0
if incrementation_count >= evidence_of_excel_drag_threshold:
evidence_of_excel_drag = True
break
return evidence_of_excel_drag
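What the incrementation counter is looking for, in miniature: a run of IDs whose numeric suffixes step by exactly 1, as dragging a cell's fill handle in Excel produces. With the threshold of 25, thirty contiguous IDs trip the check, while a legitimately repeated ID never increments. A standalone sketch using the function above (the IDs are synthetic; the unused metadata argument is passed as None):

    # 30 contiguously incrementing IDs, as an Excel drag would produce
    dragged = [f"MONDO_{n:07d}" for n in range(5000000, 5000030)]
    print(assess_ontology_ids(dragged, "disease", None))  # True

    # the same ID repeated (a legitimate annotation) never increments
    repeated = ["MONDO_0005109"] * 30
    print(assess_ontology_ids(repeated, "disease", None))  # False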


def detect_excel_drag(metadata, convention):
""" Check if ontology IDs submitted have characteristic Excel drag properties
Todo1: "Excel drag" detection of array-based ontologyID data
is lacking (need to track pipe-delimited string)
Todo2: need to bypass EBI OLS queries to "fill in"
missing ontology labels for optional metadata)
Hint: try working with raw metadata.file data, would
allow this check to be moved before collect_jsonschema_errors
"""
excel_drag = False
for property_name in metadata.ontology.keys():
if len(set(metadata.ordered_ontology[property_name])) == 1:
continue
else:
property_ids = metadata.ordered_ontology[property_name]
unique_ids = set(property_ids)
property_labels = metadata.ordered_labels[property_name]
property_labels_blanks_removed = [i for i in property_labels if i]
unique_labels = set(property_labels_blanks_removed)
            # likely ontology label mis-assignment if multiple ontology IDs are ascribed to the same label
label_multiply_assigned = len(unique_ids) > len(unique_labels)
try:
if assess_ontology_ids(property_ids, property_name, metadata):
                    msg = (
                        f"{property_name}: Long stretch of contiguously incrementing "
                        "ontology ID values suggests a cut-and-paste issue - exiting validation; "
                        "ontology content was not validated against the ontology server.\n"
                        "Please confirm ontology IDs are correct and resubmit.\n"
                    )
excel_drag = True
if label_multiply_assigned:
multiply_assigned = identify_multiply_assigned(
set(zip(property_ids, property_labels))
)
if multiply_assigned:
msg += f"Check for mismatches between ontology ID and provided ontology label(s) {multiply_assigned}\n"
metadata.store_validation_issue("error", "ontology", msg)
                    dev_logger.error(msg)  # error(), not exception(): we are not in an except handler
except ValueError as valueError:
metadata.store_validation_issue("error", "ontology", valueError.args[0])

return excel_drag


def validate_input_metadata(metadata, convention, bq_json=None):
"""Wrapper function to run validation functions
"""
dev_logger.info("Checking metadata content against convention rules")
collect_jsonschema_errors(metadata, convention, bq_json)
review_metadata_names(metadata)
validate_collected_ontology_data(metadata, convention)
confirm_uniform_units(metadata, convention)
dev_logger.info('Checking for "Excel drag" events')
if not detect_excel_drag(metadata, convention):
# "short-circut" ontology validation if "Excel drag" detected
# avoids a bloat of calls to EBI OLS, return error faster and avoid
# long-compute-time issue (if false positives are possible, bypass will be needed)
dev_logger.info('Validating ontology content against EBI OLS')
validate_collected_ontology_data(metadata, convention)
confirm_uniform_units(metadata, convention)


def push_metadata_to_bq(metadata, ndjson, dataset, table):
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name="scp-ingest-pipeline",
version="1.12.2",
version="1.12.4",
description="ETL pipeline for single-cell RNA-seq data",
long_description=long_description,
long_description_content_type="text/markdown",
(diffs for the remaining 16 changed files not shown)
