
Commit

Merge pull request #367 from broadinstitute/development
Release 1.36.0
bistline authored Oct 22, 2024
2 parents 4631af6 + 4190761 commit aa12131
Showing 15 changed files with 368 additions and 12 deletions.
92 changes: 92 additions & 0 deletions .github/workflows/minify_ontologies.yml
@@ -0,0 +1,92 @@
name: Minify ontologies

on:
  pull_request:
    types: [opened] # Only trigger on PR "opened" event
  # push: # Uncomment, update branches to develop / debug
  #   branches:
  #     jb-anndata-mixpanel-props

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Copy and decompress ontologies in repo
        run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz

      - name: Minify newest ontologies
        run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz

      - name: Diff and commit changes
        run: |
          #!/bin/bash
          # Revert the default `set -e` in GitHub Actions, to e.g. ensure
          # "diff" doesn't throw an error when something is found
          set +e
          # set -x # Enable debugging
          cd ingest/validation/ontologies
          # Define directories
          SOURCE_DIR="."
          TMP_DIR="tmp"
          # Ensure TMP_DIR exists
          if [ ! -d "$TMP_DIR" ]; then
            echo "Temporary directory $TMP_DIR does not exist."
            exit 1
          fi
          # Flag to track if there are any changes
          CHANGES_DETECTED=false
          # Find and diff files
          for FILE in $(find "$SOURCE_DIR" -type f -name "*.min.tsv"); do
            # Get the base name of the file
            BASENAME=$(basename "$FILE")
            # Construct the path to the corresponding file in the TMP_DIR
            TMP_FILE="$TMP_DIR/$BASENAME"
            # Check if the corresponding file exists in TMP_DIR
            if [ -f "$TMP_FILE" ]; then
              # Run the diff command
              echo "Diffing $FILE and $TMP_FILE"
              diff "$FILE" "$TMP_FILE" > diff_output.txt
              # Check if diff output is not empty
              if [ -s diff_output.txt ]; then
                echo "Differences found in $BASENAME"
                cat diff_output.txt
                # Copy the updated file to the source directory (if needed)
                cp "$TMP_FILE" "$FILE"
                # Mark that changes have been detected
                CHANGES_DETECTED=true
                # Stage the modified file
                git add "$FILE".gz
              else
                echo "No differences in $BASENAME"
              fi
            else
              echo "No corresponding file found in $TMP_DIR for $BASENAME"
            fi
          done
          if [ "$CHANGES_DETECTED" = true ]; then
            # Update version to signal downstream caches should update
            echo "$(date +%s) # validation cache key" > version.txt
            git add version.txt
            # Configure Git
            git config --global user.name "github-actions"
            git config --global user.email "[email protected]"
            # Commit changes
            git commit -m "Update minified ontologies via GitHub Actions"
            git push origin ${{ github.ref_name }}
          else
            echo "No changes to commit."
          fi
23 changes: 15 additions & 8 deletions ingest/config.py
@@ -10,12 +10,12 @@
 MONGO_CONNECTION = MongoConnection()


-def init(study_id, study_file_id, user_metric_uuid=None):
+def init(study_id, study_file_id, user_metric_uuid=None, action=None):
     global __metric_properties

     study = Study(study_id)
     study_file = StudyFile(study_file_id)
-    __metric_properties = MetricProperties(study, study_file, user_metric_uuid)
+    __metric_properties = MetricProperties(study, study_file, user_metric_uuid, action)


 def set_parent_event_name(event_name):
@@ -39,7 +39,7 @@ class MetricProperties:
     # This is a generic write-only log token, not a secret
     USER_ID = "2f30ec50-a04d-4d43-8fd1-b136a2045079"

-    def __init__(self, study, study_file, user_uuid=None):
+    def __init__(self, study, study_file, user_uuid=None, action=None):
         distinct_id = user_uuid if user_uuid else MetricProperties.USER_ID
         self.__properties = {
             "distinct_id": distinct_id,
@@ -50,7 +50,11 @@ def __init__(self, study, study_file, user_uuid=None):
             "trigger": study_file.trigger,
             "logger": "ingest-pipeline",
             "appId": "single-cell-portal",
+            "action": action
         }
+        # merge in referenceAnnDataFile if necessary
+        if study_file.file_type == 'AnnData':
+            self.__properties["referenceAnnDataFile"] = study_file.is_reference_anndata

     def get_properties(self):
         return self.__properties
@@ -171,12 +175,15 @@ def study_file(self, study_file_id):
         self.file_type = self.study_file["file_type"]
         self.file_size = self.study_file["upload_file_size"]
         self.file_name = self.study_file["name"]
+        upload_trigger = self.study_file.get("options", {}).get("upload_trigger")
         # when set, remote_location is the name of the file in the bucket
-        if self.study_file.get("remote_location") is not None:
-            if self.study_file["remote_location"] == "":
-                self.trigger = 'upload'
-            else:
-                self.trigger = 'sync'
+        if upload_trigger is not None:
+            self.trigger = upload_trigger
+        elif self.study_file["remote_location"] is not None:
+            self.trigger = 'upload' if self.study_file["remote_location"] == "" else 'sync'
+        # indicate trigger state for tests/mocks
+        else:
+            self.trigger = 'not set'

         if self.study_file["file_type"] == 'AnnData':
             self.is_reference_anndata = self.study_file.get("ann_data_file_info", {}).get("reference_file")
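Note: for context, a minimal sketch (not part of this diff) of the Mixpanel properties dict that MetricProperties.get_properties() could now return for a reference AnnData ingest. Only the keys visible in this diff are shown; the values and the elided keys are hypothetical placeholders.

# Hypothetical illustration: properties after config.init(..., action="ingest_anndata")
example_properties = {
    "distinct_id": "2f30ec50-a04d-4d43-8fd1-b136a2045079",  # generic write-only token
    # ... study and study-file properties not shown in this diff ...
    "trigger": "upload",
    "logger": "ingest-pipeline",
    "appId": "single-cell-portal",
    "action": "ingest_anndata",    # new: derived from the CLI action argument
    "referenceAnnDataFile": True,  # new: merged in only for AnnData study files
}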
13 changes: 12 additions & 1 deletion ingest/ingest_pipeline.py
@@ -67,7 +67,7 @@
 python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
 # Differential expression analysis (h5ad matrix)
-python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type h5ad --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
+python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression
 """
 import json
@@ -124,6 +124,12 @@ class IngestPipeline:
         "../schema/alexandria_convention/alexandria_convention_schema.json"
     )

+    # array of actions to use when reporting to Mixpanel
+    ACTION_NAMES = [
+        'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample',
+        'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes'
+    ]
+
     # Logger provides more details for trouble shooting
     dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
     user_logger = setup_logger(__name__ + ".usr", "user_log.txt", level=logging.ERROR)
@@ -786,6 +792,10 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
             sys.exit(os.EX_DATAERR)
         sys.exit(1)

+def get_action_from_args(arguments):
+    """Get the action from list of arguments denoting which data type is being ingested/extracted"""
+    action = list(set(IngestPipeline.ACTION_NAMES) & set(arguments))
+    return action[0] if action else ""

 def main() -> None:
     """Enables running Ingest Pipeline via CLI
@@ -811,6 +821,7 @@ def main() -> None:
         arguments["study_id"],
         arguments["study_file_id"],
         arguments["user_metrics_uuid"],
+        get_action_from_args(arguments)
     )
     ingest = IngestPipeline(**arguments)
     status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args)
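Note: a usage sketch of the new helper (a standalone copy of the names from the diff above, not an excerpt from the repo). Because it takes a set intersection with ACTION_NAMES, at most one action is reported per invocation, and ordering would not be guaranteed if several action names ever appeared in one command line.

ACTION_NAMES = [
    'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample',
    'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes'
]

def get_action_from_args(arguments):
    """Return the single action name found among CLI arguments, or ''."""
    action = list(set(ACTION_NAMES) & set(arguments))
    return action[0] if action else ""

print(get_action_from_args(["--study-id", "addedfeed000000000000000", "ingest_anndata"]))  # ingest_anndata
print(get_action_from_args(["foo", "bar"]))  # '' (no action name present)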
151 changes: 151 additions & 0 deletions ingest/validation/minify_ontologies.py
@@ -0,0 +1,151 @@
"""Minifies ontologies used in EBI OLS, to enable instant ontology validation
This converts ~224 MB in ontology JSON files into 2 MB TSV.GZs at build-time.
The 2 MB compressed ontologies can then be retrieved at runtime.
Only IDs, labels, and synonyms are retained from the original ontologies.
Example:
cd ingest/validation
python minify_ontologies.py
"""

import argparse
import json
import urllib.request
from pathlib import Path
import gzip

MONDO_URL = 'https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.json'
PATO_URL = 'https://github.com/pato-ontology/pato/raw/master/pato.json'
NCBITAXON_URL = 'https://github.com/obophenotype/ncbitaxon/releases/latest/download/taxslim.json'
EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json'
UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json'
CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json'

ONTOLOGY_JSON_URLS = {
    'disease': [MONDO_URL, PATO_URL],
    'species': [NCBITAXON_URL],
    'library_preparation_protocol': [EFO_URL],
    'organ': [UBERON_URL],
    'cell_type': [CL_URL]
}

def fetch(url, use_cache=True):
    """Request remote resource, read local cache if available
    """
    filename = url.split('/')[-1]
    if use_cache == False or (use_cache and not Path(filename).is_file()):
        with urllib.request.urlopen(url) as f:
            content = f.read()
        if use_cache:
            with open(filename, 'wb') as f:
                f.write(content)
    else:
        with open(filename) as f:
            content = f.read()
    return [content, filename]

def fetch_ontologies(ontology_json_urls, use_cache=True):
    """Retrieve ontology JSON and JSON filename for required ontology
    """
    ontologies = {}
    for annotation in ontology_json_urls:
        ontology_urls = ontology_json_urls[annotation]
        ontologies[annotation] = []
        for ontology_url in ontology_urls:
            print(f'Fetch ontology: {ontology_url}')
            raw_ontology, filename = fetch(ontology_url, use_cache)
            ontology_json = json.loads(raw_ontology)
            ontologies[annotation].append([ontology_json, filename])
    return ontologies

def get_synonyms(node, label):
    """Get related and exact synonyms for an ontology node
    """
    if 'meta' not in node or 'synonyms' not in node['meta']:
        return ''

    raw_synonyms = []
    synonym_nodes = node['meta']['synonyms']
    for synonym_node in synonym_nodes:
        if 'val' not in synonym_node:
            # Handles e.g. incomplete EFO synonym nodes
            continue
        raw_synonym = synonym_node['val']
        if (
            not raw_synonym.startswith('obsolete ') and  # Omit obsolete synonyms
            raw_synonym != label  # Omit synonyms that are redundant with label
        ):
            raw_synonyms.append(raw_synonym)
    synonyms = '||'.join(raw_synonyms)  # Unambiguously delimit synonyms
    return synonyms

def minify(ontology_json, filename):
    """Convert full ontology JSON into a minimal gzipped TSV, write to disk
    """
    ontology_shortname = filename.split('.json')[0]
    if ontology_shortname == 'taxslim':
        ontology_shortname = 'ncbitaxon'
    ontology_shortname_uc = ontology_shortname.upper()
    graph_nodes = ontology_json['graphs'][0]['nodes']

    raw_nodes = list(filter(
        lambda n: f'/{ontology_shortname_uc}_' in n['id'].upper() and 'lbl' in n,
        graph_nodes
    ))

    all_nodes = list(map(
        lambda n: (
            [n['id'].split('/')[-1], n['lbl'], get_synonyms(n, n['lbl'])]
        ), raw_nodes
    ))

    # Remove obsolete labels
    nodes = list(filter(
        lambda n: not n[1].startswith('obsolete '),
        all_nodes
    ))

    tsv_content = '\n'.join(
        map(lambda n: '\t'.join(n), nodes)
    )
    compressed_tsv_content = gzip.compress(tsv_content.encode())

    output_filename = f'ontologies/{ontology_shortname}.min.tsv.gz'
    with open(output_filename, 'wb') as f:
        f.write(compressed_tsv_content)
    print(f'Wrote {output_filename}')


class OntologyMinifier:

    def __init__(self, annotations=None, use_cache=True):
        # Enable minifying incomplete set of ontologies, e.g. for testing
        if annotations:
            ontology_json_urls = {}
            for annotation in annotations:
                ontology_json_urls[annotation] = ONTOLOGY_JSON_URLS[annotation]
        else:
            ontology_json_urls = ONTOLOGY_JSON_URLS

        ontologies = fetch_ontologies(ontology_json_urls, use_cache)
        for annotation in ontologies:
            for conf in ontologies[annotation]:
                ontology_json, filename = conf
                minify(ontology_json, filename)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--use-cache",
        help=(
            "Whether to use previously-downloaded raw ontologies"
        ),
        action="store_true"
    )
    args = parser.parse_args()
    use_cache = args.use_cache
    OntologyMinifier(None, use_cache)
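Note: rows written by minify() have the form "<term ID>\t<label>\t<synonym1>||<synonym2>||...". Below is a minimal sketch (not part of this diff) of how such a *.min.tsv.gz file could be read back into a lookup dict at runtime; the function name and return shape are assumptions for illustration.

import gzip

def load_minified_ontology(path):
    """Load a *.min.tsv.gz file into {term_id: (label, [synonyms])}."""
    lookup = {}
    with gzip.open(path, 'rt') as f:
        for line in f:
            columns = line.rstrip('\n').split('\t')
            term_id, label = columns[0], columns[1]
            synonyms = columns[2].split('||') if len(columns) > 2 and columns[2] else []
            lookup[term_id] = (label, synonyms)
    return lookup

# Example: cell_types = load_minified_ontology('ontologies/cl.min.tsv.gz')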
Binary file added ingest/validation/ontologies/cl.min.tsv.gz
Binary file added ingest/validation/ontologies/efo.min.tsv.gz
Binary file added ingest/validation/ontologies/mondo.min.tsv.gz
Binary file added ingest/validation/ontologies/ncbitaxon.min.tsv.gz
Binary file added ingest/validation/ontologies/pato.min.tsv.gz
Binary file added ingest/validation/ontologies/uberon.min.tsv.gz
1 change: 1 addition & 0 deletions ingest/validation/ontologies/version.txt
@@ -0,0 +1 @@
1726600528 # validation cache key
20 changes: 20 additions & 0 deletions tests/test_ingest.py
@@ -47,6 +47,7 @@
     IngestPipeline,
     exit_pipeline,
     run_ingest,
+    get_action_from_args
 )
 from expression_files.expression_files import GeneExpression

@@ -793,6 +794,25 @@ def test_extract_processed_matrix_from_anndata(self):
             except:
                 print(f"Error while deleting file : {file}")

+    def test_get_action_from_args(self):
+        args = [
+            "--study-id",
+            "5d276a50421aa9117c982845",
+            "--study-file-id",
+            "5dd5ae25421aa910a723a337",
+            "ingest_subsample",
+            "--cluster-file",
+            "../tests/data/good_subsample_cluster.csv",
+            "--name",
+            "cluster1",
+            "--cell-metadata-file",
+            "../tests/data/test_cell_metadata.csv",
+            "--subsample",
+        ]
+        self.assertEqual("ingest_subsample", get_action_from_args(args))
+        bad_args = ["foo", "bar", "bing"]
+        self.assertEqual("", get_action_from_args(bad_args))
+

 if __name__ == "__main__":
     unittest.main()
