
Commit

Merge pull request #367 from broadinstitute/development
Release 1.36.0
bistline authored Oct 22, 2024
2 parents 4631af6 + 4190761 commit aa12131
Showing 15 changed files with 368 additions and 12 deletions.
92 changes: 92 additions & 0 deletions .github/workflows/minify_ontologies.yml
@@ -0,0 +1,92 @@
name: Minify ontologies

on:
  pull_request:
    types: [opened] # Only trigger on PR "opened" event
  # push: # Uncomment, update branches to develop / debug
  #   branches:
  #     jb-anndata-mixpanel-props

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Copy and decompress ontologies in repo
        run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz

      - name: Minify newest ontologies
        run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz

      - name: Diff and commit changes
        run: |
          #!/bin/bash
          # Revert the default `set -e` in GitHub Actions, to e.g. ensure
          # "diff" doesn't throw an error when something is found
          set +e
          # set -x # Enable debugging
          cd ingest/validation/ontologies
          # Define directories
          SOURCE_DIR="."
          TMP_DIR="tmp"
          # Ensure TMP_DIR exists
          if [ ! -d "$TMP_DIR" ]; then
            echo "Temporary directory $TMP_DIR does not exist."
            exit 1
          fi
          # Flag to track if there are any changes
          CHANGES_DETECTED=false
          # Find and diff files
          for FILE in $(find "$SOURCE_DIR" -type f -name "*.min.tsv"); do
            # Get the base name of the file
            BASENAME=$(basename "$FILE")
            # Construct the path to the corresponding file in the TMP_DIR
            TMP_FILE="$TMP_DIR/$BASENAME"
            # Check if the corresponding file exists in TMP_DIR
            if [ -f "$TMP_FILE" ]; then
              # Run the diff command
              echo "Diffing $FILE and $TMP_FILE"
              diff "$FILE" "$TMP_FILE" > diff_output.txt
              # Check if diff output is not empty
              if [ -s diff_output.txt ]; then
                echo "Differences found in $BASENAME"
                cat diff_output.txt
                # Copy the updated file to the source directory (if needed)
                cp "$TMP_FILE" "$FILE"
                # Mark that changes have been detected
                CHANGES_DETECTED=true
                # Stage the modified file
                git add "$FILE".gz
              else
                echo "No differences in $BASENAME"
              fi
            else
              echo "No corresponding file found in $TMP_DIR for $BASENAME"
            fi
          done
          if [ "$CHANGES_DETECTED" = true ]; then
            # Update version to signal downstream caches should update
            echo "$(date +%s) # validation cache key" > version.txt
            git add version.txt
            # Configure Git
            git config --global user.name "github-actions"
            git config --global user.email "[email protected]"
            # Commit changes
            git commit -m "Update minified ontologies via GitHub Actions"
            git push origin ${{ github.ref_name }}
          else
            echo "No changes to commit."
          fi
23 changes: 15 additions & 8 deletions ingest/config.py
@@ -10,12 +10,12 @@
 MONGO_CONNECTION = MongoConnection()


-def init(study_id, study_file_id, user_metric_uuid=None):
+def init(study_id, study_file_id, user_metric_uuid=None, action=None):
     global __metric_properties

     study = Study(study_id)
     study_file = StudyFile(study_file_id)
-    __metric_properties = MetricProperties(study, study_file, user_metric_uuid)
+    __metric_properties = MetricProperties(study, study_file, user_metric_uuid, action)


 def set_parent_event_name(event_name):
@@ -39,7 +39,7 @@ class MetricProperties:
     # This is a generic write-only log token, not a secret
     USER_ID = "2f30ec50-a04d-4d43-8fd1-b136a2045079"

-    def __init__(self, study, study_file, user_uuid=None):
+    def __init__(self, study, study_file, user_uuid=None, action=None):
         distinct_id = user_uuid if user_uuid else MetricProperties.USER_ID
         self.__properties = {
             "distinct_id": distinct_id,
@@ -50,7 +50,11 @@ def __init__(self, study, study_file, user_uuid=None):
             "trigger": study_file.trigger,
             "logger": "ingest-pipeline",
             "appId": "single-cell-portal",
+            "action": action
         }
+        # merge in referenceAnnDataFile if necessary
+        if study_file.file_type == 'AnnData':
+            self.__properties["referenceAnnDataFile"] = study_file.is_reference_anndata

     def get_properties(self):
         return self.__properties
@@ -171,12 +175,15 @@ def study_file(self, study_file_id):
         self.file_type = self.study_file["file_type"]
         self.file_size = self.study_file["upload_file_size"]
         self.file_name = self.study_file["name"]
+        upload_trigger = self.study_file.get("options", {}).get("upload_trigger")
         # when set, remote_location is the name of the file in the bucket
-        if self.study_file.get("remote_location") is not None:
-            if self.study_file["remote_location"] == "":
-                self.trigger = 'upload'
-            else:
-                self.trigger = 'sync'
+        if upload_trigger is not None:
+            self.trigger = upload_trigger
+        elif self.study_file["remote_location"] is not None:
+            self.trigger = 'upload' if self.study_file["remote_location"] == "" else 'sync'
+        # indicate trigger state for tests/mocks
+        else:
+            self.trigger = 'not set'

         if self.study_file["file_type"] == 'AnnData':
             self.is_reference_anndata = self.study_file.get("ann_data_file_info", {}).get("reference_file")
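Note: for context, a minimal sketch (not part of this diff) of the Mixpanel properties dict that MetricProperties.get_properties() could now return for a reference AnnData ingest. Only the keys visible in this diff are shown; the values and the elided keys are hypothetical placeholders.

# Hypothetical illustration: properties after config.init(..., action="ingest_anndata")
example_properties = {
    "distinct_id": "2f30ec50-a04d-4d43-8fd1-b136a2045079",  # generic write-only token
    # ... study and study-file properties not shown in this diff ...
    "trigger": "upload",
    "logger": "ingest-pipeline",
    "appId": "single-cell-portal",
    "action": "ingest_anndata",    # new: derived from the CLI action argument
    "referenceAnnDataFile": True,  # new: merged in only for AnnData study files
}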
13 changes: 12 additions & 1 deletion ingest/ingest_pipeline.py
@@ -67,7 +67,7 @@
 python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
 # Differential expression analysis (h5ad matrix)
-python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type h5ad --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
+python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression
 """
 import json
@@ -124,6 +124,12 @@ class IngestPipeline:
         "../schema/alexandria_convention/alexandria_convention_schema.json"
     )

+    # array of actions to use when reporting to Mixpanel
+    ACTION_NAMES = [
+        'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample',
+        'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes'
+    ]
+
     # Logger provides more details for trouble shooting
     dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
     user_logger = setup_logger(__name__ + ".usr", "user_log.txt", level=logging.ERROR)
@@ -786,6 +792,10 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
             sys.exit(os.EX_DATAERR)
         sys.exit(1)

+def get_action_from_args(arguments):
+    """Get the action from list of arguments denoting which data type is being ingested/extracted"""
+    action = list(set(IngestPipeline.ACTION_NAMES) & set(arguments))
+    return action[0] if action else ""

 def main() -> None:
     """Enables running Ingest Pipeline via CLI
@@ -811,6 +821,7 @@ def main() -> None:
         arguments["study_id"],
         arguments["study_file_id"],
         arguments["user_metrics_uuid"],
+        get_action_from_args(arguments)
     )
     ingest = IngestPipeline(**arguments)
     status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args)
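Note: a usage sketch of the new helper (a standalone copy of the names from the diff above, not an excerpt from the repo). Because it takes a set intersection with ACTION_NAMES, at most one action is reported per invocation, and ordering would not be guaranteed if several action names ever appeared in one command line.

ACTION_NAMES = [
    'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample',
    'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes'
]

def get_action_from_args(arguments):
    """Return the single action name found among CLI arguments, or ''."""
    action = list(set(ACTION_NAMES) & set(arguments))
    return action[0] if action else ""

print(get_action_from_args(["--study-id", "addedfeed000000000000000", "ingest_anndata"]))  # ingest_anndata
print(get_action_from_args(["foo", "bar"]))  # '' (no action name present)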
151 changes: 151 additions & 0 deletions ingest/validation/minify_ontologies.py
@@ -0,0 +1,151 @@
"""Minifies ontologies used in EBI OLS, to enable instant ontology validation
This converts ~224 MB in ontology JSON files into 2 MB TSV.GZs at build-time.
The 2 MB compressed ontologies can then be retrieved at runtime.
Only IDs, labels, and synonyms are retained from the original ontologies.
Example:
cd ingest/validation
python minify_ontologies.py
"""

import argparse
import json
import urllib.request
from pathlib import Path
import gzip

MONDO_URL = 'https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.json'
PATO_URL = 'https://github.com/pato-ontology/pato/raw/master/pato.json'
NCBITAXON_URL = 'https://github.com/obophenotype/ncbitaxon/releases/latest/download/taxslim.json'
EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json'
UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json'
CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json'

ONTOLOGY_JSON_URLS = {
    'disease': [MONDO_URL, PATO_URL],
    'species': [NCBITAXON_URL],
    'library_preparation_protocol': [EFO_URL],
    'organ': [UBERON_URL],
    'cell_type': [CL_URL]
}

def fetch(url, use_cache=True):
    """Request remote resource, read local cache if available
    """
    filename = url.split('/')[-1]
    if use_cache == False or (use_cache and not Path(filename).is_file()):
        with urllib.request.urlopen(url) as f:
            content = f.read()
        if use_cache:
            with open(filename, 'wb') as f:
                f.write(content)
    else:
        with open(filename) as f:
            content = f.read()
    return [content, filename]

def fetch_ontologies(ontology_json_urls, use_cache=True):
    """Retrieve ontology JSON and JSON filename for required ontology
    """
    ontologies = {}
    for annotation in ontology_json_urls:
        ontology_urls = ontology_json_urls[annotation]
        ontologies[annotation] = []
        for ontology_url in ontology_urls:
            print(f'Fetch ontology: {ontology_url}')
            raw_ontology, filename = fetch(ontology_url, use_cache)
            ontology_json = json.loads(raw_ontology)
            ontologies[annotation].append([ontology_json, filename])
    return ontologies

def get_synonyms(node, label):
    """Get related and exact synonyms for an ontology node
    """
    if 'meta' not in node or 'synonyms' not in node['meta']:
        return ''

    raw_synonyms = []
    synonym_nodes = node['meta']['synonyms']
    for synonym_node in synonym_nodes:
        if 'val' not in synonym_node:
            # Handles e.g. incomplete EFO synonym nodes
            continue
        raw_synonym = synonym_node['val']
        if (
            not raw_synonym.startswith('obsolete ') and  # Omit obsolete synonyms
            raw_synonym != label  # Omit synonyms that are redundant with label
        ):
            raw_synonyms.append(raw_synonym)
    synonyms = '||'.join(raw_synonyms)  # Unambiguously delimit synonyms
    return synonyms

def minify(ontology_json, filename):
    """Convert full ontology JSON into a minimal gzipped TSV, write to disk
    """
    ontology_shortname = filename.split('.json')[0]
    if ontology_shortname == 'taxslim':
        ontology_shortname = 'ncbitaxon'
    ontology_shortname_uc = ontology_shortname.upper()
    graph_nodes = ontology_json['graphs'][0]['nodes']

    raw_nodes = list(filter(
        lambda n: f'/{ontology_shortname_uc}_' in n['id'].upper() and 'lbl' in n,
        graph_nodes
    ))

    all_nodes = list(map(
        lambda n: (
            [n['id'].split('/')[-1], n['lbl'], get_synonyms(n, n['lbl'])]
        ), raw_nodes
    ))

    # Remove obsolete labels
    nodes = list(filter(
        lambda n: not n[1].startswith('obsolete '),
        all_nodes
    ))

    tsv_content = '\n'.join(
        map(lambda n: '\t'.join(n), nodes)
    )
    compressed_tsv_content = gzip.compress(tsv_content.encode())

    output_filename = f'ontologies/{ontology_shortname}.min.tsv.gz'
    with open(output_filename, 'wb') as f:
        f.write(compressed_tsv_content)
    print(f'Wrote {output_filename}')


class OntologyMinifier:

    def __init__(self, annotations=None, use_cache=True):
        # Enable minifying incomplete set of ontologies, e.g. for testing
        if annotations:
            ontology_json_urls = {}
            for annotation in annotations:
                ontology_json_urls[annotation] = ONTOLOGY_JSON_URLS[annotation]
        else:
            ontology_json_urls = ONTOLOGY_JSON_URLS

        ontologies = fetch_ontologies(ontology_json_urls, use_cache)
        for annotation in ontologies:
            for conf in ontologies[annotation]:
                ontology_json, filename = conf
                minify(ontology_json, filename)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--use-cache",
        help=(
            "Whether to use previously-downloaded raw ontologies"
        ),
        action="store_true"
    )
    args = parser.parse_args()
    use_cache = args.use_cache
    OntologyMinifier(None, use_cache)
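Note: rows written by minify() have the form "<term ID>\t<label>\t<synonym1>||<synonym2>||...". Below is a minimal sketch (not part of this diff) of how such a *.min.tsv.gz file could be read back into a lookup dict at runtime; the function name and return shape are assumptions for illustration.

import gzip

def load_minified_ontology(path):
    """Load a *.min.tsv.gz file into {term_id: (label, [synonyms])}."""
    lookup = {}
    with gzip.open(path, 'rt') as f:
        for line in f:
            columns = line.rstrip('\n').split('\t')
            term_id, label = columns[0], columns[1]
            synonyms = columns[2].split('||') if len(columns) > 2 and columns[2] else []
            lookup[term_id] = (label, synonyms)
    return lookup

# Example: cell_types = load_minified_ontology('ontologies/cl.min.tsv.gz')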
Binary file added ingest/validation/ontologies/cl.min.tsv.gz
Binary file added ingest/validation/ontologies/efo.min.tsv.gz
Binary file added ingest/validation/ontologies/mondo.min.tsv.gz
Binary file added ingest/validation/ontologies/ncbitaxon.min.tsv.gz
Binary file added ingest/validation/ontologies/pato.min.tsv.gz
Binary file added ingest/validation/ontologies/uberon.min.tsv.gz
1 change: 1 addition & 0 deletions ingest/validation/ontologies/version.txt
@@ -0,0 +1 @@
1726600528 # validation cache key
20 changes: 20 additions & 0 deletions tests/test_ingest.py
@@ -47,6 +47,7 @@
     IngestPipeline,
     exit_pipeline,
     run_ingest,
+    get_action_from_args
 )
 from expression_files.expression_files import GeneExpression

@@ -793,6 +794,25 @@ def test_extract_processed_matrix_from_anndata(self):
             except:
                 print(f"Error while deleting file : {file}")

+    def test_get_action_from_args(self):
+        args = [
+            "--study-id",
+            "5d276a50421aa9117c982845",
+            "--study-file-id",
+            "5dd5ae25421aa910a723a337",
+            "ingest_subsample",
+            "--cluster-file",
+            "../tests/data/good_subsample_cluster.csv",
+            "--name",
+            "cluster1",
+            "--cell-metadata-file",
+            "../tests/data/test_cell_metadata.csv",
+            "--subsample",
+        ]
+        self.assertEqual("ingest_subsample", get_action_from_args(args))
+        bad_args = ["foo", "bar", "bing"]
+        self.assertEqual("", get_action_from_args(bad_args))
+

 if __name__ == "__main__":
     unittest.main()
