From 0b20fdf9d8e0e842735bdf9c07faf3dd944a4a9a Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Tue, 8 Aug 2023 18:00:55 -0400 Subject: [PATCH 01/31] created cg-gnn extraction script --- spatialprofilingtoolbox/cggnn/__init__.py | 2 + spatialprofilingtoolbox/cggnn/extract.py | 57 +++++++++++++++ .../cggnn/scripts/extract.py | 48 +++++++++++++ spatialprofilingtoolbox/cggnn/scripts/run.py | 69 ++++--------------- .../cggnn/scripts/run_sql.py | 3 +- 5 files changed, 121 insertions(+), 58 deletions(-) create mode 100644 spatialprofilingtoolbox/cggnn/extract.py create mode 100644 spatialprofilingtoolbox/cggnn/scripts/extract.py diff --git a/spatialprofilingtoolbox/cggnn/__init__.py b/spatialprofilingtoolbox/cggnn/__init__.py index 4be301fc3..d3dc32f9e 100644 --- a/spatialprofilingtoolbox/cggnn/__init__.py +++ b/spatialprofilingtoolbox/cggnn/__init__.py @@ -1,2 +1,4 @@ """Cell-graph graph neural network functionality.""" __version__ = '0.2.1' + +from spatialprofilingtoolbox.cggnn.extract import extract_cggnn_data diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py new file mode 100644 index 000000000..03a8e1b9f --- /dev/null +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -0,0 +1,57 @@ +"""Extract information cg-gnn needs from SPT.""" + +from typing import Any + +from pandas import DataFrame, concat, merge # type: ignore +from numpy import sort # type: ignore + +from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor + + +def _create_cell_df( + dfs_by_specimen: dict[str, DataFrame], + feature_names: dict[str, str], +) -> DataFrame: + """Find simple and complex phenotypes, and locations and merge into a DataFrame.""" + feature_ids_to_names = {ft_id: 'FT_' + ft_name for ft_id, ft_name in feature_names.items()} + for specimen, df_specimen in dfs_by_specimen.items(): + df_specimen.rename(feature_ids_to_names, axis=1, inplace=True) + # TODO: Create complex phenotype columns + df_specimen.rename({'pixel x': 'center_x', 'pixel y': 'center_y'}, + axis=1, inplace=True) + df_specimen['specimen'] = specimen + + # TODO: Reorder so that it's simple phenotype, specimen, complex phenotype, x, y + df = concat(dfs_by_specimen.values(), axis=0) + df.index.name = 'histological_structure' + return df + + +def _create_label_df( + df_assignments: DataFrame, + df_strata: DataFrame, +) -> tuple[DataFrame, dict[int, str]]: + """Get slide-level results.""" + df = merge(df_assignments, df_strata, on='stratum identifier', how='left')[ + ['specimen', 'subject diagnosed result']].rename( + {'specimen': 'slide', 'subject diagnosed result': 'result'}, axis=1) + label_to_result = dict(enumerate(sort(df['result'].unique()))) + return df.replace({res: i for i, res in label_to_result.items()}), label_to_result + + +def extract_cggnn_data( + spt_db_config_location: str, + study: str, +) -> tuple[DataFrame, DataFrame, dict[int, str]]: + """Extract information cg-gnn needs from SPT.""" + extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location) + study_data: dict[str, Any] = extractor.extract(study=study) + df_cell = _create_cell_df( + {slide: data['dataframe'] for slide, data in study_data['feature matrices'].items()}, + study_data['channel symbols by column name'] + ) + df_label, label_to_result_text = _create_label_df( + study_data['sample cohorts']['assignments'], + study_data['sample cohorts']['strata'] + ) + return df_cell, df_label, label_to_result_text diff --git a/spatialprofilingtoolbox/cggnn/scripts/extract.py 
b/spatialprofilingtoolbox/cggnn/scripts/extract.py new file mode 100644 index 000000000..67f49aa0f --- /dev/null +++ b/spatialprofilingtoolbox/cggnn/scripts/extract.py @@ -0,0 +1,48 @@ +"""Extract information cg-gnn needs from SPT and save to file.""" + +from argparse import ArgumentParser +from os.path import join, exists +from json import dump + +from spatialprofilingtoolbox.cggnn import extract_cggnn_data + + +def parse_arguments(): + """Process command line arguments.""" + parser = ArgumentParser( + prog='spt cggnn extract', + description='Extract information cg-gnn needs from SPT and save to file.' + ) + parser.add_argument( + '--spt_db_config_location', + type=str, + help='File location for SPT DB config file.', + required=True + ) + parser.add_argument( + '--study', + type=str, + help='Name of the study to query data for in SPT.', + required=True + ) + parser.add_argument( + '--output_location', + type=str, + help='Directory to save extracted data to.', + required=True + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_arguments() + df_cell, df_label, label_to_result = extract_cggnn_data(args.spt_db_config_location, args.study) + + assert isinstance(args.output_location, str) + dict_filename = join(args.output_location, 'label_to_results.json') + cells_filename = join(args.output_location, 'cells.h5') + labels_filename = join(args.output_location, 'labels.h5') + if not (exists(dict_filename) and exists(cells_filename) and exists(labels_filename)): + df_cell.to_hdf(cells_filename, 'cells') + df_label.to_hdf(labels_filename, 'labels') + dump(label_to_result, open(dict_filename, 'w', encoding='utf-8')) diff --git a/spatialprofilingtoolbox/cggnn/scripts/run.py b/spatialprofilingtoolbox/cggnn/scripts/run.py index deb6387e9..bcaf133dd 100644 --- a/spatialprofilingtoolbox/cggnn/scripts/run.py +++ b/spatialprofilingtoolbox/cggnn/scripts/run.py @@ -1,16 +1,13 @@ -"Run through the entire SPT CG-GNN pipeline using a local db config." +"""Run through the entire SPT CG-GNN pipeline using a local db config.""" + from argparse import ArgumentParser from os.path import join -from pandas import DataFrame from pandas import read_csv -from pandas import concat, merge -from numpy import sort -from spatialprofilingtoolbox.db.database_connection import DatabaseConnectionMaker -from spatialprofilingtoolbox.db.database_connection import DBCredentials +from spatialprofilingtoolbox.cggnn import extract_cggnn_data +from spatialprofilingtoolbox.db.database_connection import DatabaseConnectionMaker, DBCredentials from spatialprofilingtoolbox.db.importance_score_transcriber import transcribe_importance -from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor from spatialprofilingtoolbox.standalone_utilities.module_load_error import SuggestExtrasException try: from cggnn.run_all import run_with_dfs @@ -19,7 +16,7 @@ def parse_arguments(): - "Process command line arguments." + """Process command line arguments.""" parser = ArgumentParser( prog='spt cggnn run', description='Create cell graphs from SPT tables saved locally, train a graph neural ' @@ -120,42 +117,10 @@ def parse_arguments(): return parser.parse_args() -def _create_cell_df(cell_dfs: dict[str, DataFrame], feature_names: dict[str, str]) -> DataFrame: - "Find chemical species, phenotypes, and locations and merge into a DataFrame." 
- - for specimen, df_specimen in cell_dfs.items(): - df_specimen.rename({ft_id: 'FT_' + ft_name for ft_id, ft_name in feature_names.items()}, - axis=1, inplace=True) - # TODO: Create phenotype columns - df_specimen.rename({'pixel x': 'center_x', 'pixel y': 'center_y'}, - axis=1, inplace=True) - df_specimen['specimen'] = specimen - - # TODO: Reorder so that it's feature, specimen, phenotype, xy - # TODO: Verify histological structure ID or recreate one - df = concat(cell_dfs.values(), axis=0) - df.index.name = 'histological_structure' - return df - - -def _create_label_df(df_assignments: DataFrame, - df_strata: DataFrame) -> tuple[DataFrame, dict[int, str]]: - """Get slide-level results.""" - df = merge(df_assignments, df_strata, on='stratum identifier', how='left')[ - ['specimen', 'subject diagnosed result']].rename( - {'specimen': 'slide', 'subject diagnosed result': 'result'}, axis=1) - label_to_result = dict(enumerate(sort(df['result'].unique()))) - return df.replace({res: i for i, res in label_to_result.items()}), label_to_result - - -def retrieve_importances() -> DataFrame: - filename = join('out', 'importances.csv') - return read_csv(filename, index_col=0) - - -def save_importances(_args): - df = retrieve_importances() - credentials = DBCredentials(_args.dbname, _args.host, _args.user, _args.password) +def save_importance(dbname: str, host: str, user: str, password: str) -> None: + """Save cell importance scores as defined by cggnn to the database.""" + df = read_csv(join('out', 'importances.csv'), index_col=0) + credentials = DBCredentials(dbname, host, user, password) connection = DatabaseConnectionMaker.make_connection(credentials) transcribe_importance(df, connection) connection.close() @@ -163,20 +128,10 @@ def save_importances(_args): if __name__ == "__main__": args = parse_arguments() - extractor = FeatureMatrixExtractor(database_config_file=args.spt_db_config_location) - study_data: dict[str, dict] = extractor.extract(study=args.study) - - df_cell = _create_cell_df( - {slide: data['dataframe'] - for slide, data in study_data['feature matrices'].items()}, - study_data['channel symbols by column name']) - df_label, label_to_result_text = _create_label_df( - study_data['sample cohorts']['assignments'], - study_data['sample cohorts']['strata']) - + df_cell, df_label, label_to_result = extract_cggnn_data(args.spt_db_config_location, args.study) run_with_dfs(df_cell, df_label, - label_to_result_text, + label_to_result, args.validation_data_percent, args.test_data_percent, args.roi_side_length, @@ -189,4 +144,4 @@ def save_importances(_args): args.merge_rois, args.prune_misclassified) - save_importances(args) + save_importance(args.dbname, args.host, args.user, args.password) diff --git a/spatialprofilingtoolbox/cggnn/scripts/run_sql.py b/spatialprofilingtoolbox/cggnn/scripts/run_sql.py index a0ab4a078..8244df941 100644 --- a/spatialprofilingtoolbox/cggnn/scripts/run_sql.py +++ b/spatialprofilingtoolbox/cggnn/scripts/run_sql.py @@ -1,4 +1,5 @@ "Run through the entire SPT CG-GNN pipeline, starting from a SPT SQL database." + from argparse import ArgumentParser from spatialprofilingtoolbox.standalone_utilities.module_load_error import SuggestExtrasException try: @@ -8,7 +9,7 @@ def parse_arguments(): - "Process command line arguments." 
+ """Process command line arguments.""" parser = ArgumentParser( prog='spt cggnn run_sql', description='Create cell graphs from SPT SQL tables, train a graph neural network on ' From ae2170d443e02ad2311bd2c4ba2030c83fd9d961 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Wed, 9 Aug 2023 17:24:01 -0400 Subject: [PATCH 02/31] added a quick test for cggnn extract --- test/cggnn/unit_tests/test_extract.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 test/cggnn/unit_tests/test_extract.sh diff --git a/test/cggnn/unit_tests/test_extract.sh b/test/cggnn/unit_tests/test_extract.sh new file mode 100644 index 000000000..399d4a542 --- /dev/null +++ b/test/cggnn/unit_tests/test_extract.sh @@ -0,0 +1,13 @@ +spt cggnn extract \ + --spt_db_config_location ../db/.spt_db.config.container \ + --study "Melanoma intralesional IL2" \ + --output_location . +status=$([ $? ] && [ -e "label_to_results.json" ] && [ -e "cells.h5" ] && [ -e "labels.h5" ]) +[ $status -eq 0 ] || echo "cggnn extract failed." + +if [ $status -eq 0 ]; +then + exit 0 +else + exit 1 +fi From ad8e61b0556fd09fde76090373636921da306989 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Fri, 11 Aug 2023 19:07:41 -0400 Subject: [PATCH 03/31] hotfix importance saving --- spatialprofilingtoolbox/cggnn/scripts/run.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spatialprofilingtoolbox/cggnn/scripts/run.py b/spatialprofilingtoolbox/cggnn/scripts/run.py index bcaf133dd..e9b6c62a8 100644 --- a/spatialprofilingtoolbox/cggnn/scripts/run.py +++ b/spatialprofilingtoolbox/cggnn/scripts/run.py @@ -117,11 +117,10 @@ def parse_arguments(): return parser.parse_args() -def save_importance(dbname: str, host: str, user: str, password: str) -> None: +def save_importance(spt_db_config_location: str) -> None: """Save cell importance scores as defined by cggnn to the database.""" df = read_csv(join('out', 'importances.csv'), index_col=0) - credentials = DBCredentials(dbname, host, user, password) - connection = DatabaseConnectionMaker.make_connection(credentials) + connection = DatabaseConnectionMaker(spt_db_config_location).get_connection() transcribe_importance(df, connection) connection.close() @@ -143,5 +142,4 @@ def save_importance(dbname: str, host: str, user: str, password: str) -> None: args.explainer, args.merge_rois, args.prune_misclassified) - - save_importance(args.dbname, args.host, args.user, args.password) + save_importance(args.spt_db_config_location) From c002b6d19748d65ca5c0f242df72ee1eabc490fd Mon Sep 17 00:00:00 2001 From: James Mathews Date: Tue, 15 Aug 2023 15:37:35 -0400 Subject: [PATCH 04/31] Use updated pytorch image for Python 3.11 support, cggnn extract test passes. 
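
For reference, the artifacts that `spt cggnn extract` writes (and that the test
updated in this patch checks for) can be inspected from Python roughly as
follows. This is a sketch assuming the test's working directory and the HDF
keys used in scripts/extract.py; it is not part of the package:

    from json import load
    from pandas import read_hdf

    # Per-cell feature matrix and per-slide labels, as saved by `spt cggnn extract`.
    df_cell = read_hdf('cells.h5', 'cells')
    df_label = read_hdf('labels.h5', 'labels')

    # JSON object keys are strings, so restore the integer labels on load.
    with open('label_to_results.json', encoding='utf-8') as file:
        label_to_result = {int(label): result for label, result in load(file).items()}
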
--- build/cggnn/Dockerfile | 28 ++++++++++++++++-------- spatialprofilingtoolbox/cggnn/extract.py | 2 +- test/cggnn/unit_tests/test_extract.sh | 12 +++++++++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/build/cggnn/Dockerfile b/build/cggnn/Dockerfile index 9a45858d6..34b89e9c6 100644 --- a/build/cggnn/Dockerfile +++ b/build/cggnn/Dockerfile @@ -1,13 +1,23 @@ -FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime +FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && apt install -y gcc libpq-dev && rm -rf /var/lib/apt/lists/* +RUN apt update && apt install -y gcc libpq-dev WORKDIR /usr/src/app -RUN python -m pip install dgl-cu116 dglgo -f https://data.dgl.ai/wheels/repo.html -RUN python -m pip install psycopg2==2.9.6 -RUN python -m pip install adiscstudies==0.11.0 -RUN python -m pip install numba==0.57.0 -RUN python -m pip install attrs==23.1.0 -RUN python -m pip install cg-gnn +RUN apt install software-properties-common -y +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt update +RUN apt install python3.11 -y +RUN apt install python3.11-dev -y +RUN apt install python3.11-venv -y +RUN apt install python3.11-distutils -y +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 && python3.11 -m ensurepip +RUN python3.11 -m pip install dgl-cu117 dglgo -f https://data.dgl.ai/wheels/repo.html +RUN python3.11 -m pip install psycopg2==2.9.6 +RUN python3.11 -m pip install adiscstudies==0.11.0 +RUN python3.11 -m pip install numba==0.57.0 +RUN python3.11 -m pip install attrs==23.1.0 +# RUN python -m pip install cg-gnn +COPY cg_gnn-0.1.5-py3-none-any.whl ./ +RUN python3.11 -m pip install cg_gnn-0.1.5-py3-none-any.whl ARG version ARG service_name ARG WHEEL_FILENAME @@ -15,4 +25,4 @@ LABEL version=$version LABEL service_name=$service_name ENV service_name $service_name COPY $WHEEL_FILENAME ./ -RUN python -m pip install "$WHEEL_FILENAME" +RUN python3.11 -m pip install "$WHEEL_FILENAME" diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py index 03a8e1b9f..4555c3548 100644 --- a/spatialprofilingtoolbox/cggnn/extract.py +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -45,7 +45,7 @@ def extract_cggnn_data( ) -> tuple[DataFrame, DataFrame, dict[int, str]]: """Extract information cg-gnn needs from SPT.""" extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location) - study_data: dict[str, Any] = extractor.extract(study=study) + study_data = extractor.extract(study=study)[study] df_cell = _create_cell_df( {slide: data['dataframe'] for slide, data in study_data['feature matrices'].items()}, study_data['channel symbols by column name'] diff --git a/test/cggnn/unit_tests/test_extract.sh b/test/cggnn/unit_tests/test_extract.sh index 399d4a542..ced9a3e5e 100644 --- a/test/cggnn/unit_tests/test_extract.sh +++ b/test/cggnn/unit_tests/test_extract.sh @@ -2,9 +2,19 @@ spt cggnn extract \ --spt_db_config_location ../db/.spt_db.config.container \ --study "Melanoma intralesional IL2" \ --output_location . -status=$([ $? ] && [ -e "label_to_results.json" ] && [ -e "cells.h5" ] && [ -e "labels.h5" ]) +$([ $? -eq 0 ] && [ -e "label_to_results.json" ] && [ -e "cells.h5" ] && [ -e "labels.h5" ]) +status="$?" +echo "Status: $status" [ $status -eq 0 ] || echo "cggnn extract failed." 
+cat label_to_results.json +python3.11 -c 'import pandas as pd; print(pd.read_hdf("cells.h5"))' +python3.11 -c 'import pandas as pd; print(pd.read_hdf("labels.h5"))' + +rm label_to_results.json +rm cells.h5 +rm labels.h5 + if [ $status -eq 0 ]; then exit 0 From ccdf77124e8439725b2a2b41c1b000ae6c104da4 Mon Sep 17 00:00:00 2001 From: James Mathews Date: Wed, 16 Aug 2023 16:45:57 -0400 Subject: [PATCH 05/31] Use python3.11 directly in test case usage of pip, now that python3.11 is specially installed. --- test/cggnn/unit_tests/test_image_runs_properly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cggnn/unit_tests/test_image_runs_properly.sh b/test/cggnn/unit_tests/test_image_runs_properly.sh index f99a041ad..d20b37f52 100644 --- a/test/cggnn/unit_tests/test_image_runs_properly.sh +++ b/test/cggnn/unit_tests/test_image_runs_properly.sh @@ -1,4 +1,4 @@ -pip freeze | grep dgl +python3.11 -m pip freeze | grep dgl status=$? [ $status -eq 0 ] || echo "Docker image for cggnn did not build and run properly." From c9aab1b67cfc039150a1d8777b7133b868894f7a Mon Sep 17 00:00:00 2001 From: James Mathews Date: Wed, 16 Aug 2023 16:48:55 -0400 Subject: [PATCH 06/31] Same fix applied to second test. --- test/cggnn/module_tests/test_image_runs_properly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cggnn/module_tests/test_image_runs_properly.sh b/test/cggnn/module_tests/test_image_runs_properly.sh index f99a041ad..d20b37f52 100644 --- a/test/cggnn/module_tests/test_image_runs_properly.sh +++ b/test/cggnn/module_tests/test_image_runs_properly.sh @@ -1,4 +1,4 @@ -pip freeze | grep dgl +python3.11 -m pip freeze | grep dgl status=$? [ $status -eq 0 ] || echo "Docker image for cggnn did not build and run properly." From 61e855e94cb512212656d4dd4f68e8760c024a25 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Wed, 16 Aug 2023 17:26:14 -0400 Subject: [PATCH 07/31] undo cggnn docker change --- build/cggnn/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/build/cggnn/Dockerfile b/build/cggnn/Dockerfile index 34b89e9c6..739a7efb9 100644 --- a/build/cggnn/Dockerfile +++ b/build/cggnn/Dockerfile @@ -15,9 +15,7 @@ RUN python3.11 -m pip install psycopg2==2.9.6 RUN python3.11 -m pip install adiscstudies==0.11.0 RUN python3.11 -m pip install numba==0.57.0 RUN python3.11 -m pip install attrs==23.1.0 -# RUN python -m pip install cg-gnn -COPY cg_gnn-0.1.5-py3-none-any.whl ./ -RUN python3.11 -m pip install cg_gnn-0.1.5-py3-none-any.whl +RUN python3.11 -m pip install cg-gnn ARG version ARG service_name ARG WHEEL_FILENAME From bdda7d76b1bcba602b35434ae508f182d871abc3 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 21 Aug 2023 13:16:23 -0400 Subject: [PATCH 08/31] added pheno, removed multistudy to FME --- spatialprofilingtoolbox/cggnn/extract.py | 4 +- .../db/feature_matrix_extractor.py | 252 +++++++++--------- .../db/scripts/retrieve_feature_matrices.py | 40 +-- spatialprofilingtoolbox/db/squidpy_metrics.py | 4 +- .../workflow/common/sparse_matrix_puller.py | 6 +- .../workflow/common/structure_centroids.py | 33 ++- .../workflow/phenotype_proximity/core.py | 22 +- .../test_feature_matrix_extraction.py | 25 +- 8 files changed, 195 insertions(+), 191 deletions(-) diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py index 9d3f257b6..2b30a30af 100644 --- a/spatialprofilingtoolbox/cggnn/extract.py +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -7,7 +7,7 @@ from 
spatialprofilingtoolbox.db.feature_matrix_extractor import ( FeatureMatrixExtractor, - Bundle, + StudyBundle, ) @@ -54,7 +54,7 @@ def extract_cggnn_data( ) -> tuple[DataFrame, DataFrame, dict[int, str]]: """Extract information cg-gnn needs from SPT.""" extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location) - study_data = cast(Bundle, extractor.extract(study=study)) + study_data = cast(StudyBundle, extractor.extract(study=study)) df_cell = _create_cell_df( { slide: cast(DataFrame, data['dataframe']) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index 1d30af422..63ea75063 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -1,16 +1,17 @@ -""" -Convenience provision of a feature matrix for each study, the data retrieved from the SPT database. -""" +"""Convenience provision of a feature matrix for each study, retrieved from the SPT database.""" from enum import Enum from enum import auto -from typing import cast +from typing import cast, Any from pandas import DataFrame from psycopg2.extensions import cursor as Psycopg2Cursor from spatialprofilingtoolbox import DatabaseConnectionMaker +from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria +from spatialprofilingtoolbox.db.phenotypes import PhenotypesAccess from spatialprofilingtoolbox.db.stratification_puller import StratificationPuller +from spatialprofilingtoolbox.db.study_access import StudyAccess from spatialprofilingtoolbox.workflow.common.structure_centroids_puller import \ StructureCentroidsPuller from spatialprofilingtoolbox.workflow.common.sparse_matrix_puller import SparseMatrixPuller @@ -18,8 +19,8 @@ logger = colorized_logger(__name__) -BundlePart = dict[str, DataFrame | str | dict[str, DataFrame | str ]] -Bundle = dict[str, dict[str, BundlePart]] +StudyBundle = dict[str, dict[str, Any]] + class DBSource(Enum): """Indicator of intended database source.""" @@ -35,9 +36,9 @@ class FeatureMatrixExtractor: db_source: DBSource def __init__(self, - cursor: Psycopg2Cursor | None=None, - database_config_file: str | None=None, - ): + cursor: Psycopg2Cursor | None = None, + database_config_file: str | None = None, + ) -> None: self.cursor = cast(Psycopg2Cursor, cursor) self.database_config_file = database_config_file if cursor is not None: @@ -57,10 +58,10 @@ def _report_on_arguments(self): logger.warning(message) def extract(self, - specimen: str | None=None, - study: str | None=None, - continuous_also: bool=False, - ) -> Bundle | None: + specimen: str | None = None, + study: str | None = None, + continuous_also: bool = False, + ) -> StudyBundle: extraction = None match self.db_source: case DBSource.CURSOR: @@ -83,10 +84,12 @@ def extract(self, return extraction def _extract(self, - specimen: str | None=None, - study: str | None=None, - continuous_also: bool=False, - ) -> Bundle | None: + specimen: str | None = None, + study: str | None = None, + continuous_also: bool = False, + ) -> StudyBundle: + if (specimen is None) != (study is None): + raise ValueError('Must specify exactly one of specimen or study.') data_arrays = self._retrieve_expressions_from_database( specimen=specimen, study=study, @@ -96,58 +99,83 @@ def _extract(self, specimen=specimen, study=study, ) - stratification = self._retrieve_derivative_stratification_from_database() - study_component_lookup = self._retrieve_study_component_lookup() - merged = 
self._merge_dictionaries( - self._create_feature_matrices(data_arrays, centroid_coordinates), - self._create_channel_information(data_arrays), - stratification, - new_keys=['feature matrices','channel symbols by column name', 'sample cohorts'], - study_component_lookup=study_component_lookup, - ) - if merged is None: - return None - if study is not None: - for key in list(merged.keys()): - if not key == study: - del merged[key] - return merged + if study is None: + assert specimen is not None + study = StudyAccess(self.cursor).get_study_from_specimen(specimen) + channel_information = self._create_channel_information(data_arrays[study]) + phenotypes, phenotype_information = self._retrieve_phenotypes(study, channel_information) + return { + 'feature matrices': self._create_feature_matrices( + data_arrays[study], + centroid_coordinates[study], + phenotypes + ), + 'channel symbols by column name': channel_information, + 'sample cohorts': self._retrieve_derivative_stratification_from_database()[study], + 'phenotypes by column name': phenotype_information + } @staticmethod - def redact_dataframes(extraction): - for study_name, study in extraction.items(): - for specimen in study['feature matrices'].keys(): - extraction[study_name]['feature matrices'][specimen]['dataframe'] = None - key = 'continuous dataframe' - if key in extraction[study_name]['feature matrices'][specimen]: - extraction[study_name]['feature matrices'][specimen][key] = None - extraction[study_name]['sample cohorts']['assignments'] = None - extraction[study_name]['sample cohorts']['strata'] = None + def redact_dataframes(study: dict[str, dict[str, Any]]) -> None: + for specimen in study['feature matrices'].keys(): + study['feature matrices'][specimen]['dataframe'] = None + key = 'continuous dataframe' + if key in study['feature matrices'][specimen]: + study['feature matrices'][specimen][key] = None + study['sample cohorts']['assignments'] = None + study['sample cohorts']['strata'] = None def _retrieve_expressions_from_database(self, - specimen: str | None=None, - study: str | None=None, - continuous_also: bool=False, - ): + specimen: str | None = None, + study: str | None = None, + continuous_also: bool = False, + ) -> dict[str, dict[str, Any]]: logger.info('Retrieving expression data from database.') puller = SparseMatrixPuller(self.cursor) puller.pull(specimen=specimen, study=study, continuous_also=continuous_also) data_arrays = puller.get_data_arrays() logger.info('Done retrieving expression data from database.') - return data_arrays.get_studies() + return list(data_arrays.get_studies().values())[0] def _retrieve_structure_centroids_from_database(self, - specimen: str | None=None, - study: str | None=None, - ): + specimen: str | None = None, + study: str | None = None, + ) -> dict[str, Any]: logger.info('Retrieving polygon centroids from shapefiles in database.') puller = StructureCentroidsPuller(self.cursor) puller.pull(specimen=specimen, study=study) structure_centroids = puller.get_structure_centroids() logger.info('Done retrieving centroids.') - return structure_centroids.get_studies() + return list(structure_centroids.get_studies().values())[0] + + def _retrieve_phenotypes(self, + study_name: str, + channel_names: dict[str, str] + ) -> tuple[dict[str, PhenotypeCriteria], dict[str, str]]: + logger.info('Retrieving phenotypes from database.') + channel_to_column_name = {n: c for c, n in channel_names.items()} + phenotypes: dict[str, PhenotypeCriteria] = {} + phenotype_access = PhenotypesAccess(self.cursor) + 
phenotype_information: dict[str, str] = {} + for i, symbol_data in enumerate(phenotype_access.get_phenotype_symbols(study_name)): + column_name = f'P{i}' + symbol = symbol_data.handle_string + phenotypes[column_name] = phenotype_access.get_phenotype_criteria(study_name, symbol) + phenotype_information[column_name] = symbol - def _retrieve_derivative_stratification_from_database(self): + # Convert marker lists to FeatureMatrixExtractor column names. + phenotypes[column_name].positive_markers = [ + channel_to_column_name[marker] + for marker in phenotypes[column_name].positive_markers + ] + phenotypes[column_name].negative_markers = [ + channel_to_column_name[marker] + for marker in phenotypes[column_name].negative_markers + ] + logger.info('Done retrieving phenotypes.') + return phenotypes, phenotype_information + + def _retrieve_derivative_stratification_from_database(self) -> dict[str, dict[str, Any]]: logger.info('Retrieving stratification from database.') puller = StratificationPuller(self.cursor) puller.pull() @@ -155,104 +183,86 @@ def _retrieve_derivative_stratification_from_database(self): logger.info('Done retrieving stratification.') return stratification - def _retrieve_study_component_lookup(self): + def _retrieve_study_component_lookup(self) -> dict[str, str]: self.cursor.execute('SELECT * FROM study_component ; ') rows = self.cursor.fetchall() - lookup = {} + lookup: dict[str, str] = {} for row in rows: lookup[row[1]] = row[0] return lookup - def _create_feature_matrices(self, data_arrays, centroid_coordinates): + def _create_feature_matrices(self, + study: dict[str, dict[str, Any]], + centroid_coordinates: dict[str, Any], + phenotypes: dict[str, PhenotypeCriteria], + ) -> dict[str, DataFrame]: logger.info('Creating feature matrices from binary data arrays and centroids.') matrices = {} - for k, study_name in enumerate(sorted(list(data_arrays.keys()))): - study = data_arrays[study_name] - matrices[study_name] = {} - for j, specimen in enumerate(sorted(list(study['data arrays by specimen'].keys()))): + for j, specimen in enumerate(sorted(list(study['data arrays by specimen'].keys()))): + logger.debug('Specimen %s .', specimen) + expressions = study['data arrays by specimen'][specimen] + number_channels = len(study['target index lookup']) + rows = [ + self._create_feature_matrix_row( + centroid_coordinates[specimen][i], + expressions[i], + number_channels, + ) for i in range(len(expressions)) + ] + dataframe = DataFrame( + rows, + columns=['pixel x', 'pixel y'] + [f'F{i}' for i in range(number_channels)], + ) + for column_name, criteria in phenotypes.items(): + dataframe[column_name] = (dataframe[criteria.positive_markers].all(axis=1) & \ + dataframe[criteria.negative_markers].all(axis=1)).astype(int) + matrices[specimen] = { + 'dataframe': dataframe, + 'filename': f'{j}.tsv', + } + + if 'continuous data arrays by specimen' in study: + specimens = list(study['continuous data arrays by specimen'].keys()) + for j, specimen in enumerate(sorted(specimens)): logger.debug('Specimen %s .', specimen) - expressions = study['data arrays by specimen'][specimen] + expression_vectors = study['continuous data arrays by specimen'][specimen] number_channels = len(study['target index lookup']) - rows = [ - self._create_feature_matrix_row( - centroid_coordinates[study_name][specimen][i], - expressions[i], - number_channels, - ) - for i in range(len(expressions)) - ] dataframe = DataFrame( - rows, - columns=['pixel x', 'pixel y'] + [f'F{i}' for i in range(number_channels)], + expression_vectors, + 
columns=[f'F{i}' for i in range(number_channels)], ) - matrices[study_name][specimen] = { - 'dataframe': dataframe, - 'filename': f'{k}.{j}.tsv', - } - - if 'continuous data arrays by specimen' in study: - specimens = list(study['continuous data arrays by specimen'].keys()) - for j, specimen in enumerate(sorted(specimens)): - logger.debug('Specimen %s .', specimen) - expression_vectors = study['continuous data arrays by specimen'][specimen] - number_channels = len(study['target index lookup']) - dataframe = DataFrame( - expression_vectors, - columns=[f'F{i}' for i in range(number_channels)], - ) - matrices[study_name][specimen]['continuous dataframe'] = dataframe + for column_name, criteria in phenotypes.items(): + dataframe[column_name] = (dataframe[criteria.positive_markers].all(axis=1) & \ + dataframe[criteria.negative_markers].all(axis=1)).astype(int) + matrices[specimen]['continuous dataframe'] = dataframe logger.info('Done creating feature matrices.') return matrices @staticmethod - def _create_feature_matrix_row(centroid, binary, number_channels): + def _create_feature_matrix_row( + centroid: tuple[float, float], + binary: list[str], + number_channels: int, + ) -> list[float | int]: template = '{0:0%sb}' % number_channels # pylint: disable=consider-using-f-string - feature_vector = [int(value) for value in list(template.format(binary)[::-1])] + feature_vector: list[int] = [int(value) for value in list(template.format(binary)[::-1])] return [centroid[0], centroid[1]] + feature_vector - def _create_channel_information(self, data_arrays): - return { - study_name: self._create_channel_information_for_study(study) - for study_name, study in data_arrays.items() - } - - def _create_channel_information_for_study(self, study): + def _create_channel_information(self, + study_information: dict[str, dict[str, Any]] + ) -> dict[str, str]: logger.info('Aggregating channel information for one study.') targets = { int(index): target - for target, index in study['target index lookup'].items() + for target, index in study_information['target index lookup'].items() } symbols = { target: symbol - for symbol, target in study['target by symbol'].items() + for symbol, target in study_information['target by symbol'].items() } logger.info('Done aggregating channel information.') return { f'F{i}': symbols[targets[i]] for i in sorted([int(index) for index in targets.keys()]) } - - def _merge_dictionaries(self, - *args, - new_keys: list, - study_component_lookup: dict - ) -> Bundle | None: - if not len(args) == len(new_keys): - logger.error( - "Can not match up dictionaries to be merged with the list of key names to be " - "issued for them." 
- ) - return None - - merged: dict = {} - for i in range(len(new_keys)): - for substudy, value in args[i].items(): - merged[study_component_lookup[substudy]] = {} - - for i, key in enumerate(new_keys): - for substudy, value in args[i].items(): - merged[study_component_lookup[substudy]][key] = value - - logger.info('Done merging into a single dictionary bundle.') - return merged diff --git a/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py b/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py index 467f6544b..fd8129758 100644 --- a/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py +++ b/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py @@ -1,4 +1,5 @@ """Convenience CLI wrapper of FeatureMatrixExtractor functionality, writes to files.""" + import argparse import json from os.path import exists @@ -12,16 +13,14 @@ SuggestExtrasException try: from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor - from spatialprofilingtoolbox.db.feature_matrix_extractor import Bundle + from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle except ModuleNotFoundError as e: SuggestExtrasException(e, 'db') from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import Bundle +from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.workflow.common.cli_arguments import add_argument -from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger -logger = colorized_logger('spt db create-schema') def retrieve(args: argparse.Namespace): database_config_file = None @@ -31,16 +30,15 @@ def retrieve(args: argparse.Namespace): message = f'Need to supply valid database config filename: {database_config_file}' raise FileNotFoundError(message) extractor = FeatureMatrixExtractor(database_config_file=database_config_file) - bundle = cast(Bundle, extractor.extract()) - for _, study in bundle.items(): - feature_matrices = cast(dict[str, dict[str, DataFrame | str ]], study['feature matrices']) - for _, specimen_data in feature_matrices.items(): - df = cast(DataFrame, specimen_data['dataframe']) - filename = cast(str, specimen_data['filename']) - df.to_csv(filename, sep='\t', index=False) - outcomes = cast(DataFrame, study['sample cohorts']['assignments']) - filename = 'assignments.tsv' - outcomes.to_csv(filename, sep='\t', index=False) + bundle = cast(StudyBundle, extractor.extract(args.study_name)) + feature_matrices = cast(dict[str, dict[str, DataFrame | str]], bundle['feature matrices']) + for _, specimen_data in feature_matrices.items(): + df = cast(DataFrame, specimen_data['dataframe']) + filename = cast(str, specimen_data['filename']) + df.to_csv(filename, sep='\t', index=False) + outcomes = cast(DataFrame, bundle['sample cohorts']['assignments']) + filename = 'assignments.tsv' + outcomes.to_csv(filename, sep='\t', index=False) FeatureMatrixExtractor.redact_dataframes(bundle) with open('features.json', 'wt', encoding='utf-8') as file: file.write(json.dumps(bundle, indent=2)) @@ -50,12 +48,14 @@ def retrieve(args: argparse.Namespace): parser = argparse.ArgumentParser( prog='spt db retrieve-feature-matrices', description=''' -Retrieve feature matrices for each sample of each study, an outcomes dataframe -for each study, and a column/channel names lookup for each study. -Retrieves from any database that conforms to "single cell ADI" database schema. 
-Writes TSV files to the current working directory, with filenames listed alongside -specimen and channel name information in: features.json -''' +Retrieve feature matrices for each sample of a study and corresponding outcomes +dataframe and column/channel names lookup from any database that conforms to +"single cell ADI" database schema and writes them as TSV files to the current +working directory, with filenames listed alongside specimen and channel name +information in: features.json +''', + ) add_argument(parser, 'database config') + add_argument(parser, 'study name') retrieve(parser.parse_args()) diff --git a/spatialprofilingtoolbox/db/squidpy_metrics.py b/spatialprofilingtoolbox/db/squidpy_metrics.py index aead67a07..d2b0d6c26 100644 --- a/spatialprofilingtoolbox/db/squidpy_metrics.py +++ b/spatialprofilingtoolbox/db/squidpy_metrics.py @@ -7,7 +7,7 @@ from spatialprofilingtoolbox import DatabaseConnectionMaker from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import Bundle +from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria from spatialprofilingtoolbox.db.create_data_analysis_study import DataAnalysisStudyFactory from spatialprofilingtoolbox.workflow.common.squidpy import ( @@ -66,7 +66,7 @@ def _fetch_cells_and_phenotypes( study: str, ) -> tuple[dict[str, DataFrame], dict[str, str]]: extractor = FeatureMatrixExtractor(cursor) - bundle = cast(Bundle, extractor.extract(study=study)) + bundle = cast(StudyBundle, extractor.extract(study=study)) FeatureMatrices = dict[str, dict[str, DataFrame | str]] feature_matrices = cast(FeatureMatrices, bundle[study]['feature matrices']) features_by_specimen = { diff --git a/spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py b/spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py index 20a9133fa..52e1f6956 100644 --- a/spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py +++ b/spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py @@ -2,7 +2,7 @@ (in-memory) binary compressed format. """ -from typing import cast +from typing import cast, Any from psycopg2.extensions import cursor as Psycopg2Cursor @@ -36,9 +36,9 @@ class CompressedDataArrays: """ def __init__(self): - self.studies = {} + self.studies: dict[str, dict[str, dict[str, Any]]] = {} - def get_studies(self): + def get_studies(self) -> dict[str, dict[str, dict[str, Any]]]: return self.studies def add_study_data( diff --git a/spatialprofilingtoolbox/workflow/common/structure_centroids.py b/spatialprofilingtoolbox/workflow/common/structure_centroids.py index 37a934459..d9d2adc1d 100644 --- a/spatialprofilingtoolbox/workflow/common/structure_centroids.py +++ b/spatialprofilingtoolbox/workflow/common/structure_centroids.py @@ -1,4 +1,6 @@ """An object for in-memory storage of summarized-location data for all cells of each study.""" + +from typing import Any from pickle import dump from pickle import load from os.path import join @@ -8,33 +10,36 @@ class StructureCentroids: - """An object for in-memory storage of summarized-location data for all cells of - each study. - - Member `studies` is a dictionary with keys the study names. 
The values are - dictionaries, providing for each specimen name (for specimens collected as - part of the given study) the list of pairs of pixel coordinate values - representing the centroid of the shape specification for a given cell. The - order is ascending lexicographical order of the corresponding "histological + """An object for in-memory storage of summarized-location data for all cells of each study. + + Member `studies` is a dictionary with keys the study names. The values are dictionaries, + providing for each specimen name (for specimens collected as part of the given study) the list + of pairs of pixel coordinate values representing the centroid of the shape specification for a + given cell. The order is ascending lexicographical order of the corresponding "histological structure" identifier strings. """ + def __init__(self): - self.studies = {} + self.studies: dict[str, dict[str, Any]] = {} - def get_studies(self): + def get_studies(self) -> dict[str, dict[str, Any]]: return self.studies - def add_study_data(self, study_name, structure_centroids_by_specimen): + def add_study_data( + self, + study_name: str, + structure_centroids_by_specimen: dict[str, Any] + ) -> None: self.studies[study_name] = structure_centroids_by_specimen - def write_to_file(self, data_directory): + def write_to_file(self, data_directory: str) -> None: with open(join(data_directory, CENTROIDS_FILENAME), 'wb') as file: dump(self.get_studies(), file) - def load_from_file(self, data_directory): + def load_from_file(self, data_directory: str) -> None: with open(join(data_directory, CENTROIDS_FILENAME), 'rb') as file: self.studies = load(file) @staticmethod - def already_exists(data_directory): + def already_exists(data_directory: str) -> bool: return isfile(join(data_directory, CENTROIDS_FILENAME)) diff --git a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py index 5a42f5af1..cc8be1cdc 100644 --- a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py +++ b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py @@ -9,7 +9,7 @@ from spatialprofilingtoolbox.workflow.component_interfaces.core import CoreJob from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import Bundle +from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.workflow.common.logging.performance_timer import \ PerformanceTimerReporter from spatialprofilingtoolbox import DatabaseConnectionMaker @@ -33,12 +33,12 @@ class PhenotypeProximityCoreJob(CoreJob): tree: BallTree def __init__(self, - study_name: str='', - database_config_file: str='', - performance_report_file: str='', - results_file: str='', + study_name: str = '', + database_config_file: str = '', + performance_report_file: str = '', + results_file: str = '', **kwargs # pylint: disable=unused-argument - ): + ) -> None: self.study_name = study_name self.database_config_file = database_config_file self.results_file = results_file @@ -65,7 +65,7 @@ def log_job_info(self): def calculate_proximity(self): self.reporter.record_timepoint('Start pulling data for one sample.') extractor = FeatureMatrixExtractor(database_config_file=self.database_config_file) - bundle = cast(Bundle, extractor.extract(specimen=self.sample_identifier)) + bundle = cast(StudyBundle, extractor.extract(specimen=self.sample_identifier)) self.reporter.record_timepoint('Finished pulling data for 
one sample.') study_name = list(bundle.keys())[0] identifier = list(bundle[study_name]['feature matrices'].keys())[0] @@ -91,7 +91,7 @@ def calculate_proximity(self): ] all_signatures = singleton_signatures + signatures cases = self.get_cases(all_signatures) - proximity_metrics=[ + proximity_metrics = [ compute_proximity_metric_for_signature_pair(s1, s2, r, cells, self.tree) for s1, s2, r in cases ] @@ -116,7 +116,7 @@ def get_named_phenotype_signatures(self) -> tuple[list[str], list[PhenotypeCrite ''', (self.study_name,)) rows = cursor.fetchall() cursor.close() - lookup = {value : key for key, value in self.channel_symbols_by_column_name.items()} + lookup = {value: key for key, value in self.channel_symbols_by_column_name.items()} criteria = DataFrame(rows, columns=['phenotype', 'name', 'channel', 'polarity']) criteria = criteria[['phenotype', 'channel', 'polarity']] @@ -125,8 +125,8 @@ def list_channels(df: DataFrame, polarity: int) -> list[str]: def make_signature(df) -> PhenotypeCriteria: return PhenotypeCriteria( - positive_markers = list_channels(df, 1), - negative_markers = list_channels(df, 0), + positive_markers=list_channels(df, 1), + negative_markers=list_channels(df, 0), ) by_identifier: dict[str, PhenotypeCriteria] = {} for key, criteria in criteria.groupby(['phenotype']): diff --git a/test/workflow/unit_tests/test_feature_matrix_extraction.py b/test/workflow/unit_tests/test_feature_matrix_extraction.py index adea04f6b..663367bca 100644 --- a/test/workflow/unit_tests/test_feature_matrix_extraction.py +++ b/test/workflow/unit_tests/test_feature_matrix_extraction.py @@ -6,14 +6,6 @@ from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -def get_study(bundle): - study_name = 'Melanoma intralesional IL2' - if not study_name in bundle.keys(): - print(f'Missing study: {study_name}') - sys.exit(1) - return bundle[study_name] - - def test_sample_set(study): if study['feature matrices'].keys() != set(['lesion 0_1', 'lesion 6_1']): print(f'Wrong sample set: {list(study["feature matrices"].keys())}') @@ -138,8 +130,7 @@ def test_stratification(study): if __name__ == '__main__': extractor = FeatureMatrixExtractor(database_config_file='../db/.spt_db.config.container') - matrix_bundle = extractor.extract() - test_study = get_study(matrix_bundle) + test_study = extractor.extract(study='Melanoma intralesional IL2') test_sample_set(test_study) test_feature_matrix_schemas(test_study) show_example_feature_matrix(test_study) @@ -147,25 +138,23 @@ def test_stratification(study): test_expression_vectors(test_study) test_stratification(test_study) - one_sample_bundle = extractor.extract(specimen='lesion 6_1') - one_sample_study = get_study(one_sample_bundle) + one_sample_study = extractor.extract(specimen='lesion 6_1') test_one_sample_set(one_sample_study) test_feature_matrix_schemas(one_sample_study) test_channels(one_sample_study) test_expression_vectors(one_sample_study) - one_sample_bundle_continuous = extractor.extract(specimen='lesion 6_1', continuous_also=True) - one_sample_study_continuous = get_study(one_sample_bundle_continuous) + one_sample_study_continuous = extractor.extract(specimen='lesion 6_1', continuous_also=True) test_one_sample_set(one_sample_study_continuous) test_feature_matrix_schemas(one_sample_study_continuous) test_channels(one_sample_study_continuous) test_expression_vectors_continuous(one_sample_study_continuous) - FeatureMatrixExtractor.redact_dataframes(matrix_bundle) + FeatureMatrixExtractor.redact_dataframes(test_study) 
print('\nMetadata "bundle" with dataframes removed:') - print(json.dumps(matrix_bundle, indent=2)) + print(json.dumps(test_study, indent=2)) print('\n... and in the one-sample case:') - FeatureMatrixExtractor.redact_dataframes(one_sample_bundle) + FeatureMatrixExtractor.redact_dataframes(one_sample_study) print('\nMetadata "bundle" with dataframes removed:') - print(json.dumps(one_sample_bundle, indent=2)) + print(json.dumps(one_sample_study, indent=2)) From 0c2c6588e6dc7185350953799af62d5f015efeb5 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 21 Aug 2023 14:37:24 -0400 Subject: [PATCH 09/31] logic hotfix --- spatialprofilingtoolbox/db/feature_matrix_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index 63ea75063..fe6186d95 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -88,7 +88,7 @@ def _extract(self, study: str | None = None, continuous_also: bool = False, ) -> StudyBundle: - if (specimen is None) != (study is None): + if (specimen is None) == (study is None): raise ValueError('Must specify exactly one of specimen or study.') data_arrays = self._retrieve_expressions_from_database( specimen=specimen, From 3269864325a58396e618ce13789142b779f536d8 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 21 Aug 2023 16:34:37 -0400 Subject: [PATCH 10/31] fix study-substudy references --- .../db/feature_matrix_extractor.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index fe6186d95..fd00ae838 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -102,16 +102,23 @@ def _extract(self, if study is None: assert specimen is not None study = StudyAccess(self.cursor).get_study_from_specimen(specimen) - channel_information = self._create_channel_information(data_arrays[study]) + stratification = self._retrieve_derivative_stratification_from_database() + for substudy in self._retrieve_component_studies(study): + if substudy in stratification: + break + else: + raise RuntimeError('Stratification substudy not found for study.') + + channel_information = self._create_channel_information(data_arrays) phenotypes, phenotype_information = self._retrieve_phenotypes(study, channel_information) return { 'feature matrices': self._create_feature_matrices( - data_arrays[study], - centroid_coordinates[study], + data_arrays, + centroid_coordinates, phenotypes ), 'channel symbols by column name': channel_information, - 'sample cohorts': self._retrieve_derivative_stratification_from_database()[study], + 'sample cohorts': stratification[substudy], 'phenotypes by column name': phenotype_information } @@ -183,12 +190,16 @@ def _retrieve_derivative_stratification_from_database(self) -> dict[str, dict[st logger.info('Done retrieving stratification.') return stratification - def _retrieve_study_component_lookup(self) -> dict[str, str]: - self.cursor.execute('SELECT * FROM study_component ; ') + def _retrieve_component_studies(self, study: str) -> set[str]: + self.cursor.execute(f''' + SELECT component_study + FROM study_component + WHERE primary_study = '{study}'; + ''') rows = self.cursor.fetchall() - lookup: dict[str, str] = {} + lookup: set[str] = set() for row in 
rows: - lookup[row[1]] = row[0] + lookup.add(row[0]) return lookup def _create_feature_matrices(self, From aa012c7ffea62069d90649ac17322c1fca879b1d Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 21 Aug 2023 16:59:15 -0400 Subject: [PATCH 11/31] fix test to account for phenotype columns --- test/workflow/unit_tests/test_feature_matrix_extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/workflow/unit_tests/test_feature_matrix_extraction.py b/test/workflow/unit_tests/test_feature_matrix_extraction.py index 663367bca..b99f17cbe 100644 --- a/test/workflow/unit_tests/test_feature_matrix_extraction.py +++ b/test/workflow/unit_tests/test_feature_matrix_extraction.py @@ -25,8 +25,8 @@ def test_feature_matrix_schemas(study): print(f'Missing some columns in dataframe (case "{specimen}"): ') print(df.to_string(index=False)) sys.exit(1) - if df.shape != (100, 28): - print(f'Wrong number of rows or columns: {df.shape}') + if df.shape != (100, 32): + print(f'Wrong number of rows or columns: {df.shape} != (100, 32)') sys.exit(1) From 30e8ef1151347291d05e99ce6ba43337306304f6 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 21 Aug 2023 17:54:56 -0400 Subject: [PATCH 12/31] make phenotype dict key consistent --- spatialprofilingtoolbox/db/feature_matrix_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index fd00ae838..c5d6ea150 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -119,7 +119,7 @@ def _extract(self, ), 'channel symbols by column name': channel_information, 'sample cohorts': stratification[substudy], - 'phenotypes by column name': phenotype_information + 'phenotype symbols by column name': phenotype_information } @staticmethod From f5123d6086c346b227c0b145aebdaa871b610fd2 Mon Sep 17 00:00:00 2001 From: James Mathews Date: Tue, 22 Aug 2023 19:54:31 -0400 Subject: [PATCH 13/31] Update usage of extractor to omit study reference. 
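
The change in calling convention, sketched with the test database config and
study name (illustrative only, not a definitive example):

    from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor

    extractor = FeatureMatrixExtractor(database_config_file='../db/.spt_db.config.container')
    bundle = extractor.extract(study='Melanoma intralesional IL2')

    # Previously the bundle was keyed by study name first:
    #     matrices = bundle['Melanoma intralesional IL2']['feature matrices']
    # extract() now returns a single study's bundle, so the study key is omitted:
    matrices = bundle['feature matrices']
    for specimen, packet in matrices.items():
        dataframe = packet['dataframe']  # one row per cell; channel columns are named F0, F1, ...
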
--- spatialprofilingtoolbox/db/squidpy_metrics.py | 4 ++-- .../workflow/phenotype_proximity/core.py | 7 +++---- test/workflow/unit_tests/test_feature_matrix_extraction.py | 4 ++++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/spatialprofilingtoolbox/db/squidpy_metrics.py b/spatialprofilingtoolbox/db/squidpy_metrics.py index d2b0d6c26..1e55b55ea 100644 --- a/spatialprofilingtoolbox/db/squidpy_metrics.py +++ b/spatialprofilingtoolbox/db/squidpy_metrics.py @@ -68,13 +68,13 @@ def _fetch_cells_and_phenotypes( extractor = FeatureMatrixExtractor(cursor) bundle = cast(StudyBundle, extractor.extract(study=study)) FeatureMatrices = dict[str, dict[str, DataFrame | str]] - feature_matrices = cast(FeatureMatrices, bundle[study]['feature matrices']) + feature_matrices = cast(FeatureMatrices, bundle['feature matrices']) features_by_specimen = { specimen: cast(DataFrame, packet['dataframe']) for specimen, packet in feature_matrices.items() } channel_symbols_by_columns_name = cast( dict[str, str], - bundle[study]['channel symbols by column name'], + bundle['channel symbols by column name'], ) return features_by_specimen, channel_symbols_by_columns_name diff --git a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py index cc8be1cdc..4b7760e80 100644 --- a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py +++ b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py @@ -67,10 +67,9 @@ def calculate_proximity(self): extractor = FeatureMatrixExtractor(database_config_file=self.database_config_file) bundle = cast(StudyBundle, extractor.extract(specimen=self.sample_identifier)) self.reporter.record_timepoint('Finished pulling data for one sample.') - study_name = list(bundle.keys())[0] - identifier = list(bundle[study_name]['feature matrices'].keys())[0] + identifier = list(bundle['feature matrices'].keys())[0] Sample = dict[str, DataFrame | str] - sample = cast(Sample, bundle[study_name]['feature matrices'][identifier]) + sample = cast(Sample, bundle['feature matrices'][identifier]) cells = cast(DataFrame, sample['dataframe']) logger.info('Dataframe pulled: %s', cells.head()) @@ -78,7 +77,7 @@ def calculate_proximity(self): self.channel_symbols_by_column_name = cast( dict[str, str], - bundle[study_name]['channel symbols by column name'], + bundle['channel symbols by column name'], ) phenotype_identifiers, signatures = self.get_named_phenotype_signatures() logger.info('Named phenotypes:') diff --git a/test/workflow/unit_tests/test_feature_matrix_extraction.py b/test/workflow/unit_tests/test_feature_matrix_extraction.py index b99f17cbe..c6059de1b 100644 --- a/test/workflow/unit_tests/test_feature_matrix_extraction.py +++ b/test/workflow/unit_tests/test_feature_matrix_extraction.py @@ -54,6 +54,10 @@ def create_column_name(channels, channel_num): for specimen in study['feature matrices'].keys(): df = study['feature matrices'][specimen]['dataframe'] + + print('Dataframe: ' + str(specimen)) + print(df) + expression_vectors = sorted([ tuple(row[f'F{i}'] for i in range(26)) for _, row in df.iterrows() From 91be9ba057c49110e1c4f89a9139bb589de117e3 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Tue, 22 Aug 2023 19:57:50 -0400 Subject: [PATCH 14/31] remove phenotypes from continuous dataframes --- spatialprofilingtoolbox/db/feature_matrix_extractor.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py 
b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index c5d6ea150..0756142e0 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -225,8 +225,10 @@ def _create_feature_matrices(self, columns=['pixel x', 'pixel y'] + [f'F{i}' for i in range(number_channels)], ) for column_name, criteria in phenotypes.items(): - dataframe[column_name] = (dataframe[criteria.positive_markers].all(axis=1) & \ - dataframe[criteria.negative_markers].all(axis=1)).astype(int) + dataframe[column_name] = ( + dataframe[criteria.positive_markers].all(axis=1) & + dataframe[criteria.negative_markers].all(axis=1) + ).astype(int) matrices[specimen] = { 'dataframe': dataframe, 'filename': f'{j}.tsv', @@ -242,9 +244,6 @@ def _create_feature_matrices(self, expression_vectors, columns=[f'F{i}' for i in range(number_channels)], ) - for column_name, criteria in phenotypes.items(): - dataframe[column_name] = (dataframe[criteria.positive_markers].all(axis=1) & \ - dataframe[criteria.negative_markers].all(axis=1)).astype(int) matrices[specimen]['continuous dataframe'] = dataframe logger.info('Done creating feature matrices.') From 70008eb409184b5a45a39ddac793b1935ebce6d5 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Thu, 24 Aug 2023 11:55:15 -0400 Subject: [PATCH 15/31] fix pheno neg expression match --- spatialprofilingtoolbox/db/feature_matrix_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index 0756142e0..bca098676 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -227,7 +227,7 @@ def _create_feature_matrices(self, for column_name, criteria in phenotypes.items(): dataframe[column_name] = ( dataframe[criteria.positive_markers].all(axis=1) & - dataframe[criteria.negative_markers].all(axis=1) + ~dataframe[criteria.negative_markers].any(axis=1) ).astype(int) matrices[specimen] = { 'dataframe': dataframe, From 7f14832cb7c47e226ffbf95b570959d44d0615b9 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Thu, 24 Aug 2023 14:53:25 -0400 Subject: [PATCH 16/31] split extract stratification, symbols in col names --- spatialprofilingtoolbox/cggnn/extract.py | 44 +--- .../db/feature_matrix_extractor.py | 219 +++++++++--------- .../db/scripts/retrieve_feature_matrices.py | 18 +- spatialprofilingtoolbox/db/squidpy_metrics.py | 20 +- .../workflow/phenotype_proximity/core.py | 21 +- .../test_feature_matrix_extraction.py | 82 +++---- 6 files changed, 180 insertions(+), 224 deletions(-) diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py index 2b30a30af..9ee553da4 100644 --- a/spatialprofilingtoolbox/cggnn/extract.py +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -1,36 +1,23 @@ """Extract information cg-gnn needs from SPT.""" -from typing import cast - from pandas import DataFrame, concat, merge # type: ignore from numpy import sort # type: ignore -from spatialprofilingtoolbox.db.feature_matrix_extractor import ( - FeatureMatrixExtractor, - StudyBundle, -) +from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -def _create_cell_df( - dfs_by_specimen: dict[str, DataFrame], - feature_names: dict[str, str], -) -> DataFrame: +def _create_cell_df(dfs_by_specimen: dict[str, DataFrame]) -> DataFrame: """Find simple and complex phenotypes, and locations and merge into a 
DataFrame.""" - feature_ids_to_names = {ft_id: 'FT_' + ft_name for ft_id, ft_name in feature_names.items()} for specimen, df_specimen in dfs_by_specimen.items(): - df_specimen.rename(feature_ids_to_names, axis=1, inplace=True) - # TODO: Create phenotype columns - df_specimen.rename( - {'pixel x': 'center_x', 'pixel y': 'center_y'}, - axis=1, - inplace=True, - ) df_specimen['specimen'] = specimen - # TODO: Reorder so that it's simple phenotype, specimen, complex phenotype, x, y df = concat(dfs_by_specimen.values(), axis=0) df.index.name = 'histological_structure' - return df + # Reorder columns so it's specimen, xy, channels, and phenotypes + column_order = ['specimen', 'pixel x', 'pixel y'] + column_order.extend(df.columns[df.columns.str.startswith('C ')]) + column_order.extend(df.columns[df.columns.str.startswith('P ')]) + return df[column_order] def _create_label_df( @@ -54,16 +41,9 @@ def extract_cggnn_data( ) -> tuple[DataFrame, DataFrame, dict[int, str]]: """Extract information cg-gnn needs from SPT.""" extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location) - study_data = cast(StudyBundle, extractor.extract(study=study)) - df_cell = _create_cell_df( - { - slide: cast(DataFrame, data['dataframe']) - for slide, data in study_data['feature matrices'].items() - }, - cast(dict[str, str], study_data['channel symbols by column name']), - ) - df_label, label_to_result_text = _create_label_df( - cast(DataFrame, study_data['sample cohorts']['assignments']), - cast(DataFrame, study_data['sample cohorts']['strata']), - ) + df_cell = _create_cell_df({ + slide: data.dataframe for slide, data in extractor.extract(study=study).items() + }) + cohorts = extractor.extract_cohorts(study) + df_label, label_to_result_text = _create_label_df(cohorts['assignments'], cohorts['strata']) return df_cell, df_label, label_to_result_text diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index bca098676..c88217975 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -3,6 +3,7 @@ from enum import Enum from enum import auto from typing import cast, Any +from dataclasses import dataclass from pandas import DataFrame from psycopg2.extensions import cursor as Psycopg2Cursor @@ -10,7 +11,10 @@ from spatialprofilingtoolbox import DatabaseConnectionMaker from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria from spatialprofilingtoolbox.db.phenotypes import PhenotypesAccess -from spatialprofilingtoolbox.db.stratification_puller import StratificationPuller +from spatialprofilingtoolbox.db.stratification_puller import ( + StratificationPuller, + Stratification, +) from spatialprofilingtoolbox.db.study_access import StudyAccess from spatialprofilingtoolbox.workflow.common.structure_centroids_puller import \ StructureCentroidsPuller @@ -19,10 +23,14 @@ logger = colorized_logger(__name__) -StudyBundle = dict[str, dict[str, Any]] +@dataclass +class MatrixBundle: + """Bundle of information for a specimen matrix.""" + dataframe: DataFrame + filename: str + continuous_dataframe: DataFrame | None = None - -class DBSource(Enum): +class _DBSource(Enum): """Indicator of intended database source.""" CURSOR = auto() CONFIG_FILE = auto() @@ -33,7 +41,7 @@ class FeatureMatrixExtractor: """Pull from the database and create convenience bundle of feature matrices and metadata.""" cursor: Psycopg2Cursor database_config_file: str | None - 
db_source: DBSource
+    db_source: _DBSource
 
     def __init__(self,
                  cursor: Psycopg2Cursor | None = None,
@@ -42,11 +50,11 @@ def __init__(self,
         self.cursor = cast(Psycopg2Cursor, cursor)
         self.database_config_file = database_config_file
         if cursor is not None:
-            self.db_source = DBSource.CURSOR
+            self.db_source = _DBSource.CURSOR
         elif database_config_file is not None:
-            self.db_source = DBSource.CONFIG_FILE
+            self.db_source = _DBSource.CONFIG_FILE
         else:
-            self.db_source = DBSource.UNKNOWN
+            self.db_source = _DBSource.UNKNOWN
         self._report_on_arguments()
 
     def _report_on_arguments(self):
@@ -61,16 +69,46 @@ def extract(self,
                 specimen: str | None = None,
                 study: str | None = None,
                 continuous_also: bool = False,
-                ) -> StudyBundle:
-        extraction = None
+                ) -> dict[str, MatrixBundle]:
+        """Extract feature matrices for a specimen or every specimen in a study.
+
+        Parameters
+        ----------
+        specimen: str | None = None
+        study: str | None = None
+            Which specimen to extract features for, or which study to extract features for
+            (for all of its specimens). Exactly one of specimen or study must be provided.
+        continuous_also: bool = False
+            Whether to also calculate and return a DataFrame for each specimen with continuous
+            channel information in addition to the default DataFrame which provides binary-cast
+            channel information.
+
+        Returns
+        -------
+        dict[str, MatrixBundle]
+            A dictionary mapping specimen names to MatrixBundle dataclass instances, which contain:
+                1. `dataframe`, a DataFrame with the feature matrix for the specimen, including
+                    centroid location, channel information, and phenotype information.
+                2. `filename`, a filename for the DataFrame.
+                3. `continuous_dataframe`, a DataFrame with continuous channel information if
+                    continuous_also is true, otherwise this property is None.
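+
+        A minimal usage sketch (the config path and study name are illustrative
+        placeholders)::
+
+            extractor = FeatureMatrixExtractor(database_config_file='.spt_db.config')
+            bundles = extractor.extract(study='Example study')
+            for specimen, bundle in bundles.items():
+                print(specimen, bundle.dataframe.shape)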
+ """ match self.db_source: - case DBSource.CURSOR: + case _DBSource.CURSOR: extraction = self._extract( specimen=specimen, study=study, continuous_also=continuous_also, ) - case DBSource.CONFIG_FILE: + case _DBSource.CONFIG_FILE: with DatabaseConnectionMaker(self.database_config_file) as dcm: with dcm.get_connection().cursor() as cursor: self.cursor = cursor @@ -79,15 +109,15 @@ def extract(self, study=study, continuous_also=continuous_also, ) - case DBSource.UNKNOWN: - logger.error('The database source can not be determined.') + case _DBSource.UNKNOWN: + raise RuntimeError('The database source can not be determined.') return extraction def _extract(self, specimen: str | None = None, study: str | None = None, continuous_also: bool = False, - ) -> StudyBundle: + ) -> dict[str, MatrixBundle]: if (specimen is None) == (study is None): raise ValueError('Must specify exactly one of specimen or study.') data_arrays = self._retrieve_expressions_from_database( @@ -102,35 +132,13 @@ def _extract(self, if study is None: assert specimen is not None study = StudyAccess(self.cursor).get_study_from_specimen(specimen) - stratification = self._retrieve_derivative_stratification_from_database() - for substudy in self._retrieve_component_studies(study): - if substudy in stratification: - break - else: - raise RuntimeError('Stratification substudy not found for study.') - channel_information = self._create_channel_information(data_arrays) - phenotypes, phenotype_information = self._retrieve_phenotypes(study, channel_information) - return { - 'feature matrices': self._create_feature_matrices( - data_arrays, - centroid_coordinates, - phenotypes - ), - 'channel symbols by column name': channel_information, - 'sample cohorts': stratification[substudy], - 'phenotype symbols by column name': phenotype_information - } - - @staticmethod - def redact_dataframes(study: dict[str, dict[str, Any]]) -> None: - for specimen in study['feature matrices'].keys(): - study['feature matrices'][specimen]['dataframe'] = None - key = 'continuous dataframe' - if key in study['feature matrices'][specimen]: - study['feature matrices'][specimen][key] = None - study['sample cohorts']['assignments'] = None - study['sample cohorts']['strata'] = None + return self._create_feature_matrices( + data_arrays, + centroid_coordinates, + self._retrieve_phenotypes(study), + self._create_channel_information(data_arrays), + ) def _retrieve_expressions_from_database(self, specimen: str | None = None, @@ -155,96 +163,55 @@ def _retrieve_structure_centroids_from_database(self, logger.info('Done retrieving centroids.') return list(structure_centroids.get_studies().values())[0] - def _retrieve_phenotypes(self, - study_name: str, - channel_names: dict[str, str] - ) -> tuple[dict[str, PhenotypeCriteria], dict[str, str]]: + def _retrieve_phenotypes(self, study_name: str) -> dict[str, PhenotypeCriteria]: logger.info('Retrieving phenotypes from database.') - channel_to_column_name = {n: c for c, n in channel_names.items()} phenotypes: dict[str, PhenotypeCriteria] = {} phenotype_access = PhenotypesAccess(self.cursor) - phenotype_information: dict[str, str] = {} - for i, symbol_data in enumerate(phenotype_access.get_phenotype_symbols(study_name)): - column_name = f'P{i}' + for symbol_data in phenotype_access.get_phenotype_symbols(study_name): symbol = symbol_data.handle_string - phenotypes[column_name] = phenotype_access.get_phenotype_criteria(study_name, symbol) - phenotype_information[column_name] = symbol - - # Convert marker lists to FeatureMatrixExtractor 
column names. - phenotypes[column_name].positive_markers = [ - channel_to_column_name[marker] - for marker in phenotypes[column_name].positive_markers - ] - phenotypes[column_name].negative_markers = [ - channel_to_column_name[marker] - for marker in phenotypes[column_name].negative_markers - ] + phenotypes[symbol] = phenotype_access.get_phenotype_criteria(study_name, symbol) logger.info('Done retrieving phenotypes.') - return phenotypes, phenotype_information - - def _retrieve_derivative_stratification_from_database(self) -> dict[str, dict[str, Any]]: - logger.info('Retrieving stratification from database.') - puller = StratificationPuller(self.cursor) - puller.pull() - stratification = puller.get_stratification() - logger.info('Done retrieving stratification.') - return stratification - - def _retrieve_component_studies(self, study: str) -> set[str]: - self.cursor.execute(f''' - SELECT component_study - FROM study_component - WHERE primary_study = '{study}'; - ''') - rows = self.cursor.fetchall() - lookup: set[str] = set() - for row in rows: - lookup.add(row[0]) - return lookup + return phenotypes def _create_feature_matrices(self, study: dict[str, dict[str, Any]], centroid_coordinates: dict[str, Any], phenotypes: dict[str, PhenotypeCriteria], - ) -> dict[str, DataFrame]: + channel_information: list[str], + ) -> dict[str, MatrixBundle]: logger.info('Creating feature matrices from binary data arrays and centroids.') - matrices = {} + matrices: dict[str, MatrixBundle] = {} for j, specimen in enumerate(sorted(list(study['data arrays by specimen'].keys()))): logger.debug('Specimen %s .', specimen) expressions = study['data arrays by specimen'][specimen] - number_channels = len(study['target index lookup']) rows = [ self._create_feature_matrix_row( centroid_coordinates[specimen][i], expressions[i], - number_channels, + len(study['target index lookup']), ) for i in range(len(expressions)) ] dataframe = DataFrame( rows, - columns=['pixel x', 'pixel y'] + [f'F{i}' for i in range(number_channels)], + columns=['pixel x', 'pixel y'] + [f'C {cs}' for cs in channel_information], ) - for column_name, criteria in phenotypes.items(): - dataframe[column_name] = ( - dataframe[criteria.positive_markers].all(axis=1) & - ~dataframe[criteria.negative_markers].any(axis=1) + for symbol, criteria in phenotypes.items(): + dataframe[f'P {symbol}'] = ( + dataframe[[f'C {m}' for m in criteria.positive_markers]].all(axis=1) & + ~dataframe[[f'C {m}' for m in criteria.negative_markers]].any(axis=1) ).astype(int) - matrices[specimen] = { - 'dataframe': dataframe, - 'filename': f'{j}.tsv', - } + matrices[specimen] = MatrixBundle(dataframe, f'{j}.tsv') if 'continuous data arrays by specimen' in study: specimens = list(study['continuous data arrays by specimen'].keys()) for j, specimen in enumerate(sorted(specimens)): logger.debug('Specimen %s .', specimen) expression_vectors = study['continuous data arrays by specimen'][specimen] - number_channels = len(study['target index lookup']) dataframe = DataFrame( expression_vectors, - columns=[f'F{i}' for i in range(number_channels)], + columns=[f'C {cs}' for cs in channel_information], ) - matrices[specimen]['continuous dataframe'] = dataframe + matrices[specimen].continuous_dataframe = dataframe logger.info('Done creating feature matrices.') return matrices @@ -261,7 +228,7 @@ def _create_feature_matrix_row( def _create_channel_information(self, study_information: dict[str, dict[str, Any]] - ) -> dict[str, str]: + ) -> list[str]: logger.info('Aggregating channel information for 
one study.') targets = { int(index): target @@ -272,7 +239,49 @@ def _create_channel_information(self, for symbol, target in study_information['target by symbol'].items() } logger.info('Done aggregating channel information.') - return { - f'F{i}': symbols[targets[i]] - for i in sorted([int(index) for index in targets.keys()]) - } + return [ + symbols[targets[i]] for i in sorted([int(index) for index in targets.keys()]) + ] + + def extract_cohorts(self, study: str) -> dict[str, DataFrame]: + """Extract specimen cohort information for every specimen in a study.""" + match self.db_source: + case _DBSource.CURSOR: + extraction = self._extract_cohorts(study) + case _DBSource.CONFIG_FILE: + with DatabaseConnectionMaker(self.database_config_file) as dcm: + with dcm.get_connection().cursor() as cursor: + self.cursor = cursor + extraction = self._extract_cohorts(study) + case _DBSource.UNKNOWN: + raise RuntimeError('The database source can not be determined.') + return extraction + + def _extract_cohorts(self, study: str) -> dict[str, DataFrame]: + stratification = self._retrieve_derivative_stratification_from_database() + for substudy in self._retrieve_component_studies(study): + if substudy in stratification: + break + else: + raise RuntimeError('Stratification substudy not found for study.') + return stratification[substudy] + + def _retrieve_derivative_stratification_from_database(self) -> Stratification: + logger.info('Retrieving stratification from database.') + puller = StratificationPuller(self.cursor) + puller.pull() + stratification = puller.get_stratification() + logger.info('Done retrieving stratification.') + return stratification + + def _retrieve_component_studies(self, study: str) -> set[str]: + self.cursor.execute(f''' + SELECT component_study + FROM study_component + WHERE primary_study = '{study}'; + ''') + rows = self.cursor.fetchall() + lookup: set[str] = set() + for row in rows: + lookup.add(row[0]) + return lookup diff --git a/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py b/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py index fd8129758..4a3711138 100644 --- a/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py +++ b/spatialprofilingtoolbox/db/scripts/retrieve_feature_matrices.py @@ -1,23 +1,17 @@ """Convenience CLI wrapper of FeatureMatrixExtractor functionality, writes to files.""" import argparse -import json from os.path import exists from os.path import abspath from os.path import expanduser -from typing import cast - -from pandas import DataFrame from spatialprofilingtoolbox.standalone_utilities.module_load_error import \ SuggestExtrasException try: from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor - from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle except ModuleNotFoundError as e: SuggestExtrasException(e, 'db') from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.workflow.common.cli_arguments import add_argument @@ -30,18 +24,12 @@ def retrieve(args: argparse.Namespace): message = f'Need to supply valid database config filename: {database_config_file}' raise FileNotFoundError(message) extractor = FeatureMatrixExtractor(database_config_file=database_config_file) - bundle = cast(StudyBundle, extractor.extract(args.study_name)) - feature_matrices = cast(dict[str, dict[str, DataFrame | str]], bundle['feature matrices']) + 
feature_matrices = extractor.extract(args.study_name) for _, specimen_data in feature_matrices.items(): - df = cast(DataFrame, specimen_data['dataframe']) - filename = cast(str, specimen_data['filename']) - df.to_csv(filename, sep='\t', index=False) - outcomes = cast(DataFrame, bundle['sample cohorts']['assignments']) + specimen_data.dataframe.to_csv(specimen_data.filename, sep='\t', index=False) + outcomes = extractor.extract_cohorts(args.study_name)['assignments'] filename = 'assignments.tsv' outcomes.to_csv(filename, sep='\t', index=False) - FeatureMatrixExtractor.redact_dataframes(bundle) - with open('features.json', 'wt', encoding='utf-8') as file: - file.write(json.dumps(bundle, indent=2)) if __name__ == '__main__': diff --git a/spatialprofilingtoolbox/db/squidpy_metrics.py b/spatialprofilingtoolbox/db/squidpy_metrics.py index 1e55b55ea..f886e9696 100644 --- a/spatialprofilingtoolbox/db/squidpy_metrics.py +++ b/spatialprofilingtoolbox/db/squidpy_metrics.py @@ -1,13 +1,10 @@ """Make squidpy metrics that don't require specific phenotype selection available.""" -from typing import cast - from pandas import DataFrame from psycopg2.extensions import cursor as Psycopg2Cursor from spatialprofilingtoolbox import DatabaseConnectionMaker from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria from spatialprofilingtoolbox.db.create_data_analysis_study import DataAnalysisStudyFactory from spatialprofilingtoolbox.workflow.common.squidpy import ( @@ -65,16 +62,11 @@ def _fetch_cells_and_phenotypes( cursor: Psycopg2Cursor, study: str, ) -> tuple[dict[str, DataFrame], dict[str, str]]: - extractor = FeatureMatrixExtractor(cursor) - bundle = cast(StudyBundle, extractor.extract(study=study)) - FeatureMatrices = dict[str, dict[str, DataFrame | str]] - feature_matrices = cast(FeatureMatrices, bundle['feature matrices']) + feature_matrices = FeatureMatrixExtractor(cursor).extract(study=study) features_by_specimen = { - specimen: cast(DataFrame, packet['dataframe']) - for specimen, packet in feature_matrices.items() + specimen: bundle.dataframe for specimen, bundle in feature_matrices.items() } - channel_symbols_by_columns_name = cast( - dict[str, str], - bundle['channel symbols by column name'], - ) - return features_by_specimen, channel_symbols_by_columns_name + channel_symbols_by_column_name = { + col_name: col_name[2:] for col_name in list(feature_matrices.values())[0].dataframe.columns + } + return features_by_specimen, channel_symbols_by_column_name diff --git a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py index 4b7760e80..bf7688278 100644 --- a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py +++ b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py @@ -1,7 +1,8 @@ """The core calculator for the proximity calculation on a single source file.""" + import warnings import pickle -from typing import cast + import pandas as pd from pandas import DataFrame @@ -9,7 +10,6 @@ from spatialprofilingtoolbox.workflow.component_interfaces.core import CoreJob from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor -from spatialprofilingtoolbox.db.feature_matrix_extractor import StudyBundle from spatialprofilingtoolbox.workflow.common.logging.performance_timer import \ PerformanceTimerReporter 
from spatialprofilingtoolbox import DatabaseConnectionMaker @@ -64,21 +64,18 @@ def log_job_info(self): def calculate_proximity(self): self.reporter.record_timepoint('Start pulling data for one sample.') - extractor = FeatureMatrixExtractor(database_config_file=self.database_config_file) - bundle = cast(StudyBundle, extractor.extract(specimen=self.sample_identifier)) + bundle = FeatureMatrixExtractor(database_config_file=self.database_config_file).extract( + specimen=self.sample_identifier) self.reporter.record_timepoint('Finished pulling data for one sample.') - identifier = list(bundle['feature matrices'].keys())[0] - Sample = dict[str, DataFrame | str] - sample = cast(Sample, bundle['feature matrices'][identifier]) - cells = cast(DataFrame, sample['dataframe']) + identifier = list(bundle.keys())[0] + cells = bundle[identifier].dataframe logger.info('Dataframe pulled: %s', cells.head()) self.create_ball_tree(cells) - self.channel_symbols_by_column_name = cast( - dict[str, str], - bundle['channel symbols by column name'], - ) + self.channel_symbols_by_column_name = { + col_name: col_name[2:] for col_name in cells.columns + } phenotype_identifiers, signatures = self.get_named_phenotype_signatures() logger.info('Named phenotypes:') logger.info(signatures) diff --git a/test/workflow/unit_tests/test_feature_matrix_extraction.py b/test/workflow/unit_tests/test_feature_matrix_extraction.py index c6059de1b..fdb48c6f3 100644 --- a/test/workflow/unit_tests/test_feature_matrix_extraction.py +++ b/test/workflow/unit_tests/test_feature_matrix_extraction.py @@ -1,71 +1,69 @@ -import json import sys -import pandas as pd +from pandas import read_csv, DataFrame -from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor +from spatialprofilingtoolbox.db.feature_matrix_extractor import ( + FeatureMatrixExtractor, + MatrixBundle, +) -def test_sample_set(study): - if study['feature matrices'].keys() != set(['lesion 0_1', 'lesion 6_1']): - print(f'Wrong sample set: {list(study["feature matrices"].keys())}') +def test_sample_set(study: dict[str, MatrixBundle]): + if study.keys() != set(['lesion 0_1', 'lesion 6_1']): + print(f'Wrong sample set: {list(study.keys())}') sys.exit(1) -def test_one_sample_set(study): - if study['feature matrices'].keys() != set(['lesion 6_1']): - print(f'Wrong sample set: {list(study["feature matrices"].keys())}') +def test_one_sample_set(study: dict[str, MatrixBundle]): + if study.keys() != set(['lesion 6_1']): + print(f'Wrong sample set: {list(study.keys())}') sys.exit(1) -def test_feature_matrix_schemas(study): - for specimen, sample in study['feature matrices'].items(): - df = sample['dataframe'] - if not all(f'F{i}' in df.columns for i in range(26)): - print(f'Missing some columns in dataframe (case "{specimen}"): ') - print(df.to_string(index=False)) - sys.exit(1) +def test_feature_matrix_schemas(study: dict[str, MatrixBundle]): + for specimen, sample in study.items(): + df = sample.dataframe if df.shape != (100, 32): print(f'Wrong number of rows or columns: {df.shape} != (100, 32)') sys.exit(1) -def show_example_feature_matrix(study): +def show_example_feature_matrix(study: dict[str, MatrixBundle]): specimen = 'lesion 0_1' - df = study['feature matrices'][specimen]['dataframe'] + df = study[specimen].dataframe print(f'Example feature matrix, for specimen {specimen}:') print(df.to_string(index=False)) print('') -def test_channels(study): - channels = study['channel symbols by column name'] +def test_channels(study: dict[str, MatrixBundle]): + 
channels = list(study.values())[0].dataframe.columns known = ['B2M', 'B7H3', 'CD14', 'CD163', 'CD20', 'CD25', 'CD27', 'CD3', 'CD4', 'CD56', 'CD68', 'CD8', 'DAPI', 'FOXP3', 'IDO1', 'KI67', 'LAG3', 'MHCI', 'MHCII', 'MRC1', 'PD1', 'PDL1', 'S100B', 'SOX10', 'TGM2', 'TIM3'] - if set(channels.values()) != set(known): - print(f'Wrong channel set: {list(channels.values())}') + if set(channels) != set(known): + print(f'Wrong channel set: {channels.tolist()}') sys.exit(1) -def test_expression_vectors(study): +def test_expression_vectors(study: dict[str, MatrixBundle]): def create_column_name(channels, channel_num): return channels[channel_num] + '_Positive' - for specimen in study['feature matrices'].keys(): - df = study['feature matrices'][specimen]['dataframe'] + for specimen in study.keys(): + df = study[specimen].dataframe print('Dataframe: ' + str(specimen)) print(df) expression_vectors = sorted([ - tuple(row[f'F{i}'] for i in range(26)) + tuple(row[row.index.str.startswith('C ')].tolist()) for _, row in df.iterrows() ]) filenames = {'lesion 0_1': '0.csv', 'lesion 6_1': '3.csv'} cells_filename = filenames[specimen] - reference = pd.read_csv( + reference = read_csv( f'../test_data/adi_preprocessed_tables/dataset1/{cells_filename}', sep=',') channels = study['channel symbols by column name'] @@ -85,21 +83,21 @@ def create_column_name(channels, channel_num): print('Expression vector sets are as expected.') -def test_expression_vectors_continuous(study): +def test_expression_vectors_continuous(study: dict[str, MatrixBundle]): def create_column_name(channels, channel_num): return channels[channel_num] + '_Intensity' - for specimen in study['feature matrices'].keys(): - df = study['feature matrices'][specimen]['continuous dataframe'] + for specimen in study.keys(): + df = study[specimen].continuous_dataframe print(df.head()) expression_vectors = sorted([ - tuple(row[f'F{i}'] for i in range(26)) + tuple(row[row.index.str.startswith('C ')].tolist()) for _, row in df.iterrows() ]) filenames = {'lesion 0_1': '0.csv', 'lesion 6_1': '3.csv'} cells_filename = filenames[specimen] - reference = pd.read_csv( + reference = read_csv( f'../test_data/adi_preprocessed_tables/dataset1/{cells_filename}', sep=',') channels = study['channel symbols by column name'] @@ -119,9 +117,9 @@ def create_column_name(channels, channel_num): print('Expression vector sets are as expected.') -def test_stratification(study): - df = study['sample cohorts']['assignments'] - strata = study['sample cohorts']['strata'] +def test_stratification(study: dict[str, DataFrame]): + df = study['assignments'] + strata = study['strata'] print('Sample cohorts:') print(df.to_string(index=False)) print(strata.to_string(index=False)) @@ -134,13 +132,14 @@ def test_stratification(study): if __name__ == '__main__': extractor = FeatureMatrixExtractor(database_config_file='../db/.spt_db.config.container') - test_study = extractor.extract(study='Melanoma intralesional IL2') + study_name = 'Melanoma intralesional IL2' + test_study = extractor.extract(study=study_name) test_sample_set(test_study) test_feature_matrix_schemas(test_study) show_example_feature_matrix(test_study) test_channels(test_study) test_expression_vectors(test_study) - test_stratification(test_study) + test_stratification(extractor.extract_cohorts(study_name)) one_sample_study = extractor.extract(specimen='lesion 6_1') test_one_sample_set(one_sample_study) @@ -153,12 +152,3 @@ def test_stratification(study): test_feature_matrix_schemas(one_sample_study_continuous) 
test_channels(one_sample_study_continuous) test_expression_vectors_continuous(one_sample_study_continuous) - - FeatureMatrixExtractor.redact_dataframes(test_study) - print('\nMetadata "bundle" with dataframes removed:') - print(json.dumps(test_study, indent=2)) - - print('\n... and in the one-sample case:') - FeatureMatrixExtractor.redact_dataframes(one_sample_study) - print('\nMetadata "bundle" with dataframes removed:') - print(json.dumps(one_sample_study, indent=2)) From 61c53b5930509b87c7defdd8a9a0e009662df8be Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Thu, 24 Aug 2023 18:52:54 -0400 Subject: [PATCH 17/31] a little formatting on FME --- spatialprofilingtoolbox/db/feature_matrix_extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spatialprofilingtoolbox/db/feature_matrix_extractor.py b/spatialprofilingtoolbox/db/feature_matrix_extractor.py index c88217975..0f30ab77d 100644 --- a/spatialprofilingtoolbox/db/feature_matrix_extractor.py +++ b/spatialprofilingtoolbox/db/feature_matrix_extractor.py @@ -23,6 +23,7 @@ logger = colorized_logger(__name__) + @dataclass class MatrixBundle: """Bundle of information for a specimen matrix.""" @@ -30,6 +31,7 @@ class MatrixBundle: filename: str continuous_dataframe: DataFrame | None = None + class _DBSource(Enum): """Indicator of intended database source.""" CURSOR = auto() @@ -71,7 +73,7 @@ def extract(self, continuous_also: bool = False, ) -> dict[str, MatrixBundle]: """Extract feature matrices for a specimen or every specimen in a study. - + Parameters ---------- specimen: str | None = None From d10fdd851f8f52ad0a6bb8aab3133b6ef93c5d45 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Thu, 24 Aug 2023 19:11:37 -0400 Subject: [PATCH 18/31] explore classes for cggnn extraction --- spatialprofilingtoolbox/cggnn/extract.py | 39 +++++++++++++++---- .../cggnn/scripts/explore_classes.py | 33 ++++++++++++++++ .../cggnn/scripts/extract.py | 23 +++++++++-- test/cggnn/unit_tests/test_explore.sh | 12 ++++++ 4 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 spatialprofilingtoolbox/cggnn/scripts/explore_classes.py create mode 100644 test/cggnn/unit_tests/test_explore.sh diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py index 9ee553da4..13efb6df8 100644 --- a/spatialprofilingtoolbox/cggnn/extract.py +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -23,21 +23,40 @@ def _create_cell_df(dfs_by_specimen: dict[str, DataFrame]) -> DataFrame: def _create_label_df( df_assignments: DataFrame, df_strata: DataFrame, + strata_to_use: list[int] | None, ) -> tuple[DataFrame, dict[int, str]]: """Get slide-level results.""" - df = merge(df_assignments, df_strata, on='stratum identifier', how='left')[ - ['specimen', 'subject diagnosed result'] - ].rename( - {'specimen': 'slide', 'subject diagnosed result': 'result'}, - axis=1, - ) - label_to_result = dict(enumerate(sort(df['result'].unique()))) + df_assignments = df_assignments.set_index('specimen') + df_strata = df_strata.set_index('stratum identifier') + + # Filter for strata to use + if strata_to_use is not None: + df_strata = df_strata.loc[sorted(strata_to_use)] + if df_strata.shape[0] < 2: + raise ValueError(f'Need at least 2 strata to classify, there are {df_strata.shape[0]}.') + + # Drop columns that're the same for all kept strata + for col in df_strata.columns.tolist(): + if df_strata[col].unique().size == 1: + df_strata = df_strata.drop(col, axis=1) + + # Compress remaining columns into a single string + 
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str) + for i in range(1, df_strata.shape[1]): + df_strata['label'] += df_strata.iloc[:, i].astype(str) + df_strata['label'] += ')' + df_strata = df_strata[['label']] + + # Merge with specimen assignments, keeping only selected strata + df = merge(df_assignments, df_strata, on='stratum identifier', how='inner')[['label']] + label_to_result = dict(enumerate(sort(df['label'].unique()))) return df.replace({res: i for i, res in label_to_result.items()}), label_to_result def extract_cggnn_data( spt_db_config_location: str, study: str, + strata_to_use: list[int] | None, ) -> tuple[DataFrame, DataFrame, dict[int, str]]: """Extract information cg-gnn needs from SPT.""" extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location) @@ -45,5 +64,9 @@ def extract_cggnn_data( slide: data.dataframe for slide, data in extractor.extract(study=study).items() }) cohorts = extractor.extract_cohorts(study) - df_label, label_to_result_text = _create_label_df(cohorts['assignments'], cohorts['strata']) + df_label, label_to_result_text = _create_label_df( + cohorts['assignments'], + cohorts['strata'], + strata_to_use, + ) return df_cell, df_label, label_to_result_text diff --git a/spatialprofilingtoolbox/cggnn/scripts/explore_classes.py b/spatialprofilingtoolbox/cggnn/scripts/explore_classes.py new file mode 100644 index 000000000..0367ad200 --- /dev/null +++ b/spatialprofilingtoolbox/cggnn/scripts/explore_classes.py @@ -0,0 +1,33 @@ +"""Report the different strata available to classify with.""" + +from argparse import ArgumentParser + +from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor + + +def parse_arguments(): + """Process command line arguments.""" + parser = ArgumentParser( + prog='spt cggnn explore_classes', + description='See the strata available to classify on.' + ) + parser.add_argument( + '--spt_db_config_location', + type=str, + help='Location of the SPT DB config file.', + required=True + ) + parser.add_argument( + '--study', + type=str, + help='Name of the study to query data for.', + required=True + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_arguments() + extractor = FeatureMatrixExtractor(args.spt_db_config_location) + strata = extractor.extract_cohorts(study=args.study)['strata'] + print(strata.to_string()) diff --git a/spatialprofilingtoolbox/cggnn/scripts/extract.py b/spatialprofilingtoolbox/cggnn/scripts/extract.py index 67f49aa0f..b70bfc1a2 100644 --- a/spatialprofilingtoolbox/cggnn/scripts/extract.py +++ b/spatialprofilingtoolbox/cggnn/scripts/extract.py @@ -16,15 +16,25 @@ def parse_arguments(): parser.add_argument( '--spt_db_config_location', type=str, - help='File location for SPT DB config file.', + help='Location of the SPT DB config file.', required=True ) parser.add_argument( '--study', type=str, - help='Name of the study to query data for in SPT.', + help='Location of the SPT DB config file to use.', required=True ) + parser.add_argument( + '--strata', + nargs='+', + type=int, + help='Specimen strata to use as labels, identified according to the "stratum identifier" ' + 'in `explore_classes`. 
This should be given as space separated integers.\n' + 'If not provided, all strata will be used.', + required=False, + default=None + ) parser.add_argument( '--output_location', type=str, @@ -36,7 +46,11 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() - df_cell, df_label, label_to_result = extract_cggnn_data(args.spt_db_config_location, args.study) + df_cell, df_label, label_to_result = extract_cggnn_data( + args.spt_db_config_location, + args.study, + args.strata, + ) assert isinstance(args.output_location, str) dict_filename = join(args.output_location, 'label_to_results.json') @@ -45,4 +59,5 @@ def parse_arguments(): if not (exists(dict_filename) and exists(cells_filename) and exists(labels_filename)): df_cell.to_hdf(cells_filename, 'cells') df_label.to_hdf(labels_filename, 'labels') - dump(label_to_result, open(dict_filename, 'w', encoding='utf-8')) + with open(dict_filename, 'w', encoding='utf-8') as f: + dump(label_to_result, f) diff --git a/test/cggnn/unit_tests/test_explore.sh b/test/cggnn/unit_tests/test_explore.sh new file mode 100644 index 000000000..184c0626d --- /dev/null +++ b/test/cggnn/unit_tests/test_explore.sh @@ -0,0 +1,12 @@ +spt cggnn explore_classes \ + --spt_db_config_location ../db/.spt_db.config.container \ + --study "Melanoma intralesional IL2" +status=$? +[ $status -eq 0 ] || echo "cggnn explore_classes failed." + +if [ $status -eq 0 ]; +then + exit 0 +else + exit 1 +fi From 94b504b91ad0db842fa734ac2118971da7696431 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Fri, 25 Aug 2023 11:33:22 -0400 Subject: [PATCH 19/31] fix FME test --- .../test_feature_matrix_extraction.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/test/workflow/unit_tests/test_feature_matrix_extraction.py b/test/workflow/unit_tests/test_feature_matrix_extraction.py index fdb48c6f3..61eb2c3c9 100644 --- a/test/workflow/unit_tests/test_feature_matrix_extraction.py +++ b/test/workflow/unit_tests/test_feature_matrix_extraction.py @@ -37,19 +37,17 @@ def show_example_feature_matrix(study: dict[str, MatrixBundle]): def test_channels(study: dict[str, MatrixBundle]): - channels = list(study.values())[0].dataframe.columns - known = ['B2M', 'B7H3', 'CD14', 'CD163', 'CD20', 'CD25', 'CD27', 'CD3', 'CD4', 'CD56', 'CD68', + columns = list(study.values())[0].dataframe.columns + channels = set(name[2:] for name in columns[columns.str.startswith('C ')]) + known = {'B2M', 'B7H3', 'CD14', 'CD163', 'CD20', 'CD25', 'CD27', 'CD3', 'CD4', 'CD56', 'CD68', 'CD8', 'DAPI', 'FOXP3', 'IDO1', 'KI67', 'LAG3', 'MHCI', 'MHCII', 'MRC1', 'PD1', - 'PDL1', 'S100B', 'SOX10', 'TGM2', 'TIM3'] - if set(channels) != set(known): + 'PDL1', 'S100B', 'SOX10', 'TGM2', 'TIM3'} + if channels != known: print(f'Wrong channel set: {channels.tolist()}') sys.exit(1) def test_expression_vectors(study: dict[str, MatrixBundle]): - def create_column_name(channels, channel_num): - return channels[channel_num] + '_Positive' - for specimen in study.keys(): df = study[specimen].dataframe @@ -65,10 +63,11 @@ def create_column_name(channels, channel_num): cells_filename = filenames[specimen] reference = read_csv( f'../test_data/adi_preprocessed_tables/dataset1/{cells_filename}', sep=',') - channels = study['channel symbols by column name'] + columns = list(study.values())[0].dataframe.columns + channels = [name[2:] for name in columns[columns.str.startswith('C ')]] expected_expression_vectors = sorted([ - tuple(row[create_column_name(channels, f'F{i}')] for i in range(26)) + 
tuple(row[f'{channel}_Positive'] for channel in channels) for _, row in reference.iterrows() ]) @@ -84,9 +83,6 @@ def create_column_name(channels, channel_num): def test_expression_vectors_continuous(study: dict[str, MatrixBundle]): - def create_column_name(channels, channel_num): - return channels[channel_num] + '_Intensity' - for specimen in study.keys(): df = study[specimen].continuous_dataframe print(df.head()) @@ -99,10 +95,11 @@ def create_column_name(channels, channel_num): cells_filename = filenames[specimen] reference = read_csv( f'../test_data/adi_preprocessed_tables/dataset1/{cells_filename}', sep=',') - channels = study['channel symbols by column name'] + columns = list(study.values())[0].dataframe.columns + channels = [name[2:] for name in columns[columns.str.startswith('C ')]] expected_expression_vectors = sorted([ - tuple(row[create_column_name(channels, f'F{i}')] for i in range(26)) + tuple(row[f'{channel}_Intensity'] for channel in channels) for _, row in reference.iterrows() ]) From 96634c97e845a84bfe366056b5022c8328de8da9 Mon Sep 17 00:00:00 2001 From: James Mathews Date: Fri, 25 Aug 2023 15:31:28 -0400 Subject: [PATCH 20/31] Make test more diagnosable. --- test/db/module_tests/test_autocomputed_squidpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/db/module_tests/test_autocomputed_squidpy.py b/test/db/module_tests/test_autocomputed_squidpy.py index 5a5e560b8..964e0b1e4 100644 --- a/test/db/module_tests/test_autocomputed_squidpy.py +++ b/test/db/module_tests/test_autocomputed_squidpy.py @@ -26,7 +26,8 @@ def check_records(feature_values): rows = [(row[0], row[1], round6(row[2])) for row in feature_values] missing = set(get_expected_records()).difference(rows) if len(missing) > 0: - raise ValueError(f'Expected to find records: {missing}') + raise ValueError(f'Expected to find records: {sorted(missing)}\nGot: {sorted(rows)}') + print('All expected records found.') unexpected = set(rows).difference(get_expected_records()) if len(unexpected) > 0: From c26eeb450e16e1ce1c8c46edc5a4aa2637626b2a Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Mon, 4 Sep 2023 20:17:41 -0400 Subject: [PATCH 21/31] cggnn extract clarity refactors --- spatialprofilingtoolbox/cggnn/extract.py | 24 +++++++++++++++---- .../cggnn/scripts/extract.py | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py index 13efb6df8..253360840 100644 --- a/spatialprofilingtoolbox/cggnn/extract.py +++ b/spatialprofilingtoolbox/cggnn/extract.py @@ -28,26 +28,40 @@ def _create_label_df( """Get slide-level results.""" df_assignments = df_assignments.set_index('specimen') df_strata = df_strata.set_index('stratum identifier') + df_strata = _filter_for_strata(strata_to_use, df_strata) + df_strata = _drop_unneeded_columns(df_strata) + df_strata = _compress_df(df_strata) + return _label(df_assignments, df_strata) - # Filter for strata to use + +def _filter_for_strata(strata_to_use: list[int] | None, df_strata: DataFrame) -> DataFrame: if strata_to_use is not None: df_strata = df_strata.loc[sorted(strata_to_use)] if df_strata.shape[0] < 2: raise ValueError(f'Need at least 2 strata to classify, there are {df_strata.shape[0]}.') + return df_strata + - # Drop columns that're the same for all kept strata +def _drop_unneeded_columns(df_strata: DataFrame) -> DataFrame: + """Drop columns that have internally same contents.""" for col in df_strata.columns.tolist(): - if df_strata[col].unique().size == 1: + 
if df_strata[col].nunique() == 1: df_strata = df_strata.drop(col, axis=1) + return df_strata - # Compress remaining columns into a single string + +def _compress_df(df_strata: DataFrame) -> DataFrame: + """Compress remaining columns into a single string""" df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str) for i in range(1, df_strata.shape[1]): df_strata['label'] += df_strata.iloc[:, i].astype(str) df_strata['label'] += ')' df_strata = df_strata[['label']] + return df_strata + - # Merge with specimen assignments, keeping only selected strata +def _label(df_assignments: DataFrame, df_strata: DataFrame) -> tuple[DataFrame, dict[int, str]]: + """Merge with specimen assignments, keeping only selected strata.""" df = merge(df_assignments, df_strata, on='stratum identifier', how='inner')[['label']] label_to_result = dict(enumerate(sort(df['label'].unique()))) return df.replace({res: i for i, res in label_to_result.items()}), label_to_result diff --git a/spatialprofilingtoolbox/cggnn/scripts/extract.py b/spatialprofilingtoolbox/cggnn/scripts/extract.py index b70bfc1a2..12752e8ea 100644 --- a/spatialprofilingtoolbox/cggnn/scripts/extract.py +++ b/spatialprofilingtoolbox/cggnn/scripts/extract.py @@ -22,7 +22,7 @@ def parse_arguments(): parser.add_argument( '--study', type=str, - help='Location of the SPT DB config file to use.', + help='Name of the study to query data for.', required=True ) parser.add_argument( From f3f6178eef9fb9f321d615b630806e450bbe5a84 Mon Sep 17 00:00:00 2001 From: Carlin Liao Date: Wed, 6 Sep 2023 12:42:42 -0400 Subject: [PATCH 22/31] update providers for new feature column names --- spatialprofilingtoolbox/db/squidpy_metrics.py | 30 +++---- .../workflow/common/cell_df_indexer.py | 47 ---------- .../workflow/common/proximity.py | 28 +++--- .../workflow/common/squidpy.py | 20 +++-- .../workflow/phenotype_proximity/core.py | 86 +++++++++---------- .../phenotype_proximity/integrator.py | 4 +- .../check_proximity_metric_values.py | 34 +++++--- 7 files changed, 102 insertions(+), 147 deletions(-) delete mode 100644 spatialprofilingtoolbox/workflow/common/cell_df_indexer.py diff --git a/spatialprofilingtoolbox/db/squidpy_metrics.py b/spatialprofilingtoolbox/db/squidpy_metrics.py index f886e9696..7a7406e09 100644 --- a/spatialprofilingtoolbox/db/squidpy_metrics.py +++ b/spatialprofilingtoolbox/db/squidpy_metrics.py @@ -24,10 +24,7 @@ def create_and_transcribe_squidpy_features( """Transcribe "off-demand" Squidpy feature(s) in features system.""" connection = database_connection_maker.get_connection() das = DataAnalysisStudyFactory(connection, study, 'spatial autocorrelation').create() - features_by_specimen, channel_symbols_by_column_name = _fetch_cells_and_phenotypes( - connection.cursor(), - study, - ) + features_by_specimen = _fetch_cells(connection.cursor(), study) with ADIFeaturesUploader( database_connection_maker, data_analysis_study=das, @@ -39,7 +36,6 @@ def create_and_transcribe_squidpy_features( create_and_transcribe_one_sample( sample, df, - channel_symbols_by_column_name, feature_uploader, ) @@ -47,26 +43,24 @@ def create_and_transcribe_squidpy_features( def create_and_transcribe_one_sample( sample: str, df: DataFrame, - channel_symbols_by_column_name: dict[str, str], feature_uploader: ADIFeaturesUploader, ) -> None: - for column, symbol in channel_symbols_by_column_name.items(): - criteria = PhenotypeCriteria(positive_markers=[column], negative_markers=[]) - value = compute_squidpy_metric_for_one_sample(df, [criteria], 'spatial autocorrelation') - if value is 
None: - continue - feature_uploader.stage_feature_value((symbol,), sample, value) + for column in df.columns: + if column.startswith('C '): + symbol = column[2:] + criteria = PhenotypeCriteria(positive_markers=[symbol], negative_markers=[]) + value = compute_squidpy_metric_for_one_sample(df, [criteria], 'spatial autocorrelation') + if value is None: + continue + feature_uploader.stage_feature_value((symbol,), sample, value) -def _fetch_cells_and_phenotypes( +def _fetch_cells( cursor: Psycopg2Cursor, study: str, -) -> tuple[dict[str, DataFrame], dict[str, str]]: +) -> dict[str, DataFrame]: feature_matrices = FeatureMatrixExtractor(cursor).extract(study=study) features_by_specimen = { specimen: bundle.dataframe for specimen, bundle in feature_matrices.items() } - channel_symbols_by_column_name = { - col_name: col_name[2:] for col_name in list(feature_matrices.values())[0].dataframe.columns - } - return features_by_specimen, channel_symbols_by_column_name + return features_by_specimen diff --git a/spatialprofilingtoolbox/workflow/common/cell_df_indexer.py b/spatialprofilingtoolbox/workflow/common/cell_df_indexer.py deleted file mode 100644 index fec073d40..000000000 --- a/spatialprofilingtoolbox/workflow/common/cell_df_indexer.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Convenience functions for working with cell DataFrames.""" - -import warnings - -from numpy import asarray, ndarray -from numpy.typing import NDArray -from pandas import DataFrame - -from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria -from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger - -logger = colorized_logger(__name__) - -ValueMultiIndex = tuple[int | tuple[int, ...], list[str]] - -def _get_value_and_multiindex(signature: PhenotypeCriteria) -> ValueMultiIndex: - value = (1,) * len(signature.positive_markers) + (0,) * len(signature.negative_markers) - _value: int | tuple[int, ...] - if len(value) == 1: - _value = value[0] - else: - _value = value - multiindex = [*signature.positive_markers, *signature.negative_markers] - return _value, multiindex - - -def get_mask(cells: DataFrame, signature: PhenotypeCriteria) -> NDArray[bool,]: - """Transform phenotype signature into a boolean mask for a DataFrame.""" - value, multiindex = _get_value_and_multiindex(signature) - try: - with warnings.catch_warnings(): - message = "indexing past lexsort depth may impact performance" - warnings.filterwarnings("ignore", message=message) - loc = cells.set_index(multiindex).index.get_loc(value) - except KeyError as exception: - logger.debug('Some KeyError. %s', str(exception)) - return asarray([False,] * cells.shape[0]) - if isinstance(loc, ndarray): - return loc - if isinstance(loc, slice): - range1 = [False,]*(loc.start - 0) - range2 = [True,]*(loc.stop - loc.start) - range3 = [False,]*(cells.shape[0] - loc.stop) - return asarray(range1 + range2 + range3) - if isinstance(loc, int): - return asarray([i == loc for i in range(cells.shape[0])]) - raise ValueError(f'Could not select by index: {multiindex}. 
Got: {loc}')
diff --git a/spatialprofilingtoolbox/workflow/common/proximity.py b/spatialprofilingtoolbox/workflow/common/proximity.py
index 84e5e036a..52b8f178d 100644
--- a/spatialprofilingtoolbox/workflow/common/proximity.py
+++ b/spatialprofilingtoolbox/workflow/common/proximity.py
@@ -1,14 +1,12 @@
 """Low-level calculations of proximity metric."""
 
 from math import isnan
-import re
 
 from pandas import DataFrame
 from sklearn.neighbors import BallTree  # type: ignore
 from numpy import logical_and
 
 from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria
-from spatialprofilingtoolbox.workflow.common.cell_df_indexer import get_mask
 from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger
 
 logger = colorized_logger(__name__)
@@ -21,12 +19,18 @@ def compute_proximity_metric_for_signature_pair(
     cells: DataFrame,
     tree: BallTree
 ) -> float | None:
-    mask1 = get_mask(cells, signature1)
-    mask2 = get_mask(cells, signature2)
+    cells = cells.rename({
+        column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
+        for column in cells.columns
+    }, axis=1)
+    mask1 = cells[signature1.positive_markers].all(axis=1) & \
+        ~cells[signature1.negative_markers].any(axis=1)
+    mask2 = cells[signature2.positive_markers].all(axis=1) & \
+        ~cells[signature2.negative_markers].any(axis=1)
     source_count = sum(mask1)
     if source_count == 0:
         return None
-    source_cell_locations = cells.loc()[mask1][['pixel x', 'pixel y']]
+    source_cell_locations = cells.loc[mask1, ['pixel x', 'pixel y']]
     within_radius_indices_list = tree.query_radius(
         source_cell_locations,
         radius,
@@ -50,25 +54,15 @@ def _validate_value(value) -> bool:
     return True
 
 
-def _phenotype_identifier_lookup(handle, channel_symbols_by_column_name) -> str:
-    if re.match(r'^\d+$', handle):
-        return f'cell_phenotype {handle}'
-    if re.match(r'^F\d+$', handle):
-        channel_symbol = channel_symbols_by_column_name[handle]
-        return channel_symbol
-    raise ValueError(f'Did not understand meaning of specifier: {handle}')
-
-
 def stage_proximity_feature_values(
     feature_uploader,
     feature_values,
-    channel_symbols_by_column_name,
     sample_identifier,
 ) -> None:
     for _, row in feature_values.iterrows():
         specifiers = (
-            _phenotype_identifier_lookup(row['Phenotype 1'], channel_symbols_by_column_name),
-            _phenotype_identifier_lookup(row['Phenotype 2'], channel_symbols_by_column_name),
+            row['Phenotype 1'],
+            row['Phenotype 2'],
             row['Pixel radius'],
         )
         value = row['Proximity']
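The signature masks above reduce to ordinary pandas column operations once the rename step
strips the 'C ' and 'P ' column prefixes. A minimal self-contained sketch, using a toy
two-channel table (the channel names and values are illustrative only):

    from pandas import DataFrame

    cells = DataFrame({
        'pixel x': [0, 1, 2],
        'pixel y': [0, 0, 1],
        'C CD3': [1, 1, 0],
        'C CD8': [0, 1, 1],
    })
    # Strip the 'C '/'P ' prefixes, as compute_proximity_metric_for_signature_pair does.
    cells = cells.rename(columns=lambda c: c[2:] if c[:2] in ('C ', 'P ') else c)
    # A CD3+ CD8- signature: all positive markers expressed, no negative markers expressed.
    mask = cells[['CD3']].all(axis=1) & ~cells[['CD8']].any(axis=1)
    print(mask.tolist())  # [True, False, False]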
diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index 305e01a6e..71ede4b3d 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -5,7 +5,7 @@
 from numpy.typing import NDArray
 from numpy import isnan
-from pandas import DataFrame
+from pandas import DataFrame, Series
 from squidpy.gr import (  # type: ignore
     spatial_neighbors,
     nhood_enrichment,
@@ -18,7 +18,6 @@
 from scipy.stats import norm  # type: ignore
 
 from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria
-from spatialprofilingtoolbox.workflow.common.cell_df_indexer import get_mask
 from spatialprofilingtoolbox import get_feature_description
 from spatialprofilingtoolbox import squidpy_feature_classnames
 
@@ -37,8 +36,15 @@ def compute_squidpy_metric_for_one_sample(
     radius: float | None = None,
 ) -> float | None:
     """Compute Squidpy metrics for a tissue sample with a clustering of the given phenotypes."""
-    df_cell.sort_index(inplace=True)
-    masks: list[NDArray[Any]] = [get_mask(df_cell, signature) for signature in phenotypes]
+    df_cell = df_cell.rename({
+        column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
+        for column in df_cell.columns
+    }, axis=1)
+    masks: list[Series[bool]] = [
+        (df_cell[signature.positive_markers].all(axis=1) &
+         ~df_cell[signature.negative_markers].any(axis=1))
+        for signature in phenotypes
+    ]
     adata = convert_df_to_anndata(df_cell, masks)
     match feature_class:
         case 'neighborhood enrichment':
@@ -90,7 +96,7 @@ def round10(value):
     return int(pow(10, 10) * value) / pow(10, 10)
 
 def convert_df_to_anndata(
     df: DataFrame,
-    phenotypes_to_cluster_on: list[NDArray[Any]] | None = None,
+    phenotypes_to_cluster_on: list[Series[bool]] | None = None,
 ) -> AnnData:
     """Convert SPT DataFrame to AnnData object for use with Squidpy metrics.
@@ -99,7 +105,7 @@
         A dataframe with an arbitrary index, x and y locations of histological structures with
         column names 'pixel x' and 'pixel y', and several columns with arbitrary names each
         indicating the expression of a phenotype.
-    phenotypes_to_cluster_on: list[NDArray[Any]] | None
+    phenotypes_to_cluster_on: list[Series[bool]] | None
         Used to create a 'cluster' column in the AnnData object if provided. Each list is a mask
         of positive or negative features that indicate the phenotype.
         * If only one phenotype is provided, two clusters will be created mirroring the
@@ -113,7 +119,7 @@
     """
     locations: NDArray[Any] = df[['pixel x', 'pixel y']].to_numpy()
     adata = AnnData(
-        df.to_numpy(),
+        df.drop(['pixel x', 'pixel y'], axis=1).to_numpy(),
         obsm={'spatial': locations},  # type: ignore
     )
     spatial_neighbors(adata)
diff --git a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py
index 4b7760e80..78e3e84ff 100644
--- a/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py
+++ b/spatialprofilingtoolbox/workflow/phenotype_proximity/core.py
@@ -29,7 +29,6 @@ class PhenotypeProximityCoreJob(CoreJob):
     """Core/parallelizable functionality for the phenotype proximity workflow."""
 
     radii = [60, 120]
-    channel_symbols_by_column_name: dict[str, str]
     tree: BallTree
 
     def __init__(self,
@@ -73,35 +72,46 @@ def calculate_proximity(self):
 
         self.create_ball_tree(cells)
 
-        self.channel_symbols_by_column_name = {
-            col_name: col_name[2:] for col_name in cells.columns
-        }
-        phenotype_identifiers, signatures = self.get_named_phenotype_signatures()
+        # Assemble phenotype signatures for every channel and phenotype
+        channels = sorted(
+            [col_name[2:] for col_name in cells.columns if col_name.startswith('C ')]
+        )
+        phenotypes = sorted(
+            [col_name[2:] for col_name in cells.columns if col_name.startswith('P ')]
+        )
+        signatures = self.get_named_phenotype_signatures()
+        assert set(phenotypes) == signatures.keys()
         logger.info('Named phenotypes:')
-        logger.info(signatures)
-
-        channels = sorted(self.channel_symbols_by_column_name.keys())
-        singleton_signatures = [
-            PhenotypeCriteria(positive_markers=[column_name], negative_markers=[])
-            for column_name in channels
-        ]
-        all_signatures = singleton_signatures + signatures
-        cases = self.get_cases(all_signatures)
-        proximity_metrics = [
-            compute_proximity_metric_for_signature_pair(s1, s2, r, cells, self.tree)
-            for s1, s2, r in cases
-        ]
-        self.write_table(proximity_metrics, self.get_cases(channels + phenotype_identifiers))
+        logger.info(phenotypes)
+        all_features = channels + phenotypes
+        signatures.update({
+
column_name: PhenotypeCriteria( + positive_markers=[column_name], + negative_markers=[], + ) for column_name in channels + }) + + # Calculate proximity metrics for every phenotype pair and write + proximity_metrics = { + (f1, f2, r): compute_proximity_metric_for_signature_pair( + signatures[f1], + signatures[f2], + r, + cells, + self.tree, + ) for f1, f2, r in self.get_cases(all_features) + } + self.write_table(proximity_metrics) def create_ball_tree(self, cells): self.tree = BallTree(cells[['pixel x', 'pixel y']].to_numpy()) - def get_named_phenotype_signatures(self) -> tuple[list[str], list[PhenotypeCriteria]]: + def get_named_phenotype_signatures(self) -> dict[str, PhenotypeCriteria]: with DatabaseConnectionMaker(self.database_config_file) as dcm: connection = dcm.get_connection() cursor = connection.cursor() cursor.execute(''' - SELECT cp.identifier, cp.symbol, cs.symbol, CASE cpc.polarity WHEN 'positive' THEN 1 WHEN 'negative' THEN 0 END coded_value + SELECT cp.symbol, cs.symbol, CASE cpc.polarity WHEN 'positive' THEN 1 WHEN 'negative' THEN 0 END coded_value FROM cell_phenotype cp JOIN cell_phenotype_criterion cpc ON cpc.cell_phenotype=cp.identifier JOIN chemical_species cs ON cs.identifier=cpc.marker @@ -112,12 +122,10 @@ def get_named_phenotype_signatures(self) -> tuple[list[str], list[PhenotypeCrite ''', (self.study_name,)) rows = cursor.fetchall() cursor.close() - lookup = {value: key for key, value in self.channel_symbols_by_column_name.items()} - criteria = DataFrame(rows, columns=['phenotype', 'name', 'channel', 'polarity']) - criteria = criteria[['phenotype', 'channel', 'polarity']] + criteria = DataFrame(rows, columns=['phenotype', 'channel', 'polarity']) def list_channels(df: DataFrame, polarity: int) -> list[str]: - return [lookup[r['channel']] for _, r in df.iterrows() if r['polarity'] == polarity] + return [r['channel'] for _, r in df.iterrows() if r['polarity'] == polarity] def make_signature(df) -> PhenotypeCriteria: return PhenotypeCriteria( @@ -125,31 +133,23 @@ def make_signature(df) -> PhenotypeCriteria: negative_markers=list_channels(df, 0), ) by_identifier: dict[str, PhenotypeCriteria] = {} - for key, criteria in criteria.groupby(['phenotype']): - by_identifier[str(key[0])] = make_signature(criteria) - identifiers = sorted(by_identifier.keys()) - return identifiers, [by_identifier[i] for i in identifiers] + for phenotype, criteria in criteria.groupby('phenotype'): + by_identifier[phenotype] = make_signature(criteria) + return by_identifier - def get_cases(self, items): + def get_cases(self, items: list[str]) -> list[tuple[str, str, float]]: return [ - (s1, s2, radius) - for s1 in items - for s2 in items + (f1, f2, radius) + for f1 in items + for f2 in items for radius in PhenotypeProximityCoreJob.radii ] - def write_table(self, proximity_metrics, cases): - if len(proximity_metrics) != len(cases): - raise ValueError('Number of computed features not equal to number of cases.') - rows = list(zip( - [case[0] for case in cases], - [case[1] for case in cases], - [case[2] for case in cases], - proximity_metrics, - )) + def write_table(self, proximity_metrics: dict[tuple[str, str, float], float | None]) -> None: + rows = [(f1, f2, r, m) for (f1, f2, r), m in proximity_metrics.items()] columns = ['Phenotype 1', 'Phenotype 2', 'Pixel radius', 'Proximity'] df = DataFrame(rows, columns=columns) - bundle = [df, self.channel_symbols_by_column_name, self.sample_identifier] + bundle = [df, self.sample_identifier] with open(self.results_file, 'wb') as file: pickle.dump(bundle, 
diff --git a/spatialprofilingtoolbox/workflow/phenotype_proximity/integrator.py b/spatialprofilingtoolbox/workflow/phenotype_proximity/integrator.py
index 569f1b492..890a1e569 100644
--- a/spatialprofilingtoolbox/workflow/phenotype_proximity/integrator.py
+++ b/spatialprofilingtoolbox/workflow/phenotype_proximity/integrator.py
@@ -68,9 +68,9 @@ def export_feature_values(self, core_computation_results_files, data_analysis_st
     def send_features_to_uploader(self, feature_uploader, core_computation_results_files):
         for results_file in core_computation_results_files:
             with open(results_file, 'rb') as file:
-                feature_values, channel_symbols_by_column_name, sample_identifier = pickle.load(file)
+                feature_values, sample_identifier = pickle.load(file)
             stage_proximity_feature_values(
                 feature_uploader,
                 feature_values,
-                channel_symbols_by_column_name, sample_identifier,
+                sample_identifier,
             )
diff --git a/test/workflow/module_tests/check_proximity_metric_values.py b/test/workflow/module_tests/check_proximity_metric_values.py
index 816c23eaf..6ced154e0 100644
--- a/test/workflow/module_tests/check_proximity_metric_values.py
+++ b/test/workflow/module_tests/check_proximity_metric_values.py
@@ -5,39 +5,47 @@
 from spatialprofilingtoolbox.db.database_connection import DatabaseConnectionMaker
 
-def lookup_all_cell_phenotype_signatures(database_config_file):
+def lookup_all_cell_phenotype_signatures(database_config_file) -> dict[str, set[tuple[str, str]]]:
     with DatabaseConnectionMaker(database_config_file) as dcm:
         connection = dcm.get_connection()
         cursor = connection.cursor()
         cursor.execute('''
             SELECT
-                cpc.cell_phenotype, cs.symbol, cpc.polarity
+                cp.symbol AS phenotype,
+                cs.symbol AS channel,
+                cpc.polarity
             FROM cell_phenotype_criterion cpc
-            JOIN chemical_species cs ON cs.identifier=cpc.marker
-            ORDER BY cpc.cell_phenotype
+            JOIN cell_phenotype cp
+                ON cp.identifier=cpc.cell_phenotype
+            JOIN chemical_species cs
+                ON cs.identifier=cpc.marker
+            ORDER BY phenotype
             ;
         ''')
         rows = cursor.fetchall()
         cursor.close()
-    df = pd.DataFrame(rows, columns=['cell phenotype', 'symbol', 'polarity'])
-    def extract_signature(group):
+    df = pd.DataFrame(rows, columns=['phenotype', 'channel', 'polarity'])
+    def extract_signature(group: pd.DataFrame) -> set[tuple[str, str]]:
         return set(
-            (row['symbol'], '+' if row['polarity'] == 'positive' else '-')
-            for i, row in group.iterrows()
+            (row['channel'], '+' if row['polarity'] == 'positive' else '-')
+            for _, row in group.iterrows()
         )
     return {
-        str(cell_phenotype) : extract_signature(group)
-        for cell_phenotype, group in df.groupby('cell phenotype')
+        phenotype : extract_signature(group)
+        for phenotype, group in df.groupby('phenotype')
     }
 
-def retrieve_cell_phenotype_identifier(description_string, lookup):
+def retrieve_cell_phenotype_identifier(
+    description_string: str,
+    lookup: dict[str, set[tuple[str, str]]],
+) -> str:
     signature = set((re.sub(r'[\+\-]', '', token), re.search(r'[\+\-]', token).group(0))
                     for token in description_string.split(' '))
     if len(signature) == 1:
         return list(signature)[0][0]
-    for key, value in lookup.items():
+    for phenotype, value in lookup.items():
         if value == signature:
-            return f'cell_phenotype {key}'
+            return phenotype
     raise KeyError(f'Could not figure out {description_string}. '
                    f'Looked for {signature} in values of: {lookup}')
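
For reference, the marker-description convention this test parses, demonstrated in isolation with hypothetical marker names:

    # Hypothetical demonstration of the description-string convention parsed above.
    import re

    description = 'CD3+ CD8+ CD20-'
    signature = set(
        (re.sub(r'[\+\-]', '', token), re.search(r'[\+\-]', token).group(0))
        for token in description.split(' ')
    )
    assert signature == {('CD3', '+'), ('CD8', '+'), ('CD20', '-')}
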
From aced7a13c5835a706f0c56deb5925ac1a8042ff7 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Wed, 6 Sep 2023 18:07:02 -0400
Subject: [PATCH 23/31] add cg-gnn to toml checking

---
 build/build_scripts/check_dockerfiles_consistency.py | 2 +-
 build/build_scripts/create_pyproject.py | 2 +-
 pyproject.toml.unversioned | 7 ++++---
 spatialprofilingtoolbox/workflow/common/squidpy.py | 11 ++++++-----
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/build/build_scripts/check_dockerfiles_consistency.py b/build/build_scripts/check_dockerfiles_consistency.py
index f1d92d4e5..e1571991d 100644
--- a/build/build_scripts/check_dockerfiles_consistency.py
+++ b/build/build_scripts/check_dockerfiles_consistency.py
@@ -10,7 +10,7 @@ def load_dockerfile(submodule):
 
 def check_exists(dependency, dockerfile):
     for line in dockerfile.split('\n'):
-        if re.search(f'RUN python3? -m pip install "?{dependency}"?$', line):
+        if re.search(f'RUN python3?(.11)? -m pip install "?{dependency}"?$', line):
             return True
         if re.search(dependency, line):
             print(f'Dependency "{dependency}" is mentioned in Dockerfile, but something isn\'t quite right with installation command.')
diff --git a/build/build_scripts/create_pyproject.py b/build/build_scripts/create_pyproject.py
index 7239fd712..e0a045ff4 100644
--- a/build/build_scripts/create_pyproject.py
+++ b/build/build_scripts/create_pyproject.py
@@ -2,7 +2,7 @@
 import toml
 
 def validate_dependencies_all(project):
-    modules = ['apiserver', 'db', 'ondemand', 'workflow']
+    modules = ['apiserver', 'cggnn', 'db', 'ondemand', 'workflow']
     dependencies = set()
     for module in modules:
         dependencies = dependencies.union(set(project['project']['optional-dependencies'][module]))
diff --git a/pyproject.toml.unversioned b/pyproject.toml.unversioned
index c832a1974..8e053ef8e 100644
--- a/pyproject.toml.unversioned
+++ b/pyproject.toml.unversioned
@@ -41,6 +41,9 @@ apiserver = [
     "Pillow==9.5.0",
     "pydantic==2.0.2"
 ]
+cggnn = [
+    "cg-gnn"
+]
 db = [
     "pandas==2.0.2",
     "pyshp==2.2.0",
@@ -55,9 +58,6 @@ ondemand = [
     "pydantic==2.0.2",
     "squidpy==1.3.0"
 ]
-cggnn = [
-    "cg-gnn"
-]
 workflow = [
     "matplotlib==3.7.1",
     "umap-learn==0.5.3",
@@ -71,6 +71,7 @@ workflow = [
     "Pillow==9.5.0"
 ]
 all = [
+    "cg-gnn",
     "matplotlib==3.7.1",
     "umap-learn==0.5.3",
     "uvicorn>=0.15.0,<0.16.0",
diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index 71ede4b3d..93ed4bc05 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -40,9 +40,9 @@ def compute_squidpy_metric_for_one_sample(
         column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
         for column in cells.columns
     }, axis=1)
-    masks: list[Series[bool]] = [
-        (df_cell[signature.positive_markers].all(axis=1) &
-         (~df_cell[signature.negative_markers]).all(axis=1))
+    masks: list[Series] = [
+        (df_cell[signature.positive_markers].all(axis=1) &
+         (~df_cell[signature.negative_markers]).all(axis=1))
         for signature in phenotypes
     ]
     adata = convert_df_to_anndata(df_cell, masks)
@@ -94,9 +94,10 @@ def _summarize_spatial_autocorrelation(unstructured_metrics) -> float | None:
 def round10(value):
     return int(pow(10, 10) * value) / pow(10, 10)
 
+
 def convert_df_to_anndata(
     df: DataFrame,
-    phenotypes_to_cluster_on: list[Series[bool]] | None = None,
+    phenotypes_to_cluster_on: list[Series] | None = None,
 ) -> AnnData:
     """Convert SPT DataFrame to AnnData object for use with Squidpy metrics.
 
@@ -105,7 +106,7 @@ def convert_df_to_anndata(
         A dataframe with an arbitrary index, x and y locations of histological structures with
         column names 'pixel x' and 'pixel y', and several columns with arbitrary names each
         indicating the expression of a phenotype.
-    phenotypes_to_cluster_on: list[Series[bool]] | None
+    phenotypes_to_cluster_on: list[Series] | None
         Used to create a 'cluster' column in the AnnData object if provided. Each list is a mask
         of positive or negative features that indicate the phenotype.
         * If only one phenotype is provided, two clusters will be created mirroring the
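
The loosened regular expression above now also accepts installs pinned to a specific interpreter (e.g. python3.11). A quick standalone check of that behavior; the Dockerfile lines are hypothetical:

    # Hypothetical check of the updated dependency-detection regex.
    import re

    dependency = 'cg-gnn'
    pattern = f'RUN python3?(.11)? -m pip install "?{dependency}"?$'
    assert re.search(pattern, 'RUN python3 -m pip install cg-gnn')
    assert re.search(pattern, 'RUN python3.11 -m pip install "cg-gnn"')
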
From 434f474b132af57aff4122ae30f23e860cefc3d8 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Wed, 6 Sep 2023 20:06:50 -0400
Subject: [PATCH 24/31] adjust squidpy clustering

---
 .../workflow/common/squidpy.py | 35 ++++++++++++-------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index 93ed4bc05..c8d4b3405 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -2,6 +2,7 @@
 from typing import Any
 from typing import cast
+from warnings import warn
 
 from numpy.typing import NDArray
 from numpy import isnan
@@ -38,7 +39,7 @@ def compute_squidpy_metric_for_one_sample(
     """Compute Squidpy metrics for a tissue sample with a clustering of the given phenotypes."""
     df_cell = df_cell.rename({
         column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
-        for column in cells.columns
+        for column in df_cell.columns
     }, axis=1)
     masks: list[Series] = [
         (df_cell[signature.positive_markers].all(axis=1) &
          (~df_cell[signature.negative_markers]).all(axis=1))
@@ -65,7 +66,9 @@ def _summarize_neighborhood_enrichment(unstructured_metrics) -> float | None:
     return float(norm.cdf(zscore))
 
-def _summarize_co_occurrence(unstructured_metrics) -> float | None:
+def _summarize_co_occurrence(unstructured_metrics: dict[str, NDArray[Any]] | None) -> float | None:
+    if unstructured_metrics is None:
+        return None
     occurrence_ratios = unstructured_metrics['occ']
     return float(occurrence_ratios[0][1][0])
 
@@ -130,8 +133,10 @@ def convert_df_to_anndata(
     for phenotype in phenotypes_to_cluster_on[1:]:
         clustering[(clustering == 0) & phenotype] = i_cluster
         i_cluster += 1
-    adata.obs['cluster'] = clustering
+    adata.obs['cluster'] = clustering.to_numpy()
     adata.obs['cluster'] = adata.obs['cluster'].astype('category')
+    if adata.obs['cluster'].nunique() == 1:
+        warn('All phenotypes provided had identical values. Only one cluster could be made.')
     return adata
 
@@ -142,17 +147,21 @@ def _nhood_enrichment(adata: AnnData) -> dict[str, list[float] | list[int]]:
     return {'zscore': zscore.tolist(), 'count': count.tolist()}
 
-def _co_occurrence(adata: AnnData, radius: float) -> dict[str, list[float]]:
+def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | None:
     """Compute co-occurrence probability of clusters."""
-    result = co_occurrence(
-        adata,
-        'cluster',
-        copy=True,
-        interval=[0.0, radius],  # type: ignore
-        show_progress_bar=False,
-    )
-    occ, interval = cast(tuple[NDArray[Any], NDArray[Any]], result)
-    return {'occ': occ.tolist(), 'interval': interval.tolist()}
+    try:
+        result = co_occurrence(
+            adata,
+            'cluster',
+            copy=True,
+            interval=[0.0, radius],  # type: ignore
+            show_progress_bar=False,
+        )
+    except ValueError:
+        warn('Not enough clusters.')
+        return None
+    occ, interval = cast(tuple[NDArray[Anyy], NDArray[Any]], result)
+    return {'occ': occ, 'interval': interval}
From 702d631c43bda4019ea2b1f7f479176f67675b82 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Wed, 6 Sep 2023 20:12:27 -0400
Subject: [PATCH 25/31] actually handling this without a try except is better

---
 .../workflow/common/squidpy.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index c8d4b3405..87a6cc507 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -149,17 +149,15 @@ def _nhood_enrichment(adata: AnnData) -> dict[str, list[float] | list[int]]:
 
 def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | None:
     """Compute co-occurrence probability of clusters."""
-    try:
-        result = co_occurrence(
-            adata,
-            'cluster',
-            copy=True,
-            interval=[0.0, radius],  # type: ignore
-            show_progress_bar=False,
-        )
-    except ValueError:
-        warn('Not enough clusters.')
+    if adata.obs['cluster'].nunique() < 2:
         return None
+    result = co_occurrence(
+        adata,
+        'cluster',
+        copy=True,
+        interval=[0.0, radius],  # type: ignore
+        show_progress_bar=False,
+    )
     occ, interval = cast(tuple[NDArray[Anyy], NDArray[Any]], result)
     return {'occ': occ, 'interval': interval}

From d2eb9c1b5de73d15304db72fd0398a689ef6dc66 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Wed, 6 Sep 2023 20:13:24 -0400
Subject: [PATCH 26/31] any typo

---
 spatialprofilingtoolbox/workflow/common/squidpy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index 87a6cc507..e07adde70 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -158,7 +158,7 @@ def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | N
         show_progress_bar=False,
     )
-    occ, interval = cast(tuple[NDArray[Anyy], NDArray[Any]], result)
+    occ, interval = cast(tuple[NDArray[Any], NDArray[Any]], result)
     return {'occ': occ, 'interval': interval}
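
Patches 24 through 26 make the single-cluster case survivable: the warning flags degenerate clusterings, and co-occurrence is skipped rather than allowed to raise. A toy sketch of the degenerate input being guarded against; the data here are hypothetical:

    # Toy illustration: co-occurrence needs at least two cluster categories.
    from anndata import AnnData
    from numpy import array
    import pandas as pd

    adata = AnnData(
        array([[1.0], [0.0], [1.0]]),
        obsm={'spatial': array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])},
    )
    adata.obs['cluster'] = pd.Categorical([1, 1, 1])  # a single category
    if adata.obs['cluster'].nunique() < 2:
        result = None  # skip the metric rather than let co_occurrence fail
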
From dac2cf92a6c4de8098413398b2b6c3f017a59069 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Wed, 6 Sep 2023 20:56:06 -0400
Subject: [PATCH 27/31] handle malformed squidpy returns more gracefully

---
 .../workflow/common/squidpy.py | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index e07adde70..1673bb0e5 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -61,7 +61,11 @@ def compute_squidpy_metric_for_one_sample(
     return None
 
-def _summarize_neighborhood_enrichment(unstructured_metrics) -> float | None:
+def _summarize_neighborhood_enrichment(
+    unstructured_metrics: dict[str, NDArray[Any]]
+) -> float | None:
+    if len(unstructured_metrics['zscore'].shape) != 2:
+        return None
     zscore = float(unstructured_metrics['zscore'][0][1])
     return float(norm.cdf(zscore))
 
@@ -70,13 +74,17 @@ def _summarize_co_occurrence(unstructured_metrics: dict[str, NDArray[Any]] | Non
     if unstructured_metrics is None:
         return None
     occurrence_ratios = unstructured_metrics['occ']
+    if len(occurrence_ratios.shape) != 3:
+        return None
     return float(occurrence_ratios[0][1][0])
 
-def _summarize_ripley(unstructured_metrics) -> float | None:
+def _summarize_ripley(unstructured_metrics: dict[str, NDArray[Any]]) -> float | None:
     bins = unstructured_metrics['bins']
     pvalues = unstructured_metrics['pvalues']
-    pairs = list(zip(bins, pvalues))
+    pairs = list(zip(bins.tolist(), pvalues.tolist()))
+    if len(bins) != len(pvalues) or len(pairs) == 0:
+        return None
     filtered = [pair for pair in pairs if pair[1] > 0]
     if len(filtered) == 0:
         return 1.0
@@ -84,7 +92,7 @@ def _summarize_ripley(unstructured_metrics) -> float | None:
     return float(sorted_pairs[0][1])
 
-def _summarize_spatial_autocorrelation(unstructured_metrics) -> float | None:
+def _summarize_spatial_autocorrelation(unstructured_metrics: DataFrame) -> float | None:
     row = unstructured_metrics.iloc[0]
     pvalue = float(row['pval_norm'])
     if isnan(pvalue):
@@ -140,11 +148,11 @@ def convert_df_to_anndata(
     return adata
 
-def _nhood_enrichment(adata: AnnData) -> dict[str, list[float] | list[int]]:
+def _nhood_enrichment(adata: AnnData) -> dict[str, NDArray[Any]]:
     """Compute neighborhood enrichment by permutation test."""
     result = nhood_enrichment(adata, 'cluster', copy=True, seed=128, show_progress_bar=False)
     zscore, count = cast(tuple[NDArray[Any], NDArray[Any]], result)
-    return {'zscore': zscore.tolist(), 'count': count.tolist()}
+    return {'zscore': zscore, 'count': count}
 
 def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | None:
@@ -162,12 +170,14 @@ def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | N
     return {'occ': occ, 'interval': interval}
 
-def _ripley(adata: AnnData) -> dict[str, list[list[float]] | list[float] | list[int]]:
+def _ripley(adata: AnnData) -> dict[str, NDArray[Any]]:
     r"""Compute various Ripley\'s statistics for point processes."""
     result = ripley(adata, 'cluster', copy=True)
+    bins = cast(NDArray[Any], result['bins'])
+    pvalues = cast(NDArray[Any], result['pvalues'])
     return {
-        'bins': result['bins'].tolist(),
-        'pvalues': result['pvalues'].tolist()[0],
+        'bins': bins,
+        'pvalues': pvalues[0,],
     }
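
These shape checks convert malformed Squidpy payloads into None summaries instead of IndexErrors downstream. The guard pattern in isolation, with a deliberately malformed toy array:

    # Toy illustration of the defensive shape checks added above.
    from numpy import array

    occ = array([0.5, 0.5])  # malformed: a 3-dimensional occurrence array was expected
    summary = float(occ[0][1][0]) if len(occ.shape) == 3 else None
    assert summary is None
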
From 9779a787b805a6e83acfc0460ee9fd01aeaacf19 Mon Sep 17 00:00:00 2001
From: Carlin Liao
Date: Fri, 8 Sep 2023 17:42:52 -0400
Subject: [PATCH 28/31] cggnn extract docstring

---
 spatialprofilingtoolbox/cggnn/extract.py | 28 +++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/spatialprofilingtoolbox/cggnn/extract.py b/spatialprofilingtoolbox/cggnn/extract.py
index 253360840..be76310b4 100644
--- a/spatialprofilingtoolbox/cggnn/extract.py
+++ b/spatialprofilingtoolbox/cggnn/extract.py
@@ -72,7 +72,33 @@ def extract_cggnn_data(
     study: str,
     strata_to_use: list[int] | None,
 ) -> tuple[DataFrame, DataFrame, dict[int, str]]:
-    """Extract information cg-gnn needs from SPT."""
+    """Extract information cg-gnn needs from SPT.
+
+    Parameters
+    ----------
+    spt_db_config_location : str
+        Location of the SPT DB config file.
+    study : str
+        Name of the study to query data for.
+    strata_to_use : list[int] | None
+        Specimen strata to use as labels, identified according to the "stratum identifier" in
+        `explore_classes`. This should be given as space-separated integers.
+        If not provided, all strata will be used.
+
+    Returns
+    -------
+    df_cell: DataFrame
+        Rows are individual cells, indexed by an integer ID.
+        Columns or column groups are, named and in order:
+            1. The 'specimen' the cell is from
+            2. Cell centroid positions 'pixel x' and 'pixel y'
+            3. Channel expressions starting with 'C ' and followed by human-readable symbol text
+            4. Phenotype expressions starting with 'P ' followed by human-readable symbol text
+    df_label: DataFrame
+        Rows are specimens; the sole column, 'label', gives each specimen's class label as an
+        integer.
+    label_to_result_text: dict[int, str]
+        Mapping from class integer label to human-interpretable result text.
+    """
     extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location)
     df_cell = _create_cell_df({
         slide: data.dataframe for slide, data in extractor.extract(study=study).items()
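
A usage sketch for the signature documented above; the config path, study name, and printed values are hypothetical:

    # Hypothetical invocation of the extraction entry point documented above.
    from spatialprofilingtoolbox.cggnn import extract_cggnn_data

    df_cell, df_label, label_to_result_text = extract_cggnn_data(
        spt_db_config_location='~/.spt_db.config',  # hypothetical path
        study='Example study',                      # hypothetical study name
        strata_to_use=None,                         # use every specimen stratum as a label
    )
    print(df_cell.columns)       # 'specimen', 'pixel x', 'pixel y', then 'C ...'/'P ...' columns
    print(label_to_result_text)  # e.g. {0: 'responder', 1: 'non-responder'}
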
From f4a0f240ce972a049667210a2752953e18dd449c Mon Sep 17 00:00:00 2001
From: James Mathews
Date: Tue, 12 Sep 2023 14:32:42 -0400
Subject: [PATCH 29/31] Change dataframe to bool values to permit "all" & "all"
 syntax.

---
 spatialprofilingtoolbox/workflow/common/squidpy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index 1673bb0e5..d0f7b8865 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -40,7 +40,7 @@ def compute_squidpy_metric_for_one_sample(
     df_cell = df_cell.rename({
         column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
         for column in df_cell.columns
-    }, axis=1)
+    }, axis=1).astype(bool)
     masks: list[Series] = [
         (df_cell[signature.positive_markers].all(axis=1) &
          (~df_cell[signature.negative_markers]).all(axis=1))

From bb677996e9d85e05df9633e92932acc173848c31 Mon Sep 17 00:00:00 2001
From: James Mathews
Date: Tue, 12 Sep 2023 15:58:06 -0400
Subject: [PATCH 30/31] Fix accidental booleanization of pixel position
 columns.

---
 spatialprofilingtoolbox/workflow/common/squidpy.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index d0f7b8865..d57716c24 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -40,10 +40,10 @@ def compute_squidpy_metric_for_one_sample(
     df_cell = df_cell.rename({
         column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
         for column in df_cell.columns
-    }, axis=1).astype(bool)
+    }, axis=1)
     masks: list[Series] = [
-        (df_cell[signature.positive_markers].all(axis=1) &
-         (~df_cell[signature.negative_markers]).all(axis=1))
+        (df_cell.astype(bool)[signature.positive_markers].all(axis=1) &
+         (~df_cell.astype(bool)[signature.negative_markers]).all(axis=1))
         for signature in phenotypes
     ]
     adata = convert_df_to_anndata(df_cell, masks)

From 5dfc0014ba92f0db82f80cf165163acfb6613f88 Mon Sep 17 00:00:00 2001
From: James Mathews
Date: Tue, 12 Sep 2023 16:14:25 -0400
Subject: [PATCH 31/31] Make operator precedence explicit for booleanization
 and negation.

---
 spatialprofilingtoolbox/workflow/common/proximity.py | 9 +++++----
 spatialprofilingtoolbox/workflow/common/squidpy.py | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/spatialprofilingtoolbox/workflow/common/proximity.py b/spatialprofilingtoolbox/workflow/common/proximity.py
index 52b8f178d..42ea892b4 100644
--- a/spatialprofilingtoolbox/workflow/common/proximity.py
+++ b/spatialprofilingtoolbox/workflow/common/proximity.py
@@ -23,11 +23,12 @@ def compute_proximity_metric_for_signature_pair(
         column: (column[2:] if (column.startswith('C ') or column.startswith('P ')) else column)
         for column in cells.columns
     }, axis=1)
-    mask1 = cells[signature1.positive_markers].all(axis=1) & \
-        (~cells[signature1.negative_markers]).all(axis=1)
-    mask2 = cells[signature1.positive_markers].all(axis=1) & \
-        (~cells[signature2.negative_markers]).all(axis=1)
+    mask1 = cells.astype(bool)[signature1.positive_markers].all(axis=1) & \
+        (~(cells.astype(bool))[signature1.negative_markers]).all(axis=1)
+    mask2 = cells.astype(bool)[signature2.positive_markers].all(axis=1) & \
+        (~(cells.astype(bool))[signature2.negative_markers]).all(axis=1)
     source_count = sum(mask1)
+
     if source_count == 0:
         return None
     source_cell_locations = cells.loc[mask1, ['pixel x', 'pixel y']]
diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index d57716c24..92f1316d2 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -43,7 +43,7 @@ def compute_squidpy_metric_for_one_sample(
     }, axis=1)
     masks: list[Series] = [
         (df_cell.astype(bool)[signature.positive_markers].all(axis=1) &
-         (~df_cell.astype(bool)[signature.negative_markers]).all(axis=1))
+         (~(df_cell.astype(bool))[signature.negative_markers]).all(axis=1))
         for signature in phenotypes
     ]
    adata = convert_df_to_anndata(df_cell, masks)
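
To close out the series, the positive/negative-marker mask idiom these last three patches converge on, demonstrated on hypothetical data. For a boolean DataFrame the extra parentheses do not change the result (negating then selecting equals selecting then negating); they only make the intended reading explicit:

    # Hypothetical data demonstrating the final positive/negative-marker mask idiom.
    from pandas import DataFrame

    cells = DataFrame({'CD3': [1, 1, 0], 'CD8': [1, 0, 0], 'CD20': [0, 0, 1]})
    positive_markers, negative_markers = ['CD3', 'CD8'], ['CD20']
    mask = cells.astype(bool)[positive_markers].all(axis=1) & \
        (~(cells.astype(bool))[negative_markers]).all(axis=1)
    assert mask.tolist() == [True, False, False]
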