hotfix cggnn scripts, streamline calls #208

Merged
merged 5 commits on Sep 14, 2023
1 change: 1 addition & 0 deletions build/cggnn/Dockerfile
@@ -15,6 +15,7 @@ RUN python3.11 -m pip install psycopg2==2.9.6
RUN python3.11 -m pip install adiscstudies==0.11.0
RUN python3.11 -m pip install numba==0.57.0
RUN python3.11 -m pip install attrs==23.1.0
RUN python3.11 -m pip install tables
RUN python3.11 -m pip install cg-gnn
ARG version
ARG service_name
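Context for the new `tables` line here and in pyproject.toml.unversioned below: the cggnn extract script writes its outputs with pandas' `to_hdf`, and pandas defers HDF5 support to the optional PyTables package (`tables`) at call time rather than at import. A minimal sketch of the failure mode this fixes, using hypothetical data:

```python
# Minimal sketch, assuming pandas is installed but PyTables is not.
from pandas import DataFrame

df = DataFrame({'pixel x': [0.0, 1.5], 'pixel y': [2.0, 3.5]})

# pandas imports the optional "tables" package only when to_hdf is
# called, so this raises ImportError at runtime unless it is installed.
df.to_hdf('cells.h5', 'cells')
```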
6 changes: 4 additions & 2 deletions pyproject.toml.unversioned
@@ -43,7 +43,8 @@ apiserver = [
"secure==0.3.0"
]
cggnn = [
"cg-gnn"
"cg-gnn",
"tables"
]
db = [
"pandas==2.0.2",
@@ -87,7 +88,8 @@ all = [
"fastapi==0.100.0",
"Pillow==9.5.0",
"squidpy==1.3.0",
"secure==0.3.0"
"secure==0.3.0",
"tables"
]
dev = [
"autopep8",
47 changes: 32 additions & 15 deletions spatialprofilingtoolbox/cggnn/extract.py
@@ -13,10 +13,17 @@ def _create_cell_df(dfs_by_specimen: dict[str, DataFrame]) -> DataFrame:

df = concat(dfs_by_specimen.values(), axis=0)
df.index.name = 'histological_structure'

# Convert binary int columns to boolean
channels = df.columns[df.columns.str.startswith('C ')]
phenotypes = df.columns[df.columns.str.startswith('P ')]
df[channels] = df[channels].astype(bool)
df[phenotypes] = df[phenotypes].astype(bool)

# Reorder columns so it's specimen, xy, channels, and phenotypes
column_order = ['specimen', 'pixel x', 'pixel y']
column_order.extend(df.columns[df.columns.str.startswith('C ')])
column_order.extend(df.columns[df.columns.str.startswith('P ')])
column_order.extend(channels)
column_order.extend(phenotypes)
return df[column_order]
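A toy illustration of what the new lines in this hunk do (the column names here are hypothetical): binary 0/1 channel (`C ...`) and phenotype (`P ...`) columns become proper booleans, and columns are reordered to specimen, coordinates, channels, then phenotypes.

```python
from pandas import DataFrame

df = DataFrame({
    'C CD3': [0, 1],         # binary channel column (hypothetical)
    'P T cell': [1, 0],      # binary phenotype column (hypothetical)
    'specimen': ['s1', 's1'],
    'pixel x': [1.0, 2.0],
    'pixel y': [3.0, 4.0],
})
channels = df.columns[df.columns.str.startswith('C ')]
phenotypes = df.columns[df.columns.str.startswith('P ')]
df[channels] = df[channels].astype(bool)      # 0/1 ints -> False/True
df[phenotypes] = df[phenotypes].astype(bool)
order = ['specimen', 'pixel x', 'pixel y', *channels, *phenotypes]
df = df[order]  # specimen, xy, channels, phenotypes
```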


@@ -25,8 +32,9 @@ def _create_label_df(
df_strata: DataFrame,
strata_to_use: list[int] | None,
) -> tuple[DataFrame, dict[int, str]]:
"""Get slide-level results."""
df_assignments = df_assignments.set_index('specimen')
"""Get specimen-level results."""
df_assignments['stratum identifier'] = df_assignments['stratum identifier'].astype(int)
df_strata['stratum identifier'] = df_strata['stratum identifier'].astype(int)
df_strata = df_strata.set_index('stratum identifier')
df_strata = _filter_for_strata(strata_to_use, df_strata)
df_strata = _drop_unneeded_columns(df_strata)
@@ -52,17 +60,22 @@ def _drop_unneeded_columns(df_strata: DataFrame) -> DataFrame:

def _compress_df(df_strata: DataFrame) -> DataFrame:
"""Compress remaining columns into a single string"""
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str)
for i in range(1, df_strata.shape[1]):
df_strata['label'] += df_strata.iloc[:, i].astype(str)
df_strata['label'] += ')'
df_strata = df_strata[['label']]
n_columns = df_strata.shape[1]
if n_columns == 1:
df_strata = df_strata.rename(columns={df_strata.columns[0]: 'label'})
else:
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str)
for i in range(1, n_columns):
df_strata['label'] += ', ' + df_strata.iloc[:, i].astype(str)
df_strata['label'] += ')'
df_strata = df_strata[['label']]
return df_strata
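The rewrite changes two behaviors: a lone stratum column is now renamed rather than wrapped in parentheses, and multi-column labels gain a ", " separator. A standalone mirror of the new logic, for illustration only:

```python
from pandas import DataFrame

def compress(df: DataFrame) -> DataFrame:
    """Illustrative copy of the revised _compress_df logic."""
    n_columns = df.shape[1]
    if n_columns == 1:
        return df.rename(columns={df.columns[0]: 'label'})
    df['label'] = '(' + df.iloc[:, 0].astype(str)
    for i in range(1, n_columns):
        df['label'] += ', ' + df.iloc[:, i].astype(str)
    df['label'] += ')'
    return df[['label']]

print(compress(DataFrame({'a': ['x']})))              # label is "x"
print(compress(DataFrame({'a': ['x'], 'b': ['y']})))  # label is "(x, y)"
```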


def _label(df_assignments: DataFrame, df_strata: DataFrame) -> tuple[DataFrame, dict[int, str]]:
"""Merge with specimen assignments, keeping only selected strata."""
df = merge(df_assignments, df_strata, on='stratum identifier', how='inner')[['label']]
df = merge(df_assignments, df_strata, on='stratum identifier', how='inner'
).set_index('specimen')[['label']]
label_to_result = dict(enumerate(sort(df['label'].unique())))
return df.replace({res: i for i, res in label_to_result.items()}), label_to_result
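`_label` then encodes those strings deterministically: the unique labels are sorted and enumerated, and the DataFrame is rewritten in terms of the integer codes. A sketch with hypothetical label values:

```python
from numpy import sort
from pandas import DataFrame

df = DataFrame(
    {'label': ['(before, responder)', '(after, responder)']},
    index=['specimen1', 'specimen2'],
)
label_to_result = dict(enumerate(sort(df['label'].unique())))
# {0: '(after, responder)', 1: '(before, responder)'}
encoded = df.replace({res: i for i, res in label_to_result.items()})
# encoded['label'] is now 1 and 0 respectively
```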

@@ -73,7 +86,7 @@ def extract_cggnn_data(
strata_to_use: list[int] | None,
) -> tuple[DataFrame, DataFrame, dict[int, str]]:
"""Extract information cg-gnn needs from SPT.

Parameters
----------
spt_db_config_location : str
@@ -84,7 +97,7 @@
Specimen strata to use as labels, identified according to the "stratum identifier" in
`explore_classes`. This should be given as space separated integers.
If not provided, all strata will be used.

Returns
-------
df_cell: DataFrame
@@ -100,13 +113,17 @@
Mapping from class integer label to human-interpretable result text.
"""
extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location)
df_cell = _create_cell_df({
slide: data.dataframe for slide, data in extractor.extract(study=study).items()
})
cohorts = extractor.extract_cohorts(study)
df_label, label_to_result_text = _create_label_df(
cohorts['assignments'],
cohorts['strata'],
strata_to_use,
)
df_cell = _create_cell_df({
specimen: extractor.extract(specimen=specimen, retain_structure_id=True)[specimen].dataframe
for specimen in df_label.index
} if (strata_to_use is not None) else {
specimen: data.dataframe
for specimen, data in extractor.extract(study=study, retain_structure_id=True).items()
})
return df_cell, df_label, label_to_result_text
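A hypothetical end-to-end call of the revised function, assuming a study named "Example study" and a database config file at the given path; `strata_to_use=None` keeps every stratum:

```python
from spatialprofilingtoolbox.cggnn.extract import extract_cggnn_data

df_cell, df_label, label_to_result = extract_cggnn_data(
    spt_db_config_location='.spt_db.config',  # hypothetical path
    study='Example study',                    # hypothetical study name
    strata_to_use=None,                       # use all strata
)
```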
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/cggnn/scripts/explore_classes.py
@@ -28,6 +28,6 @@ def parse_arguments():

if __name__ == "__main__":
args = parse_arguments()
extractor = FeatureMatrixExtractor(args.spt_db_config_location)
extractor = FeatureMatrixExtractor(database_config_file=args.spt_db_config_location)
strata = extractor.extract_cohorts(study=args.study)['strata']
print(strata.to_string())
22 changes: 12 additions & 10 deletions spatialprofilingtoolbox/cggnn/scripts/extract.py
@@ -1,6 +1,7 @@
"""Extract information cg-gnn needs from SPT and save to file."""

from argparse import ArgumentParser
from os import makedirs
from os.path import join, exists
from json import dump

@@ -46,17 +47,18 @@ def parse_arguments():

if __name__ == "__main__":
args = parse_arguments()
df_cell, df_label, label_to_result = extract_cggnn_data(
args.spt_db_config_location,
args.study,
args.strata,
)

assert isinstance(args.output_location, str)
dict_filename = join(args.output_location, 'label_to_results.json')
cells_filename = join(args.output_location, 'cells.h5')
labels_filename = join(args.output_location, 'labels.h5')
output_location: str = join(args.output_location, args.study)
assert isinstance(output_location, str)
makedirs(output_location, exist_ok=True)
dict_filename = join(output_location, 'label_to_results.json')
cells_filename = join(output_location, 'cells.h5')
labels_filename = join(output_location, 'labels.h5')
if not (exists(dict_filename) and exists(cells_filename) and exists(labels_filename)):
df_cell, df_label, label_to_result = extract_cggnn_data(
args.spt_db_config_location,
args.study,
args.strata,
)
df_cell.to_hdf(cells_filename, 'cells')
df_label.to_hdf(labels_filename, 'labels')
with open(dict_filename, 'w', encoding='utf-8') as f:
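The net effect of the script changes: outputs now land in a per-study subdirectory that is created if needed, and extraction is skipped when all three artifacts already exist. A sketch of the check, with hypothetical paths:

```python
from os import makedirs
from os.path import exists, join

output_location = join('out', 'Example study')  # hypothetical names
makedirs(output_location, exist_ok=True)
filenames = ('label_to_results.json', 'cells.h5', 'labels.h5')
if not all(exists(join(output_location, f)) for f in filenames):
    print('would run extract_cggnn_data and write the three files')
```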
28 changes: 28 additions & 0 deletions spatialprofilingtoolbox/db/feature_matrix_extractor.py
@@ -71,6 +71,7 @@ def extract(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
retain_structure_id: bool = False,
) -> dict[str, MatrixBundle]:
"""Extract feature matrices for a specimen or every specimen in a study.

@@ -84,6 +85,8 @@
Whether to also calculate and return a DataFrame for each specimen with continuous
channel information in addition to the default DataFrame which provides binary cast
channel information.
retain_structure_id: bool = False
Whether to index cells by their histological structure ID rather than arbitrary indices.

Returns
-------
@@ -101,6 +104,7 @@
specimen=specimen,
study=study,
continuous_also=continuous_also,
retain_structure_id=retain_structure_id,
)
case _DBSource.CONFIG_FILE:
with DatabaseConnectionMaker(self.database_config_file) as dcm:
@@ -110,6 +114,7 @@
specimen=specimen,
study=study,
continuous_also=continuous_also,
retain_structure_id=retain_structure_id,
)
case _DBSource.UNKNOWN:
raise RuntimeError('The database source can not be determined.')
@@ -119,6 +124,7 @@ def _extract(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
retain_structure_id: bool = False,
) -> dict[str, MatrixBundle]:
if (specimen is None) == (study is None):
raise ValueError('Must specify exactly one of specimen or study.')
@@ -140,6 +146,7 @@
centroid_coordinates,
self._retrieve_phenotypes(study),
self._create_channel_information(data_arrays),
retain_structure_id,
)

def _retrieve_expressions_from_database(self,
@@ -180,6 +187,7 @@ def _create_feature_matrices(self,
centroid_coordinates: dict[str, Any],
phenotypes: dict[str, PhenotypeCriteria],
channel_information: list[str],
retain_structure_id: bool,
) -> dict[str, MatrixBundle]:
logger.info('Creating feature matrices from binary data arrays and centroids.')
matrices: dict[str, MatrixBundle] = {}
@@ -196,6 +204,7 @@
dataframe = DataFrame(
rows,
columns=['pixel x', 'pixel y'] + [f'C {cs}' for cs in channel_information],
index=self._extract_cell_ids(specimen) if retain_structure_id else None,
)
for symbol, criteria in phenotypes.items():
dataframe[f'P {symbol}'] = (
@@ -212,6 +221,7 @@
dataframe = DataFrame(
expression_vectors,
columns=[f'C {cs}' for cs in channel_information],
index=self._extract_cell_ids(specimen) if retain_structure_id else None,
)
matrices[specimen].continuous_dataframe = dataframe

@@ -228,6 +238,24 @@ def _create_feature_matrix_row(
feature_vector: list[int] = [int(value) for value in list(template.format(binary)[::-1])]
return [centroid[0], centroid[1]] + feature_vector

def _extract_cell_ids(self, specimen: str) -> list[int]:
self.cursor.execute(f'''
SELECT hsi.histological_structure
FROM histological_structure_identification hsi
JOIN histological_structure hs
ON hsi.histological_structure=hs.identifier
JOIN data_file df
ON hsi.data_source=df.sha256_hash
JOIN specimen_data_measurement_process sdmp
ON df.source_generation_process=sdmp.identifier
WHERE
hs.anatomical_entity='cell' AND
sdmp.specimen='{specimen}'
ORDER BY hsi.histological_structure
;
''')
return [int(entry[0]) for entry in self.cursor.fetchall()]

def _create_channel_information(self,
study_information: dict[str, dict[str, Any]]
) -> list[str]:
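A hypothetical use of the new flag: with `retain_structure_id=True`, the per-specimen DataFrames come back indexed by histological structure ID instead of a fresh 0..n-1 range, so rows can be joined against other tables keyed on that identifier.

```python
from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor

extractor = FeatureMatrixExtractor(database_config_file='.spt_db.config')
bundles = extractor.extract(study='Example study', retain_structure_id=True)
for specimen, bundle in bundles.items():
    # Index values are histological structure IDs, not positional ints.
    print(specimen, list(bundle.dataframe.index[:5]))
```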
51 changes: 35 additions & 16 deletions spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py
@@ -7,6 +7,7 @@
from psycopg2.extensions import cursor as Psycopg2Cursor

from spatialprofilingtoolbox.db.expressions_table_indexer import ExpressionsTableIndexer
from spatialprofilingtoolbox.db.study_access import StudyAccess
from spatialprofilingtoolbox.workflow.common.logging.fractional_progress_reporter \
import FractionalProgressReporter
from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger
@@ -107,7 +108,14 @@ class SparseMatrixPuller:
def __init__(self, cursor: Psycopg2Cursor):
self.cursor = cursor

def pull(self, specimen: str | None=None, study: str | None=None, continuous_also: bool=False):
def pull(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
) -> None:
"""Pull sparse matrices into self.data_arrays."""
if (specimen is not None) and (study is not None):
raise ValueError('Must specify exactly one of specimen or study, or neither.')
self.data_arrays = self._retrieve_data_arrays(
specimen=specimen,
study=study,
@@ -118,10 +126,12 @@ def get_data_arrays(self):
return self.data_arrays

def _retrieve_data_arrays(self,
specimen: str | None=None,
study: str | None=None,
continuous_also: bool=False,
) -> CompressedDataArrays:
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
) -> CompressedDataArrays:
if specimen is not None:
study = StudyAccess(self.cursor).get_study_from_specimen(specimen)
Collaborator comment: This should work to specialize to just the one study in case a specimen is provided.

study_names = self._get_study_names(study=study)
data_arrays = CompressedDataArrays()
for study_name in study_names:
Expand All @@ -136,9 +146,9 @@ def _retrieve_data_arrays(self,
def _fill_data_arrays_for_study(self,
data_arrays: CompressedDataArrays,
study_name: str,
specimen: str | None=None,
continuous_also: bool=False,
):
specimen: str | None = None,
continuous_also: bool = False,
) -> None:
specimens = self._get_pertinent_specimens(study_name, specimen=specimen)
target_by_symbol = self._get_target_by_symbol(study_name)
logger.debug('Pulling sparse entries for study "%s".', study_name)
@@ -158,8 +168,8 @@ def _fill_data_arrays_for_study(self,
continue
parsed = parse(sparse_entries, continuous_also=continuous_also)
data_arrays_by_specimen, \
target_index_lookup, \
continuous_data_arrays_by_specimen = parsed
target_index_lookup, \
continuous_data_arrays_by_specimen = parsed
data_arrays.add_study_data(
study_name,
data_arrays_by_specimen,
@@ -172,9 +182,18 @@

def _get_pertinent_specimens(self,
study_name: str,
specimen: str | None=None,
specimen: str | None = None,
) -> tuple[str, ...]:
if specimen is not None:
self.cursor.execute('''
SELECT sdmp.specimen
FROM specimen_data_measurement_process sdmp
WHERE sdmp.study=%s
AND sdmp.specimen=%s
;
''', (study_name, specimen))
if len(self.cursor.fetchall()) == 0:
raise ValueError(f'Specimen "{specimen}" not found in study "{study_name}".')
return (specimen,)
self.cursor.execute('''
SELECT sdmp.specimen
@@ -186,7 +205,7 @@
rows = self.cursor.fetchall()
return tuple(cast(str, row[0]) for row in rows)

def _get_study_names(self, study: str | None=None) -> tuple[str, ...]:
def _get_study_names(self, study: str | None = None) -> tuple[str, ...]:
if study is None:
self.cursor.execute('SELECT name FROM specimen_measurement_study ;')
rows = self.cursor.fetchall()
@@ -204,8 +223,8 @@
logger.info(' %s', name)
return names

def _get_sparse_entries(self, study_name:str , specimen: str) -> list[tuple]:
sparse_entries: list[tuple] = []
def _get_sparse_entries(self, study_name: str, specimen: str) -> list[tuple[Any, ...]]:
sparse_entries: list[tuple[Any, ...]] = []
number_log_messages = 0
self.cursor.execute(
self._get_sparse_matrix_query_specimen_specific(),
@@ -265,7 +284,7 @@ def _get_batch_size(self) -> int:

def _parse_data_arrays_by_specimen(self,
sparse_entries: list[tuple],
continuous_also: bool=False,
continuous_also: bool = False,
):
target_index_lookup = self._get_target_index_lookup(sparse_entries)
sparse_entries.sort(key=lambda x: (x[3], x[0]))
@@ -291,7 +310,7 @@ def _parse_data_arrays_by_specimen(self,
data_arrays_by_specimen[specimen],
buffer,
target_index_lookup,
continuous_data_array = continuous_data_arrays_by_specimen[specimen],
continuous_data_array=continuous_data_arrays_by_specimen[specimen],
)
done_message = 'Done parsing %s feature vectors from %s.'
logger.debug(done_message, len(data_arrays_by_specimen[specimen]), specimen)
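Finally, a sketch of the new specimen-only pull path, assuming an open psycopg2 cursor obtained elsewhere in SPT: the study is now resolved from the specimen via StudyAccess rather than being required up front.

```python
from spatialprofilingtoolbox.workflow.common.sparse_matrix_puller import SparseMatrixPuller

# `cursor` is assumed to be an open psycopg2 cursor on the SPT database.
puller = SparseMatrixPuller(cursor)
puller.pull(specimen='specimen1')  # study inferred via StudyAccess
data_arrays = puller.get_data_arrays()
```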