
create cg-gnn extraction script #185

Merged · 35 commits · Sep 12, 2023

Commits
0b20fdf: created cg-gnn extraction script (CarlinLiao, Aug 8, 2023)
ae2170d: added a quick test for cggnn extract (CarlinLiao, Aug 9, 2023)
ad8e61b: hotfix importance saving (CarlinLiao, Aug 11, 2023)
c002b6d: Use updated pytorch image for Python 3.11 support, cggnn extract test… (jimmymathews, Aug 15, 2023)
ccdf771: Use python3.11 directly in test case usage of pip, now that python3.1… (jimmymathews, Aug 16, 2023)
c9aab1b: Same fix applied to second test. (jimmymathews, Aug 16, 2023)
61e855e: undo cggnn docker change (CarlinLiao, Aug 16, 2023)
fed65fc: Merge squidpy changes into cggnn_dfs (CarlinLiao, Aug 16, 2023)
3c1827b: Merge remote-tracking branch 'origin/main' into cggnn_dfs (CarlinLiao, Aug 16, 2023)
bdda7d7: added pheno, removed multistudy to FME (CarlinLiao, Aug 21, 2023)
bce56e8: Merge branch 'main' into cggnn_dfs (CarlinLiao, Aug 21, 2023)
0c2c658: logic hotfix (CarlinLiao, Aug 21, 2023)
3269864: fix study-substudy references (CarlinLiao, Aug 21, 2023)
aa012c7: fix test to account for phenotype columns (CarlinLiao, Aug 21, 2023)
30e8ef1: make phenotype dict key consistent (CarlinLiao, Aug 21, 2023)
f5123d6: Update usage of extractor to omit study reference. (jimmymathews, Aug 22, 2023)
91be9ba: remove phenotypes from continuous dataframes (CarlinLiao, Aug 22, 2023)
70008eb: fix pheno neg expression match (CarlinLiao, Aug 24, 2023)
7f14832: split extract stratification, symbols in col names (CarlinLiao, Aug 24, 2023)
61c53b5: a little formatting on FME (CarlinLiao, Aug 24, 2023)
d10fdd8: explore classes for cggnn extraction (CarlinLiao, Aug 24, 2023)
94b504b: fix FME test (CarlinLiao, Aug 25, 2023)
96634c9: Make test more diagnosable. (jimmymathews, Aug 25, 2023)
c26eeb4: cggnn extract clarity refactors (CarlinLiao, Sep 5, 2023)
f3f6178: update providers for new feature column names (CarlinLiao, Sep 6, 2023)
aced7a1: add cg-gnn to toml checking (CarlinLiao, Sep 6, 2023)
434f474: adjust squidpy clustering (CarlinLiao, Sep 7, 2023)
702d631: actually handling this without a try except is better (CarlinLiao, Sep 7, 2023)
d2eb9c1: any typo (CarlinLiao, Sep 7, 2023)
dac2cf9: handle malformed squidpy returns more gracefully (CarlinLiao, Sep 7, 2023)
9779a78: cggnn extract docstring (CarlinLiao, Sep 8, 2023)
db8d2e9: Merge branch 'main' into cggnn_dfs (CarlinLiao, Sep 8, 2023)
f4a0f24: Change dataframe to bool values to permit "all" & "all" syntax. (jimmymathews, Sep 12, 2023)
bb67799: Fix accidentally booleanization of pixel position columns. (jimmymathews, Sep 12, 2023)
5dfc001: Make operator order precedence explicit, booleanization and negation. (jimmymathews, Sep 12, 2023)
2 changes: 1 addition & 1 deletion build/build_scripts/check_dockerfiles_consistency.py
@@ -10,7 +10,7 @@ def load_dockerfile(submodule):

def check_exists(dependency, dockerfile):
for line in dockerfile.split('\n'):
- if re.search(f'RUN python3? -m pip install "?{dependency}"?$', line):
+ if re.search(f'RUN python3?(.11)? -m pip install "?{dependency}"?$', line):
return True
if re.search(dependency, line):
print(f'Dependency "{dependency}" is mentioned in Dockerfile, but something isn\'t quite right with installation command.')
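The broadened pattern can be checked against representative Dockerfile lines. A minimal sketch, assuming a dependency name and install lines that are illustrative only:

```python
import re

dependency = 'cg-gnn'
pattern = f'RUN python3?(.11)? -m pip install "?{dependency}"?$'

# The optional '3?', '(.11)?', and '"?' groups make every variant of the
# install command match, including the new python3.11 form.
lines = [
    'RUN python -m pip install cg-gnn',
    'RUN python3 -m pip install cg-gnn',
    'RUN python3.11 -m pip install cg-gnn',
    'RUN python3.11 -m pip install "cg-gnn"',
]
matches = [bool(re.search(pattern, line)) for line in lines]
```

Note that `(.11)?` uses an unescaped dot, so it matches any character before `11`; that is faithful to the pattern in the diff, which is loose but sufficient for this consistency check.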
2 changes: 1 addition & 1 deletion build/build_scripts/create_pyproject.py
@@ -2,7 +2,7 @@
import toml

def validate_dependencies_all(project):
- modules = ['apiserver', 'db', 'ondemand', 'workflow']
+ modules = ['apiserver', 'cggnn', 'db', 'ondemand', 'workflow']
dependencies = set()
for module in modules:
dependencies = dependencies.union(set(project['project']['optional-dependencies'][module]))
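The union-and-compare logic in validate_dependencies_all can be sketched on toy data; the dependency lists below are hypothetical stand-ins, not the real pyproject contents:

```python
# Toy stand-in for the parsed pyproject.toml (hypothetical dependency lists).
project = {'project': {'optional-dependencies': {
    'apiserver': ['pydantic==2.0.2'],
    'cggnn': ['cg-gnn'],
    'db': ['pandas==2.0.2'],
    'ondemand': ['squidpy==1.3.0'],
    'workflow': ['pandas==2.0.2', 'matplotlib==3.7.1'],
    'all': ['cg-gnn', 'matplotlib==3.7.1', 'pandas==2.0.2',
            'pydantic==2.0.2', 'squidpy==1.3.0'],
}}}

# Same union as validate_dependencies_all: with 'cggnn' added to the module
# list, its dependencies must also appear in the "all" extra.
modules = ['apiserver', 'cggnn', 'db', 'ondemand', 'workflow']
dependencies = set()
for module in modules:
    dependencies = dependencies.union(set(project['project']['optional-dependencies'][module]))
consistent = dependencies == set(project['project']['optional-dependencies']['all'])
```

This is why the corresponding `all` list in pyproject.toml.unversioned also gains a `"cg-gnn"` entry in this PR.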
26 changes: 17 additions & 9 deletions build/cggnn/Dockerfile
@@ -1,18 +1,26 @@
- FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
+ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
Review comment (CarlinLiao, author): I've updated cg-gnn and in the original repo it targets CUDA 11.8, but there doesn't appear to be a Docker image for this version yet.

ENV DEBIAN_FRONTEND=noninteractive
- RUN apt update && apt install -y gcc libpq-dev && rm -rf /var/lib/apt/lists/*
+ RUN apt update && apt install -y gcc libpq-dev
WORKDIR /usr/src/app
- RUN python -m pip install dgl-cu116 dglgo -f https://data.dgl.ai/wheels/repo.html
- RUN python -m pip install psycopg2==2.9.6
- RUN python -m pip install adiscstudies==0.11.0
- RUN python -m pip install numba==0.57.0
- RUN python -m pip install attrs==23.1.0
- RUN python -m pip install cg-gnn
+ RUN apt install software-properties-common -y
+ RUN add-apt-repository ppa:deadsnakes/ppa
+ RUN apt update
+ RUN apt install python3.11 -y
+ RUN apt install python3.11-dev -y
+ RUN apt install python3.11-venv -y
+ RUN apt install python3.11-distutils -y
+ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 && python3.11 -m ensurepip
+ RUN python3.11 -m pip install dgl-cu117 dglgo -f https://data.dgl.ai/wheels/repo.html
+ RUN python3.11 -m pip install psycopg2==2.9.6
+ RUN python3.11 -m pip install adiscstudies==0.11.0
+ RUN python3.11 -m pip install numba==0.57.0
+ RUN python3.11 -m pip install attrs==23.1.0
+ RUN python3.11 -m pip install cg-gnn
ARG version
ARG service_name
ARG WHEEL_FILENAME
LABEL version=$version
LABEL service_name=$service_name
ENV service_name $service_name
COPY $WHEEL_FILENAME ./
- RUN python -m pip install "$WHEEL_FILENAME"
+ RUN python3.11 -m pip install "$WHEEL_FILENAME"
7 changes: 4 additions & 3 deletions pyproject.toml.unversioned
@@ -41,6 +41,9 @@ apiserver = [
"Pillow==9.5.0",
"pydantic==2.0.2"
]
+ cggnn = [
+     "cg-gnn"
+ ]
db = [
"pandas==2.0.2",
"pyshp==2.2.0",
@@ -55,9 +58,6 @@ ondemand = [
"pydantic==2.0.2",
"squidpy==1.3.0"
]
- cggnn = [
-     "cg-gnn"
- ]
workflow = [
"matplotlib==3.7.1",
"umap-learn==0.5.3",
@@ -71,6 +71,7 @@
"Pillow==9.5.0"
]
all = [
"cg-gnn",
"matplotlib==3.7.1",
"umap-learn==0.5.3",
"uvicorn>=0.15.0,<0.16.0",
2 changes: 2 additions & 0 deletions spatialprofilingtoolbox/cggnn/__init__.py
@@ -1,2 +1,4 @@
"""Cell-graph graph neural network functionality."""
__version__ = '0.2.1'
+
+ from spatialprofilingtoolbox.cggnn.extract import extract_cggnn_data
112 changes: 112 additions & 0 deletions spatialprofilingtoolbox/cggnn/extract.py
@@ -0,0 +1,112 @@
"""Extract information cg-gnn needs from SPT."""

from pandas import DataFrame, concat, merge # type: ignore
from numpy import sort # type: ignore

from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor


def _create_cell_df(dfs_by_specimen: dict[str, DataFrame]) -> DataFrame:
    """Merge simple and complex phenotype expressions, and cell locations, into one DataFrame."""
for specimen, df_specimen in dfs_by_specimen.items():
df_specimen['specimen'] = specimen

df = concat(dfs_by_specimen.values(), axis=0)
df.index.name = 'histological_structure'
# Reorder columns so it's specimen, xy, channels, and phenotypes
column_order = ['specimen', 'pixel x', 'pixel y']
column_order.extend(df.columns[df.columns.str.startswith('C ')])
column_order.extend(df.columns[df.columns.str.startswith('P ')])
return df[column_order]
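The column reordering above can be seen on a toy frame; the column names beyond the fixed three, and all values, are hypothetical:

```python
from pandas import DataFrame

# Toy cell table with columns in arbitrary order (hypothetical data).
df = DataFrame({
    'C CD3': [1, 0],
    'pixel x': [10.0, 20.0],
    'P T cell': [1, 0],
    'pixel y': [5.0, 6.0],
    'specimen': ['lesion 1', 'lesion 1'],
})

# Same rule as _create_cell_df: specimen first, then positions, then
# channel ('C ') columns, then phenotype ('P ') columns.
column_order = ['specimen', 'pixel x', 'pixel y']
column_order.extend(df.columns[df.columns.str.startswith('C ')])
column_order.extend(df.columns[df.columns.str.startswith('P ')])
df = df[column_order]
```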


def _create_label_df(
df_assignments: DataFrame,
df_strata: DataFrame,
strata_to_use: list[int] | None,
) -> tuple[DataFrame, dict[int, str]]:
"""Get slide-level results."""
df_assignments = df_assignments.set_index('specimen')
df_strata = df_strata.set_index('stratum identifier')
df_strata = _filter_for_strata(strata_to_use, df_strata)
df_strata = _drop_unneeded_columns(df_strata)
df_strata = _compress_df(df_strata)
return _label(df_assignments, df_strata)


def _filter_for_strata(strata_to_use: list[int] | None, df_strata: DataFrame) -> DataFrame:
if strata_to_use is not None:
df_strata = df_strata.loc[sorted(strata_to_use)]
if df_strata.shape[0] < 2:
raise ValueError(f'Need at least 2 strata to classify, there are {df_strata.shape[0]}.')
return df_strata


def _drop_unneeded_columns(df_strata: DataFrame) -> DataFrame:
    """Drop columns whose contents are identical across all rows."""
for col in df_strata.columns.tolist():
if df_strata[col].nunique() == 1:
df_strata = df_strata.drop(col, axis=1)
return df_strata


def _compress_df(df_strata: DataFrame) -> DataFrame:
    """Compress remaining columns into a single string."""
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str)
for i in range(1, df_strata.shape[1]):
df_strata['label'] += df_strata.iloc[:, i].astype(str)
df_strata['label'] += ')'
df_strata = df_strata[['label']]
return df_strata


def _label(df_assignments: DataFrame, df_strata: DataFrame) -> tuple[DataFrame, dict[int, str]]:
"""Merge with specimen assignments, keeping only selected strata."""
df = merge(df_assignments, df_strata, on='stratum identifier', how='inner')[['label']]
label_to_result = dict(enumerate(sort(df['label'].unique())))
return df.replace({res: i for i, res in label_to_result.items()}), label_to_result


def extract_cggnn_data(
spt_db_config_location: str,
study: str,
strata_to_use: list[int] | None,
) -> tuple[DataFrame, DataFrame, dict[int, str]]:
"""Extract information cg-gnn needs from SPT.

Parameters
----------
spt_db_config_location : str
Location of the SPT DB config file.
study : str
Name of the study to query data for.
strata_to_use : list[int] | None
Specimen strata to use as labels, identified by the "stratum identifier" reported by
`explore_classes`. If None, all strata are used.

Returns
-------
df_cell: DataFrame
Rows are individual cells, indexed by an integer ID.
Column or column groups are, named and in order:
1. The 'specimen' the cell is from
2. Cell centroid positions 'pixel x' and 'pixel y'
3. Channel expressions starting with 'C ' and followed by human-readable symbol text
4. Phenotype expressions starting with 'P ' followed by human-readable symbol text
df_label: DataFrame
Rows are specimens, the sole column 'label' is its class label as an integer.
label_to_result_text: dict[int, str]
Mapping from class integer label to human-interpretable result text.
"""
extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location)
df_cell = _create_cell_df({
slide: data.dataframe for slide, data in extractor.extract(study=study).items()
})
cohorts = extractor.extract_cohorts(study)
df_label, label_to_result_text = _create_label_df(
cohorts['assignments'],
cohorts['strata'],
strata_to_use,
)
return df_cell, df_label, label_to_result_text
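The label encoding performed by _label, where sorted unique label strings are mapped to integer codes, can be sketched on toy data; the label strings and specimen index are hypothetical:

```python
from numpy import sort
from pandas import DataFrame

# Toy stand-in for the merged assignments/strata frame (hypothetical labels).
df = DataFrame({'label': ['(tumor)', '(normal)', '(tumor)']}, index=['s1', 's2', 's3'])

# Same encoding as _label: sorted unique labels get consecutive integer codes,
# and the frame's label column is replaced by those codes.
label_to_result = dict(enumerate(sort(df['label'].unique())))
encoded = df.replace({res: i for i, res in label_to_result.items()})
```

Sorting before enumerating makes the integer assignment deterministic across runs, so the returned mapping can be saved alongside the labels.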
33 changes: 33 additions & 0 deletions spatialprofilingtoolbox/cggnn/scripts/explore_classes.py
@@ -0,0 +1,33 @@
"""Report the different strata available to classify with."""

from argparse import ArgumentParser

from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor


def parse_arguments():
"""Process command line arguments."""
parser = ArgumentParser(
prog='spt cggnn explore_classes',
description='See the strata available to classify on.'
)
parser.add_argument(
'--spt_db_config_location',
type=str,
help='Location of the SPT DB config file.',
required=True
)
parser.add_argument(
'--study',
type=str,
help='Name of the study to query data for.',
required=True
)
return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
extractor = FeatureMatrixExtractor(args.spt_db_config_location)
strata = extractor.extract_cohorts(study=args.study)['strata']
print(strata.to_string())
63 changes: 63 additions & 0 deletions spatialprofilingtoolbox/cggnn/scripts/extract.py
@@ -0,0 +1,63 @@
"""Extract information cg-gnn needs from SPT and save to file."""

from argparse import ArgumentParser
from os.path import join, exists
from json import dump

from spatialprofilingtoolbox.cggnn import extract_cggnn_data


def parse_arguments():
"""Process command line arguments."""
parser = ArgumentParser(
prog='spt cggnn extract',
description='Extract information cg-gnn needs from SPT and save to file.'
)
parser.add_argument(
'--spt_db_config_location',
type=str,
help='Location of the SPT DB config file.',
required=True
)
parser.add_argument(
'--study',
type=str,
help='Name of the study to query data for.',
required=True
)
parser.add_argument(
'--strata',
nargs='+',
type=int,
help='Specimen strata to use as labels, identified according to the "stratum identifier" '
'in `explore_classes`. This should be given as space separated integers.\n'
'If not provided, all strata will be used.',
required=False,
default=None
)
parser.add_argument(
'--output_location',
type=str,
help='Directory to save extracted data to.',
required=True
)
return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
df_cell, df_label, label_to_result = extract_cggnn_data(
args.spt_db_config_location,
args.study,
args.strata,
)

assert isinstance(args.output_location, str)
dict_filename = join(args.output_location, 'label_to_results.json')
cells_filename = join(args.output_location, 'cells.h5')
labels_filename = join(args.output_location, 'labels.h5')
if not (exists(dict_filename) and exists(cells_filename) and exists(labels_filename)):
df_cell.to_hdf(cells_filename, 'cells')
df_label.to_hdf(labels_filename, 'labels')
with open(dict_filename, 'w', encoding='utf-8') as f:
dump(label_to_result, f)
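The space-separated-integer behavior of `--strata` can be checked with a minimal re-creation of the parser above; the argument values passed in are hypothetical:

```python
from argparse import ArgumentParser

# Re-creation of the 'spt cggnn extract' argument parser, for illustration.
parser = ArgumentParser(prog='spt cggnn extract')
parser.add_argument('--spt_db_config_location', type=str, required=True)
parser.add_argument('--study', type=str, required=True)
parser.add_argument('--strata', nargs='+', type=int, required=False, default=None)
parser.add_argument('--output_location', type=str, required=True)

# nargs='+' with type=int turns the space-separated tokens into a list of ints.
args = parser.parse_args([
    '--spt_db_config_location', 'db.config',   # hypothetical path
    '--study', 'Example study',                # hypothetical study name
    '--strata', '1', '3',
    '--output_location', 'out',
])
```

Omitting `--strata` leaves `args.strata` as None, which extract_cggnn_data interprets as "use all strata".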