hotfix cggnn scripts, streamline calls #208

Merged
merged 5 commits on Sep 14, 2023
1 change: 1 addition & 0 deletions build/cggnn/Dockerfile
@@ -15,6 +15,7 @@ RUN python3.11 -m pip install psycopg2==2.9.6
RUN python3.11 -m pip install adiscstudies==0.11.0
RUN python3.11 -m pip install numba==0.57.0
RUN python3.11 -m pip install attrs==23.1.0
RUN python3.11 -m pip install tables
RUN python3.11 -m pip install cg-gnn
ARG version
ARG service_name
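Context for the new `tables` line here and in pyproject.toml.unversioned below: the cggnn extract script writes its outputs with pandas' `to_hdf`, and pandas defers HDF5 support to the optional PyTables package (`tables`) at call time rather than at import. A minimal sketch of the failure mode this fixes, using hypothetical data:

```python
# Minimal sketch, assuming pandas is installed but PyTables is not.
from pandas import DataFrame

df = DataFrame({'pixel x': [0.0, 1.5], 'pixel y': [2.0, 3.5]})

# pandas imports the optional "tables" package only when to_hdf is
# called, so this raises ImportError at runtime unless it is installed.
df.to_hdf('cells.h5', 'cells')
```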
6 changes: 4 additions & 2 deletions pyproject.toml.unversioned
@@ -43,7 +43,8 @@ apiserver = [
"secure==0.3.0"
]
cggnn = [
"cg-gnn"
"cg-gnn",
"tables"
]
db = [
"pandas==2.0.2",
@@ -87,7 +88,8 @@ all = [
"fastapi==0.100.0",
"Pillow==9.5.0",
"squidpy==1.3.0",
"secure==0.3.0"
"secure==0.3.0",
"tables"
]
dev = [
"autopep8",
47 changes: 32 additions & 15 deletions spatialprofilingtoolbox/cggnn/extract.py
@@ -13,10 +13,17 @@ def _create_cell_df(dfs_by_specimen: dict[str, DataFrame]) -> DataFrame:

df = concat(dfs_by_specimen.values(), axis=0)
df.index.name = 'histological_structure'

# Convert binary int columns to boolean
channels = df.columns[df.columns.str.startswith('C ')]
phenotypes = df.columns[df.columns.str.startswith('P ')]
df[channels] = df[channels].astype(bool)
df[phenotypes] = df[phenotypes].astype(bool)

# Reorder columns so it's specimen, xy, channels, and phenotypes
column_order = ['specimen', 'pixel x', 'pixel y']
column_order.extend(df.columns[df.columns.str.startswith('C ')])
column_order.extend(df.columns[df.columns.str.startswith('P ')])
column_order.extend(channels)
column_order.extend(phenotypes)
return df[column_order]
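A toy illustration of what the new lines in this hunk do (the column names here are hypothetical): binary 0/1 channel (`C ...`) and phenotype (`P ...`) columns become proper booleans, and columns are reordered to specimen, coordinates, channels, then phenotypes.

```python
from pandas import DataFrame

df = DataFrame({
    'C CD3': [0, 1],         # binary channel column (hypothetical)
    'P T cell': [1, 0],      # binary phenotype column (hypothetical)
    'specimen': ['s1', 's1'],
    'pixel x': [1.0, 2.0],
    'pixel y': [3.0, 4.0],
})
channels = df.columns[df.columns.str.startswith('C ')]
phenotypes = df.columns[df.columns.str.startswith('P ')]
df[channels] = df[channels].astype(bool)      # 0/1 ints -> False/True
df[phenotypes] = df[phenotypes].astype(bool)
order = ['specimen', 'pixel x', 'pixel y', *channels, *phenotypes]
df = df[order]  # specimen, xy, channels, phenotypes
```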


@@ -25,8 +32,9 @@ def _create_label_df(
df_strata: DataFrame,
strata_to_use: list[int] | None,
) -> tuple[DataFrame, dict[int, str]]:
"""Get slide-level results."""
df_assignments = df_assignments.set_index('specimen')
"""Get specimen-level results."""
df_assignments['stratum identifier'] = df_assignments['stratum identifier'].astype(int)
df_strata['stratum identifier'] = df_strata['stratum identifier'].astype(int)
df_strata = df_strata.set_index('stratum identifier')
df_strata = _filter_for_strata(strata_to_use, df_strata)
df_strata = _drop_unneeded_columns(df_strata)
@@ -52,17 +60,22 @@ def _drop_unneeded_columns(df_strata: DataFrame) -> DataFrame:

def _compress_df(df_strata: DataFrame) -> DataFrame:
"""Compress remaining columns into a single string"""
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str)
for i in range(1, df_strata.shape[1]):
df_strata['label'] += df_strata.iloc[:, i].astype(str)
df_strata['label'] += ')'
df_strata = df_strata[['label']]
n_columns = df_strata.shape[1]
if n_columns == 1:
df_strata = df_strata.rename(columns={df_strata.columns[0]: 'label'})
else:
df_strata['label'] = '(' + df_strata.iloc[:, 0].astype(str)
for i in range(1, n_columns):
df_strata['label'] += ', ' + df_strata.iloc[:, i].astype(str)
df_strata['label'] += ')'
df_strata = df_strata[['label']]
return df_strata
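The rewrite changes two behaviors: a lone stratum column is now renamed rather than wrapped in parentheses, and multi-column labels gain a ", " separator. A standalone mirror of the new logic, for illustration only:

```python
from pandas import DataFrame

def compress(df: DataFrame) -> DataFrame:
    """Illustrative copy of the revised _compress_df logic."""
    n_columns = df.shape[1]
    if n_columns == 1:
        return df.rename(columns={df.columns[0]: 'label'})
    df['label'] = '(' + df.iloc[:, 0].astype(str)
    for i in range(1, n_columns):
        df['label'] += ', ' + df.iloc[:, i].astype(str)
    df['label'] += ')'
    return df[['label']]

print(compress(DataFrame({'a': ['x']})))              # label is "x"
print(compress(DataFrame({'a': ['x'], 'b': ['y']})))  # label is "(x, y)"
```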


def _label(df_assignments: DataFrame, df_strata: DataFrame) -> tuple[DataFrame, dict[int, str]]:
"""Merge with specimen assignments, keeping only selected strata."""
df = merge(df_assignments, df_strata, on='stratum identifier', how='inner')[['label']]
df = merge(df_assignments, df_strata, on='stratum identifier', how='inner'
).set_index('specimen')[['label']]
label_to_result = dict(enumerate(sort(df['label'].unique())))
return df.replace({res: i for i, res in label_to_result.items()}), label_to_result
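`_label` then encodes those strings deterministically: the unique labels are sorted and enumerated, and the DataFrame is rewritten in terms of the integer codes. A sketch with hypothetical label values:

```python
from numpy import sort
from pandas import DataFrame

df = DataFrame(
    {'label': ['(before, responder)', '(after, responder)']},
    index=['specimen1', 'specimen2'],
)
label_to_result = dict(enumerate(sort(df['label'].unique())))
# {0: '(after, responder)', 1: '(before, responder)'}
encoded = df.replace({res: i for i, res in label_to_result.items()})
# encoded['label'] is now 1 and 0 respectively
```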

@@ -73,7 +86,7 @@ def extract_cggnn_data(
strata_to_use: list[int] | None,
) -> tuple[DataFrame, DataFrame, dict[int, str]]:
"""Extract information cg-gnn needs from SPT.

Parameters
----------
spt_db_config_location : str
@@ -84,7 +97,7 @@
Specimen strata to use as labels, identified according to the "stratum identifier" in
`explore_classes`. This should be given as space separated integers.
If not provided, all strata will be used.

Returns
-------
df_cell: DataFrame
@@ -100,13 +113,17 @@
Mapping from class integer label to human-interpretable result text.
"""
extractor = FeatureMatrixExtractor(database_config_file=spt_db_config_location)
df_cell = _create_cell_df({
slide: data.dataframe for slide, data in extractor.extract(study=study).items()
})
cohorts = extractor.extract_cohorts(study)
df_label, label_to_result_text = _create_label_df(
cohorts['assignments'],
cohorts['strata'],
strata_to_use,
)
df_cell = _create_cell_df({
specimen: extractor.extract(specimen=specimen, retain_structure_id=True)[specimen].dataframe
for specimen in df_label.index
} if (strata_to_use is not None) else {
specimen: data.dataframe
for specimen, data in extractor.extract(study=study, retain_structure_id=True).items()
})
return df_cell, df_label, label_to_result_text
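A hypothetical end-to-end call of the revised function, assuming a study named "Example study" and a database config file at the given path; `strata_to_use=None` keeps every stratum:

```python
from spatialprofilingtoolbox.cggnn.extract import extract_cggnn_data

df_cell, df_label, label_to_result = extract_cggnn_data(
    spt_db_config_location='.spt_db.config',  # hypothetical path
    study='Example study',                    # hypothetical study name
    strata_to_use=None,                       # use all strata
)
```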
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/cggnn/scripts/explore_classes.py
@@ -28,6 +28,6 @@ def parse_arguments():

if __name__ == "__main__":
args = parse_arguments()
extractor = FeatureMatrixExtractor(args.spt_db_config_location)
extractor = FeatureMatrixExtractor(database_config_file=args.spt_db_config_location)
strata = extractor.extract_cohorts(study=args.study)['strata']
print(strata.to_string())
22 changes: 12 additions & 10 deletions spatialprofilingtoolbox/cggnn/scripts/extract.py
@@ -1,6 +1,7 @@
"""Extract information cg-gnn needs from SPT and save to file."""

from argparse import ArgumentParser
from os import makedirs
from os.path import join, exists
from json import dump

@@ -46,17 +47,18 @@ def parse_arguments():

if __name__ == "__main__":
args = parse_arguments()
df_cell, df_label, label_to_result = extract_cggnn_data(
args.spt_db_config_location,
args.study,
args.strata,
)

assert isinstance(args.output_location, str)
dict_filename = join(args.output_location, 'label_to_results.json')
cells_filename = join(args.output_location, 'cells.h5')
labels_filename = join(args.output_location, 'labels.h5')
output_location: str = join(args.output_location, args.study)
assert isinstance(output_location, str)
makedirs(output_location, exist_ok=True)
dict_filename = join(output_location, 'label_to_results.json')
cells_filename = join(output_location, 'cells.h5')
labels_filename = join(output_location, 'labels.h5')
if not (exists(dict_filename) and exists(cells_filename) and exists(labels_filename)):
df_cell, df_label, label_to_result = extract_cggnn_data(
args.spt_db_config_location,
args.study,
args.strata,
)
df_cell.to_hdf(cells_filename, 'cells')
df_label.to_hdf(labels_filename, 'labels')
with open(dict_filename, 'w', encoding='utf-8') as f:
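The net effect of the script changes: outputs now land in a per-study subdirectory that is created if needed, and extraction is skipped when all three artifacts already exist. A sketch of the check, with hypothetical paths:

```python
from os import makedirs
from os.path import exists, join

output_location = join('out', 'Example study')  # hypothetical names
makedirs(output_location, exist_ok=True)
filenames = ('label_to_results.json', 'cells.h5', 'labels.h5')
if not all(exists(join(output_location, f)) for f in filenames):
    print('would run extract_cggnn_data and write the three files')
```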
28 changes: 28 additions & 0 deletions spatialprofilingtoolbox/db/feature_matrix_extractor.py
@@ -71,6 +71,7 @@ def extract(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
retain_structure_id: bool = False,
) -> dict[str, MatrixBundle]:
"""Extract feature matrices for a specimen or every specimen in a study.

@@ -84,6 +85,8 @@
Whether to also calculate and return a DataFrame for each specimen with continuous
channel information in addition to the default DataFrame which provides binary cast
channel information.
retain_structure_id: bool = False
Whether to index cells by their histological structure ID rather than arbitrary indices.

Returns
-------
@@ -101,6 +104,7 @@
specimen=specimen,
study=study,
continuous_also=continuous_also,
retain_structure_id=retain_structure_id,
)
case _DBSource.CONFIG_FILE:
with DatabaseConnectionMaker(self.database_config_file) as dcm:
@@ -110,6 +114,7 @@
specimen=specimen,
study=study,
continuous_also=continuous_also,
retain_structure_id=retain_structure_id,
)
case _DBSource.UNKNOWN:
raise RuntimeError('The database source can not be determined.')
@@ -119,6 +124,7 @@ def _extract(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
retain_structure_id: bool = False,
) -> dict[str, MatrixBundle]:
if (specimen is None) == (study is None):
raise ValueError('Must specify exactly one of specimen or study.')
@@ -140,6 +146,7 @@
centroid_coordinates,
self._retrieve_phenotypes(study),
self._create_channel_information(data_arrays),
retain_structure_id,
)

def _retrieve_expressions_from_database(self,
@@ -180,6 +187,7 @@ def _create_feature_matrices(self,
centroid_coordinates: dict[str, Any],
phenotypes: dict[str, PhenotypeCriteria],
channel_information: list[str],
retain_structure_id: bool,
) -> dict[str, MatrixBundle]:
logger.info('Creating feature matrices from binary data arrays and centroids.')
matrices: dict[str, MatrixBundle] = {}
@@ -196,6 +204,7 @@
dataframe = DataFrame(
rows,
columns=['pixel x', 'pixel y'] + [f'C {cs}' for cs in channel_information],
index=self._extract_cell_ids(specimen) if retain_structure_id else None,
)
for symbol, criteria in phenotypes.items():
dataframe[f'P {symbol}'] = (
@@ -212,6 +221,7 @@
dataframe = DataFrame(
expression_vectors,
columns=[f'C {cs}' for cs in channel_information],
index=self._extract_cell_ids(specimen) if retain_structure_id else None,
)
matrices[specimen].continuous_dataframe = dataframe

@@ -228,6 +238,24 @@ def _create_feature_matrix_row(
feature_vector: list[int] = [int(value) for value in list(template.format(binary)[::-1])]
return [centroid[0], centroid[1]] + feature_vector

def _extract_cell_ids(self, specimen: str) -> list[int]:
self.cursor.execute(f'''
SELECT hsi.histological_structure
FROM histological_structure_identification hsi
JOIN histological_structure hs
ON hsi.histological_structure=hs.identifier
JOIN data_file df
ON hsi.data_source=df.sha256_hash
JOIN specimen_data_measurement_process sdmp
ON df.source_generation_process=sdmp.identifier
WHERE
hs.anatomical_entity='cell' AND
sdmp.specimen='{specimen}'
ORDER BY hsi.histological_structure
;
''')
return [int(entry[0]) for entry in self.cursor.fetchall()]

def _create_channel_information(self,
study_information: dict[str, dict[str, Any]]
) -> list[str]:
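A hypothetical use of the new flag: with `retain_structure_id=True`, the per-specimen DataFrames come back indexed by histological structure ID instead of a fresh 0..n-1 range, so rows can be joined against other tables keyed on that identifier.

```python
from spatialprofilingtoolbox.db.feature_matrix_extractor import FeatureMatrixExtractor

extractor = FeatureMatrixExtractor(database_config_file='.spt_db.config')
bundles = extractor.extract(study='Example study', retain_structure_id=True)
for specimen, bundle in bundles.items():
    # Index values are histological structure IDs, not positional ints.
    print(specimen, list(bundle.dataframe.index[:5]))
```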
51 changes: 35 additions & 16 deletions spatialprofilingtoolbox/workflow/common/sparse_matrix_puller.py
@@ -7,6 +7,7 @@
from psycopg2.extensions import cursor as Psycopg2Cursor

from spatialprofilingtoolbox.db.expressions_table_indexer import ExpressionsTableIndexer
from spatialprofilingtoolbox.db.study_access import StudyAccess
from spatialprofilingtoolbox.workflow.common.logging.fractional_progress_reporter \
import FractionalProgressReporter
from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger
@@ -107,7 +108,14 @@ class SparseMatrixPuller:
def __init__(self, cursor: Psycopg2Cursor):
self.cursor = cursor

def pull(self, specimen: str | None=None, study: str | None=None, continuous_also: bool=False):
def pull(self,
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
) -> None:
"""Pull sparse matrices into self.data_arrays."""
if (specimen is not None) and (study is not None):
raise ValueError('Must specify exactly one of specimen or study, or neither.')
self.data_arrays = self._retrieve_data_arrays(
specimen=specimen,
study=study,
@@ -118,10 +126,12 @@ def get_data_arrays(self):
return self.data_arrays

def _retrieve_data_arrays(self,
specimen: str | None=None,
study: str | None=None,
continuous_also: bool=False,
) -> CompressedDataArrays:
specimen: str | None = None,
study: str | None = None,
continuous_also: bool = False,
) -> CompressedDataArrays:
if specimen is not None:
study = StudyAccess(self.cursor).get_study_from_specimen(specimen)
Collaborator comment: This should work to specialize to just the one study in case a specimen is provided.

study_names = self._get_study_names(study=study)
data_arrays = CompressedDataArrays()
for study_name in study_names:
Expand All @@ -136,9 +146,9 @@ def _retrieve_data_arrays(self,
def _fill_data_arrays_for_study(self,
data_arrays: CompressedDataArrays,
study_name: str,
specimen: str | None=None,
continuous_also: bool=False,
):
specimen: str | None = None,
continuous_also: bool = False,
) -> None:
specimens = self._get_pertinent_specimens(study_name, specimen=specimen)
target_by_symbol = self._get_target_by_symbol(study_name)
logger.debug('Pulling sparse entries for study "%s".', study_name)
@@ -158,8 +168,8 @@ def _fill_data_arrays_for_study(self,
continue
parsed = parse(sparse_entries, continuous_also=continuous_also)
data_arrays_by_specimen, \
target_index_lookup, \
continuous_data_arrays_by_specimen = parsed
target_index_lookup, \
continuous_data_arrays_by_specimen = parsed
data_arrays.add_study_data(
study_name,
data_arrays_by_specimen,
@@ -172,9 +182,18 @@

def _get_pertinent_specimens(self,
study_name: str,
specimen: str | None=None,
specimen: str | None = None,
) -> tuple[str, ...]:
if specimen is not None:
self.cursor.execute('''
SELECT sdmp.specimen
FROM specimen_data_measurement_process sdmp
WHERE sdmp.study=%s
AND sdmp.specimen=%s
;
''', (study_name, specimen))
if len(self.cursor.fetchall()) == 0:
raise ValueError(f'Specimen "{specimen}" not found in study "{study_name}".')
return (specimen,)
self.cursor.execute('''
SELECT sdmp.specimen
@@ -186,7 +205,7 @@
rows = self.cursor.fetchall()
return tuple(cast(str, row[0]) for row in rows)

def _get_study_names(self, study: str | None=None) -> tuple[str, ...]:
def _get_study_names(self, study: str | None = None) -> tuple[str, ...]:
if study is None:
self.cursor.execute('SELECT name FROM specimen_measurement_study ;')
rows = self.cursor.fetchall()
@@ -204,8 +223,8 @@
logger.info(' %s', name)
return names

def _get_sparse_entries(self, study_name:str , specimen: str) -> list[tuple]:
sparse_entries: list[tuple] = []
def _get_sparse_entries(self, study_name: str, specimen: str) -> list[tuple[Any, ...]]:
sparse_entries: list[tuple[Any, ...]] = []
number_log_messages = 0
self.cursor.execute(
self._get_sparse_matrix_query_specimen_specific(),
@@ -265,7 +284,7 @@ def _get_batch_size(self) -> int:

def _parse_data_arrays_by_specimen(self,
sparse_entries: list[tuple],
continuous_also: bool=False,
continuous_also: bool = False,
):
target_index_lookup = self._get_target_index_lookup(sparse_entries)
sparse_entries.sort(key=lambda x: (x[3], x[0]))
@@ -291,7 +310,7 @@ def _parse_data_arrays_by_specimen(self,
data_arrays_by_specimen[specimen],
buffer,
target_index_lookup,
continuous_data_array = continuous_data_arrays_by_specimen[specimen],
continuous_data_array=continuous_data_arrays_by_specimen[specimen],
)
done_message = 'Done parsing %s feature vectors from %s.'
logger.debug(done_message, len(data_arrays_by_specimen[specimen]), specimen)
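Finally, a sketch of the new specimen-only pull path, assuming an open psycopg2 cursor obtained elsewhere in SPT: the study is now resolved from the specimen via StudyAccess rather than being required up front.

```python
from spatialprofilingtoolbox.workflow.common.sparse_matrix_puller import SparseMatrixPuller

# `cursor` is assumed to be an open psycopg2 cursor on the SPT database.
puller = SparseMatrixPuller(cursor)
puller.pull(specimen='specimen1')  # study inferred via StudyAccess
data_arrays = puller.get_data_arrays()
```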