From 6d311965526a61088e2fdc4ce4ca83feec0d6c82 Mon Sep 17 00:00:00 2001 From: James Mathews Date: Mon, 25 Sep 2023 15:54:11 -0400 Subject: [PATCH] Add histological structure id saving to binary compressed format, and read out, and add indexing on it to the DataFrames. --- .../ondemand/compressed_matrix_writer.py | 3 ++- .../ondemand/providers/provider.py | 22 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py b/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py index f915dd106..3e608e4f1 100644 --- a/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py +++ b/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py @@ -85,7 +85,8 @@ def _get_specimens_and_indices( def _write_data_array_to_file(self, data_array: dict[int, int], filename: str) -> None: with open(filename, 'wb') as file: - for entry in data_array: + for histological_structure_id, entry in data_array.items(): + file.write(histological_structure_id.to_bytes(8, 'little')) file.write(entry.to_bytes(8, 'little')) def _report_subsample_for_inspection(self, data_arrays: CompressedDataArrays): diff --git a/spatialprofilingtoolbox/ondemand/providers/provider.py b/spatialprofilingtoolbox/ondemand/providers/provider.py index a7d0b8fb5..777827a5d 100644 --- a/spatialprofilingtoolbox/ondemand/providers/provider.py +++ b/spatialprofilingtoolbox/ondemand/providers/provider.py @@ -96,7 +96,7 @@ def _get_data_array_from_file( target_index_lookup: dict, target_by_symbol: dict, ) -> DataFrame: - """Load data arrays from a precomputed JSON artifact.""" + """Load data arrays from a precomputed binary artifact.""" rows = [] target_index_lookup = cast(dict, target_index_lookup) target_by_symbol = cast(dict, target_by_symbol) @@ -104,24 +104,28 @@ def _get_data_array_from_file( size = len(feature_columns) with open(filename, 'rb') as file: while True: - buffer = file.read(8) - row = cls._parse_cell_row(buffer, size) + buffer1 = file.read(8) + buffer2 = file.read(8) + row = cls._parse_cell_row(buffer1, buffer2, size) if row is None: break rows.append(row) - return DataFrame(rows, columns=feature_columns + ['integer']) + df = DataFrame(rows, columns=feature_columns + ['integer', 'histological_structure_id']) + df.set_index('histological_structure_id', inplace=True) + return df @staticmethod - def _parse_cell_row(buffer: bytes, size: int) -> tuple[int, ...] | None: - if buffer == b'': + def _parse_cell_row(buffer1: bytes, buffer2: bytes, size: int) -> tuple[int, ...] | None: + if buffer1 == b'': return None binary_expression_64_string = ''.join([ ''.join(list(reversed(bin(ii)[2:].rjust(8, '0')))) - for ii in buffer + for ii in buffer2 ]) truncated_to_channels = binary_expression_64_string[0:size] - integer = int.from_bytes(buffer, 'little') - return tuple([int(b) for b in list(truncated_to_channels)] + [integer]) + integer_phenotypes = int.from_bytes(buffer2, 'little') + integer_id = int.from_bytes(buffer1, 'little') + return tuple([int(b) for b in list(truncated_to_channels)] + [integer_phenotypes] + [integer_id]) @staticmethod def _add_centroids(