From 6d311965526a61088e2fdc4ce4ca83feec0d6c82 Mon Sep 17 00:00:00 2001
From: James Mathews <mail@jmath.read-books.org>
Date: Mon, 25 Sep 2023 15:54:11 -0400
Subject: [PATCH] Save histological structure IDs in the compressed binary
 format, read them back out, and index the DataFrames by them.

---
 .../ondemand/compressed_matrix_writer.py      |  3 ++-
 .../ondemand/providers/provider.py            | 22 +++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py b/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py
index f915dd106..3e608e4f1 100644
--- a/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py
+++ b/spatialprofilingtoolbox/ondemand/compressed_matrix_writer.py
@@ -85,7 +85,8 @@ def _get_specimens_and_indices(
 
     def _write_data_array_to_file(self, data_array: dict[int, int], filename: str) -> None:
         with open(filename, 'wb') as file:
-            for entry in data_array:
+            for histological_structure_id, entry in data_array.items():
+                file.write(histological_structure_id.to_bytes(8, 'little'))
                 file.write(entry.to_bytes(8, 'little'))
 
     def _report_subsample_for_inspection(self, data_arrays: CompressedDataArrays):
diff --git a/spatialprofilingtoolbox/ondemand/providers/provider.py b/spatialprofilingtoolbox/ondemand/providers/provider.py
index a7d0b8fb5..777827a5d 100644
--- a/spatialprofilingtoolbox/ondemand/providers/provider.py
+++ b/spatialprofilingtoolbox/ondemand/providers/provider.py
@@ -96,7 +96,7 @@ def _get_data_array_from_file(
         target_index_lookup: dict,
         target_by_symbol: dict,
     ) -> DataFrame:
-        """Load data arrays from a precomputed JSON artifact."""
+        """Load data arrays from a precomputed binary artifact."""
         rows = []
         target_index_lookup = cast(dict, target_index_lookup)
         target_by_symbol = cast(dict, target_by_symbol)
@@ -104,24 +104,28 @@ def _get_data_array_from_file(
         size = len(feature_columns)
         with open(filename, 'rb') as file:
             while True:
-                buffer = file.read(8)
-                row = cls._parse_cell_row(buffer, size)
+                buffer1 = file.read(8)
+                buffer2 = file.read(8)
+                row = cls._parse_cell_row(buffer1, buffer2, size)
                 if row is None:
                     break
                 rows.append(row)
-        return DataFrame(rows, columns=feature_columns + ['integer'])
+        df = DataFrame(rows, columns=feature_columns + ['integer', 'histological_structure_id'])
+        df.set_index('histological_structure_id', inplace=True)
+        return df
 
     @staticmethod
-    def _parse_cell_row(buffer: bytes, size: int) -> tuple[int, ...] | None:
-        if buffer == b'':
+    def _parse_cell_row(buffer1: bytes, buffer2: bytes, size: int) -> tuple[int, ...] | None:
+        if buffer1 == b'':
             return None
         binary_expression_64_string = ''.join([
             ''.join(list(reversed(bin(ii)[2:].rjust(8, '0'))))
-            for ii in buffer
+            for ii in buffer2
         ])
         truncated_to_channels = binary_expression_64_string[0:size]
-        integer = int.from_bytes(buffer, 'little')
-        return tuple([int(b) for b in list(truncated_to_channels)] + [integer])
+        integer_phenotypes = int.from_bytes(buffer2, 'little')
+        integer_id = int.from_bytes(buffer1, 'little')
+        return tuple([int(b) for b in list(truncated_to_channels)] + [integer_phenotypes] + [integer_id])
 
     @staticmethod
     def _add_centroids(