Add tablenumber capabilities to distinguish repeated image numbers from multiple datasets (#188)

* initial rework of tablenumber efforts
* enhance tablenumber work
* add manual join to compare during tests
* linting and threaded test
* add list dep
* fix htex tests
* resolve legacy test issue with cytominer-database
* fix tablenumber test
* add docs
* Update cytotable/convert.py Co-authored-by: Gregory Way <[email protected]>
* update docs
* linting

---------

Co-authored-by: Gregory Way <[email protected]>
cytomining · Oct 25, 2024 · ccca87f · ccca87f
1 parent 4e8c57f
commit ccca87f
Show file tree

Hide file tree

Showing 7 changed files with 371 additions and 100 deletions.
diff --git a/cytotable/convert.py b/cytotable/convert.py
@@ -173,6 +173,106 @@ def _prep_cast_column_data_types(
     return columns
 
 
+@python_app
+def _set_tablenumber(
+    sources: Dict[str, List[Dict[str, Any]]],
+    add_tablenumber: Optional[bool] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Gathers a "TableNumber" from the image table (if CSV) or
+    SQLite file (if SQLite source) which is a unique identifier
+    intended to help differentiate between imagenumbers
+    to create distinct records for single-cell profiles
+    referenced across multiple source data exports.
+    For example, ImageNumber column values from CellProfiler
+    will repeat across exports, meaning we may lose distinction
+    when combining multiple export files together through CytoTable.
+
+    Note:
+    - If using CSV data sources, the image.csv table is used for checksum.
+    - If using SQLite data sources, the entire SQLite database is used for checksum.
+    - When tablenumbers are calculated, sources which have no related
+      image table dataset are excluded from the returned structure.
+
+    Args:
+        sources: Dict[str, List[Dict[str, Any]]]
+            Contains metadata about data tables and related contents.
+            Each source must include a path-like "source_path" value.
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber.
+            - True: always calculate a tablenumber.
+            - False: adds None as the tablenumber.
+            - None: calculate a tablenumber only when more than one
+              image table dataset is detected, otherwise adds None.
+
+    Returns:
+        Dict[str, List[Dict[str, Any]]]
+            New sources structure where every source includes
+            a "tablenumber" key (None when no tablenumber is added).
+    """
+
+    from cloudpathlib import AnyPath
+
+    from cytotable.utils import _gather_tablenumber_checksum
+
+    def _dataset_key(source: Dict[str, Any]) -> str:
+        """
+        Build the key which identifies the dataset a source belongs to.
+        """
+        # SQLite exports hold every table within a single file, so the
+        # file itself identifies the dataset. Other (CSV) exports are
+        # identified by the directory which holds their tables.
+        # note: path suffixes include the leading dot (".sqlite"), and the
+        # key is always a string so building and lookup stay aligned.
+        source_path = source["source_path"]
+        return (
+            str(source_path)
+            if str(source_path.suffix).lower() == ".sqlite"
+            else str(source_path.parent)
+        )
+
+    image_table_groups = {
+        # create a data structure with the dataset key for each dataset
+        # and the path which will be used to calculate the checksum.
+        _dataset_key(source): source["source_path"]
+        for source_group_name, source_group_vals in sources.items()
+        # use the image table references only for the basis of
+        # these calculations ("image" also matches "per_image").
+        if "image" in str(AnyPath(source_group_name).stem).lower()
+        for source in source_group_vals
+    }
+
+    # determine if we need to add tablenumber data
+    skip_tablenumber = (
+        # case for explicitly set no tablenumbers
+        add_tablenumber is False
+    ) or (
+        # case for automatic detection where there are not multiple
+        # image tables which need to be differentiated
+        add_tablenumber is None
+        and len(image_table_groups) <= 1
+    )
+
+    if skip_tablenumber:
+        return {
+            source_group_name: [
+                dict(source, tablenumber=None) for source in source_group_vals
+            ]
+            for source_group_name, source_group_vals in sources.items()
+        }
+
+    # calculate the checksum for each dataset from its image table
+    # (CSV) or whole database file (SQLite)
+    tablenumber_table = {
+        group: _gather_tablenumber_checksum(path)
+        for group, path in image_table_groups.items()
+    }
+
+    # return a modified sources data structure with the tablenumber added
+    return {
+        source_group_name: [
+            dict(source, tablenumber=tablenumber_table[_dataset_key(source)])
+            for source in source_group_vals
+            # sources without a related image table dataset have no
+            # checksum to reference and are left out.
+            if _dataset_key(source) in tablenumber_table
+        ]
+        for source_group_name, source_group_vals in sources.items()
+    }
+
+
 @python_app
 def _get_table_keyset_pagination_sets(
     chunk_size: int,
@@ -310,15 +410,27 @@ def _source_pageset_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
+    # build tablenumber segment addition (if necessary)
+    tablenumber_sql = (
+        # to become tablenumber in sql select later with bigint (8-byte integer)
+        # we cast here to bigint to avoid concat or join conflicts later due to
+        # misaligned automatic data typing.
+        f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
+        if source["tablenumber"] is not None
+        # don't introduce the column if we aren't supposed to add tablenumber
+        # as per parameter.
+        else ""
+    )
+
     # add source table columns
     casted_source_cols = [
         # here we cast the column to the specified type ensure the colname remains the same
         f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
         for column in source["columns"]
     ]
 
-    # create selection statement from lists above
-    select_columns = ",".join(
+    # create selection statement from tablenumber_sql + lists above
+    select_columns = tablenumber_sql + ",".join(
         # if we should sort the output, add the metadata_cols
         casted_source_cols
         if sort_output
@@ -376,6 +488,7 @@ def _source_pageset_to_parquet(
                     page_key=source["page_key"],
                     pageset=pageset,
                     sort_output=sort_output,
+                    tablenumber=source["tablenumber"],
                 ),
                 where=result_filepath,
             )
@@ -994,6 +1107,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     sort_output: bool,
     page_keys: Dict[str, str],
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
     **kwargs,
 ) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
     """
@@ -1137,6 +1251,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in invalid_files_dropped.items()
     }
 
+    # add tablenumber details, appending None if not add_tablenumber
+    tablenumber_prepared = _set_tablenumber(
+        sources=evaluate_futures(column_names_and_types_gathered),
+        add_tablenumber=add_tablenumber,
+    ).result()
+
     results = {
         source_group_name: [
             dict(
@@ -1165,7 +1285,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             for source in source_group_vals
         ]
         for source_group_name, source_group_vals in evaluate_futures(
-            column_names_and_types_gathered
+            tablenumber_prepared
         ).items()
     }
 
@@ -1277,6 +1397,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
     page_keys: Optional[Dict[str, str]] = None,
     sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
@@ -1326,6 +1447,11 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
             https://arrow.apache.org/docs/python/api/datatypes.html
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber which helps differentiate
+            various repeated values (such as ObjectNumber) within source data.
+            Useful for processing multiple SQLite or CSV data sources together
+            to retain distinction from each dataset.
         page_keys: str:
             The table and column names to be used for key pagination.
             Uses the form: {"table_name":"column_name"}.
@@ -1466,6 +1592,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             infer_common_schema=infer_common_schema,
             drop_null=drop_null,
             data_type_cast_map=data_type_cast_map,
+            add_tablenumber=add_tablenumber,
             sort_output=sort_output,
             page_keys=cast(dict, page_keys),
             **kwargs,

diff --git a/cytotable/utils.py b/cytotable/utils.py
@@ -182,6 +182,7 @@ def _sqlite_mixed_type_query_to_parquet(
     page_key: str,
     pageset: Tuple[Union[int, float], Union[int, float]],
     sort_output: bool,
+    tablenumber: Optional[int] = None,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -201,6 +202,9 @@ def _sqlite_mixed_type_query_to_parquet(
             Specifies whether to sort cytotable output or not.
         add_cytotable_meta: bool, default=False:
             Whether to add CytoTable metadata fields or not
+        tablenumber: Optional[int], default=None:
+            An optional table number to append to the results.
+            Defaults to None.
 
     Returns:
         pyarrow.Table:
@@ -256,9 +260,19 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str:
             # return the translated type for use in SQLite
             return translated_type[0]
 
+        # build tablenumber segment addition (if necessary)
+        tablenumber_sql = (
+            # to become tablenumber in sql select later with integer
+            f"CAST({tablenumber} AS INTEGER) as TableNumber, "
+            if tablenumber is not None
+            # if we don't have a tablenumber value, don't introduce the column
+            else ""
+        )
+
         # create cases for mixed-type handling in each column discovered above
-        query_parts = [
-            f"""
+        query_parts = tablenumber_sql + ", ".join(
+            [
+                f"""
             CASE
                 /* when the storage class type doesn't match the column, return nulltype */
                 WHEN typeof({col['column_name']}) !=
@@ -267,13 +281,14 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str:
                 ELSE {col['column_name']}
             END AS {col['column_name']}
             """
-            for col in column_info
-        ]
+                for col in column_info
+            ]
+        )
 
         # perform the select using the cases built above and using chunksize + offset
         sql_stmt = f"""
             SELECT
-                {', '.join(query_parts)}
+                {query_parts}
             FROM {table_name}
             WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
             {"ORDER BY " + page_key if sort_output else ""};
@@ -482,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
     )
 
 
+def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
+    """
+    Calculate a CRC-32 checksum of a file for use as a unique identifier
+    across datasets. Approach referenced from cytominer-database:
+    https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129
+
+    Args:
+        pathname: str:
+            A path to the file which the checksum is generated from.
+        buffer_size: int:
+            Maximum number of bytes to read from the file at a time.
+
+    Returns:
+        int
+            an integer representing the checksum of the pathname file.
+    """
+
+    import os
+    import zlib
+
+    # never request more bytes per read than the file contains
+    chunk_size = min(buffer_size, os.path.getsize(pathname))
+
+    # start from the checksum of empty data and fold in each chunk read
+    checksum = zlib.crc32(bytes(0))
+    with open(str(pathname), "rb") as stream:
+        # an empty read signals there is no more data to use
+        for chunk in iter(lambda: stream.read(chunk_size), b""):
+            checksum = zlib.crc32(chunk, checksum)
+
+    # constrain the result to an unsigned 32-bit integer
+    return checksum & 0xFFFFFFFF
+
+
 def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
     """
     Helper function to unwrap futures from values or return values

diff --git a/docs/source/architecture.data.md b/docs/source/architecture.data.md
@@ -25,6 +25,7 @@ Data are organized into tables of generally two categories:
 
 Identifying or key fields for image and compartment tables may include the following:
 
+- __TableNumber__: Provides a unique number, calculated as a checksum of the source file used to build CytoTable output, which helps distinguish repeated values in ImageNumber, ObjectNumber, or other referenced metadata columns. Typically useful when combining multiple SQLite or CSV-based source datasets.
 - __ImageNumber__: Provides specificity on what image is being referenced (there may be many).
 - __ObjectNumber__: Provides specificity for a specific compartment object within an ImageNumber.
 - __Parent_Cells__: Provides a related Cell compartment ObjectNumber. This field is canonically referenced from the Cytoplasm compartment for joining Cytoplasm and Cell compartment data. (see [Cytoplasm Compartment Data Relationships](architecture.data.md#cytoplasm-compartment-data-relationships) below for greater detail)

diff --git a/docs/source/python-api.md b/docs/source/python-api.md
@@ -45,6 +45,10 @@ Convert
 
 |
 
+.. autofunction:: _set_tablenumber
+
+|
+
 .. autofunction:: _prepend_column_name
 
 |
-Original file line number
+Diff line change
@@ Expand Up / @@ -45,6 +45,10 @@ Convert @@
     |
+    .. autofunction:: _set_tablenumber
+    |
     .. autofunction:: _prepend_column_name
     |
@@ Expand Down @@