Skip to content

Commit

Permalink
Add tablenumber capabilities to distinguish repeated image numbers fr…
Browse files Browse the repository at this point in the history
…om multiple datasets (#188)

* initial rework of tablenumber efforts

* enhance tablenumber work

* add manual join to compare during tests

* linting and threaded test

* add list dep

* fix htex tests

* resolve legacy test issue with cytominer-database

* fix tablenumber test

* add docs

* Update cytotable/convert.py

Co-authored-by: Gregory Way <[email protected]>

* update docs

* linting

---------

Co-authored-by: Gregory Way <[email protected]>
  • Loading branch information
d33bs and gwaybio authored Oct 25, 2024
1 parent 4e8c57f commit ccca87f
Show file tree
Hide file tree
Showing 7 changed files with 371 additions and 100 deletions.
133 changes: 130 additions & 3 deletions cytotable/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,106 @@ def _prep_cast_column_data_types(
return columns


@python_app
def _set_tablenumber(
    sources: Dict[str, List[Dict[str, Any]]],
    add_tablenumber: Optional[bool] = None,
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Gathers a "TableNumber" from the image table (if CSV) or
    SQLite file (if SQLite source) which is a unique identifier
    intended to help differentiate between imagenumbers
    to create distinct records for single-cell profiles
    referenced across multiple source data exports.
    For example, ImageNumber column values from CellProfiler
    will repeat across exports, meaning we may lose distinction
    when combining multiple export files together through CytoTable.

    Note:
        - If using CSV data sources, the image.csv table is used for checksum
          and sources are grouped by their parent directory.
        - If using SQLite data sources, the entire SQLite database is used
          for checksum and sources are grouped by the SQLite file itself.
        - When tablenumbers are calculated, sources which do not belong to
          a dataset with a detected image table are left out of the result.

    Args:
        sources: Dict[str, List[Dict[str, Any]]]
            Contains metadata about data tables and related contents.
            Each source is expected to include a path-like "source_path".
        add_tablenumber: Optional[bool]
            Whether to add a calculated tablenumber.
            When None, a tablenumber is only calculated if more than
            one dataset (image table group) is detected.
            Note: when False, adds None as the tablenumber

    Returns:
        Dict[str, List[Dict[str, Any]]]
            New source groups with added "tablenumber" details.
    """

    from cloudpathlib import AnyPath

    from cytotable.utils import _gather_tablenumber_checksum

    def _dataset_key(source_path: Any) -> str:
        """
        Build the key used to associate a source with its dataset:
        the file itself for SQLite sources or the parent directory
        for any other source (for example, CSV exports).
        """
        # note: path suffixes include the leading dot (".sqlite"), so
        # comparing against "sqlite" would never match a SQLite file.
        if source_path.suffix.lower() == ".sqlite":
            return str(source_path)
        return str(source_path.parent)

    # create a data structure with the dataset key for each dataset
    # and the file from which the checksum will be calculated.
    image_table_groups = {
        _dataset_key(source["source_path"]): source["source_path"]
        for source_group_name, source_group_vals in sources.items()
        # use the image table references only as the basis of
        # these calculations.
        if any(
            value in str(AnyPath(source_group_name).stem).lower()
            for value in ["image", "per_image"]
        )
        for source in source_group_vals
    }

    # determine if we need to add tablenumber data
    if (
        # case for zero or one image table, where there is
        # nothing which needs to be differentiated
        add_tablenumber is None
        and len(image_table_groups) <= 1
    ) or (
        # case for explicitly set no tablenumbers
        add_tablenumber is False
    ):
        return {
            source_group_name: [
                dict(source, tablenumber=None) for source in source_group_vals
            ]
            for source_group_name, source_group_vals in sources.items()
        }

    # calculate one checksum per dataset using the file gathered above
    tablenumber_table = {
        group: _gather_tablenumber_checksum(path)
        for group, path in image_table_groups.items()
    }

    # return a modified sources data structure with the tablenumber added,
    # using the same dataset key as above so SQLite and CSV sources
    # are matched to the checksum of their own dataset.
    return {
        source_group_name: [
            dict(
                source,
                tablenumber=tablenumber_table[_dataset_key(source["source_path"])],
            )
            for source in source_group_vals
            if _dataset_key(source["source_path"]) in tablenumber_table
        ]
        for source_group_name, source_group_vals in sources.items()
    }


@python_app
def _get_table_keyset_pagination_sets(
chunk_size: int,
Expand Down Expand Up @@ -310,15 +410,27 @@ def _source_pageset_to_parquet(
)
pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

# build tablenumber segment addition (if necessary)
tablenumber_sql = (
# to become tablenumber in sql select later with bigint (8-byte integer)
# we cast here to bigint to avoid concat or join conflicts later due to
# misaligned automatic data typing.
f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
if source["tablenumber"] is not None
# don't introduce the column if we aren't supposed to add tablenumber
# as per parameter.
else ""
)

# add source table columns
casted_source_cols = [
# here we cast the column to the specified type ensure the colname remains the same
f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
for column in source["columns"]
]

# create selection statement from lists above
select_columns = ",".join(
# create selection statement from tablenumber_sql + lists above
select_columns = tablenumber_sql + ",".join(
# if we should sort the output, add the metadata_cols
casted_source_cols
if sort_output
Expand Down Expand Up @@ -376,6 +488,7 @@ def _source_pageset_to_parquet(
page_key=source["page_key"],
pageset=pageset,
sort_output=sort_output,
tablenumber=source["tablenumber"],
),
where=result_filepath,
)
Expand Down Expand Up @@ -994,6 +1107,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
sort_output: bool,
page_keys: Dict[str, str],
data_type_cast_map: Optional[Dict[str, str]] = None,
add_tablenumber: Optional[bool] = None,
**kwargs,
) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
"""
Expand Down Expand Up @@ -1137,6 +1251,12 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
for source_group_name, source_group_vals in invalid_files_dropped.items()
}

# add tablenumber details, appending None if not add_tablenumber
tablenumber_prepared = _set_tablenumber(
sources=evaluate_futures(column_names_and_types_gathered),
add_tablenumber=add_tablenumber,
).result()

results = {
source_group_name: [
dict(
Expand Down Expand Up @@ -1165,7 +1285,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
for source in source_group_vals
]
for source_group_name, source_group_vals in evaluate_futures(
column_names_and_types_gathered
tablenumber_prepared
).items()
}

Expand Down Expand Up @@ -1277,6 +1397,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
infer_common_schema: bool = True,
drop_null: bool = False,
data_type_cast_map: Optional[Dict[str, str]] = None,
add_tablenumber: Optional[bool] = None,
page_keys: Optional[Dict[str, str]] = None,
sort_output: bool = True,
preset: Optional[str] = "cellprofiler_csv",
Expand Down Expand Up @@ -1326,6 +1447,11 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
A dictionary mapping data type groups to specific types.
Roughly includes Arrow data types language from:
https://arrow.apache.org/docs/python/api/datatypes.html
add_tablenumber: Optional[bool]
Whether to add a calculated tablenumber which helps differentiate
various repeated values (such as ObjectNumber) within source data.
Useful for processing multiple SQLite or CSV data sources together
to retain distinction from each dataset.
page_keys: str:
The table and column names to be used for key pagination.
Uses the form: {"table_name":"column_name"}.
Expand Down Expand Up @@ -1466,6 +1592,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
infer_common_schema=infer_common_schema,
drop_null=drop_null,
data_type_cast_map=data_type_cast_map,
add_tablenumber=add_tablenumber,
sort_output=sort_output,
page_keys=cast(dict, page_keys),
**kwargs,
Expand Down
66 changes: 61 additions & 5 deletions cytotable/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def _sqlite_mixed_type_query_to_parquet(
page_key: str,
pageset: Tuple[Union[int, float], Union[int, float]],
sort_output: bool,
tablenumber: Optional[int] = None,
) -> str:
"""
Performs SQLite table data extraction where one or many
Expand All @@ -201,6 +202,9 @@ def _sqlite_mixed_type_query_to_parquet(
Specifies whether to sort cytotable output or not.
add_cytotable_meta: bool, default=False:
Whether to add CytoTable metadata fields or not
tablenumber: Optional[int], default=None:
An optional table number to append to the results.
Defaults to None.
Returns:
pyarrow.Table:
Expand Down Expand Up @@ -256,9 +260,19 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str:
# return the translated type for use in SQLite
return translated_type[0]

# build tablenumber segment addition (if necessary)
tablenumber_sql = (
# to become tablenumber in sql select later with integer
f"CAST({tablenumber} AS INTEGER) as TableNumber, "
if tablenumber is not None
# if we don't have a tablenumber value, don't introduce the column
else ""
)

# create cases for mixed-type handling in each column discovered above
query_parts = [
f"""
query_parts = tablenumber_sql + ", ".join(
[
f"""
CASE
/* when the storage class type doesn't match the column, return nulltype */
WHEN typeof({col['column_name']}) !=
Expand All @@ -267,13 +281,14 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str:
ELSE {col['column_name']}
END AS {col['column_name']}
"""
for col in column_info
]
for col in column_info
]
)

# perform the select using the cases built above and using chunksize + offset
sql_stmt = f"""
SELECT
{', '.join(query_parts)}
{query_parts}
FROM {table_name}
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
{"ORDER BY " + page_key if sort_output else ""};
Expand Down Expand Up @@ -482,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
)


def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
    """
    Calculate a CRC-32 checksum of a file's contents for use as a
    unique identifier across datasets.

    Approach referenced from cytominer-database:
    https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129

    Args:
        pathname: str:
            Path to the file whose contents are checksummed.
        buffer_size: int:
            Maximum number of bytes to read from the file at a time.

    Returns:
        int
            The checksum of the file contents as an unsigned 32-bit integer.
    """

    import os
    import zlib

    # there's no need to request more bytes per read than the file holds
    chunk_size = min(buffer_size, os.path.getsize(pathname))

    # start from the checksum of empty data and fold in each chunk read
    checksum = zlib.crc32(b"")
    with open(str(pathname), "rb") as handle:
        # iterate over reads until an empty read signals there is no more data
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            checksum = zlib.crc32(chunk, checksum)

    # mask to keep the result within the unsigned 32-bit range
    return checksum & 0xFFFFFFFF


def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
"""
Helper function to unwrap futures from values or return values
Expand Down
1 change: 1 addition & 0 deletions docs/source/architecture.data.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Data are organized into tables of generally two categories:

Identifying or key fields for image and compartment tables may include the following:

- __TableNumber__: Provides a unique number, calculated as a checksum of the source file used to build CytoTable output, which helps distinguish otherwise repeated values in ImageNumber, ObjectNumber, or other referenced metadata columns. Typically useful when combining multiple SQLite or CSV-based source datasets.
- __ImageNumber__: Provides specificity on what image is being referenced (there may be many).
- __ObjectNumber__: Provides specificity for a specific compartment object within an ImageNumber.
- __Parent_Cells__: Provides a related Cell compartment ObjectNumber. This field is canonically referenced from the Cytoplasm compartment for joining Cytoplasm and Cell compartment data. (see [Cytoplasm Compartment Data Relationships](architecture.data.md#cytoplasm-compartment-data-relationships) below for greater detail)
Expand Down
4 changes: 4 additions & 0 deletions docs/source/python-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ Convert
|
.. autofunction:: _set_tablenumber
|
.. autofunction:: _prepend_column_name
|
Expand Down
Loading

0 comments on commit ccca87f

Please sign in to comment.