Skip to content

Commit

Permalink
Use compartment tables as basis of preset joins (#183)
Browse files Browse the repository at this point in the history
* add order to limit and offset queries

* order sqlite based results

* add testing data and citation

* add a test for the data

* modify presets and test

* formatting

* modify presets to pass tests

* clarify testing docs

Co-Authored-By: Gregory Way <[email protected]>

---------

Co-authored-by: Gregory Way <[email protected]>
  • Loading branch information
d33bs and gwaybio authored Apr 18, 2024
1 parent 496ff36 commit c1b4765
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 13 deletions.
15 changes: 15 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -195,3 +195,18 @@ references:
See:
- https://sbpdiscovery.org/our-scientists/alexandre-colas-phd
- https://www.moleculardevices.com/products/cellular-imaging-systems/acquisition-and-analysis-software/in-carta-image-analysis-software
- authors:
- name: "Way Lab nf1_cellpainting_data Team"
date-accessed: "2024-04-03"
title: Way Lab nf1_cellpainting_data CellProfiler Data
type: data
repository-code: "https://github.com/WayScience/nf1_cellpainting_data"
url: "https://github.com/WayScience/nf1_cellpainting_data/raw/main/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite"
scope: "Plate_3_nf1_analysis.sqlite"
notes: >-
CellProfiler-generated data from the nf1_cellpainting_data project is used to help validate
expected results for CytoTable.
identifiers:
- description: "Github Link with Contributors"
type: url
value: "https://github.com/WayScience/nf1_cellpainting_data/graphs/contributors"
26 changes: 13 additions & 13 deletions cytotable/presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@
SELECT
*
FROM
Image_Filtered AS image
LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells ON
cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Image_Filtered AS image ON
image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
""",
},
"cellprofiler_sqlite": {
Expand Down Expand Up @@ -85,15 +85,15 @@
SELECT
*
FROM
Per_Image_Filtered AS per_image
LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
LEFT JOIN Per_Image_Filtered AS per_image ON
per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
""",
},
"cellprofiler_sqlite_pycytominer": {
Expand Down Expand Up @@ -136,15 +136,15 @@
SELECT
*
FROM
Per_Image_Filtered AS per_image
LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Per_Image_Filtered AS per_image ON
per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
""",
},
"cell-health-cellprofiler-to-cytominer-database": {
Expand Down Expand Up @@ -190,10 +190,7 @@
SELECT
*
FROM
Image_Filtered AS image
LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
cytoplasm.Metadata_TableNumber = image.Metadata_TableNumber
AND cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells ON
cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
Expand All @@ -202,6 +199,9 @@
nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Image_Filtered AS image ON
image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
""",
},
"in-carta": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Creates small test dataset from Plate_3_nf1_analysis.sqlite for testing purposes.
Source:
https://github.com/WayScience/nf1_cellpainting_data/raw/main/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite
"""

# disable similar line checks for pylint
# pylint: disable=R0801

import shutil
import sqlite3

SQLITE_SOURCE = "Plate_3_nf1_analysis.sqlite"
SQLITE_TARGET = "test-Plate_3_nf1_analysis.sqlite"

# note: we presume the pre-existence of Plate_3_nf1_analysis.sqlite
# from an earlier download outside of this python work.
shutil.copy(SQLITE_SOURCE, SQLITE_TARGET)

with sqlite3.connect(SQLITE_TARGET) as conn:
# delete data except that related to two tablenumbers
conn.execute(
"""
DELETE FROM Per_Image
/* use site and well which are known to
contain imagenumbers that don't persist
to compartment tables */
WHERE Image_Metadata_Site != '1'
AND Image_Metadata_Well != 'B1';
"""
)
# do the same for compartment tables, also removing objectnumbers > 3
for table in ["Cells", "Nuclei", "Cytoplasm"]:
conn.execute(
f"""
DELETE FROM Per_{table}
WHERE
/* filter using only imagenumbers which exist in modified
image table after deletions */
ImageNumber NOT IN (SELECT ImageNumber FROM Per_Image)
/* Here we limit the number of objects which are returned
for each compartment table so as to keep the test dataset
very small. */
OR {table}_Number_Object_Number > 2
"""
)

conn.commit()
conn.execute("VACUUM;")
conn.commit()
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
Cell_Health_-_Cell_Painting_Single_Cell_Profiles/9995672?file=18506036
"""

# disable similar line checks for pylint
# pylint: disable=R0801

import shutil
import sqlite3

Expand Down
38 changes: 38 additions & 0 deletions tests/test_convert_threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import parsl
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor
Expand Down Expand Up @@ -237,3 +238,40 @@ def test_get_source_filepaths(
).result()
# test that the single dir structure includes 4 unique keys
assert len(set(single_dir_result.keys())) == 4


def test_avoid_na_row_output(
    load_parsl_threaded: None, fx_tempdir: str, data_dir_cellprofiler: str
):
    """
    Guard against CytoTable emitting rows made up of NA-based data.

    Such rows occur when CytoTable processes CellProfiler data sources with
    images that do not contain segmented objects, i.e. when the input
    CellProfiler image table contains imagenumbers that do not exist in any
    compartment object. Using compartment tables as the basis of data joins
    avoids this, so the converted output should not include single-cell rows
    which carry image table metadata alongside NA feature data.
    """

    # input dataset known to contain the scenario outlined above
    sqlite_source = (
        f"{data_dir_cellprofiler}"
        "/nf1_cellpainting_data/test-Plate_3_nf1_analysis.sqlite"
    )
    parquet_dest = (
        f"{fx_tempdir}/nf1_cellpainting_data/test-Plate_3_nf1_analysis.parquet"
    )

    # run the conversion with the pycytominer-oriented sqlite preset
    converted_path = convert(
        source_path=sqlite_source,
        dest_path=parquet_dest,
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
    )

    # read back the image number column from the converted output
    image_numbers = parquet.read_table(
        source=converted_path,
    ).column("Metadata_ImageNumber")

    # summing the null mask counts the null entries in the column
    null_total = pc.sum(pc.is_null(image_numbers)).as_py()

    # check that we have no nulls within Metadata_ImageNumber column
    assert not null_total

0 comments on commit c1b4765

Please sign in to comment.