Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use compartment tables as basis of preset joins #183

Merged
merged 9 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -195,3 +195,18 @@ references:
See:
- https://sbpdiscovery.org/our-scientists/alexandre-colas-phd
- https://www.moleculardevices.com/products/cellular-imaging-systems/acquisition-and-analysis-software/in-carta-image-analysis-software
- authors:
- name: "Way Lab nf1_cellpainting_data Team"
date-accessed: "2024-04-03"
title: Way Lab nf1_cellpainting_data CellProfiler Data
type: data
repository-code: "https://github.com/WayScience/nf1_cellpainting_data"
url: "https://github.com/WayScience/nf1_cellpainting_data/raw/main/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite"
scope: "Plate_3_nf1_analysis.sqlite"
notes: >-
CellProfiler-generated data from the nf1_cellpainting_data project is used to help
validate expected results for CytoTable.
identifiers:
- description: "GitHub Link with Contributors"
type: url
value: "https://github.com/WayScience/nf1_cellpainting_data/graphs/contributors"
26 changes: 13 additions & 13 deletions cytotable/presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@
SELECT
*
FROM
Image_Filtered AS image
LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells ON
cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Image_Filtered AS image ON
image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
""",
},
"cellprofiler_sqlite": {
Expand Down Expand Up @@ -85,15 +85,15 @@
SELECT
*
FROM
Per_Image_Filtered AS per_image
LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
LEFT JOIN Per_Image_Filtered AS per_image ON
per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
""",
},
"cellprofiler_sqlite_pycytominer": {
Expand Down Expand Up @@ -136,15 +136,15 @@
SELECT
*
FROM
Per_Image_Filtered AS per_image
LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Per_Image_Filtered AS per_image ON
per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
""",
},
"cell-health-cellprofiler-to-cytominer-database": {
Expand Down Expand Up @@ -190,10 +190,7 @@
SELECT
*
FROM
Image_Filtered AS image
LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
cytoplasm.Metadata_TableNumber = image.Metadata_TableNumber
AND cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells ON
cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
Expand All @@ -202,6 +199,9 @@
nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
LEFT JOIN Image_Filtered AS image ON
image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
""",
},
"in-carta": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Creates a small test dataset from Plate_3_nf1_analysis.sqlite for testing purposes.

Source:
https://github.com/WayScience/nf1_cellpainting_data/raw/main/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite
"""

# disable similar line checks for pylint
# pylint: disable=R0801

import shutil
import sqlite3

SQLITE_SOURCE = "Plate_3_nf1_analysis.sqlite"
SQLITE_TARGET = "test-Plate_3_nf1_analysis.sqlite"

# note: we presume the pre-existence of Plate_3_nf1_analysis.sqlite
# from an earlier download outside of this python work.
# All deletions below are applied to the copy, never to the source file.
shutil.copy(SQLITE_SOURCE, SQLITE_TARGET)

# sqlite3's connection context manager only commits or rolls back the
# open transaction; it does not close the connection. Close it explicitly
# so the file handle is released once the script is done.
conn = sqlite3.connect(SQLITE_TARGET)
try:
    with conn:
        # Trim the image table. A row is deleted only when BOTH conditions
        # hold, so rows from site '1' or from well 'B1' are retained.
        conn.execute(
            """
            DELETE FROM Per_Image
            /* use site and well which are known to
            contain imagenumbers that don't persist
            to compartment tables */
            WHERE Image_Metadata_Site != '1'
            AND Image_Metadata_Well != 'B1';
            """
        )
        # Trim the compartment tables to match the remaining images and
        # additionally drop objects whose object number is greater than 2.
        for table in ["Cells", "Nuclei", "Cytoplasm"]:
            conn.execute(
                f"""
                DELETE FROM Per_{table}
                WHERE
                /* filter using only imagenumbers which exist in modified
                image table after deletions */
                ImageNumber NOT IN (SELECT ImageNumber FROM Per_Image)
                /* Here we limit the number of objects which are returned
                for each compartment table so as to keep the test dataset
                very small. */
                OR {table}_Number_Object_Number > 2
                """
            )

    # The deletions were committed when the `with` block exited. VACUUM
    # cannot run inside an open transaction, so it is issued afterwards to
    # reclaim the space freed by the deletions.
    conn.execute("VACUUM;")
finally:
    conn.close()
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
Cell_Health_-_Cell_Painting_Single_Cell_Profiles/9995672?file=18506036
"""

# disable similar line checks for pylint
# pylint: disable=R0801

import shutil
import sqlite3

Expand Down
32 changes: 32 additions & 0 deletions tests/test_convert_threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import parsl
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor
Expand Down Expand Up @@ -237,3 +238,34 @@ def test_get_source_filepaths(
).result()
# test that the single dir structure includes 4 unique keys
assert len(set(single_dir_result.keys())) == 4


def test_avoid_na_row_output(
    load_parsl_threaded: None, fx_tempdir: str, data_dir_cellprofiler: str
):
    """
    Test to help detect and avoid scenarios where rows of NA-based data are returned
    due to undetected objects in compartments (and as a result, imagenumbers).
    For example, if there are imagenumbers in the image table and not the compartment table,
    we want to avoid returning rows of NA data from the compartment tables after joins take place.
    """

    # run convert using a dataset known to contain the scenario outlined above.
    parquet_file = convert(
        source_path=(
            f"{data_dir_cellprofiler}"
            "/nf1_cellpainting_data/test-Plate_3_nf1_analysis.sqlite"
        ),
        dest_path=f"{fx_tempdir}/nf1_cellpainting_data/test-Plate_3_nf1_analysis.parquet",
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
    )

    # read back the joined output and isolate the image number column
    image_numbers = parquet.read_table(
        source=parquet_file,
    ).column("Metadata_ImageNumber")

    # count the null entries: is_null yields one boolean per row and sum
    # totals the true values
    null_total = pc.sum(pc.is_null(image_numbers)).as_py()

    # check that we have no nulls within Metadata_ImageNumber column
    assert not null_total
Loading