diff --git a/cytotable/__init__.py b/cytotable/__init__.py
index fa0cb4f4..65d1bb91 100644
--- a/cytotable/__init__.py
+++ b/cytotable/__init__.py
@@ -2,7 +2,7 @@
 __init__.py for cytotable
 """
 
-# note: version data is maintained by poetry-dynamic-versioning
+# note: version data is maintained by poetry-dynamic-versioning (do not edit)
 __version__ = "0.0.0"
 
 from .convert import convert
diff --git a/cytotable/constants.py b/cytotable/constants.py
new file mode 100644
index 00000000..591b3238
--- /dev/null
+++ b/cytotable/constants.py
@@ -0,0 +1,74 @@
+"""
+CytoTable: constants - storing various constants to be used throughout cytotable.
+"""
+
+import multiprocessing
+import os
+from typing import cast
+
+from cytotable.utils import _get_cytotable_version
+
+# read max threads from environment if necessary
+# max threads will be used with default Parsl config and Duckdb
+MAX_THREADS = (
+    multiprocessing.cpu_count()
+    if "CYTOTABLE_MAX_THREADS" not in os.environ
+    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
+)
+
+# enables overriding default memory mapping behavior with pyarrow memory mapping
+CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
+    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
+)
+
+DDB_DATA_TYPE_SYNONYMS = {
+    "real": ["float32", "float4", "float"],
+    "double": ["float64", "float8", "numeric", "decimal"],
+    "integer": ["int32", "int4", "int", "signed"],
+    "bigint": ["int64", "int8", "long"],
+}
+
+# A reference dictionary for SQLite affinity and storage class types
+# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
+SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
+    "integer": [
+        "int",
+        "integer",
+        "tinyint",
+        "smallint",
+        "mediumint",
+        "bigint",
+        "unsigned big int",
+        "int2",
+        "int8",
+    ],
+    "text": [
+        "character",
+        "varchar",
+        "varying character",
+        "nchar",
+        "native character",
+        "nvarchar",
+        "text",
+        "clob",
+    ],
+    "blob": ["blob"],
+    "real": [
+        "real",
+        "double",
+        "double precision",
+        "float",
+    ],
+    "numeric": [
+        "numeric",
+        "decimal",
+        "boolean",
+        "date",
+        "datetime",
+    ],
+}
+
+CYTOTABLE_DEFAULT_PARQUET_METADATA = {
+    "data-producer": "https://github.com/cytomining/CytoTable",
+    "data-producer-version": str(_get_cytotable_version()),
+}
diff --git a/cytotable/convert.py b/cytotable/convert.py
index b0842335..ad083b80 100644
--- a/cytotable/convert.py
+++ b/cytotable/convert.py
@@ -302,7 +302,11 @@ def _source_chunk_to_parquet(
     from cloudpathlib import AnyPath
     from pyarrow import parquet
 
-    from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
+    from cytotable.utils import (
+        _duckdb_reader,
+        _sqlite_mixed_type_query_to_parquet,
+        _write_parquet_table_with_metadata,
+    )
 
     # attempt to build dest_path
     source_dest_path = (
@@ -339,7 +343,7 @@ def _source_chunk_to_parquet(
         # read data with chunk size + offset
         # and export to parquet
         with _duckdb_reader() as ddb_reader:
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                 table=ddb_reader.execute(
                     f"""
                     {base_query}
@@ -358,7 +362,7 @@ def _source_chunk_to_parquet(
             "Mismatch Type Error" in str(e)
            and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
         ):
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
                 # data for special cases where column and value types
                 # may not align (which is valid functionality in SQLite).
@@ -414,7 +418,8 @@ def _prepend_column_name(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     targets = tuple(metadata) + tuple(compartments)
 
@@ -499,7 +504,7 @@ def _prepend_column_name(
             updated_column_names.append(column_name)
 
     # perform table column name updates
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
         table=table.rename_columns(updated_column_names), where=table_path
     )
 
@@ -569,8 +574,12 @@ def _concat_source_group(
     import pyarrow as pa
     import pyarrow.parquet as parquet
 
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -600,7 +609,9 @@ def _concat_source_group(
         destination_path.parent.mkdir(parents=True, exist_ok=True)
 
         # build the schema for concatenation writer
-        writer_schema = pa.schema(common_schema)
+        writer_schema = pa.schema(common_schema).with_metadata(
+            CYTOTABLE_DEFAULT_PARQUET_METADATA
+        )
 
         # build a parquet file writer which will be used to append files
         # as a single concatted parquet file, referencing the first file's schema
@@ -713,7 +724,7 @@ def _join_source_chunk(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -757,7 +768,7 @@ def _join_source_chunk(
     )
 
     # write the result
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
         table=result,
         where=result_file_path,
     )
@@ -797,7 +808,11 @@ def _concat_join_sources(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -811,7 +826,7 @@ def _concat_join_sources(
         shutil.rmtree(path=dest_path)
 
     # write the concatted result as a parquet file
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
         table=pa.concat_tables(
             tables=[
                 parquet.read_table(
@@ -826,7 +841,9 @@ def _concat_join_sources(
     # build a parquet file writer which will be used to append files
     # as a single concatted parquet file, referencing the first file's schema
     # (all must be the same schema)
-    writer_schema = parquet.read_schema(join_sources[0])
+    writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
+        CYTOTABLE_DEFAULT_PARQUET_METADATA
+    )
     with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
         for table_path in join_sources:
             writer.write_table(
diff --git a/cytotable/utils.py b/cytotable/utils.py
index 371e575e..8f317b0c 100644
--- a/cytotable/utils.py
+++ b/cytotable/utils.py
@@ -3,13 +3,13 @@
 """
 
 import logging
-import multiprocessing
 import os
 import pathlib
-from typing import Any, Dict, Union, cast
+from typing import Any, Dict, Optional, Union, cast
 
 import duckdb
 import parsl
+import pyarrow as pa
 from cloudpathlib import AnyPath, CloudPath
 from cloudpathlib.exceptions import InvalidPrefixError
 from parsl.app.app import AppBase
@@ -19,67 +19,6 @@
 
 logger = logging.getLogger(__name__)
 
-# read max threads from environment if necessary
-# max threads will be used with default Parsl config and Duckdb
-MAX_THREADS = (
-    multiprocessing.cpu_count()
-    if "CYTOTABLE_MAX_THREADS" not in os.environ
-    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
-)
-
-# enables overriding default memory mapping behavior with pyarrow memory mapping
-CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
-    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
-)
-
-DDB_DATA_TYPE_SYNONYMS = {
-    "real": ["float32", "float4", "float"],
-    "double": ["float64", "float8", "numeric", "decimal"],
-    "integer": ["int32", "int4", "int", "signed"],
-    "bigint": ["int64", "int8", "long"],
-}
-
-# A reference dictionary for SQLite affinity and storage class types
-# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
-SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
-    "integer": [
-        "int",
-        "integer",
-        "tinyint",
-        "smallint",
-        "mediumint",
-        "bigint",
-        "unsigned big int",
-        "int2",
-        "int8",
-    ],
-    "text": [
-        "character",
-        "varchar",
-        "varying character",
-        "nchar",
-        "native character",
-        "nvarchar",
-        "text",
-        "clob",
-    ],
-    "blob": ["blob"],
-    "real": [
-        "real",
-        "double",
-        "double precision",
-        "float",
-    ],
-    "numeric": [
-        "numeric",
-        "decimal",
-        "boolean",
-        "date",
-        "datetime",
-    ],
-}
-
-
 # reference the original init
 original_init = AppBase.__init__
 
@@ -198,6 +137,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         duckdb.DuckDBPyConnection
     """
 
+    import duckdb
+
+    from cytotable.constants import MAX_THREADS
+
     return duckdb.connect().execute(
         # note: we use an f-string here to
         # dynamically configure threads as appropriate
@@ -252,8 +195,8 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
-    from cytotable.utils import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
 
     # open sqlite3 connection
     with sqlite3.connect(source_path) as conn:
@@ -384,6 +327,9 @@ def _arrow_type_cast_if_specified(
         Dict[str, str]
             A potentially data type updated dictionary of column information
     """
+
+    from cytotable.constants import DDB_DATA_TYPE_SYNONYMS
+
     # for casting to new float type
     if "float" in data_type_cast_map.keys() and column["column_dtype"] in [
         "REAL",
@@ -453,3 +399,56 @@ def _expand_path(
         modifed_path = modifed_path.expanduser()
 
     return modifed_path.resolve()
+
+
+def _get_cytotable_version() -> str:
+    """
+    Seeks the current version of CytoTable using either dunamai
+    (for source installs) or the package's static __version__ attribute.
+
+    Returns:
+        str
+            A string representing the version of CytoTable currently being used.
+    """
+
+    try:
+        # attempt to gather the development version from dunamai
+        # for scenarios where cytotable from source is used.
+        import dunamai
+
+        return dunamai.Version.from_any_vcs().serialize()
+    except (RuntimeError, ModuleNotFoundError):
+        # else grab a static version from __init__.py
+        # for scenarios where the built/packaged cytotable is used.
+        import cytotable
+
+        return cytotable.__version__
+
+
+def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
+    """
+    Adds metadata to parquet output from CytoTable.
+    Note: this mostly wraps pyarrow.parquet.write_table
+    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
+
+    Args:
+        table: pa.Table:
+            PyArrow table to be serialized as a parquet file.
+        **kwargs: Any:
+            Keyword arguments are passed through to
+            pyarrow.parquet.write_table; for example,
+            one might expect the following here:
+            - where: str or pyarrow.NativeFile
+    """
+
+    from pyarrow import parquet
+
+    from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
+    from cytotable.utils import _get_cytotable_version
+
+    parquet.write_table(
+        table=table.replace_schema_metadata(
+            metadata=CYTOTABLE_DEFAULT_PARQUET_METADATA
+        ),
+        **kwargs,
+    )
diff --git a/docs/source/contributing.md b/docs/source/contributing.md
index 2b5f3e91..3a578207 100644
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -190,6 +190,8 @@ CytoTable semvers are controlled through [`poetry-dynamic-versioning`](https://g
 CytoTable release git tags are automatically applied through [GitHub Releases](https://docs.github.com/en/repositories/releasing-projects-on-github/about-releases) and related inferred changes from [`release-drafter`](https://github.com/release-drafter/release-drafter).
 
 1. Open a pull request and use a repository label for `release-` to label the pull request for visibility with [`release-drafter`](https://github.com/release-drafter/release-drafter) (for example, see [CytoTable#108](https://github.com/cytomining/CytoTable/pull/108) as a reference of a semver patch update).
-1. On merging the pull request for the release, a [GitHub Actions workflow](https://docs.github.com/en/actions/using-workflows) defined in `draft-release.yml` leveraging [`release-drafter`](https://github.com/release-drafter/release-drafter) will draft a release for maintainers to modify as needed (double checking on the prepared git tag and content).
+1. On merging the pull request for the release, a [GitHub Actions workflow](https://docs.github.com/en/actions/using-workflows) defined in `draft-release.yml` leveraging [`release-drafter`](https://github.com/release-drafter/release-drafter) will draft a release for maintainers.
+   The draft GitHub release will include a version tag inferred by `release-drafter` from the pull request label applied earlier.
+   The draft release does not normally need additional modifications but may be changed as needed.
 1. Make modifications as necessary to the draft GitHub release, then publish the release.
 1. On publishing the release, another GitHub Actions workflow defined in `publish-pypi.yml` will run to build and deploy the Python package to PyPI (utilizing the earlier modified `pyproject.toml` semantic version reference for labeling the release).
diff --git a/pyproject.toml b/pyproject.toml
index 7fa8649f..4cfc7f9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "CytoTable"
-# note: version data is maintained by poetry-dynamic-versioning
+# note: version data is maintained by poetry-dynamic-versioning (do not edit)
 version = "0.0.0"
 description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
 authors = ["Cytomining Community"]
@@ -20,13 +20,14 @@ vcs = "git"
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 build-backend = "poetry_dynamic_versioning.backend"
 
+[tool.setuptools_scm]
+
 [tool.poetry.dependencies]
 python = ">=3.8,<3.13"
 pyarrow = ">=13.0.0"
 cloudpathlib = {extras = ["all"], version = "^0.15.0"}
 duckdb = ">=0.8.0"
 parsl = ">=2023.9.25"
-dunamai = "^1.19.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "^7.4.0"
@@ -38,8 +39,7 @@ moto = {extras = ["s3", "server"], version = "^4.0.0"}
 cffconvert = "^2.0.0"
 cytominer-database = "^0.3.4"
 pycytominer = { git = "https://github.com/cytomining/pycytominer.git", rev = "09b2c79aa94908e3520f0931a844db4fba7fd3fb" }
-
-
+dunamai = "^1.19.0"
 
 [tool.vulture]
 min_confidence = 80
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 50d78a88..8572fd29 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -37,7 +37,9 @@
     _column_sort,
     _duckdb_reader,
     _expand_path,
+    _get_cytotable_version,
     _sqlite_mixed_type_query_to_parquet,
+    _write_parquet_table_with_metadata,
 )
 
 
@@ -60,6 +62,51 @@ def test_config():
     ) == sorted(config_preset.keys())
 
 
+def test_get_cytotable_version():
+    """
+    Tests _get_cytotable_version
+    """
+
+    assert isinstance(_get_cytotable_version(), str)
+
+
+def test_write_parquet_table_with_metadata(fx_tempdir: str):
+    """
+    Tests _write_parquet_table_with_metadata
+    """
+
+    # create a test dir and str with path for test file
+    pathlib.Path(f"{fx_tempdir}/pq_metadata_test").mkdir()
+    destination_file = f"{fx_tempdir}/pq_metadata_test/test.parquet"
+
+    # test writing to parquet file with metadata
+    _write_parquet_table_with_metadata(
+        table=pa.Table.from_pydict(
+            {
+                "col_1": [
+                    "a",
+                ],
+                "col_2": [
+                    1,
+                ],
+            }
+        ),
+        where=destination_file,
+    )
+
+    # assert that we have the metadata in place as expected
+    assert (
+        parquet.read_table(source=destination_file).schema.metadata[b"data-producer"]
+        == b"https://github.com/cytomining/CytoTable"
+    )
+    assert (
+        parquet.read_table(source=destination_file).schema.metadata[
+            b"data-producer-version"
+        ]
+        == _get_cytotable_version().encode()
+    )
+
+
 def test_existing_dest_path(fx_tempdir: str, data_dir_cellprofiler_sqlite_nf1: str):
     """
     Tests running cytotable.convert with existing dest_path
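
The following is not part of the patch itself, but a minimal sketch of how the Parquet schema metadata introduced here can be read back with pyarrow. The file name `example.parquet` is a hypothetical stand-in for any output produced by CytoTable with this change applied:

```python
from pyarrow import parquet

# schema-level metadata keys and values are stored as bytes
metadata = parquet.read_schema("example.parquet").metadata

# written by _write_parquet_table_with_metadata via
# CYTOTABLE_DEFAULT_PARQUET_METADATA
print(metadata[b"data-producer"])
# b'https://github.com/cytomining/CytoTable'
print(metadata[b"data-producer-version"])
# e.g. b'0.0.0', or a dunamai-serialized version for source installs
```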
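
Similarly hedged, a short sketch of the environment variables consumed by the new `cytotable.constants` module. Because `MAX_THREADS` and `CYTOTABLE_ARROW_USE_MEMORY_MAPPING` are evaluated at module import time, the variables must be set before importing CytoTable:

```python
import os

# cap threads used by the default Parsl config and DuckDB
os.environ["CYTOTABLE_MAX_THREADS"] = "4"
# disable pyarrow memory mapping (enabled by default, "1")
os.environ["CYTOTABLE_ARROW_USE_MEMORY_MAPPING"] = "0"

from cytotable.constants import (
    CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
    MAX_THREADS,
)

assert MAX_THREADS == 4
assert CYTOTABLE_ARROW_USE_MEMORY_MAPPING is False
```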