Add version metadata to CytoTable Parquet output #134

Merged Dec 20, 2023 (25 commits)
Changes from 18 commits

Commits
165ed3f add version detection utility (d33bs, Dec 18, 2023)
4edfbb9 manage semver with poetry-dynamic-versioning (d33bs, Dec 19, 2023)
71735b2 comments to further describe what's happening (d33bs, Dec 19, 2023)
3dd6394 update github actions workflows and simplify (d33bs, Dec 19, 2023)
af31b07 remove version util and lint (d33bs, Dec 19, 2023)
7008d8a update pre-commit check versions (d33bs, Dec 19, 2023)
325f4fa add docs on semver for release publishing process (d33bs, Dec 19, 2023)
933ff19 move setup-poetry appropriately (d33bs, Dec 19, 2023)
a522559 correct action location (d33bs, Dec 19, 2023)
2feef8d readd version getter util and test (d33bs, Dec 19, 2023)
1a26cdf add metadata writer (d33bs, Dec 19, 2023)
419bf82 simplify metadata parquet write util (d33bs, Dec 19, 2023)
70f8ddb add a test for _write_parquet_table_with_metadata (d33bs, Dec 19, 2023)
c503214 move to constants module for reuse capabilities (d33bs, Dec 19, 2023)
f936390 update convert with constants and new writer fxn (d33bs, Dec 19, 2023)
a41d697 add tool.setuptools_scm to avoid warnings (d33bs, Dec 19, 2023)
616d9fc linting update (d33bs, Dec 20, 2023)
6420cd2 Merge remote-tracking branch 'upstream/main' into data-versioned-output (d33bs, Dec 20, 2023)
540904f move dunamai to dev deps and update try block (d33bs, Dec 20, 2023)
8d136d9 Apply suggestions from code review (d33bs, Dec 20, 2023)
04a136b add additional notes about release drafts (d33bs, Dec 20, 2023)
8fe4a41 linting (d33bs, Dec 20, 2023)
33d09b5 Merge remote-tracking branch 'upstream/main' into data-versioned-output (d33bs, Dec 20, 2023)
09f4a09 expand docs on kwargs (d33bs, Dec 20, 2023)
8552190 add colons to docstring (d33bs, Dec 20, 2023)
11 changes: 11 additions & 0 deletions .github/actions/setup-poetry/action.yml
@@ -0,0 +1,11 @@
name: Setup Environment and Cache
description: |
Setup poetry for use with GitHub Actions workflows.
Note: presumes pre-installed Python.
runs:
using: "composite"
steps:
- name: Setup poetry and poetry-dynamic-versioning
shell: bash
run: |
python -m pip install poetry poetry-dynamic-versioning
6 changes: 2 additions & 4 deletions .github/workflows/publish-docs.yml
@@ -19,10 +19,8 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: install poetry
uses: abatilo/actions-poetry@v2
with:
poetry-version: "1.6.1"
- name: Setup for poetry
uses: ./.github/actions/setup-poetry
- name: poetry deps
run: poetry install
- name: Build documentation
6 changes: 2 additions & 4 deletions .github/workflows/publish-pypi.yml
@@ -22,10 +22,8 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: install poetry
uses: abatilo/actions-poetry@v2
with:
poetry-version: "1.6.1"
- name: Setup for poetry
uses: ./.github/actions/setup-poetry
- name: poetry deps
run: poetry install
- name: poetry build distribution content
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -24,8 +24,8 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
- name: Install poetry
run: pip install poetry
- name: Setup for poetry
uses: ./.github/actions/setup-poetry
- name: Install environment
run: poetry install --no-interaction --no-ansi
- name: Run sphinx-docs build test
18 changes: 9 additions & 9 deletions .pre-commit-config.yaml
@@ -4,15 +4,15 @@ default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- id: check-toml
- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
rev: v2.2.6
hooks:
- id: codespell
exclude: >
@@ -29,37 +29,37 @@ repos:
- mdformat-myst
- mdformat-gfm
- repo: https://github.com/adrienverge/yamllint
rev: v1.32.0
rev: v1.33.0
hooks:
- id: yamllint
- repo: https://github.com/psf/black
rev: 23.9.1
rev: 23.12.0
hooks:
- id: black
- repo: https://github.com/asottile/blacken-docs
rev: 1.16.0
hooks:
- id: blacken-docs
- repo: https://github.com/PyCQA/bandit
rev: 1.7.5
rev: 1.7.6
hooks:
- id: bandit
args: ["-c", "pyproject.toml"]
additional_dependencies: ["bandit[toml]"]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/jendrikseipp/vulture
rev: v2.9.1
rev: v2.10
hooks:
- id: vulture
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
rev: v1.7.1
hooks:
- id: mypy
- repo: https://github.com/PyCQA/pylint
rev: v3.0.0a7
rev: v3.0.3
hooks:
- id: pylint
name: pylint
4 changes: 4 additions & 0 deletions cytotable/__init__.py
@@ -1,6 +1,10 @@
"""
__init__.py for cytotable
"""

# note: version data is maintained by poetry-dynamic-versioning
__version__ = "0.0.0"

from .convert import convert
from .exceptions import (
CytoTableException,
74 changes: 74 additions & 0 deletions cytotable/constants.py
@@ -0,0 +1,74 @@
"""
CytoTable: constants - storing various constants to be used throughout cytotable.
"""

import multiprocessing
import os
from typing import cast

from cytotable.utils import _get_cytotable_version

# read max threads from environment if necessary
# max threads will be used with default Parsl config and Duckdb
MAX_THREADS = (
multiprocessing.cpu_count()
if "CYTOTABLE_MAX_THREADS" not in os.environ
else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
)

# enables overriding default memory mapping behavior with pyarrow memory mapping
CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
)

DDB_DATA_TYPE_SYNONYMS = {
"real": ["float32", "float4", "float"],
"double": ["float64", "float8", "numeric", "decimal"],
"integer": ["int32", "int4", "int", "signed"],
"bigint": ["int64", "int8", "long"],
}

# A reference dictionary for SQLite affinity and storage class types
# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
"integer": [
"int",
"integer",
"tinyint",
"smallint",
"mediumint",
"bigint",
"unsigned big int",
"int2",
"int8",
],
"text": [
"character",
"varchar",
"varying character",
"nchar",
"native character",
"nvarchar",
"text",
"clob",
],
"blob": ["blob"],
"real": [
"real",
"double",
"double precision",
"float",
],
"numeric": [
"numeric",
"decimal",
"boolean",
"date",
"datetime",
],
}

CYTOTABLE_DEFAULT_PARQUET_METADATA = {
"data-producer": "https://github.com/cytomining/CytoTable",
"data-producer-version": str(_get_cytotable_version()),
}
41 changes: 29 additions & 12 deletions cytotable/convert.py
@@ -302,7 +302,11 @@ def _source_chunk_to_parquet(
from cloudpathlib import AnyPath
from pyarrow import parquet

from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
from cytotable.utils import (
_duckdb_reader,
_sqlite_mixed_type_query_to_parquet,
_write_parquet_table_with_metadata,
)

# attempt to build dest_path
source_dest_path = (
@@ -339,7 +343,7 @@ def _source_chunk_to_parquet(
# read data with chunk size + offset
# and export to parquet
with _duckdb_reader() as ddb_reader:
parquet.write_table(
_write_parquet_table_with_metadata(
table=ddb_reader.execute(
f"""
{base_query}
@@ -358,7 +362,7 @@
"Mismatch Type Error" in str(e)
and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
):
parquet.write_table(
_write_parquet_table_with_metadata(
# here we use sqlite instead of duckdb to extract
# data for special cases where column and value types
# may not align (which is valid functionality in SQLite).
@@ -414,7 +418,8 @@ def _prepend_column_name(

import pyarrow.parquet as parquet

from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.utils import _write_parquet_table_with_metadata

targets = tuple(metadata) + tuple(compartments)

@@ -499,7 +504,7 @@ def _prepend_column_name(
updated_column_names.append(column_name)

# perform table column name updates
parquet.write_table(
_write_parquet_table_with_metadata(
table=table.rename_columns(updated_column_names), where=table_path
)

@@ -569,8 +574,12 @@ def _concat_source_group(
import pyarrow as pa
import pyarrow.parquet as parquet

from cytotable.constants import (
CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
CYTOTABLE_DEFAULT_PARQUET_METADATA,
)
from cytotable.exceptions import SchemaException
from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.utils import _write_parquet_table_with_metadata

# build a result placeholder
concatted: List[Dict[str, Any]] = [
@@ -600,7 +609,9 @@ def _concat_source_group(
destination_path.parent.mkdir(parents=True, exist_ok=True)

# build the schema for concatenation writer
writer_schema = pa.schema(common_schema)
writer_schema = pa.schema(common_schema).with_metadata(
CYTOTABLE_DEFAULT_PARQUET_METADATA
)

# build a parquet file writer which will be used to append files
# as a single concatted parquet file, referencing the first file's schema
@@ -713,7 +724,7 @@ def _join_source_chunk(

import pyarrow.parquet as parquet

from cytotable.utils import _duckdb_reader
from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

# Attempt to read the data to parquet file
# using duckdb for extraction and pyarrow for
@@ -757,7 +768,7 @@ def _join_source_chunk(
)

# write the result
parquet.write_table(
_write_parquet_table_with_metadata(
table=result,
where=result_file_path,
)
@@ -797,7 +808,11 @@ def _concat_join_sources(

import pyarrow.parquet as parquet

from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.constants import (
CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
CYTOTABLE_DEFAULT_PARQUET_METADATA,
)
from cytotable.utils import _write_parquet_table_with_metadata

# remove the unjoined concatted compartments to prepare final dest_path usage
# (we now have joined results)
@@ -811,7 +826,7 @@
shutil.rmtree(path=dest_path)

# write the concatted result as a parquet file
parquet.write_table(
_write_parquet_table_with_metadata(
table=pa.concat_tables(
tables=[
parquet.read_table(
@@ -826,7 +841,9 @@
# build a parquet file writer which will be used to append files
# as a single concatted parquet file, referencing the first file's schema
# (all must be the same schema)
writer_schema = parquet.read_schema(join_sources[0])
writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
CYTOTABLE_DEFAULT_PARQUET_METADATA
)
with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
for table_path in join_sources:
writer.write_table(