From 23f6bc9b7797feeebfb164be6123309b11a6b8c5 Mon Sep 17 00:00:00 2001 From: matt garber Date: Mon, 17 Jun 2024 11:19:09 -0400 Subject: [PATCH] Chunking sql export queries, coverage (#249) * Chunking SQL export queries * cleanup extra files, docstring tweaks --- .coveragerc | 2 + .github/workflows/ci.yaml | 16 +- .gitignore | 1 + cumulus_library/actions/exporter.py | 37 +- cumulus_library/cli.py | 98 ++--- cumulus_library/cli_parser.py | 1 + cumulus_library/databases.py | 116 ++++-- docs/first-time-setup.md | 3 - pyproject.toml | 1 + tests/test_actions.py | 4 +- tests/test_cli.py | 114 +++++- .../core/core__count_condition_month.csv | 4 +- .../core__count_documentreference_month.csv | 36 +- .../core/core__count_encounter_all_types.csv | 16 +- .../core__count_encounter_all_types_month.csv | 48 +-- .../core/core__count_encounter_month.csv | 128 +++--- .../core__count_encounter_priority_month.csv | 24 +- .../core__count_encounter_service_month.csv | 24 +- .../core/core__count_encounter_type_month.csv | 12 +- .../core__count_medicationrequest_month.csv | 24 +- .../core__count_observation_lab_month.csv | 28 +- .../core/core__count_patient.csv | 24 +- .../expected_export/core/core__meta_date.csv | 2 +- .../core/core__meta_version.csv | 2 +- .../study_bad_manifest/manifest.toml | 4 + tests/test_data/study_python_valid/module2.py | 4 +- .../test_data/study_python_valid_generated.md | 7 + tests/test_databases.py | 367 ++++++++++++++++++ 28 files changed, 831 insertions(+), 316 deletions(-) create mode 100644 .coveragerc create mode 100644 tests/test_data/study_bad_manifest/manifest.toml create mode 100644 tests/test_databases.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..1e6c3683 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit =cumulus_library/schema/* diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e3fe5e9..fe872d65 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -29,9 +29,23 @@ jobs: run: | python -m pip install --upgrade pip pip install ".[test]" + - name: Create mock AWS credentials + run: | + mkdir ~/.aws && touch ~/.aws/credentials + echo -e "[test]\naws_access_key_id = test\naws_secret_access_key = test" > ~/.aws/credentials - name: Test with pytest run: | - python -m pytest + python -m pytest --cov-report xml --cov=cumulus_library tests + - name: Generate coverage report + uses: orgoro/coverage@v3.1 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + thresholdAll: .9 + thresholdNew: 1 + thresholdModified: .95 + + lint: runs-on: ubuntu-22.04 steps: diff --git a/.gitignore b/.gitignore index 705a1721..8e47ed3d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ output.sql *generated.md MRCONSO.RRF *.zip +coverage.xml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/cumulus_library/actions/exporter.py b/cumulus_library/actions/exporter.py index f28fa55f..9e854574 100644 --- a/cumulus_library/actions/exporter.py +++ b/cumulus_library/actions/exporter.py @@ -1,6 +1,7 @@ -import csv import pathlib +import pyarrow +from pyarrow import csv, parquet from rich.progress import track from cumulus_library import base_utils, databases, study_parser @@ -25,12 +26,25 @@ def reset_counts_exports( file.unlink() +def _write_chunk(writer, chunk, schema): + writer.write( + pyarrow.Table.from_pandas( + chunk.sort_values( + by=list(chunk.columns), ascending=False, na_position="first" + ), + preserve_index=False, + schema=schema, + ) + ) + + def export_study( 
 manifest_parser: study_parser.StudyManifestParser,
     db: databases.DatabaseBackend,
     schema_name: str,
     data_path: pathlib.Path,
     archive: bool,
+    chunksize: int = 1000000,
 ) -> list:
     """Exports csvs/parquet extracts of tables listed in export_list
     :param db: A database backend
@@ -56,13 +70,22 @@ def export_study(
         description=f"Exporting {manifest_parser.get_study_prefix()} data...",
     ):
         query = f"SELECT * FROM {table}"
-        dataframe = db.execute_as_pandas(query)
+        dataframe_chunks, db_schema = db.execute_as_pandas(query, chunksize=chunksize)
         path.mkdir(parents=True, exist_ok=True)
-        dataframe = dataframe.sort_values(
-            by=list(dataframe.columns), ascending=False, na_position="first"
-        )
-        dataframe.to_csv(f"{path}/{table}.csv", index=False, quoting=csv.QUOTE_MINIMAL)
-        dataframe.to_parquet(f"{path}/{table}.parquet", index=False)
+        schema = pyarrow.schema(db.col_pyarrow_types_from_sql(db_schema))
+        with parquet.ParquetWriter(f"{path}/{table}.parquet", schema) as p_writer:
+            with csv.CSVWriter(
+                f"{path}/{table}.csv",
+                schema,
+                write_options=csv.WriteOptions(
+                    # Note that this quoting style is not exactly csv.QUOTE_MINIMAL
+                    # https://github.com/apache/arrow/issues/42032
+                    quoting_style="needed"
+                ),
+            ) as c_writer:
+                for chunk in dataframe_chunks:
+                    _write_chunk(p_writer, chunk, schema)  # pragma: no cover
+                    _write_chunk(c_writer, chunk, schema)  # pragma: no cover
         queries.append(query)
     if archive:
         base_utils.zip_dir(path, data_path, manifest_parser.get_study_prefix())
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
index 5e951008..c34dc5d1 100755
--- a/cumulus_library/cli.py
+++ b/cumulus_library/cli.py
@@ -82,11 +82,12 @@ def clean_study(
     :param stats_clean: If true, removes previous stats runs
     :keyword prefix: If True, does a search by string prefix in place of study name
     """
-    if targets is None or targets == ["all"]:
+    if targets is None:
         sys.exit(
             "Explicit targets for cleaning not provided. "
             "Provide one or more explicit study prefixes to remove."
         )
+
     for target in targets:
         if prefix:
             manifest_parser = study_parser.StudyManifestParser()
@@ -207,25 +208,6 @@ def run_matching_table_builder(
             config=config,
         )
-    def clean_and_build_all(
-        self, study_dict: dict, config: base_utils.StudyConfig
-    ) -> None:
-        """Builds tables for all studies.
-
-        NOTE: By design, this method will always exclude the `template` study dir,
-        since 99% of the time you don't need a live copy in the database.
- - :param study_dict: A dict of paths - :param config: A StudyConfig object containing optional params - """ - study_dict = dict(study_dict) - study_dict.pop("template") - for precursor_study in ["vocab", "core"]: - self.clean_and_build_study(study_dict[precursor_study], config=config) - study_dict.pop(precursor_study) - for key in study_dict: - self.clean_and_build_study(study_dict[key], config=config) - ### Data exporters def export_study( self, target: pathlib.Path, data_path: pathlib.Path, archive: bool @@ -241,11 +223,6 @@ def export_study( manifest_parser, self.db, self.schema_name, data_path, archive ) - def export_all(self, study_dict: dict, data_path: pathlib.Path, archive: bool): - """Exports all defined count tables to disk""" - for key in study_dict.keys(): - self.export_study(study_dict[key], data_path, archive) - def generate_study_sql( self, target: pathlib.Path, @@ -296,24 +273,6 @@ def get_abs_path(path: str) -> pathlib.Path: return pathlib.Path(pathlib.Path.cwd(), path) -def create_template(path: str) -> None: - """Creates a manifest in target dir if one doesn't exist""" - abs_path = get_abs_path(path) - manifest_path = pathlib.Path(abs_path, "manifest.toml") - if manifest_path.exists(): - sys.exit(f"A manifest.toml already exists at {abs_path}, skipping creation") - abs_path.mkdir(parents=True, exist_ok=True) - - copy_lists = [ - ["studies/template/manifest.toml", "manifest.toml"], - [".sqlfluff", ".sqlfluff"], - ] - for source, dest in copy_lists: - source_path = pathlib.Path(pathlib.Path(__file__).resolve().parents[0], source) - dest_path = pathlib.Path(abs_path, dest) - dest_path.write_bytes(source_path.read_bytes()) - - def get_study_dict(alt_dir_paths: list) -> dict[str, pathlib.Path] | None: """Gets valid study targets from ./studies/, and any pip installed studies @@ -362,10 +321,8 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]: def run_cli(args: dict): """Controls which library tasks are run based on CLI arguments""" console = rich.console.Console() - if args["action"] == "create": - create_template(args["create_dir"]) - elif args["action"] == "upload": + if args["action"] == "upload": try: uploader.upload_files(args) except requests.RequestException as e: @@ -387,7 +344,7 @@ def run_cli(args: dict): runner.verbose = True console.print("[italic] Connecting to database...") runner.cursor.execute("SHOW DATABASES") - study_dict = get_study_dict(args["study_dir"]) + study_dict = get_study_dict(args.get("study_dir")) if "prefix" not in args.keys(): if args.get("target"): for target in args["target"]: @@ -406,19 +363,16 @@ def run_cli(args: dict): prefix=args["prefix"], ) elif args["action"] == "build": - if "all" in args["target"]: - runner.clean_and_build_all(study_dict, config=config) - else: - for target in args["target"]: - if args["builder"]: - runner.run_matching_table_builder( - study_dict[target], args["builder"], config=config - ) - else: - runner.clean_and_build_study( - study_dict[target], - config=config, - ) + for target in args["target"]: + if args["builder"]: + runner.run_matching_table_builder( + study_dict[target], args["builder"], config=config + ) + else: + runner.clean_and_build_study( + study_dict[target], + config=config, + ) elif args["action"] == "export": if args["archive"]: @@ -429,20 +383,17 @@ def run_cli(args: dict): "set[/italic], primarily dates, on a per patient level.\n\n" "[bold]By doing this, you are assuming the responsibility for " "meeting your organization's security requirements for " - 
"storing this data in a secure manager.[/bold]\n\n" + "storing this data in a secure manner.[/bold]\n\n" "Type Y to proceed, or any other value to quit.\n" ) console.print(warning_text) response = input() if response.lower() != "y": sys.exit() - if "all" in args["target"]: - runner.export_all(study_dict, args["data_path"], args["archive"]) - else: - for target in args["target"]: - runner.export_study( - study_dict[target], args["data_path"], args["archive"] - ) + for target in args["target"]: + runner.export_study( + study_dict[target], args["data_path"], args["archive"] + ) elif args["action"] == "import": for archive in args["archive_path"]: @@ -475,11 +426,6 @@ def main(cli_args=None): if args["action"] is None: parser.print_usage() sys.exit(1) - if args.get("target"): - for target in args["target"]: - if target == "all": - args["target"] = ["all"] - break arg_env_pairs = ( ("data_path", "CUMULUS_LIBRARY_DATA_PATH"), @@ -493,7 +439,7 @@ def main(cli_args=None): ("umls_key", "UMLS_API_KEY"), ("url", "CUMULUS_AGGREGATOR_URL"), ("user", "CUMULUS_AGGREGATOR_USER"), - ("workgroup", "CUMULUS_LIBRARY_WORKGROUP"), + ("work_group", "CUMULUS_LIBRARY_WORKGROUP"), ) read_env_vars = [] for pair in arg_env_pairs: @@ -541,8 +487,8 @@ def main(cli_args=None): def main_cli(): # called by the generated wrapper scripts - main() + main() # pragma: no cover if __name__ == "__main__": - main() + main() # pragma: no cover diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index f4f42aac..fa79aeb8 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -12,6 +12,7 @@ def add_aws_config(parser: argparse.ArgumentParser) -> None: aws.add_argument( "--workgroup", default="cumulus", + dest="work_group", help="Cumulus Athena workgroup (default: cumulus)", ) aws.add_argument( diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 9d0520c5..a6e48db7 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -9,6 +9,7 @@ """ import abc +import collections import datetime import json import os @@ -35,16 +36,16 @@ class DatabaseCursor(Protocol): """Protocol for a PEP-249 compatible cursor""" def execute(self, sql: str) -> None: - pass + pass # pragma: no cover def fetchone(self) -> list | None: - pass + pass # pragma: no cover def fetchmany(self, size: int | None) -> list[list] | None: - pass + pass # pragma: no cover def fetchall(self) -> list[list] | None: - pass + pass # pragma: no cover class DatabaseParser(abc.ABC): @@ -151,7 +152,9 @@ def pandas_cursor(self) -> DatabaseCursor: """ @abc.abstractmethod - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): """Returns a pandas.DataFrame version of the results from the provided SQL""" @abc.abstractmethod @@ -172,7 +175,7 @@ def operational_errors(self) -> tuple[Exception]: def col_parquet_types_from_pandas(self, field_types: list) -> list: """Returns appropriate types for creating tables based from parquet. - By default, returns the input (which assumes that the DB infers directly + By default, returns an empty list (which assumes that the DB infers directly from parquet data types). 
 Only override if your DB uses an explicit SerDe format, or otherwise
         needs a modified typing to inject directly into a query."""
@@ -196,9 +199,10 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
         #         raise errors.CumulusLibraryError(
         #             f"Unsupported type {type(field)} found."
         #         )
-        # return output
+        return []
-        return field_types
+    def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list:
+        return columns
     def upload_file(
         self,
@@ -257,14 +261,17 @@ def cursor(self) -> AthenaCursor:
     def pandas_cursor(self) -> AthenaPandasCursor:
         return self.connection.cursor(cursor=AthenaPandasCursor)
-    def execute_as_pandas(self, sql: str) -> pandas.DataFrame:
-        return self.pandas_cursor().execute(sql).as_pandas()
+    def execute_as_pandas(
+        self, sql: str, chunksize: int | None = None
+    ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]):
+        query = self.pandas_cursor().execute(sql, chunksize=chunksize)
+        return query.as_pandas(), query.description
     def parser(self) -> DatabaseParser:
         return AthenaParser()
     def operational_errors(self) -> tuple[Exception]:
-        return (pyathena.OperationalError,)
+        return (pyathena.OperationalError,)  # pragma: no cover
     def col_parquet_types_from_pandas(self, field_types: list) -> list:
         output = []
@@ -272,7 +279,10 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
             match field:
                 case numpy.dtypes.ObjectDType():
                     output.append("STRING")
-                case pandas.core.arrays.integer.Int64Dtype():
+                case (
+                    pandas.core.arrays.integer.Int64Dtype()
+                    | numpy.dtypes.Int64DType()
+                ):
                     output.append("INT")
                 case numpy.dtypes.Float64DType():
                     output.append("DOUBLE")
@@ -282,7 +292,31 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
                     output.append("TIMESTAMP")
                 case _:
                     raise errors.CumulusLibraryError(
-                        f"Unsupported type {type(field)} found."
+                        f"Unsupported pandas type {type(field)} found."
                     )
         return output
+
+    def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list:
+        output = []
+        for column in columns:
+            match column[1]:
+                case "varchar":
+                    output.append((column[0], pyarrow.string()))
+                case "bigint":
+                    output.append((column[0], pyarrow.int64()))
+                case "integer":
+                    output.append((column[0], pyarrow.int64()))
+                case "double":
+                    output.append((column[0], pyarrow.float64()))
+                case "boolean":
+                    output.append((column[0], pyarrow.bool_()))
+                case "date":
+                    output.append((column[0], pyarrow.date64()))
+                case "timestamp":
+                    output.append((column[0], pyarrow.timestamp("s")))
+                case _:
+                    raise errors.CumulusLibraryError(
+                        f"Unsupported SQL type '{column}' found."
+                    )
+        return output
@@ -296,9 +330,8 @@ def upload_file(
         self,
         force_upload=False,
     ) -> str | None:
         # We'll investigate the connection to get the relevant S3 upload path.
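         # For example, an OutputLocation of s3://my-bucket/athena-results/ would
         # yield bucket "my-bucket" and key_prefix "athena-results/" below (an
         # illustrative value, not a real workgroup configuration).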
- wg_conf = self.connection._client.get_work_group(WorkGroup=self.work_group)[ - "WorkGroup" - ]["Configuration"]["ResultConfiguration"] + workgroup = self.connection._client.get_work_group(WorkGroup=self.work_group) + wg_conf = workgroup["WorkGroup"]["Configuration"]["ResultConfiguration"] s3_path = wg_conf["OutputLocation"] bucket = "/".join(s3_path.split("/")[2:3]) key_prefix = "/".join(s3_path.split("/")[3:]) @@ -315,7 +348,7 @@ def upload_file( f"{key_prefix}cumulus_user_uploads/{self.schema_name}/" f"{study}/{topic}" ) if not remote_filename: - remote_filename = file + remote_filename = file.name session = boto3.Session(profile_name=self.connection.profile_name) s3_client = session.client("s3") @@ -337,7 +370,7 @@ def upload_file( return f"s3://{bucket}/{s3_key}" def close(self) -> None: - return self.connection.close() + return self.connection.close() # pragma: no cover class AthenaParser(DatabaseParser): @@ -525,18 +558,47 @@ def pandas_cursor(self) -> duckdb.DuckDBPyConnection: # Since this is not provided, return the vanilla cursor return self.connection - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): # We call convert_dtypes here in case there are integer columns. # Pandas will normally cast nullable-int as a float type unless # we call this to convert to its nullable int column type. # PyAthena seems to do this correctly for us, but not DuckDB. - return self.connection.execute(sql).df().convert_dtypes() + result = self.connection.execute(sql) + if chunksize: + return iter([result.df().convert_dtypes()]), result.description + return result.df().convert_dtypes(), result.description + + def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list: + output = [] + for column in columns: + match column[1]: + case "STRING": + output.append((column[0], pyarrow.string())) + case "INTEGER": + output.append((column[0], pyarrow.int64())) + case "NUMBER": + output.append((column[0], pyarrow.float64())) + case "DOUBLE": + output.append((column[0], pyarrow.float64())) + case "boolean" | "bool": + output.append((column[0], pyarrow.bool_())) + case "Date": + output.append((column[0], pyarrow.date64())) + case "TIMESTAMP" | "DATETIME": + output.append((column[0], pyarrow.timestamp("s"))) + case _: + raise errors.CumulusLibraryError( + f"{column[0],column[1]} does not have a conversion type" + ) + return output def parser(self) -> DatabaseParser: return DuckDbParser() def operational_errors(self) -> tuple[Exception]: - return (duckdb.OperationalError,) + return (duckdb.OperationalError,) # pragma: no cover def close(self) -> None: self.connection.close() @@ -652,23 +714,25 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]: def create_db_backend(args: dict[str, str]) -> DatabaseBackend: db_config.db_type = args["db_type"] - database = args["schema_name"] + schema = args["schema_name"] load_ndjson_dir = args.get("load_ndjson_dir") if db_config.db_type == "duckdb": - backend = DuckDatabaseBackend(database) # `database` is path name in this case + backend = DuckDatabaseBackend(schema) # `schema` is path name in this case if load_ndjson_dir: backend.insert_tables(read_ndjson_dir(load_ndjson_dir)) elif db_config.db_type == "athena": backend = AthenaDatabaseBackend( args["region"], - args["workgroup"], + args["work_group"], args["profile"], - database, + schema, ) if load_ndjson_dir: sys.exit("Loading an ndjson dir is 
not supported with --db-type=athena.") else: - raise ValueError(f"Unexpected --db-type value '{db_config.db_type}'") + raise errors.CumulusLibraryError( + f"'{db_config.db_type}' is not a supported database." + ) return backend diff --git a/docs/first-time-setup.md b/docs/first-time-setup.md index 0746c2c1..0197ed18 100644 --- a/docs/first-time-setup.md +++ b/docs/first-time-setup.md @@ -28,9 +28,6 @@ You can install directly from pypi by running: Installing adds a `cumulus-library` command for interacting with Athena. It provides several actions for users: -- `create` will create a manifest file for you so you can start working on -authoring queries (more information on this in -[Creating studies](./creating-studies.md)). - `build` will create new study tables, replacing previously created versions (more information on this in [Creating studies](./creating-studies.md)). - `clean` will remove studies from Athena, in case you no longer need them diff --git a/pyproject.toml b/pyproject.toml index 46c12dce..154c97dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dev = [ test = [ "freezegun", "pytest", + "pytest-cov", "responses" ] diff --git a/tests/test_actions.py b/tests/test_actions.py index 67874c92..482dc0ef 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -299,7 +299,9 @@ def test_export_study(tmp_path, mock_db_core): f"{Path(__file__).parent.parent}/cumulus_library/studies/core", data_path=f"{tmp_path}/export", ) - exporter.export_study(parser, mock_db_core, None, f"{tmp_path}/export", False) + exporter.export_study( + parser, mock_db_core, None, f"{tmp_path}/export", False, chunksize=20 + ) for file in Path(f"{tmp_path}/export").glob("*.*"): assert file in parser.get_export_table_list() diff --git a/tests/test_cli.py b/tests/test_cli.py index 4e7ed14d..a03a2f43 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -14,6 +14,7 @@ from pathlib import Path from unittest import mock +import pandas import pytest import responses import toml @@ -80,6 +81,16 @@ def test_cli_early_exit(args): does_not_raise(), "study_python_valid__table", ), + ( + ["build", "-t", "study_python_valid", "--continue", "module2"], + does_not_raise(), + "study_python_valid__table_2", + ), + ( + ["build", "-t", "study_bad_manifest"], + pytest.raises(errors.StudyManifestParsingError), + "study_python_valid__table_2", + ), (["build", "-t", "wrong"], pytest.raises(SystemExit), None), ( [ @@ -103,6 +114,7 @@ def test_cli_path_mapping(mock_load_json, mock_path, tmp_path, args, raises, exp "__desc__": "", "allowlist": { "study_python_valid": "study_python_valid", + "study_bad_manifest": "study_bad_manifest", }, } args = duckdb_args(args, tmp_path) @@ -223,7 +235,7 @@ def test_generate_md(mock_path, tmp_path): ) @mock.patch("sysconfig.get_path") @pytest.mark.parametrize( - "args,expected", + "args,expected,raises", [ ( [ @@ -232,6 +244,7 @@ def test_generate_md(mock_path, tmp_path): "core", ], "core__", + does_not_raise(), ), ( [ @@ -241,6 +254,7 @@ def test_generate_md(mock_path, tmp_path): "foo", ], "foo", + does_not_raise(), ), ( [ @@ -250,18 +264,27 @@ def test_generate_md(mock_path, tmp_path): "--statistics", ], "core__", + does_not_raise(), + ), + ( + [ + "clean", + ], + "core__", + pytest.raises(SystemExit), ), ], ) -def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-argument - mock_path.return_value = f"{Path(__file__).resolve().parents[0]}/test_data/" - cli.main(cli_args=duckdb_args(["build", "-t", "core"], tmp_path)) - with does_not_raise(): - 
with mock.patch.object(builtins, "input", lambda _: "y"): - cli.main(cli_args=duckdb_args(args, tmp_path)) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") - for table in db.cursor().execute("show tables").fetchall(): - assert expected not in table +def test_clean(mock_path, tmp_path, args, expected, raises): # pylint: disable=unused-argument + with raises: + mock_path.return_value = f"{Path(__file__).resolve().parents[0]}/test_data/" + cli.main(cli_args=duckdb_args(["build", "-t", "core"], tmp_path)) + with does_not_raise(): + with mock.patch.object(builtins, "input", lambda _: "y"): + cli.main(cli_args=duckdb_args(args, tmp_path)) + db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + for table in db.cursor().execute("show tables").fetchall(): + assert expected not in table @mock.patch.dict( @@ -269,9 +292,28 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a clear=True, ) @pytest.mark.parametrize( - "build_args,export_args,expected_tables", + "build_args,export_args,expected_tables,raises", [ - (["build", "-t", "core"], ["export", "-t", "core"], 59), + ( + ["build", "-t", "core"], + ["export", "-t", "core"], + 59, + does_not_raise(), + ), + ( + # checking that a study is loaded from a child directory + # of a user-defined path + [ + "build", + "-t", + "study_valid", + "-s", + "tests/test_data/", + ], + ["export", "-t", "study_valid", "-s", "tests/test_data/"], + 2, + does_not_raise(), + ), ( # checking that a study is loaded from a child directory # of a user-defined path @@ -284,8 +326,9 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a ], ["export", "-t", "study_valid", "-s", "tests/test_data/"], 2, + does_not_raise(), ), - (["build", "-t", "vocab"], None, 3), + (["build", "-t", "vocab"], None, 3, does_not_raise()), ( # checking that a study is loaded from the directory of a user-defined # path. 
we're also validating that the CLI accepts the statistics keyword @@ -299,11 +342,27 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a ], ["export", "-t", "study_valid", "-s", "tests/test_data/study_valid/"], 2, + does_not_raise(), + ), + ( + [ + "build", + "-t", + "study_valid", + "-s", + "tests/test_data/study_valid/", + "--statistics", + ], + ["export", "-t", "study_valid", "-s", "tests/test_data/study_valid/"], + 2, + does_not_raise(), ), ], ) -def test_cli_executes_queries(tmp_path, build_args, export_args, expected_tables): - with does_not_raise(): +def test_cli_executes_queries( + tmp_path, build_args, export_args, expected_tables, raises +): + with raises: build_args = duckdb_args(build_args, tmp_path) cli.main(cli_args=build_args) if export_args is not None: @@ -604,3 +663,28 @@ def test_cli_custom_args(mock_config, tmp_path, option, raises): ) called_options = mock_config.call_args[1]["options"] assert called_options[option.split(":")[0]] == option.split(":")[1] + + +@mock.patch.dict(os.environ, clear=True) +def test_cli_import_study(tmp_path): + test_data = {"string": ["a", "b", None]} + df = pandas.DataFrame(test_data) + (tmp_path / "archive").mkdir() + df.to_parquet(tmp_path / "archive/test__table.parquet") + df.to_csv(tmp_path / "archive/test__table.csv") + with zipfile.ZipFile(tmp_path / "archive/test.zip", "w") as archive: + archive.write(tmp_path / "archive/test__table.parquet") + archive.write(tmp_path / "archive/test__table.csv") + (tmp_path / "archive/test__table.parquet").unlink() + (tmp_path / "archive/test__table.csv").unlink() + + cli.main( + cli_args=duckdb_args( + [ + "import", + "-a", + str(tmp_path / "archive/test.zip"), + ], + tmp_path, + ) + ) diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv index 0128853f..4e278870 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv @@ -1,3 +1,3 @@ -cnt,category_code,recordedDate_month,code_display +"cnt","category_code","recordedDate_month","code_display" 15,,, -15,encounter-diagnosis,, +15,"encounter-diagnosis",, diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv index 42a04ba1..2257390e 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv @@ -1,19 +1,19 @@ -cnt,type_display,author_month,class_display +"cnt","type_display","author_month","class_display" 50,,, -50,Evaluation + Plan note,, -50,Emergency department note,, -46,,,ambulatory -46,Evaluation + Plan note,,ambulatory -46,Emergency department note,,ambulatory -26,,2018-07-01, -26,Evaluation + Plan note,2018-07-01, -26,Emergency department note,2018-07-01, -24,,2018-07-01,ambulatory -24,,2018-06-01, -24,Evaluation + Plan note,2018-07-01,ambulatory -24,Evaluation + Plan note,2018-06-01, -24,Emergency department note,2018-07-01,ambulatory -24,Emergency department note,2018-06-01, -22,,2018-06-01,ambulatory -22,Evaluation + Plan note,2018-06-01,ambulatory -22,Emergency department note,2018-06-01,ambulatory +50,"Evaluation + Plan note",, +50,"Emergency department note",, +46,,,"ambulatory" +46,"Evaluation + 
Plan note",,"ambulatory" +46,"Emergency department note",,"ambulatory" +26,,"2018-07-01", +26,"Evaluation + Plan note","2018-07-01", +26,"Emergency department note","2018-07-01", +24,,"2018-07-01","ambulatory" +24,,"2018-06-01", +24,"Evaluation + Plan note","2018-07-01","ambulatory" +24,"Evaluation + Plan note","2018-06-01", +24,"Emergency department note","2018-07-01","ambulatory" +24,"Emergency department note","2018-06-01", +22,,"2018-06-01","ambulatory" +22,"Evaluation + Plan note","2018-06-01","ambulatory" +22,"Emergency department note","2018-06-01","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv index d20a99fb..2ff56f2c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv @@ -1,9 +1,9 @@ -cnt,class_display,type_display,serviceType_display,priority_display +"cnt","class_display","type_display","serviceType_display","priority_display" 50,,,, -50,,,,cumulus__none -50,,,cumulus__none, -50,,,cumulus__none,cumulus__none -46,ambulatory,,, -46,ambulatory,,,cumulus__none -46,ambulatory,,cumulus__none, -46,ambulatory,,cumulus__none,cumulus__none \ No newline at end of file +50,,,,"cumulus__none" +50,,,"cumulus__none", +50,,,"cumulus__none","cumulus__none" +46,"ambulatory",,, +46,"ambulatory",,,"cumulus__none" +46,"ambulatory",,"cumulus__none", +46,"ambulatory",,"cumulus__none","cumulus__none" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv index 35640a26..324ac11c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv @@ -1,25 +1,25 @@ -cnt,class_display,type_display,serviceType_display,priority_display,period_start_month +"cnt","class_display","type_display","serviceType_display","priority_display","period_start_month" 50,,,,, -50,,,,cumulus__none, -50,,,cumulus__none,, -50,,,cumulus__none,cumulus__none, -46,ambulatory,,,, -46,ambulatory,,,cumulus__none, -46,ambulatory,,cumulus__none,, -46,ambulatory,,cumulus__none,cumulus__none, -26,,,,,2018-07-01 -26,,,,cumulus__none,2018-07-01 -26,,,cumulus__none,,2018-07-01 -26,,,cumulus__none,cumulus__none,2018-07-01 -24,,,,,2018-06-01 -24,,,,cumulus__none,2018-06-01 -24,,,cumulus__none,,2018-06-01 -24,,,cumulus__none,cumulus__none,2018-06-01 -24,ambulatory,,,,2018-07-01 -24,ambulatory,,,cumulus__none,2018-07-01 -24,ambulatory,,cumulus__none,,2018-07-01 -24,ambulatory,,cumulus__none,cumulus__none,2018-07-01 -22,ambulatory,,,,2018-06-01 -22,ambulatory,,,cumulus__none,2018-06-01 -22,ambulatory,,cumulus__none,,2018-06-01 -22,ambulatory,,cumulus__none,cumulus__none,2018-06-01 \ No newline at end of file +50,,,,"cumulus__none", +50,,,"cumulus__none",, +50,,,"cumulus__none","cumulus__none", +46,"ambulatory",,,, +46,"ambulatory",,,"cumulus__none", +46,"ambulatory",,"cumulus__none",, +46,"ambulatory",,"cumulus__none","cumulus__none", +26,,,,,"2018-07-01" +26,,,,"cumulus__none","2018-07-01" +26,,,"cumulus__none",,"2018-07-01" +26,,,"cumulus__none","cumulus__none","2018-07-01" +24,,,,,"2018-06-01" +24,,,,"cumulus__none","2018-06-01" +24,,,"cumulus__none",,"2018-06-01" 
+24,,,"cumulus__none","cumulus__none","2018-06-01" +24,"ambulatory",,,,"2018-07-01" +24,"ambulatory",,,"cumulus__none","2018-07-01" +24,"ambulatory",,"cumulus__none",,"2018-07-01" +24,"ambulatory",,"cumulus__none","cumulus__none","2018-07-01" +22,"ambulatory",,,,"2018-06-01" +22,"ambulatory",,,"cumulus__none","2018-06-01" +22,"ambulatory",,"cumulus__none",,"2018-06-01" +22,"ambulatory",,"cumulus__none","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv index 267d1133..5c8242e6 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv @@ -1,65 +1,65 @@ -cnt,period_start_month,class_display,age_at_visit,gender,race_display,ethnicity_display +"cnt","period_start_month","class_display","age_at_visit","gender","race_display","ethnicity_display" 50,,,,,, -47,,,,,white, -46,,ambulatory,,,, -45,,,,,,not hispanic or latino -43,,,,,white,not hispanic or latino -43,,ambulatory,,,white, -42,,ambulatory,,,,not hispanic or latino -40,,ambulatory,,,white,not hispanic or latino -29,,,,female,, -28,,,,female,white, -27,,,,female,,not hispanic or latino -27,,ambulatory,,female,, -26,,,,female,white,not hispanic or latino -26,,ambulatory,,female,white, -26,2018-07-01,,,,, -26,2018-07-01,,,,white, -25,,ambulatory,,female,,not hispanic or latino -24,,ambulatory,,female,white,not hispanic or latino -24,2018-07-01,,,,,not hispanic or latino -24,2018-07-01,,,,white,not hispanic or latino -24,2018-07-01,ambulatory,,,, -24,2018-07-01,ambulatory,,,white, -24,2018-06-01,,,,, -23,2018-07-01,ambulatory,,,,not hispanic or latino -23,2018-07-01,ambulatory,,,white,not hispanic or latino -22,2018-06-01,ambulatory,,,, -21,,,,male,, -21,2018-06-01,,,,,not hispanic or latino -21,2018-06-01,,,,white, -19,,,,male,white, -19,,ambulatory,,male,, -19,2018-06-01,,,,white,not hispanic or latino -19,2018-06-01,ambulatory,,,,not hispanic or latino -19,2018-06-01,ambulatory,,,white, -18,,,,male,,not hispanic or latino -17,,,,male,white,not hispanic or latino -17,,ambulatory,,male,,not hispanic or latino -17,,ambulatory,,male,white, -17,2018-06-01,ambulatory,,,white,not hispanic or latino -16,,ambulatory,,male,white,not hispanic or latino -15,2018-06-01,,,female,, -14,2018-07-01,,,female,, -14,2018-07-01,,,female,,not hispanic or latino -14,2018-07-01,,,female,white, -14,2018-07-01,,,female,white,not hispanic or latino -14,2018-06-01,,,female,white, -14,2018-06-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,,not hispanic or latino -13,2018-07-01,ambulatory,,female,white, -13,2018-07-01,ambulatory,,female,white,not hispanic or latino -13,2018-06-01,,,female,,not hispanic or latino -13,2018-06-01,ambulatory,,female,white, -12,2018-07-01,,,male,, -12,2018-07-01,,,male,white, -12,2018-06-01,,,female,white,not hispanic or latino -12,2018-06-01,ambulatory,,female,,not hispanic or latino -11,2018-07-01,ambulatory,,male,, -11,2018-07-01,ambulatory,,male,white, -11,2018-06-01,ambulatory,,female,white,not hispanic or latino -10,2018-07-01,,,male,,not hispanic or latino -10,2018-07-01,,,male,white,not hispanic or latino -10,2018-07-01,ambulatory,,male,,not hispanic or latino -10,2018-07-01,ambulatory,,male,white,not hispanic or latino +47,,,,,"white", +46,,"ambulatory",,,, +45,,,,,,"not hispanic or latino" +43,,,,,"white","not 
hispanic or latino" +43,,"ambulatory",,,"white", +42,,"ambulatory",,,,"not hispanic or latino" +40,,"ambulatory",,,"white","not hispanic or latino" +29,,,,"female",, +28,,,,"female","white", +27,,,,"female",,"not hispanic or latino" +27,,"ambulatory",,"female",, +26,,,,"female","white","not hispanic or latino" +26,,"ambulatory",,"female","white", +26,"2018-07-01",,,,, +26,"2018-07-01",,,,"white", +25,,"ambulatory",,"female",,"not hispanic or latino" +24,,"ambulatory",,"female","white","not hispanic or latino" +24,"2018-07-01",,,,,"not hispanic or latino" +24,"2018-07-01",,,,"white","not hispanic or latino" +24,"2018-07-01","ambulatory",,,, +24,"2018-07-01","ambulatory",,,"white", +24,"2018-06-01",,,,, +23,"2018-07-01","ambulatory",,,,"not hispanic or latino" +23,"2018-07-01","ambulatory",,,"white","not hispanic or latino" +22,"2018-06-01","ambulatory",,,, +21,,,,"male",, +21,"2018-06-01",,,,,"not hispanic or latino" +21,"2018-06-01",,,,"white", +19,,,,"male","white", +19,,"ambulatory",,"male",, +19,"2018-06-01",,,,"white","not hispanic or latino" +19,"2018-06-01","ambulatory",,,,"not hispanic or latino" +19,"2018-06-01","ambulatory",,,"white", +18,,,,"male",,"not hispanic or latino" +17,,,,"male","white","not hispanic or latino" +17,,"ambulatory",,"male",,"not hispanic or latino" +17,,"ambulatory",,"male","white", +17,"2018-06-01","ambulatory",,,"white","not hispanic or latino" +16,,"ambulatory",,"male","white","not hispanic or latino" +15,"2018-06-01",,,"female",, +14,"2018-07-01",,,"female",, +14,"2018-07-01",,,"female",,"not hispanic or latino" +14,"2018-07-01",,,"female","white", +14,"2018-07-01",,,"female","white","not hispanic or latino" +14,"2018-06-01",,,"female","white", +14,"2018-06-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",,"not hispanic or latino" +13,"2018-07-01","ambulatory",,"female","white", +13,"2018-07-01","ambulatory",,"female","white","not hispanic or latino" +13,"2018-06-01",,,"female",,"not hispanic or latino" +13,"2018-06-01","ambulatory",,"female","white", +12,"2018-07-01",,,"male",, +12,"2018-07-01",,,"male","white", +12,"2018-06-01",,,"female","white","not hispanic or latino" +12,"2018-06-01","ambulatory",,"female",,"not hispanic or latino" +11,"2018-07-01","ambulatory",,"male",, +11,"2018-07-01","ambulatory",,"male","white", +11,"2018-06-01","ambulatory",,"female","white","not hispanic or latino" +10,"2018-07-01",,,"male",,"not hispanic or latino" +10,"2018-07-01",,,"male","white","not hispanic or latino" +10,"2018-07-01","ambulatory",,"male",,"not hispanic or latino" +10,"2018-07-01","ambulatory",,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv index bc2cc2d6..2b16c7e2 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,priority_display,period_start_month +"cnt","class_display","priority_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 
+50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv index 1ef0dae9..2dc35115 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,serviceType_display,period_start_month +"cnt","class_display","serviceType_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 +50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv index bb5afe09..02621fa4 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv @@ -1,7 +1,7 @@ -cnt,class_display,type_display,period_start_month +"cnt","class_display","type_display","period_start_month" 50,,, -46,ambulatory,, -26,,,2018-07-01 -24,,,2018-06-01 -24,ambulatory,,2018-07-01 -22,ambulatory,,2018-06-01 +46,"ambulatory",, +26,,,"2018-07-01" +24,,,"2018-06-01" +24,"ambulatory",,"2018-07-01" +22,"ambulatory",,"2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv index 0c01d794..b0299974 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv @@ -1,13 +1,13 @@ -cnt,status,intent,authoredon_month,medication_display +"cnt","status","intent","authoredon_month","medication_display" 27,,,, -27,,order,, -26,stopped,,, -26,stopped,order,, -15,,,2018-07-01, -15,,order,2018-07-01, -15,stopped,,2018-07-01, -15,stopped,order,2018-07-01, -12,,,2018-06-01, -12,,order,2018-06-01, -11,stopped,,2018-06-01, -11,stopped,order,2018-06-01, +27,,"order",, +26,"stopped",,, +26,"stopped","order",, +15,,,"2018-07-01", +15,,"order","2018-07-01", +15,"stopped",,"2018-07-01", +15,"stopped","order","2018-07-01", +12,,,"2018-06-01", +12,,"order","2018-06-01", +11,"stopped",,"2018-06-01", +11,"stopped","order","2018-06-01", diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv 
b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv index 51b9e49e..5555c1ec 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv @@ -1,15 +1,15 @@ -cnt,effectiveDateTime_month,observation_code,valueCodeableConcept_display,class_display +"cnt","effectiveDateTime_month","observation_code","valueCodeableConcept_display","class_display" 20,,,, -20,,,,ambulatory -10,,,Urine smell ammoniacal (finding), -10,,,Urine smell ammoniacal (finding),ambulatory -10,,,Brown color (qualifier value), -10,,,Brown color (qualifier value),ambulatory -10,,5778-6,, -10,,5778-6,,ambulatory -10,,5778-6,Brown color (qualifier value), -10,,5778-6,Brown color (qualifier value),ambulatory -10,,34533-0,, -10,,34533-0,,ambulatory -10,,34533-0,Urine smell ammoniacal (finding), -10,,34533-0,Urine smell ammoniacal (finding),ambulatory +20,,,,"ambulatory" +10,,,"Urine smell ammoniacal (finding)", +10,,,"Urine smell ammoniacal (finding)","ambulatory" +10,,,"Brown color (qualifier value)", +10,,,"Brown color (qualifier value)","ambulatory" +10,,"5778-6",, +10,,"5778-6",,"ambulatory" +10,,"5778-6","Brown color (qualifier value)", +10,,"5778-6","Brown color (qualifier value)","ambulatory" +10,,"34533-0",, +10,,"34533-0",,"ambulatory" +10,,"34533-0","Urine smell ammoniacal (finding)", +10,,"34533-0","Urine smell ammoniacal (finding)","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv index d9a72050..c19cc543 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv @@ -1,13 +1,13 @@ -cnt,gender,race_display,ethnicity_display +"cnt","gender","race_display","ethnicity_display" 50,,, -47,,white, -45,,,not hispanic or latino -43,,white,not hispanic or latino -29,female,, -28,female,white, -27,female,,not hispanic or latino -26,female,white,not hispanic or latino -21,male,, -19,male,white, -18,male,,not hispanic or latino -17,male,white,not hispanic or latino +47,,"white", +45,,,"not hispanic or latino" +43,,"white","not hispanic or latino" +29,"female",, +28,"female","white", +27,"female",,"not hispanic or latino" +26,"female","white","not hispanic or latino" +21,"male",, +19,"male","white", +18,"male",,"not hispanic or latino" +17,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv index 6d2ad6b7..2e2c4f61 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv @@ -1,2 +1,2 @@ -min_date,max_date +"min_date","max_date" 2018-06-01,2018-07-31 diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv index 4d65fc4e..cbd90d88 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv @@ -1,2 +1,2 @@ -data_package_version +"data_package_version" 3 diff --git a/tests/test_data/study_bad_manifest/manifest.toml b/tests/test_data/study_bad_manifest/manifest.toml new file mode 100644 index 
00000000..d1cdd86c --- /dev/null +++ b/tests/test_data/study_bad_manifest/manifest.toml @@ -0,0 +1,4 @@ +study_prefix = "study_bad_manifest" + +[sql_config +file_names = ["test.sql"] diff --git a/tests/test_data/study_python_valid/module2.py b/tests/test_data/study_python_valid/module2.py index 70fafa14..9ba693c1 100644 --- a/tests/test_data/study_python_valid/module2.py +++ b/tests/test_data/study_python_valid/module2.py @@ -5,4 +5,6 @@ class ModuleTwoRunner(BaseTableBuilder): display_text = "module2" def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): - pass + self.queries.append( + "CREATE TABLE IF NOT EXISTS study_python_valid__table_2 (test int);" + ) diff --git a/tests/test_data/study_python_valid_generated.md b/tests/test_data/study_python_valid_generated.md index 00e40822..ee8857bc 100644 --- a/tests/test_data/study_python_valid_generated.md +++ b/tests/test_data/study_python_valid_generated.md @@ -17,3 +17,10 @@ |test |INTEGER| | +### study_python_valid__table_2 + +|Column| Type |Description| +|------|-------|-----------| +|test |INTEGER| | + + diff --git a/tests/test_databases.py b/tests/test_databases.py new file mode 100644 index 00000000..836c12d7 --- /dev/null +++ b/tests/test_databases.py @@ -0,0 +1,367 @@ +"""Low level database tests + +This is intended to exercise edge cases not covered via more integrated testing""" + +import datetime +import os +import pathlib +from contextlib import nullcontext as does_not_raise +from unittest import mock + +import pandas +import pyarrow +import pytest + +from cumulus_library import databases, errors + +ATHENA_KWARGS = { + "region": "test", + "work_group": "test", + "profile": "test", + "schema_name": "test", +} +DUCKDB_KWARGS = { + "db_file": ":memory:", +} + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + [], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame({"cat": pandas.Series(["a"], dtype="category")}), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_col_types_from_pandas(db, data, expected, raises): + with raises: + vals = db.col_parquet_types_from_pandas(data.dtypes) + assert set(expected) == set(vals) + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [ + ( + "a", + "varchar", + ), + ( + "b", + "bigint", + ), + ( + "c", + "integer", + ), + ( + "d", + "double", + ), + ( + "e", + "boolean", + ), + ( + "f", + "date", + ), + ("g", "timestamp"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.int64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [("a", "other_type")], + [], + 
pytest.raises(errors.CumulusLibraryError), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [ + ( + "a", + "STRING", + ), + ( + "b", + "INTEGER", + ), + ( + "c", + "NUMBER", + ), + ( + "d", + "DOUBLE", + ), + ( + "e", + "boolean", + ), + ( + "f", + "Date", + ), + ("g", "TIMESTAMP"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.float64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [("a", "other_type")], + [], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_pyarrow_types_from_sql(db, data, expected, raises): + with raises: + vals = db.col_pyarrow_types_from_sql(data) + assert len(expected) == len(vals) + for index in range(0, len(vals)): + assert vals[index][-1] == expected[index][-1] + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,expected_type, raises", + [ + ( + {**{"db_type": "duckdb", "schema_name": "test"}, **DUCKDB_KWARGS}, + databases.DuckDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena", "load_ndjson_dir": "file.json"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + pytest.raises(SystemExit), + ), + ( + # https://en.wikipedia.org/wiki/Cornerstone_(software) + {**{"db_type": "cornerstone", "schema_name": "test"}}, + None, + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_create_db_backend(args, expected_type, raises): + with raises: + db = databases.create_db_backend(args) + assert isinstance(db, expected_type) + + +def test_upload_file_default(): + db = databases.DuckDatabaseBackend(**DUCKDB_KWARGS) + location = db.upload_file( + file=pathlib.Path(__file__).resolve(), + study="test", + topic="table", + ) + assert location is None + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,sse,keycount,expected,raises", + [ + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE-S3", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + pytest.raises(errors.AWSError), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": True, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": "custom.name", + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ], +) +@mock.patch("botocore.client") 
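+# botocore's client creation is patched here, so no real AWS calls are made:
+# the canned get_work_group and list_objects_v2 responses configured in the
+# test body stand in for Athena and S3.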
+def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises): + mock_data = { + "WorkGroup": { + "Configuration": { + "ResultConfiguration": { + "OutputLocation": "s3://test_bucket/test_location/", + "EncryptionConfiguration": {"EncryptionOption": sse}, + } + } + } + } + mock_clientobj = mock_botocore.ClientCreator.return_value.create_client.return_value + mock_clientobj.get_work_group.return_value = mock_data + mock_clientobj.list_objects_v2.return_value = {"KeyCount": keycount} + db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS) + with raises: + location = db.upload_file(**args) + assert location == expected + if keycount == 0 or args["force_upload"]: + assert mock_clientobj.put_object.called + kwargs = mock_clientobj.put_object.call_args_list[0][1] + if args["remote_filename"]: + assert kwargs["Key"].endswith(args["remote_filename"]) + else: + assert kwargs["Key"].endswith(args["file"].name)
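
Taken together, these changes make export a streaming pipeline: execute_as_pandas
returns an iterator of DataFrame chunks plus the cursor description,
col_pyarrow_types_from_sql maps that description onto a pyarrow schema, and the
exporter appends each chunk through persistent parquet/CSV writers rather than
holding a whole table in memory. A minimal sketch of that flow against a DuckDB
backend follows; the table name, inserted rows, and output paths are
illustrative only, not taken from a real study.

    import pyarrow
    from pyarrow import csv, parquet

    from cumulus_library import databases

    # Illustrative in-memory database and table
    db = databases.DuckDatabaseBackend(":memory:")
    db.cursor().execute("CREATE TABLE demo__count (cnt INTEGER, category VARCHAR)")
    db.cursor().execute("INSERT INTO demo__count VALUES (10, 'a'), (5, NULL)")

    # One pass over the query: an iterator of DataFrame chunks, plus the
    # DB-API description used to build the pyarrow schema
    chunks, description = db.execute_as_pandas(
        "SELECT * FROM demo__count", chunksize=1000
    )
    schema = pyarrow.schema(db.col_pyarrow_types_from_sql(description))
    with parquet.ParquetWriter("demo__count.parquet", schema) as p_writer:
        with csv.CSVWriter(
            "demo__count.csv",
            schema,
            write_options=csv.WriteOptions(quoting_style="needed"),
        ) as c_writer:
            for chunk in chunks:
                # Each chunk is converted once and appended to both outputs
                table = pyarrow.Table.from_pandas(
                    chunk, schema=schema, preserve_index=False
                )
                p_writer.write(table)
                c_writer.write(table)

Because only one chunk is resident at a time, chunksize becomes an effective
upper bound on export memory, which is the point of the change.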