From 23f6bc9b7797feeebfb164be6123309b11a6b8c5 Mon Sep 17 00:00:00 2001 From: matt garber Date: Mon, 17 Jun 2024 11:19:09 -0400 Subject: [PATCH] Chunking sql export queries, coverage (#249) * Chunking SQL export queries * cleanup extra files, docstring tweaks --- .coveragerc | 2 + .github/workflows/ci.yaml | 16 +- .gitignore | 1 + cumulus_library/actions/exporter.py | 37 +- cumulus_library/cli.py | 98 ++--- cumulus_library/cli_parser.py | 1 + cumulus_library/databases.py | 116 ++++-- docs/first-time-setup.md | 3 - pyproject.toml | 1 + tests/test_actions.py | 4 +- tests/test_cli.py | 114 +++++- .../core/core__count_condition_month.csv | 4 +- .../core__count_documentreference_month.csv | 36 +- .../core/core__count_encounter_all_types.csv | 16 +- .../core__count_encounter_all_types_month.csv | 48 +-- .../core/core__count_encounter_month.csv | 128 +++--- .../core__count_encounter_priority_month.csv | 24 +- .../core__count_encounter_service_month.csv | 24 +- .../core/core__count_encounter_type_month.csv | 12 +- .../core__count_medicationrequest_month.csv | 24 +- .../core__count_observation_lab_month.csv | 28 +- .../core/core__count_patient.csv | 24 +- .../expected_export/core/core__meta_date.csv | 2 +- .../core/core__meta_version.csv | 2 +- .../study_bad_manifest/manifest.toml | 4 + tests/test_data/study_python_valid/module2.py | 4 +- .../test_data/study_python_valid_generated.md | 7 + tests/test_databases.py | 367 ++++++++++++++++++ 28 files changed, 831 insertions(+), 316 deletions(-) create mode 100644 .coveragerc create mode 100644 tests/test_data/study_bad_manifest/manifest.toml create mode 100644 tests/test_databases.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..1e6c3683 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit =cumulus_library/schema/* diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e3fe5e9..fe872d65 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -29,9 +29,23 @@ jobs: run: | python -m pip install --upgrade pip pip install ".[test]" + - name: Create mock AWS credentials + run: | + mkdir ~/.aws && touch ~/.aws/credentials + echo -e "[test]\naws_access_key_id = test\naws_secret_access_key = test" > ~/.aws/credentials - name: Test with pytest run: | - python -m pytest + python -m pytest --cov-report xml --cov=cumulus_library tests + - name: Generate coverage report + uses: orgoro/coverage@v3.1 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} + thresholdAll: .9 + thresholdNew: 1 + thresholdModified: .95 + + lint: runs-on: ubuntu-22.04 steps: diff --git a/.gitignore b/.gitignore index 705a1721..8e47ed3d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ output.sql *generated.md MRCONSO.RRF *.zip +coverage.xml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/cumulus_library/actions/exporter.py b/cumulus_library/actions/exporter.py index f28fa55f..9e854574 100644 --- a/cumulus_library/actions/exporter.py +++ b/cumulus_library/actions/exporter.py @@ -1,6 +1,7 @@ -import csv import pathlib +import pyarrow +from pyarrow import csv, parquet from rich.progress import track from cumulus_library import base_utils, databases, study_parser @@ -25,12 +26,25 @@ def reset_counts_exports( file.unlink() +def _write_chunk(writer, chunk, schema): + writer.write( + pyarrow.Table.from_pandas( + chunk.sort_values( + by=list(chunk.columns), ascending=False, na_position="first" + ), + preserve_index=False, + schema=schema, + ) + ) + + def export_study( 
 manifest_parser: study_parser.StudyManifestParser,
     db: databases.DatabaseBackend,
     schema_name: str,
     data_path: pathlib.Path,
     archive: bool,
+    chunksize: int = 1000000,
 ) -> list:
     """Exports csvs/parquet extracts of tables listed in export_list
     :param db: A database backend
@@ -56,13 +70,22 @@ def export_study(
         description=f"Exporting {manifest_parser.get_study_prefix()} data...",
     ):
         query = f"SELECT * FROM {table}"
-        dataframe = db.execute_as_pandas(query)
+        dataframe_chunks, db_schema = db.execute_as_pandas(query, chunksize=chunksize)
         path.mkdir(parents=True, exist_ok=True)
-        dataframe = dataframe.sort_values(
-            by=list(dataframe.columns), ascending=False, na_position="first"
-        )
-        dataframe.to_csv(f"{path}/{table}.csv", index=False, quoting=csv.QUOTE_MINIMAL)
-        dataframe.to_parquet(f"{path}/{table}.parquet", index=False)
+        schema = pyarrow.schema(db.col_pyarrow_types_from_sql(db_schema))
+        with parquet.ParquetWriter(f"{path}/{table}.parquet", schema) as p_writer:
+            with csv.CSVWriter(
+                f"{path}/{table}.csv",
+                schema,
+                write_options=csv.WriteOptions(
+                    # Note that this quoting style is not exactly csv.QUOTE_MINIMAL
+                    # https://github.com/apache/arrow/issues/42032
+                    quoting_style="needed"
+                ),
+            ) as c_writer:
+                for chunk in dataframe_chunks:
+                    _write_chunk(p_writer, chunk, schema)  # pragma: no cover
+                    _write_chunk(c_writer, chunk, schema)  # pragma: no cover
         queries.append(query)
     if archive:
         base_utils.zip_dir(path, data_path, manifest_parser.get_study_prefix())
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
index 5e951008..c34dc5d1 100755
--- a/cumulus_library/cli.py
+++ b/cumulus_library/cli.py
@@ -82,11 +82,12 @@ def clean_study(
     :param stats_clean: If true, removes previous stats runs
     :keyword prefix: If True, does a search by string prefix in place of study name
     """
-    if targets is None or targets == ["all"]:
+    if targets is None:
         sys.exit(
             "Explicit targets for cleaning not provided. "
             "Provide one or more explicit study prefixes to remove."
         )
+
     for target in targets:
         if prefix:
             manifest_parser = study_parser.StudyManifestParser()
@@ -207,25 +208,6 @@ def run_matching_table_builder(
             config=config,
         )
-    def clean_and_build_all(
-        self, study_dict: dict, config: base_utils.StudyConfig
-    ) -> None:
-        """Builds tables for all studies.
-
-        NOTE: By design, this method will always exclude the `template` study dir,
-        since 99% of the time you don't need a live copy in the database.
- - :param study_dict: A dict of paths - :param config: A StudyConfig object containing optional params - """ - study_dict = dict(study_dict) - study_dict.pop("template") - for precursor_study in ["vocab", "core"]: - self.clean_and_build_study(study_dict[precursor_study], config=config) - study_dict.pop(precursor_study) - for key in study_dict: - self.clean_and_build_study(study_dict[key], config=config) - ### Data exporters def export_study( self, target: pathlib.Path, data_path: pathlib.Path, archive: bool @@ -241,11 +223,6 @@ def export_study( manifest_parser, self.db, self.schema_name, data_path, archive ) - def export_all(self, study_dict: dict, data_path: pathlib.Path, archive: bool): - """Exports all defined count tables to disk""" - for key in study_dict.keys(): - self.export_study(study_dict[key], data_path, archive) - def generate_study_sql( self, target: pathlib.Path, @@ -296,24 +273,6 @@ def get_abs_path(path: str) -> pathlib.Path: return pathlib.Path(pathlib.Path.cwd(), path) -def create_template(path: str) -> None: - """Creates a manifest in target dir if one doesn't exist""" - abs_path = get_abs_path(path) - manifest_path = pathlib.Path(abs_path, "manifest.toml") - if manifest_path.exists(): - sys.exit(f"A manifest.toml already exists at {abs_path}, skipping creation") - abs_path.mkdir(parents=True, exist_ok=True) - - copy_lists = [ - ["studies/template/manifest.toml", "manifest.toml"], - [".sqlfluff", ".sqlfluff"], - ] - for source, dest in copy_lists: - source_path = pathlib.Path(pathlib.Path(__file__).resolve().parents[0], source) - dest_path = pathlib.Path(abs_path, dest) - dest_path.write_bytes(source_path.read_bytes()) - - def get_study_dict(alt_dir_paths: list) -> dict[str, pathlib.Path] | None: """Gets valid study targets from ./studies/, and any pip installed studies @@ -362,10 +321,8 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]: def run_cli(args: dict): """Controls which library tasks are run based on CLI arguments""" console = rich.console.Console() - if args["action"] == "create": - create_template(args["create_dir"]) - elif args["action"] == "upload": + if args["action"] == "upload": try: uploader.upload_files(args) except requests.RequestException as e: @@ -387,7 +344,7 @@ def run_cli(args: dict): runner.verbose = True console.print("[italic] Connecting to database...") runner.cursor.execute("SHOW DATABASES") - study_dict = get_study_dict(args["study_dir"]) + study_dict = get_study_dict(args.get("study_dir")) if "prefix" not in args.keys(): if args.get("target"): for target in args["target"]: @@ -406,19 +363,16 @@ def run_cli(args: dict): prefix=args["prefix"], ) elif args["action"] == "build": - if "all" in args["target"]: - runner.clean_and_build_all(study_dict, config=config) - else: - for target in args["target"]: - if args["builder"]: - runner.run_matching_table_builder( - study_dict[target], args["builder"], config=config - ) - else: - runner.clean_and_build_study( - study_dict[target], - config=config, - ) + for target in args["target"]: + if args["builder"]: + runner.run_matching_table_builder( + study_dict[target], args["builder"], config=config + ) + else: + runner.clean_and_build_study( + study_dict[target], + config=config, + ) elif args["action"] == "export": if args["archive"]: @@ -429,20 +383,17 @@ def run_cli(args: dict): "set[/italic], primarily dates, on a per patient level.\n\n" "[bold]By doing this, you are assuming the responsibility for " "meeting your organization's security requirements for " - 
"storing this data in a secure manager.[/bold]\n\n" + "storing this data in a secure manner.[/bold]\n\n" "Type Y to proceed, or any other value to quit.\n" ) console.print(warning_text) response = input() if response.lower() != "y": sys.exit() - if "all" in args["target"]: - runner.export_all(study_dict, args["data_path"], args["archive"]) - else: - for target in args["target"]: - runner.export_study( - study_dict[target], args["data_path"], args["archive"] - ) + for target in args["target"]: + runner.export_study( + study_dict[target], args["data_path"], args["archive"] + ) elif args["action"] == "import": for archive in args["archive_path"]: @@ -475,11 +426,6 @@ def main(cli_args=None): if args["action"] is None: parser.print_usage() sys.exit(1) - if args.get("target"): - for target in args["target"]: - if target == "all": - args["target"] = ["all"] - break arg_env_pairs = ( ("data_path", "CUMULUS_LIBRARY_DATA_PATH"), @@ -493,7 +439,7 @@ def main(cli_args=None): ("umls_key", "UMLS_API_KEY"), ("url", "CUMULUS_AGGREGATOR_URL"), ("user", "CUMULUS_AGGREGATOR_USER"), - ("workgroup", "CUMULUS_LIBRARY_WORKGROUP"), + ("work_group", "CUMULUS_LIBRARY_WORKGROUP"), ) read_env_vars = [] for pair in arg_env_pairs: @@ -541,8 +487,8 @@ def main(cli_args=None): def main_cli(): # called by the generated wrapper scripts - main() + main() # pragma: no cover if __name__ == "__main__": - main() + main() # pragma: no cover diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index f4f42aac..fa79aeb8 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -12,6 +12,7 @@ def add_aws_config(parser: argparse.ArgumentParser) -> None: aws.add_argument( "--workgroup", default="cumulus", + dest="work_group", help="Cumulus Athena workgroup (default: cumulus)", ) aws.add_argument( diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 9d0520c5..a6e48db7 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -9,6 +9,7 @@ """ import abc +import collections import datetime import json import os @@ -35,16 +36,16 @@ class DatabaseCursor(Protocol): """Protocol for a PEP-249 compatible cursor""" def execute(self, sql: str) -> None: - pass + pass # pragma: no cover def fetchone(self) -> list | None: - pass + pass # pragma: no cover def fetchmany(self, size: int | None) -> list[list] | None: - pass + pass # pragma: no cover def fetchall(self) -> list[list] | None: - pass + pass # pragma: no cover class DatabaseParser(abc.ABC): @@ -151,7 +152,9 @@ def pandas_cursor(self) -> DatabaseCursor: """ @abc.abstractmethod - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): """Returns a pandas.DataFrame version of the results from the provided SQL""" @abc.abstractmethod @@ -172,7 +175,7 @@ def operational_errors(self) -> tuple[Exception]: def col_parquet_types_from_pandas(self, field_types: list) -> list: """Returns appropriate types for creating tables based from parquet. - By default, returns the input (which assumes that the DB infers directly + By default, returns an empty list (which assumes that the DB infers directly from parquet data types). 
 Only override if your DB uses an explicit SerDe format, or otherwise
         needs a modified typing to inject directly into a query."""
@@ -196,9 +199,10 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
         #         raise errors.CumulusLibraryError(
         #             f"Unsupported type {type(field)} found."
         #         )
-        # return output
+        return []
-        return field_types
+    def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list:
+        return columns
     def upload_file(
         self,
@@ -257,14 +261,17 @@ def cursor(self) -> AthenaCursor:
     def pandas_cursor(self) -> AthenaPandasCursor:
         return self.connection.cursor(cursor=AthenaPandasCursor)
-    def execute_as_pandas(self, sql: str) -> pandas.DataFrame:
-        return self.pandas_cursor().execute(sql).as_pandas()
+    def execute_as_pandas(
+        self, sql: str, chunksize: int | None = None
+    ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]):
+        query = self.pandas_cursor().execute(sql, chunksize=chunksize)
+        return query.as_pandas(), query.description
     def parser(self) -> DatabaseParser:
         return AthenaParser()
     def operational_errors(self) -> tuple[Exception]:
-        return (pyathena.OperationalError,)
+        return (pyathena.OperationalError,)  # pragma: no cover
     def col_parquet_types_from_pandas(self, field_types: list) -> list:
         output = []
@@ -272,7 +279,10 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
             match field:
                 case numpy.dtypes.ObjectDType():
                     output.append("STRING")
-                case pandas.core.arrays.integer.Int64Dtype():
+                case (
+                    pandas.core.arrays.integer.Int64Dtype()
+                    | numpy.dtypes.Int64DType()
+                ):
                     output.append("INT")
                 case numpy.dtypes.Float64DType():
                     output.append("DOUBLE")
@@ -282,7 +292,31 @@ def col_parquet_types_from_pandas(self, field_types: list) -> list:
                     output.append("TIMESTAMP")
                 case _:
                     raise errors.CumulusLibraryError(
-                        f"Unsupported type {type(field)} found."
+                        f"Unsupported pandas type {type(field)} found."
                     )
         return output
+
+    def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list:
+        output = []
+        for column in columns:
+            match column[1]:
+                case "varchar":
+                    output.append((column[0], pyarrow.string()))
+                case "bigint":
+                    output.append((column[0], pyarrow.int64()))
+                case "integer":
+                    output.append((column[0], pyarrow.int64()))
+                case "double":
+                    output.append((column[0], pyarrow.float64()))
+                case "boolean":
+                    output.append((column[0], pyarrow.bool_()))
+                case "date":
+                    output.append((column[0], pyarrow.date64()))
+                case "timestamp":
+                    output.append((column[0], pyarrow.timestamp("s")))
+                case _:
+                    raise errors.CumulusLibraryError(
+                        f"Unsupported SQL type '{column}' found."
+                    )
+        return output
@@ -296,9 +330,8 @@ def upload_file(
         self,
         force_upload=False,
     ) -> str | None:
         # We'll investigate the connection to get the relevant S3 upload path.
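         # For example, an OutputLocation of s3://my-bucket/athena-results/ would
         # yield bucket "my-bucket" and key_prefix "athena-results/" below (an
         # illustrative value, not a real workgroup configuration).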
- wg_conf = self.connection._client.get_work_group(WorkGroup=self.work_group)[ - "WorkGroup" - ]["Configuration"]["ResultConfiguration"] + workgroup = self.connection._client.get_work_group(WorkGroup=self.work_group) + wg_conf = workgroup["WorkGroup"]["Configuration"]["ResultConfiguration"] s3_path = wg_conf["OutputLocation"] bucket = "/".join(s3_path.split("/")[2:3]) key_prefix = "/".join(s3_path.split("/")[3:]) @@ -315,7 +348,7 @@ def upload_file( f"{key_prefix}cumulus_user_uploads/{self.schema_name}/" f"{study}/{topic}" ) if not remote_filename: - remote_filename = file + remote_filename = file.name session = boto3.Session(profile_name=self.connection.profile_name) s3_client = session.client("s3") @@ -337,7 +370,7 @@ def upload_file( return f"s3://{bucket}/{s3_key}" def close(self) -> None: - return self.connection.close() + return self.connection.close() # pragma: no cover class AthenaParser(DatabaseParser): @@ -525,18 +558,47 @@ def pandas_cursor(self) -> duckdb.DuckDBPyConnection: # Since this is not provided, return the vanilla cursor return self.connection - def execute_as_pandas(self, sql: str) -> pandas.DataFrame: + def execute_as_pandas( + self, sql: str, chunksize: int | None = None + ) -> (pandas.DataFrame | collections.abc.Iterator[pandas.DataFrame], list[tuple]): # We call convert_dtypes here in case there are integer columns. # Pandas will normally cast nullable-int as a float type unless # we call this to convert to its nullable int column type. # PyAthena seems to do this correctly for us, but not DuckDB. - return self.connection.execute(sql).df().convert_dtypes() + result = self.connection.execute(sql) + if chunksize: + return iter([result.df().convert_dtypes()]), result.description + return result.df().convert_dtypes(), result.description + + def col_pyarrow_types_from_sql(self, columns: list[tuple]) -> list: + output = [] + for column in columns: + match column[1]: + case "STRING": + output.append((column[0], pyarrow.string())) + case "INTEGER": + output.append((column[0], pyarrow.int64())) + case "NUMBER": + output.append((column[0], pyarrow.float64())) + case "DOUBLE": + output.append((column[0], pyarrow.float64())) + case "boolean" | "bool": + output.append((column[0], pyarrow.bool_())) + case "Date": + output.append((column[0], pyarrow.date64())) + case "TIMESTAMP" | "DATETIME": + output.append((column[0], pyarrow.timestamp("s"))) + case _: + raise errors.CumulusLibraryError( + f"{column[0],column[1]} does not have a conversion type" + ) + return output def parser(self) -> DatabaseParser: return DuckDbParser() def operational_errors(self) -> tuple[Exception]: - return (duckdb.OperationalError,) + return (duckdb.OperationalError,) # pragma: no cover def close(self) -> None: self.connection.close() @@ -652,23 +714,25 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]: def create_db_backend(args: dict[str, str]) -> DatabaseBackend: db_config.db_type = args["db_type"] - database = args["schema_name"] + schema = args["schema_name"] load_ndjson_dir = args.get("load_ndjson_dir") if db_config.db_type == "duckdb": - backend = DuckDatabaseBackend(database) # `database` is path name in this case + backend = DuckDatabaseBackend(schema) # `schema` is path name in this case if load_ndjson_dir: backend.insert_tables(read_ndjson_dir(load_ndjson_dir)) elif db_config.db_type == "athena": backend = AthenaDatabaseBackend( args["region"], - args["workgroup"], + args["work_group"], args["profile"], - database, + schema, ) if load_ndjson_dir: sys.exit("Loading an ndjson dir is 
not supported with --db-type=athena.") else: - raise ValueError(f"Unexpected --db-type value '{db_config.db_type}'") + raise errors.CumulusLibraryError( + f"'{db_config.db_type}' is not a supported database." + ) return backend diff --git a/docs/first-time-setup.md b/docs/first-time-setup.md index 0746c2c1..0197ed18 100644 --- a/docs/first-time-setup.md +++ b/docs/first-time-setup.md @@ -28,9 +28,6 @@ You can install directly from pypi by running: Installing adds a `cumulus-library` command for interacting with Athena. It provides several actions for users: -- `create` will create a manifest file for you so you can start working on -authoring queries (more information on this in -[Creating studies](./creating-studies.md)). - `build` will create new study tables, replacing previously created versions (more information on this in [Creating studies](./creating-studies.md)). - `clean` will remove studies from Athena, in case you no longer need them diff --git a/pyproject.toml b/pyproject.toml index 46c12dce..154c97dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dev = [ test = [ "freezegun", "pytest", + "pytest-cov", "responses" ] diff --git a/tests/test_actions.py b/tests/test_actions.py index 67874c92..482dc0ef 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -299,7 +299,9 @@ def test_export_study(tmp_path, mock_db_core): f"{Path(__file__).parent.parent}/cumulus_library/studies/core", data_path=f"{tmp_path}/export", ) - exporter.export_study(parser, mock_db_core, None, f"{tmp_path}/export", False) + exporter.export_study( + parser, mock_db_core, None, f"{tmp_path}/export", False, chunksize=20 + ) for file in Path(f"{tmp_path}/export").glob("*.*"): assert file in parser.get_export_table_list() diff --git a/tests/test_cli.py b/tests/test_cli.py index 4e7ed14d..a03a2f43 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -14,6 +14,7 @@ from pathlib import Path from unittest import mock +import pandas import pytest import responses import toml @@ -80,6 +81,16 @@ def test_cli_early_exit(args): does_not_raise(), "study_python_valid__table", ), + ( + ["build", "-t", "study_python_valid", "--continue", "module2"], + does_not_raise(), + "study_python_valid__table_2", + ), + ( + ["build", "-t", "study_bad_manifest"], + pytest.raises(errors.StudyManifestParsingError), + "study_python_valid__table_2", + ), (["build", "-t", "wrong"], pytest.raises(SystemExit), None), ( [ @@ -103,6 +114,7 @@ def test_cli_path_mapping(mock_load_json, mock_path, tmp_path, args, raises, exp "__desc__": "", "allowlist": { "study_python_valid": "study_python_valid", + "study_bad_manifest": "study_bad_manifest", }, } args = duckdb_args(args, tmp_path) @@ -223,7 +235,7 @@ def test_generate_md(mock_path, tmp_path): ) @mock.patch("sysconfig.get_path") @pytest.mark.parametrize( - "args,expected", + "args,expected,raises", [ ( [ @@ -232,6 +244,7 @@ def test_generate_md(mock_path, tmp_path): "core", ], "core__", + does_not_raise(), ), ( [ @@ -241,6 +254,7 @@ def test_generate_md(mock_path, tmp_path): "foo", ], "foo", + does_not_raise(), ), ( [ @@ -250,18 +264,27 @@ def test_generate_md(mock_path, tmp_path): "--statistics", ], "core__", + does_not_raise(), + ), + ( + [ + "clean", + ], + "core__", + pytest.raises(SystemExit), ), ], ) -def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-argument - mock_path.return_value = f"{Path(__file__).resolve().parents[0]}/test_data/" - cli.main(cli_args=duckdb_args(["build", "-t", "core"], tmp_path)) - with does_not_raise(): - 
with mock.patch.object(builtins, "input", lambda _: "y"): - cli.main(cli_args=duckdb_args(args, tmp_path)) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") - for table in db.cursor().execute("show tables").fetchall(): - assert expected not in table +def test_clean(mock_path, tmp_path, args, expected, raises): # pylint: disable=unused-argument + with raises: + mock_path.return_value = f"{Path(__file__).resolve().parents[0]}/test_data/" + cli.main(cli_args=duckdb_args(["build", "-t", "core"], tmp_path)) + with does_not_raise(): + with mock.patch.object(builtins, "input", lambda _: "y"): + cli.main(cli_args=duckdb_args(args, tmp_path)) + db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + for table in db.cursor().execute("show tables").fetchall(): + assert expected not in table @mock.patch.dict( @@ -269,9 +292,28 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a clear=True, ) @pytest.mark.parametrize( - "build_args,export_args,expected_tables", + "build_args,export_args,expected_tables,raises", [ - (["build", "-t", "core"], ["export", "-t", "core"], 59), + ( + ["build", "-t", "core"], + ["export", "-t", "core"], + 59, + does_not_raise(), + ), + ( + # checking that a study is loaded from a child directory + # of a user-defined path + [ + "build", + "-t", + "study_valid", + "-s", + "tests/test_data/", + ], + ["export", "-t", "study_valid", "-s", "tests/test_data/"], + 2, + does_not_raise(), + ), ( # checking that a study is loaded from a child directory # of a user-defined path @@ -284,8 +326,9 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a ], ["export", "-t", "study_valid", "-s", "tests/test_data/"], 2, + does_not_raise(), ), - (["build", "-t", "vocab"], None, 3), + (["build", "-t", "vocab"], None, 3, does_not_raise()), ( # checking that a study is loaded from the directory of a user-defined # path. 
we're also validating that the CLI accepts the statistics keyword @@ -299,11 +342,27 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a ], ["export", "-t", "study_valid", "-s", "tests/test_data/study_valid/"], 2, + does_not_raise(), + ), + ( + [ + "build", + "-t", + "study_valid", + "-s", + "tests/test_data/study_valid/", + "--statistics", + ], + ["export", "-t", "study_valid", "-s", "tests/test_data/study_valid/"], + 2, + does_not_raise(), ), ], ) -def test_cli_executes_queries(tmp_path, build_args, export_args, expected_tables): - with does_not_raise(): +def test_cli_executes_queries( + tmp_path, build_args, export_args, expected_tables, raises +): + with raises: build_args = duckdb_args(build_args, tmp_path) cli.main(cli_args=build_args) if export_args is not None: @@ -604,3 +663,28 @@ def test_cli_custom_args(mock_config, tmp_path, option, raises): ) called_options = mock_config.call_args[1]["options"] assert called_options[option.split(":")[0]] == option.split(":")[1] + + +@mock.patch.dict(os.environ, clear=True) +def test_cli_import_study(tmp_path): + test_data = {"string": ["a", "b", None]} + df = pandas.DataFrame(test_data) + (tmp_path / "archive").mkdir() + df.to_parquet(tmp_path / "archive/test__table.parquet") + df.to_csv(tmp_path / "archive/test__table.csv") + with zipfile.ZipFile(tmp_path / "archive/test.zip", "w") as archive: + archive.write(tmp_path / "archive/test__table.parquet") + archive.write(tmp_path / "archive/test__table.csv") + (tmp_path / "archive/test__table.parquet").unlink() + (tmp_path / "archive/test__table.csv").unlink() + + cli.main( + cli_args=duckdb_args( + [ + "import", + "-a", + str(tmp_path / "archive/test.zip"), + ], + tmp_path, + ) + ) diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv index 0128853f..4e278870 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv @@ -1,3 +1,3 @@ -cnt,category_code,recordedDate_month,code_display +"cnt","category_code","recordedDate_month","code_display" 15,,, -15,encounter-diagnosis,, +15,"encounter-diagnosis",, diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv index 42a04ba1..2257390e 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_documentreference_month.csv @@ -1,19 +1,19 @@ -cnt,type_display,author_month,class_display +"cnt","type_display","author_month","class_display" 50,,, -50,Evaluation + Plan note,, -50,Emergency department note,, -46,,,ambulatory -46,Evaluation + Plan note,,ambulatory -46,Emergency department note,,ambulatory -26,,2018-07-01, -26,Evaluation + Plan note,2018-07-01, -26,Emergency department note,2018-07-01, -24,,2018-07-01,ambulatory -24,,2018-06-01, -24,Evaluation + Plan note,2018-07-01,ambulatory -24,Evaluation + Plan note,2018-06-01, -24,Emergency department note,2018-07-01,ambulatory -24,Emergency department note,2018-06-01, -22,,2018-06-01,ambulatory -22,Evaluation + Plan note,2018-06-01,ambulatory -22,Emergency department note,2018-06-01,ambulatory +50,"Evaluation + Plan note",, +50,"Emergency department note",, +46,,,"ambulatory" +46,"Evaluation + 
Plan note",,"ambulatory" +46,"Emergency department note",,"ambulatory" +26,,"2018-07-01", +26,"Evaluation + Plan note","2018-07-01", +26,"Emergency department note","2018-07-01", +24,,"2018-07-01","ambulatory" +24,,"2018-06-01", +24,"Evaluation + Plan note","2018-07-01","ambulatory" +24,"Evaluation + Plan note","2018-06-01", +24,"Emergency department note","2018-07-01","ambulatory" +24,"Emergency department note","2018-06-01", +22,,"2018-06-01","ambulatory" +22,"Evaluation + Plan note","2018-06-01","ambulatory" +22,"Emergency department note","2018-06-01","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv index d20a99fb..2ff56f2c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types.csv @@ -1,9 +1,9 @@ -cnt,class_display,type_display,serviceType_display,priority_display +"cnt","class_display","type_display","serviceType_display","priority_display" 50,,,, -50,,,,cumulus__none -50,,,cumulus__none, -50,,,cumulus__none,cumulus__none -46,ambulatory,,, -46,ambulatory,,,cumulus__none -46,ambulatory,,cumulus__none, -46,ambulatory,,cumulus__none,cumulus__none \ No newline at end of file +50,,,,"cumulus__none" +50,,,"cumulus__none", +50,,,"cumulus__none","cumulus__none" +46,"ambulatory",,, +46,"ambulatory",,,"cumulus__none" +46,"ambulatory",,"cumulus__none", +46,"ambulatory",,"cumulus__none","cumulus__none" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv index 35640a26..324ac11c 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_all_types_month.csv @@ -1,25 +1,25 @@ -cnt,class_display,type_display,serviceType_display,priority_display,period_start_month +"cnt","class_display","type_display","serviceType_display","priority_display","period_start_month" 50,,,,, -50,,,,cumulus__none, -50,,,cumulus__none,, -50,,,cumulus__none,cumulus__none, -46,ambulatory,,,, -46,ambulatory,,,cumulus__none, -46,ambulatory,,cumulus__none,, -46,ambulatory,,cumulus__none,cumulus__none, -26,,,,,2018-07-01 -26,,,,cumulus__none,2018-07-01 -26,,,cumulus__none,,2018-07-01 -26,,,cumulus__none,cumulus__none,2018-07-01 -24,,,,,2018-06-01 -24,,,,cumulus__none,2018-06-01 -24,,,cumulus__none,,2018-06-01 -24,,,cumulus__none,cumulus__none,2018-06-01 -24,ambulatory,,,,2018-07-01 -24,ambulatory,,,cumulus__none,2018-07-01 -24,ambulatory,,cumulus__none,,2018-07-01 -24,ambulatory,,cumulus__none,cumulus__none,2018-07-01 -22,ambulatory,,,,2018-06-01 -22,ambulatory,,,cumulus__none,2018-06-01 -22,ambulatory,,cumulus__none,,2018-06-01 -22,ambulatory,,cumulus__none,cumulus__none,2018-06-01 \ No newline at end of file +50,,,,"cumulus__none", +50,,,"cumulus__none",, +50,,,"cumulus__none","cumulus__none", +46,"ambulatory",,,, +46,"ambulatory",,,"cumulus__none", +46,"ambulatory",,"cumulus__none",, +46,"ambulatory",,"cumulus__none","cumulus__none", +26,,,,,"2018-07-01" +26,,,,"cumulus__none","2018-07-01" +26,,,"cumulus__none",,"2018-07-01" +26,,,"cumulus__none","cumulus__none","2018-07-01" +24,,,,,"2018-06-01" +24,,,,"cumulus__none","2018-06-01" +24,,,"cumulus__none",,"2018-06-01" 
+24,,,"cumulus__none","cumulus__none","2018-06-01" +24,"ambulatory",,,,"2018-07-01" +24,"ambulatory",,,"cumulus__none","2018-07-01" +24,"ambulatory",,"cumulus__none",,"2018-07-01" +24,"ambulatory",,"cumulus__none","cumulus__none","2018-07-01" +22,"ambulatory",,,,"2018-06-01" +22,"ambulatory",,,"cumulus__none","2018-06-01" +22,"ambulatory",,"cumulus__none",,"2018-06-01" +22,"ambulatory",,"cumulus__none","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv index 267d1133..5c8242e6 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_month.csv @@ -1,65 +1,65 @@ -cnt,period_start_month,class_display,age_at_visit,gender,race_display,ethnicity_display +"cnt","period_start_month","class_display","age_at_visit","gender","race_display","ethnicity_display" 50,,,,,, -47,,,,,white, -46,,ambulatory,,,, -45,,,,,,not hispanic or latino -43,,,,,white,not hispanic or latino -43,,ambulatory,,,white, -42,,ambulatory,,,,not hispanic or latino -40,,ambulatory,,,white,not hispanic or latino -29,,,,female,, -28,,,,female,white, -27,,,,female,,not hispanic or latino -27,,ambulatory,,female,, -26,,,,female,white,not hispanic or latino -26,,ambulatory,,female,white, -26,2018-07-01,,,,, -26,2018-07-01,,,,white, -25,,ambulatory,,female,,not hispanic or latino -24,,ambulatory,,female,white,not hispanic or latino -24,2018-07-01,,,,,not hispanic or latino -24,2018-07-01,,,,white,not hispanic or latino -24,2018-07-01,ambulatory,,,, -24,2018-07-01,ambulatory,,,white, -24,2018-06-01,,,,, -23,2018-07-01,ambulatory,,,,not hispanic or latino -23,2018-07-01,ambulatory,,,white,not hispanic or latino -22,2018-06-01,ambulatory,,,, -21,,,,male,, -21,2018-06-01,,,,,not hispanic or latino -21,2018-06-01,,,,white, -19,,,,male,white, -19,,ambulatory,,male,, -19,2018-06-01,,,,white,not hispanic or latino -19,2018-06-01,ambulatory,,,,not hispanic or latino -19,2018-06-01,ambulatory,,,white, -18,,,,male,,not hispanic or latino -17,,,,male,white,not hispanic or latino -17,,ambulatory,,male,,not hispanic or latino -17,,ambulatory,,male,white, -17,2018-06-01,ambulatory,,,white,not hispanic or latino -16,,ambulatory,,male,white,not hispanic or latino -15,2018-06-01,,,female,, -14,2018-07-01,,,female,, -14,2018-07-01,,,female,,not hispanic or latino -14,2018-07-01,,,female,white, -14,2018-07-01,,,female,white,not hispanic or latino -14,2018-06-01,,,female,white, -14,2018-06-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,, -13,2018-07-01,ambulatory,,female,,not hispanic or latino -13,2018-07-01,ambulatory,,female,white, -13,2018-07-01,ambulatory,,female,white,not hispanic or latino -13,2018-06-01,,,female,,not hispanic or latino -13,2018-06-01,ambulatory,,female,white, -12,2018-07-01,,,male,, -12,2018-07-01,,,male,white, -12,2018-06-01,,,female,white,not hispanic or latino -12,2018-06-01,ambulatory,,female,,not hispanic or latino -11,2018-07-01,ambulatory,,male,, -11,2018-07-01,ambulatory,,male,white, -11,2018-06-01,ambulatory,,female,white,not hispanic or latino -10,2018-07-01,,,male,,not hispanic or latino -10,2018-07-01,,,male,white,not hispanic or latino -10,2018-07-01,ambulatory,,male,,not hispanic or latino -10,2018-07-01,ambulatory,,male,white,not hispanic or latino +47,,,,,"white", +46,,"ambulatory",,,, +45,,,,,,"not hispanic or latino" +43,,,,,"white","not 
hispanic or latino" +43,,"ambulatory",,,"white", +42,,"ambulatory",,,,"not hispanic or latino" +40,,"ambulatory",,,"white","not hispanic or latino" +29,,,,"female",, +28,,,,"female","white", +27,,,,"female",,"not hispanic or latino" +27,,"ambulatory",,"female",, +26,,,,"female","white","not hispanic or latino" +26,,"ambulatory",,"female","white", +26,"2018-07-01",,,,, +26,"2018-07-01",,,,"white", +25,,"ambulatory",,"female",,"not hispanic or latino" +24,,"ambulatory",,"female","white","not hispanic or latino" +24,"2018-07-01",,,,,"not hispanic or latino" +24,"2018-07-01",,,,"white","not hispanic or latino" +24,"2018-07-01","ambulatory",,,, +24,"2018-07-01","ambulatory",,,"white", +24,"2018-06-01",,,,, +23,"2018-07-01","ambulatory",,,,"not hispanic or latino" +23,"2018-07-01","ambulatory",,,"white","not hispanic or latino" +22,"2018-06-01","ambulatory",,,, +21,,,,"male",, +21,"2018-06-01",,,,,"not hispanic or latino" +21,"2018-06-01",,,,"white", +19,,,,"male","white", +19,,"ambulatory",,"male",, +19,"2018-06-01",,,,"white","not hispanic or latino" +19,"2018-06-01","ambulatory",,,,"not hispanic or latino" +19,"2018-06-01","ambulatory",,,"white", +18,,,,"male",,"not hispanic or latino" +17,,,,"male","white","not hispanic or latino" +17,,"ambulatory",,"male",,"not hispanic or latino" +17,,"ambulatory",,"male","white", +17,"2018-06-01","ambulatory",,,"white","not hispanic or latino" +16,,"ambulatory",,"male","white","not hispanic or latino" +15,"2018-06-01",,,"female",, +14,"2018-07-01",,,"female",, +14,"2018-07-01",,,"female",,"not hispanic or latino" +14,"2018-07-01",,,"female","white", +14,"2018-07-01",,,"female","white","not hispanic or latino" +14,"2018-06-01",,,"female","white", +14,"2018-06-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",, +13,"2018-07-01","ambulatory",,"female",,"not hispanic or latino" +13,"2018-07-01","ambulatory",,"female","white", +13,"2018-07-01","ambulatory",,"female","white","not hispanic or latino" +13,"2018-06-01",,,"female",,"not hispanic or latino" +13,"2018-06-01","ambulatory",,"female","white", +12,"2018-07-01",,,"male",, +12,"2018-07-01",,,"male","white", +12,"2018-06-01",,,"female","white","not hispanic or latino" +12,"2018-06-01","ambulatory",,"female",,"not hispanic or latino" +11,"2018-07-01","ambulatory",,"male",, +11,"2018-07-01","ambulatory",,"male","white", +11,"2018-06-01","ambulatory",,"female","white","not hispanic or latino" +10,"2018-07-01",,,"male",,"not hispanic or latino" +10,"2018-07-01",,,"male","white","not hispanic or latino" +10,"2018-07-01","ambulatory",,"male",,"not hispanic or latino" +10,"2018-07-01","ambulatory",,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv index bc2cc2d6..2b16c7e2 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_priority_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,priority_display,period_start_month +"cnt","class_display","priority_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 
+50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv index 1ef0dae9..2dc35115 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_service_month.csv @@ -1,13 +1,13 @@ -cnt,class_display,serviceType_display,period_start_month +"cnt","class_display","serviceType_display","period_start_month" 50,,, -50,,cumulus__none, -46,ambulatory,, -46,ambulatory,cumulus__none, -26,,,2018-07-01 -26,,cumulus__none,2018-07-01 -24,,,2018-06-01 -24,,cumulus__none,2018-06-01 -24,ambulatory,,2018-07-01 -24,ambulatory,cumulus__none,2018-07-01 -22,ambulatory,,2018-06-01 -22,ambulatory,cumulus__none,2018-06-01 +50,,"cumulus__none", +46,"ambulatory",, +46,"ambulatory","cumulus__none", +26,,,"2018-07-01" +26,,"cumulus__none","2018-07-01" +24,,,"2018-06-01" +24,,"cumulus__none","2018-06-01" +24,"ambulatory",,"2018-07-01" +24,"ambulatory","cumulus__none","2018-07-01" +22,"ambulatory",,"2018-06-01" +22,"ambulatory","cumulus__none","2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv index bb5afe09..02621fa4 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_encounter_type_month.csv @@ -1,7 +1,7 @@ -cnt,class_display,type_display,period_start_month +"cnt","class_display","type_display","period_start_month" 50,,, -46,ambulatory,, -26,,,2018-07-01 -24,,,2018-06-01 -24,ambulatory,,2018-07-01 -22,ambulatory,,2018-06-01 +46,"ambulatory",, +26,,,"2018-07-01" +24,,,"2018-06-01" +24,"ambulatory",,"2018-07-01" +22,"ambulatory",,"2018-06-01" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv index 0c01d794..b0299974 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_medicationrequest_month.csv @@ -1,13 +1,13 @@ -cnt,status,intent,authoredon_month,medication_display +"cnt","status","intent","authoredon_month","medication_display" 27,,,, -27,,order,, -26,stopped,,, -26,stopped,order,, -15,,,2018-07-01, -15,,order,2018-07-01, -15,stopped,,2018-07-01, -15,stopped,order,2018-07-01, -12,,,2018-06-01, -12,,order,2018-06-01, -11,stopped,,2018-06-01, -11,stopped,order,2018-06-01, +27,,"order",, +26,"stopped",,, +26,"stopped","order",, +15,,,"2018-07-01", +15,,"order","2018-07-01", +15,"stopped",,"2018-07-01", +15,"stopped","order","2018-07-01", +12,,,"2018-06-01", +12,,"order","2018-06-01", +11,"stopped",,"2018-06-01", +11,"stopped","order","2018-06-01", diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv 
b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv index 51b9e49e..5555c1ec 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_observation_lab_month.csv @@ -1,15 +1,15 @@ -cnt,effectiveDateTime_month,observation_code,valueCodeableConcept_display,class_display +"cnt","effectiveDateTime_month","observation_code","valueCodeableConcept_display","class_display" 20,,,, -20,,,,ambulatory -10,,,Urine smell ammoniacal (finding), -10,,,Urine smell ammoniacal (finding),ambulatory -10,,,Brown color (qualifier value), -10,,,Brown color (qualifier value),ambulatory -10,,5778-6,, -10,,5778-6,,ambulatory -10,,5778-6,Brown color (qualifier value), -10,,5778-6,Brown color (qualifier value),ambulatory -10,,34533-0,, -10,,34533-0,,ambulatory -10,,34533-0,Urine smell ammoniacal (finding), -10,,34533-0,Urine smell ammoniacal (finding),ambulatory +20,,,,"ambulatory" +10,,,"Urine smell ammoniacal (finding)", +10,,,"Urine smell ammoniacal (finding)","ambulatory" +10,,,"Brown color (qualifier value)", +10,,,"Brown color (qualifier value)","ambulatory" +10,,"5778-6",, +10,,"5778-6",,"ambulatory" +10,,"5778-6","Brown color (qualifier value)", +10,,"5778-6","Brown color (qualifier value)","ambulatory" +10,,"34533-0",, +10,,"34533-0",,"ambulatory" +10,,"34533-0","Urine smell ammoniacal (finding)", +10,,"34533-0","Urine smell ammoniacal (finding)","ambulatory" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv index d9a72050..c19cc543 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_patient.csv @@ -1,13 +1,13 @@ -cnt,gender,race_display,ethnicity_display +"cnt","gender","race_display","ethnicity_display" 50,,, -47,,white, -45,,,not hispanic or latino -43,,white,not hispanic or latino -29,female,, -28,female,white, -27,female,,not hispanic or latino -26,female,white,not hispanic or latino -21,male,, -19,male,white, -18,male,,not hispanic or latino -17,male,white,not hispanic or latino +47,,"white", +45,,,"not hispanic or latino" +43,,"white","not hispanic or latino" +29,"female",, +28,"female","white", +27,"female",,"not hispanic or latino" +26,"female","white","not hispanic or latino" +21,"male",, +19,"male","white", +18,"male",,"not hispanic or latino" +17,"male","white","not hispanic or latino" diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv index 6d2ad6b7..2e2c4f61 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_date.csv @@ -1,2 +1,2 @@ -min_date,max_date +"min_date","max_date" 2018-06-01,2018-07-31 diff --git a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv index 4d65fc4e..cbd90d88 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__meta_version.csv @@ -1,2 +1,2 @@ -data_package_version +"data_package_version" 3 diff --git a/tests/test_data/study_bad_manifest/manifest.toml b/tests/test_data/study_bad_manifest/manifest.toml new file mode 100644 index 
00000000..d1cdd86c --- /dev/null +++ b/tests/test_data/study_bad_manifest/manifest.toml @@ -0,0 +1,4 @@ +study_prefix = "study_bad_manifest" + +[sql_config +file_names = ["test.sql"] diff --git a/tests/test_data/study_python_valid/module2.py b/tests/test_data/study_python_valid/module2.py index 70fafa14..9ba693c1 100644 --- a/tests/test_data/study_python_valid/module2.py +++ b/tests/test_data/study_python_valid/module2.py @@ -5,4 +5,6 @@ class ModuleTwoRunner(BaseTableBuilder): display_text = "module2" def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): - pass + self.queries.append( + "CREATE TABLE IF NOT EXISTS study_python_valid__table_2 (test int);" + ) diff --git a/tests/test_data/study_python_valid_generated.md b/tests/test_data/study_python_valid_generated.md index 00e40822..ee8857bc 100644 --- a/tests/test_data/study_python_valid_generated.md +++ b/tests/test_data/study_python_valid_generated.md @@ -17,3 +17,10 @@ |test |INTEGER| | +### study_python_valid__table_2 + +|Column| Type |Description| +|------|-------|-----------| +|test |INTEGER| | + + diff --git a/tests/test_databases.py b/tests/test_databases.py new file mode 100644 index 00000000..836c12d7 --- /dev/null +++ b/tests/test_databases.py @@ -0,0 +1,367 @@ +"""Low level database tests + +This is intended to exercise edge cases not covered via more integrated testing""" + +import datetime +import os +import pathlib +from contextlib import nullcontext as does_not_raise +from unittest import mock + +import pandas +import pyarrow +import pytest + +from cumulus_library import databases, errors + +ATHENA_KWARGS = { + "region": "test", + "work_group": "test", + "profile": "test", + "schema_name": "test", +} +DUCKDB_KWARGS = { + "db_file": ":memory:", +} + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + pandas.DataFrame( + { + "str": ["str"], + "int": [123], + "float": [1.23], + "bool": [True], + "datetime": [datetime.datetime.now()], + } + ), + [], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + pandas.DataFrame({"cat": pandas.Series(["a"], dtype="category")}), + ["STRING", "INT", "DOUBLE", "BOOLEAN", "TIMESTAMP"], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_col_types_from_pandas(db, data, expected, raises): + with raises: + vals = db.col_parquet_types_from_pandas(data.dtypes) + assert set(expected) == set(vals) + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "db,data,expected,raises", + [ + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [ + ( + "a", + "varchar", + ), + ( + "b", + "bigint", + ), + ( + "c", + "integer", + ), + ( + "d", + "double", + ), + ( + "e", + "boolean", + ), + ( + "f", + "date", + ), + ("g", "timestamp"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.int64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.AthenaDatabaseBackend(**ATHENA_KWARGS), + [("a", "other_type")], + [], + 
pytest.raises(errors.CumulusLibraryError), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [ + ( + "a", + "STRING", + ), + ( + "b", + "INTEGER", + ), + ( + "c", + "NUMBER", + ), + ( + "d", + "DOUBLE", + ), + ( + "e", + "boolean", + ), + ( + "f", + "Date", + ), + ("g", "TIMESTAMP"), + ], + [ + ( + "a", + pyarrow.string(), + ), + ( + "b", + pyarrow.int64(), + ), + ( + "c", + pyarrow.float64(), + ), + ( + "d", + pyarrow.float64(), + ), + ( + "e", + pyarrow.bool_(), + ), + ( + "f", + pyarrow.date64(), + ), + ("g", pyarrow.timestamp("s")), + ], + does_not_raise(), + ), + ( + databases.DuckDatabaseBackend(**DUCKDB_KWARGS), + [("a", "other_type")], + [], + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_pyarrow_types_from_sql(db, data, expected, raises): + with raises: + vals = db.col_pyarrow_types_from_sql(data) + assert len(expected) == len(vals) + for index in range(0, len(vals)): + assert vals[index][-1] == expected[index][-1] + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,expected_type, raises", + [ + ( + {**{"db_type": "duckdb", "schema_name": "test"}, **DUCKDB_KWARGS}, + databases.DuckDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + does_not_raise(), + ), + ( + {**{"db_type": "athena", "load_ndjson_dir": "file.json"}, **ATHENA_KWARGS}, + databases.AthenaDatabaseBackend, + pytest.raises(SystemExit), + ), + ( + # https://en.wikipedia.org/wiki/Cornerstone_(software) + {**{"db_type": "cornerstone", "schema_name": "test"}}, + None, + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_create_db_backend(args, expected_type, raises): + with raises: + db = databases.create_db_backend(args) + assert isinstance(db, expected_type) + + +def test_upload_file_default(): + db = databases.DuckDatabaseBackend(**DUCKDB_KWARGS) + location = db.upload_file( + file=pathlib.Path(__file__).resolve(), + study="test", + topic="table", + ) + assert location is None + + +@mock.patch.dict( + os.environ, + clear=True, +) +@pytest.mark.parametrize( + "args,sse,keycount,expected,raises", + [ + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": False, + }, + "SSE-S3", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + pytest.raises(errors.AWSError), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": None, + "force_upload": True, + }, + "SSE_KMS", + 1, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ( + { + "file": pathlib.Path(__file__).resolve(), + "study": "study", + "topic": "table", + "remote_filename": "custom.name", + "force_upload": False, + }, + "SSE_KMS", + 0, + "s3://test_bucket/test_location/cumulus_user_uploads/test/study/table", + does_not_raise(), + ), + ], +) +@mock.patch("botocore.client") 
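+# botocore's client creation is patched here, so no real AWS calls are made:
+# the canned get_work_group and list_objects_v2 responses configured in the
+# test body stand in for Athena and S3.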
+def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises): + mock_data = { + "WorkGroup": { + "Configuration": { + "ResultConfiguration": { + "OutputLocation": "s3://test_bucket/test_location/", + "EncryptionConfiguration": {"EncryptionOption": sse}, + } + } + } + } + mock_clientobj = mock_botocore.ClientCreator.return_value.create_client.return_value + mock_clientobj.get_work_group.return_value = mock_data + mock_clientobj.list_objects_v2.return_value = {"KeyCount": keycount} + db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS) + with raises: + location = db.upload_file(**args) + assert location == expected + if keycount == 0 or args["force_upload"]: + assert mock_clientobj.put_object.called + kwargs = mock_clientobj.put_object.call_args_list[0][1] + if args["remote_filename"]: + assert kwargs["Key"].endswith(args["remote_filename"]) + else: + assert kwargs["Key"].endswith(args["file"].name)
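
Taken together, these changes make export a streaming pipeline: execute_as_pandas
returns an iterator of DataFrame chunks plus the cursor description,
col_pyarrow_types_from_sql maps that description onto a pyarrow schema, and the
exporter appends each chunk through persistent parquet/CSV writers rather than
holding a whole table in memory. A minimal sketch of that flow against a DuckDB
backend follows; the table name, inserted rows, and output paths are
illustrative only, not taken from a real study.

    import pyarrow
    from pyarrow import csv, parquet

    from cumulus_library import databases

    # Illustrative in-memory database and table
    db = databases.DuckDatabaseBackend(":memory:")
    db.cursor().execute("CREATE TABLE demo__count (cnt INTEGER, category VARCHAR)")
    db.cursor().execute("INSERT INTO demo__count VALUES (10, 'a'), (5, NULL)")

    # One pass over the query: an iterator of DataFrame chunks, plus the
    # DB-API description used to build the pyarrow schema
    chunks, description = db.execute_as_pandas(
        "SELECT * FROM demo__count", chunksize=1000
    )
    schema = pyarrow.schema(db.col_pyarrow_types_from_sql(description))
    with parquet.ParquetWriter("demo__count.parquet", schema) as p_writer:
        with csv.CSVWriter(
            "demo__count.csv",
            schema,
            write_options=csv.WriteOptions(quoting_style="needed"),
        ) as c_writer:
            for chunk in chunks:
                # Each chunk is converted once and appended to both outputs
                table = pyarrow.Table.from_pandas(
                    chunk, schema=schema, preserve_index=False
                )
                p_writer.write(table)
                c_writer.write(table)

Because only one chunk is resident at a time, chunksize becomes an effective
upper bound on export memory, which is the point of the change.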