From 7e72e89205a0e25afd304cff7f90139e78bad223 Mon Sep 17 00:00:00 2001 From: matt garber Date: Thu, 7 Nov 2024 16:11:18 -0500 Subject: [PATCH] Refactor of get chart data endpoint (#139) * Refactor of get chart data endpoint * Comment cleanup * light coverage --- .github/workflows/ci.yaml | 3 + .sqlfluff | 49 ++ .sqlfluffignore | 2 + pyproject.toml | 4 +- src/dashboard/get_chart_data/filter_config.py | 107 ---- .../get_chart_data/get_chart_data.py | 195 ++++++-- src/dashboard/get_chart_data/requirements.txt | 1 + .../templates/filter_inline.sql.jinja | 131 +++++ .../templates/get_chart_data.sql.jinja | 49 ++ .../get_chart_data/templates/syntax.sql.jinja | 24 + src/shared/errors.py | 4 + tests/dashboard/test_filter_config.py | 163 ------- tests/dashboard/test_filter_inline.py | 459 ++++++++++++++++++ tests/dashboard/test_filter_inline.sql.jinja | 2 + tests/dashboard/test_get_chart_data.py | 272 ++++++++++- 15 files changed, 1127 insertions(+), 338 deletions(-) create mode 100644 .sqlfluff create mode 100644 .sqlfluffignore delete mode 100644 src/dashboard/get_chart_data/filter_config.py create mode 100644 src/dashboard/get_chart_data/requirements.txt create mode 100644 src/dashboard/get_chart_data/templates/filter_inline.sql.jinja create mode 100644 src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja create mode 100644 src/dashboard/get_chart_data/templates/syntax.sql.jinja delete mode 100644 tests/dashboard/test_filter_config.py create mode 100644 tests/dashboard/test_filter_inline.py create mode 100644 tests/dashboard/test_filter_inline.sql.jinja diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2944fec..461e242 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -53,6 +53,9 @@ jobs: run: | python -m pip install --upgrade pip pip install ".[dev]" + - name: Run sqlfluff on jinja templates + run: | + sqlfluff lint - name: Run ruff if: success() || failure() # still run ruff if above checks fails run: | diff --git a/.sqlfluff b/.sqlfluff new file mode 100644 index 0000000..5406a4d --- /dev/null +++ b/.sqlfluff @@ -0,0 +1,49 @@ +[sqlfluff] +templater = jinja +dialect = athena +sql_file_exts = .sql,.sql.jinja +exclude_rules= + # these rule overfires on athena nested arrays + references.from, + structure.column_order, + aliasing.unused, + # this rule interferes with FHIR naming conventions + capitalisation.identifiers +max_line_length = 90 + +[sqlfluff:rules:layout.long_lines] +ignore_comment_lines = true + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + + +[sqlfluff:templater:jinja] +load_macros_from_path = src/dashboard/get_chart_data/templates + +[sqlfluff:templater:jinja:context] +data_column = 'foo' +stratifier_column= 'bar' +count_columns=['cnt'] +schema='schema' +data_package_id='study__table__001' +coalesce_columns=['baz','foobar'] +inline_configs=[ + [ + {'data':'foo','type': 'matches': 'a'}, + {'data':'foo','type': 'matches': 'b'} + ], + [ + {'data':'baz','type': 'eq', 'matches': '1'}, + {'data':'baz','type': 'isNull'} + ] + ] +none_configs=[ + [ + {'data':'foo','type': 'matches': 'a'}, + {'data':'foo','type': 'matches': 'b'} + ], + [ + {'data':'baz','type': 'isNull'} + ] + ] diff --git a/.sqlfluffignore b/.sqlfluffignore new file mode 100644 index 0000000..31471ce --- /dev/null +++ b/.sqlfluffignore @@ -0,0 +1,2 @@ +tests/ +.aws-sam/ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index db53087..3d6d6a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies= [ "arrow >=1.2.3", "awswrangler >=3.5, <4", "boto3", + "Jinja2 >=3.1.4, <4", "pandas >=2, <3", "requests", # scripts only "rich", @@ -48,8 +49,9 @@ test = [ "pytest-mock" ] dev = [ - "ruff < 0.6", "pre-commit", + "ruff < 0.6", + "sqlfluff >= 3.2.5" ] [tool.coverage.run] diff --git a/src/dashboard/get_chart_data/filter_config.py b/src/dashboard/get_chart_data/filter_config.py deleted file mode 100644 index 44c0127..0000000 --- a/src/dashboard/get_chart_data/filter_config.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Reimplementation of dashboard's filtering logic for SQL construction""" - -_FILTER_MAP_TWO_PARAM = { - # Text - "strEq": "%s LIKE '%s'", - "strContains": "%s LIKE '%%%s%%'", - "strStartsWith": "%s LIKE '%s%%'", - "strEndsWith": "%s LIKE '%%%s'", - "matches": "%s ~ '%s'", - "strEqCI": "%s ILIKE '%s'", - "strContainsCI": "%s ILIKE '%%%s%%'", - "strStartsWithCI": "%s ILIKE '%s%%'", - "strEndsWithCI": "%s ILIKE '%%%s'", - "matchesCI": "%s ~* '%s'", - "strNotEq": "%s NOT LIKE '%s'", - "strNotContains": "%s NOT LIKE '%%%s%%'", - "strNotStartsWith": "%s NOT LIKE '%s%%'", - "strNotEndsWith": "%s NOT LIKE '%%%s'", - "notMatches": "%s !~ '%s'", - "strNotEqCI": "%s NOT ILIKE '%s'", - "strNotContainsCI": "%s NOT ILIKE '%%%s%%'", - "strNotStartsWithCI": "%s NOT ILIKE '%s%%'", - "strNotEndsWithCI": "%s NOT ILIKE '%%%s'", - "notMatchesCI": "%s !~* '%s'", - # numeric - "eq": "%s = %s", - "ne": "%s != %s", - "gt": "%s > %s", - "gte": "%s >= %s", - "lt": "%s < %s", - "lte": "%s <= %s", - # dates - "sameDay": "from_iso8601_timestamp(%s) = date_trunc('day',from_iso8601_timestamp('%s'))", - "sameWeek": "date_trunc('week',from_iso8601_timestamp(%s)) = " - "date_trunc('week',from_iso8601_timestamp('%s'))", - "sameMonth": "date_trunc('month',from_iso8601_timestamp(%s)) = " - "date_trunc('month',from_iso8601_timestamp('%s'))", - "sameYear": "date_trunc('year',from_iso8601_timestamp(%s)) = " - "date_trunc('year',from_iso8601_timestamp('%s'))", - "sameDayOrBefore": "from_iso8601_timestamp(%s) <= " - "date_trunc('day',from_iso8601_timestamp('%s'))", - "sameWeekOrBefore": "date_trunc('week',from_iso8601_timestamp(%s)) <= " - "date_trunc('week',from_iso8601_timestamp('%s'))", - "sameMonthOrBefore": ( - "date_trunc('month',from_iso8601_timestamp(%s)) <= " - "date_trunc('month',from_iso8601_timestamp('%s'))" - ), - "sameYearOrBefore": "date_trunc('year',from_iso8601_timestamp(%s)) <= " - "date_trunc('year',from_iso8601_timestamp('%s'))", - "sameDayOrAfter": "from_iso8601_timestamp(%s) >= " - "date_trunc('day',from_iso8601_timestamp('%s'))", - "sameWeekOrAfter": ( - "date_trunc('week',from_iso8601_timestamp(%s)) " - ">= date_trunc('week',from_iso8601_timestamp('%s'))" - ), - "sameMonthOrAfter": ( - "date_trunc('month',from_iso8601_timestamp(%s)) >= " - "date_trunc('month',from_iso8601_timestamp('%s'))" - ), - "sameYearOrAfter": "date_trunc('year',from_iso8601_timestamp(%s)) >= " - "date_trunc('year',from_iso8601_timestamp('%s'))", - "beforeDay": "from_iso8601_timestamp(%s) < " "date_trunc('day',from_iso8601_timestamp('%s'))", - "beforeWeek": "date_trunc('week',from_iso8601_timestamp(%s)) < " - "date_trunc('week',from_iso8601_timestamp('%s'))", - "beforeMonth": "date_trunc('month',from_iso8601_timestamp(%s)) < " - "date_trunc('month',from_iso8601_timestamp('%s'))", - "beforeYear": "date_trunc('year',from_iso8601_timestamp(%s)) < " - "date_trunc('year',from_iso8601_timestamp('%s'))", - "afterDay": "from_iso8601_timestamp(%s) > " "date_trunc('day',from_iso8601_timestamp('%s'))", - "afterWeek": "date_trunc('week',from_iso8601_timestamp(%s)) > " - "date_trunc('week',from_iso8601_timestamp('%s'))", - "afterMonth": "date_trunc('month',from_iso8601_timestamp(%s)) > " - "date_trunc('month',from_iso8601_timestamp('%s'))", - "afterYear": "date_trunc('year',from_iso8601_timestamp(%s)) > " - "date_trunc('year',from_iso8601_timestamp('%s'))", -} - -_FILTER_MAP_ONE_PARAM = { - # Booleans - "isTrue": "%s IS TRUE", - "isNotTrue": "%s IS NOT TRUE", - "isFalse": "%s IS FALSE", - "isNotFalse": "%s IS NOT FALSE", - # Any - "isNull": "%s IS NULL", - "isNotNull": "%s IS NOT NULL", -} - - -def _parse_filter_req(filter_req): - if "," in filter_req: - return " AND ".join(_parse_filter_req(x) for x in filter_req.split(",")) - filter_req_split = filter_req.split(":") - if filter_req_split[1] in _FILTER_MAP_ONE_PARAM: - return _FILTER_MAP_ONE_PARAM[filter_req_split[1]] % filter_req_split[0] - return _FILTER_MAP_TWO_PARAM[filter_req_split[1]] % ( - filter_req_split[0], - filter_req_split[2], - ) - - -def get_filter_string(filter_array): - if len(filter_array) > 1: - return " OR ".join(_parse_filter_req(x) for x in filter_array) - elif len(filter_array) == 1: - return _parse_filter_req(filter_array[0]) - return "" diff --git a/src/dashboard/get_chart_data/get_chart_data.py b/src/dashboard/get_chart_data/get_chart_data.py index e9baaa4..38ec8b7 100644 --- a/src/dashboard/get_chart_data/get_chart_data.py +++ b/src/dashboard/get_chart_data/get_chart_data.py @@ -4,10 +4,11 @@ import logging import os +import pathlib import awswrangler import boto3 -import filter_config +import jinja2 import pandas from shared import decorators, enums, errors, functions @@ -15,6 +16,94 @@ logger = logging.getLogger() logger.setLevel(log_level) +# These constants are specified by the dashboard API and should not be changed +# unless a corresponding change is made on that side. +INLINE_FILTERS = ( + # string filters + "strEq", + "strContains", + "strStartsWith", + "strEndsWith", + "matches", + "strEqCI", + "strContainsCI", + "strStartsWithCI", + "strEndsWithCI", + "strMatchesCI", + "strNotEq", + "strNotContains", + "strNotStartsWith", + "strNotEndsWith", + "notMatches", + "strNotEqCI", + "strNotContainsCI", + "strNotStartsWithCI", + "strNotEndsWithCI", + "notMatchesCI", + # date filters + "sameDay", + "sameWeek", + "sameMonth", + "sameYear", + "sameDayOrBefore", + "sameWeekOrBefore", + "sameMonthOrBefore", + "sameYearOrBefore", + "sameDayOrAfter", + "sameWeekOrAfter", + "sameYearOrAfter", + "beforeDay", + "beforeWeek", + "beforeMonth", + "beforeYear", + "afterDay", + "afterMonth", + "afterYear", + # Boolean filters (one param only) + "isTrue", + "isNotTrue", + "isFalse", + "isNotFalse", + # null filters (one param only) + "isNull", + "isNotNull", + # numeric filters + "eq", + "ne", + "gt", + "gte", + "lt", + "lte", +) + +# This set of filters (i.e. all the string/null filters) are suitable to be run against +# table slices where we are looking at explict source nulls without having to worry +# about type issues +NONE_FILTERS = ( + "strEq", + "strContains", + "strStartsWith", + "strEndsWith", + "matches", + "strEqCI", + "strContainsCI", + "strStartsWithCI", + "strEndsWithCI", + "strMatchesCI", + "strNotEq", + "strNotContains", + "strNotStartsWith", + "strNotEndsWith", + "notMatches", + "strNotEqCI", + "strNotContainsCI", + "strNotStartsWithCI", + "strNotEndsWithCI", + "notMatchesCI", + "isNull", + "isNotNull", +) + def _get_table_cols(dp_id: str) -> list: """Returns the columns associated with a table. @@ -41,57 +130,81 @@ def _get_table_cols(dp_id: str) -> list: raise errors.AggregatorS3Error -def _build_query(query_params: dict, filters: list, path_params: dict) -> str: - """Creates a query from the dashboard API spec""" +def _build_query(query_params: dict, filter_groups: list, path_params: dict) -> str: + """Creates a query from the dashboard API spec + :arg queryparams: All arguments passed to the endpoint + :arg filter_groups: Filter params used to generate filtering logic. + These should look like ['col:arg:optionalvalue,...',''] + :path path_params: URL specific arguments, as a convenience + """ dp_id = path_params["data_package_id"] columns = _get_table_cols(dp_id) - filter_str = filter_config.get_filter_string(filters) - if filter_str != "": - filter_str = f"AND {filter_str} " + + inline_configs = [] + none_configs = [] + for filter_group in filter_groups: + # in this block we are trying to do the following things: + # - create a config for where the selected column is not `cumulus__none` + # - create a config for where the selected column is `cumulus__none`, removing + # filters that would cause type cast errors + filter_group = filter_group.split(",") + config_params = [] + none_params = [] + for filter_config in filter_group: + filter_config = filter_config.split(":") + if filter_config[1] in INLINE_FILTERS: + params = {"data": filter_config[0], "filter_type": filter_config[1]} + if len(filter_config) == 3: + params["bound"] = filter_config[2] + config_params.append(params) + if filter_config[1] in NONE_FILTERS or filter_config[0] != query_params["column"]: + if params.get("bound", "").casefold() == "none": + params["bound"] = "cumulus__none" + none_params.append(params) + else: + raise errors.AggregatorFilterError( + f"Invalid filter type {filter_config[1]} requested." + ) + + inline_configs.append(config_params) + none_configs.append(none_params) count_col = next(c for c in columns if c.startswith("cnt")) columns.remove(count_col) - select_str = f"{query_params['column']}, sum({count_col}) as {count_col}" - strat_str = "" - group_str = f"{query_params['column']}" - # the 'if in' check is meant to handle the case where the selected column is also + # these 'if in' checks is meant to handle the case where the selected column is also # present in the filter logic and has already been removed if query_params["column"] in columns: columns.remove(query_params["column"]) - if "stratifier" in query_params.keys(): - select_str = f"{query_params['stratifier']}, {select_str}" - group_str = f"{query_params['stratifier']}, {group_str}" + if query_params.get("stratifier") in columns: columns.remove(query_params["stratifier"]) - strat_str = f'AND {query_params["stratifier"]} IS NOT NULL ' - if len(columns) > 0: - coalesce_str = ( - f"WHERE COALESCE (cast({' AS VARCHAR), cast('.join(columns)} AS VARCHAR)) " - "IS NOT NULL AND " + with open(pathlib.Path(__file__).parent / "templates/get_chart_data.sql.jinja") as file: + template = file.read() + loader = jinja2.FileSystemLoader(pathlib.Path(__file__).parent / "templates/") + env = jinja2.Environment(loader=loader).from_string(template) # noqa: S701 + query_str = env.render( + data_column=query_params["column"], + stratifier_column=query_params.get("stratifier", None), + count_columns=[count_col], + schema=os.environ.get("GLUE_DB_NAME"), + data_package_id=path_params["data_package_id"], + coalesce_columns=columns, + inline_configs=inline_configs, + none_configs=none_configs, ) - else: - coalesce_str = "WHERE " - query_str = ( - f"SELECT {select_str} " # nosec # noqa: S608 - f"FROM \"{os.environ.get('GLUE_DB_NAME')}\".\"{dp_id}\" " - f"{coalesce_str}" - f"{query_params['column']} IS NOT NULL " - f"{filter_str}" - f"{strat_str}" - f"GROUP BY {group_str} " - ) - if "stratifier" in query_params.keys(): - query_str += f"ORDER BY {query_params['stratifier']}, {query_params['column']}" - else: - query_str += f"ORDER BY {query_params['column']}" return query_str, count_col def _format_payload( - df: pandas.DataFrame, query_params: dict, filters: list, count_col: str + df: pandas.DataFrame, query_params: dict, filter_groups: list, count_col: str ) -> dict: - """Coerces query results into the return format defined by the dashboard""" + """Coerces query results into the return format defined by the dashboard + + :arg queryparams: All arguments passed to the endpoint + :arg filter_groups: Filter params used to generate filtering logic. + These should look like ['col:arg:optionalvalue,...',''] + :path count_col: column to use as the primary count column""" payload = {} payload["column"] = query_params["column"] - payload["filters"] = filters + payload["filters"] = filter_groups payload["rowCount"] = int(df.shape[0]) payload["totalCount"] = int(df["cnt"].sum()) if "stratifier" in query_params.keys(): @@ -124,20 +237,20 @@ def chart_data_handler(event, context): """manages event from dashboard api call and retrieves data""" del context query_params = event["queryStringParameters"] - filters = event["multiValueQueryStringParameters"].get("filter", []) - if "filter" in query_params and filters == []: - filters = [query_params["filter"]] + filter_groups = event["multiValueQueryStringParameters"].get("filter", []) + if "filter" in query_params and filter_groups == []: + filter_groups = [query_params["filter"]] path_params = event["pathParameters"] boto3.setup_default_session(region_name="us-east-1") try: - query, count_col = _build_query(query_params, filters, path_params) + query, count_col = _build_query(query_params, filter_groups, path_params) df = awswrangler.athena.read_sql_query( query, database=os.environ.get("GLUE_DB_NAME"), s3_output=f"s3://{os.environ.get('BUCKET_NAME')}/awswrangler", workgroup=os.environ.get("WORKGROUP_NAME"), ) - res = _format_payload(df, query_params, filters, count_col) + res = _format_payload(df, query_params, filter_groups, count_col) res = functions.http_response(200, res) except errors.AggregatorS3Error: # while the API is publicly accessible, we've been asked to not pass diff --git a/src/dashboard/get_chart_data/requirements.txt b/src/dashboard/get_chart_data/requirements.txt new file mode 100644 index 0000000..ae41c0e --- /dev/null +++ b/src/dashboard/get_chart_data/requirements.txt @@ -0,0 +1 @@ +Jinja2 >= 3.1.4, <4 diff --git a/src/dashboard/get_chart_data/templates/filter_inline.sql.jinja b/src/dashboard/get_chart_data/templates/filter_inline.sql.jinja new file mode 100644 index 0000000..27d35b5 --- /dev/null +++ b/src/dashboard/get_chart_data/templates/filter_inline.sql.jinja @@ -0,0 +1,131 @@ +{%- import 'syntax.sql.jinja' as syntax -%} +{%- set typesafe_filters = ['strEq','strContains','strStartsWith','strEndsWith','matches','strEqCI', + 'strContainsCI', 'strStartsWithCI','strEndsWithCI','strMatchesCI','strNotEq','strNotContains', + 'strNotStartsWith','strNotEndsWith','notMatches','strNotEqCI','strNotContainsCI', + 'strNotStartsWithCI','strNotEndsWithCI','notMatchesCI'] -%} + +{%- macro render_filter( data, filter_type, bound) -%} + {#- TODO: replace all LIKE filters with regexp() calls -#} + {#- Sting filters -#} + {%- if filter_type == 'strEq' -%} +"{{ data }}" LIKE '{{ bound }}' +{%- elif filter_type == 'strContains' -%} +"{{ data }}" LIKE '%{{ bound }}%' +{%- elif filter_type == 'strStartsWith' -%} +"{{ data }}" LIKE '{{ bound }}%' +{%- elif filter_type == 'strEndsWith' -%} +"{{ data }}" LIKE '%{{ bound }}' +{%- elif filter_type == 'matches' -%} +regexp_like("{{ data }}", '{{ bound }}') +{%- elif filter_type == 'strEqCI' -%} +"{{ data }}" ILIKE '{{ bound }}' +{%- elif filter_type == 'strContainsCI' -%} +"{{ data }}" ILIKE '%{{ bound }}%' +{%- elif filter_type == 'strStartsWithCI' -%} +"{{ data }}" ILIKE '{{ bound }}%' +{%- elif filter_type == 'strEndsWithCI' -%} +"{{ data }}" ILIKE '%{{ bound }}' +{%- elif filter_type == 'matchesCI' -%} +regexp_like("{{ data }}", '(?i){{ bound }}') +{%- elif filter_type == 'strNotEq' -%} +"{{ data }}" NOT LIKE '{{ bound }}' +{%- elif filter_type == 'strNotContains' -%} +"{{ data }}" NOT LIKE '%{{ bound }}%' +{%- elif filter_type == 'strNotStartsWith' -%} +"{{ data }}" NOT LIKE '{{ bound }}%' +{%- elif filter_type == 'strNotEndsWith' -%} +"{{ data }}" NOT LIKE '%{{ bound }}' +{%- elif filter_type == 'notMatches' -%} +NOT regexp_like("{{ data }}", '{{ bound }}') +{%- elif filter_type == 'strNotEqCI' -%} +"{{ data }}" NOT ILIKE '{{ bound }}' +{%- elif filter_type == 'strNotContainsCI' -%} +"{{ data }}" NOT ILIKE '%{{ bound }}%' +{%- elif filter_type == 'strNotStartsWithCI' -%} +"{{ data }}" NOT ILIKE '{{ bound }}%' +{%- elif filter_type == 'strNotEndsWithCI' -%} +"{{ data }}" NOT ILIKE '%{{ bound }}' +{%- elif filter_type == 'notMatchesCI' -%} +NOT regexp_like("{{ data }}", '(?i){{ bound }}') +{#- Date filters -#} +{%- elif filter_type == 'sameDay' -%} +from_iso8601_timestamp("{{ data }}") = date_trunc('day',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameWeek' -%} +date_trunc('week',from_iso8601_timestamp('{{ data }}')) = date_trunc('week',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameMonth' -%} +date_trunc('month',from_iso8601_timestamp('{{ data }}')) = date_trunc('month',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameYear' -%} +date_trunc('year',from_iso8601_timestamp('{{ data }}')) = date_trunc('year',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameDayOrBefore' -%} +from_iso8601_timestamp("{{ data }}") <= date_trunc('day',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameWeekOrBefore' -%} +date_trunc('week',from_iso8601_timestamp('{{ data }}')) <= date_trunc('week',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameMonthOrBefore' -%} +date_trunc('month',from_iso8601_timestamp('{{ data }}')) <= date_trunc('month',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameYearOrBefore' -%} +date_trunc('year',from_iso8601_timestamp('{{ data }}')) <= date_trunc('year',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameDayOrAfter' -%} +from_iso8601_timestamp("{{ data }}") >= date_trunc('day',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameWeekOrAfter' -%} +date_trunc('week',from_iso8601_timestamp('{{ data }}')) >= date_trunc('week',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameMonthOrAfter' -%} +date_trunc('month',from_iso8601_timestamp('{{ data }}')) >= date_trunc('month',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'sameYearOrAfter' -%} +date_trunc('year',from_iso8601_timestamp('{{ data }}')) >= date_trunc('year',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'beforeDay' -%} +from_iso8601_timestamp("{{ data }}") < date_trunc('day',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'beforeWeek' -%} +date_trunc('week',from_iso8601_timestamp('{{ data }}')) < date_trunc('week',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'beforeMonth' -%} +date_trunc('month',from_iso8601_timestamp('{{ data }}')) < date_trunc('month',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'beforeYear' -%} +date_trunc('year',from_iso8601_timestamp('{{ data }}')) < date_trunc('year',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'afterDay' -%} +from_iso8601_timestamp("{{ data }}") > date_trunc('day',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'afterWeek' -%} +date_trunc('week',from_iso8601_timestamp('{{ data }}')) > date_trunc('week',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'afterMonth' -%} +date_trunc('month',from_iso8601_timestamp('{{ data }}')) > date_trunc('month',from_iso8601_timestamp('{{ bound }}')) +{%- elif filter_type == 'afterYear' -%} +date_trunc('year',from_iso8601_timestamp('{{ data }}')) > date_trunc('year',from_iso8601_timestamp('{{ bound }}')) +{#- Boolean filters -#} +{%- elif filter_type == 'isTrue' -%} +"{{ data }}" IS TRUE +{%- elif filter_type == 'isNotTrue' -%} +"{{ data }}" IS NOT TRUE +{%- elif filter_type == 'isFalse' -%} +"{{ data }}" IS FALSE +{%- elif filter_type == 'isNotFalse' -%} +"{{ data }}" IS NOT FALSE +{#- Null filters -#} +{%- elif filter_type == 'isNull' -%} +"{{ data }}" IS NULL +{%- elif filter_type == 'isNotNull' -%} +"{{ data }}" IS NOT NULL +{#- Numeric filters -#} +{%- elif filter_type == 'eq'-%} +"{{ data }}" = {{ bound }} +{%- elif filter_type == 'ne'-%} +"{{ data }}" != {{ bound }} +{%- elif filter_type == 'gt'-%} +"{{ data }}" > {{ bound }} +{%- elif filter_type == 'gte'-%} +"{{ data }}" >= {{ bound }} +{%- elif filter_type == 'lt'-%} +"{{ data }}" < {{ bound }} +{%- elif filter_type == 'lte'-%} +"{{ data }}" <= {{ bound }} +{%- else -%} + not found {{ filter_type }} + {%- endif -%} +{%- endmacro -%} + +{%- macro get_filters(aggregate_configs) -%} + {%- for configs in aggregate_configs %} + {{ syntax.or_delineate(loop) }}( + {%- for config in configs %} + {{ syntax.and_delineate(loop) }}{{ render_filter(config['data'],config['filter_type'],config['bound']) }} + {%- endfor %} + ) + {%- endfor -%} +{%- endmacro -%} diff --git a/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja b/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja new file mode 100644 index 0000000..fc740ef --- /dev/null +++ b/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja @@ -0,0 +1,49 @@ +{%- import 'syntax.sql.jinja' as syntax -%} +{%- import 'filter_inline.sql.jinja' as inline -%} +SELECT + {%- if stratifier_column %} + "{{ stratifier_column }}", + {%- endif %} + "{{ data_column }}" + {%- if count_columns %},{%- endif %} +{%- for column in count_columns %} + "{{ column }}" +{{ syntax.comma_delineate(loop) }} +{%- endfor %} +FROM "{{ schema }}"."{{ data_package_id }}" +WHERE + {%- if coalesce_columns %} COALESCE( + {%- for column in coalesce_columns %} + CAST("{{ column }}" AS VARCHAR){{ syntax.comma_delineate(loop) }} + {%- endfor %} + ) IS NULL + AND{%- endif %} "{{ data_column }}" IS NOT NULL + AND ( + ( + {{ data_column }} != 'cumulus__none' + {%- if inline_configs %} + AND + ( + {{ inline.get_filters(inline_configs) }} + ) + {%- endif %} + ) + OR ( + {{ data_column }} = 'cumulus__none' + {%- if none_configs %} + AND + ( + {{ inline.get_filters(none_configs) }} + ) + {%- endif %} + ) + ) + {%- if stratifier_column %} + AND "{{ stratifier_column }}" IS NOT NULL + {%- endif %} +ORDER BY + {%- if stratifier_column %} + "{{ stratifier_column }}", "{{ data_column }}" + {%- else %} + "{{ data_column }}" + {%- endif %} diff --git a/src/dashboard/get_chart_data/templates/syntax.sql.jinja b/src/dashboard/get_chart_data/templates/syntax.sql.jinja new file mode 100644 index 0000000..cc0a72d --- /dev/null +++ b/src/dashboard/get_chart_data/templates/syntax.sql.jinja @@ -0,0 +1,24 @@ +{%- macro comma_delineate(loop) -%} + {%- if not loop.last -%} + , + {%- endif -%} +{%- endmacro -%} + + +{# Note that the following two delineations are meant to be at the front of the string +in a loop - this is to enable formatting in a WHERE statement like this: +--- +WHERE + b.bar = a.foo + AND b.baz != a.foo +--- +This is slightly easier to work with when debugging queries (and also +conforms better to the mozilla style guide) +#} +{%- macro and_delineate(loop) -%} + {%- if not loop.first -%}AND {% endif -%} +{%- endmacro -%} + +{%- macro or_delineate(loop) -%} + {%- if not loop.first -%}OR {% endif -%} +{%- endmacro -%} diff --git a/src/shared/errors.py b/src/shared/errors.py index a33b967..4db84ae 100644 --- a/src/shared/errors.py +++ b/src/shared/errors.py @@ -1,2 +1,6 @@ class AggregatorS3Error(Exception): """Errors related to accessing files in S3""" + + +class AggregatorFilterError(Exception): + """Errors related to accessing files in S3""" diff --git a/tests/dashboard/test_filter_config.py b/tests/dashboard/test_filter_config.py deleted file mode 100644 index 39685ad..0000000 --- a/tests/dashboard/test_filter_config.py +++ /dev/null @@ -1,163 +0,0 @@ -import pytest - -from src.dashboard.get_chart_data import filter_config - - -@pytest.mark.parametrize( - "input_str,output_str", - [ - # Checking individual conversions - (["col:strEq:str"], "col LIKE 'str'"), - (["col:strContains:str"], "col LIKE '%str%'"), - (["col:strStartsWith:str"], "col LIKE 'str%'"), - (["col:strEndsWith:str"], "col LIKE '%str'"), - (["col:matches:str"], "col ~ 'str'"), - (["col:strEqCI:str"], "col ILIKE 'str'"), - (["col:strContainsCI:str"], "col ILIKE '%str%'"), - (["col:strStartsWithCI:str"], "col ILIKE 'str%'"), - (["col:strEndsWithCI:str"], "col ILIKE '%str'"), - (["col:matchesCI:str"], "col ~* 'str'"), - (["col:strNotEq:str"], "col NOT LIKE 'str'"), - (["col:strNotContains:str"], "col NOT LIKE '%str%'"), - (["col:strNotStartsWith:str"], "col NOT LIKE 'str%'"), - (["col:strNotEndsWith:str"], "col NOT LIKE '%str'"), - (["col:notMatches:str"], "col !~ 'str'"), - (["col:strNotEqCI:str"], "col NOT ILIKE 'str'"), - (["col:strNotContainsCI:str"], "col NOT ILIKE '%str%'"), - (["col:strNotStartsWithCI:str"], "col NOT ILIKE 'str%'"), - (["col:strNotEndsWithCI:str"], "col NOT ILIKE '%str'"), - (["col:notMatchesCI:str"], "col !~* 'str'"), - (["col:eq:10.2"], "col = 10.2"), - (["col:ne:10.2"], "col != 10.2"), - (["col:gt:10.2"], "col > 10.2"), - (["col:gte:10.2"], "col >= 10.2"), - (["col:lt:10.2"], "col < 10.2"), - (["col:lte:10.2"], "col <= 10.2"), - (["col:isTrue"], "col IS TRUE"), - (["col:isNotTrue"], "col IS NOT TRUE"), - (["col:isFalse"], "col IS FALSE"), - (["col:isNotFalse"], "col IS NOT FALSE"), - (["col:isNull"], "col IS NULL"), - (["col:isNotNull"], "col IS NOT NULL"), - (["col:strEq:str"], "col LIKE 'str'"), - ( - ["column:sameDay:1900-01-01"], - "from_iso8601_timestamp(column) = " - "date_trunc('day',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameWeek:1900-01-01"], - "date_trunc('week',from_iso8601_timestamp(column)) = " - "date_trunc('week',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameMonth:1900-01-01"], - "date_trunc('month',from_iso8601_timestamp(column)) = " - "date_trunc('month',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameYear:1900-01-01"], - "date_trunc('year',from_iso8601_timestamp(column)) = " - "date_trunc('year',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameDayOrBefore:1900-01-01"], - "from_iso8601_timestamp(column) <= " - "date_trunc('day',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameWeekOrBefore:1900-01-01"], - "date_trunc('week',from_iso8601_timestamp(column)) <= " - "date_trunc('week',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameMonthOrBefore:1900-01-01"], - ( - "date_trunc('month',from_iso8601_timestamp(column)) <= " - "date_trunc('month',from_iso8601_timestamp('1900-01-01'))" - ), - ), - ( - ["column:sameYearOrBefore:1900-01-01"], - "date_trunc('year',from_iso8601_timestamp(column)) <= " - "date_trunc('year',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameDayOrAfter:1900-01-01"], - "from_iso8601_timestamp(column) >= " - "date_trunc('day',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameWeekOrAfter:1900-01-01"], - "date_trunc('week',from_iso8601_timestamp(column)) >= " - "date_trunc('week',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:sameMonthOrAfter:1900-01-01"], - ( - "date_trunc('month',from_iso8601_timestamp(column)) >= " - "date_trunc('month',from_iso8601_timestamp('1900-01-01'))" - ), - ), - ( - ["column:sameYearOrAfter:1900-01-01"], - "date_trunc('year',from_iso8601_timestamp(column)) >= " - "date_trunc('year',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:beforeDay:1900-01-01"], - "from_iso8601_timestamp(column) < " - "date_trunc('day',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:beforeWeek:1900-01-01"], - "date_trunc('week',from_iso8601_timestamp(column)) < " - "date_trunc('week',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:beforeMonth:1900-01-01"], - "date_trunc('month',from_iso8601_timestamp(column)) < " - "date_trunc('month',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:beforeYear:1900-01-01"], - "date_trunc('year',from_iso8601_timestamp(column)) < " - "date_trunc('year',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:afterDay:1900-01-01"], - "from_iso8601_timestamp(column) > " - "date_trunc('day',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:afterWeek:1900-01-01"], - "date_trunc('week',from_iso8601_timestamp(column)) > " - "date_trunc('week',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:afterMonth:1900-01-01"], - "date_trunc('month',from_iso8601_timestamp(column)) > " - "date_trunc('month',from_iso8601_timestamp('1900-01-01'))", - ), - ( - ["column:afterYear:1900-01-01"], - "date_trunc('year',from_iso8601_timestamp(column)) > " - "date_trunc('year',from_iso8601_timestamp('1900-01-01'))", - ), - # Checking compound statements - ( - ["col:strEq:str", "col:strEqCI:str"], - "col LIKE 'str' OR col ILIKE 'str'", - ), - ( - ["col:strEq:str,col:strEqCI:str"], - "col LIKE 'str' AND col ILIKE 'str'", - ), - ( - ["col:strEq:str", "col:strEq:str,col:strEqCI:str"], - "col LIKE 'str' OR col LIKE 'str' AND col ILIKE 'str'", - ), - ], -) -def test_filter_string(input_str, output_str): - assert filter_config.get_filter_string(input_str) == output_str diff --git a/tests/dashboard/test_filter_inline.py b/tests/dashboard/test_filter_inline.py new file mode 100644 index 0000000..2e01c94 --- /dev/null +++ b/tests/dashboard/test_filter_inline.py @@ -0,0 +1,459 @@ +import pathlib + +import jinja2 +import pytest + + +@pytest.mark.parametrize( + "filter_configs,expected", + [ + # Checking individual conversions + ( + ["col:strEq:str"], + """ + ( + "col" LIKE 'str' + )""", + ), + ( + ["col:strContains:str"], + """ + ( + "col" LIKE '%str%' + )""", + ), + ( + ["col:strStartsWith:str"], + """ + ( + "col" LIKE 'str%' + )""", + ), + ( + ["col:strEndsWith:str"], + """ + ( + "col" LIKE '%str' + )""", + ), + ( + ["col:matches:str"], + """ + ( + regexp_like("col", 'str') + )""", + ), + ( + ["col:strEqCI:str"], + """ + ( + "col" ILIKE 'str' + )""", + ), + ( + ["col:strContainsCI:str"], + """ + ( + "col" ILIKE '%str%' + )""", + ), + ( + ["col:strStartsWithCI:str"], + """ + ( + "col" ILIKE 'str%' + )""", + ), + ( + ["col:strEndsWithCI:str"], + """ + ( + "col" ILIKE '%str' + )""", + ), + ( + ["col:matchesCI:str"], + """ + ( + regexp_like("col", '(?i)str') + )""", + ), + ( + ["col:strNotEq:str"], + """ + ( + "col" NOT LIKE 'str' + )""", + ), + ( + ["col:strNotContains:str"], + """ + ( + "col" NOT LIKE '%str%' + )""", + ), + ( + ["col:strNotStartsWith:str"], + """ + ( + "col" NOT LIKE 'str%' + )""", + ), + ( + ["col:strNotEndsWith:str"], + """ + ( + "col" NOT LIKE '%str' + )""", + ), + ( + ["col:notMatches:str"], + """ + ( + NOT regexp_like("col", 'str') + )""", + ), + ( + ["col:strNotEqCI:str"], + """ + ( + "col" NOT ILIKE 'str' + )""", + ), + ( + ["col:strNotContainsCI:str"], + """ + ( + "col" NOT ILIKE '%str%' + )""", + ), + ( + ["col:strNotStartsWithCI:str"], + """ + ( + "col" NOT ILIKE 'str%' + )""", + ), + ( + ["col:strNotEndsWithCI:str"], + """ + ( + "col" NOT ILIKE '%str' + )""", + ), + ( + ["col:notMatchesCI:str"], + """ + ( + NOT regexp_like("col", '(?i)str') + )""", + ), + ( + ["col:eq:10.2"], + """ + ( + "col" = 10.2 + )""", + ), + ( + ["col:ne:10.2"], + """ + ( + "col" != 10.2 + )""", + ), + ( + ["col:gt:10.2"], + """ + ( + "col" > 10.2 + )""", + ), + ( + ["col:gte:10.2"], + """ + ( + "col" >= 10.2 + )""", + ), + ( + ["col:lt:10.2"], + """ + ( + "col" < 10.2 + )""", + ), + ( + ["col:lte:10.2"], + """ + ( + "col" <= 10.2 + )""", + ), + ( + ["col:isTrue"], + """ + ( + "col" IS TRUE + )""", + ), + ( + ["col:isNotTrue"], + """ + ( + "col" IS NOT TRUE + )""", + ), + ( + ["col:isFalse"], + """ + ( + "col" IS FALSE + )""", + ), + ( + ["col:isNotFalse"], + """ + ( + "col" IS NOT FALSE + )""", + ), + ( + ["col:isNull"], + """ + ( + "col" IS NULL + )""", + ), + ( + ["col:isNotNull"], + """ + ( + "col" IS NOT NULL + )""", + ), + ( + ["col:strEq:str"], + """ + ( + "col" LIKE 'str' + )""", + ), + ( + ["column:sameDay:1900-01-01"], + """ + ( + from_iso8601_timestamp("column") = """ + """date_trunc('day',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameWeek:1900-01-01"], + """ + ( + date_trunc('week',from_iso8601_timestamp('column')) = """ + """date_trunc('week',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameMonth:1900-01-01"], + """ + ( + date_trunc('month',from_iso8601_timestamp('column')) = """ + """date_trunc('month',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameYear:1900-01-01"], + """ + ( + date_trunc('year',from_iso8601_timestamp('column')) = """ + """date_trunc('year',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameDayOrBefore:1900-01-01"], + """ + ( + from_iso8601_timestamp("column") <= """ + """date_trunc('day',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameWeekOrBefore:1900-01-01"], + """ + ( + date_trunc('week',from_iso8601_timestamp('column')) <= """ + """date_trunc('week',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameMonthOrBefore:1900-01-01"], + ( + """ + ( + date_trunc('month',from_iso8601_timestamp('column')) <= """ + """date_trunc('month',from_iso8601_timestamp('1900-01-01')) + )""" + ), + ), + ( + ["column:sameYearOrBefore:1900-01-01"], + """ + ( + date_trunc('year',from_iso8601_timestamp('column')) <= """ + """date_trunc('year',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameDayOrAfter:1900-01-01"], + """ + ( + from_iso8601_timestamp("column") >= """ + """date_trunc('day',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameWeekOrAfter:1900-01-01"], + """ + ( + date_trunc('week',from_iso8601_timestamp('column')) >= """ + """date_trunc('week',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:sameMonthOrAfter:1900-01-01"], + ( + """ + ( + date_trunc('month',from_iso8601_timestamp('column')) >= """ + """date_trunc('month',from_iso8601_timestamp('1900-01-01')) + )""" + ), + ), + ( + ["column:sameYearOrAfter:1900-01-01"], + """ + ( + date_trunc('year',from_iso8601_timestamp('column')) >= """ + """date_trunc('year',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:beforeDay:1900-01-01"], + """ + ( + from_iso8601_timestamp("column") < """ + """date_trunc('day',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:beforeWeek:1900-01-01"], + """ + ( + date_trunc('week',from_iso8601_timestamp('column')) < """ + """date_trunc('week',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:beforeMonth:1900-01-01"], + """ + ( + date_trunc('month',from_iso8601_timestamp('column')) < """ + """date_trunc('month',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:beforeYear:1900-01-01"], + """ + ( + date_trunc('year',from_iso8601_timestamp('column')) < """ + """date_trunc('year',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:afterDay:1900-01-01"], + """ + ( + from_iso8601_timestamp("column") > """ + """date_trunc('day',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:afterWeek:1900-01-01"], + """ + ( + date_trunc('week',from_iso8601_timestamp('column')) > """ + """date_trunc('week',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:afterMonth:1900-01-01"], + """ + ( + date_trunc('month',from_iso8601_timestamp('column')) > """ + """date_trunc('month',from_iso8601_timestamp('1900-01-01')) + )""", + ), + ( + ["column:afterYear:1900-01-01"], + """ + ( + date_trunc('year',from_iso8601_timestamp('column')) > """ + """date_trunc('year',from_iso8601_timestamp('1900-01-01')) + )""", + ), + # Checking compound statements + ( + ["col:strEq:str", "col:strEqCI:str"], + """ + ( + "col" LIKE 'str' + ) + OR ( + "col" ILIKE 'str' + )""", + ), + ( + ["col:strEq:str,col:strEqCI:str"], + """ + ( + "col" LIKE 'str' + AND "col" ILIKE 'str' + )""", + ), + ( + ["col:strEq:str", "col:strEq:str,col:strEqCI:str"], + """ + ( + "col" LIKE 'str' + ) + OR ( + "col" LIKE 'str' + AND "col" ILIKE 'str' + )""", + ), + ], +) +def test_filter_string(filter_configs, expected): + inline_configs = [] + for filter_config in filter_configs: + subconfigs = filter_config.split(",") + inline_config = [] + for subconfig in subconfigs: + subconfig = subconfig.split(":") + config = {"data": subconfig[0], "filter_type": subconfig[1]} + if len(subconfig) == 3: + config["bound"] = subconfig[2] + inline_config.append(config) + inline_configs.append(inline_config) + with open(pathlib.Path(__file__).parent / "test_filter_inline.sql.jinja") as file: + template = file.read() + loader = jinja2.FileSystemLoader( + pathlib.Path(__file__).parent / "../../src/dashboard/get_chart_data/templates/" + ) + env = jinja2.Environment(loader=loader).from_string(template) + query = env.render( + inline_configs=inline_configs, + ) + assert query == expected diff --git a/tests/dashboard/test_filter_inline.sql.jinja b/tests/dashboard/test_filter_inline.sql.jinja new file mode 100644 index 0000000..9e23b70 --- /dev/null +++ b/tests/dashboard/test_filter_inline.sql.jinja @@ -0,0 +1,2 @@ +{%- import 'filter_inline.sql.jinja' as inline -%} +{{inline.get_filters(inline_configs)}} \ No newline at end of file diff --git a/tests/dashboard/test_get_chart_data.py b/tests/dashboard/test_get_chart_data.py index 62a1ae7..fb52f12 100644 --- a/tests/dashboard/test_get_chart_data.py +++ b/tests/dashboard/test_get_chart_data.py @@ -1,7 +1,9 @@ import json import os +from contextlib import nullcontext as does_not_raise from unittest import mock +import botocore import pandas import pytest @@ -29,53 +31,248 @@ def mock_data_frame(filter_param): @mock.patch("src.dashboard.get_chart_data.get_chart_data._get_table_cols", mock_get_table_cols) @mock.patch.dict(os.environ, MOCK_ENV) @pytest.mark.parametrize( - "query_params,filters,path_params,query_str", + "query_params,filter_groups,path_params,query_str, raises", [ ( {"column": "gender"}, [], {"data_package_id": "test_study"}, - f'SELECT gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE COALESCE (cast(race AS VARCHAR)) IS NOT NULL AND gender IS NOT NULL " - "GROUP BY gender ORDER BY gender", + f"""SELECT + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE COALESCE( + CAST("race" AS VARCHAR) + ) IS NULL + AND "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + ) + OR ( + gender = 'cumulus__none' + ) + ) +ORDER BY + "gender\"""", + does_not_raise(), ), ( {"column": "gender", "stratifier": "race"}, [], {"data_package_id": "test_study"}, - f'SELECT race, gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE gender IS NOT NULL " - "AND race IS NOT NULL " - "GROUP BY race, gender ORDER BY race, gender", + f"""SELECT + "race", + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + ) + OR ( + gender = 'cumulus__none' + ) + ) + AND "race" IS NOT NULL +ORDER BY + "race", "gender\"""", + does_not_raise(), ), ( {"column": "gender"}, ["gender:strEq:female"], {"data_package_id": "test_study"}, - f'SELECT gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE COALESCE (cast(race AS VARCHAR)) IS NOT NULL AND gender IS NOT NULL " - "AND gender LIKE 'female' " - "GROUP BY gender ORDER BY gender", + f"""SELECT + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE COALESCE( + CAST("race" AS VARCHAR) + ) IS NULL + AND "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + AND + ( + + ( + "gender" LIKE 'female' + ) + ) + ) + OR ( + gender = 'cumulus__none' + AND + ( + + ( + "gender" LIKE 'female' + ) + ) + ) + ) +ORDER BY + "gender\"""", + does_not_raise(), ), ( {"column": "gender", "stratifier": "race"}, - ["gender:strEq:female"], + ["gender:strEq:none"], {"data_package_id": "test_study"}, - f'SELECT race, gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE gender IS NOT NULL " - "AND gender LIKE 'female' " - "AND race IS NOT NULL " - "GROUP BY race, gender ORDER BY race, gender", + f"""SELECT + "race", + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + AND + ( + + ( + "gender" LIKE 'cumulus__none' + ) + ) + ) + OR ( + gender = 'cumulus__none' + AND + ( + + ( + "gender" LIKE 'cumulus__none' + ) + ) + ) + ) + AND "race" IS NOT NULL +ORDER BY + "race", "gender\"""", + does_not_raise(), + ), + ( + {"column": "gender", "stratifier": "race"}, + ["cnt:gt:2", "cnt:lt:10"], + {"data_package_id": "test_study"}, + f"""SELECT + "race", + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + AND + ( + + ( + "cnt" > 2 + ) + OR ( + "cnt" < 10 + ) + ) + ) + OR ( + gender = 'cumulus__none' + AND + ( + + ( + "cnt" > 2 + ) + OR ( + "cnt" < 10 + ) + ) + ) + ) + AND "race" IS NOT NULL +ORDER BY + "race", "gender\"""", + does_not_raise(), + ), + ( + {"column": "gender", "stratifier": "race"}, + [ + "gender:matches:a", + "gender:matches:e,gender:matches:m", + ], + {"data_package_id": "test_study"}, + f"""SELECT + "race", + "gender", + "cnt" + +FROM "{TEST_GLUE_DB}"."test_study" +WHERE "gender" IS NOT NULL + AND ( + ( + gender != 'cumulus__none' + AND + ( + + ( + regexp_like("gender", 'a') + ) + OR ( + regexp_like("gender", 'e') + AND regexp_like("gender", 'm') + ) + ) + ) + OR ( + gender = 'cumulus__none' + AND + ( + + ( + regexp_like("gender", 'a') + ) + OR ( + regexp_like("gender", 'e') + AND regexp_like("gender", 'm') + ) + ) + ) + ) + AND "race" IS NOT NULL +ORDER BY + "race", "gender\"""", + does_not_raise(), + ), + ( + {"column": "gender", "stratifier": "race"}, + [ + "gender:invalid:a", + ], + {"data_package_id": "test_study"}, + "", + # The deployed class vs testing module approach makes getting + # the actual error raised here fussy. + pytest.raises(Exception), ), ], ) -def test_build_query(query_params, filters, path_params, query_str): - query, _ = get_chart_data._build_query(query_params, filters, path_params) - assert query == query_str +def test_build_query(query_params, filter_groups, path_params, query_str, raises): + with raises: + query, _ = get_chart_data._build_query(query_params, filter_groups, path_params) + assert query == query_str @pytest.mark.parametrize( - "query_params,filters,expected_payload", + "query_params,filter_groups,expected_payload", [ ( {"column": "gender"}, @@ -99,9 +296,9 @@ def test_build_query(query_params, filters, path_params, query_str): ), ], ) -def test_format_payload(query_params, filters, expected_payload): - df = mock_data_frame(filters) - payload = get_chart_data._format_payload(df, query_params, filters, "cnt") +def test_format_payload(query_params, filter_groups, expected_payload): + df = mock_data_frame(filter_groups) + payload = get_chart_data._format_payload(df, query_params, filter_groups, "cnt") assert payload == expected_payload @@ -112,9 +309,21 @@ def test_get_data_cols(mock_bucket): assert res == list(cols) +@mock.patch("botocore.client") +def test_get_data_cols_err(mock_client): + mock_clientobj = mock_client.ClientCreator.return_value.create_client.return_value + mock_clientobj.get_object.side_effect = [ + None, + botocore.exceptions.ClientError({}, {}), + ] + with pytest.raises(Exception): + table_id = f"{EXISTING_STUDY}__{EXISTING_DATA_P}__{EXISTING_VERSION}" + get_chart_data._get_table_cols(table_id) + + @mock.patch( "src.dashboard.get_chart_data.get_chart_data._build_query", - lambda query_params, filters, path_params: ( + lambda query_params, filter_groups, path_params: ( ( "SELECT gender, sum(cnt) as cnt" f'FROM "{TEST_GLUE_DB}"."test_study" ' @@ -143,3 +352,14 @@ def test_handler(): '"rowCount": 2, "totalCount": 20, "data": [{"rows": [["male", 10], ' '["female", 10]]}]}' ) + event = { + "queryStringParameters": {"column": "gender", "filter": "gender:strEq:female"}, + "multiValueQueryStringParameters": {}, + "pathParameters": {}, + } + res = get_chart_data.chart_data_handler(event, {}) + assert res["body"] == ( + '{"column": "gender", "filters": ["gender:strEq:female"], ' + '"rowCount": 2, "totalCount": 20, "data": [{"rows": [["male", 10], ' + '["female", 10]]}]}' + )