Removed vocab study #293

Merged · 3 commits · Sep 4, 2024
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
@@ -99,8 +99,8 @@ jobs:
WG: cumulus
DB: cumulus_library_regression_db
run: |
cumulus-library build -t vocab -t core --profile $PROFILE --workgroup $WG --database $DB
cumulus-library export -t vocab -t core ./tests/regression/data_export/ --profile $PROFILE --workgroup $WG --database $DB
cumulus-library build -t core --profile $PROFILE --workgroup $WG --database $DB
cumulus-library export -t core ./tests/regression/data_export/ --profile $PROFILE --workgroup $WG --database $DB
- name: Compare vs known data
run: python ./tests/regression/run_regression.py
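
For a local run of the same regression step, here is a rough Python equivalent of the shell commands above, using the library's `cli.main` entry point the way the tests do. The profile, workgroup, and database values are placeholders for a real Athena setup, not values taken from this PR:

```python
from cumulus_library import cli

# Placeholder Athena settings, mirroring the workflow's PROFILE/WG/DB env values.
PROFILE, WG, DB = "my-athena-profile", "cumulus", "cumulus_library_regression_db"

for args in (
    ["build", "-t", "core", "--profile", PROFILE, "--workgroup", WG, "--database", DB],
    ["export", "-t", "core", "./tests/regression/data_export/",
     "--profile", PROFILE, "--workgroup", WG, "--database", DB],
):
    cli.main(cli_args=args)
```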

1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ output.sql
MRCONSO.RRF
*.zip
coverage.xml
*.parquet

# Byte-compiled / optimized / DLL files
__pycache__/
6 changes: 0 additions & 6 deletions cumulus_library/studies/vocab/README.MD

This file was deleted.

110,149 changes: 0 additions & 110,149 deletions cumulus_library/studies/vocab/icd/ICD10CM_2023AA.bsv

This file was deleted.

270,675 changes: 0 additions & 270,675 deletions cumulus_library/studies/vocab/icd/ICD10PCS_2023AA.bsv

This file was deleted.

22,406 changes: 0 additions & 22,406 deletions cumulus_library/studies/vocab/icd/ICD9CM_2023AA.bsv

This file was deleted.

17 changes: 0 additions & 17 deletions cumulus_library/studies/vocab/manifest.toml

This file was deleted.

18 changes: 0 additions & 18 deletions cumulus_library/studies/vocab/reference_sql/vocab_icd_builder.sql

This file was deleted.

2 changes: 0 additions & 2 deletions tests/conftest.py
@@ -35,8 +35,6 @@
"observation": [["id"], ["encounter", "reference"]],
"patient": [["id"]],
}
VOCAB_ICD_ROW_COUNT = 403230

# Utility functions


14 changes: 0 additions & 14 deletions tests/regression/run_regression.py
@@ -85,19 +85,5 @@ def regress_core():
print("✅ Core study reference and export matched ✅")


def regress_vocab():
export_path = f"{Path(__file__).resolve().parent}/data_export/vocab"
with open(f"{export_path}/vocab__icd.csv") as f:
export_size = len(f.readlines())
# this is the value of
if export_size != VOCAB_ICD_ROW_COUNT:
sys.exit(
f"❌ Vocab tables built from parquets are not expected length."
f" Found rows: {export_size} ❌"
)
print("✅ Vocab tables built from parquets are expected length ✅")


if __name__ == "__main__":
regress_vocab()
regress_core()
1 change: 0 additions & 1 deletion tests/test_cli.py
@@ -316,7 +316,6 @@ def test_clean(tmp_path, args, expected, raises):
3,
does_not_raise(),
),
(["build", "-t", "vocab"], None, 3, does_not_raise()),
(
# checking that a study is loaded from the directory of a user-defined
# path. we're also validating that the CLI accepts the statistics keyword
3 changes: 3 additions & 0 deletions tests/test_data/study_static_file/bsvs/ICD10CM_2023AA.bsv
@@ -0,0 +1,3 @@
C0000727|ICD10CM|PT|R10.0|Acute abdomen
C0000737|ICD10CM|PT|R10.9|Unspecified abdominal pain
C0000744|ICD10CM|ET|E78.6|Abetalipoproteinemia
3 changes: 3 additions & 0 deletions tests/test_data/study_static_file/bsvs/ICD10PCS_2023AA.bsv
@@ -0,0 +1,3 @@
C0005491|ICD10PCS|PT|GZC9ZZZ|Biofeedback
C0005491|ICD10PCS|PX|GZC9ZZZ|Mental Health @ None @ Biofeedback @ Other Biofeedback @ None @ None @ None
C0010332|ICD10PCS|PT|GZ2ZZZZ|Crisis Intervention
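
For context, each of the two fixture files above contributes three rows, which is where the `len(table_rows) == 6` assertion in the updated test comes from. A minimal sketch of reading them the same way the builder does (paths assume the repo root as the working directory):

```python
import pandas

# Same column names and delimiter the builder passes to read_csv.
headers = ["CUI", "TTY", "CODE", "SAB", "STR"]
fixtures = [
    "tests/test_data/study_static_file/bsvs/ICD10CM_2023AA.bsv",
    "tests/test_data/study_static_file/bsvs/ICD10PCS_2023AA.bsv",
]

combined = pandas.concat(
    pandas.read_csv(path, delimiter="|", names=headers) for path in fixtures
)
assert len(combined) == 6  # three rows from each fixture file
```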
11 changes: 11 additions & 0 deletions tests/test_data/study_static_file/manifest.toml
@@ -0,0 +1,11 @@
study_prefix = "study_static_file"

[table_builder_config]
file_names = [
"static_file_builder.py",
]

[export_config]
export_list = [
"study_static_file__table"
]
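
A minimal sketch of what this manifest declares, read with the standard library's `tomllib` (the real manifest parsing lives inside cumulus-library; only the field names shown above are relied on here). The diff immediately below is the `static_file_builder.py` this manifest points at, with its class renamed from `VocabIcdRunner` to `StaticFileBuilder`:

```python
import tomllib  # Python 3.11+

with open("tests/test_data/study_static_file/manifest.toml", "rb") as f:
    manifest = tomllib.load(f)

assert manifest["study_prefix"] == "study_static_file"
assert manifest["table_builder_config"]["file_names"] == ["static_file_builder.py"]
assert manifest["export_config"]["export_list"] == ["study_static_file__table"]
```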
@@ -8,7 +8,7 @@
from cumulus_library.template_sql import base_templates


class VocabIcdRunner(cumulus_library.BaseTableBuilder):
class StaticFileBuilder(cumulus_library.BaseTableBuilder):
display_text = "Creating ICD vocab..."

def prepare_queries(
@@ -26,19 +26,19 @@ def prepare_queries(
:param schema: the schema/db name, matching the cursor
"""

table_name = "vocab__icd"
table_name = "study_static_file__table"
path = pathlib.Path(__file__).parent
icd_files = path.glob("icd/*.bsv")
icd_files = path.glob("bsvs/*.bsv")
headers = ["CUI", "TTY", "CODE", "SAB", "STR"]
header_types = ["STRING", "STRING", "STRING", "STRING", "STRING"]
for file in icd_files:
parquet_path = path / f"icd/{file.stem}.parquet"
parquet_path = path / f"bsvs/{file.stem}.parquet"
df = pandas.read_csv(file, delimiter="|", names=headers)
df.to_parquet(parquet_path)
remote_path = config.db.upload_file(
file=parquet_path,
study="vocab",
topic="icd",
study="study_static_file",
topic="static_file",
remote_filename=f"{file.stem}.parquet",
force_upload=config.force_upload,
)
@@ -48,7 +48,7 @@ def prepare_queries(
base_templates.get_ctas_from_parquet_query(
schema_name=config.schema,
table_name=table_name,
local_location=path / "icd",
local_location=path / "bsvs",
remote_location=remote_path,
table_cols=headers,
remote_table_cols_types=header_types,
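
The builder's core flow is unchanged by the rename: each `.bsv` is read into a DataFrame, written out as parquet beside the source file, uploaded, and then referenced by a CREATE TABLE AS query from `base_templates.get_ctas_from_parquet_query`. A standalone sketch of just the local BSV-to-parquet step (the upload and query-generation calls are cumulus-library internals and are left out):

```python
import pathlib

import pandas

headers = ["CUI", "TTY", "CODE", "SAB", "STR"]
bsv_dir = pathlib.Path("tests/test_data/study_static_file/bsvs")

for bsv_file in sorted(bsv_dir.glob("*.bsv")):
    # Mirrors the read/write pattern in prepare_queries()
    df = pandas.read_csv(bsv_file, delimiter="|", names=headers)
    parquet_path = bsv_dir / f"{bsv_file.stem}.parquet"
    df.to_parquet(parquet_path)
    print(f"{parquet_path.name}: {len(df)} rows")
```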
24 changes: 5 additions & 19 deletions tests/test_vocab.py → tests/test_static_file.py
@@ -9,27 +9,13 @@
os.environ,
clear=True,
)
def test_vocab(tmp_path):
def test_static_file(tmp_path):
cli.main(
cli_args=conftest.duckdb_args(
[
"build",
"-t",
"core",
"-s",
"./tests/test_data",
"--database",
"test",
],
tmp_path,
)
)
cli.main(
cli_args=conftest.duckdb_args(
[
"build",
"-t",
"vocab",
"study_static_file",
"-s",
"./tests/test_data",
"--database",
@@ -40,10 +40,10 @@ def test_static_file(tmp_path):
)
db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
cursor = db.cursor()
table_rows, cols = conftest.get_sorted_table_data(cursor, "vocab__icd")
table_rows, cols = conftest.get_sorted_table_data(cursor, "study_static_file__table")
expected_cols = {"CUI", "TTY", "CODE", "SAB", "STR"}
found_cols = {col_schema[0] for col_schema in cols}
assert expected_cols == found_cols
assert len(table_rows) == conftest.VOCAB_ICD_ROW_COUNT
assert len(table_rows) == 6
assert table_rows[0] == ("C0000727", "ICD10CM", "PT", "R10.0", "Acute abdomen")
assert table_rows[-1] == ("C5700317", "ICD10CM", "HT", "M91.3", "Pseudocoxalgia")
assert table_rows[-1] == ("C0010332", "ICD10PCS", "PT", "GZ2ZZZZ", "Crisis Intervention")
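
To poke at the build output outside the test, a minimal sketch that opens the DuckDB file the build writes and checks the same table; the `duck.db` path is illustrative, since the test puts the database under pytest's `tmp_path`:

```python
import duckdb

# Illustrative path to a database produced by a build of study_static_file.
con = duckdb.connect("duck.db")
rows = con.execute("SELECT * FROM study_static_file__table").fetchall()
print(len(rows))  # expected: 6 (three rows from each fixture .bsv)
print(rows[0])    # e.g. ('C0000727', 'ICD10CM', 'PT', 'R10.0', 'Acute abdomen')
```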