diff --git a/.gitignore b/.gitignore index 8c97b3c..b74ff93 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ phenopackets/ + +# Cache files +.oncoexporter_cache/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -166,8 +170,6 @@ cython_debug/ c2p_env/ notebooks/Untitled.ipynb -\.*.pkl - *.pkl *.tsv .DS_Store diff --git a/pyproject.toml b/pyproject.toml index b5a9ce0..c0581ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ exclude = ["notebooks", "test"] [project] name = "oncoexporter" dynamic = ["version"] -requires-python = ">=3.5" +requires-python = ">=3.8" description = "Generate and work with GA4GH phenopackets for NCI" readme = "README.md" authors = [ @@ -36,12 +36,12 @@ keywords = [ dependencies = [ "hpo-toolkit>=0.3.0,<0.4.0", "phenopackets>=2.0.2", - "requests", + "requests>=2.25.0,<3.0", "cdapython@git+https://github.com/CancerDataAggregator/cda-python", - "tqdm" + "tqdm>=4.60" ] [project.optional-dependencies] -test = ["pytest", "parameterized"] +test = ["pytest>=7.0.0,<8.0.0"] diff --git a/src/oncoexporter/cda/.gitignore b/src/oncoexporter/cda/.gitignore deleted file mode 100644 index f2a723b..0000000 --- a/src/oncoexporter/cda/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.pkl \ No newline at end of file diff --git a/src/oncoexporter/cda/cda_downloader.py b/src/oncoexporter/cda/cda_downloader.py index 53b6a5e..3cda931 100644 --- a/src/oncoexporter/cda/cda_downloader.py +++ b/src/oncoexporter/cda/cda_downloader.py @@ -1,9 +1,9 @@ +import csv import os import platform +from importlib.resources import files + import requests -import csv -import warnings -from importlib_resources import files class CdaDownloader: @@ -11,7 +11,7 @@ class CdaDownloader: def __init__(self): self.get_ncit_neoplasm_core() - def download_if_needed(self, overwrite_downloads:bool): + def download_if_needed(self, overwrite_downloads: bool): local_dir = self.get_local_share_directory() self._icdo_to_ncit_path = None 
self.load_icdo_to_ncit_tsv(overwrite=overwrite_downloads, local_dir=local_dir) @@ -19,7 +19,7 @@ def download_if_needed(self, overwrite_downloads:bool): def get_icdo_to_ncit_path(self): return self._icdo_to_ncit_path - def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str): + def load_icdo_to_ncit_tsv(self, overwrite: bool, local_dir: str): """ Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder :param overwrite: whether to overwrite an existing file (otherwise we skip downloading) @@ -42,8 +42,6 @@ def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str): print(f"[INFO] Downloaded {icdo_to_ncit_map_url}") self._icdo_to_ncit_path = icd_path - - def get_ncit_neoplasm_core(self): # Reads contents with UTF-8 encoding and returns str. neo_core = files('oncoexporter.ncit_files').joinpath('Neoplasm_Core.csv').read_text() @@ -57,4 +55,4 @@ def get_local_share_directory(self, local_dir=None): if not os.path.exists(local_dir): os.makedirs(local_dir) print(f"[INFO] Created new directory for oncoexporter at {local_dir}") - return local_dir \ No newline at end of file + return local_dir diff --git a/src/oncoexporter/cda/cda_table_importer.py b/src/oncoexporter/cda/cda_table_importer.py index 2eb1f72..a3419a4 100644 --- a/src/oncoexporter/cda/cda_table_importer.py +++ b/src/oncoexporter/cda/cda_table_importer.py @@ -15,47 +15,55 @@ class CdaTableImporter(CdaImporter): - """This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs to initialize it with a CDA query and it can return phenopackets with the **get_ga4gh_phenopackets** method. It also returns individual tables for that can be used for testing or visualizing data. + """This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs + to initialize it with a CDA query, and it can return phenopackets with the :func:`get_ga4gh_phenopackets`. 
+ It also returns individual tables that can be used for testing or visualizing data. - The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data for this cohort in form of pandas DataFrames and extracts data for phenopacket construction using the data in the tables + The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data + for this cohort in the form of pandas DataFrames and extracts data for phenopacket construction using the data + in the tables. :param cohort_name: A user-chosen name for the cohort :type cohort_name: str - :param query: A query for CDA such as 'primary_diagnosis_site = "Lung"' - :type query: str - :param query_obj: a query object created by CDA. Note that either this argument or the query (string) argument must be provided, but not both. - :type query_obj: Q + :param query: a query object created by CDA. It must be a `Q` instance; a plain query string is no longer accepted.
+ :type query: Q :param use_cache: if True, cache/retrieve from cache :type use_cache: bool + :param cache_dir: a `str` with path to the folder to store the cache files """ - def __init__(self, cohort_name: str, query: str = None, query_obj: Q = None, - use_cache=False): - """Constructor - """ - if query is not None and query_obj is None: - self._query = Q(query) - elif query_obj is not None and query is None: - if not isinstance(query_obj, Q): - raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query_obj)}") - self._query = query_obj - else: - raise ValueError("Need to pass either query or query_obj argument but not both") + def __init__(self, cohort_name: str, + query: Q, + use_cache: bool = False, + cache_dir: typing.Optional[str] = None): + if not isinstance(query, Q): + raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query)}") + self._query = query + self._use_cache = use_cache self._cohort_name = cohort_name + if cache_dir is None: + self._cache_dir = os.path.join(os.getcwd(), '.oncoexporter_cache') + if not os.path.isdir(self._cache_dir): + os.makedirs(self._cache_dir, exist_ok=True) + else: + if not os.path.isdir(cache_dir) or not os.access(cache_dir, os.W_OK): + raise ValueError(f'`cache_dir` must be a writable directory: {cache_dir}') + def _get_cda_df(self, callback_fxn, cache_name: str): - print(f"Retrieving dataframe {cache_name}") - if self._use_cache and os.path.isfile(cache_name): - with open(cache_name, 'rb') as cachehandle: - print(f"loading cached dataframe from {cache_name}") + fpath_cache = os.path.join(self._cache_dir, cache_name) + print(f"Retrieving dataframe {fpath_cache}") + if self._use_cache and os.path.isfile(fpath_cache): + with open(fpath_cache, 'rb') as cachehandle: + print(f"loading cached dataframe from {fpath_cache}") individual_df = pickle.load(cachehandle) else: print(f"calling CDA function") individual_df = callback_fxn() if self._use_cache: - print(f"Creating cached 
dataframe as {cache_name}") - with open(cache_name, 'wb') as f: + print(f"Creating cached dataframe as {fpath_cache}") + with open(fpath_cache, 'wb') as f: pickle.dump(individual_df, f) return individual_df @@ -72,10 +80,8 @@ def get_subject_df(self, page_size=10000) -> pd.DataFrame: :returns: pandas DataFrame that corresponds to the CDA subject table. :rtype: pd.DataFrame """ - if self._query is None: - raise Exception(f"Could not retrieve subject dataframe because query object was None") - callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe(); - subject_df = self._get_cda_df(callable, f".{self._cohort_name}_individual_df.pkl"); + callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe() + subject_df = self._get_cda_df(callable, f"{self._cohort_name}_individual_df.pkl") return subject_df @@ -86,10 +92,10 @@ def get_merged_diagnosis_research_subject_df(self, page_size=10000) -> pd.DataFr :type page_size: int """ diagnosis_callable = lambda: self._query.diagnosis.run(page_size=page_size).get_all().to_dataframe() - diagnosis_df = self._get_cda_df(diagnosis_callable, f".{self._cohort_name}_diagnosis_df.pkl") + diagnosis_df = self._get_cda_df(diagnosis_callable, f"{self._cohort_name}_diagnosis_df.pkl") print("obtained diagnosis_df") rsub_callable = lambda: self._query.researchsubject.run(page_size=page_size).get_all().to_dataframe() - rsub_df = self._get_cda_df(rsub_callable, f".{self._cohort_name}_rsub_df.pkl") + rsub_df = self._get_cda_df(rsub_callable, f"{self._cohort_name}_rsub_df.pkl") print("obtained rsub_df") merged_df = pd.merge(diagnosis_df, rsub_df, left_on='researchsubject_id', right_on='researchsubject_id', suffixes=["_di", "_rs"]) @@ -108,17 +114,17 @@ def get_specimen_df(self, page_size=10000) -> pd.DataFrame: :rtype: pd.DataFrame """ specimen_callable = lambda: self._query.specimen.run(page_size=page_size).get_all().to_dataframe() - specimen_df = self._get_cda_df(specimen_callable, 
f".{self._cohort_name}_specimen_df.pkl") + specimen_df = self._get_cda_df(specimen_callable, f"{self._cohort_name}_specimen_df.pkl") return specimen_df def get_treatment_df(self, page_size=10000) -> pd.DataFrame: treatment_callable = lambda: self._query.treatment.run(page_size=page_size).get_all().to_dataframe() - treatment_df = self._get_cda_df(treatment_callable, f".{self._cohort_name}_treatment_df.pkl") + treatment_df = self._get_cda_df(treatment_callable, f"{self._cohort_name}_treatment_df.pkl") return treatment_df def get_mutation_df(self, page_size=10000) -> pd.DataFrame: mutation_callable = lambda: self._query.mutation.run(page_size=page_size).get_all().to_dataframe() - mutation_df = self._get_cda_df(mutation_callable, f".{self._cohort_name}_mutation_df.pkl") + mutation_df = self._get_cda_df(mutation_callable, f"{self._cohort_name}_mutation_df.pkl") return mutation_df diff --git a/src/run_cervix.py b/src/run_cervix.py index 9d56d23..119d4d2 100644 --- a/src/run_cervix.py +++ b/src/run_cervix.py @@ -5,7 +5,7 @@ from oncoexporter.cda import CdaTableImporter Tsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"', ) -table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Cervix') +table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Cervix') p = table_importer.get_ga4gh_phenopackets(page_size=10000) print("Created {} phenopackets".format(len(p))) diff --git a/src/run_lung.py b/src/run_lung.py index 68f7b38..8857658 100644 --- a/src/run_lung.py +++ b/src/run_lung.py @@ -5,7 +5,7 @@ from oncoexporter.cda import CdaTableImporter Tsite = Q('primary_diagnosis_site = "%lung%" OR primary_diagnosis_site = "%pulmonary%"') -table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Lung') +table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Lung') p = table_importer.get_ga4gh_phenopackets(page_size=10000) print("Created {} phenopackets".format(len(p))) 
diff --git a/tests/test_cda_disease_factory.py b/tests/test_cda_disease_factory.py index 0285f0e..d013673 100644 --- a/tests/test_cda_disease_factory.py +++ b/tests/test_cda_disease_factory.py @@ -1,8 +1,9 @@ import unittest import os import pandas as pd -from parameterized import parameterized import phenopackets as PPKt +import pytest + from oncoexporter.cda import CdaDiseaseFactory @@ -38,7 +39,7 @@ def test_disease_stage_is_ontology_term(self): self.assertEqual(self.disease_objs['s1'].disease_stage[0].__class__, PPKt.OntologyClass) - @parameterized.expand([ + @pytest.mark.parametrize('subject_id, expected_ncit_ontology', [ ('s1', STAGE_IA), # IA ('s2', STAGE_IB), # IB ('s3', STAGE_IIA), # IIA