Merge pull request #60 from monarch-initiative/change_python_req
Change python requirement to >=3.8
ielis authored Jan 22, 2024
2 parents c40b5ee + 85e3e58 commit 28a540c
Showing 8 changed files with 58 additions and 52 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -1,4 +1,8 @@
phenopackets/

# Cache files
.oncoexporter_cache/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -166,8 +170,6 @@ cython_debug/
c2p_env/
notebooks/Untitled.ipynb

\.*.pkl

*.pkl
*.tsv
.DS_Store
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -14,7 +14,7 @@ exclude = ["notebooks", "test"]
[project]
name = "oncoexporter"
dynamic = ["version"]
requires-python = ">=3.5"
requires-python = ">=3.8"
description = "Generate and work with GA4GH phenopackets for NCI"
readme = "README.md"
authors = [
@@ -36,12 +36,12 @@ keywords = [
dependencies = [
"hpo-toolkit>=0.3.0,<0.4.0",
"phenopackets>=2.0.2",
"requests",
"requests>=2.25.0,<3.0",
"cdapython@git+https://github.com/CancerDataAggregator/cda-python",
"tqdm"
"tqdm>=4.60"
]
[project.optional-dependencies]
test = ["pytest", "parameterized"]
test = ["pytest>=7.0.0,<8.0.0"]



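The requires-python bump from 3.5 to 3.8 means pip will refuse to install the package on older interpreters, and the new dependency pins are resolved against that floor. As a rough illustration only (not part of this commit) of what the ">=3.8" specifier admits, using the third-party packaging library:

# Illustration only, not part of this diff: how the ">=3.8" specifier behaves.
# Assumes the third-party "packaging" library is available.
import platform
from packaging.specifiers import SpecifierSet

requires_python = SpecifierSet(">=3.8")
here = platform.python_version()        # e.g. "3.11.4"
print(here, here in requires_python)    # True on 3.8+, False on 3.7 and older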
1 change: 0 additions & 1 deletion src/oncoexporter/cda/.gitignore

This file was deleted.

14 changes: 6 additions & 8 deletions src/oncoexporter/cda/cda_downloader.py
@@ -1,25 +1,25 @@
import csv
import os
import platform
from importlib.resources import files

import requests
import csv
import warnings
from importlib_resources import files


class CdaDownloader:

def __init__(self):
self.get_ncit_neoplasm_core()

def download_if_needed(self, overwrite_downloads:bool):
def download_if_needed(self, overwrite_downloads: bool):
local_dir = self.get_local_share_directory()
self._icdo_to_ncit_path = None
self.load_icdo_to_ncit_tsv(overwrite=overwrite_downloads, local_dir=local_dir)

def get_icdo_to_ncit_path(self):
return self._icdo_to_ncit_path

def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str):
def load_icdo_to_ncit_tsv(self, overwrite: bool, local_dir: str):
"""
Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder
:param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
@@ -42,8 +42,6 @@ def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str):
print(f"[INFO] Downloaded {icdo_to_ncit_map_url}")
self._icdo_to_ncit_path = icd_path



def get_ncit_neoplasm_core(self):
# Reads contents with UTF-8 encoding and returns str.
neo_core = files('oncoexporter.ncit_files').joinpath('Neoplasm_Core.csv').read_text()
@@ -57,4 +55,4 @@ def get_local_share_directory(self, local_dir=None):
if not os.path.exists(local_dir):
os.makedirs(local_dir)
print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
return local_dir
return local_dir
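The downloader now imports files from the standard-library importlib.resources rather than the importlib_resources backport. Note that importlib.resources.files() only appeared in Python 3.9, so a hedged sketch of reading a bundled resource, with a fallback for 3.8, looks like this (package and file names mirror get_ncit_neoplasm_core above):

# Sketch, not part of this diff: read a CSV that ships inside the package.
try:
    from importlib.resources import files   # standard library, Python >= 3.9
except ImportError:
    from importlib_resources import files   # backport for older interpreters

neo_core = files("oncoexporter.ncit_files").joinpath("Neoplasm_Core.csv").read_text(encoding="utf-8")
print(neo_core.splitlines()[0])              # first line of the bundled CSV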
72 changes: 39 additions & 33 deletions src/oncoexporter/cda/cda_table_importer.py
@@ -15,47 +15,55 @@


class CdaTableImporter(CdaImporter):
"""This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs to initialize it with a CDA query and it can return phenopackets with the **get_ga4gh_phenopackets** method. It also returns individual tables for that can be used for testing or visualizing data.
"""This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs
to initialize it with a CDA query, and it can return phenopackets with the :func:`get_ga4gh_phenopackets`.
It also returns individual tables for that can be used for testing or visualizing data.
The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data for this cohort in form of pandas DataFrames and extracts data for phenopacket construction using the data in the tables
The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data
for this cohort in the form of pandas DataFrames and extracts data for phenopacket construction using the data
in the tables.
:param cohort_name: A user-chosen name for the cohort
:type cohort_name: str
:param query: A query for CDA such as 'primary_diagnosis_site = "Lung"'
:type query: str
:param query_obj: a query object created by CDA. Note that either this argument or the query (string) argument must be provided, but not both.
:type query_obj: Q
:param query: a query object created by CDA. Note that either this argument or the query (string) argument must be provided, but not both.
:type query: Q
:param use_cache: if True, cache/retrieve from cache
:type use_cache: bool
:param cache_dir: a `str` with path to the folder to store the cache files
"""

def __init__(self, cohort_name: str, query: str = None, query_obj: Q = None,
use_cache=False):
"""Constructor
"""
if query is not None and query_obj is None:
self._query = Q(query)
elif query_obj is not None and query is None:
if not isinstance(query_obj, Q):
raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query_obj)}")
self._query = query_obj
else:
raise ValueError("Need to pass either query or query_obj argument but not both")
def __init__(self, cohort_name: str,
query: Q,
use_cache: bool = False,
cache_dir: typing.Optional[str] = None):
if not isinstance(query, Q):
raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query)}")
self._query = query

self._use_cache = use_cache
self._cohort_name = cohort_name

if cache_dir is None:
self._cache_dir = os.path.join(os.getcwd(), '.oncoexporter_cache')
if not os.path.isdir(self._cache_dir):
os.makedirs(self._cache_dir, exist_ok=True)
else:
if not os.path.isdir(cache_dir) or not os.access(cache_dir, os.W_OK):
raise ValueError(f'`cache_dir` must be a writable directory: {cache_dir}')

def _get_cda_df(self, callback_fxn, cache_name: str):
print(f"Retrieving dataframe {cache_name}")
if self._use_cache and os.path.isfile(cache_name):
with open(cache_name, 'rb') as cachehandle:
print(f"loading cached dataframe from {cache_name}")
fpath_cache = os.path.join(self._cache_dir, cache_name)
print(f"Retrieving dataframe {fpath_cache}")
if self._use_cache and os.path.isfile(fpath_cache):
with open(fpath_cache, 'rb') as cachehandle:
print(f"loading cached dataframe from {fpath_cache}")
individual_df = pickle.load(cachehandle)
else:
print(f"calling CDA function")
individual_df = callback_fxn()
if self._use_cache:
print(f"Creating cached dataframe as {cache_name}")
with open(cache_name, 'wb') as f:
print(f"Creating cached dataframe as {fpath_cache}")
with open(fpath_cache, 'wb') as f:
pickle.dump(individual_df, f)
return individual_df

@@ -72,10 +80,8 @@ def get_subject_df(self, page_size=10000) -> pd.DataFrame:
:returns: pandas DataFrame that corresponds to the CDA subject table.
:rtype: pd.DataFrame
"""
if self._query is None:
raise Exception(f"Could not retrieve subject dataframe because query object was None")
callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe();
subject_df = self._get_cda_df(callable, f".{self._cohort_name}_individual_df.pkl");
callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe()
subject_df = self._get_cda_df(callable, f"{self._cohort_name}_individual_df.pkl")
return subject_df


@@ -86,10 +92,10 @@ def get_merged_diagnosis_research_subject_df(self, page_size=10000) -> pd.DataFrame:
:type page_size: int
"""
diagnosis_callable = lambda: self._query.diagnosis.run(page_size=page_size).get_all().to_dataframe()
diagnosis_df = self._get_cda_df(diagnosis_callable, f".{self._cohort_name}_diagnosis_df.pkl")
diagnosis_df = self._get_cda_df(diagnosis_callable, f"{self._cohort_name}_diagnosis_df.pkl")
print("obtained diagnosis_df")
rsub_callable = lambda: self._query.researchsubject.run(page_size=page_size).get_all().to_dataframe()
rsub_df = self._get_cda_df(rsub_callable, f".{self._cohort_name}_rsub_df.pkl")
rsub_df = self._get_cda_df(rsub_callable, f"{self._cohort_name}_rsub_df.pkl")
print("obtained rsub_df")
merged_df = pd.merge(diagnosis_df, rsub_df, left_on='researchsubject_id', right_on='researchsubject_id',
suffixes=["_di", "_rs"])
@@ -108,17 +114,17 @@ def get_specimen_df(self, page_size=10000) -> pd.DataFrame:
:rtype: pd.DataFrame
"""
specimen_callable = lambda: self._query.specimen.run(page_size=page_size).get_all().to_dataframe()
specimen_df = self._get_cda_df(specimen_callable, f".{self._cohort_name}_specimen_df.pkl")
specimen_df = self._get_cda_df(specimen_callable, f"{self._cohort_name}_specimen_df.pkl")
return specimen_df

def get_treatment_df(self, page_size=10000) -> pd.DataFrame:
treatment_callable = lambda: self._query.treatment.run(page_size=page_size).get_all().to_dataframe()
treatment_df = self._get_cda_df(treatment_callable, f".{self._cohort_name}_treatment_df.pkl")
treatment_df = self._get_cda_df(treatment_callable, f"{self._cohort_name}_treatment_df.pkl")
return treatment_df

def get_mutation_df(self, page_size=10000) -> pd.DataFrame:
mutation_callable = lambda: self._query.mutation.run(page_size=page_size).get_all().to_dataframe()
mutation_df = self._get_cda_df(mutation_callable, f".{self._cohort_name}_mutation_df.pkl")
mutation_df = self._get_cda_df(mutation_callable, f"{self._cohort_name}_mutation_df.pkl")
return mutation_df


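Cache files are now written as plain pickles under the cache directory (the .oncoexporter_cache/ folder that the .gitignore change above excludes), so forcing a fresh CDA query is just a matter of deleting them. A minimal sketch, assuming the default cache location:

# Sketch: remove cached dataframes so the next run re-queries CDA.
import glob
import os

cache_dir = os.path.join(os.getcwd(), ".oncoexporter_cache")   # default used by CdaTableImporter
for pkl in glob.glob(os.path.join(cache_dir, "*.pkl")):
    os.remove(pkl)
    print(f"removed {pkl}")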
2 changes: 1 addition & 1 deletion src/run_cervix.py
@@ -5,7 +5,7 @@
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"', )
table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Cervix')
table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Cervix')
p = table_importer.get_ga4gh_phenopackets(page_size=10000)

print("Created {} phenopackets".format(len(p)))
2 changes: 1 addition & 1 deletion src/run_lung.py
@@ -5,7 +5,7 @@
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%lung%" OR primary_diagnosis_site = "%pulmonary%"')
table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Lung')
table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Lung')
p = table_importer.get_ga4gh_phenopackets(page_size=10000)

print("Created {} phenopackets".format(len(p)))
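Both run scripts now pass the Q object through the renamed query parameter instead of query_obj. A slightly fuller usage sketch of the reworked constructor, assuming Q is imported from cdapython and including the optional cache_dir argument added above (query strings are illustrative):

# Sketch of the new CdaTableImporter API; mirrors run_lung.py / run_cervix.py.
from cdapython import Q
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%lung%"')
importer = CdaTableImporter(cohort_name="Lung", query=Tsite,
                            use_cache=True, cache_dir=None)   # None -> ./.oncoexporter_cache
subject_df = importer.get_subject_df(page_size=10000)
phenopackets = importer.get_ga4gh_phenopackets(page_size=10000)
print("Created {} phenopackets".format(len(phenopackets)))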
5 changes: 3 additions & 2 deletions tests/test_cda_disease_factory.py
@@ -1,8 +1,9 @@
import unittest
import os
import pandas as pd
from parameterized import parameterized
import phenopackets as PPKt
import pytest

from oncoexporter.cda import CdaDiseaseFactory


@@ -38,7 +39,7 @@ def test_disease_stage_is_ontology_term(self):
self.assertEqual(self.disease_objs['s1'].disease_stage[0].__class__,
PPKt.OntologyClass)

@parameterized.expand([
@pytest.mark.parametrize('subject_id, expected_ncit_ontology', [
('s1', STAGE_IA), # IA
('s2', STAGE_IB), # IB
('s3', STAGE_IIA), # IIA
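The stage test switches from parameterized.expand to pytest.mark.parametrize, matching the removal of the parameterized dependency from the test extra in pyproject.toml. Because pytest's parametrize mark is not supported on unittest.TestCase methods, here is a sketch of the pattern as a plain pytest function; the disease_objs fixture and the assertion body are placeholders standing in for this module's elided setup:

# Sketch only; disease_objs and the STAGE_* constants stand in for fixtures defined in this test module.
import pytest

@pytest.mark.parametrize("subject_id, expected_ncit_ontology", [
    ("s1", STAGE_IA),
    ("s2", STAGE_IB),
    ("s3", STAGE_IIA),
])
def test_disease_stage(subject_id, expected_ncit_ontology, disease_objs):
    assert disease_objs[subject_id].disease_stage[0] == expected_ncit_ontology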
