Merge pull request #60 from monarch-initiative/change_python_req
Change python requirement to >=3.8
ielis authored Jan 22, 2024
2 parents c40b5ee + 85e3e58 commit 28a540c
Showing 8 changed files with 58 additions and 52 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -1,4 +1,8 @@
phenopackets/

# Cache files
.oncoexporter_cache/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -166,8 +170,6 @@ cython_debug/
c2p_env/
notebooks/Untitled.ipynb

\.*.pkl

*.pkl
*.tsv
.DS_Store
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -14,7 +14,7 @@ exclude = ["notebooks", "test"]
[project]
name = "oncoexporter"
dynamic = ["version"]
requires-python = ">=3.5"
requires-python = ">=3.8"
description = "Generate and work with GA4GH phenopackets for NCI"
readme = "README.md"
authors = [
@@ -36,12 +36,12 @@ keywords = [
dependencies = [
"hpo-toolkit>=0.3.0,<0.4.0",
"phenopackets>=2.0.2",
"requests",
"requests>=2.25.0,<3.0",
"cdapython@git+https://github.com/CancerDataAggregator/cda-python",
"tqdm"
"tqdm>=4.60"
]
[project.optional-dependencies]
test = ["pytest", "parameterized"]
test = ["pytest>=7.0.0,<8.0.0"]



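The requires-python bump from 3.5 to 3.8 means pip will refuse to install the package on older interpreters, and the new dependency pins are resolved against that floor. As a rough illustration only (not part of this commit) of what the ">=3.8" specifier admits, using the third-party packaging library:

# Illustration only, not part of this diff: how the ">=3.8" specifier behaves.
# Assumes the third-party "packaging" library is available.
import platform
from packaging.specifiers import SpecifierSet

requires_python = SpecifierSet(">=3.8")
here = platform.python_version()        # e.g. "3.11.4"
print(here, here in requires_python)    # True on 3.8+, False on 3.7 and older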
1 change: 0 additions & 1 deletion src/oncoexporter/cda/.gitignore

This file was deleted.

14 changes: 6 additions & 8 deletions src/oncoexporter/cda/cda_downloader.py
@@ -1,25 +1,25 @@
import csv
import os
import platform
from importlib.resources import files

import requests
import csv
import warnings
from importlib_resources import files


class CdaDownloader:

def __init__(self):
self.get_ncit_neoplasm_core()

def download_if_needed(self, overwrite_downloads:bool):
def download_if_needed(self, overwrite_downloads: bool):
local_dir = self.get_local_share_directory()
self._icdo_to_ncit_path = None
self.load_icdo_to_ncit_tsv(overwrite=overwrite_downloads, local_dir=local_dir)

def get_icdo_to_ncit_path(self):
return self._icdo_to_ncit_path

def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str):
def load_icdo_to_ncit_tsv(self, overwrite: bool, local_dir: str):
"""
Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder
:param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
@@ -42,8 +42,6 @@ def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str):
print(f"[INFO] Downloaded {icdo_to_ncit_map_url}")
self._icdo_to_ncit_path = icd_path



def get_ncit_neoplasm_core(self):
# Reads contents with UTF-8 encoding and returns str.
neo_core = files('oncoexporter.ncit_files').joinpath('Neoplasm_Core.csv').read_text()
@@ -57,4 +55,4 @@ def get_local_share_directory(self, local_dir=None):
if not os.path.exists(local_dir):
os.makedirs(local_dir)
print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
return local_dir
return local_dir
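The downloader now imports files from the standard-library importlib.resources rather than the importlib_resources backport. Note that importlib.resources.files() only appeared in Python 3.9, so a hedged sketch of reading a bundled resource, with a fallback for 3.8, looks like this (package and file names mirror get_ncit_neoplasm_core above):

# Sketch, not part of this diff: read a CSV that ships inside the package.
try:
    from importlib.resources import files   # standard library, Python >= 3.9
except ImportError:
    from importlib_resources import files   # backport for older interpreters

neo_core = files("oncoexporter.ncit_files").joinpath("Neoplasm_Core.csv").read_text(encoding="utf-8")
print(neo_core.splitlines()[0])              # first line of the bundled CSV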
72 changes: 39 additions & 33 deletions src/oncoexporter/cda/cda_table_importer.py
@@ -15,47 +15,55 @@


class CdaTableImporter(CdaImporter):
"""This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs to initialize it with a CDA query and it can return phenopackets with the **get_ga4gh_phenopackets** method. It also returns individual tables for that can be used for testing or visualizing data.
"""This class is the entry point for transforming CDA data into GA4GH Phenopackets. Client code only needs
to initialize it with a CDA query, and it can return phenopackets with the :func:`get_ga4gh_phenopackets`.
It also returns individual tables for that can be used for testing or visualizing data.
The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data for this cohort in form of pandas DataFrames and extracts data for phenopacket construction using the data in the tables
The CDA query determines the cohort that will be retrieved from CDA. This class then retrieves data
for this cohort in the form of pandas DataFrames and extracts data for phenopacket construction using the data
in the tables.
:param cohort_name: A user-chosen name for the cohort
:type cohort_name: str
:param query: A query for CDA such as 'primary_diagnosis_site = "Lung"'
:type query: str
:param query_obj: a query object created by CDA. Note that either this argument or the query (string) argument must be provided, but not both.
:type query_obj: Q
:param query: a query object created by CDA. Note that either this argument or the query (string) argument must be provided, but not both.
:type query: Q
:param use_cache: if True, cache/retrieve from cache
:type use_cache: bool
:param cache_dir: a `str` with path to the folder to store the cache files
"""

def __init__(self, cohort_name: str, query: str = None, query_obj: Q = None,
use_cache=False):
"""Constructor
"""
if query is not None and query_obj is None:
self._query = Q(query)
elif query_obj is not None and query is None:
if not isinstance(query_obj, Q):
raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query_obj)}")
self._query = query_obj
else:
raise ValueError("Need to pass either query or query_obj argument but not both")
def __init__(self, cohort_name: str,
query: Q,
use_cache: bool = False,
cache_dir: typing.Optional[str] = None):
if not isinstance(query, Q):
raise ValueError(f"query_obj argument must be Q.Q object, but instead was {type(query)}")
self._query = query

self._use_cache = use_cache
self._cohort_name = cohort_name

if cache_dir is None:
self._cache_dir = os.path.join(os.getcwd(), '.oncoexporter_cache')
if not os.path.isdir(self._cache_dir):
os.makedirs(self._cache_dir, exist_ok=True)
else:
if not os.path.isdir(cache_dir) or not os.access(cache_dir, os.W_OK):
raise ValueError(f'`cache_dir` must be a writable directory: {cache_dir}')

def _get_cda_df(self, callback_fxn, cache_name: str):
print(f"Retrieving dataframe {cache_name}")
if self._use_cache and os.path.isfile(cache_name):
with open(cache_name, 'rb') as cachehandle:
print(f"loading cached dataframe from {cache_name}")
fpath_cache = os.path.join(self._cache_dir, cache_name)
print(f"Retrieving dataframe {fpath_cache}")
if self._use_cache and os.path.isfile(fpath_cache):
with open(fpath_cache, 'rb') as cachehandle:
print(f"loading cached dataframe from {fpath_cache}")
individual_df = pickle.load(cachehandle)
else:
print(f"calling CDA function")
individual_df = callback_fxn()
if self._use_cache:
print(f"Creating cached dataframe as {cache_name}")
with open(cache_name, 'wb') as f:
print(f"Creating cached dataframe as {fpath_cache}")
with open(fpath_cache, 'wb') as f:
pickle.dump(individual_df, f)
return individual_df

@@ -72,10 +80,8 @@ def get_subject_df(self, page_size=10000) -> pd.DataFrame:
:returns: pandas DataFrame that corresponds to the CDA subject table.
:rtype: pd.DataFrame
"""
if self._query is None:
raise Exception(f"Could not retrieve subject dataframe because query object was None")
callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe();
subject_df = self._get_cda_df(callable, f".{self._cohort_name}_individual_df.pkl");
callable = lambda: self._query.subject.run(page_size=page_size).get_all().to_dataframe()
subject_df = self._get_cda_df(callable, f"{self._cohort_name}_individual_df.pkl")
return subject_df


@@ -86,10 +92,10 @@ def get_merged_diagnosis_research_subject_df(self, page_size=10000) -> pd.DataFrame:
:type page_size: int
"""
diagnosis_callable = lambda: self._query.diagnosis.run(page_size=page_size).get_all().to_dataframe()
diagnosis_df = self._get_cda_df(diagnosis_callable, f".{self._cohort_name}_diagnosis_df.pkl")
diagnosis_df = self._get_cda_df(diagnosis_callable, f"{self._cohort_name}_diagnosis_df.pkl")
print("obtained diagnosis_df")
rsub_callable = lambda: self._query.researchsubject.run(page_size=page_size).get_all().to_dataframe()
rsub_df = self._get_cda_df(rsub_callable, f".{self._cohort_name}_rsub_df.pkl")
rsub_df = self._get_cda_df(rsub_callable, f"{self._cohort_name}_rsub_df.pkl")
print("obtained rsub_df")
merged_df = pd.merge(diagnosis_df, rsub_df, left_on='researchsubject_id', right_on='researchsubject_id',
suffixes=["_di", "_rs"])
@@ -108,17 +114,17 @@ def get_specimen_df(self, page_size=10000) -> pd.DataFrame:
:rtype: pd.DataFrame
"""
specimen_callable = lambda: self._query.specimen.run(page_size=page_size).get_all().to_dataframe()
specimen_df = self._get_cda_df(specimen_callable, f".{self._cohort_name}_specimen_df.pkl")
specimen_df = self._get_cda_df(specimen_callable, f"{self._cohort_name}_specimen_df.pkl")
return specimen_df

def get_treatment_df(self, page_size=10000) -> pd.DataFrame:
treatment_callable = lambda: self._query.treatment.run(page_size=page_size).get_all().to_dataframe()
treatment_df = self._get_cda_df(treatment_callable, f".{self._cohort_name}_treatment_df.pkl")
treatment_df = self._get_cda_df(treatment_callable, f"{self._cohort_name}_treatment_df.pkl")
return treatment_df

def get_mutation_df(self, page_size=10000) -> pd.DataFrame:
mutation_callable = lambda: self._query.mutation.run(page_size=page_size).get_all().to_dataframe()
mutation_df = self._get_cda_df(mutation_callable, f".{self._cohort_name}_mutation_df.pkl")
mutation_df = self._get_cda_df(mutation_callable, f"{self._cohort_name}_mutation_df.pkl")
return mutation_df


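Cache files are now written as plain pickles under the cache directory (the .oncoexporter_cache/ folder that the .gitignore change above excludes), so forcing a fresh CDA query is just a matter of deleting them. A minimal sketch, assuming the default cache location:

# Sketch: remove cached dataframes so the next run re-queries CDA.
import glob
import os

cache_dir = os.path.join(os.getcwd(), ".oncoexporter_cache")   # default used by CdaTableImporter
for pkl in glob.glob(os.path.join(cache_dir, "*.pkl")):
    os.remove(pkl)
    print(f"removed {pkl}")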
2 changes: 1 addition & 1 deletion src/run_cervix.py
@@ -5,7 +5,7 @@
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"', )
table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Cervix')
table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Cervix')
p = table_importer.get_ga4gh_phenopackets(page_size=10000)

print("Created {} phenopackets".format(len(p)))
2 changes: 1 addition & 1 deletion src/run_lung.py
@@ -5,7 +5,7 @@
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%lung%" OR primary_diagnosis_site = "%pulmonary%"')
table_importer = CdaTableImporter(query_obj=Tsite, use_cache=True, cohort_name='Lung')
table_importer = CdaTableImporter(query=Tsite, use_cache=True, cohort_name='Lung')
p = table_importer.get_ga4gh_phenopackets(page_size=10000)

print("Created {} phenopackets".format(len(p)))
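Both run scripts now pass the Q object through the renamed query parameter instead of query_obj. A slightly fuller usage sketch of the reworked constructor, assuming Q is imported from cdapython and including the optional cache_dir argument added above (query strings are illustrative):

# Sketch of the new CdaTableImporter API; mirrors run_lung.py / run_cervix.py.
from cdapython import Q
from oncoexporter.cda import CdaTableImporter

Tsite = Q('primary_diagnosis_site = "%lung%"')
importer = CdaTableImporter(cohort_name="Lung", query=Tsite,
                            use_cache=True, cache_dir=None)   # None -> ./.oncoexporter_cache
subject_df = importer.get_subject_df(page_size=10000)
phenopackets = importer.get_ga4gh_phenopackets(page_size=10000)
print("Created {} phenopackets".format(len(phenopackets)))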
5 changes: 3 additions & 2 deletions tests/test_cda_disease_factory.py
@@ -1,8 +1,9 @@
import unittest
import os
import pandas as pd
from parameterized import parameterized
import phenopackets as PPKt
import pytest

from oncoexporter.cda import CdaDiseaseFactory


@@ -38,7 +39,7 @@ def test_disease_stage_is_ontology_term(self):
self.assertEqual(self.disease_objs['s1'].disease_stage[0].__class__,
PPKt.OntologyClass)

@parameterized.expand([
@pytest.mark.parametrize('subject_id, expected_ncit_ontology', [
('s1', STAGE_IA), # IA
('s2', STAGE_IB), # IB
('s3', STAGE_IIA), # IIA
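The stage test switches from parameterized.expand to pytest.mark.parametrize, matching the removal of the parameterized dependency from the test extra in pyproject.toml. Because pytest's parametrize mark is not supported on unittest.TestCase methods, here is a sketch of the pattern as a plain pytest function; the disease_objs fixture and the assertion body are placeholders standing in for this module's elided setup:

# Sketch only; disease_objs and the STAGE_* constants stand in for fixtures defined in this test module.
import pytest

@pytest.mark.parametrize("subject_id, expected_ncit_ontology", [
    ("s1", STAGE_IA),
    ("s2", STAGE_IB),
    ("s3", STAGE_IIA),
])
def test_disease_stage(subject_id, expected_ncit_ontology, disease_objs):
    assert disease_objs[subject_id].disease_stage[0] == expected_ncit_ontology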
