Added tests
Nicolai-vKuegelgen committed Mar 11, 2024
1 parent 1bb597b commit b5fe121
Showing 4 changed files with 224 additions and 25 deletions.
25 changes: 14 additions & 11 deletions cubi_tk/irods_common.py
@@ -295,7 +295,7 @@ def __init__(
:param irods_env_path: Path to irods_environment.json
:type irods_env_path: pathlib.Path, optional
"""
super.__init__(ask, irods_env_path)
super().__init__(ask, irods_env_path)
self.hash_scheme = hash_scheme

def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDataObject]]:
@@ -308,15 +308,15 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDa
"""

# Connect to iRODS
with self.session as irods_session:
with self.session as session:
try:
root_coll = irods_session.collections.get(irods_path)
root_coll = session.collections.get(irods_path)

# Get files and run checks
logger.info("Querying for data objects")

if root_coll is not None:
irods_data_objs = self.get_data_objs(root_coll)
irods_data_objs = self._irods_query(session, root_coll)
irods_obj_dict = self.parse_irods_collection(irods_data_objs)
return irods_obj_dict

@@ -326,24 +326,27 @@ def retrieve_irods_data_objects(self, irods_path: str) -> Dict[str, List[iRODSDa

return {}

def get_data_objs(
self, root_coll: iRODSCollection
def _irods_query(
self,
session: iRODSSession,
root_coll: iRODSCollection,
) -> Dict[str, Union[Dict[str, iRODSDataObject], List[iRODSDataObject]]]:
"""Get data objects recursively under the given iRODS path."""
data_objs = dict(files=[], checksums={})

ignore_schemes = [k.lower() for k in HASH_SCHEMES if k != self.hash_scheme.upper()]
irods_sess = root_coll.manager.sess

query = irods_sess.query(DataObjectModel, CollectionModel).filter(
query = session.query(DataObjectModel, CollectionModel).filter(
Like(CollectionModel.name, f"{root_coll.path}%")
)

data_objs = dict(files=[], checksums={})
for res in query:
# If the 'res' dict is not split into Colllection&Object the resulting iRODSDataObject is not fully functional, likely because a name/path/... attribute is overwritten somewhere
# If the 'res' dict is not split into Colllection&Object the resulting iRODSDataObject is not fully functional,
# likely because a name/path/... attribute is overwritten somewhere
coll_res = {k: v for k, v in res.items() if k.icat_id >= 500}
obj_res = {k: v for k, v in res.items() if k.icat_id < 500}
coll = iRODSCollection(root_coll.manager, coll_res)
obj = iRODSDataObject(irods_sess.data_objects, parent=coll, results=[obj_res])
obj = iRODSDataObject(session.data_objects, parent=coll, results=[obj_res])

if obj.path.endswith("." + self.hash_scheme.lower()):
data_objs["checksums"][obj.path] = obj
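For orientation, a minimal usage sketch of the refactored class (not part of this commit): it assumes a valid irods_environment.json and a reachable iRODS server, whereas the tests further below mock the session setup instead; the iRODS path is a placeholder.

# Hedged usage sketch, not code from this commit; the path is fake and a
# configured irods_environment.json plus a reachable server are assumed.
from cubi_tk.irods_common import iRODSRetrieveCollection

retriever = iRODSRetrieveCollection()
objs_by_name = retriever.retrieve_irods_data_objects("/zone/project/assay")

# The result maps file names to lists of iRODSDataObject instances found
# anywhere below the given collection; checksum side-car files are tracked
# separately and do not appear here.
for filename, objs in objs_by_name.items():
    print(filename, [obj.path for obj in objs])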
29 changes: 16 additions & 13 deletions cubi_tk/sodar/pull_data_collection.py
@@ -26,7 +26,9 @@ class PullDataCollection(PullDataCommon):
"dragen": [
"**/*_FAM_dragen.fam.hard-filtered.vcf.gz"
"**/*_FAM_dragen.fam.hard-filtered.vcf.gz.tbi",
"**/*dragen.qc-coverage*",
"**/*.qc-coverage*.csv",
"**/*.ped",
"**/*.mapping_metrics.csv",
],
}

@@ -76,9 +78,8 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
help="UUID from Assay to check. Used to specify target while dealing with multi-assay projects.",
)

group_files = parser.add_argument_group(
"File Selection", mutually_exclusive=True, required=True
)
group_files = parser.add_mutually_exclusive_group(required=True)

group_files.add_argument(
"-p", "--preset", help="Preset to use for file selection.", choices=cls.presets.keys()
)
@@ -108,7 +109,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
)
group_samples.add_argument(
"--biomedsheet",
help="Biomedsheet file for filtering collections. Sets tes-column to 2 and "
help="Biomedsheet file for filtering collections. Sets tsv-column to 2 and "
"tsv-header to 13. Takes precedence over --tsv.",
)
group_samples.add_argument(
@@ -133,7 +134,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
)
parser.add_argument(
"--output-regex",
nargs="3",
nargs=3,
action="append",
metavar=("FILEPART", "MATCH", "REPL"),
default=[],
@@ -206,15 +207,15 @@ def execute(self) -> typing.Optional[int]:
samples = None

# Find all remote files (iRODS)
FileSearcher = RetrieveSodarCollection(
filesearcher = RetrieveSodarCollection(
self.args.sodar_url,
self.args.sodar_api_token,
self.args.assay_uuid,
self.args.project_uuid,
)

remote_files_dict = FileSearcher.perform()
assay_path = FileSearcher.get_assay_irods_path(self.args.assay_uuid)
remote_files_dict = filesearcher.perform()
assay_path = filesearcher.get_assay_irods_path(self.args.assay_uuid)

if self.args.all_files:
file_patterns = []
@@ -223,8 +224,8 @@ def execute(self) -> typing.Optional[int]:
else: # self.args.file_pattern
file_patterns = self.args.file_pattern

filtered_remote_files_dict = self.filter_irods_collection(
remote_files_dict, file_patterns, samples, self.args.substring_match, assay_path
filtered_remote_files_dict = self.filter_irods_file_list(
file_patterns, samples, self.args.substring_match, assay_path
)

if len(filtered_remote_files_dict) == 0:
@@ -265,8 +266,8 @@ def parse_sample_tsv(tsv_path, sample_col=1, n_header_cols=1, skip_comments=True

return samples

def filter_irods_collection(
self,
@staticmethod
def filter_irods_file_list(
remote_files_dict: Dict[str, List[iRODSDataObject]],
common_assay_path: str,
file_patterns: List[str],
@@ -353,6 +354,8 @@ def build_download_jobs(
self.args.output_dir, self.args.output_pattern.format(**out_parts)
),
irods_obj.path,
# # Unclear if this is available or not
# irods_obj.size,
)
output_list.append(job)

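As a self-contained illustration of the two parser fixes in this file (a required mutually exclusive group instead of the invalid add_argument_group(..., mutually_exclusive=True, required=True) call, and nargs given as the integer 3 rather than the string "3"), the pattern looks roughly like this; option names other than those visible in the diff are illustrative:

import argparse

parser = argparse.ArgumentParser()

# Exclusivity needs its own group; add_argument_group() is purely cosmetic and
# does not accept mutually_exclusive/required keyword arguments.
group_files = parser.add_mutually_exclusive_group(required=True)
group_files.add_argument("-p", "--preset", choices=["dragen"])
group_files.add_argument("-a", "--all-files", action="store_true")

# nargs must be the integer 3; action="append" collects one
# [FILEPART, MATCH, REPL] triple per use of the option.
parser.add_argument(
    "--output-regex",
    nargs=3,
    action="append",
    metavar=("FILEPART", "MATCH", "REPL"),
    default=[],
)

args = parser.parse_args(
    ["-p", "dragen", "--output-regex", "subcollections", "subcol", "subcollection"]
)
print(args.output_regex)  # [['subcollections', 'subcol', 'subcollection']]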
43 changes: 42 additions & 1 deletion tests/test_irods_common.py
@@ -4,7 +4,7 @@
import irods.exception
import pytest

from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer
from cubi_tk.irods_common import TransferJob, iRODSCommon, iRODSTransfer, iRODSRetrieveCollection


def test_transfer_job_bytes(fs):
@@ -167,3 +167,44 @@ def test_irods_transfer_get(mocksession, jobs):
# download
mockget.assert_any_call(job.path_remote, job.path_local)
assert itransfer.size == 222


# Test iRODSRetrieveCollection #########


# This tests `retrieve_irods_data_objects` and by extension `parse_irods_collection`
# A test for _irods_query would require mocking `session.query` results in a
# way that allows creation of iRODSDataObject instances from those results
@patch("cubi_tk.irods_common.iRODSCommon._init_irods")
@patch("cubi_tk.irods_common.iRODSRetrieveCollection._irods_query")
def test_irods_retrieve_data_objects(mockquery, mocksession):
# Possible alternative to MagicMocks here:
# create a fake iRODSDataObject class with a path attribute
mockobj1 = MagicMock()
mockobj1.path = "/root/coll1/file1.vcf.gz"
mockobj1.name = "file1.vcf.gz"
mockobj2 = MagicMock()
mockobj2.path = "/root/coll2/file2.vcf.gz"
mockobj2.name = "file2.vcf.gz"
mockobj3 = MagicMock()
mockobj3.path = "/root/coll1/subcol/file1.vcf.gz"
mockobj3.name = "file1.vcf.gz"

mockcksum = MagicMock()

mockquery.return_value = {
"files": [mockobj1, mockobj2, mockobj3],
"checksums": {
"/root/coll1/file1.vcf.gz": mockcksum,
"/root/coll2/file2.vcf.gz": mockcksum,
"/root/coll1/subcol/file1.vcf.gz": mockcksum,
},
}

mocksession.collections.get.return_value = "path"

data_objs = iRODSRetrieveCollection().retrieve_irods_data_objects("/fake/path")

expected_data_objs = {"file1.vcf.gz": [mockobj1, mockobj3], "file2.vcf.gz": [mockobj2]}

assert data_objs == expected_data_objs
152 changes: 152 additions & 0 deletions tests/test_sodar_pull_data_collection.py
@@ -0,0 +1,152 @@
from copy import deepcopy
from unittest.mock import MagicMock

import pathlib
import pytest
import re

from cubi_tk.sodar.pull_data_collection import PullDataCollection
from cubi_tk.irods_common import TransferJob


class MockDataObject:
def __init__(self, path):
self.path = path

def __eq__(self, other):
return self.path == other.path

def __repr__(self):
return f"MockDataObject(path={self.path})"


@pytest.fixture
def filtered_data_objects():
return {
"coll1-N1-DNA1": [
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
],
"coll2-N1-DNA1": [
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
],
}


def test_filter_irods_collection(filtered_data_objects):
fake_irods_data_dict = {
"file1.vcf.gz": [
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/file1.vcf.gz"),
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol2/file1.vcf.gz"),
],
"file2.vcf.gz": [
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.vcf.gz"),
],
"file2.bam": [
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/file2.bam"),
],
"miscFile.txt": [
MockDataObject(path="/irods/project/coll1-N1-DNA1/subcol1/miscFile.txt"),
MockDataObject(path="/irods/project/coll2-N1-DNA1/subcol1/miscFile.txt"),
],
}

kwarg_list = [
# No filters at all -> all files
{"file_patterns": [], "samples": [], "substring_match": False},
# Test filepattern filter works
{"file_patterns": ["*.vcf.gz"], "samples": [], "substring_match": False},
# Test file pattern with mutiple patterns, also **/*.X & *.Y
{"file_patterns": ["*.vcf.gz", "**/*.txt"], "samples": [], "substring_match": False},
# Test Sample/Collection filter works
{"file_patterns": [], "samples": ["coll1-N1-DNA1"], "substring_match": False},
# Test substring matching works
{"file_patterns": [], "samples": ["coll1"], "substring_match": True},
]

expected_results = [
deepcopy(filtered_data_objects),
{
k: [v for v in l if v.path.endswith("vcf.gz")]
for k, l in deepcopy(filtered_data_objects).items()
},
{
k: [v for v in l if not v.path.endswith("bam")]
for k, l in deepcopy(filtered_data_objects).items()
},
{k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
{k: l for k, l in deepcopy(filtered_data_objects).items() if k == "coll1-N1-DNA1"},
]

for kwargs, expected in zip(kwarg_list, expected_results):
result = PullDataCollection.filter_irods_file_list(
fake_irods_data_dict, "/irods/project", **kwargs
)
assert result == expected


def test_build_download_jobs(filtered_data_objects):
mockargs = MagicMock()
mockargs.output_dir = "/path/to/output"
mockargs.output_regex = [] # ['', '', '']
mockargs.output_pattern = "{collection}/{subcollections}/{filename}"

testinstance = PullDataCollection(mockargs)

expected_out = [
TransferJob(
path_remote=obj.path, path_local=obj.path.replace("/irods/project", "/path/to/output")
)
for k, l in filtered_data_objects.items()
for obj in l
]
out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
assert out == expected_out

# Test with different output pattern
mockargs.output_pattern = "{collection}/{filename}"
expected_out = [
TransferJob(
path_remote=obj.path,
path_local=re.sub(
"/subcol[12]", "", obj.path.replace("/irods/project", "/path/to/output")
),
)
for k, l in filtered_data_objects.items()
for obj in l
]
out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
assert out == expected_out

# Test with regex
mockargs.output_regex = [
["subcollections", "subcol", "subcollection"],
["collection", "-N1-DNA1", ""],
]
mockargs.output_pattern = "{collection}/{subcollections}/{filename}"
expected_out = [
TransferJob(
path_remote=obj.path,
path_local=obj.path.replace("/irods/project", "/path/to/output")
.replace("subcol", "subcollection")
.replace("-N1-DNA1", ""),
)
for k, l in filtered_data_objects.items()
for obj in l
]
out = testinstance.build_download_jobs(filtered_data_objects, "/irods/project")
assert out == expected_out


def test_parse_samplesheet():
# Test on Biomedsheet
samples = PullDataCollection.parse_sample_tsv(
pathlib.Path(__file__).resolve().parent / "data" / "pull_sheets" / "sheet.tsv",
sample_col=2,
n_header_cols=13,
)

assert samples == ["index", "mother", "father"]
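As a closing illustration of what test_build_download_jobs above pins down: each --output-regex triple (FILEPART, MATCH, REPL) rewrites one placeholder of the output path before the --output-pattern template is formatted. A plain-Python sketch of that expected behaviour (not the code of build_download_jobs itself):

import re

out_parts = {
    "collection": "coll1-N1-DNA1",
    "subcollections": "subcol1",
    "filename": "file1.vcf.gz",
}
output_regex = [
    ["subcollections", "subcol", "subcollection"],
    ["collection", "-N1-DNA1", ""],
]

# Each triple rewrites one part of the output path via re.sub().
for filepart, match, repl in output_regex:
    out_parts[filepart] = re.sub(match, repl, out_parts[filepart])

print("{collection}/{subcollections}/{filename}".format(**out_parts))
# -> coll1/subcollection1/file1.vcf.gz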
