Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds FileDictMixin #347

Merged
merged 23 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions q2_types/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
# ----------------------------------------------------------------------------
import gzip
import itertools
import re
import warnings
from collections import defaultdict
from typing import List

import skbio
Expand Down Expand Up @@ -138,3 +140,106 @@ def _validate_mag_ids(
"correctly. Printing duplicate MAG IDs: "
f"{set(duplicates)}"
)


class FileDictMixin:
    """
    Mixin that exposes the files of a directory format as a dictionary.

    The host class must provide ``path`` (the directory to scan) and
    ``pathspec`` (a regular expression selecting file names). An optional
    ``suffixes`` attribute lists trailing parts to strip from the IDs.
    """

    def file_dict(self, relative=False):
        """
        Map IDs to the filepaths of all files matching the pathspec.

        For per sample directories it returns a mapping of sample id to
        another dictionary where keys represent the file name and values
        correspond to the filepath for each file matching the pathspec.
        For files, it returns a mapping of file name to filepath for each
        file matching the pathspec. If the dir format has the attribute
        'suffixes', then these are removed from filenames.

        Sub-directories that contain no matching file are left out of the
        result.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of filename -> filepath as described above.
            Or mapping of sample id -> dict {filename: filepath} as
            described above.
            Both levels of the dictionary are sorted alphabetically by key.
        """
        suffixes = getattr(self, "suffixes", [])
        file_pattern = re.compile(self.pathspec)

        def _matching_files(candidates):
            # Map ID -> filepath for every candidate whose name matches
            # the pathspec; non-matching candidates are ignored.
            found = {}
            for candidate in candidates:
                if file_pattern.match(candidate.name):
                    file_path, inner_id = _process_path(
                        path=candidate,
                        relative=relative,
                        dir_format=self,
                        suffixes=suffixes,
                    )
                    found[inner_id] = file_path
            return found

        ids = {}
        for entry in self.path.iterdir():
            if entry.is_dir():
                per_sample = _matching_files(entry.iterdir())
                if per_sample:
                    # Sort once per sub-directory rather than after every
                    # single insertion.
                    ids[entry.name] = dict(sorted(per_sample.items()))
            else:
                ids.update(_matching_files([entry]))

        return dict(sorted(ids.items()))


def _process_path(path, relative, dir_format, suffixes):
    """
    Build the path string and the ID for one file of a directory format.

    The ID starts out as the file name without its last extension
    (``path.stem``). The first entry of ``suffixes`` that the stem ends
    with is then removed from its end. If no suffixes are specified, or
    none of them matches, the ID is the stem itself.

    Parameters
    ----------
    path : Path
        A Path object representing the file path to process.
    relative : bool
        A flag indicating whether the returned path should be relative
        to the directory format's path or absolute.
    dir_format : model.DirectoryFormat
        The directory format the file belongs to; only its ``path``
        attribute is read here.
    suffixes : List
        A list of suffixes that should be removed from the filenames to
        generate the ID. May be empty or None.

    Returns
    -------
    processed_path : str
        The full relative or absolute path to the file.
    _id : str
        The ID derived from the file name. ID will be "" if the filename
        consists only of the suffix.
    """
    file_name = path.stem
    _id = file_name

    for suffix in suffixes or ():
        # Skip empty suffixes: every string ends with "" and slicing with
        # [:-0] would wipe the whole ID instead of leaving it untouched.
        if suffix and file_name.endswith(suffix):
            _id = file_name[:-len(suffix)]
            break

    absolute_path = path.absolute()
    processed_path = (
        absolute_path.relative_to(dir_format.path.absolute())
        if relative
        else absolute_path
    )
    return str(processed_path), _id
3 changes: 1 addition & 2 deletions q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
GenomeDataDirectoryFormat,
)
from ._objects import IntervalMetadataIterator
from ._types import (
Expand All @@ -25,6 +24,6 @@
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
'OrthologAnnotationDirFmt', 'NOG',
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
]
70 changes: 13 additions & 57 deletions q2_types/genome_data/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
from collections import defaultdict

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

from q2_types._util import FileDictMixin
from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat


Expand All @@ -19,63 +19,18 @@ def _validate_(self, level):
pass


class GenomeDataDirectoryFormat(model.DirectoryFormat):
def genome_dict(self, relative=False):
"""
For per sample directories it returns a mapping of sample id to
another dictionary where keys represent the file name and values
correspond to the filepath for each file.
For files, it returns a mapping of file name to filepath for each file.

Parameters
---------
relative : bool
Whether to return filepaths relative to the directory's location.
Returns absolute filepaths by default.

Returns
-------
dict
Mapping of filename -> filepath as described above.
Or mapping of sample id -> dict {filename: filepath} as
described above.
Both levels of the dictionary are sorted alphabetically by key.
"""
ids = defaultdict(dict)
for entry in self.path.iterdir():
if entry.is_dir():
sample_id = entry.name
for path in entry.iterdir():
file_name = path.stem
file_path = (
path.absolute().relative_to(self.path.absolute())
if relative else path.absolute()
)
ids[sample_id][file_name] = str(file_path)
ids[sample_id] = dict(sorted(ids[sample_id].items()))
else:
file_name = entry.stem
file_path = (
entry.absolute().relative_to(self.path.absolute())
if relative else entry.absolute()
)
ids[file_name] = str(file_path)

return dict(sorted(ids.items()))


class GenesDirectoryFormat(GenomeDataDirectoryFormat):
genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
format=DNAFASTAFormat)
class GenesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.(fa|fna|fasta)$'
genes = model.FileCollection(pathspec, format=DNAFASTAFormat)

@genes.set_path_maker
def genes_path_maker(self, genome_id):
return '%s.fasta' % genome_id


class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
format=ProteinFASTAFormat)
class ProteinsDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.(fa|faa|fasta)$'
proteins = model.FileCollection(pathspec, format=ProteinFASTAFormat)

@proteins.set_path_maker
def proteins_path_maker(self, genome_id):
Expand Down Expand Up @@ -205,17 +160,18 @@ def _validate_(self, level):
f'{line_number}') from e


class LociDirectoryFormat(GenomeDataDirectoryFormat):
loci = model.FileCollection(r'.+\.gff$',
format=GFF3Format)
class LociDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.gff$'
loci = model.FileCollection(pathspec, format=GFF3Format)

@loci.set_path_maker
def loci_path_maker(self, genome_id):
return '%s.gff' % genome_id


class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)
class GenomeSequencesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.(fasta|fa)$'
genomes = model.FileCollection(pathspec, format=DNAFASTAFormat)

@genomes.set_path_maker
def genomes_path_maker(self, genome_id):
Expand Down
47 changes: 0 additions & 47 deletions q2_types/genome_data/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import unittest
from pathlib import Path

from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase
Expand All @@ -15,7 +14,6 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
GenomeDataDirectoryFormat
)


Expand Down Expand Up @@ -180,51 +178,6 @@ def test_ortholog_annotations_annot_dict(self):
}
self.assertDictEqual(obs, exp)

def test_genome_data_dirfmt_samples_genome_dict(self):
genes = GenomeDataDirectoryFormat(
self.get_data_path('genes_samples'), mode='r')

obs = genes.genome_dict()
exp = {
'sample1': {
'genes1': str(Path(genes.path / 'sample1/genes1.fa')),
},
'sample2': {
'genes2': str(Path(genes.path / 'sample2/genes2.fa')),
},
}
self.assertDictEqual(obs, exp)

obs = genes.genome_dict(relative=True)
exp = {
'sample1': {
'genes1': 'sample1/genes1.fa',
},
'sample2': {
'genes2': 'sample2/genes2.fa',
},
}
self.assertDictEqual(obs, exp)

def test_genes_dirfmt_genome_dict(self):
genes = (
GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')
)

obs = genes.genome_dict()
exp = {
'genes1': str(Path(genes.path / 'genes1.fa')),
'genes2': str(Path(genes.path / 'genes2.fa'))
}
self.assertDictEqual(obs, exp)

obs = genes.genome_dict(relative=True)
exp = {
'genes1': 'genes1.fa',
'genes2': 'genes2.fa'
}
self.assertDictEqual(obs, exp)


if __name__ == '__main__':
unittest.main()
19 changes: 10 additions & 9 deletions q2_types/kraken2/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
from pandas.core.dtypes.common import is_string_dtype
from qiime2.plugin import model, ValidationError

from q2_types._util import FileDictMixin


class Kraken2ReportFormat(model.TextFileFormat):
MEASURE_COLUMNS = {
Expand Down Expand Up @@ -67,10 +68,10 @@ def _validate_(self, level):
)


class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
reports = model.FileCollection(
r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
)
class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+report\.(txt|tsv)$'
suffixes = ['.report']
reports = model.FileCollection(pathspec, format=Kraken2ReportFormat)

@reports.set_path_maker
def reports_path_maker(self, sample_id, mag_id=None):
Expand Down Expand Up @@ -146,10 +147,10 @@ def _validate_(self, level):
)


class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
reports = model.FileCollection(
r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
)
class Kraken2OutputDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+output\.(txt|tsv)$'
suffixes = ['.output']
reports = model.FileCollection(pathspec, format=Kraken2OutputFormat)

@reports.set_path_maker
def reports_path_maker(self, sample_id, mag_id=None):
Expand Down
32 changes: 32 additions & 0 deletions q2_types/tests/data/kraken-outputs-mags/sample1/bin1.output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
C k119_33069 1912795 10855 1912795:Q
C k119_55515 1583098 5698 1583098:Q
C k119_66468 1323375 5173 1323375:Q
C k119_33506 182217 17101 182217:Q
C k119_22814 1472 19997 1472:Q
C k119_23274 29388 23523 29388:Q
C k119_45180 545501 25821 545501:Q
C k119_34380 1218 4423 1218:Q
C k119_1654 2518177 31450 2518177:Q
C k119_45407 221027 2908 221027:Q
C k119_12788 59919 2856 59919:Q
U k119_34900 0 3045 0:Q
C k119_45855 851 19053 851:Q
C k119_90411 2647897 2589 2647897:Q
C k119_57806 2653681 4515 2653681:Q
C k119_58481 131567 19174 131567:Q
C k119_47669 2682541 11848 2682541:Q
C k119_59208 1977865 3665 1977865:Q
C k119_16398 2770780 5030 2770780:Q
C k119_60835 400634 2807 400634:Q
C k119_49584 2490633 6493 2490633:Q
C k119_28869 111780 8356 111780:Q
C k119_94747 2305987 3774 2305987:Q
C k119_40414 983544 27806 983544:Q
C k119_73618 2563896 3473 2563896:Q
C k119_84540 332101 3409 332101:Q
C k119_73768 2593542 29942 2593542:Q
C k119_41848 34105 8793 34105:Q
C k119_43035 1301 4680 1301:Q
C k119_65066 1547445 10430 1547445:Q
C k119_10361 491950 68731 491950:Q
C k119_10711 52959 8685 52959:Q
Loading
Loading