Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds FileDictMixin #347

Merged
merged 23 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions q2_types/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
# ----------------------------------------------------------------------------
import gzip
import itertools
import re
import warnings
from collections import defaultdict
from typing import List

import skbio
Expand Down Expand Up @@ -138,3 +140,105 @@ def _validate_mag_ids(
"correctly. Printing duplicate MAG IDs: "
f"{set(duplicates)}"
)


class FileDictMixin:
    """
    Mixin adding ``file_dict`` to a directory format.

    The host class is expected to provide ``path`` (the directory that is
    scanned with ``iterdir()``) and ``pathspec`` (a regular expression that
    file names are matched against).
    """

    def file_dict(self, relative=False, suffixes=None):
        """
        For per sample directories it returns a mapping of sample id to
        another dictionary where keys represent the file name and values
        correspond to the filepath for each file matching the pathspec.
        For files, it returns a mapping of file name to filepath for each
        file matching the pathspec. The specified suffixes are removed
        from filenames.

        Only the top level and its direct subdirectories are scanned.
        Subdirectories that contain no matching file are omitted.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.
        suffixes : List
            A list of suffixes that should be removed from the filenames to
            generate the ID.

        Returns
        -------
        dict
            Mapping of filename -> filepath as described above.
            Or mapping of sample id -> dict {filename: filepath} as
            described above.
            Both levels of the dictionary are sorted alphabetically by key.
        """
        # re.match anchors at the start of the file name only; the end is
        # anchored by the pathspec itself (if it ends in "$").
        file_pattern = re.compile(self.pathspec)
        ids = {}
        for entry in self.path.iterdir():
            if entry.is_dir():
                inner = {}
                for path in entry.iterdir():
                    if file_pattern.match(path.name):
                        file_path, inner_id = _process_path(
                            path=path,
                            relative=relative,
                            dir_format=self,
                            suffixes=suffixes,
                        )
                        inner[inner_id] = str(file_path)
                # Sort once per sample directory instead of after every
                # inserted file; skip directories without any match.
                if inner:
                    ids[entry.name] = dict(sorted(inner.items()))
            elif file_pattern.match(entry.name):
                file_path, inner_id = _process_path(
                    path=entry,
                    relative=relative,
                    dir_format=self,
                    suffixes=suffixes,
                )
                ids[inner_id] = str(file_path)

        return dict(sorted(ids.items()))


def _process_path(path, relative, dir_format, suffixes):
"""
This function processes the input file path to generate an absolute or
relative path string and the ID derived from the file name. The ID is
extracted by removing the one of the specified suffixes from the file
name. If no suffixes are specified the ID is defined to be the filename.

Parameters:
---------
path : Path
A Path object representing the file path to process.
relative : bool
A flag indicating whether the returned path should be relative
to the directory formats path or absolute.
dir_format : model.DirectoryFormat.
Any object of class model.DirectoryFormat.

Returns:
-------
processed_path : str
The full relative or absolut path to the file.
_id : str
The ID derived from the file name. ID will be "" if the filename
consists only of the suffix.
"""
file_name = path.stem

_id = file_name

if suffixes:
for suffix in suffixes:
if file_name.endswith(suffix):
_id = file_name[:-len(suffix)]
break

processed_path = (
path.absolute().relative_to(dir_format.path.absolute())
if relative
else path.absolute()
)
return str(processed_path), _id
3 changes: 1 addition & 2 deletions q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
GenomeDataDirectoryFormat,
)
from ._objects import IntervalMetadataIterator
from ._types import (
Expand All @@ -25,6 +24,6 @@
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
'OrthologAnnotationDirFmt', 'NOG',
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
]
70 changes: 13 additions & 57 deletions q2_types/genome_data/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
from collections import defaultdict

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

from q2_types._util import FileDictMixin
from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat


Expand All @@ -19,63 +19,18 @@ def _validate_(self, level):
pass


class GenomeDataDirectoryFormat(model.DirectoryFormat):
    """Directory format exposing its files as an ID -> filepath mapping."""

    def genome_dict(self, relative=False):
        """
        For per sample directories it returns a mapping of sample id to
        another dictionary where keys represent the file name and values
        correspond to the filepath for each file.
        For files, it returns a mapping of file name to filepath for each file.

        File names are used without their final extension. Sample
        directories that contain no entries are omitted.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of filename -> filepath as described above.
            Or mapping of sample id -> dict {filename: filepath} as
            described above.
            Both levels of the dictionary are sorted alphabetically by key.
        """
        root = self.path.absolute()

        def _as_str(path):
            # Single place that decides between absolute and relative form.
            absolute = path.absolute()
            return str(absolute.relative_to(root) if relative else absolute)

        ids = {}
        for entry in self.path.iterdir():
            if entry.is_dir():
                per_sample = {p.stem: _as_str(p) for p in entry.iterdir()}
                # Sort once per sample rather than after every insertion.
                if per_sample:
                    ids[entry.name] = dict(sorted(per_sample.items()))
            else:
                ids[entry.stem] = _as_str(entry)

        return dict(sorted(ids.items()))


class GenesDirectoryFormat(GenomeDataDirectoryFormat):
genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
format=DNAFASTAFormat)
class GenesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
    """Directory format holding a collection of DNAFASTAFormat files."""

    # Regex for accepted file names (.fa, .fna or .fasta). Kept as a class
    # attribute so the same pattern can be read back via ``self.pathspec``.
    pathspec = r'.+\.(fa|fna|fasta)$'
    genes = model.FileCollection(pathspec, format=DNAFASTAFormat)

    @genes.set_path_maker
    def genes_path_maker(self, genome_id):
        """Return the file name for ``genome_id``: ``<genome_id>.fasta``."""
        return f'{genome_id}.fasta'


class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
format=ProteinFASTAFormat)
class ProteinsDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.(fa|faa|fasta)$'
proteins = model.FileCollection(pathspec, format=ProteinFASTAFormat)

@proteins.set_path_maker
def proteins_path_maker(self, genome_id):
Expand Down Expand Up @@ -205,17 +160,18 @@ def _validate_(self, level):
f'{line_number}') from e


class LociDirectoryFormat(GenomeDataDirectoryFormat):
loci = model.FileCollection(r'.+\.gff$',
format=GFF3Format)
class LociDirectoryFormat(model.DirectoryFormat, FileDictMixin):
    """Directory format holding a collection of GFF3Format files."""

    # Regex for accepted file names (.gff). Kept as a class attribute so
    # the same pattern can be read back via ``self.pathspec``.
    pathspec = r'.+\.gff$'
    loci = model.FileCollection(pathspec, format=GFF3Format)

    @loci.set_path_maker
    def loci_path_maker(self, genome_id):
        """Return the file name for ``genome_id``: ``<genome_id>.gff``."""
        return f'{genome_id}.gff'


class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)
class GenomeSequencesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+\.(fasta|fa)$'
genomes = model.FileCollection(pathspec, format=DNAFASTAFormat)

@genomes.set_path_maker
def genomes_path_maker(self, genome_id):
Expand Down
47 changes: 0 additions & 47 deletions q2_types/genome_data/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import unittest
from pathlib import Path

from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase
Expand All @@ -15,7 +14,6 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
GenomeDataDirectoryFormat
)


Expand Down Expand Up @@ -180,51 +178,6 @@ def test_ortholog_annotations_annot_dict(self):
}
self.assertDictEqual(obs, exp)

def test_genome_data_dirfmt_samples_genome_dict(self):
    """genome_dict on a per-sample layout, absolute then relative."""
    fmt = GenomeDataDirectoryFormat(
        self.get_data_path('genes_samples'), mode='r')

    # Default call: absolute paths below the format's directory.
    expected_absolute = {
        'sample1': {
            'genes1': str(Path(fmt.path / 'sample1/genes1.fa')),
        },
        'sample2': {
            'genes2': str(Path(fmt.path / 'sample2/genes2.fa')),
        },
    }
    self.assertDictEqual(fmt.genome_dict(), expected_absolute)

    # relative=True: paths relative to the format's directory.
    expected_relative = {
        'sample1': {
            'genes1': 'sample1/genes1.fa',
        },
        'sample2': {
            'genes2': 'sample2/genes2.fa',
        },
    }
    self.assertDictEqual(
        fmt.genome_dict(relative=True), expected_relative)

def test_genes_dirfmt_genome_dict(self):
    """genome_dict on a flat layout, absolute then relative."""
    fmt = GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')

    # Default call: absolute paths below the format's directory.
    expected_absolute = {
        'genes1': str(Path(fmt.path / 'genes1.fa')),
        'genes2': str(Path(fmt.path / 'genes2.fa'))
    }
    self.assertDictEqual(fmt.genome_dict(), expected_absolute)

    # relative=True: paths relative to the format's directory.
    expected_relative = {
        'genes1': 'genes1.fa',
        'genes2': 'genes2.fa'
    }
    self.assertDictEqual(
        fmt.genome_dict(relative=True), expected_relative)


if __name__ == '__main__':
unittest.main()
17 changes: 8 additions & 9 deletions q2_types/kraken2/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
from pandas.core.dtypes.common import is_string_dtype
from qiime2.plugin import model, ValidationError

from q2_types._util import FileDictMixin


class Kraken2ReportFormat(model.TextFileFormat):
MEASURE_COLUMNS = {
Expand Down Expand Up @@ -67,10 +68,9 @@ def _validate_(self, level):
)


class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
reports = model.FileCollection(
r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
)
class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+report\.(txt|tsv)$'
reports = model.FileCollection(pathspec, format=Kraken2ReportFormat)

@reports.set_path_maker
def reports_path_maker(self, sample_id, mag_id=None):
Expand Down Expand Up @@ -146,10 +146,9 @@ def _validate_(self, level):
)


class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
reports = model.FileCollection(
r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
)
class Kraken2OutputDirectoryFormat(model.DirectoryFormat, FileDictMixin):
pathspec = r'.+output\.(txt|tsv)$'
reports = model.FileCollection(pathspec, format=Kraken2OutputFormat)

@reports.set_path_maker
def reports_path_maker(self, sample_id, mag_id=None):
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Loading
Loading