qiime2 · colinvwood · Feb 11, 2025 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/q2_types/_util.py b/q2_types/_util.py
@@ -7,7 +7,9 @@
 # ----------------------------------------------------------------------------
 import gzip
 import itertools
+import re
 import warnings
+from collections import defaultdict
 from typing import List
 
 import skbio
@@ -138,3 +140,106 @@ def _validate_mag_ids(
                 "correctly. Printing duplicate MAG IDs: "
                 f"{set(duplicates)}"
             )
+
+
+class FileDictMixin:
+    def file_dict(self, relative=False):
+        """
+        Map the files matching the format's pathspec to their filepaths.
+
+        Top-level files are mapped as ID -> filepath. Files that sit in
+        a per-sample subdirectory are mapped as sample id -> {ID: filepath},
+        where the sample id is the name of the subdirectory. An ID is the
+        file name without its last extension; if the dir format has the
+        attribute 'suffixes', a matching suffix is removed from the ID too.
+
+        Parameters
+        ----------
+        relative : bool
+            Whether to return filepaths relative to the directory's location.
+            Returns absolute filepaths by default.
+
+        Returns
+        -------
+        dict
+            Mapping of ID -> filepath as described above.
+            Or mapping of sample id -> dict {ID: filepath} as
+            described above.
+            Both levels of the dictionary are sorted alphabetically by key.
+        """
+        strip_suffixes = getattr(self, "suffixes", [])
+        matcher = re.compile(self.pathspec)
+        # defaultdict creates the per-sample dict on first access.
+        collected = defaultdict(dict)
+
+        def _register(target, candidate):
+            # Only files whose name matches the pathspec are recorded.
+            if matcher.match(candidate.name) is None:
+                return
+            file_path, inner_id = _process_path(
+                path=candidate,
+                relative=relative,
+                dir_format=self,
+                suffixes=strip_suffixes,
+            )
+            target[inner_id] = file_path
+
+        for entry in self.path.iterdir():
+            if not entry.is_dir():
+                # Top-level file: stored directly under its ID.
+                _register(collected, entry)
+                continue
+
+            # Subdirectory: its name is the outer key; an entry is created
+            # even when none of its files match the pathspec.
+            sample_files = collected[entry.name]
+            for child in entry.iterdir():
+                _register(sample_files, child)
+            collected[entry.name] = dict(sorted(sample_files.items()))
+
+        return dict(sorted(collected.items()))
+
+
+def _process_path(path, relative, dir_format, suffixes):
+    """
+    Process a file path into an absolute or relative path string and the
+    ID derived from the file name. The ID is the file name without its
+    last extension, with the first matching entry of `suffixes` removed
+    from its end. If no suffix matches, the ID is that stem itself.
+
+    Parameters
+    ----------
+    path : Path
+        A Path object representing the file path to process.
+    relative : bool
+        A flag indicating whether the returned path should be relative
+        to the directory format's path or absolute.
+    dir_format : model.DirectoryFormat
+        The directory format whose `path` anchors relative paths.
+    suffixes : List
+        Suffixes to remove from the file names to generate the ID. May be
+        empty or None. Empty strings are ignored.
+
+    Returns
+    -------
+    processed_path : str
+        The full relative or absolute path to the file.
+    _id : str
+        The ID derived from the file name. ID will be "" if the file name
+        consists only of the suffix.
+    """
+    _id = path.stem
+
+    for suffix in suffixes or ():
+        # Skip "": every name ends with it and name[:-0] would be "".
+        if suffix and _id.endswith(suffix):
+            _id = _id[:-len(suffix)]
+            break
+
+    processed_path = path.absolute()
+    if relative:
+        processed_path = processed_path.relative_to(
+            dir_format.path.absolute()
+        )
+
+    return str(processed_path), _id
diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py
@@ -11,7 +11,6 @@
     GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
     GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
     GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
-    GenomeDataDirectoryFormat,
 )
 from ._objects import IntervalMetadataIterator
 from ._types import (
@@ -25,6 +24,6 @@
     'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
     'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
     'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
-    'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
+    'OrthologAnnotationDirFmt', 'NOG',
     'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
     ]
diff --git a/q2_types/genome_data/_formats.py b/q2_types/genome_data/_formats.py
@@ -6,11 +6,11 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import re
-from collections import defaultdict
 
 import qiime2.plugin.model as model
 from qiime2.plugin import ValidationError
 
+from q2_types._util import FileDictMixin
 from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat
 
 
@@ -19,63 +19,18 @@ def _validate_(self, level):
         pass
 
 
-class GenomeDataDirectoryFormat(model.DirectoryFormat):
-    def genome_dict(self, relative=False):
-        """
-        For per sample directories it returns a mapping of sample id to
-        another dictionary where keys represent the file name and values
-        correspond to the filepath for each file.
-        For files, it returns a mapping of file name to filepath for each file.
-
-        Parameters
-        ---------
-        relative : bool
-            Whether to return filepaths relative to the directory's location.
-            Returns absolute filepaths by default.
-
-        Returns
-        -------
-        dict
-            Mapping of filename -> filepath as described above.
-            Or mapping of sample id -> dict {filename: filepath} as
-            described above.
-            Both levels of the dictionary are sorted alphabetically by key.
-        """
-        ids = defaultdict(dict)
-        for entry in self.path.iterdir():
-            if entry.is_dir():
-                sample_id = entry.name
-                for path in entry.iterdir():
-                    file_name = path.stem
-                    file_path = (
-                        path.absolute().relative_to(self.path.absolute())
-                        if relative else path.absolute()
-                    )
-                    ids[sample_id][file_name] = str(file_path)
-                ids[sample_id] = dict(sorted(ids[sample_id].items()))
-            else:
-                file_name = entry.stem
-                file_path = (
-                    entry.absolute().relative_to(self.path.absolute())
-                    if relative else entry.absolute()
-                )
-                ids[file_name] = str(file_path)
-
-        return dict(sorted(ids.items()))
-
-
-class GenesDirectoryFormat(GenomeDataDirectoryFormat):
-    genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
-                                 format=DNAFASTAFormat)
+class GenesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fa|fna|fasta)$'
+    genes = model.FileCollection(pathspec, format=DNAFASTAFormat)
 
     @genes.set_path_maker
     def genes_path_maker(self, genome_id):
         return '%s.fasta' % genome_id
 
 
-class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
-    proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
-                                    format=ProteinFASTAFormat)
+class ProteinsDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fa|faa|fasta)$'
+    proteins = model.FileCollection(pathspec, format=ProteinFASTAFormat)
 
     @proteins.set_path_maker
     def proteins_path_maker(self, genome_id):
@@ -205,17 +160,18 @@ def _validate_(self, level):
                                       f'{line_number}') from e
 
 
-class LociDirectoryFormat(GenomeDataDirectoryFormat):
-    loci = model.FileCollection(r'.+\.gff$',
-                                format=GFF3Format)
+class LociDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.gff$'
+    loci = model.FileCollection(pathspec, format=GFF3Format)
 
     @loci.set_path_maker
     def loci_path_maker(self, genome_id):
         return '%s.gff' % genome_id
 
 
-class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
-    genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)
+class GenomeSequencesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fasta|fa)$'
+    genomes = model.FileCollection(pathspec, format=DNAFASTAFormat)
 
     @genomes.set_path_maker
     def genomes_path_maker(self, genome_id):

diff --git a/q2_types/genome_data/tests/test_formats.py b/q2_types/genome_data/tests/test_formats.py
@@ -6,7 +6,6 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import unittest
-from pathlib import Path
 
 from qiime2.core.exceptions import ValidationError
 from qiime2.plugin.testing import TestPluginBase
@@ -15,7 +14,6 @@
     GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
     LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
     OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
-    GenomeDataDirectoryFormat
 )
 
 
@@ -180,51 +178,6 @@ def test_ortholog_annotations_annot_dict(self):
         }
         self.assertDictEqual(obs, exp)
 
-    def test_genome_data_dirfmt_samples_genome_dict(self):
-        genes = GenomeDataDirectoryFormat(
-            self.get_data_path('genes_samples'), mode='r')
-
-        obs = genes.genome_dict()
-        exp = {
-            'sample1': {
-                'genes1': str(Path(genes.path / 'sample1/genes1.fa')),
-            },
-            'sample2': {
-                'genes2': str(Path(genes.path / 'sample2/genes2.fa')),
-            },
-        }
-        self.assertDictEqual(obs, exp)
-
-        obs = genes.genome_dict(relative=True)
-        exp = {
-            'sample1': {
-                'genes1': 'sample1/genes1.fa',
-            },
-            'sample2': {
-                'genes2': 'sample2/genes2.fa',
-            },
-        }
-        self.assertDictEqual(obs, exp)
-
-    def test_genes_dirfmt_genome_dict(self):
-        genes = (
-            GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')
-        )
-
-        obs = genes.genome_dict()
-        exp = {
-            'genes1': str(Path(genes.path / 'genes1.fa')),
-            'genes2': str(Path(genes.path / 'genes2.fa'))
-        }
-        self.assertDictEqual(obs, exp)
-
-        obs = genes.genome_dict(relative=True)
-        exp = {
-            'genes1': 'genes1.fa',
-            'genes2': 'genes2.fa'
-        }
-        self.assertDictEqual(obs, exp)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/q2_types/kraken2/_formats.py b/q2_types/kraken2/_formats.py
@@ -5,11 +5,12 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
 import pandas as pd
 from pandas.core.dtypes.common import is_string_dtype
 from qiime2.plugin import model, ValidationError
 
+from q2_types._util import FileDictMixin
+
 
 class Kraken2ReportFormat(model.TextFileFormat):
     MEASURE_COLUMNS = {
@@ -67,10 +68,10 @@ def _validate_(self, level):
             )
 
 
-class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
-    reports = model.FileCollection(
-        r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
-    )
+class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+report\.(txt|tsv)$'
+    suffixes = ['.report']
+    reports = model.FileCollection(pathspec, format=Kraken2ReportFormat)
 
     @reports.set_path_maker
     def reports_path_maker(self, sample_id, mag_id=None):
@@ -146,10 +147,10 @@ def _validate_(self, level):
             )
 
 
-class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
-    reports = model.FileCollection(
-        r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
-    )
+class Kraken2OutputDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+output\.(txt|tsv)$'
+    suffixes = ['.output']
+    reports = model.FileCollection(pathspec, format=Kraken2OutputFormat)
 
     @reports.set_path_maker
     def reports_path_maker(self, sample_id, mag_id=None):

diff --git a/q2_types/tests/data/kraken-outputs-mags/sample1/bin1.output.txt b/q2_types/tests/data/kraken-outputs-mags/sample1/bin1.output.txt
@@ -0,0 +1,32 @@
+C	k119_33069	1912795	10855	1912795:Q
+C	k119_55515	1583098	5698	1583098:Q
+C	k119_66468	1323375	5173	1323375:Q
+C	k119_33506	182217	17101	182217:Q
+C	k119_22814	1472	19997	1472:Q
+C	k119_23274	29388	23523	29388:Q
+C	k119_45180	545501	25821	545501:Q
+C	k119_34380	1218	4423	1218:Q
+C	k119_1654	2518177	31450	2518177:Q
+C	k119_45407	221027	2908	221027:Q
+C	k119_12788	59919	2856	59919:Q
+U	k119_34900	0	3045	0:Q
+C	k119_45855	851	19053	851:Q
+C	k119_90411	2647897	2589	2647897:Q
+C	k119_57806	2653681	4515	2653681:Q
+C	k119_58481	131567	19174	131567:Q
+C	k119_47669	2682541	11848	2682541:Q
+C	k119_59208	1977865	3665	1977865:Q
+C	k119_16398	2770780	5030	2770780:Q
+C	k119_60835	400634	2807	400634:Q
+C	k119_49584	2490633	6493	2490633:Q
+C	k119_28869	111780	8356	111780:Q
+C	k119_94747	2305987	3774	2305987:Q
+C	k119_40414	983544	27806	983544:Q
+C	k119_73618	2563896	3473	2563896:Q
+C	k119_84540	332101	3409	332101:Q
+C	k119_73768	2593542	29942	2593542:Q
+C	k119_41848	34105	8793	34105:Q
+C	k119_43035	1301	4680	1301:Q
+C	k119_65066	1547445	10430	1547445:Q
+C	k119_10361	491950	68731	491950:Q
+C	k119_10711	52959	8685	52959:Q