qiime2 · colinvwood · Feb 11, 2025 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/q2_types/_util.py b/q2_types/_util.py
@@ -7,7 +7,9 @@
 # ----------------------------------------------------------------------------
 import gzip
 import itertools
+import re
 import warnings
+from collections import defaultdict
 from typing import List
 
 import skbio
@@ -138,3 +140,105 @@ def _validate_mag_ids(
                 "correctly. Printing duplicate MAG IDs: "
                 f"{set(duplicates)}"
             )
+
+
+class FileDictMixin:
+    def file_dict(self, relative=False, suffixes=None):
+        """
+        Map IDs to the paths of the files that match ``self.pathspec``.
+
+        Files directly inside the directory are returned as a mapping of
+        ID -> filepath. Files inside a subdirectory (per sample layout) are
+        returned as a mapping of subdirectory name -> {ID: filepath}. A
+        file's ID is its file name with the extension and, optionally, one
+        of the given suffixes removed.
+
+        Parameters
+        ----------
+        relative : bool
+            Whether to return filepaths relative to the directory's location.
+            Returns absolute filepaths by default.
+        suffixes : List
+            A list of suffixes that should be removed from the filenames to
+            generate the ID.
+
+        Returns
+        -------
+        dict
+            Mapping of ID -> filepath as described above.
+            Or mapping of sample id -> dict {ID: filepath} as
+            described above.
+            Both levels of the dictionary are sorted alphabetically by key.
+        """
+        matches = re.compile(self.pathspec).match
+        # Keyword arguments shared by every _process_path call below.
+        options = dict(
+            relative=relative,
+            dir_format=self,
+            suffixes=suffixes,
+        )
+        mapping = defaultdict(dict)
+
+        for entry in self.path.iterdir():
+            if not entry.is_dir():
+                # Top-level file: record it directly under its own ID.
+                if matches(entry.name):
+                    location, file_id = _process_path(path=entry, **options)
+                    mapping[file_id] = str(location)
+                continue
+
+            # Subdirectory: group its matching files under the directory name.
+            sample_id = entry.name
+            for child in entry.iterdir():
+                if not matches(child.name):
+                    continue
+                location, file_id = _process_path(path=child, **options)
+                mapping[sample_id][file_id] = str(location)
+
+            # Also creates an empty entry for a subdirectory without matches.
+            mapping[sample_id] = dict(sorted(mapping[sample_id].items()))
+
+        return dict(sorted(mapping.items()))
+
+
+def _process_path(path, relative, dir_format, suffixes):
+    """
+    Build the path string and the ID for a single file of a directory format.
+
+    The ID is the file name without its final extension (``path.stem``),
+    with the first suffix from ``suffixes`` that the name ends with removed.
+    If no suffixes are specified the ID is defined to be that file name.
+
+    Parameters
+    ----------
+    path : Path
+        A Path object representing the file path to process.
+    relative : bool
+        A flag indicating whether the returned path should be relative
+        to the directory format's path or absolute.
+    dir_format : model.DirectoryFormat
+        Object whose ``path`` attribute is the base for relative paths.
+    suffixes : List[str] or None
+        Suffixes to strip from the file name; they are tried in order and
+        only the first match is removed. Empty strings are ignored.
+
+    Returns
+    -------
+    processed_path : str
+        The full relative or absolute path to the file.
+    _id : str
+        The ID derived from the file name. ID will be "" if the filename
+        consists only of the suffix.
+    """
+    file_name = path.stem
+    _id = file_name
+    for suffix in suffixes or ():
+        # Skip "": every name ends with it and [:-0] would empty the ID.
+        if suffix and file_name.endswith(suffix):
+            _id = file_name[:-len(suffix)]
+            break
+
+    processed_path = path.absolute()
+    if relative:
+        processed_path = processed_path.relative_to(dir_format.path.absolute())
+    return str(processed_path), _id
diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py
@@ -11,7 +11,6 @@
     GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
     GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
     GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
-    GenomeDataDirectoryFormat,
 )
 from ._objects import IntervalMetadataIterator
 from ._types import (
@@ -25,6 +24,6 @@
     'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
     'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
     'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
-    'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
+    'OrthologAnnotationDirFmt', 'NOG',
     'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
     ]
diff --git a/q2_types/genome_data/_formats.py b/q2_types/genome_data/_formats.py
@@ -6,11 +6,11 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import re
-from collections import defaultdict
 
 import qiime2.plugin.model as model
 from qiime2.plugin import ValidationError
 
+from q2_types._util import FileDictMixin
 from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat
 
 
@@ -19,63 +19,18 @@ def _validate_(self, level):
         pass
 
 
-class GenomeDataDirectoryFormat(model.DirectoryFormat):
-    def genome_dict(self, relative=False):
-        """
-        For per sample directories it returns a mapping of sample id to
-        another dictionary where keys represent the file name and values
-        correspond to the filepath for each file.
-        For files, it returns a mapping of file name to filepath for each file.
-
-        Parameters
-        ---------
-        relative : bool
-            Whether to return filepaths relative to the directory's location.
-            Returns absolute filepaths by default.
-
-        Returns
-        -------
-        dict
-            Mapping of filename -> filepath as described above.
-            Or mapping of sample id -> dict {filename: filepath} as
-            described above.
-            Both levels of the dictionary are sorted alphabetically by key.
-        """
-        ids = defaultdict(dict)
-        for entry in self.path.iterdir():
-            if entry.is_dir():
-                sample_id = entry.name
-                for path in entry.iterdir():
-                    file_name = path.stem
-                    file_path = (
-                        path.absolute().relative_to(self.path.absolute())
-                        if relative else path.absolute()
-                    )
-                    ids[sample_id][file_name] = str(file_path)
-                ids[sample_id] = dict(sorted(ids[sample_id].items()))
-            else:
-                file_name = entry.stem
-                file_path = (
-                    entry.absolute().relative_to(self.path.absolute())
-                    if relative else entry.absolute()
-                )
-                ids[file_name] = str(file_path)
-
-        return dict(sorted(ids.items()))
-
-
-class GenesDirectoryFormat(GenomeDataDirectoryFormat):
-    genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
-                                 format=DNAFASTAFormat)
+class GenesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fa|fna|fasta)$'
+    genes = model.FileCollection(pathspec, format=DNAFASTAFormat)
 
     @genes.set_path_maker
     def genes_path_maker(self, genome_id):
         return '%s.fasta' % genome_id
 
 
-class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
-    proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
-                                    format=ProteinFASTAFormat)
+class ProteinsDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fa|faa|fasta)$'
+    proteins = model.FileCollection(pathspec, format=ProteinFASTAFormat)
 
     @proteins.set_path_maker
     def proteins_path_maker(self, genome_id):
@@ -205,17 +160,18 @@ def _validate_(self, level):
                                       f'{line_number}') from e
 
 
-class LociDirectoryFormat(GenomeDataDirectoryFormat):
-    loci = model.FileCollection(r'.+\.gff$',
-                                format=GFF3Format)
+class LociDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.gff$'
+    loci = model.FileCollection(pathspec, format=GFF3Format)
 
     @loci.set_path_maker
     def loci_path_maker(self, genome_id):
         return '%s.gff' % genome_id
 
 
-class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
-    genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)
+class GenomeSequencesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+\.(fasta|fa)$'
+    genomes = model.FileCollection(pathspec, format=DNAFASTAFormat)
 
     @genomes.set_path_maker
     def genomes_path_maker(self, genome_id):

diff --git a/q2_types/genome_data/tests/test_formats.py b/q2_types/genome_data/tests/test_formats.py
@@ -6,7 +6,6 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import unittest
-from pathlib import Path
 
 from qiime2.core.exceptions import ValidationError
 from qiime2.plugin.testing import TestPluginBase
@@ -15,7 +14,6 @@
     GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
     LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
     OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
-    GenomeDataDirectoryFormat
 )
 
 
@@ -180,51 +178,6 @@ def test_ortholog_annotations_annot_dict(self):
         }
         self.assertDictEqual(obs, exp)
 
-    def test_genome_data_dirfmt_samples_genome_dict(self):
-        genes = GenomeDataDirectoryFormat(
-            self.get_data_path('genes_samples'), mode='r')
-
-        obs = genes.genome_dict()
-        exp = {
-            'sample1': {
-                'genes1': str(Path(genes.path / 'sample1/genes1.fa')),
-            },
-            'sample2': {
-                'genes2': str(Path(genes.path / 'sample2/genes2.fa')),
-            },
-        }
-        self.assertDictEqual(obs, exp)
-
-        obs = genes.genome_dict(relative=True)
-        exp = {
-            'sample1': {
-                'genes1': 'sample1/genes1.fa',
-            },
-            'sample2': {
-                'genes2': 'sample2/genes2.fa',
-            },
-        }
-        self.assertDictEqual(obs, exp)
-
-    def test_genes_dirfmt_genome_dict(self):
-        genes = (
-            GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')
-        )
-
-        obs = genes.genome_dict()
-        exp = {
-            'genes1': str(Path(genes.path / 'genes1.fa')),
-            'genes2': str(Path(genes.path / 'genes2.fa'))
-        }
-        self.assertDictEqual(obs, exp)
-
-        obs = genes.genome_dict(relative=True)
-        exp = {
-            'genes1': 'genes1.fa',
-            'genes2': 'genes2.fa'
-        }
-        self.assertDictEqual(obs, exp)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/q2_types/kraken2/_formats.py b/q2_types/kraken2/_formats.py
@@ -5,11 +5,12 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
 import pandas as pd
 from pandas.core.dtypes.common import is_string_dtype
 from qiime2.plugin import model, ValidationError
 
+from q2_types._util import FileDictMixin
+
 
 class Kraken2ReportFormat(model.TextFileFormat):
     MEASURE_COLUMNS = {
@@ -67,10 +68,9 @@ def _validate_(self, level):
             )
 
 
-class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
-    reports = model.FileCollection(
-        r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
-    )
+class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+report\.(txt|tsv)$'
+    reports = model.FileCollection(pathspec, format=Kraken2ReportFormat)
 
     @reports.set_path_maker
     def reports_path_maker(self, sample_id, mag_id=None):
@@ -146,10 +146,9 @@ def _validate_(self, level):
             )
 
 
-class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
-    reports = model.FileCollection(
-        r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
-    )
+class Kraken2OutputDirectoryFormat(model.DirectoryFormat, FileDictMixin):
+    pathspec = r'.+output\.(txt|tsv)$'
+    reports = model.FileCollection(pathspec, format=Kraken2OutputFormat)
 
     @reports.set_path_maker
     def reports_path_maker(self, sample_id, mag_id=None):

diff --git a/q2_types/tests/data/not_per_sample/id1_suffix1.txt b/q2_types/tests/data/not_per_sample/id1_suffix1.txt
diff --git a/q2_types/tests/data/not_per_sample/id2_suffix2.txt b/q2_types/tests/data/not_per_sample/id2_suffix2.txt
diff --git a/q2_types/tests/data/not_per_sample/some_file b/q2_types/tests/data/not_per_sample/some_file
diff --git a/q2_types/tests/data/per_sample/sample1/id1_suffix.txt b/q2_types/tests/data/per_sample/sample1/id1_suffix.txt
diff --git a/q2_types/tests/data/per_sample/sample2/id2_suffix.txt b/q2_types/tests/data/per_sample/sample2/id2_suffix.txt
diff --git a/q2_types/tests/data/per_sample/some_file b/q2_types/tests/data/per_sample/some_file