Merge pull request #146 from phac-nml/development

Patch release v2.6.1
phac-nml · Mar 5, 2021 · 1f4da70 · 1f4da70
2 parents e7f5b14 + 1834d8e
commit 1f4da70
Show file tree

Hide file tree

Showing 9 changed files with 93 additions and 67 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.6, 3.7, 3.8, 3.9]
 
     steps:
     - uses: actions/checkout@v2
@@ -23,14 +23,6 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
-        pip install .
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest
+        python -m pip install flake8 pytest tox tox-gh-actions
+    - name: Test with tox
+      run: tox
diff --git a/bio_hansel/__init__.py b/bio_hansel/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-__version__ = '2.6.0'
+__version__ = '2.6.1'
 program_name = 'bio_hansel'
 program_summary = f'BioHansel version {__version__}: Subtype microbial genomes using SNV targeting k-mer subtyping ' \
                   f'schemes. '

diff --git a/bio_hansel/main.py b/bio_hansel/main.py
@@ -53,12 +53,17 @@ def init_parser():
                         help='Input genome FASTA/FASTQ files (can be Gzipped)')
     parser.add_argument('-s', '--scheme',
                         default='heidelberg',
-                        help='Scheme to use for subtyping (built-in: "heidelberg", "enteritidis", "typhi", "typhimurium", "tb_lineage"; OR user-specified: '
-                             '/path/to/user/scheme)')
+                        help='Scheme to use for subtyping (built-in: '
+                             '"heidelberg", "enteritidis", "typhi", '
+                             '"typhimurium", "tb_lineage"; '
+                             'OR user-specified: /path/to/user/scheme)')
     parser.add_argument('--scheme-name',
                         help='Custom user-specified SNP substyping scheme name')
     parser.add_argument('-M', '--scheme-metadata',
-                        help='Scheme subtype metadata table (tab-delimited file with ".tsv" or ".tab" extension or CSV with ".csv" extension format accepted; MUST contain column called "subtype")')
+                        help='Scheme subtype metadata table (tab-delimited '
+                             'file with ".tsv" or ".tab" extension or CSV '
+                             'with ".csv" extension format accepted; MUST '
+                             'contain column called "subtype")')
     parser.add_argument('-p', '--paired-reads',
                         nargs=2,
                         metavar=('forward_reads', 'reverse_reads'),
@@ -99,7 +104,9 @@ def init_parser():
                         help='Frequencies below this coverage are considered low coverage')
     parser.add_argument('--max-missing-kmers',
                         type=float,
-                        help='Decimal proportion of maximum allowable missing kmers before being considered an error. (0.0 - 1.0)')
+                        help='Decimal proportion of maximum allowable missing'
+                             ' kmers before being considered an error. '
+                             '(0.0 - 1.0)')
     parser.add_argument('--min-ambiguous-kmers',
                         type=int,
                         help='Minimum number of missing kmers to be considered an ambiguous result')
@@ -108,10 +115,13 @@ def init_parser():
                         help='Overall kmer coverage below this value will trigger a low coverage warning')
     parser.add_argument('--max-intermediate-kmers',
                         type=float,
-                        help='Decimal proportion of maximum allowable missing kmers to be considered an intermediate subtype. (0.0 - 1.0)')
+                        help='Decimal proportion of maximum allowable '
+                             'missing kmers to be considered an '
+                             'intermediate subtype. (0.0 - 1.0)')
     parser.add_argument('--max-degenerate-kmers',
                         type=int,
-                        help='Maximum number of scheme k-mers allowed before quitting with a usage warning. Default is 100000')
+                        help='Maximum number of scheme k-mers allowed before '
+                             'quitting with a usage warning. Default is 100000')
     parser.add_argument('-t', '--threads',
                         type=int,
                         default=1,

diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py
@@ -17,16 +17,18 @@ def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -
         return None, None
 
     if st.avg_kmer_coverage < p.min_coverage_warning:
-        return QC.WARNING, f'Low coverage for all kmers ({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} expected)'
+        return QC.WARNING, f'Low coverage for all kmers ' \
+                           f'({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} ' \
+                           f'expected)'
     return None, None
 
 
 def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     """Are there more missing kmers than tolerated?
 
     Note:
-        For reads, calculate the average coverage depth from the kmers that are present and provide an adequate
-        error message based on the coverage.
+        For reads, calculate the average coverage depth from the kmers that
+        are present and provide an adequate error message based on the coverage.
 
     Args:
         st: Subtype results
@@ -47,18 +49,15 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple
                                        p=p)
     else:
         message_list = []
-
         subtype_list = st.subtype.split(';')
         n_kmers_matching_expected = st.n_kmers_matching_all_expected.split(';')
-
         dfpos = df[df.is_pos_kmer]
         mixed_subtype_counts = get_mixed_subtype_kmer_counts(dfpos=dfpos,
                                                              subtype_list=subtype_list)
-
         kmers_matching_negative = st.n_kmers_matching_negative
         for curr_subtype, exp in zip(subtype_list, n_kmers_matching_expected):
-            # We can omit the status because there will be a fail status already from non consistent subtypes.
-
+            # We can omit the status because there will be a fail status
+            # already from non consistent subtypes.
             obs = mixed_subtype_counts.get(curr_subtype) + int(kmers_matching_negative)
             _, curr_messages = check_for_missing_kmers(is_fastq=st.is_fastq_input(),
                                                        subtype_result=curr_subtype,
@@ -69,9 +68,7 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple
                                                        p=p)
 
             message_list.append(curr_messages)
-
         error_messages = ' | '.join(filter(None.__ne__, message_list))
-
         return QC.FAIL, error_messages
 
 
@@ -102,19 +99,23 @@ def check_for_missing_kmers(is_fastq: bool,
     messages = None
 
     # proportion of missing kmers
-    p_missing = (exp - obs) / exp  # type: float
+    p_missing: float = (exp - obs) / exp
     if p_missing > p.max_perc_missing_kmers:
         status = QC.FAIL
         if is_fastq:
-            kmers_with_hits = df[df['is_kmer_freq_okay']]  # type: pd.DataFrame
+            kmers_with_hits: pd.DataFrame = df[df['is_kmer_freq_okay']]
             depth = kmers_with_hits['freq'].mean()
             if depth < p.low_coverage_depth_freq:
-                coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
-                               f'you may need more WGS data.'
+                coverage_msg = f'Low coverage depth ({depth:.1f} ' \
+                               f'< {float(p.low_coverage_depth_freq):.1f} ' \
+                               f'expected); you may need more WGS data.'
             else:
-                coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
-                               f'but this may be the wrong serovar or species for scheme "{scheme}"'
-            messages = f'{p_missing:.2%} missing kmers; more than {p.max_perc_missing_kmers:.2%} missing ' \
+                coverage_msg = f'Okay coverage depth ({depth:.1f} ' \
+                               f'>= {float(p.low_coverage_depth_freq):.1f} ' \
+                               f'expected), but this may be the wrong serovar ' \
+                               f'or species for scheme "{scheme}"'
+            messages = f'{p_missing:.2%} missing kmers; more than ' \
+                       f'{p.max_perc_missing_kmers:.2%} missing ' \
                        f'kmers threshold. {coverage_msg}'
         else:
             messages = f'{p_missing:.2%} missing kmers for subtype "{subtype_result}"; more than ' \
@@ -150,8 +151,10 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str
                     f'the same target site{s} {positions} for subtype "{st.subtype}".'
 
 
-def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
-    Optional[str], Optional[str]]:
+def is_missing_too_many_target_sites(st: Subtype,
+                                     df: pd.DataFrame,
+                                     p: SubtypingParams) \
+        -> Tuple[Optional[str], Optional[str]]:
     """Are there too many missing target sites for an expected subtype?
 
     Check if there are any refpositions missing from the subtyping scheme in the result.
@@ -208,9 +211,11 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op
     """Are there any missing nested subtypes in the final subtype call?
 
     Note:
-        This method will check if there's any missing_nested_subtypes in the result, which would indicate a non confident
-        result. This is due to the fact that if you have a subtyping result of `2.1.1.2` and you're missing `2.1.1` or `2.1`
-        that you can't be sure that the subtype's final call is 2.1.1.2 due to the missing information.
+        This method will check if there's any missing_nested_subtypes in the
+        result, which would indicate a non confident result. This is due to
+        the fact that if you have a subtyping result of `2.1.1.2` and
+        you're missing `2.1.1` or `2.1` that you can't be sure that the
+        subtype's final call is 2.1.1.2 due to the missing information.
 
     Args:
         st: Subtype results
@@ -226,8 +231,10 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op
     return None, None
 
 
-def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
-    Optional[str], Optional[str]]:
+def is_maybe_intermediate_subtype(st: Subtype,
+                                  df: pd.DataFrame,
+                                  p: SubtypingParams) \
+        -> Tuple[Optional[str], Optional[str]]:
     """Is the result a possible intermediate subtype?
 
     Return a WARNING message if all the conditions are true:

diff --git a/bio_hansel/subtype.py b/bio_hansel/subtype.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
+import os
 from typing import List, Optional
 
 import attr
-import os
 
 from .const import REGEX_FASTQ
 
@@ -15,7 +15,7 @@ class Subtype(object):
     scheme_version = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
     subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
     non_present_subtypes = attr.ib(default=None)  # type: Optional[List[str]]
-    missing_nested_subtypes = attr.ib(default=None) 
+    missing_nested_subtypes = attr.ib(default=None)
     all_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
     inconsistent_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
     kmers_matching_subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
@@ -46,12 +46,12 @@ def _file_path_validator(self, attribute, value):
                 if not os.path.exists(x):
                     raise OSError('Input file "{}" does not exist!'.format(x))
         else:
-            raise ValueError('Unexpected type for input file path "{}": {}'.format(type(value), value))
+            raise ValueError(f'Unexpected type for input file path "{type(value)}": {value}')
 
     def is_fastq_input(self):
         if isinstance(self.file_path, str):
             return bool(REGEX_FASTQ.match(self.file_path))
         elif isinstance(self.file_path, list):
             return all(bool(REGEX_FASTQ.match(x)) for x in self.file_path)
         else:
-            raise ValueError('Unexpected type "{}" for "file_path": {}'.format(type(self.file_path), self.file_path))
+            raise ValueError(f'Unexpected type "{type(self.file_path)}" for "file_path": {self.file_path}')
diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py
@@ -481,7 +481,7 @@ def missing_nested_subtypes(subtype_result: str, positive_subtypes: Set[str]) ->
     Args:
         subtype_result: Final subtype result
         positive_subtypes: Set of unique positive subtypes found
-    
+
     Returns:
         String of missing hierarchical subtypes or `None` if there are no missing nested hierarchical subtypes.
     """

diff --git a/bio_hansel/utils.py b/bio_hansel/utils.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import argparse
 import logging
 import os
 import re
@@ -44,7 +45,7 @@ def genome_name_from_fasta_path(fasta_path: str) -> str:
     """
     filename = os.path.basename(fasta_path)
     filename = re.sub(r'\.gz$', '', filename)
-    return re.sub(r'\.(fa|fas|fasta|fna|\w{1,})(\.gz)?$', '', filename)
+    return re.sub(r'\.(fa|fas|fasta|fna|\w+)(\.gz)?$', '', filename)
 
 
 def compare_subtypes(a: List[Any], b: List[Any]) -> bool:
@@ -61,12 +62,12 @@ def find_inconsistent_subtypes(subtypes: List[List[int]]) -> List[str]:
             is_consistent = compare_subtypes(a, b)
             if not is_consistent:
                 incon.append((a, b))
-    l = []
+    all_inconsistent_subtypes = []
     for a, b in incon:
         astr = '.'.join([str(x) for x in a])
         bstr = '.'.join([str(x) for x in b])
-        l += [astr, bstr]
-    c = Counter(l)
+        all_inconsistent_subtypes += [astr, bstr]
+    c = Counter(all_inconsistent_subtypes)
     incon_subtypes = []
     for subtype, freq in c.most_common():
         if freq >= 1:
@@ -171,7 +172,7 @@ def is_gzipped(p: str) -> bool:
     return bool(re.match(r'^.+\.gz$', p))
 
 
-def init_subtyping_params(args: Optional[Any] = None,
+def init_subtyping_params(args: Optional[argparse.Namespace] = None,
                           scheme: Optional[str] = None) -> SubtypingParams:
     """Initialize subtyping parameters based on command-line arguments and scheme defaults
 
@@ -182,10 +183,8 @@ def init_subtyping_params(args: Optional[Any] = None,
     Returns:
         SubtypingParams with user-supplied values then scheme defaults then global defaults loaded
     """
-    subtyping_params = get_scheme_params(scheme)
-    if subtyping_params is None:
-        subtyping_params = SubtypingParams()
-    if args is not None:
+    subtyping_params = get_scheme_params(scheme) or SubtypingParams()
+    if args:
         if args.low_cov_depth_freq:
             subtyping_params.low_coverage_depth_freq = args.low_cov_depth_freq
         if args.max_missing_kmers:
@@ -255,4 +254,4 @@ def expand_degenerate_bases(seq):
          List of all possible kmers given a degenerate base or not
     """
 
-    return list(map("".join, product(*map(bases_dict.get, seq))))
+    return list(map("".join, product(*map(bases_dict.get, seq))))
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.6.0
+current_version = 2.6.1
 commit = True
 tag = True
 
@@ -24,3 +24,24 @@ test = pytest
 [tool:pytest]
 collect_ignore = ['setup.py']
 
+[tox:tox]
+envlist = py36, py37, py38, py39, flake8
+
+[gh-actions]
+python = 
+	3.6: py36
+	3.7: py37
+	3.8: py38
+	3.9: py39, flake8
+
+[testenv:flake8]
+basepython = python
+deps = flake8
+commands = 
+	flake8 bio_hansel --count --select=E9,F63,F7,F82 --show-source --statistics
+	flake8 bio_hansel --count --exit-zero --max-line-length=127 --statistics
+
+[testenv]
+deps = pytest
+commands = pytest
+
diff --git a/setup.py b/setup.py
@@ -14,10 +14,6 @@
     'rich'
 ]
 
-setup_requirements = ['pytest-runner', ]
-
-test_requirements = ['pytest', ]
-
 setup(
     author='Peter Kruczkiewicz',
     author_email='[email protected]',
@@ -38,18 +34,19 @@
     ],
     description='Subtype microbial whole-genome sequencing (WGS) '
                 'data using SNV targeting k-mer subtyping schemes.',
-    entry_points={'console_scripts': ['hansel=bio_hansel.main:main']},
+    entry_points={'console_scripts': [
+        'hansel=bio_hansel.main:main',
+        'biohansel=bio_hansel.main:main',
+    ]},
     install_requires=requirements,
     keywords='Salmonella enterica Heidelberg Enteritidis SNP kmer subtyping Aho-Corasick',
     license='Apache Software License 2.0',
     long_description=readme,
     name='bio_hansel',
     package_data={'bio_hansel': ['data/*/*.fasta', 'data/*/*.tsv',]},
     packages=find_packages(exclude=['test_*.py', 'tests']),
-    setup_requires=setup_requirements,
     test_suite='tests',
-    tests_require=test_requirements,
     url='https://github.com/phac-nml/biohansel',
-    version='2.6.0',
+    version='2.6.1',
     zip_safe=False,
 )