Skip to content

Commit

Permalink
Merge pull request #146 from phac-nml/development
Browse files Browse the repository at this point in the history
Patch release v2.6.1
  • Loading branch information
peterk87 authored Mar 5, 2021
2 parents e7f5b14 + 1834d8e commit 1f4da70
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 67 deletions.
16 changes: 4 additions & 12 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8, 3.9]
python-version: [3.6, 3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
Expand All @@ -23,14 +23,6 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
pip install .
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
python -m pip install flake8 pytest tox tox-gh-actions
- name: Test with tox
run: tox
2 changes: 1 addition & 1 deletion bio_hansel/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

__version__ = '2.6.0'
__version__ = '2.6.1'
program_name = 'bio_hansel'
program_summary = f'BioHansel version {__version__}: Subtype microbial genomes using SNV targeting k-mer subtyping ' \
f'schemes. '
Expand Down
22 changes: 16 additions & 6 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,17 @@ def init_parser():
help='Input genome FASTA/FASTQ files (can be Gzipped)')
parser.add_argument('-s', '--scheme',
default='heidelberg',
help='Scheme to use for subtyping (built-in: "heidelberg", "enteritidis", "typhi", "typhimurium", "tb_lineage"; OR user-specified: '
'/path/to/user/scheme)')
help='Scheme to use for subtyping (built-in: '
'"heidelberg", "enteritidis", "typhi", '
'"typhimurium", "tb_lineage"; '
'OR user-specified: /path/to/user/scheme)')
parser.add_argument('--scheme-name',
help='Custom user-specified SNP substyping scheme name')
parser.add_argument('-M', '--scheme-metadata',
help='Scheme subtype metadata table (tab-delimited file with ".tsv" or ".tab" extension or CSV with ".csv" extension format accepted; MUST contain column called "subtype")')
help='Scheme subtype metadata table (tab-delimited '
'file with ".tsv" or ".tab" extension or CSV '
'with ".csv" extension format accepted; MUST '
'contain column called "subtype")')
parser.add_argument('-p', '--paired-reads',
nargs=2,
metavar=('forward_reads', 'reverse_reads'),
Expand Down Expand Up @@ -99,7 +104,9 @@ def init_parser():
help='Frequencies below this coverage are considered low coverage')
parser.add_argument('--max-missing-kmers',
type=float,
help='Decimal proportion of maximum allowable missing kmers before being considered an error. (0.0 - 1.0)')
help='Decimal proportion of maximum allowable missing'
' kmers before being considered an error. '
'(0.0 - 1.0)')
parser.add_argument('--min-ambiguous-kmers',
type=int,
help='Minimum number of missing kmers to be considered an ambiguous result')
Expand All @@ -108,10 +115,13 @@ def init_parser():
help='Overall kmer coverage below this value will trigger a low coverage warning')
parser.add_argument('--max-intermediate-kmers',
type=float,
help='Decimal proportion of maximum allowable missing kmers to be considered an intermediate subtype. (0.0 - 1.0)')
help='Decimal proportion of maximum allowable '
'missing kmers to be considered an '
'intermediate subtype. (0.0 - 1.0)')
parser.add_argument('--max-degenerate-kmers',
type=int,
help='Maximum number of scheme k-mers allowed before quitting with a usage warning. Default is 100000')
help='Maximum number of scheme k-mers allowed before '
'quitting with a usage warning. Default is 100000')
parser.add_argument('-t', '--threads',
type=int,
default=1,
Expand Down
55 changes: 31 additions & 24 deletions bio_hansel/qc/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@ def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -
return None, None

if st.avg_kmer_coverage < p.min_coverage_warning:
return QC.WARNING, f'Low coverage for all kmers ({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} expected)'
return QC.WARNING, f'Low coverage for all kmers ' \
f'({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} ' \
f'expected)'
return None, None


def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
"""Are there more missing kmers than tolerated?
Note:
For reads, calculate the average coverage depth from the kmers that are present and provide an adequate
error message based on the coverage.
For reads, calculate the average coverage depth from the kmers that
are present and provide an adequate error message based on the coverage.
Args:
st: Subtype results
Expand All @@ -47,18 +49,15 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple
p=p)
else:
message_list = []

subtype_list = st.subtype.split(';')
n_kmers_matching_expected = st.n_kmers_matching_all_expected.split(';')

dfpos = df[df.is_pos_kmer]
mixed_subtype_counts = get_mixed_subtype_kmer_counts(dfpos=dfpos,
subtype_list=subtype_list)

kmers_matching_negative = st.n_kmers_matching_negative
for curr_subtype, exp in zip(subtype_list, n_kmers_matching_expected):
# We can omit the status because there will be a fail status already from non consistent subtypes.

# We can omit the status here because the inconsistent subtypes
# will already have produced a fail status.
obs = mixed_subtype_counts.get(curr_subtype) + int(kmers_matching_negative)
_, curr_messages = check_for_missing_kmers(is_fastq=st.is_fastq_input(),
subtype_result=curr_subtype,
Expand All @@ -69,9 +68,7 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple
p=p)

message_list.append(curr_messages)

error_messages = ' | '.join(filter(None.__ne__, message_list))

return QC.FAIL, error_messages


Expand Down Expand Up @@ -102,19 +99,23 @@ def check_for_missing_kmers(is_fastq: bool,
messages = None

# proportion of missing kmers
p_missing = (exp - obs) / exp # type: float
p_missing: float = (exp - obs) / exp
if p_missing > p.max_perc_missing_kmers:
status = QC.FAIL
if is_fastq:
kmers_with_hits = df[df['is_kmer_freq_okay']] # type: pd.DataFrame
kmers_with_hits: pd.DataFrame = df[df['is_kmer_freq_okay']]
depth = kmers_with_hits['freq'].mean()
if depth < p.low_coverage_depth_freq:
coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
f'you may need more WGS data.'
coverage_msg = f'Low coverage depth ({depth:.1f} ' \
f'< {float(p.low_coverage_depth_freq):.1f} ' \
f'expected); you may need more WGS data.'
else:
coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
f'but this may be the wrong serovar or species for scheme "{scheme}"'
messages = f'{p_missing:.2%} missing kmers; more than {p.max_perc_missing_kmers:.2%} missing ' \
coverage_msg = f'Okay coverage depth ({depth:.1f} ' \
f'>= {float(p.low_coverage_depth_freq):.1f} ' \
f'expected), but this may be the wrong serovar ' \
f'or species for scheme "{scheme}"'
messages = f'{p_missing:.2%} missing kmers; more than ' \
f'{p.max_perc_missing_kmers:.2%} missing ' \
f'kmers threshold. {coverage_msg}'
else:
messages = f'{p_missing:.2%} missing kmers for subtype "{subtype_result}"; more than ' \
Expand Down Expand Up @@ -150,8 +151,10 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str
f'the same target site{s} {positions} for subtype "{st.subtype}".'


def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
def is_missing_too_many_target_sites(st: Subtype,
df: pd.DataFrame,
p: SubtypingParams) \
-> Tuple[Optional[str], Optional[str]]:
"""Are there too many missing target sites for an expected subtype?
Check if there are any refpositions missing from the subtyping scheme in the result.
Expand Down Expand Up @@ -208,9 +211,11 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op
"""Are there any missing nested subtypes in the final subtype call?
Note:
This method will check if there's any missing_nested_subtypes in the result, which would indicate a non confident
result. This is due to the fact that if you have a subtyping result of `2.1.1.2` and you're missing `2.1.1` or `2.1`
that you can't be sure that the subtype's final call is 2.1.1.2 due to the missing information.
This method will check if there are any missing_nested_subtypes in the
result, which would indicate a non-confident result. If you have a
subtyping result of `2.1.1.2` and you're missing `2.1.1` or `2.1`,
you can't be sure that the subtype's final call is 2.1.1.2 due to
the missing information.
Args:
st: Subtype results
Expand All @@ -226,8 +231,10 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op
return None, None


def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
def is_maybe_intermediate_subtype(st: Subtype,
df: pd.DataFrame,
p: SubtypingParams) \
-> Tuple[Optional[str], Optional[str]]:
"""Is the result a possible intermediate subtype?
Return a WARNING message if all the conditions are true:
Expand Down
8 changes: 4 additions & 4 deletions bio_hansel/subtype.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
import os
from typing import List, Optional

import attr
import os

from .const import REGEX_FASTQ

Expand All @@ -15,7 +15,7 @@ class Subtype(object):
scheme_version = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
non_present_subtypes = attr.ib(default=None) # type: Optional[List[str]]
missing_nested_subtypes = attr.ib(default=None)
missing_nested_subtypes = attr.ib(default=None)
all_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
inconsistent_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
kmers_matching_subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str)))
Expand Down Expand Up @@ -46,12 +46,12 @@ def _file_path_validator(self, attribute, value):
if not os.path.exists(x):
raise OSError('Input file "{}" does not exist!'.format(x))
else:
raise ValueError('Unexpected type for input file path "{}": {}'.format(type(value), value))
raise ValueError(f'Unexpected type for input file path "{type(value)}": {value}')

def is_fastq_input(self):
    """Report whether the input file path(s) match the FASTQ filename pattern.

    Returns:
        bool: For a `str` `file_path`, whether it matches `REGEX_FASTQ`.
        For a `list` `file_path`, whether every element matches
        `REGEX_FASTQ`.

    Raises:
        ValueError: If `file_path` is neither a `str` nor a `list`.
    """
    if isinstance(self.file_path, str):
        return bool(REGEX_FASTQ.match(self.file_path))
    elif isinstance(self.file_path, list):
        return all(bool(REGEX_FASTQ.match(x)) for x in self.file_path)
    else:
        # The first placeholder must show the offending *type*; the second
        # shows the value itself.
        raise ValueError(f'Unexpected type "{type(self.file_path)}" for "file_path": {self.file_path}')
2 changes: 1 addition & 1 deletion bio_hansel/subtyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def missing_nested_subtypes(subtype_result: str, positive_subtypes: Set[str]) ->
Args:
subtype_result: Final subtype result
positive_subtypes: Set of unique positive subtypes found
Returns:
String of missing hierarchical subtypes or `None` if there are no missing nested hierarchical subtypes.
"""
Expand Down
19 changes: 9 additions & 10 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import argparse
import logging
import os
import re
Expand Down Expand Up @@ -44,7 +45,7 @@ def genome_name_from_fasta_path(fasta_path: str) -> str:
"""
filename = os.path.basename(fasta_path)
filename = re.sub(r'\.gz$', '', filename)
return re.sub(r'\.(fa|fas|fasta|fna|\w{1,})(\.gz)?$', '', filename)
return re.sub(r'\.(fa|fas|fasta|fna|\w+)(\.gz)?$', '', filename)


def compare_subtypes(a: List[Any], b: List[Any]) -> bool:
Expand All @@ -61,12 +62,12 @@ def find_inconsistent_subtypes(subtypes: List[List[int]]) -> List[str]:
is_consistent = compare_subtypes(a, b)
if not is_consistent:
incon.append((a, b))
l = []
all_inconsistent_subtypes = []
for a, b in incon:
astr = '.'.join([str(x) for x in a])
bstr = '.'.join([str(x) for x in b])
l += [astr, bstr]
c = Counter(l)
all_inconsistent_subtypes += [astr, bstr]
c = Counter(all_inconsistent_subtypes)
incon_subtypes = []
for subtype, freq in c.most_common():
if freq >= 1:
Expand Down Expand Up @@ -171,7 +172,7 @@ def is_gzipped(p: str) -> bool:
return bool(re.match(r'^.+\.gz$', p))


def init_subtyping_params(args: Optional[Any] = None,
def init_subtyping_params(args: Optional[argparse.Namespace] = None,
scheme: Optional[str] = None) -> SubtypingParams:
"""Initialize subtyping parameters based on command-line arguments and scheme defaults
Expand All @@ -182,10 +183,8 @@ def init_subtyping_params(args: Optional[Any] = None,
Returns:
SubtypingParams with user-supplied values then scheme defaults then global defaults loaded
"""
subtyping_params = get_scheme_params(scheme)
if subtyping_params is None:
subtyping_params = SubtypingParams()
if args is not None:
subtyping_params = get_scheme_params(scheme) or SubtypingParams()
if args:
if args.low_cov_depth_freq:
subtyping_params.low_coverage_depth_freq = args.low_cov_depth_freq
if args.max_missing_kmers:
Expand Down Expand Up @@ -255,4 +254,4 @@ def expand_degenerate_bases(seq):
List of all possible kmers given a degenerate base or not
"""

return list(map("".join, product(*map(bases_dict.get, seq))))
return list(map("".join, product(*map(bases_dict.get, seq))))
23 changes: 22 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 2.6.0
current_version = 2.6.1
commit = True
tag = True

Expand All @@ -24,3 +24,24 @@ test = pytest
[tool:pytest]
collect_ignore = ['setup.py']

[tox:tox]
envlist = py36, py37, py38, py39, flake8

[gh-actions]
python =
3.6: py36
3.7: py37
3.8: py38
3.9: py39

[testenv:flake8]
basepython = python
deps = flake8
commands =
flake8 bio_hansel --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 bio_hansel --count --exit-zero --max-line-length=127 --statistics

[testenv]
deps = pytest
commands = pytest

13 changes: 5 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@
'rich'
]

setup_requirements = ['pytest-runner', ]

test_requirements = ['pytest', ]

setup(
author='Peter Kruczkiewicz',
author_email='[email protected]',
Expand All @@ -38,18 +34,19 @@
],
description='Subtype microbial whole-genome sequencing (WGS) '
'data using SNV targeting k-mer subtyping schemes.',
entry_points={'console_scripts': ['hansel=bio_hansel.main:main']},
entry_points={'console_scripts': [
'hansel=bio_hansel.main:main',
'biohansel=bio_hansel.main:main',
]},
install_requires=requirements,
keywords='Salmonella enterica Heidelberg Enteritidis SNP kmer subtyping Aho-Corasick',
license='Apache Software License 2.0',
long_description=readme,
name='bio_hansel',
package_data={'bio_hansel': ['data/*/*.fasta', 'data/*/*.tsv',]},
packages=find_packages(exclude=['test_*.py', 'tests']),
setup_requires=setup_requirements,
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/phac-nml/biohansel',
version='2.6.0',
version='2.6.1',
zip_safe=False,
)

0 comments on commit 1f4da70

Please sign in to comment.