diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1df230a..6186bb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: [3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 @@ -23,14 +23,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest - pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest + python -m pip install flake8 pytest tox tox-gh-actions + - name: Test with tox + run: tox diff --git a/bio_hansel/__init__.py b/bio_hansel/__init__.py index c65abd2..a209de7 100644 --- a/bio_hansel/__init__.py +++ b/bio_hansel/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = '2.6.0' +__version__ = '2.6.1' program_name = 'bio_hansel' program_summary = f'BioHansel version {__version__}: Subtype microbial genomes using SNV targeting k-mer subtyping ' \ f'schemes. 
' diff --git a/bio_hansel/main.py b/bio_hansel/main.py index f52e286..6b1e116 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -53,12 +53,17 @@ def init_parser(): help='Input genome FASTA/FASTQ files (can be Gzipped)') parser.add_argument('-s', '--scheme', default='heidelberg', - help='Scheme to use for subtyping (built-in: "heidelberg", "enteritidis", "typhi", "typhimurium", "tb_lineage"; OR user-specified: ' - '/path/to/user/scheme)') + help='Scheme to use for subtyping (built-in: ' + '"heidelberg", "enteritidis", "typhi", ' + '"typhimurium", "tb_lineage"; ' + 'OR user-specified: /path/to/user/scheme)') parser.add_argument('--scheme-name', help='Custom user-specified SNP substyping scheme name') parser.add_argument('-M', '--scheme-metadata', - help='Scheme subtype metadata table (tab-delimited file with ".tsv" or ".tab" extension or CSV with ".csv" extension format accepted; MUST contain column called "subtype")') + help='Scheme subtype metadata table (tab-delimited ' + 'file with ".tsv" or ".tab" extension or CSV ' + 'with ".csv" extension format accepted; MUST ' + 'contain column called "subtype")') parser.add_argument('-p', '--paired-reads', nargs=2, metavar=('forward_reads', 'reverse_reads'), @@ -99,7 +104,9 @@ def init_parser(): help='Frequencies below this coverage are considered low coverage') parser.add_argument('--max-missing-kmers', type=float, - help='Decimal proportion of maximum allowable missing kmers before being considered an error. (0.0 - 1.0)') + help='Decimal proportion of maximum allowable missing' + ' kmers before being considered an error. 
' + '(0.0 - 1.0)') parser.add_argument('--min-ambiguous-kmers', type=int, help='Minimum number of missing kmers to be considered an ambiguous result') @@ -108,10 +115,13 @@ def init_parser(): help='Overall kmer coverage below this value will trigger a low coverage warning') parser.add_argument('--max-intermediate-kmers', type=float, - help='Decimal proportion of maximum allowable missing kmers to be considered an intermediate subtype. (0.0 - 1.0)') + help='Decimal proportion of maximum allowable ' + 'missing kmers to be considered an ' + 'intermediate subtype. (0.0 - 1.0)') parser.add_argument('--max-degenerate-kmers', type=int, - help='Maximum number of scheme k-mers allowed before quitting with a usage warning. Default is 100000') + help='Maximum number of scheme k-mers allowed before ' + 'quitting with a usage warning. Default is 100000') parser.add_argument('-t', '--threads', type=int, default=1, diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py index e4079ee..07742b1 100644 --- a/bio_hansel/qc/checks.py +++ b/bio_hansel/qc/checks.py @@ -17,7 +17,9 @@ def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) - return None, None if st.avg_kmer_coverage < p.min_coverage_warning: - return QC.WARNING, f'Low coverage for all kmers ({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} expected)' + return QC.WARNING, f'Low coverage for all kmers ' \ + f'({st.avg_kmer_coverage:.3f} < {p.min_coverage_warning} ' \ + f'expected)' return None, None @@ -25,8 +27,8 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple """Are there more missing kmers than tolerated? Note: - For reads, calculate the average coverage depth from the kmers that are present and provide an adequate - error message based on the coverage. + For reads, calculate the average coverage depth from the kmers that + are present and provide an adequate error message based on the coverage. 
Args: st: Subtype results @@ -47,18 +49,15 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple p=p) else: message_list = [] - subtype_list = st.subtype.split(';') n_kmers_matching_expected = st.n_kmers_matching_all_expected.split(';') - dfpos = df[df.is_pos_kmer] mixed_subtype_counts = get_mixed_subtype_kmer_counts(dfpos=dfpos, subtype_list=subtype_list) - kmers_matching_negative = st.n_kmers_matching_negative for curr_subtype, exp in zip(subtype_list, n_kmers_matching_expected): - # We can omit the status because there will be a fail status already from non consistent subtypes. - + # We can omit the status because there will be a fail status + # already from non consistent subtypes. obs = mixed_subtype_counts.get(curr_subtype) + int(kmers_matching_negative) _, curr_messages = check_for_missing_kmers(is_fastq=st.is_fastq_input(), subtype_result=curr_subtype, @@ -69,9 +68,7 @@ def is_missing_kmers(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple p=p) message_list.append(curr_messages) - error_messages = ' | '.join(filter(None.__ne__, message_list)) - return QC.FAIL, error_messages @@ -102,19 +99,23 @@ def check_for_missing_kmers(is_fastq: bool, messages = None # proportion of missing kmers - p_missing = (exp - obs) / exp # type: float + p_missing: float = (exp - obs) / exp if p_missing > p.max_perc_missing_kmers: status = QC.FAIL if is_fastq: - kmers_with_hits = df[df['is_kmer_freq_okay']] # type: pd.DataFrame + kmers_with_hits: pd.DataFrame = df[df['is_kmer_freq_okay']] depth = kmers_with_hits['freq'].mean() if depth < p.low_coverage_depth_freq: - coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \ - f'you may need more WGS data.' + coverage_msg = f'Low coverage depth ({depth:.1f} ' \ + f'< {float(p.low_coverage_depth_freq):.1f} ' \ + f'expected); you may need more WGS data.' 
else: - coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \ - f'but this may be the wrong serovar or species for scheme "{scheme}"' - messages = f'{p_missing:.2%} missing kmers; more than {p.max_perc_missing_kmers:.2%} missing ' \ + coverage_msg = f'Okay coverage depth ({depth:.1f} ' \ + f'>= {float(p.low_coverage_depth_freq):.1f} ' \ + f'expected), but this may be the wrong serovar ' \ + f'or species for scheme "{scheme}"' + messages = f'{p_missing:.2%} missing kmers; more than ' \ + f'{p.max_perc_missing_kmers:.2%} missing ' \ f'kmers threshold. {coverage_msg}' else: messages = f'{p_missing:.2%} missing kmers for subtype "{subtype_result}"; more than ' \ @@ -150,8 +151,10 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str f'the same target site{s} {positions} for subtype "{st.subtype}".' -def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[ - Optional[str], Optional[str]]: +def is_missing_too_many_target_sites(st: Subtype, + df: pd.DataFrame, + p: SubtypingParams) \ + -> Tuple[Optional[str], Optional[str]]: """Are there too many missing target sites for an expected subtype? Check if there are any refpositions missing from the subtyping scheme in the result. @@ -208,9 +211,11 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op """Are there any missing nested subtypes in the final subtype call? Note: - This method will check if there's any missing_nested_subtypes in the result, which would indicate a non confident - result. This is due to the fact that if you have a subtyping result of `2.1.1.2` and you're missing `2.1.1` or `2.1` - that you can't be sure that the subtype's final call is 2.1.1.2 due to the missing information. + This method will check if there's any missing_nested_subtypes in the + result, which would indicate a non confident result. 
This is due to + the fact that if you have a subtyping result of `2.1.1.2` and + you're missing `2.1.1` or `2.1` that you can't be sure that the + subtype's final call is 2.1.1.2 due to the missing information. Args: st: Subtype results @@ -226,8 +231,10 @@ def is_missing_hierarchical_kmers(st: Subtype, *args) -> Tuple[Optional[str], Op return None, None -def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[ - Optional[str], Optional[str]]: +def is_maybe_intermediate_subtype(st: Subtype, + df: pd.DataFrame, + p: SubtypingParams) \ + -> Tuple[Optional[str], Optional[str]]: """Is the result a possible intermediate subtype? Return a WARNING message if all the conditions are true: diff --git a/bio_hansel/subtype.py b/bio_hansel/subtype.py index f56e3f8..5be1d26 100644 --- a/bio_hansel/subtype.py +++ b/bio_hansel/subtype.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- +import os from typing import List, Optional import attr -import os from .const import REGEX_FASTQ @@ -15,7 +15,7 @@ class Subtype(object): scheme_version = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str))) subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str))) non_present_subtypes = attr.ib(default=None) # type: Optional[List[str]] - missing_nested_subtypes = attr.ib(default=None) + missing_nested_subtypes = attr.ib(default=None) all_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str))) inconsistent_subtypes = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str))) kmers_matching_subtype = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(str))) @@ -46,7 +46,7 @@ def _file_path_validator(self, attribute, value): if not os.path.exists(x): raise OSError('Input file "{}" does not exist!'.format(x)) else: - raise ValueError('Unexpected type for input file path "{}": 
{}'.format(type(value), value)) + raise ValueError(f'Unexpected type for input file path "{type(value)}": {value}') def is_fastq_input(self): if isinstance(self.file_path, str): @@ -54,4 +54,4 @@ def is_fastq_input(self): elif isinstance(self.file_path, list): return all(bool(REGEX_FASTQ.match(x)) for x in self.file_path) else: - raise ValueError('Unexpected type "{}" for "file_path": {}'.format(type(self.file_path), self.file_path)) + raise ValueError(f'Unexpected type "{type(self.file_path)}" for "file_path": {self.file_path}') diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index b55a3c1..1686023 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -481,7 +481,7 @@ def missing_nested_subtypes(subtype_result: str, positive_subtypes: Set[str]) -> Args: subtype_result: Final subtype result positive_subtypes: Set of unique positive subtypes found - + Returns: String of missing hierarchical subtypes or `None` if there are no missing nested hierarchical subtypes. """ diff --git a/bio_hansel/utils.py b/bio_hansel/utils.py index 318ef55..e5ba0ef 100644 --- a/bio_hansel/utils.py +++ b/bio_hansel/utils.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import argparse import logging import os import re @@ -44,7 +45,7 @@ def genome_name_from_fasta_path(fasta_path: str) -> str: """ filename = os.path.basename(fasta_path) filename = re.sub(r'\.gz$', '', filename) - return re.sub(r'\.(fa|fas|fasta|fna|\w{1,})(\.gz)?$', '', filename) + return re.sub(r'\.(fa|fas|fasta|fna|\w+)(\.gz)?$', '', filename) def compare_subtypes(a: List[Any], b: List[Any]) -> bool: @@ -61,12 +62,12 @@ def find_inconsistent_subtypes(subtypes: List[List[int]]) -> List[str]: is_consistent = compare_subtypes(a, b) if not is_consistent: incon.append((a, b)) - l = [] + all_inconsistent_subtypes = [] for a, b in incon: astr = '.'.join([str(x) for x in a]) bstr = '.'.join([str(x) for x in b]) - l += [astr, bstr] - c = Counter(l) + all_inconsistent_subtypes += [astr, bstr] + c = 
Counter(all_inconsistent_subtypes) incon_subtypes = [] for subtype, freq in c.most_common(): if freq >= 1: @@ -171,7 +172,7 @@ def is_gzipped(p: str) -> bool: return bool(re.match(r'^.+\.gz$', p)) -def init_subtyping_params(args: Optional[Any] = None, +def init_subtyping_params(args: Optional[argparse.Namespace] = None, scheme: Optional[str] = None) -> SubtypingParams: """Initialize subtyping parameters based on command-line arguments and scheme defaults @@ -182,10 +183,8 @@ def init_subtyping_params(args: Optional[Any] = None, Returns: SubtypingParams with user-supplied values then scheme defaults then global defaults loaded """ - subtyping_params = get_scheme_params(scheme) - if subtyping_params is None: - subtyping_params = SubtypingParams() - if args is not None: + subtyping_params = get_scheme_params(scheme) or SubtypingParams() + if args: if args.low_cov_depth_freq: subtyping_params.low_coverage_depth_freq = args.low_cov_depth_freq if args.max_missing_kmers: @@ -255,4 +254,4 @@ def expand_degenerate_bases(seq): List of all possible kmers given a degenerate base or not """ - return list(map("".join, product(*map(bases_dict.get, seq)))) \ No newline at end of file + return list(map("".join, product(*map(bases_dict.get, seq)))) diff --git a/setup.cfg b/setup.cfg index 157deff..2ac7394 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.6.0 +current_version = 2.6.1 commit = True tag = True @@ -24,3 +24,24 @@ test = pytest [tool:pytest] collect_ignore = ['setup.py'] +[tox:tox] +envlist = py36, py37, py38, py39, flake8 + +[gh-actions] +python = + 3.6: py36 + 3.7: py37 + 3.8: py38 + 3.9: py39, flake8 + +[testenv:flake8] +basepython = python +deps = flake8 +commands = + flake8 bio_hansel --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 bio_hansel --count --exit-zero --max-line-length=127 --statistics + +[testenv] +deps = pytest +commands = pytest + diff --git a/setup.py b/setup.py index 5cac07c..a8b5123 100644 --- 
a/setup.py +++ b/setup.py @@ -14,10 +14,6 @@ 'rich' ] -setup_requirements = ['pytest-runner', ] - -test_requirements = ['pytest', ] - setup( author='Peter Kruczkiewicz', author_email='peter.kruczkiewicz@gmail.com', @@ -38,7 +34,10 @@ ], description='Subtype microbial whole-genome sequencing (WGS) ' 'data using SNV targeting k-mer subtyping schemes.', - entry_points={'console_scripts': ['hansel=bio_hansel.main:main']}, + entry_points={'console_scripts': [ + 'hansel=bio_hansel.main:main', + 'biohansel=bio_hansel.main:main', + ]}, install_requires=requirements, keywords='Salmonella enterica Heidelberg Enteritidis SNP kmer subtyping Aho-Corasick', license='Apache Software License 2.0', @@ -46,10 +45,8 @@ name='bio_hansel', package_data={'bio_hansel': ['data/*/*.fasta', 'data/*/*.tsv',]}, packages=find_packages(exclude=['test_*.py', 'tests']), - setup_requires=setup_requirements, test_suite='tests', - tests_require=test_requirements, url='https://github.com/phac-nml/biohansel', - version='2.6.0', + version='2.6.1', zip_safe=False, )