From 7db5205be34a60b21f6e2474d266c372c272b2c3 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 4 May 2023 08:59:07 +0200 Subject: [PATCH] style: apply style black, flake8, isort (#108) Also includes this as a check in CI. --- .github/workflows/ci.yml | 47 +++- .gitignore | 4 + docs/utils.py | 16 +- fixes/fixer | 3 +- misc/rsync-log-summary | 1 - misc/threading-verification.py | 10 +- sbin/get-sequence-urls | 2 +- setup.cfg | 3 +- setup.py | 1 + src/biocommons/__init__.py | 1 + src/biocommons/seqrepo/__init__.py | 10 +- .../seqrepo/_internal/logging_support.py | 2 - src/biocommons/seqrepo/_internal/translate.py | 131 ++++----- src/biocommons/seqrepo/_versionwarning.py | 7 +- src/biocommons/seqrepo/cli.py | 254 ++++++++++-------- src/biocommons/seqrepo/config.py | 7 +- src/biocommons/seqrepo/dataproxy.py | 11 +- src/biocommons/seqrepo/fastadir/__init__.py | 2 +- .../fastadir/_data/migrations/0000-base.py | 6 +- .../fastadir/_data/migrations/0001-initial.py | 7 +- src/biocommons/seqrepo/fastadir/bases.py | 8 +- src/biocommons/seqrepo/fastadir/fabgz.py | 21 +- src/biocommons/seqrepo/fastadir/fastadir.py | 37 ++- src/biocommons/seqrepo/fastaiter/__init__.py | 2 +- src/biocommons/seqrepo/fastaiter/fastaiter.py | 5 +- src/biocommons/seqrepo/seqaliasdb/__init__.py | 2 +- .../seqaliasdb/_data/migrations/0000-base.py | 6 +- .../_data/migrations/0001-initial.py | 37 ++- .../seqrepo/seqaliasdb/seqaliasdb.py | 62 +++-- src/biocommons/seqrepo/seqrepo.py | 98 +++---- src/biocommons/seqrepo/utils.py | 1 - tests/conftest.py | 11 +- tests/test_cli.py | 31 ++- tests/test_fastadir.py | 3 +- tests/test_fastaiter.py | 4 +- tests/test_seqaliasdb.py | 33 +-- tests/test_seqrepo.py | 62 +++-- tests/test_utils.py | 13 +- 38 files changed, 534 insertions(+), 427 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da899de..6315f16 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,45 @@ jobs: env: GITHUB_TOKEN: '${{ 
secrets.GITHUB_TOKEN }}' + linting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + # Install / update package management tools + pip install -U pip setuptools + # Test dependencies + pip install -U pytest pytest-cov pytest-vcr black==23.3.0 "flake8>=6.0,<7.0" "isort>=5.0,<6.0" + # Dump installed packages and versions + pip freeze + + - name: Run linting tools + run: | + black -l 120 --check --diff . + isort --profile=black --check --diff . + flake8 src tests docs setup.py + + - name: Comment PR + if: github.event_name == 'pull_request' && failure() + uses: marocchino/sticky-pull-request-comment@v1.1.0 + with: + message: | + - Please format your Python code with [black](https://black.readthedocs.io): `make black` + - Please organize your imports with [isort](https://isort.readthedocs.io): `make isort` + - Please ensure that your code passes [flake8](https://flake8.pycqa.org/en/latest/): `make flake8` + + You can trigger all lints locally by running `black -l 120 --check . && isort --profile=black --check . && flake8 src tests docs setup.py` + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + testing: + needs: linting runs-on: ubuntu-latest strategy: matrix: @@ -36,12 +74,17 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install system dependencies + - name: Install dependencies run: | sudo apt install libhts-dev libhts3 libhtscodecs-dev libhtscodecs2 tabix + # Install / update package management tools. pip install -U pip setuptools - pip install -U pytest pytest-cov pytest-vcr + # Test dependencies + pip install -U pytest pytest-cov pytest-vcr black==23.3.0 "flake8>=6.0,<7.0" "isort>=5.0,<6.0" + # Install the local package itself in editable mode. pip install -e . 
+ # Dump installed packages and versions + pip freeze - name: Run tests run: | diff --git a/.gitignore b/.gitignore index 40dfebe..7ba9721 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Editors +.*.sw? +*~ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/utils.py b/docs/utils.py index e81f754..e1d227d 100644 --- a/docs/utils.py +++ b/docs/utils.py @@ -1,13 +1,11 @@ +import math from base64 import urlsafe_b64decode, urlsafe_b64encode from binascii import hexlify, unhexlify -import datetime -import math -import sys def _format_time(timespan, precision=3): """Formats the timespan in a human readable form - + >>> _format_time(0.35) '350 ms' @@ -26,21 +24,21 @@ def _format_time(timespan, precision=3): if timespan >= 60.0: # we have more than a minute, format that in a human readable form # Idea from http://snipplr.com/view/5713/ - parts = [("d", 60*60*24),("h", 60*60),("min", 60), ("s", 1)] + parts = [("d", 60 * 60 * 24), ("h", 60 * 60), ("min", 60), ("s", 1)] time = [] leftover = timespan for suffix, length in parts: value = int(leftover / length) if value > 0: leftover = leftover % length - time.append(u'%s%s' % (str(value), suffix)) + time.append(u"%s%s" % (str(value), suffix)) if leftover < 1: break return " ".join(time) - units = [u"s", u"ms", u"us", u"ns"] # the save value + units = [u"s", u"ms", u"us", u"ns"] # the save value scaling = [1, 1e3, 1e6, 1e9] - + if timespan > 0.0: order = min(-int(math.floor(math.log10(timespan)) // 3), 3) else: @@ -51,6 +49,6 @@ def _format_time(timespan, precision=3): def hex_to_base64url(s): return urlsafe_b64encode(unhexlify(s)).decode("ascii") + def base64url_to_hex(s): return hexlify(urlsafe_b64decode(s)).decode("ascii") - diff --git a/fixes/fixer b/fixes/fixer index fcbcc63..fd39fea 100755 --- a/fixes/fixer +++ b/fixes/fixer @@ -8,14 +8,13 @@ This script isn't for general users import argparse import logging -import sqlite3 import os +import sqlite3 import sys import coloredlogs 
import yaml - _logger = logging.getLogger() fixes_dir = os.path.dirname(sys.argv[0]) diff --git a/misc/rsync-log-summary b/misc/rsync-log-summary index 3d2cf34..f04397c 100755 --- a/misc/rsync-log-summary +++ b/misc/rsync-log-summary @@ -11,7 +11,6 @@ import sys import coloredlogs from dateutil.parser import parse - _logger = logging.getLogger(__name__) # e.g., 2016/08/31 01:24:34 [32383] rsync on seqrepo/ from ec2-....compute.amazonaws.com (52.34.43.195) diff --git a/misc/threading-verification.py b/misc/threading-verification.py index fc4efc9..235d3bd 100644 --- a/misc/threading-verification.py +++ b/misc/threading-verification.py @@ -22,9 +22,9 @@ import os -from multiprocessing import Process, Queue import sqlite3 import sys +from multiprocessing import Process, Queue from biocommons.seqrepo import SeqRepo @@ -37,7 +37,7 @@ def fetch_in_thread(sr, nsa): def fetch_seq(q, nsa): pid, ppid = os.getpid(), os.getppid() q.put((pid, ppid, sr[nsa])) - + q = Queue() p = Process(target=fetch_seq, args=(q, nsa)) p.start() @@ -46,9 +46,9 @@ def fetch_seq(q, nsa): assert pid != ppid, "sequence was not fetched from thread" return pid, ppid, seq - -def make_seqrepo(writeable): + +def make_seqrepo(writeable): sr = SeqRepo("/tmp/sr", writeable=True) sr.store("SMELLASSWEET", [{"namespace": "en", "alias": "rose"}, {"namespace": "fr", "alias": "rose"}]) @@ -70,6 +70,6 @@ def _test(sr): print("sys.platform: " + sys.platform) print("sys.version: " + sys.version.replace("\n", " ")) print("sqlite3.sqlite_version: " + sqlite3.sqlite_version) - + _test(make_seqrepo(writeable=False)) _test(make_seqrepo(writeable=True)) diff --git a/sbin/get-sequence-urls b/sbin/get-sequence-urls index 87dfc0b..8759cfe 100755 --- a/sbin/get-sequence-urls +++ b/sbin/get-sequence-urls @@ -15,9 +15,9 @@ import logging import re import sys from urllib.request import urljoin, urlopen -from requests_html import HTMLSession, HTML import coloredlogs +from requests_html import HTML, HTMLSession _logger = 
logging.getLogger(__name__) diff --git a/setup.cfg b/setup.cfg index 5c3bc92..1cd277d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -103,6 +103,5 @@ all_files = 1 max-line-length = 120 exclude = tests/* max-complexity = 10 -ignore = E129,E221,E241,E251,E303,W291 - +ignore = E203, E266, E501, W503 diff --git a/setup.py b/setup.py index 460aabe..d5d43d7 100644 --- a/setup.py +++ b/setup.py @@ -1,2 +1,3 @@ from setuptools import setup + setup(use_scm_version=True) diff --git a/src/biocommons/__init__.py b/src/biocommons/__init__.py index 54720a6..96686a2 100644 --- a/src/biocommons/__init__.py +++ b/src/biocommons/__init__.py @@ -1,3 +1,4 @@ # pragma: nocover import pkg_resources + pkg_resources.declare_namespace(__name__) diff --git a/src/biocommons/seqrepo/__init__.py b/src/biocommons/seqrepo/__init__.py index 670a8da..51341fe 100644 --- a/src/biocommons/seqrepo/__init__.py +++ b/src/biocommons/seqrepo/__init__.py @@ -2,24 +2,24 @@ from __future__ import absolute_import, division, print_function, unicode_literals import logging -import pkg_resources import warnings +import pkg_resources + _logger = logging.getLogger(__name__) -from ._versionwarning import * +from ._versionwarning import * # noqa; F403 try: __version__ = pkg_resources.get_distribution(__name__).version -except pkg_resources.DistributionNotFound as e: # pragma: no cover +except pkg_resources.DistributionNotFound: # pragma: no cover warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning) __version__ = None _logger.info(__name__ + " " + __version__) -from .seqrepo import SeqRepo - +from .seqrepo import SeqRepo # noqa: F401, E402 # # Copyright 2016 biocommons.fastadir Contributors (https://github.com/biocommons/biocommons.fastadir/) diff --git a/src/biocommons/seqrepo/_internal/logging_support.py b/src/biocommons/seqrepo/_internal/logging_support.py index 85e1c52..41ac562 100644 --- a/src/biocommons/seqrepo/_internal/logging_support.py +++ 
b/src/biocommons/seqrepo/_internal/logging_support.py @@ -1,5 +1,3 @@ -import logging - class DuplicateFilter: """ Filters away duplicate log messages. diff --git a/src/biocommons/seqrepo/_internal/translate.py b/src/biocommons/seqrepo/_internal/translate.py index 33bb19b..a3e4090 100644 --- a/src/biocommons/seqrepo/_internal/translate.py +++ b/src/biocommons/seqrepo/_internal/translate.py @@ -17,7 +17,6 @@ import datetime - def translate_db2api(namespace, alias): """ >>> translate_db2api("VMC", "GS_1234") @@ -32,10 +31,7 @@ def translate_db2api(namespace, alias): if namespace == "LRG": return [("lrg", alias)] if namespace == "VMC": - return [ - ("sha512t24u", alias[3:] if alias else None), - ("ga4gh", "SQ." + alias[3:] if alias else None), - ] + return [("sha512t24u", alias[3:] if alias else None), ("ga4gh", "SQ." + alias[3:] if alias else None)] return [] @@ -53,21 +49,16 @@ def translate_api2db(namespace, alias): if namespace == "lrg": return [("LRG", alias)] if namespace == "sha512t24u": - return [ - ("VMC", "GS_" + alias if alias else None), - ] + return [("VMC", "GS_" + alias if alias else None)] if namespace == "ga4gh": - return [ - ("VMC", "GS_" + alias[3:]), - ] + return [("VMC", "GS_" + alias[3:])] return [] - def translate_alias_records(aliases_itr): """given an iterator of find_aliases results, return a stream with translated records""" - + for arec in aliases_itr: yield arec @@ -78,56 +69,70 @@ def translate_alias_records(aliases_itr): yield arec2 - - if __name__ == "__main__": aliases = [ - {'seqalias_id': 16, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'ncbiac/e', - 'added': datetime.datetime(2020, 7, 6, 5, 27, 23), - 'is_current': 1, - 'namespace': 'Ensembl'}, - {'seqalias_id': 16, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'ncbiac/e', - 'added': datetime.datetime(2020, 7, 6, 5, 27, 23), - 'is_current': 1, - 'namespace': 'ensembl'}, - {'seqalias_id': 3, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 
'be8a4c35767bb783a7b8b6dc04ba3718', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'MD5'}, - {'seqalias_id': 5, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'ncbiac', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'NCBI'}, - {'seqalias_id': 5, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'ncbiac', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'refseq'}, - {'seqalias_id': 4, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': '5W5mCzikufDcezdNTGKLa9zricw', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'SEGUID'}, - {'seqalias_id': 2, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'e56e660b38a4b9f0dc7b374d4c628b6bdceb89cc', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'SHA1'}, - {'seqalias_id': 1, - 'seq_id': '9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'alias': 'GS_9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6', - 'added': datetime.datetime(2020, 7, 6, 5, 10, 57), - 'is_current': 1, - 'namespace': 'VMC'} + { + "seqalias_id": 16, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "ncbiac/e", + "added": datetime.datetime(2020, 7, 6, 5, 27, 23), + "is_current": 1, + "namespace": "Ensembl", + }, + { + "seqalias_id": 16, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "ncbiac/e", + "added": datetime.datetime(2020, 7, 6, 5, 27, 23), + "is_current": 1, + "namespace": "ensembl", + }, + { + "seqalias_id": 3, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "be8a4c35767bb783a7b8b6dc04ba3718", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "MD5", + }, + { + "seqalias_id": 5, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "ncbiac", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "NCBI", + }, + { + "seqalias_id": 5, + "seq_id": 
"9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "ncbiac", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "refseq", + }, + { + "seqalias_id": 4, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "5W5mCzikufDcezdNTGKLa9zricw", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "SEGUID", + }, + { + "seqalias_id": 2, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "e56e660b38a4b9f0dc7b374d4c628b6bdceb89cc", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "SHA1", + }, + { + "seqalias_id": 1, + "seq_id": "9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "alias": "GS_9Sn3d56Fzds_c6ovS__sj1fbMd_Xd3J6", + "added": datetime.datetime(2020, 7, 6, 5, 10, 57), + "is_current": 1, + "namespace": "VMC", + }, ] diff --git a/src/biocommons/seqrepo/_versionwarning.py b/src/biocommons/seqrepo/_versionwarning.py index 2df7dac..f147926 100644 --- a/src/biocommons/seqrepo/_versionwarning.py +++ b/src/biocommons/seqrepo/_versionwarning.py @@ -9,11 +9,12 @@ __all__ = [] -version_warning = ("biocommons packages are tested and supported only on Python >= 3.6" - " (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)") +version_warning = ( + "biocommons packages are tested and supported only on Python >= 3.6" + " (https://github.com/biocommons/org/wiki/Migrating-to-Python-3.6)" +) _logger = logging.getLogger(__package__) if sys.version_info < (3, 6): _logger.warning(version_warning) - diff --git a/src/biocommons/seqrepo/cli.py b/src/biocommons/seqrepo/cli.py index 5aa6ed7..87eded6 100644 --- a/src/biocommons/seqrepo/cli.py +++ b/src/biocommons/seqrepo/cli.py @@ -20,43 +20,38 @@ import itertools import logging import os -import pprint import re import shutil import stat -import sys import subprocess +import sys import tempfile - import bioutils.assemblies import bioutils.seqfetcher -import six import tqdm -from . import __version__, SeqRepo +from . 
import SeqRepo, __version__ from .fastaiter import FastaIter from .utils import parse_defline, validate_aliases - SEQREPO_ROOT_DIR = os.environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo") DEFAULT_INSTANCE_NAME_RW = "master" DEFAULT_INSTANCE_NAME_RO = "latest" instance_name_new_re = re.compile(r"^20[12]\d-\d\d-\d\d$") # smells like a new datestamp, 2017-01-17 -instance_name_old_re = re.compile(r"^20[12]1\d\d\d\d\d$") # smells like an old datestamp, 20170117 -instance_name_re = re.compile(r"^20[12]\d-?\d\d-?\d\d$") # smells like a datestamp, 20170117 or 2017-01-17 +instance_name_old_re = re.compile(r"^20[12]1\d\d\d\d\d$") # smells like an old datestamp, 20170117 +instance_name_re = re.compile(r"^20[12]\d-?\d\d-?\d\d$") # smells like a datestamp, 20170117 or 2017-01-17 _logger = logging.getLogger(__name__) def _get_remote_instances(opts): line_re = re.compile(r"d[-rwx]{9}\s+[\d,]+ \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2} (.+)") - rsync_cmd = [opts.rsync_exe, "--no-motd", "--copy-dirlinks", - opts.remote_host + "::seqrepo"] + rsync_cmd = [opts.rsync_exe, "--no-motd", "--copy-dirlinks", opts.remote_host + "::seqrepo"] _logger.debug("Executing `" + " ".join(rsync_cmd) + "`") lines = subprocess.check_output(rsync_cmd).decode().splitlines()[1:] - dirs = (m.group(1) for m in (line_re.match(l) for l in lines) if m) + dirs = (m.group(1) for m in (line_re.match(line) for line in lines) if m) return sorted(list(filter(instance_name_new_re.match, dirs))) @@ -78,10 +73,13 @@ def parse_arguments(): top_p = argparse.ArgumentParser( description=__doc__.split("\n\n")[0], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog="seqrepo " + __version__ + ". See https://github.com/biocommons/biocommons.seqrepo for more information") + epilog="seqrepo %s. 
See https://github.com/biocommons/biocommons.seqrepo for more information" % __version__, + ) top_p.add_argument("--dry-run", "-n", default=False, action="store_true") top_p.add_argument("--remote-host", default="dl.biocommons.org", help="rsync server host") - top_p.add_argument("--root-directory", "-r", default=SEQREPO_ROOT_DIR, help="seqrepo root directory (SEQREPO_ROOT_DIR)") + top_p.add_argument( + "--root-directory", "-r", default=SEQREPO_ROOT_DIR, help="seqrepo root directory (SEQREPO_ROOT_DIR)" + ) top_p.add_argument("--rsync-exe", default="/usr/bin/rsync", help="path to rsync executable") top_p.add_argument("--verbose", "-v", action="count", default=0, help="be verbose; multiple accepted") top_p.add_argument("--version", action="version", version=__version__) @@ -95,56 +93,60 @@ def parse_arguments(): # add-assembly-names ap = subparsers.add_parser( - "add-assembly-names", help="add assembly aliases (from bioutils.assemblies) to existing sequences") + "add-assembly-names", help="add assembly aliases (from bioutils.assemblies) to existing sequences" + ) ap.set_defaults(func=add_assembly_names) ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable (i.e., not a snapshot)") + "--instance-name", + "-i", + default=DEFAULT_INSTANCE_NAME_RW, + help="instance name; must be writeable (i.e., not a snapshot)", + ) ap.add_argument( - "--partial-load", "-p", default=False, action="store_true", help="assign assembly aliases even if some sequences are missing") + "--partial-load", + "-p", + default=False, + action="store_true", + help="assign assembly aliases even if some sequences are missing", + ) ap.add_argument( - "--reload-all", "-r", default=False, action="store_true", help="reload all assemblies, not just missing ones") + "--reload-all", "-r", default=False, action="store_true", help="reload all assemblies, not just missing ones" + ) # export ap = subparsers.add_parser("export", help="export sequences") 
ap.set_defaults(func=export) - ap.add_argument("ALIASES", - nargs="*", - help="specific aliases to export") + ap.add_argument("ALIASES", nargs="*", help="specific aliases to export") ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name") - ap.add_argument( - "--namespace", - "-n", - help="namespace name (e.g., refseq, NCBI, Ensembl, LRG)", ) + ap.add_argument("--namespace", "-n", help="namespace name (e.g., refseq, NCBI, Ensembl, LRG)") # export aliases ap = subparsers.add_parser("export-aliases", help="export aliases") ap.set_defaults(func=export_aliases) ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RO, help="instance name") - ap.add_argument( - "--namespace", - "-n", - help="namespace name (e.g., refseq, NCBI, Ensembl, LRG)", ) + ap.add_argument("--namespace", "-n", help="namespace name (e.g., refseq, NCBI, Ensembl, LRG)") # fetch-load ap = subparsers.add_parser("fetch-load", help="fetch remote sequences by accession and load them (low-throughput!)") ap.set_defaults(func=fetch_load) ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable (i.e., not a snapshot)") - ap.add_argument( - "accessions", - nargs="+", - help="accessions (NCBI or Ensembl)", ) - ap.add_argument( - "--namespace", - "-n", - required=True, - help="namespace name (e.g., NCBI, Ensembl, LRG)", ) + "--instance-name", + "-i", + default=DEFAULT_INSTANCE_NAME_RW, + help="instance name; must be writeable (i.e., not a snapshot)", + ) + ap.add_argument("accessions", nargs="+", help="accessions (NCBI or Ensembl)") + ap.add_argument("--namespace", "-n", required=True, help="namespace name (e.g., NCBI, Ensembl, LRG)") # init ap = subparsers.add_parser("init", help="initialize seqrepo directory") ap.set_defaults(func=init) ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable (i.e., not a snapshot)") + 
"--instance-name", + "-i", + default=DEFAULT_INSTANCE_NAME_RW, + help="instance name; must be writeable (i.e., not a snapshot)", + ) # list-local-instances ap = subparsers.add_parser("list-local-instances", help="list local seqrepo instances") @@ -158,22 +160,21 @@ def parse_arguments(): ap = subparsers.add_parser("load", help="load a single fasta file") ap.set_defaults(func=load) ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable (i.e., not a snapshot)") - ap.add_argument( - "fasta_files", - nargs="+", - help="fasta files to load (compressed okay)", ) - ap.add_argument( - "--namespace", - "-n", - required=True, - help="namespace name (e.g., NCBI, Ensembl, LRG)", ) + "--instance-name", + "-i", + default=DEFAULT_INSTANCE_NAME_RW, + help="instance name; must be writeable (i.e., not a snapshot)", + ) + ap.add_argument("fasta_files", nargs="+", help="fasta files to load (compressed okay)") + ap.add_argument("--namespace", "-n", required=True, help="namespace name (e.g., NCBI, Ensembl, LRG)") # pull ap = subparsers.add_parser("pull", help="pull incremental update from seqrepo mirror") ap.set_defaults(func=pull) ap.add_argument("--instance-name", "-i", default=None, help="instance name") - ap.add_argument("--update-latest", "-l", default=False, action="store_true", help="set latest symlink to point to this instance") + ap.add_argument( + "--update-latest", "-l", default=False, action="store_true", help="set latest symlink to point to this instance" + ) # show-status ap = subparsers.add_parser("show-status", help="show seqrepo status") @@ -183,11 +184,13 @@ def parse_arguments(): # snapshot ap = subparsers.add_parser("snapshot", help="create a new read-only seqrepo snapshot") ap.set_defaults(func=snapshot) + ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable") ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, 
help="instance name; must be writeable") - ap.add_argument("--destination-name", "-d", - default=datetime.datetime.utcnow().strftime("%F"), - help="destination directory name (must not already exist)") + "--destination-name", + "-d", + default=datetime.datetime.utcnow().strftime("%F"), + help="destination directory name (must not already exist)", + ) # start-shell ap = subparsers.add_parser("start-shell", help="start interactive shell with initialized seqrepo") @@ -197,14 +200,12 @@ def parse_arguments(): # upgrade ap = subparsers.add_parser("upgrade", help="upgrade seqrepo database and directory") ap.set_defaults(func=upgrade) - ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable") + ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable") # update digests ap = subparsers.add_parser("update-digests", help="update computed digests in place") ap.set_defaults(func=update_digests) - ap.add_argument( - "--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable") + ap.add_argument("--instance-name", "-i", default=DEFAULT_INSTANCE_NAME_RW, help="instance name; must be writeable") # update latest (symlink) ap = subparsers.add_parser("update-latest", help="create symlink `latest` to newest seqrepo instance") @@ -216,6 +217,7 @@ def parse_arguments(): ############################################################################ + def add_assembly_names(opts): """add assembly names as aliases to existing sequences @@ -262,12 +264,22 @@ def add_assembly_names(opts): # all assembled-molecules (1..22, X, Y, MT) have ncbi aliases in seqrepo not_in_seqrepo = [s["refseq_ac"] for s in eq_sequences if s["refseq_ac"] not in ncbi_alias_map] if not_in_seqrepo: - _logger.warning("Assembly {an} references {n} accessions not in SeqRepo instance {opts.instance_name} (e.g., {acs})".format( - an=assy_name, n=len(not_in_seqrepo), 
opts=opts, acs=", ".join(not_in_seqrepo[:5]+["..."]), seqrepo_dir=seqrepo_dir)) + _logger.warning( + ( + "Assembly {an} references {n} accessions not in SeqRepo instance " + "{opts.instance_name} @ {seqrepo_dir} (e.g., {acs})" + ).format( + an=assy_name, + n=len(not_in_seqrepo), + opts=opts, + acs=", ".join(not_in_seqrepo[:5] + ["..."]), + seqrepo_dir=seqrepo_dir, + ) + ) if not opts.partial_load: _logger.warning("Skipping {an} (-p to enable partial loading)".format(an=assy_name)) continue - + eq_sequences = [es for es in eq_sequences if es["refseq_ac"] in ncbi_alias_map] _logger.info("Loading {n} new accessions for assembly {an}".format(an=assy_name, n=len(eq_sequences))) @@ -276,41 +288,45 @@ def add_assembly_names(opts): aliases = [{"namespace": assy_name, "alias": a} for a in [s["name"]] + s["aliases"]] for alias in aliases: sr.aliases.store_alias(seq_id=seq_id, **alias) - _logger.debug("Added assembly alias {a[namespace]}:{a[alias]} for {seq_id}".format(a=alias, seq_id=seq_id)) + _logger.debug( + "Added assembly alias {a[namespace]}:{a[alias]} for {seq_id}".format(a=alias, seq_id=seq_id) + ) sr.commit() -def export(opts): +def export(opts): # noqa: C901 seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) sr = SeqRepo(seqrepo_dir) if opts.ALIASES: + def alias_generator(): for alias in set(opts.ALIASES): - yield from sr.aliases.find_aliases(namespace=opts.namespace, # None okay - alias=alias, - translate_ncbi_namespace=True) + yield from sr.aliases.find_aliases( + namespace=opts.namespace, alias=alias, translate_ncbi_namespace=True # None okay + ) + def _rec_iterator(): """yield (srec, [arec]) tuples to export""" - grouped_alias_iterator = itertools.groupby(alias_generator(), - key=lambda arec: (arec["seq_id"])) + grouped_alias_iterator = itertools.groupby(alias_generator(), key=lambda arec: (arec["seq_id"])) for seq_id, arecs in grouped_alias_iterator: srec = sr.sequences.fetch_seqinfo(seq_id) srec["seq"] = sr.sequences.fetch(seq_id) yield srec, 
arecs - + elif opts.namespace: + def _rec_iterator(): """yield (srec, [arec]) tuples to export""" - alias_iterator = sr.aliases.find_aliases(namespace=opts.namespace, - translate_ncbi_namespace=True) - grouped_alias_iterator = itertools.groupby(alias_iterator, - key=lambda arec: (arec["seq_id"])) + alias_iterator = sr.aliases.find_aliases(namespace=opts.namespace, translate_ncbi_namespace=True) + grouped_alias_iterator = itertools.groupby(alias_iterator, key=lambda arec: (arec["seq_id"])) for seq_id, arecs in grouped_alias_iterator: srec = sr.sequences.fetch_seqinfo(seq_id) srec["seq"] = sr.sequences.fetch(seq_id) yield srec, arecs + else: + def _rec_iterator(): yield from sr @@ -318,17 +334,15 @@ def _rec_iterator(): nsad = _convert_alias_records_to_ns_dict(arecs) aliases = ["{ns}:{a}".format(ns=ns, a=a) for ns, aliases in sorted(nsad.items()) for a in aliases] print(">" + " ".join(aliases)) - for l in _wrap_lines(srec["seq"], 100): - print(l) - - + for line in _wrap_lines(srec["seq"], 100): + print(line) def export_aliases(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) sr = SeqRepo(seqrepo_dir) alias_iterator = sr.aliases.find_aliases(translate_ncbi_namespace=True) - grouped_alias_iterator = itertools.groupby(alias_iterator, key=lambda arec: (arec["seq_id"])) + grouped_alias_iterator = itertools.groupby(alias_iterator, key=lambda arec: (arec["seq_id"])) for _, arecs in grouped_alias_iterator: if opts.namespace: if not any(arec for arec in arecs if arec["namespace"] == opts.namespace): @@ -339,7 +353,7 @@ def export_aliases(opts): nsaliases.sort(key=lambda a: (not a.startswith("VMC:"), a)) # VMC first nsaliases[0] = nsaliases[0].replace("VMC:GS_", "GA4GH:SQ.") print("\t".join(nsaliases)) - + def fetch_load(opts): disable_bar = _logger.getEffectiveLevel() < logging.WARNING @@ -365,7 +379,7 @@ def init(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) if os.path.exists(seqrepo_dir) and len(os.listdir(seqrepo_dir)) 
> 0: raise IOError("{seqrepo_dir} exists and is not empty".format(seqrepo_dir=seqrepo_dir)) - sr = SeqRepo(seqrepo_dir, writeable=True) # flake8: noqa + sr = SeqRepo(seqrepo_dir, writeable=True) # noqa: F841 def list_local_instances(opts): @@ -381,6 +395,7 @@ def list_remote_instances(opts): for i in instances: print(" " + i) + def load(opts): # TODO: drop this test if opts.namespace == "-": @@ -407,8 +422,11 @@ def load(opts): seq_bar = tqdm.tqdm(FastaIter(fh), unit=" seqs", disable=disable_bar, leave=False) for defline, seq in seq_bar: n_seqs_seen += 1 - seq_bar.set_description("sequences: {nsa}/{nss} added/seen; aliases: {naa} added".format( - nss=n_seqs_seen, nsa=n_seqs_added, naa=n_aliases_added)) + seq_bar.set_description( + "sequences: {nsa}/{nss} added/seen; aliases: {naa} added".format( + nss=n_seqs_seen, nsa=n_seqs_added, naa=n_aliases_added + ) + ) aliases = parse_defline(defline, opts.namespace) validate_aliases(aliases) n_sa, n_aa = sr.store(seq, aliases) @@ -433,12 +451,12 @@ def pull(opts): return tmp_dir = tempfile.mkdtemp(dir=opts.root_directory, prefix=instance_name + ".") - os.rmdir(tmp_dir) # let rsync create it the directory + os.rmdir(tmp_dir) # let rsync create it the directory cmd = [opts.rsync_exe, "-aHP", "--no-motd"] if local_instances: latest_local_instance = local_instances[-1] - cmd += ["--link-dest=" + os.path.join(opts.root_directory, latest_local_instance) + "/"] + cmd += ["--link-dest=%s/" % os.path.join(opts.root_directory, latest_local_instance)] cmd += ["{h}::seqrepo/{i}/".format(h=opts.remote_host, i=instance_name), tmp_dir] _logger.debug("Executing: " + " ".join(cmd)) @@ -455,18 +473,29 @@ def show_status(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) tot_size = sum( os.path.getsize(os.path.join(dirpath, filename)) - for dirpath, dirnames, filenames in os.walk(seqrepo_dir) for filename in filenames) + for dirpath, dirnames, filenames in os.walk(seqrepo_dir) + for filename in filenames + ) sr = 
SeqRepo(seqrepo_dir) print("seqrepo {version}".format(version=__version__)) print("instance directory: {sr._root_dir}, {ts:.1f} GB".format(sr=sr, ts=tot_size / 1e9)) - print("backends: fastadir (schema {fd_v}), seqaliasdb (schema {sa_v}) ".format( - fd_v=sr.sequences.schema_version(), sa_v=sr.aliases.schema_version())) - print("sequences: {ss[n_sequences]} sequences, {ss[tot_length]} residues, {ss[n_files]} files".format( - ss=sr.sequences.stats())) print( - "aliases: {sa[n_aliases]} aliases, {sa[n_current]} current, {sa[n_namespaces]} namespaces, {sa[n_sequences]} sequences". - format(sa=sr.aliases.stats())) + "backends: fastadir (schema {fd_v}), seqaliasdb (schema {sa_v}) ".format( + fd_v=sr.sequences.schema_version(), sa_v=sr.aliases.schema_version() + ) + ) + print( + "sequences: {ss[n_sequences]} sequences, {ss[tot_length]} residues, {ss[n_files]} files".format( + ss=sr.sequences.stats() + ) + ) + print( + ( + "aliases: {sa[n_aliases]} aliases, {sa[n_current]} current, {sa[n_namespaces]} " + "namespaces, {sa[n_sequences]} sequences" + ).format(sa=sr.aliases.stats()) + ) return sr @@ -508,8 +537,12 @@ def snapshot(opts): os.mkdir(dp) # hard link sequence files - for rp in (os.path.join(dirpath, filename) for dirpath, _, filenames in os.walk(".") for filename in filenames - if ".bgz" in filename): + for rp in ( + os.path.join(dirpath, filename) + for dirpath, _, filenames in os.walk(".") + for filename in filenames + if ".bgz" in filename + ): dp = os.path.join(tmp_dir, rp) os.link(rp, dp) @@ -526,8 +559,11 @@ def _drop_write(p): new_mode = mode & ~mode_aw os.chmod(p, new_mode) - for dp in (os.path.join(dirpath, dirent) - for dirpath, dirnames, filenames in os.walk(tmp_dir) for dirent in dirnames + filenames): + for dp in ( + os.path.join(dirpath, dirent) + for dirpath, dirnames, filenames in os.walk(tmp_dir) + for dirent in dirnames + filenames + ): _drop_write(dp) _drop_write(tmp_dir) os.rename(tmp_dir, dst_dir) @@ -538,12 +574,18 @@ def _drop_write(p): def 
start_shell(opts): seqrepo_dir = os.path.join(opts.root_directory, opts.instance_name) - sr = SeqRepo(seqrepo_dir) + sr = SeqRepo(seqrepo_dir) # noqa: F841 import IPython - IPython.embed(header="\n".join([ - "seqrepo (https://github.com/biocommons/biocommons.seqrepo/)", "version: " + __version__, - "instance path: " + seqrepo_dir - ])) + + IPython.embed( + header="\n".join( + [ + "seqrepo (https://github.com/biocommons/biocommons.seqrepo/)", + "version: " + __version__, + "instance path: " + seqrepo_dir, + ] + ) + ) def upgrade(opts): @@ -578,18 +620,15 @@ def update_latest(opts, mri=None): def main(): opts = parse_arguments() - verbose_log_level = (logging.WARN if opts.verbose == 0 else - logging.INFO if opts.verbose == 1 else - logging.DEBUG) + verbose_log_level = logging.WARN if opts.verbose == 0 else logging.INFO if opts.verbose == 1 else logging.DEBUG logging.basicConfig(level=verbose_log_level) opts.func(opts) - - ############################################################################ # INTERNAL + def _convert_alias_records_to_ns_dict(records): """converts a set of alias db records to a dict like {ns: [aliases], ...} aliases are lexicographicaly sorted @@ -597,11 +636,10 @@ def _convert_alias_records_to_ns_dict(records): records = sorted(records, key=lambda r: (r["namespace"], r["alias"])) return {g: [r["alias"] for r in gi] for g, gi in itertools.groupby(records, key=lambda r: r["namespace"])} + def _wrap_lines(seq, line_width): for i in range(0, len(seq), line_width): - yield seq[i:i + line_width] - - + yield seq[i : i + line_width] if __name__ == "__main__": diff --git a/src/biocommons/seqrepo/config.py b/src/biocommons/seqrepo/config.py index 4209c35..abd6c36 100644 --- a/src/biocommons/seqrepo/config.py +++ b/src/biocommons/seqrepo/config.py @@ -4,8 +4,9 @@ seqrepo_env_var = os.environ.get("SEQREPO_LRU_CACHE_MAXSIZE", "1000000") SEQREPO_LRU_CACHE_MAXSIZE = int(seqrepo_env_var) except ValueError: - if seqrepo_env_var.lower() == 'none': + if 
seqrepo_env_var.lower() == "none": SEQREPO_LRU_CACHE_MAXSIZE = None else: - raise ValueError('SEQREPO_LRU_CACHE_MAXSIZE must be a valid int, none, or not set, ' - 'currently it is ' + seqrepo_env_var) + raise ValueError( + "SEQREPO_LRU_CACHE_MAXSIZE must be a valid int, none, or not set, " "currently it is " + seqrepo_env_var + ) diff --git a/src/biocommons/seqrepo/dataproxy.py b/src/biocommons/seqrepo/dataproxy.py index e5de0e3..19a8121 100644 --- a/src/biocommons/seqrepo/dataproxy.py +++ b/src/biocommons/seqrepo/dataproxy.py @@ -5,16 +5,15 @@ """ -from abc import ABC, abstractmethod -from collections.abc import Sequence import datetime import functools import logging import os +from abc import ABC, abstractmethod from urllib.parse import urlparse -from bioutils.accessions import coerce_namespace import requests +from bioutils.accessions import coerce_namespace _logger = logging.getLogger(__name__) @@ -205,8 +204,7 @@ def create_dataproxy(uri: str = None) -> _DataProxy: """ - uri = (uri - or os.environ.get("SEQREPO_DATAPROXY_URI", None)) + uri = uri or os.environ.get("SEQREPO_DATAPROXY_URI", None) if uri is None: raise ValueError("No data proxy URI provided or found in SEQREPO_DATAPROXY_URI") @@ -223,10 +221,11 @@ def create_dataproxy(uri: str = None) -> _DataProxy: if proto in ("", "file"): # pylint: disable=import-error, import-outside-toplevel from biocommons.seqrepo import SeqRepo + sr = SeqRepo(root_dir=parsed_uri.path) dp = SeqRepoDataProxy(sr) elif proto in ("http", "https"): - dp = SeqRepoRESTDataProxy(uri[len(provider) + 1:]) + dp = SeqRepoRESTDataProxy(uri[len(provider) + 1 :]) else: raise ValueError(f"SeqRepo URI scheme {parsed_uri.scheme} not implemented") diff --git a/src/biocommons/seqrepo/fastadir/__init__.py b/src/biocommons/seqrepo/fastadir/__init__.py index d0c58aa..0d2db26 100644 --- a/src/biocommons/seqrepo/fastadir/__init__.py +++ b/src/biocommons/seqrepo/fastadir/__init__.py @@ -1 +1 @@ -from .fastadir import FastaDir +from .fastadir import 
FastaDir # noqa: F401 diff --git a/src/biocommons/seqrepo/fastadir/_data/migrations/0000-base.py b/src/biocommons/seqrepo/fastadir/_data/migrations/0000-base.py index 5a828d2..a04b420 100644 --- a/src/biocommons/seqrepo/fastadir/_data/migrations/0000-base.py +++ b/src/biocommons/seqrepo/fastadir/_data/migrations/0000-base.py @@ -4,8 +4,10 @@ step("""create unique index meta_key_idx on meta(key)""", """drop index meta_key_idx""") -step("""create table log (ts timestamp not null default current_timestamp, v text not null, msg text not null);""", - """drop table log""") +step( + """create table log (ts timestamp not null default current_timestamp, v text not null, msg text not null);""", + """drop table log""", +) step("""insert into log (v,msg) values ('0', 'database created')""") diff --git a/src/biocommons/seqrepo/fastadir/_data/migrations/0001-initial.py b/src/biocommons/seqrepo/fastadir/_data/migrations/0001-initial.py index f0bedb0..e9791ff 100644 --- a/src/biocommons/seqrepo/fastadir/_data/migrations/0001-initial.py +++ b/src/biocommons/seqrepo/fastadir/_data/migrations/0001-initial.py @@ -1,13 +1,16 @@ from yoyo import step -step(""" +step( + """ create table seqinfo ( seq_id text primary key, len integer not null, alpha text not null, added timestamp not null default current_timestamp, relpath text not null -)""", """drop table seqinfo""") +)""", + """drop table seqinfo""", +) step("""create unique index seqinfo_seq_id_idx on seqinfo(seq_id)""") diff --git a/src/biocommons/seqrepo/fastadir/bases.py b/src/biocommons/seqrepo/fastadir/bases.py index 16446a8..4425bcc 100644 --- a/src/biocommons/seqrepo/fastadir/bases.py +++ b/src/biocommons/seqrepo/fastadir/bases.py @@ -4,17 +4,17 @@ @six.add_metaclass(abc.ABCMeta) -class BaseReader(): +class BaseReader: @abc.abstractmethod def fetch(self, seq_id, start, end): - pass # pragma: no cover + pass # pragma: no cover def __getitem__(self, ac): return self.fetch(ac) @six.add_metaclass(abc.ABCMeta) -class BaseWriter(): 
+class BaseWriter: @abc.abstractmethod def store(self, seq_id, seq): - pass # pragma: no cover + pass # pragma: no cover diff --git a/src/biocommons/seqrepo/fastadir/fabgz.py b/src/biocommons/seqrepo/fastadir/fabgz.py index 68120ac..cc70418 100644 --- a/src/biocommons/seqrepo/fastadir/fabgz.py +++ b/src/biocommons/seqrepo/fastadir/fabgz.py @@ -17,10 +17,8 @@ import subprocess import six - from pysam import FastaFile - _logger = logging.getLogger(__name__) line_width = 100 @@ -48,13 +46,20 @@ def _find_bgzip(): except AttributeError: raise RuntimeError("Didn't find version string in bgzip executable ({exe})".format(exe=exe)) except missing_file_exception: - raise RuntimeError("{exe} doesn't exist; you need to install htslib and tabix (See https://github.com/biocommons/biocommons.seqrepo#requirements)".format(exe=exe)) + raise RuntimeError( + "{exe} doesn't exist; you need to install htslib and tabix (See https://github.com/biocommons/biocommons.seqrepo#requirements)".format( + exe=exe + ) + ) except Exception: raise RuntimeError("Unknown error while executing {exe}".format(exe=exe)) bgzip_version_info = tuple(map(int, bgzip_version.split("."))) if bgzip_version_info < min_bgzip_version_info: - raise RuntimeError("bgzip ({exe}) {ev} is too old; >= {rv} is required; please upgrade".format( - exe=exe, ev=bgzip_version, rv=min_bgzip_version)) + raise RuntimeError( + "bgzip ({exe}) {ev} is too old; >= {rv} is required; please upgrade".format( + exe=exe, ev=bgzip_version, rv=min_bgzip_version + ) + ) _logger.info("Using bgzip {ev} ({exe})".format(ev=bgzip_version, exe=exe)) return exe @@ -104,12 +109,12 @@ def __init__(self, filename): def store(self, seq_id, seq): def wrap_lines(seq, line_width): for i in range(0, len(seq), line_width): - yield seq[i:i + line_width] + yield seq[i : i + line_width] if seq_id not in self._added: self._fh.write(">" + seq_id + "\n") - for l in wrap_lines(seq, line_width): - self._fh.write(l + "\n") + for line in wrap_lines(seq, line_width): 
+ self._fh.write(line + "\n") self._added.add(seq_id) _logger.debug("added seq_id {i}; length {l}".format(i=seq_id, l=len(seq))) return seq_id diff --git a/src/biocommons/seqrepo/fastadir/fastadir.py b/src/biocommons/seqrepo/fastadir/fastadir.py index 361ffc9..5637d24 100644 --- a/src/biocommons/seqrepo/fastadir/fastadir.py +++ b/src/biocommons/seqrepo/fastadir/fastadir.py @@ -8,8 +8,6 @@ import pkg_resources import yoyo - - from ..config import SEQREPO_LRU_CACHE_MAXSIZE from .bases import BaseReader, BaseWriter from .fabgz import FabgzReader, FabgzWriter @@ -65,23 +63,26 @@ def __init__(self, root_dir, writeable=False, check_same_thread=True): os.makedirs(self._root_dir, exist_ok=True) self._upgrade_db() - self._db = sqlite3.connect(self._db_path, - check_same_thread=check_same_thread, - detect_types=sqlite3.PARSE_DECLTYPES) + self._db = sqlite3.connect( + self._db_path, check_same_thread=check_same_thread, detect_types=sqlite3.PARSE_DECLTYPES + ) schema_version = self.schema_version() self._db.row_factory = sqlite3.Row # if we're not at the expected schema version for this code, bail if schema_version != expected_schema_version: - raise RuntimeError("""Upgrade required: Database schema - version is {} and code expects {}""".format(schema_version, expected_schema_version)) + raise RuntimeError( + """Upgrade required: Database schema + version is {} and code expects {}""".format( + schema_version, expected_schema_version + ) + ) # ############################################################################ # Special methods def __contains__(self, seq_id): - c = self._fetch_one("select exists(select 1 from seqinfo where seq_id = ? limit 1) as ex", - (seq_id, )) + c = self._fetch_one("select exists(select 1 from seqinfo where seq_id = ? 
limit 1) as ex", (seq_id,)) return True if c["ex"] else False @@ -113,8 +114,12 @@ def fetch(self, seq_id, start=None, end=None): rec = self.fetch_seqinfo(seq_id) if self._writing and self._writing["relpath"] == rec["relpath"]: - _logger.warning("""Fetching from file opened for writing; - closing first ({})""".format(rec["relpath"])) + _logger.warning( + """Fetching from file opened for writing; + closing first ({})""".format( + rec["relpath"] + ) + ) self.commit() path = os.path.join(self._root_dir, rec["relpath"]) @@ -175,8 +180,11 @@ def store(self, seq_id, seq): self._writing["fabgz"].store(seq_id, seq) alpha = "".join(sorted(set(seq))) cursor = self._db.cursor() - cursor.execute("""insert into seqinfo (seq_id, len, alpha, relpath) - values (?, ?, ?,?)""", (seq_id, len(seq), alpha, self._writing["relpath"])) + cursor.execute( + """insert into seqinfo (seq_id, len, alpha, relpath) + values (?, ?, ?,?)""", + (seq_id, len(seq), alpha, self._writing["relpath"]), + ) return seq_id # ############################################################################ @@ -190,7 +198,7 @@ def _fetch_one(self, sql, params=()): def _upgrade_db(self): """upgrade db using scripts for specified (current) schema version""" migration_path = "_data/migrations" - sqlite3.connect(self._db_path).close() # ensure that it exists + sqlite3.connect(self._db_path).close() # ensure that it exists db_url = "sqlite:///" + self._db_path backend = yoyo.get_backend(db_url) migration_dir = pkg_resources.resource_filename(__package__, migration_path) @@ -205,6 +213,7 @@ def _open_for_reading(self, path): def _dump_aliases(self): import prettytable + fields = "seq_id len alpha added relpath".split() pt = prettytable.PrettyTable(field_names=fields) cursor = self._db.cursor() diff --git a/src/biocommons/seqrepo/fastaiter/__init__.py b/src/biocommons/seqrepo/fastaiter/__init__.py index 7d7b810..b838a5e 100644 --- a/src/biocommons/seqrepo/fastaiter/__init__.py +++ 
b/src/biocommons/seqrepo/fastaiter/__init__.py @@ -1 +1 @@ -from .fastaiter import FastaIter +from .fastaiter import FastaIter # noqa: F401 diff --git a/src/biocommons/seqrepo/fastaiter/fastaiter.py b/src/biocommons/seqrepo/fastaiter/fastaiter.py index afa6cc4..f9aa6fd 100644 --- a/src/biocommons/seqrepo/fastaiter/fastaiter.py +++ b/src/biocommons/seqrepo/fastaiter/fastaiter.py @@ -4,12 +4,13 @@ def FastaIter(handle): Lines before the start of the first record are ignored. """ + seq_lines = [] header = None for line in handle: if line.startswith(">"): if header is not None: # not the first record yield header, "".join(seq_lines) - seq_lines = list() + seq_lines = [] header = line[1:].rstrip() else: if header is not None: # not the first record @@ -17,5 +18,5 @@ def FastaIter(handle): if header is not None: yield header, "".join(seq_lines) - else: # no FASTA records in file + else: # no FASTA records in file return diff --git a/src/biocommons/seqrepo/seqaliasdb/__init__.py b/src/biocommons/seqrepo/seqaliasdb/__init__.py index 008c939..06b1794 100644 --- a/src/biocommons/seqrepo/seqaliasdb/__init__.py +++ b/src/biocommons/seqrepo/seqaliasdb/__init__.py @@ -1 +1 @@ -from .seqaliasdb import SeqAliasDB +from .seqaliasdb import SeqAliasDB # noqa: F401 diff --git a/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0000-base.py b/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0000-base.py index 4644063..d76fcb1 100644 --- a/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0000-base.py +++ b/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0000-base.py @@ -4,8 +4,10 @@ step("""create unique index meta_key_idx on meta(key)""", """drop index meta_key_idx""") -step("""create table log (ts timestamp not null default current_timestamp, v text not null, msg text not null);""", - """drop table log""") +step( + """create table log (ts timestamp not null default current_timestamp, v text not null, msg text not null);""", + """drop table log""", +) step("""insert into log 
(v,msg) values ('', 'database created')""") diff --git a/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0001-initial.py b/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0001-initial.py index 2ffd6a9..77266b6 100644 --- a/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0001-initial.py +++ b/src/biocommons/seqrepo/seqaliasdb/_data/migrations/0001-initial.py @@ -1,6 +1,7 @@ from yoyo import step -step(""" +step( + """ create table seqalias ( seqalias_id integer primary key, seq_id text not null, @@ -8,25 +9,37 @@ alias text not null, added timestamp not null default current_timestamp, is_current int not null default 1 -)""", """drop table seqalias""") +)""", + """drop table seqalias""", +) # current alias must be unique with a namespace -step(""" +step( + """ create unique index seqalias_unique_ns_alias_idx on seqalias(namespace, alias) where is_current = 1 -""") +""" +) -step(""" +step( + """ create index seqalias_seq_id_idx on seqalias(seq_id) -""") +""" +) -step(""" +step( + """ create index seqalias_namespace_idx on seqalias(namespace) -""") +""" +) -step(""" +step( + """ create index seqalias_alias_idx on seqalias(alias) -""") +""" +) -step(""" +step( + """ update meta set value = '1' where key = 'schema version' -""") +""" +) diff --git a/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py b/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py index bf1410c..55c454b 100644 --- a/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py +++ b/src/biocommons/seqrepo/seqaliasdb/seqaliasdb.py @@ -1,4 +1,3 @@ -import itertools import logging import sqlite3 @@ -6,19 +5,19 @@ import yoyo from .._internal.translate import translate_alias_records, translate_api2db -from .._internal.logging_support import DuplicateFilter _logger = logging.getLogger(__name__) -#_logger.addFilter(DuplicateFilter()) +# _logger.addFilter(DuplicateFilter()) expected_schema_version = 1 min_sqlite_version_info = (3, 8, 0) -if sqlite3.sqlite_version_info < min_sqlite_version_info: # pragma: no cover +if 
sqlite3.sqlite_version_info < min_sqlite_version_info: # pragma: no cover min_sqlite_version = ".".join(map(str, min_sqlite_version_info)) - msg = "{} requires sqlite3 >= {} but {} is installed".format(__package__, min_sqlite_version, - sqlite3.sqlite_version) + msg = "{} requires sqlite3 >= {} but {} is installed".format( + __package__, min_sqlite_version, sqlite3.sqlite_version + ) raise ImportError(msg) @@ -33,27 +32,30 @@ def __init__(self, db_path, writeable=False, translate_ncbi_namespace=None, chec self._writeable = writeable if translate_ncbi_namespace is not None: - _logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") + _logger.warning( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) if self._writeable: self._upgrade_db() - self._db = sqlite3.connect(self._db_path, - check_same_thread=check_same_thread, - detect_types=sqlite3.PARSE_DECLTYPES) + self._db = sqlite3.connect( + self._db_path, check_same_thread=check_same_thread, detect_types=sqlite3.PARSE_DECLTYPES + ) self._db.row_factory = sqlite3.Row schema_version = self.schema_version() # if we're not at the expected schema version for this code, bail - if schema_version != expected_schema_version: # pragma: no cover - raise RuntimeError("Upgrade required: Database schema" - "version is {} and code expects {}".format(schema_version, expected_schema_version)) + if schema_version != expected_schema_version: # pragma: no cover + raise RuntimeError( + "Upgrade required: Database schema" + "version is {} and code expects {}".format(schema_version, expected_schema_version) + ) # ############################################################################ # Special methods def __contains__(self, seq_id): cursor = self._db.cursor() - cursor.execute("select exists(select 1 from seqalias where seq_id = ? 
limit 1) as ex", - (seq_id, )) + cursor.execute("select exists(select 1 from seqalias where seq_id = ? limit 1) as ex", (seq_id,)) c = cursor.fetchone() return True if c["ex"] else False @@ -68,13 +70,14 @@ def fetch_aliases(self, seq_id, current_only=True, translate_ncbi_namespace=None """return list of alias annotation records (dicts) for a given seq_id""" _logger.warning("SeqAliasDB::fetch_aliases() is deprecated; use find_aliases(seq_id=...) instead") if translate_ncbi_namespace is not None: - _logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") - return [dict(r) for r in self.find_aliases(seq_id=seq_id, - current_only=current_only)] + _logger.warning( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) + return [dict(r) for r in self.find_aliases(seq_id=seq_id, current_only=current_only)] def find_aliases(self, seq_id=None, namespace=None, alias=None, current_only=True, translate_ncbi_namespace=None): """returns iterator over alias annotation dicts that match criteria - + The arguments, all optional, restrict the records that are returned. Without arguments, all aliases are returned. 
@@ -92,7 +95,9 @@ def eq_or_like(s): return "like" if "%" in s else "=" if translate_ncbi_namespace is not None: - _logger.warning("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") + _logger.warning( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) if namespace is not None: ns_api2db = translate_api2db(namespace, alias) @@ -154,16 +159,17 @@ def store_alias(self, seq_id, namespace, alias): log_pfx = "store({q},{n},{a})".format(n=namespace, a=alias, q=seq_id) cursor = self._db.cursor() try: - cursor.execute("insert into seqalias (seq_id, namespace, alias) values (?, ?, ?)", (seq_id, namespace, - alias)) + cursor.execute( + "insert into seqalias (seq_id, namespace, alias) values (?, ?, ?)", (seq_id, namespace, alias) + ) # success => new record return cursor.lastrowid except Exception as ex: # Every driver has own class for IntegrityError so we have to # investigate if the exception class name contains 'IntegrityError' # which we can ignore - if not type(ex).__name__.endswith('IntegrityError'): - raise(ex) + if not type(ex).__name__.endswith("IntegrityError"): + raise (ex) # IntegrityError fall-through # existing record is guaranteed to exist uniquely; fetchone() should always succeed @@ -180,14 +186,12 @@ def store_alias(self, seq_id, namespace, alias): cursor.execute("update seqalias set is_current = 0 where seqalias_id = ?", [current_rec["seqalias_id"]]) return self.store_alias(seq_id, namespace, alias) - - - # ############################################################################ # Internal methods - def _dump_aliases(self): # pragma: no cover + def _dump_aliases(self): # pragma: no cover import prettytable + cursor = self._db.cursor() fields = "seqalias_id seq_id namespace alias added is_current".split() pt = prettytable.PrettyTable(field_names=fields) @@ -199,7 +203,7 @@ def _dump_aliases(self): # pragma: no cover def _upgrade_db(self): """upgrade db using 
scripts for specified (current) schema version""" migration_path = "_data/migrations" - sqlite3.connect(self._db_path).close() # ensure that it exists + sqlite3.connect(self._db_path).close() # ensure that it exists db_url = "sqlite:///" + self._db_path backend = yoyo.get_backend(db_url) migration_dir = pkg_resources.resource_filename(__package__, migration_path) diff --git a/src/biocommons/seqrepo/seqrepo.py b/src/biocommons/seqrepo/seqrepo.py index d22ae80..fa353c1 100644 --- a/src/biocommons/seqrepo/seqrepo.py +++ b/src/biocommons/seqrepo/seqrepo.py @@ -1,15 +1,15 @@ -from collections.abc import Sequence -from functools import lru_cache import logging import os import re +from collections.abc import Sequence +from functools import lru_cache import bioutils.digests from bioutils.digests import seq_seqhash as sha512t24u from .config import SEQREPO_LRU_CACHE_MAXSIZE -from .seqaliasdb import SeqAliasDB from .fastadir import FastaDir +from .seqaliasdb import SeqAliasDB _logger = logging.getLogger(__name__) @@ -19,12 +19,11 @@ ct_n_residues = 1e9 # namespace-alias separator -nsa_sep = u":" +nsa_sep = ":" uri_re = re.compile(r"([^:]+):(.+)") - class SequenceProxy(Sequence): """Provides efficient and transparent string-like access, including random access slicing and reversing, to a biological sequence that @@ -49,7 +48,7 @@ def __eq__(self, s: str): def __getitem__(self, key): if isinstance(key, int): - key = slice(key, key+1) + key = slice(key, key + 1) if key.step is not None: raise ValueError("Only contiguous sequence slices are supported") return self._fetch(key.start, key.stop) @@ -93,7 +92,15 @@ class SeqRepo(object): """ - def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namespace=None, check_same_thread=False, use_sequenceproxy=True): + def __init__( + self, + root_dir, + writeable=False, + upcase=True, + translate_ncbi_namespace=None, + check_same_thread=False, + use_sequenceproxy=True, + ): self._root_dir = root_dir self._upcase = 
upcase self._db_path = os.path.join(self._root_dir, "aliases.sqlite3") @@ -112,12 +119,12 @@ def __init__(self, root_dir, writeable=False, upcase=True, translate_ncbi_namesp raise OSError("Unable to open SeqRepo directory {}".format(self._root_dir)) self.sequences = FastaDir(self._seq_path, writeable=self._writeable, check_same_thread=self._check_same_thread) - self.aliases = SeqAliasDB(self._db_path, - writeable=self._writeable, - check_same_thread=self._check_same_thread) + self.aliases = SeqAliasDB(self._db_path, writeable=self._writeable, check_same_thread=self._check_same_thread) if translate_ncbi_namespace is not None: - _logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") + _logger.warn( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) def __contains__(self, nsa): ns, a = nsa.split(nsa_sep) if nsa_sep in nsa else (None, nsa) @@ -153,18 +160,19 @@ def commit(self): self.sequences.commit() self.aliases.commit() if self._pending_sequences + self._pending_aliases > 0: - _logger.info("Committed {} sequences ({} residues) and {} aliases".format( - self._pending_sequences, self._pending_sequences_len, self._pending_aliases)) + _logger.info( + "Committed {} sequences ({} residues) and {} aliases".format( + self._pending_sequences, self._pending_sequences_len, self._pending_aliases + ) + ) self._pending_sequences = 0 self._pending_sequences_len = 0 self._pending_aliases = 0 - def fetch(self, alias, start=None, end=None, namespace=None): seq_id = self._get_unique_seqid(alias=alias, namespace=namespace) return self.sequences.fetch(seq_id, start, end) - def fetch_uri(self, uri, start=None, end=None): """fetch sequence for URI/CURIE of the form namespace:alias, such as NCBI:NM_000059.3. 
@@ -174,7 +182,6 @@ def fetch_uri(self, uri, start=None, end=None): namespace, alias = uri_re.match(uri).groups() return self.fetch(alias=alias, namespace=namespace, start=start, end=end) - def store(self, seq, nsaliases): """nsaliases is a list of dicts, like: @@ -191,8 +198,9 @@ def store(self, seq, nsaliases): try: seqhash = sha512t24u(seq) - except Exception as e: + except Exception: import pprint + _logger.critical("Exception raised for " + pprint.pformat(nsaliases)) raise seq_id = seqhash @@ -204,10 +212,11 @@ def store(self, seq, nsaliases): l=len(seq), na=len(nsaliases), nsa_sep=nsa_sep, - aliases=", ".join("{nsa[namespace]}:{nsa[alias]}".format(nsa=nsa) for nsa in nsaliases)) + aliases=", ".join("{nsa[namespace]}:{nsa[alias]}".format(nsa=nsa) for nsa in nsaliases), + ) if seq_id not in self.sequences: _logger.info("Storing " + msg) - if len(seq) > ct_n_residues: # pragma: no cover + if len(seq) > ct_n_residues: # pragma: no cover _logger.debug("Precommit for large sequence") self.commit() self.sequences.store(seq_id, seq) @@ -230,14 +239,18 @@ def store(self, seq, nsaliases): self.aliases.store_alias(seq_id=seq_id, namespace=namespace, alias=alias) self._pending_aliases += len(upd_tuples) n_aliases_added += len(upd_tuples) - if (self._pending_sequences > ct_n_seqs or self._pending_aliases > ct_n_aliases - or self._pending_sequences_len > ct_n_residues): # pragma: no cover - _logger.info("Hit commit thresholds ({self._pending_sequences} sequences, " - "{self._pending_aliases} aliases, {self._pending_sequences_len} residues)".format(self=self)) + if ( + self._pending_sequences > ct_n_seqs + or self._pending_aliases > ct_n_aliases + or self._pending_sequences_len > ct_n_residues + ): # pragma: no cover + _logger.info( + "Hit commit thresholds ({self._pending_sequences} sequences, " + "{self._pending_aliases} aliases, {self._pending_sequences_len} residues)".format(self=self) + ) self.commit() return n_seqs_added, n_aliases_added - def translate_alias(self, 
alias, namespace=None, target_namespaces=None, translate_ncbi_namespace=None): """given an alias and optional namespace, return a list of all other aliases for same sequence @@ -245,7 +258,9 @@ def translate_alias(self, alias, namespace=None, target_namespaces=None, transla """ if translate_ncbi_namespace is not None: - _logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") + _logger.warn( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) seq_id = self._get_unique_seqid(alias=alias, namespace=namespace) aliases = self.aliases.find_aliases(seq_id=seq_id) if target_namespaces: @@ -253,23 +268,21 @@ def translate_alias(self, alias, namespace=None, target_namespaces=None, transla aliases = [nsa_sep.join([a["namespace"], a["alias"]]) for a in aliases] return aliases - def translate_identifier(self, identifier, target_namespaces=None, translate_ncbi_namespace=None): """Given a string identifier, return a list of aliases (as identifiers) that refer to the same sequence. 
""" if translate_ncbi_namespace is not None: - _logger.warn("translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed") + _logger.warn( + "translate_ncbi_namespace is obsolete; translation is now automatic; this flag will be removed" + ) namespace, alias = identifier.split(nsa_sep) if nsa_sep in identifier else (None, identifier) - return self.translate_alias(alias=alias, - namespace=namespace, - target_namespaces=target_namespaces) - + return self.translate_alias(alias=alias, namespace=namespace, target_namespaces=target_namespaces) ############################################################################ - # Internal Methods + # Internal Methods @lru_cache(maxsize=SEQREPO_LRU_CACHE_MAXSIZE) def _get_unique_seqid(self, alias, namespace): @@ -288,7 +301,6 @@ def _get_unique_seqid(self, alias, namespace): raise KeyError("Alias {} (namespace: {}): not unique".format(alias, namespace)) return seq_ids.pop() - def _update_digest_aliases(self, seq_id, seq): """compute digest aliases for seq and update; returns number of digest @@ -301,22 +313,10 @@ def _update_digest_aliases(self, seq_id, seq): ir = bioutils.digests.seq_vmc_identifier(seq) seq_aliases = [ - { - "namespace": ir["namespace"], - "alias": ir["accession"], - }, - { - "namespace": "SHA1", - "alias": bioutils.digests.seq_sha1(seq) - }, - { - "namespace": "MD5", - "alias": bioutils.digests.seq_md5(seq) - }, - { - "namespace": "SEGUID", - "alias": bioutils.digests.seq_seguid(seq) - }, + {"namespace": ir["namespace"], "alias": ir["accession"]}, + {"namespace": "SHA1", "alias": bioutils.digests.seq_sha1(seq)}, + {"namespace": "MD5", "alias": bioutils.digests.seq_md5(seq)}, + {"namespace": "SEGUID", "alias": bioutils.digests.seq_seguid(seq)}, ] for sa in seq_aliases: self.aliases.store_alias(seq_id=seq_id, **sa) diff --git a/src/biocommons/seqrepo/utils.py b/src/biocommons/seqrepo/utils.py index e1d281a..418ec4b 100644 --- a/src/biocommons/seqrepo/utils.py +++ 
b/src/biocommons/seqrepo/utils.py @@ -1,6 +1,5 @@ import re - ncbi_defline_re = re.compile(r"(?Pref)\|(?P[^|]+)") invalid_alias_chars_re = re.compile(r"[^-+./_\w]") diff --git a/tests/conftest.py b/tests/conftest.py index a22f473..293f0bb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import os + import pytest from biocommons.seqrepo import SeqRepo @@ -18,23 +19,23 @@ def rest_dataproxy(): @pytest.fixture(scope="session") def seqrepo(tmpdir_factory): - dir = str(tmpdir_factory.mktemp('seqrepo')) + dir = str(tmpdir_factory.mktemp("seqrepo")) return SeqRepo(dir, writeable=True) @pytest.fixture(scope="session") def seqrepo_ro(tmpdir_factory): - dir = str(tmpdir_factory.mktemp('seqrepo')) + dir = str(tmpdir_factory.mktemp("seqrepo")) sr = SeqRepo(dir, writeable=True) - del sr # close it + del sr # close it return SeqRepo(dir) @pytest.fixture(scope="session") def seqrepo_keepcase(tmpdir_factory): - dir = str(tmpdir_factory.mktemp('seqrepo')) + dir = str(tmpdir_factory.mktemp("seqrepo")) return SeqRepo(dir, upcase=False, writeable=True) def test_create(seqrepo): - assert str(seqrepo).startswith('SeqRepo(root_dir=/') + assert str(seqrepo).startswith("SeqRepo(root_dir=/") diff --git a/tests/test_cli.py b/tests/test_cli.py index 70e7d4d..1700395 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,7 +7,7 @@ import pytest -from biocommons.seqrepo.cli import (init, load) +from biocommons.seqrepo.cli import init, load from biocommons.seqrepo.fastaiter import FastaIter from biocommons.seqrepo.utils import parse_defline @@ -18,7 +18,7 @@ class MockOpts(object): pass test_dir = os.path.dirname(__file__) - test_data_dir = os.path.join(test_dir, 'data') + test_data_dir = os.path.join(test_dir, "data") opts = MockOpts() opts.root_directory = os.path.join(tempfile.mkdtemp(prefix="seqrepo_pytest_"), "seqrepo") @@ -48,33 +48,36 @@ def test_20_load(opts): def test_refseq_fasta(opts): def _get_refseq_alias(aliases): for al in aliases: - if al['namespace'] 
== 'refseq': - return al['alias'] + if al["namespace"] == "refseq": + return al["alias"] return None - init(opts) - opts.namespace = 'refseq' - old_fasta = '>gi|295424141|ref|NM_000439.4| Homo sapiens proprotein convertase subtilisin/kexin type 1 ' + \ - '(PCSK1), transcript variant 1, mRNA\nTTT' - new_fasta = '>NM_000439.4 Homo sapiens proprotein convertase subtilisin/kexin type 1 (PCSK1), ' + \ - 'transcript variant 1, mRNA\nTTT' + opts.namespace = "refseq" + old_fasta = ( + ">gi|295424141|ref|NM_000439.4| Homo sapiens proprotein convertase subtilisin/kexin type 1 " + + "(PCSK1), transcript variant 1, mRNA\nTTT" + ) + new_fasta = ( + ">NM_000439.4 Homo sapiens proprotein convertase subtilisin/kexin type 1 (PCSK1), " + + "transcript variant 1, mRNA\nTTT" + ) aliases = parse_defline(old_fasta, opts.namespace) nm = _get_refseq_alias(aliases) - assert nm == 'NM_000439.4' + assert nm == "NM_000439.4" aliases2 = parse_defline(new_fasta, opts.namespace) nm2 = _get_refseq_alias(aliases2) - assert nm2 == 'NM_000439.4' + assert nm2 == "NM_000439.4" data = io.StringIO(new_fasta) iterator = FastaIter(data) header, seq = next(iterator) - assert header.startswith('NM_000439.4 Homo sapiens proprotein convertase subtilisin/kexin type 1 (PCSK1)') + assert header.startswith("NM_000439.4 Homo sapiens proprotein convertase subtilisin/kexin type 1 (PCSK1)") assert seq == "TTT" aliases3 = parse_defline(header, opts.namespace) nm3 = _get_refseq_alias(aliases3) - assert nm3 == 'NM_000439.4' + assert nm3 == "NM_000439.4" diff --git a/tests/test_fastadir.py b/tests/test_fastadir.py index 535101c..dd868ea 100644 --- a/tests/test_fastadir.py +++ b/tests/test_fastadir.py @@ -35,6 +35,7 @@ def test_write_reread(): if __name__ == "__main__": import logging + logging.basicConfig(level=logging.DEBUG) test_write_reread() @@ -58,4 +59,4 @@ def test_writeability(): fd._writeable = False fd.store("NC_000001.11", "TGGTGGCACGCGCTTGTAGT") - fd._writeable = True \ No newline at end of file + 
fd._writeable = True diff --git a/tests/test_fastaiter.py b/tests/test_fastaiter.py index 1fa52e2..0d31310 100644 --- a/tests/test_fastaiter.py +++ b/tests/test_fastaiter.py @@ -1,8 +1,7 @@ +import pytest import six from six.moves import StringIO -import pytest - from biocommons.seqrepo.fastaiter import FastaIter @@ -74,4 +73,3 @@ def test_multiline(): # should be empty now with pytest.raises(StopIteration): six.next(iterator) - diff --git a/tests/test_seqaliasdb.py b/tests/test_seqaliasdb.py index b301c48..d9fc9af 100644 --- a/tests/test_seqaliasdb.py +++ b/tests/test_seqaliasdb.py @@ -39,31 +39,12 @@ def test_seqinfo(): aliases = [{k: r[k] for k in alias_keys} for r in db.find_aliases(current_only=False)] aliases.sort(key=lambda r: (r["seqalias_id"], r["seq_id"], r["namespace"], r["alias"], r["is_current"])) - assert aliases == [{ - 'seqalias_id': 1, - 'seq_id': 'q1', - 'namespace': 'A', - 'alias': '1', - 'is_current': 0 - }, { - 'seqalias_id': 2, - 'seq_id': 'q1', - 'namespace': 'A', - 'alias': '2', - 'is_current': 1 - }, { - 'seqalias_id': 3, - 'seq_id': 'q1', - 'namespace': 'B', - 'alias': '1', - 'is_current': 1 - }, { - 'seqalias_id': 4, - 'seq_id': 'q2', - 'namespace': 'A', - 'alias': '1', - 'is_current': 1 - }] + assert aliases == [ + {"seqalias_id": 1, "seq_id": "q1", "namespace": "A", "alias": "1", "is_current": 0}, + {"seqalias_id": 2, "seq_id": "q1", "namespace": "A", "alias": "2", "is_current": 1}, + {"seqalias_id": 3, "seq_id": "q1", "namespace": "B", "alias": "1", "is_current": 1}, + {"seqalias_id": 4, "seq_id": "q2", "namespace": "A", "alias": "1", "is_current": 1}, + ] # __contains__ assert "q1" in db @@ -71,7 +52,7 @@ def test_seqinfo(): assert db.stats()["n_sequences"] == 2 - del db # close + del db # close db = SeqAliasDB(db_path) with pytest.raises(RuntimeError): diff --git a/tests/test_seqrepo.py b/tests/test_seqrepo.py index c97aeeb..939bf87 100644 --- a/tests/test_seqrepo.py +++ b/tests/test_seqrepo.py @@ -5,12 +5,12 @@ def 
test_create(seqrepo): - assert str(seqrepo).startswith('SeqRepo(root_dir=/') + assert str(seqrepo).startswith("SeqRepo(root_dir=/") def test_seqrepo_dir_not_exist(tmpdir_factory): """Ensure that exception is raised for non-existent seqrepo directory""" - dir = str(tmpdir_factory.mktemp('seqrepo')) + "-IDONTEXIST" + dir = str(tmpdir_factory.mktemp("seqrepo")) + "-IDONTEXIST" with pytest.raises(OSError) as ex: SeqRepo(dir, writeable=False) @@ -19,9 +19,9 @@ def test_seqrepo_dir_not_exist(tmpdir_factory): def test_store(seqrepo): seqrepo.store("SMELLASSWEET", [{"namespace": "en", "alias": "rose"}, {"namespace": "fr", "alias": "rose"}]) - seqrepo.store("smellassweet", [{"namespace": "es", "alias": "rosa"}]) # same sequence, new alias + seqrepo.store("smellassweet", [{"namespace": "es", "alias": "rosa"}]) # same sequence, new alias - seqrepo.store("ASINCHANGE", [{"namespace": "en", "alias": "coin"}]) # same alias, diff seqs in diff namespaces + seqrepo.store("ASINCHANGE", [{"namespace": "en", "alias": "coin"}]) # same alias, diff seqs in diff namespaces seqrepo.store("ASINACORNER", [{"namespace": "fr", "alias": "coin"}]) seqrepo.commit() @@ -37,10 +37,10 @@ def test_fetch(seqrepo): assert seqrepo.fetch("rosa", start=5, end=7) == "AS" with pytest.raises(KeyError): - assert seqrepo.fetch("bogus") # non-existent alias + assert seqrepo.fetch("bogus") # non-existent alias with pytest.raises(KeyError): - assert seqrepo.fetch("coin") # ambiguous alias + assert seqrepo.fetch("coin") # ambiguous alias assert seqrepo.fetch("coin", namespace="en") == "ASINCHANGE" assert seqrepo.fetch("coin", namespace="fr") == "ASINACORNER" @@ -59,7 +59,7 @@ def test_digests(seqrepo): assert seqrepo.fetch_uri("SEGUID:aMQF/cdHkAayLkVYs8XV2u+Hy34") == "ASINACORNER" assert seqrepo.fetch_uri("SHA1:68c405fdc7479006b22e4558b3c5d5daef87cb7e") == "ASINACORNER" assert seqrepo.fetch_uri("VMC:GS_LDz34B6fA_fLxFoc2agLrXQRYuupOGGM") == "ASINACORNER" - + def test_errors(seqrepo_ro): with 
pytest.raises(RuntimeError): @@ -77,46 +77,53 @@ def test_refseq_lookup(seqrepo): assert seqrepo["ncbiac"] == "NCBISEQUENCE" assert seqrepo["NCBI:ncbiac"] == "NCBISEQUENCE" assert seqrepo["refseq:ncbiac"] == "NCBISEQUENCE" - + def test_namespace_translation(tmpdir_factory): - dir = str(tmpdir_factory.mktemp('seqrepo')) + dir = str(tmpdir_factory.mktemp("seqrepo")) seqrepo = SeqRepo(dir, writeable=True) # store sequences - seqrepo.store("NCBISEQUENCE", [{"namespace": "NCBI", "alias": "ncbiac" }]) + seqrepo.store("NCBISEQUENCE", [{"namespace": "NCBI", "alias": "ncbiac"}]) seqrepo.store("ENSEMBLSEQUENCE", [{"namespace": "Ensembl", "alias": "ensemblac"}]) - seqrepo.store("LRGSEQUENCE", [{"namespace": "LRG", "alias": "lrgac" }]) - seqrepo.store("REFSEQSEQUENCE", [{"namespace": "refseq", "alias": "refseqac" }]) # should be stored as NCBI:refseqac + seqrepo.store("LRGSEQUENCE", [{"namespace": "LRG", "alias": "lrgac"}]) + seqrepo.store("REFSEQSEQUENCE", [{"namespace": "refseq", "alias": "refseqac"}]) # should be stored as NCBI:refseqac seqrepo.commit() # lookups, no query translation - assert seqrepo["NCBI:ncbiac"] == "NCBISEQUENCE" + assert seqrepo["NCBI:ncbiac"] == "NCBISEQUENCE" assert seqrepo["Ensembl:ensemblac"] == "ENSEMBLSEQUENCE" - assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE" - assert seqrepo["NCBI:refseqac"] == "REFSEQSEQUENCE" # tests ns translation on store + assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE" + assert seqrepo["NCBI:refseqac"] == "REFSEQSEQUENCE" # tests ns translation on store # lookups, w/ query translation - assert seqrepo["refseq:ncbiac"] == "NCBISEQUENCE" - assert seqrepo["RefSeq:ncbiac"] == "NCBISEQUENCE" # case-squashed + assert seqrepo["refseq:ncbiac"] == "NCBISEQUENCE" + assert seqrepo["RefSeq:ncbiac"] == "NCBISEQUENCE" # case-squashed assert seqrepo["Ensembl:ensemblac"] == "ENSEMBLSEQUENCE" - assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE" + assert seqrepo["LRG:lrgac"] == "LRGSEQUENCE" seq_id = seqrepo._get_unique_seqid(alias="ncbiac", 
namespace="NCBI") aliases = list(seqrepo.aliases.find_aliases(seq_id=seq_id)) assert any(a for a in aliases if a["namespace"] == "refseq") assert any(a for a in aliases if a["namespace"] == "ga4gh") - assert seqrepo["ga4gh:SQ."+seq_id] == "NCBISEQUENCE" - assert seqrepo["sha512t24u:"+seq_id] == "NCBISEQUENCE" - + assert seqrepo["ga4gh:SQ." + seq_id] == "NCBISEQUENCE" + assert seqrepo["sha512t24u:" + seq_id] == "NCBISEQUENCE" def test_translation(seqrepo): - assert "MD5:8b2698fb0b0c93558a6adbb11edb1e4b" in seqrepo.translate_identifier("en:rose"), "failed fully-qualified identifier lookup" - assert "MD5:8b2698fb0b0c93558a6adbb11edb1e4b" in seqrepo.translate_identifier("rose"), "failed unqualified identifier lookup" - assert "VMC:GS_bsoUMlD3TrEtlh9Dt1iT29mzfkwwFUDr" in seqrepo.translate_identifier("en:rose"), "failed to find expected identifier in returned identifiers" - assert ["VMC:GS_bsoUMlD3TrEtlh9Dt1iT29mzfkwwFUDr"] == seqrepo.translate_identifier("en:rose", target_namespaces=["VMC"]), "failed to rerieve exactly the expected identifier" + assert "MD5:8b2698fb0b0c93558a6adbb11edb1e4b" in seqrepo.translate_identifier( + "en:rose" + ), "failed fully-qualified identifier lookup" + assert "MD5:8b2698fb0b0c93558a6adbb11edb1e4b" in seqrepo.translate_identifier( + "rose" + ), "failed unqualified identifier lookup" + assert "VMC:GS_bsoUMlD3TrEtlh9Dt1iT29mzfkwwFUDr" in seqrepo.translate_identifier( + "en:rose" + ), "failed to find expected identifier in returned identifiers" + assert ["VMC:GS_bsoUMlD3TrEtlh9Dt1iT29mzfkwwFUDr"] == seqrepo.translate_identifier( + "en:rose", target_namespaces=["VMC"] + ), "failed to rerieve exactly the expected identifier" def test_sequenceproxy(seqrepo): @@ -124,6 +131,5 @@ def test_sequenceproxy(seqrepo): # instantiated with use_sequenceproxy=True sp = SequenceProxy(seqrepo, namespace=None, alias="rosa") - assert sp # __bool__ dunder method - assert sp[5:7] == "AS" # __eq__ and __getitem__ - + assert sp # __bool__ dunder method + assert 
sp[5:7] == "AS" # __eq__ and __getitem__ diff --git a/tests/test_utils.py b/tests/test_utils.py index 618ff56..f9ec473 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,17 +23,11 @@ def test_parse_defline(): assert parse_defline(defline, "refseq") == [{"namespace": "refseq", "alias": "NG_007107.2"}] defline = ">gi|568815364|ref|NT_077402.3| Homo sapiens chromosome 1 genomic scaffold, GRCh38.p7 Primary Assembly HSCHR1_CTG1" - assert parse_defline(defline, "refseq") == [ - {"namespace": "refseq", "alias": "NT_077402.3"} - ] - + assert parse_defline(defline, "refseq") == [{"namespace": "refseq", "alias": "NT_077402.3"}] def test_validate_aliases(): - aliases = [ - {"namespace": "refseq", "alias": "NM_012345.6"}, - {"namespace": "Ensembl", "alias": "ENST012345.6"} - ] + aliases = [{"namespace": "refseq", "alias": "NM_012345.6"}, {"namespace": "Ensembl", "alias": "ENST012345.6"}] assert validate_aliases(aliases) # okay @@ -42,7 +36,6 @@ def test_validate_aliases(): with pytest.raises(RuntimeError): validate_aliases([{"namespace": "refseq", "alias": "NM_012345"}]) - + with pytest.raises(RuntimeError): validate_aliases([{"namespace": "Ensembl", "alias": "ENST012345"}]) -