From 645510ea350ede6e0eb6ae0e64ec883b5d0104b2 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 11 Jul 2024 07:38:43 +0200 Subject: [PATCH] fixes for pairs --- blsl/_utils.py | 15 ++++++++------- blsl/pairs.py | 42 ++++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/blsl/_utils.py b/blsl/_utils.py index 3bc6001..c17df70 100644 --- a/blsl/_utils.py +++ b/blsl/_utils.py @@ -13,13 +13,14 @@ def fqparse(stream, n=4): else: with open(stream, "rt") as fh: yield from fqparse(fh, n) - fqp = list() - for line in stream: - fqp.append(line.rstrip("\n")) + else: + fqp = list() + for line in stream: + fqp.append(line.rstrip("\n")) + if len(fqp) == n: + yield fqp + fqp = list() if len(fqp) == n: yield fqp - fqp = list() - if len(fqp) == n: - yield fqp - assert len(fqp) == 0 + assert len(fqp) == 0 diff --git a/blsl/pairs.py b/blsl/pairs.py index b86b889..b15d6c7 100644 --- a/blsl/pairs.py +++ b/blsl/pairs.py @@ -1,34 +1,36 @@ from tqdm.auto import tqdm from collections import defaultdict import re +from sys import stdout, stderr, stdin from ._utils import fqparse -def seqnum(n, c): - if n.endswith("/1"): - return 1 - if n.endswith("/2"): - return 2 - if c: - if c[0].startswith("1:"): - return 1 - if c[0].startswith("2:"): - return 2 - raise ValueError(f"Unknown pair from fastq header: '{n} {''.join(c)}'") - def pairslash(header): name, *comment = header.split(" ", 1) if name.endswith("/1") or name.endswith("/2"): return header - i = seqnum(name, comment) + + i = None + if name.endswith("/1"): + i = 1 + if name.endswith("/2"): + i = 2 + if comment: + if comment[0].startswith("1:"): + i = 1 + if comment[0].startswith("2:"): + i = 2 + if i is None: + raise ValueError(f"Unable to guess pair from header '{header}'") return f"{name}/{i} {comment[0]}" def read_fileset(readfiles): res = defaultdict(dict) for readfile in readfiles: - stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile.upper()).groups()[:2] + stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile).groups()[:2] + read = read.upper() if read == "INTERLEAVED": read = "IL" if read in res[stem]: @@ -43,16 +45,16 @@ def output_read(args, read): print("\n".join(read), file=args.output) -def handle_fileset(args, fileset): +def handle_fileset(args, stem, fileset): if "R1" in fileset and "R2" in fileset: - for r1, r2 in tqdm(zip(fqparse(fileset["R1"]), fqparse(fileset["R2"]))): + for r1, r2 in tqdm(zip(fqparse(fileset["R1"]), fqparse(fileset["R2"])), desc=stem): output_read(args, r1) output_read(args, r2) if "IL" in fileset: - for r in tqdm(fqparse(fileset("IL"))): + for r in tqdm(fqparse(fileset("IL")), desc=stem): output_read(args, r) if "SE" in fileset: - for r in tqdm(fqparse(fileset("SE"))): + for r in tqdm(fqparse(fileset("SE")), desc=stem): output_read(args, r) @@ -60,7 +62,7 @@ def main(argv=None): """Handle paired-end reads, with various transformations.""" import argparse ap = argparse.ArgumentParser("blsl pairs") - ap.add_argument("--output", "-o", default="/dev/stdout", + ap.add_argument("--output", "-o", default=stdout, type=argparse.FileType("w"), help="Output file (default: stdout)") ap.add_argument("--pairslash", "-s", action="store_true", help="Add /1 /2 to paired reads") @@ -68,7 +70,7 @@ def main(argv=None): args=ap.parse_args(argv) for stem, fileset in read_fileset(args.readfile).items(): - handle_fileset(args, fileset) + handle_fileset(args, stem, fileset) if __name__ == "__main__": main()