Skip to content

Commit

Permalink
fixes for pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Jul 11, 2024
1 parent e9623be commit 645510e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 27 deletions.
15 changes: 8 additions & 7 deletions blsl/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ def fqparse(stream, n=4):
else:
with open(stream, "rt") as fh:
yield from fqparse(fh, n)
fqp = list()
for line in stream:
fqp.append(line.rstrip("\n"))
else:
fqp = list()
for line in stream:
fqp.append(line.rstrip("\n"))
if len(fqp) == n:
yield fqp
fqp = list()
if len(fqp) == n:
yield fqp
fqp = list()
if len(fqp) == n:
yield fqp
assert len(fqp) == 0
assert len(fqp) == 0

42 changes: 22 additions & 20 deletions blsl/pairs.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,36 @@
from tqdm.auto import tqdm
from collections import defaultdict
import re
from sys import stdout, stderr, stdin
from ._utils import fqparse


def seqnum(n, c):
if n.endswith("/1"):
return 1
if n.endswith("/2"):
return 2
if c:
if c[0].startswith("1:"):
return 1
if c[0].startswith("2:"):
return 2
raise ValueError(f"Unknown pair from fastq header: '{n} {''.join(c)}'")


def pairslash(header):
name, *comment = header.split(" ", 1)
if name.endswith("/1") or name.endswith("/2"):
return header
i = seqnum(name, comment)

i = None
if name.endswith("/1"):
i = 1
if name.endswith("/2"):
i = 2
if comment:
if comment[0].startswith("1:"):
i = 1
if comment[0].startswith("2:"):
i = 2
if i is None:
raise ValueError(f"Unable to guess pair from header '{header}'")
return f"{name}/{i} {comment[0]}"


def read_fileset(readfiles):
res = defaultdict(dict)
for readfile in readfiles:
stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile.upper()).groups()[:2]
stem, read = re.match(r"(.+?)[_.]?(R[12]|SE|IL|INTERLEAVED)?(_001)?\.(FQ|FASTQ)(\.GZ)?", readfile).groups()[:2]
read = read.upper()
if read == "INTERLEAVED":
read = "IL"
if read in res[stem]:
Expand All @@ -43,32 +45,32 @@ def output_read(args, read):
print("\n".join(read), file=args.output)


def handle_fileset(args, fileset):
def handle_fileset(args, stem, fileset):
if "R1" in fileset and "R2" in fileset:
for r1, r2 in tqdm(zip(fqparse(fileset["R1"]), fqparse(fileset["R2"]))):
for r1, r2 in tqdm(zip(fqparse(fileset["R1"]), fqparse(fileset["R2"])), desc=stem):
output_read(args, r1)
output_read(args, r2)
if "IL" in fileset:
for r in tqdm(fqparse(fileset("IL"))):
for r in tqdm(fqparse(fileset("IL")), desc=stem):
output_read(args, r)
if "SE" in fileset:
for r in tqdm(fqparse(fileset("SE"))):
for r in tqdm(fqparse(fileset("SE")), desc=stem):
output_read(args, r)


def main(argv=None):
"""Handle paired-end reads, with various transformations."""
import argparse
ap = argparse.ArgumentParser("blsl pairs")
ap.add_argument("--output", "-o", default="/dev/stdout",
ap.add_argument("--output", "-o", default=stdout, type=argparse.FileType("w"),
help="Output file (default: stdout)")
ap.add_argument("--pairslash", "-s", action="store_true",
help="Add /1 /2 to paired reads")
ap.add_argument("readfile", nargs='+', default="/dev/stdin")
args=ap.parse_args(argv)

for stem, fileset in read_fileset(args.readfile).items():
handle_fileset(args, fileset)
handle_fileset(args, stem, fileset)

if __name__ == "__main__":
main()
Expand Down

0 comments on commit 645510e

Please sign in to comment.