Skip to content

Commit

Permalink
fqstat: fixes + better CLI help
Browse files (browse the repository at this point in the history)
  • Loading branch information
kdm9 committed Oct 10, 2024
1 parent f6f2d38 commit 24385d0
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions blsl/fqstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from dataclasses import dataclass
from pathlib import Path
from concurrent.futures import as_completed, ProcessPoolExecutor
from sys import stdout
import argparse
import multiprocessing

from tqdm import tqdm

def head(raw, size=1_000_000):
    """Return the first *size* bytes of *raw* wrapped in an in-memory stream.

    Args:
        raw: A binary file-like object exposing ``read(n)``.
        size: Maximum number of bytes to read from *raw*.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds whatever
        ``raw.read(size)`` returned (fewer than *size* bytes if *raw* is
        shorter).
    """
    # A single bounded read: only the leading chunk is ever held in memory.
    return io.BytesIO(raw.read(size))
Expand All @@ -27,7 +29,7 @@ def estimate_fq_stats(fq, head_bytes=1_000_000):
fq = Path(fq)
with open(fq, "rb") as fh:
buf = head(fh, size=head_bytes)
bytes_read = len(buf.getbuffer())
bytes_read = len(buf.getbuffer())
zfh = gzip.open(buf)
n = 0
recsizes = 0
Expand All @@ -37,8 +39,9 @@ def estimate_fq_stats(fq, head_bytes=1_000_000):
n += 1
recsizes += len(hdr) + len(seq) + len(qhdr) + len(qual)
readlens += len(seq) -1
except EOFError:
except (EOFError, gzip.BadGzipFile):
pass
#raise RuntimeError(f"BadGzipFile: {fq} after {n}")
fsize = fq.stat().st_size
estim_reads = fsize / bytes_read * n
return FQStat(path=fq, file_size=fsize, estimated_nreads=round(estim_reads),
Expand All @@ -48,13 +51,14 @@ def estimate_fq_stats(fq, head_bytes=1_000_000):


def main(argv=None):
"""Estimate stats from a fastq file based on the first kilobytes of the file, keeping high accuracy"""
ap = argparse.ArgumentParser()
ap.add_argument("--out", "-o", type=argparse.FileType("w"), default=stdout,
help="Output table")
ap.add_argument("--threads", "-j", type=int, default=multiprocessing.cpu_count(),
help="Parallel CPUs")
ap.add_argument("--head", "-b", type=int, default=1_000_000,
help="Inspect the first N bytes")
ap.add_argument("--head", "-b", type=int, default=20_000,
help="Inspect the first N bytes (default: 20kb, R^2 is still ~1.0!. Increase to improve accuracy slightly, above 1e6 is pointless.)")
ap.add_argument("fastqs", nargs="+")

args = ap.parse_args(argv)
Expand All @@ -70,4 +74,3 @@ def main(argv=None):
print("path", "file_size", "estimated_n_reads", "read_length", "record_size", "bytes_per_record", sep="\t", file=args.out)
for res in results:
print(res.path, res.file_size, res.estimated_nreads, res.mean_read_len, res.mean_record_size, res.bytes_per_record, sep="\t", file=args.out)

0 comments on commit 24385d0

Please sign in to comment.