From f6f2d3848d16bc73352619730af00bfccf330ecc Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 10 Oct 2024 16:47:44 +0200 Subject: [PATCH] add fqstat genebed --- blsl/__init__.py | 6 ++++ blsl/fqstat.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ blsl/genebed.py | 30 ++++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 blsl/fqstat.py create mode 100644 blsl/genebed.py diff --git a/blsl/__init__.py b/blsl/__init__.py index 063adce..cfbb9c9 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -74,6 +74,10 @@ from .gfftagsane import main as gfftagsane_main cmds["gfftagsane"] = gfftagsane_main +from .genebed import main as genebed_main +cmds["genebed"] = genebed_main + + from .liftoff_gff3 import liftoff_gff3_main cmds["liftoff-gff3"] = liftoff_gff3_main @@ -89,6 +93,8 @@ from .pairs import main as pairs_main cmds["pairs"] = pairs_main +from .fqstat import main as fqstat_main +cmds["fqstat"] = fqstat_main try: diff --git a/blsl/fqstat.py b/blsl/fqstat.py new file mode 100644 index 0000000..ff67bed --- /dev/null +++ b/blsl/fqstat.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import gzip +import io +from collections import Counter +from dataclasses import dataclass +from pathlib import Path +from concurrent.futures import as_completed, ProcessPoolExecutor +import argparse +import multiprocessing + + +def head(raw, size=1_000_000): + return io.BytesIO(raw.read(size)) + +@dataclass +class FQStat: + path: Path + file_size: int + estimated_nreads: float + mean_read_len: float + mean_record_size: float + n_reads_sampled: int + bytes_per_record: float + + +def estimate_fq_stats(fq, head_bytes=1_000_000): + fq = Path(fq) + with open(fq, "rb") as fh: + buf = head(fh, size=head_bytes) + bytes_read = len(buf.getbuffer()) + zfh = gzip.open(buf) + n = 0 + recsizes = 0 + readlens = 0 + try: + for hdr, seq, qhdr, qual in zip(zfh, zfh, zfh, zfh): + n += 1 + recsizes += len(hdr) + len(seq) + len(qhdr) + len(qual) + readlens += len(seq) -1 + except EOFError: + pass + fsize = fq.stat().st_size + estim_reads = fsize / bytes_read * n + return FQStat(path=fq, file_size=fsize, estimated_nreads=round(estim_reads), + mean_read_len=readlens/n, mean_record_size=recsizes/n, + n_reads_sampled=n, bytes_per_record=bytes_read/n) + + + +def main(argv=None): + ap = argparse.ArgumentParser() + ap.add_argument("--out", "-o", type=argparse.FileType("w"), default=stdout, + help="Output table") + ap.add_argument("--threads", "-j", type=int, default=multiprocessing.cpu_count(), + help="Parallel CPUs") + ap.add_argument("--head", "-b", type=int, default=1_000_000, + help="Inspect the first N bytes") + ap.add_argument("fastqs", nargs="+") + + args = ap.parse_args(argv) + + results = [] + with ProcessPoolExecutor(args.threads) as exc: + jobs = set() + for fq in args.fastqs: + jobs.add(exc.submit(estimate_fq_stats, fq, head_bytes=args.head)) + for res in tqdm(as_completed(jobs)): + results.append(res.result()) + + print("path", "file_size", "estimated_n_reads", "read_length", "record_size", "bytes_per_record", sep="\t", file=args.out) + for res in results: + print(res.path, res.file_size, res.estimated_nreads, res.mean_read_len, res.mean_record_size, res.bytes_per_record, sep="\t", file=args.out) + diff --git a/blsl/genebed.py b/blsl/genebed.py new file mode 100644 index 0000000..996a4b1 --- /dev/null +++ b/blsl/genebed.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +import argparse +from sys import stdout +from .gffparse import parseGFF3, write_line +from tqdm import tqdm + +def main(argv=None): + """Extract a BED file of genes from a GFF""" + ap = argparse.ArgumentParser("blsl genebed") + ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout, + help="Output gff file") + #ap.add_argument("-f", "--fields", default="ID,Name,Parent,locus_tag", + # help="Attribute tags to keep (case sensitive, give multiple times like -f ID -f tag2 -f tag3).") + ap.add_argument("input") + + args = ap.parse_args(argv) + #args.fields = args.fields.split(",") + + def N(x): + if x is None: + return "." + return str(x) + + for record in tqdm(parseGFF3(args.input, return_as=dict)): + if record["type"] != "gene": + continue + print(record["seqid"], record["start"], record["end"], record["attributes"]["ID"], N(record["score"]), N(record["strand"]), sep="\t", file=args.output) + +if __name__ == "__main__": + main()