Skip to content

Commit

Permalink
fqstat: bugfixes
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Oct 15, 2024
1 parent 24385d0 commit b470326
Showing 1 changed file with 19 additions and 3 deletions.
22 changes: 19 additions & 3 deletions blsl/fqstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
def head(raw, size=1_000_000):
    """Wrap the leading bytes of ``raw`` in a fresh in-memory binary stream.

    Args:
        raw: Object exposing ``read(size)``; it is read exactly once.
        size: Byte count passed to ``raw.read``.

    Returns:
        An ``io.BytesIO`` holding whatever ``raw.read(size)`` returned.
    """
    leading_bytes = raw.read(size)
    return io.BytesIO(leading_bytes)

@dataclass
@dataclass(order=True)
class FQStat:
path: Path
file_size: int
Expand All @@ -41,14 +41,22 @@ def estimate_fq_stats(fq, head_bytes=1_000_000):
readlens += len(seq) -1
except (EOFError, gzip.BadGzipFile):
pass
#raise RuntimeError(f"BadGzipFile: {fq} after {n}")
fsize = fq.stat().st_size
if n < 1:
return FQStat(path=fq, file_size=fsize, estimated_nreads=0, mean_read_len=0, mean_record_size=0, n_reads_sampled=n, bytes_per_record=0)
estim_reads = fsize / bytes_read * n
return FQStat(path=fq, file_size=fsize, estimated_nreads=round(estim_reads),
mean_read_len=readlens/n, mean_record_size=recsizes/n,
n_reads_sampled=n, bytes_per_record=bytes_read/n)


def parse_fofn(file):
    """Read a file of file names (one per line) and return the unique names.

    Trailing whitespace, including the newline, is removed from each line.
    Blank and whitespace-only lines are skipped, so a stray empty line in
    the list does not produce an empty file name in the result.

    Args:
        file: Path of the text file listing one file name per line.

    Returns:
        A set of the non-empty file names found in ``file``.
    """
    with open(file) as fh:
        names = {line.rstrip() for line in fh}
    # A blank line strips down to "", which is not a usable path.
    names.discard("")
    return names


def main(argv=None):
"""Estimate stats from a fastq file based on the first kilobytes of the file, keeping high accuracy"""
Expand All @@ -59,10 +67,18 @@ def main(argv=None):
help="Parallel CPUs")
ap.add_argument("--head", "-b", type=int, default=20_000,
help="Inspect the first N bytes (default: 20kb, R^2 is still ~1.0!. Increase to improve accuracy slightly, above 1e6 is pointless.)")
ap.add_argument("--fofn", "-f", action="store_true",
help="Treat args as files of file names (one per line")
ap.add_argument("fastqs", nargs="+")

args = ap.parse_args(argv)

if args.fofn:
res = set()
for fofn in args.fastqs:
res.update(parse_fofn(fofn))
args.fastqs = list(sorted(res))

results = []
with ProcessPoolExecutor(args.threads) as exc:
jobs = set()
Expand All @@ -72,5 +88,5 @@ def main(argv=None):
results.append(res.result())

print("path", "file_size", "estimated_n_reads", "read_length", "record_size", "bytes_per_record", sep="\t", file=args.out)
for res in results:
for res in sorted(results):
print(res.path, res.file_size, res.estimated_nreads, res.mean_read_len, res.mean_record_size, res.bytes_per_record, sep="\t", file=args.out)

0 comments on commit b470326

Please sign in to comment.