Skip to content

Commit

Permalink
version 0.2.2: lots of new stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Nov 2, 2023
1 parent 4d47e31 commit e9a6234
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 22 deletions.
8 changes: 7 additions & 1 deletion blsl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from sys import argv, exit, stderr

__version__ = "0.2.1"
__version__ = "0.2.2"

cmds = {}

Expand All @@ -28,6 +28,9 @@
from .liftoff_gff3 import liftoff_gff3_main
cmds["liftoff-gff3"] = liftoff_gff3_main

from .gfftagsane import main as gfftagsane_main
cmds["gfftagsane"] = gfftagsane_main

from .pansn_rename import main as pansn_rename_main
cmds["pansn-rename"] = pansn_rename_main

Expand Down Expand Up @@ -80,6 +83,9 @@
try:
from .vcfstats import main as vcfstats_main
cmds["vcfstats"] = vcfstats_main

from .vcfparallel import main as vcfparallel_main
cmds["vcfparallel"] = vcfparallel_main
except ImportError as exc:
if len(argv) < 2:
print(str(exc), "-- disabling vcfstats command", file=stderr)
Expand Down
24 changes: 18 additions & 6 deletions blsl/deepclust2fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def deepclust2fa_main(argv=None):
help="Buffer sequences rather than writing (more ram, more speed)")
ap.add_argument("-c", "--clusters", required=True,
help="Diamond deepclust result")
ap.add_argument("--orthofinder", "--of", action="store_true",
help="--clusters is actually an orthofinder result, like OG: seq1 seq2 seq3")
ap.add_argument("-f", "--faa", required=True,
help="Fasta sequences (that you gave to diamond deepclust --db)")
ap.add_argument("-o", "--outdir", required=True, type=Path,
Expand All @@ -41,12 +43,22 @@ def deepclust2fa_main(argv=None):
seq2cent = {}
cent2seq = defaultdict(list)
with open(args.clusters) as fh:
for rec in csv.reader(fh, dialect=tsv):
if rec[0] == "centroid" and rec[1] == "member":
continue
c, s = rec[0:2]
seq2cent[s] = c
cent2seq[c].append(s)
if args.orthofinder:
for line in fh:
og, seqs = line.split(":")
og = og.strip()
seqs = seqs.split(", ")
for seq in seqs:
seq = seq.strip()
seq2cent[seq] = og
cent2seq[og].append(seq)
else:
for rec in csv.reader(fh, dialect=tsv):
if rec[0] == "centroid" and rec[1] == "member":
continue
c, s = rec[0:2]
seq2cent[s] = c
cent2seq[c].append(s)
cent2seq_new = {}
seq2cent_new = {}
for cent, seqs in cent2seq.items():
Expand Down
21 changes: 11 additions & 10 deletions blsl/gffparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def parseGFF3(filename, return_as=dict):
"""
#Parse with transparent decompression
openFunc = gzip.open if str(filename).endswith(".gz") else open
with openFunc(filename) as infile:
with openFunc(filename, "rt") as infile:
for line in infile:
#if line.startswith("###"):
# ### Yield last gene if we move that here
Expand Down Expand Up @@ -103,6 +103,7 @@ def gff_heirarchy(filename, progress=None):
}
ignore = {
"source",
"stop_codon",
}
records = {}
l2l1 = {}
Expand Down Expand Up @@ -194,20 +195,20 @@ def prefix_name(entry, sid):
prefix_name(gchild, gcid)
#print("H", gcid, gchild["attributes"]["Name"])


def write_line(entry, file):
x = [entry[field] if entry[field] is not None else "." for i, field in enumerate(gffInfoFields)]
x[-1] = attr2line(x[-1])
print(*x, sep="\t", file=file)

def write_gene(gene, geneid=None, file=None, changenames=False):
def write_line(entry):
x = [entry[field] if entry[field] is not None else "." for i, field in enumerate(gffInfoFields)]
x[-1] = attr2line(x[-1])
print(*x, sep="\t", file=file)
#print(gene, geneid)
#print("## gff-version 3", file=file)
if geneid or changenames:
reformat_names(gene, geneid, changenames)
write_line(gene)
write_line(gene, file)
for child in gene.get("children", {}).values():
write_line(child)
write_line(child, file)
for gchild in child.get("children", {}).values():
write_line(gchild)
write_line(gchild, file)
print("###", file=file)


Expand Down
25 changes: 25 additions & 0 deletions blsl/gfftagsane.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python3
import argparse
from sys import stdout
from .gffparse import parseGFF3, write_line
from tqdm import tqdm

def main(argv=None):
"""Sanitise a messy gff attribute column to just simple tags """
ap = argparse.ArgumentParser("gfftagsane")
ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout,
help="Output gff file")
ap.add_argument("-f", "--fields", default="ID,Name,Parent,locus_tag",
help="Attribute tags to keep (case sensitive, give multiple times like -f ID -f tag2 -f tag3).")
ap.add_argument("input")

args = ap.parse_args(argv)
args.fields = args.fields.split(",")

for record in tqdm(parseGFF3(args.input, return_as=dict)):
attrs = {k: record["attributes"][k] for k in args.fields if k in record["attributes"]}
record["attributes"] = attrs
write_line(record, args.output)

if __name__ == "__main__":
main()
16 changes: 11 additions & 5 deletions blsl/pansn_rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ def tqdm(*args, **kwargs):
from sys import stderr, exit


def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refdelim="~"):
def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refdelim="~", lowercase=True):
def L(string):
if lowercase:
return string.lower()
return string
names = []
if not reffa.endswith(".fai"):
reffa += ".fai"
Expand All @@ -26,11 +30,11 @@ def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refd
splitname = name.strip().split(refdelim)
pansn_name = delim.join(splitname)
if mode == "cat":
fromto[pansn_name] = outdelim.join(splitname)
fromto[pansn_name] = L(outdelim.join(splitname))
elif mode == "add":
fromto[splitname[2]] = outdelim.join(splitname)
fromto[splitname[2]] = L(outdelim.join(splitname))
elif mode == "rm":
fromto[pansn_name] = splitname[2]
fromto[pansn_name] = L(splitname[2])
return fromto


Expand Down Expand Up @@ -76,6 +80,8 @@ def main(argv=None):
help="Delimiter between columns in c/tsv file. (default tab)")
ap.add_argument("-d", "--delim", default="_",
help="Delimiter between fields in PanSN names in <INPUT> (the # in indiv#1#chr1)")
ap.add_argument("-l", "--lowercase", action="store_true",
help="Lowercase all names")
ap.add_argument("-D", "--out-delim",
help="--delim for the output. Defaults to the same as --delim.")
ap.add_argument("-r", "--reference-fasta", required=True,
Expand Down Expand Up @@ -106,7 +112,7 @@ def main(argv=None):
if args.ref_delim is None:
args.ref_delim = args.delim

replacer = make_replacer(args.mode, args.reference_fasta, args.org_name, args.delim, args.out_delim, args.ref_delim)
replacer = make_replacer(args.mode, args.reference_fasta, args.org_name, args.delim, args.out_delim, args.ref_delim, lowercase=args.lowercase)

input_ftype = "tsv"
try:
Expand Down
95 changes: 95 additions & 0 deletions blsl/vcfparallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from tqdm import tqdm
from cyvcf2 import VCF

import sys
from sys import stdin, stdout, stderr
import subprocess
import argparse
import math
import tempfile
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import uuid


def parallel_regions(vcf, cores=1):
V = VCF(vcf)
# 10 chunks per chrom per core
chunks = len(V.seqlens)*10*cores
for cname, clen in zip(V.seqnames, V.seqlens):
chunk = int(max(min(clen, 1000), math.ceil(clen/chunks)))
for start in range(0, clen, chunk):
s = start+1
e = min(clen, start+chunk+1)
yield f"{cname}:{s:09d}-{e:09d}"


def one_chunk(vcf, chunk, temp_prefix, filter="", pipeline=""):
cmd=f"bcftools view -r {chunk} {filter} -Ou {vcf}"
if pipeline:
cmd = f"{cmd} | {pipeline}"
ofile = f"{temp_prefix}{chunk}.bcf"
with open(ofile, "wb") as ofh, open(ofile + ".log", "wb") as log:
subprocess.run(cmd, shell=True, stdout=ofh, stderr=log, check=True)
subprocess.run(f"bcftools index -f {ofile}", shell=True, stderr=log, check=True)
return ofile


def chunkwise_pipeline(args):
filestomerge = []
with ProcessPoolExecutor(args.threads) as exc:
jobs = []
for region in parallel_regions(args.vcf):
jobs.append(exc.submit(one_chunk, args.vcf, region, args.temp_prefix, filter=args.filter, pipeline=args.commands))
for job in tqdm(as_completed(jobs), total=len(jobs), unit="chunk"):
ofile = job.result()
filestomerge.append(ofile)
return filestomerge


def merge_results(args, filestomerge):
fofn = f"{args.temp_prefix}fofn.txt"
with open(fofn, "w") as fh:
for file in sorted(filestomerge):
print(file, file=fh)
cmd = f"bcftools concat --file-list {fofn} --rm-dup all --allow-overlaps --threads {args.threads} -O{args.outformat} -o {args.output}"
proc = subprocess.run(cmd, shell=True, check=True)
return proc.returncode


def main(argv=None):
"""Use bcftools to calculate various statistics, outputing an R-ready table"""
ap = argparse.ArgumentParser()
ap.add_argument("-o", "--output", required=True,
help="Output V/BCF file")
ap.add_argument("-O", "--outformat", default="z8",
help="Output file format (passed directly to bcftools -O, see man bcftools but z=vcf.gz and b=bcf)")
ap.add_argument("-T", "--temp-prefix", default=None,
help="Temporary output directory")
ap.add_argument("-f", "--filter", default="", type=str,
help="bcftools view arguments for variant filtering")
ap.add_argument("-c", "--commands", default="", type=str,
help="command(s) to operate. Must take uncompressed bcf on stdin and yield uncompresed bcf on stdout. Can use | and other shell features.")
ap.add_argument("-t", "--threads", default=1, type=int,
help="Number of parallel threads")
ap.add_argument("vcf")
args = ap.parse_args(argv)

if args.temp_prefix is None:
args.temp_prefix = tempfile.gettempdir() + "/bcffilter"
tp = Path(args.temp_prefix)
if tp.is_dir() and not args.temp_prefix.endswith("/"):
args.temp_prefix += "/"
args.temp_prefix += str(uuid.uuid4())
print(f"Using temporary file prefix: {args.temp_prefix}. This WILL NOT be automatically cleaned!!", file=stderr)

chunkfiles = chunkwise_pipeline(args)
merge_results(args, chunkfiles)

if __name__ == "__main__":
main()

0 comments on commit e9a6234

Please sign in to comment.