version 0.2.2: lots of new stuff

kdm9 · Nov 2, 2023 · e9a6234 · e9a6234
1 parent 4d47e31
commit e9a6234
Show file tree

Hide file tree

Showing 6 changed files with 167 additions and 22 deletions.
diff --git a/blsl/__init__.py b/blsl/__init__.py
@@ -6,7 +6,7 @@
 
 from sys import argv, exit, stderr
 
-__version__ = "0.2.1"
+__version__ = "0.2.2"
 
 cmds = {}
 
@@ -28,6 +28,9 @@
 from .liftoff_gff3 import liftoff_gff3_main
 cmds["liftoff-gff3"] = liftoff_gff3_main
 
+from .gfftagsane import main as gfftagsane_main
+cmds["gfftagsane"] = gfftagsane_main
+
 from .pansn_rename import main as pansn_rename_main
 cmds["pansn-rename"] = pansn_rename_main
 
@@ -80,6 +83,9 @@
 try:
     from .vcfstats import main as vcfstats_main
     cmds["vcfstats"] = vcfstats_main
+
+    from .vcfparallel import main as vcfparallel_main
+    cmds["vcfparallel"] = vcfparallel_main
 except ImportError as exc:
     if len(argv) < 2:
         print(str(exc), "-- disabling vcfstats command", file=stderr)

diff --git a/blsl/deepclust2fa.py b/blsl/deepclust2fa.py
@@ -32,6 +32,8 @@ def deepclust2fa_main(argv=None):
             help="Buffer sequences rather than writing (more ram, more speed)")
     ap.add_argument("-c", "--clusters", required=True,
             help="Diamond deepclust result")
+    ap.add_argument("--orthofinder", "--of",  action="store_true",
+            help="--clusters is actually an orthofinder result, like OG: seq1 seq2 seq3")
     ap.add_argument("-f", "--faa", required=True,
             help="Fasta sequences (that you gave to diamond deepclust --db)")
     ap.add_argument("-o", "--outdir", required=True, type=Path,
@@ -41,12 +43,22 @@ def deepclust2fa_main(argv=None):
     seq2cent = {}
     cent2seq = defaultdict(list)
     with open(args.clusters) as fh:
-        for rec in csv.reader(fh, dialect=tsv):
-            if rec[0] == "centroid" and rec[1] == "member":
-                continue
-            c, s = rec[0:2]
-            seq2cent[s] = c
-            cent2seq[c].append(s)
+        if args.orthofinder:
+            for line in fh:
+                og, seqs = line.split(":")
+                og = og.strip()
+                seqs = seqs.split(", ")
+                for seq in seqs:
+                    seq = seq.strip()
+                    seq2cent[seq] = og
+                    cent2seq[og].append(seq)
+        else:
+            for rec in csv.reader(fh, dialect=tsv):
+                if rec[0] == "centroid" and rec[1] == "member":
+                    continue
+                c, s = rec[0:2]
+                seq2cent[s] = c
+                cent2seq[c].append(s)
     cent2seq_new = {}
     seq2cent_new = {}
     for cent, seqs in cent2seq.items():

diff --git a/blsl/gffparse.py b/blsl/gffparse.py
@@ -55,7 +55,7 @@ def parseGFF3(filename, return_as=dict):
     """
     #Parse with transparent decompression
     openFunc = gzip.open if str(filename).endswith(".gz") else open
-    with openFunc(filename) as infile:
+    with openFunc(filename, "rt") as infile:
         for line in infile:
             #if line.startswith("###"):
             #    ### Yield last gene if we move that here
@@ -103,6 +103,7 @@ def gff_heirarchy(filename, progress=None):
     }
     ignore = {
         "source",
+        "stop_codon",
     }
     records = {}
     l2l1 = {}
@@ -194,20 +195,20 @@ def prefix_name(entry, sid):
                 prefix_name(gchild, gcid)
             #print("H", gcid, gchild["attributes"]["Name"])
 
+
+def write_line(entry, file):
+    x = [entry[field] if entry[field] is not None else "." for i, field in enumerate(gffInfoFields)]
+    x[-1] = attr2line(x[-1])
+    print(*x, sep="\t", file=file)
+
 def write_gene(gene, geneid=None, file=None, changenames=False):
-    def write_line(entry):
-        x = [entry[field] if entry[field] is not None else "." for i, field in enumerate(gffInfoFields)]
-        x[-1] = attr2line(x[-1])
-        print(*x, sep="\t", file=file)
-    #print(gene, geneid)
-    #print("## gff-version 3", file=file)
     if geneid or changenames:
         reformat_names(gene, geneid, changenames)
-    write_line(gene)
+    write_line(gene, file)
     for child in gene.get("children", {}).values():
-        write_line(child)
+        write_line(child, file)
         for gchild in child.get("children", {}).values():
-            write_line(gchild)       
+            write_line(gchild, file)       
     print("###", file=file)
 
 

diff --git a/blsl/gfftagsane.py b/blsl/gfftagsane.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import argparse
+from sys import stdout
+from .gffparse import parseGFF3, write_line
+from tqdm import tqdm
+
+def main(argv=None):
+    """Sanitise a messy gff attribute column to just simple tags """
+    ap = argparse.ArgumentParser("gfftagsane")
+    ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout,
+            help="Output gff file")
+    ap.add_argument("-f", "--fields", default="ID,Name,Parent,locus_tag",
+            help="Attribute tags to keep (case sensitive, give multiple times like -f ID -f tag2 -f tag3).")
+    ap.add_argument("input")
+
+    args = ap.parse_args(argv)
+    args.fields = args.fields.split(",")
+
+    for record in tqdm(parseGFF3(args.input, return_as=dict)):
+        attrs = {k: record["attributes"][k] for k in args.fields if k in record["attributes"]}
+        record["attributes"] = attrs
+        write_line(record, args.output)
+
+if __name__ == "__main__":
+    main()
diff --git a/blsl/pansn_rename.py b/blsl/pansn_rename.py
@@ -11,7 +11,11 @@ def tqdm(*args, **kwargs):
 from sys import stderr, exit
 
 
-def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refdelim="~"):
+def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refdelim="~", lowercase=True):
+    def L(string):
+        if lowercase:
+            return string.lower()
+        return string
     names = []
     if not reffa.endswith(".fai"):
         reffa += ".fai"
@@ -26,11 +30,11 @@ def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refd
             splitname = name.strip().split(refdelim)
         pansn_name = delim.join(splitname)
         if mode == "cat": 
-            fromto[pansn_name] = outdelim.join(splitname)
+            fromto[pansn_name] = L(outdelim.join(splitname))
         elif mode == "add":
-            fromto[splitname[2]] = outdelim.join(splitname)
+            fromto[splitname[2]] = L(outdelim.join(splitname))
         elif mode == "rm":
-            fromto[pansn_name] = splitname[2]
+            fromto[pansn_name] = L(splitname[2])
     return fromto
 
 
@@ -76,6 +80,8 @@ def main(argv=None):
         help="Delimiter between columns in c/tsv file. (default tab)")
     ap.add_argument("-d", "--delim", default="_",
         help="Delimiter between fields in PanSN names in <INPUT> (the # in indiv#1#chr1)")
+    ap.add_argument("-l", "--lowercase", action="store_true",
+        help="Lowercase all names")
     ap.add_argument("-D", "--out-delim",
         help="--delim for the output. Defaults to the same as --delim.")
     ap.add_argument("-r", "--reference-fasta", required=True,
@@ -106,7 +112,7 @@ def main(argv=None):
     if args.ref_delim is None:
         args.ref_delim = args.delim
 
-    replacer = make_replacer(args.mode, args.reference_fasta, args.org_name, args.delim, args.out_delim, args.ref_delim)
+    replacer = make_replacer(args.mode, args.reference_fasta, args.org_name, args.delim, args.out_delim, args.ref_delim, lowercase=args.lowercase)
 
     input_ftype = "tsv"
     try:

diff --git a/blsl/vcfparallel.py b/blsl/vcfparallel.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from tqdm import tqdm
+from cyvcf2 import VCF
+
+import sys
+from sys import stdin, stdout, stderr
+import subprocess
+import argparse
+import math
+import tempfile
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+import uuid
+
+
+def parallel_regions(vcf, cores=1):
+    V = VCF(vcf)
+    # 10 chunks per chrom per core
+    chunks = len(V.seqlens)*10*cores
+    for cname, clen in zip(V.seqnames, V.seqlens):
+        chunk = int(max(min(clen, 1000), math.ceil(clen/chunks)))
+        for start in range(0, clen, chunk):
+            s = start+1
+            e = min(clen, start+chunk+1)
+            yield f"{cname}:{s:09d}-{e:09d}"
+
+
+def one_chunk(vcf, chunk, temp_prefix, filter="", pipeline=""):
+    cmd=f"bcftools view -r {chunk} {filter} -Ou {vcf}"
+    if pipeline:
+        cmd = f"{cmd} | {pipeline}"
+    ofile = f"{temp_prefix}{chunk}.bcf"
+    with open(ofile, "wb") as ofh, open(ofile + ".log", "wb") as log:
+        subprocess.run(cmd, shell=True, stdout=ofh, stderr=log, check=True)
+        subprocess.run(f"bcftools index -f {ofile}", shell=True, stderr=log, check=True)
+    return ofile
+
+
+def chunkwise_pipeline(args):
+    filestomerge = []
+    with ProcessPoolExecutor(args.threads) as exc:
+        jobs = []
+        for region in parallel_regions(args.vcf):
+            jobs.append(exc.submit(one_chunk, args.vcf, region, args.temp_prefix, filter=args.filter, pipeline=args.commands))
+        for job in tqdm(as_completed(jobs), total=len(jobs), unit="chunk"):
+            ofile = job.result()
+            filestomerge.append(ofile)
+    return filestomerge
+
+
+def merge_results(args, filestomerge):
+    fofn = f"{args.temp_prefix}fofn.txt"
+    with open(fofn, "w") as fh:
+        for file in sorted(filestomerge):
+            print(file, file=fh)
+    cmd = f"bcftools concat --file-list {fofn} --rm-dup all --allow-overlaps --threads {args.threads} -O{args.outformat} -o {args.output}"
+    proc = subprocess.run(cmd, shell=True, check=True)
+    return proc.returncode
+
+
+def main(argv=None):
+    """Use bcftools to calculate various statistics, outputing an R-ready table"""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("-o", "--output", required=True,
+            help="Output V/BCF file")
+    ap.add_argument("-O", "--outformat", default="z8",
+            help="Output file format (passed directly to bcftools -O, see man bcftools but z=vcf.gz and b=bcf)")
+    ap.add_argument("-T", "--temp-prefix", default=None,
+            help="Temporary output directory")
+    ap.add_argument("-f", "--filter", default="", type=str,
+            help="bcftools view arguments for variant filtering")
+    ap.add_argument("-c", "--commands", default="", type=str,
+            help="command(s) to operate. Must take uncompressed bcf on stdin and yield uncompresed bcf on stdout. Can use | and other shell features.")
+    ap.add_argument("-t", "--threads", default=1, type=int,
+            help="Number of parallel threads")
+    ap.add_argument("vcf")
+    args = ap.parse_args(argv)
+
+    if args.temp_prefix is None:
+        args.temp_prefix = tempfile.gettempdir() + "/bcffilter"
+    tp = Path(args.temp_prefix)
+    if tp.is_dir() and not args.temp_prefix.endswith("/"):
+        args.temp_prefix += "/"
+    args.temp_prefix += str(uuid.uuid4())
+    print(f"Using temporary file prefix: {args.temp_prefix}. This WILL NOT be automatically cleaned!!", file=stderr)
+
+    chunkfiles = chunkwise_pipeline(args)
+    merge_results(args, chunkfiles)
+
+if __name__ == "__main__":
+    main()