Merge pull request #3 from hallamlab/dev
minor fixes
Tony-xy-Liu authored May 17, 2023
2 parents ced7a5c + 3a3802f commit 1a3326a
Showing 14 changed files with 245 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@

__pycache__
cache

/lib/limes_x
2 changes: 1 addition & 1 deletion logistics/extract_mg-reads/setup/setup.smk
@@ -34,6 +34,6 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
2 changes: 1 addition & 1 deletion metagenomics/checkm_on_bin/setup/setup.smk
@@ -7,7 +7,7 @@ CHECKM_DB_REP = CHECKM_DB+"/.dmanifest"
rule singularity:
input:
CONTAINER,
CHECKM_DB,
CHECKM_DB_REP,

rule get_image:
output:
14 changes: 10 additions & 4 deletions metagenomics/metagenomic_assembly/lib/definition.py
@@ -5,6 +5,7 @@
SAMPLE = Item('sra accession')
READS = Item('metagenomic gzipped reads')

ASMBLER = Item('assembler')
ASM = Item('metagenomic assembly')
ASM_WS = Item('metagenomic assembly work')

@@ -41,7 +42,7 @@ def _flye(reads: list[Path]):
flye --meta --threads {params.threads} \
--pacbio-raw {' '.join('/ws/'+str(r) for r in reads)} --out-dir /ws/{ws}
mv {ws}/assembly.fasta {out}
"""
""", "flye"

def _megahit(reads: list[Path]):
ones, twos, singles = [], [], []
@@ -69,7 +70,7 @@ def _megahit(reads: list[Path]):
megahit --num-cpu-threads {params.threads} --memory {params.mem_gb}e9\
{' '.join(read_params)} --out-dir /ws/{ws}
mv {ws}/final.contigs.fa {out}
"""
""", "megahit"

#https://bioinformatics.stackexchange.com/questions/935/fast-way-to-count-number-of-reads-and-number-of-bases-in-a-fastq-file
read_sizes = context.output_folder.joinpath("temp.readcount.txt")
@@ -88,7 +89,10 @@ def _megahit(reads: list[Path]):
if int(nucleotides)/int(num_reads) > 600:
is_short_read = False

exe_cmd = _megahit(reads) if is_short_read else _flye(reads)
if is_short_read:
exe_cmd, assembler = _megahit(reads)
else:
exe_cmd, assembler = _flye(reads)
code = context.shell(f"""\
PYTHONPATH=""
{exe_cmd}
@@ -97,7 +101,8 @@ def _megahit(reads: list[Path]):
return JobResult(
manifest = {
ASM: out,
ASM_WS: ws
ASMBLER: assembler,
ASM_WS: ws,
},
)

@@ -106,6 +111,7 @@ def _megahit(reads: list[Path]):
.AddInput(SAMPLE, groupby=SAMPLE)\
.AddInput(READS, groupby=SAMPLE)\
.PromiseOutput(ASM)\
.PromiseOutput(ASMBLER)\
.SuggestedResources(threads=8, memory_gb=16)\
.Requires({FLYE, MEGAHIT, PIGZ})\
.SetHome(__file__)\
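For orientation, the combined effect of the hunks above: each assembler helper now returns both its shell command and a label, and that label is reported in the job manifest under the new 'assembler' item. A minimal sketch of the selection logic, assuming the ~600 bp mean-read-length cutoff used in the diff (the helpers here are hypothetical stand-ins for the real _megahit/_flye):

# Sketch only: illustrates the (command, label) pattern introduced by this commit.
from pathlib import Path

def _megahit(reads: list[Path]):
    return "megahit ...", "megahit"   # short-read assembler

def _flye(reads: list[Path]):
    return "flye --meta ...", "flye"  # long-read assembler

def pick_assembler(num_reads: int, nucleotides: int, reads: list[Path]):
    # a mean read length above ~600 bp is treated as long-read data
    is_short_read = nucleotides / num_reads <= 600
    exe_cmd, assembler = _megahit(reads) if is_short_read else _flye(reads)
    return exe_cmd, assembler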
2 changes: 1 addition & 1 deletion metagenomics/metagenomic_assembly/setup/setup.smk
@@ -39,6 +39,6 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
14 changes: 7 additions & 7 deletions metagenomics/metagenomic_binning/lib/definition.py
@@ -19,7 +19,10 @@
PIGZ = 'pigz'

def example_procedure(context: JobContext) -> JobResult:
COMPLETION, CONTAMINATION = 50, 5
# https://www.nature.com/articles/nbt.3893
# accepted standard for med. quality bins
COMPLETION, CONTAMINATION = 50, 10

TEMP_PREFIX = "temp"
cache = context.output_folder.joinpath(TEMP_PREFIX)
os.makedirs(cache, exist_ok=True)
@@ -47,10 +50,6 @@ def fail(msg: str):
assert isinstance(r, Path), f"expected path for reads, got {type(r)} for {r}"
zipped_reads.append(r)

# rtype = context.manifest[READ_TYPE]
# assert isinstance(rtype, str), f"invalid read type: {rtype}"
# _, read_type = rtype.split(':')

name = context.manifest[SAMPLE]
assert isinstance(name, str), f"name wasn't a str: {name}"

@@ -77,11 +76,12 @@ def fail(msg: str):
else:
single = True

# https://github.com/bxlab/metaWRAP/issues/254
# does not handle interleaved
special_read_type = " "
if single:
special_read_type = "--single-end"
if paired and single:
# https://github.com/bxlab/metaWRAP/issues/254
reads = [r for r in reads if not str(r).split(".")[-2].endswith("_2")]

#################################################################################
@@ -151,7 +151,7 @@ def get_new_name(bin: str):

TAB = '\t'
stats_file = context.output_folder.joinpath(f"{name}.stats")
with open(refine_out.joinpath("metawrap_50_5_bins.stats")) as original:
with open(refine_out.joinpath(f"metawrap_{COMPLETION}_{CONTAMINATION}_bins.stats")) as original:
with open(stats_file, 'w') as stats:
header = original.readline()
stats.write(header)
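Since metaWRAP names its refinement output after the thresholds it is given (metawrap_<completeness>_<contamination>_bins.stats), relaxing contamination from 5 to 10 also required replacing the hard-coded metawrap_50_5_bins.stats path with one derived from the same constants. A small sketch of that relationship, using the values from this diff:

# Sketch: keep the stats filename in sync with the refinement thresholds.
COMPLETION, CONTAMINATION = 50, 10  # medium-quality bins per nbt.3893 (Bowers et al. 2017)
stats_name = f"metawrap_{COMPLETION}_{CONTAMINATION}_bins.stats"
assert stats_name == "metawrap_50_10_bins.stats"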
2 changes: 1 addition & 1 deletion metagenomics/metagenomic_binning/setup/setup.smk
@@ -34,7 +34,7 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""

10 changes: 9 additions & 1 deletion metagenomics/taxonomy_on_bin/lib/gtdbtk.py
@@ -63,6 +63,7 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C

code = context.shell(f"""\
mkdir -p {temp_dir}
PYTHONPATH=""
singularity run -B {",".join(binds)} {container} \
gtdbtk classify_wf -x {ext} \
--cpus {params.threads} --pplacer_cpus {int(min(params.threads, params.mem_gb//(40+1)))} \
@@ -84,7 +85,7 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C
},
)

summary = context.output_folder.joinpath(f"{sample}-bins.tax.tsv")
summary = context.output_folder.joinpath(f"{sample}.tax.tsv")
written_header = False
with open(summary, "w") as out:
for table in file_candidates:
@@ -94,6 +95,13 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C
for l in tsv:
out.write(l)

# todo: return trees as well
# out_tree = context.output_folder.joinpath(f"{sample}.tax.tree")
# file_candidates = os.listdir(classify_out) if classify_out.exists() else []
# file_candidates = [classify_out.joinpath(f) for f in file_candidates if (f.endswith(".tree") and "backbone" not in str(f))]
# with open(out_tree, "w") as out:


#clean up
context.shell(f"""\
cd {context.output_folder} && rm -r {TEMP_PREFIX}*
2 changes: 1 addition & 1 deletion metagenomics/taxonomy_on_bin/setup/setup.smk
@@ -12,7 +12,7 @@ rule get_image:
CONTAINER
shell:
"""
singularity build {output} docker://quay.io/txyliu/gtdbtk
singularity build {output} docker://ecogenomic/gtdbtk:2.2.6
"""

# check url if change version
73 changes: 73 additions & 0 deletions metagenomics/taxonomy_on_reads/lib/definition.py
@@ -0,0 +1,73 @@
from pathlib import Path
from limes_x import ModuleBuilder, Item, JobContext, JobResult

SAMPLE = Item('sra accession') # this is just used for grouping, todo: remove
READS = Item('metagenomic gzipped reads')

REPORT = Item('reads taxonomy table')
HITS = Item('gzipped taxonmic hits of reads')

CONTAINER = 'kraken2.sif'
_DB_SIZE = 16
# _DB_SIZE = 8
REF_DB = f'k2_standard_{_DB_SIZE:02}gb_20230314' # 8gb db is named "...08gb..."
PIGZ = 'pigz'

def procedure(context: JobContext) -> JobResult:
P = context.params
M = context.manifest
WS = context.output_folder
REF = P.reference_folder
TEMP = "TEMP"

sample = M[SAMPLE]
assert isinstance(sample, str), f"expected str for sample, got: {sample}"

_reads = M[READS]
input_dir = WS.joinpath(f"{TEMP}.inputs")
context.shell(f"mkdir -p {input_dir}")
assert not isinstance(_reads, str), f"got string instead of path for reads: {_reads}"
if not isinstance(_reads, list): _reads = [_reads]
reads = []
for r in _reads:
assert isinstance(r, Path), f"expected path for reads, got: {r}"
context.shell(f"cp -L {r.absolute()} {input_dir.joinpath(r.name)}")
reads.append(r.name)

binds = [
f"{REF}/:/ref",
f"{input_dir}:/inputs",
f"{WS}:/ws",
]

reports_name = f"{sample}.report.krkn"
hits_name = f"{sample}.hits.krkn"
hits_zipped = f"{hits_name}.gz"

context.shell(f"""\
singularity run -B {",".join(binds)} {REF.joinpath(CONTAINER)} kraken2 \
--threads {P.threads} --gzip-compressed {'--memory-mapping' if P.mem_gb<_DB_SIZE else ""} \
--db /ref/{REF_DB} \
--use-names --report /ws/{reports_name} \
{' '.join([f'/inputs/{r}' for r in reads])} \
| {REF.joinpath(PIGZ)} -7 -p {P.threads} >{WS.joinpath(hits_zipped)}
rm -r {WS}/{TEMP}*
""")

return JobResult(
manifest={
REPORT: WS.joinpath(reports_name),
HITS: WS.joinpath(hits_zipped),
},
)

MODULE = ModuleBuilder()\
.SetProcedure(procedure)\
.AddInput(SAMPLE, groupby=SAMPLE)\
.AddInput(READS, groupby=SAMPLE)\
.PromiseOutput(REPORT)\
.PromiseOutput(HITS)\
.SuggestedResources(threads=6, memory_gb=20)\
.Requires({CONTAINER, REF_DB, PIGZ})\
.SetHome(__file__, name=None)\
.Build()
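One detail in the shell call above: kraken2's --memory-mapping flag is only added when the job's memory budget is smaller than the 16 GB standard index, letting kraken2 read the database from disk via mmap rather than loading it fully into RAM. A minimal sketch of that decision with illustrative values:

# Sketch of the conditional --memory-mapping flag used in the kraken2 call above.
_DB_SIZE = 16  # GB, size of the k2_standard_16gb_20230314 index

def memory_mapping_flag(mem_gb: float) -> str:
    # mmap the index when it cannot be held entirely in RAM
    return "--memory-mapping" if mem_gb < _DB_SIZE else ""

assert memory_mapping_flag(14) == "--memory-mapping"  # e.g. mem_gb=14 as in test/kraken/test.py
assert memory_mapping_flag(20) == ""                  # e.g. the suggested memory_gb=20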
61 changes: 61 additions & 0 deletions metagenomics/taxonomy_on_reads/setup/setup.smk
@@ -0,0 +1,61 @@
CONTAINER = "kraken2.sif"
REF_DB = "k2_standard_16gb_20230314"
REF_DB_FILE = f"{REF_DB}/hash.k2d"
REF_DB_ZIP = f"{REF_DB}.tar.gz"
PIGZ = "pigz"

rule singularity:
input:
PIGZ,
CONTAINER,
REF_DB_FILE

rule get_image:
output:
CONTAINER
shell:
"""
singularity build {output} docker://staphb/kraken2:2.1.2-no-db
"""

rule download_refdata:
output:
REF_DB_ZIP
shell:
"""
wget --quiet https://genome-idx.s3.amazonaws.com/kraken/{output}
"""

rule extract_refdata:
input:
REF_DB_ZIP
output:
REF_DB_FILE
params:
db=REF_DB
shell:
"""
mkdir -p {params.db}
tar -xf {input} -C {params.db}
"""

rule get_pigz:
output:
"pigz.tar.gz"
shell:
"""
wget https://zlib.net/pigz/{output}
"""

rule compile_pigz:
output:
PIGZ
input:
"pigz.tar.gz"
shell:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
13 changes: 13 additions & 0 deletions test/kraken/scratch.sh
@@ -0,0 +1,13 @@
# docker run --rm \
# --mount type=bind,source="/home/tony/workspace/python/Limes-all/Limes-compute-modules/scratch/kraken/cache/",target="/ws" \
# staphb/kraken2:latest kraken2 \
# --threads 6 \
# --db /ws/k2_standard_08gb_20230314 \
# --gzip-compressed \
# --use-names --report /ws/testrep \
# --output /ws/testout \
# /ws/SRR10140508_1.fastq.gz

docker run --rm \
staphb/kraken2:latest kraken2 \
--help
21 changes: 21 additions & 0 deletions test/kraken/setup.py
@@ -0,0 +1,21 @@
import os, sys
sys.path = list(set([
"../../../Limes-x/src/"
]+sys.path))
from pathlib import Path
import limes_x as lx
from limes_x import ComputeModule


ref_dir = Path("../../../lx_ref")
os.makedirs(ref_dir, exist_ok=True)

HERE = Path("../../")

modules = []
# modules += ComputeModule.LoadSet(HERE.joinpath("logistics"))
modules += ComputeModule.LoadSet(HERE.joinpath("metagenomics"))
modules = [m for m in modules if m.name == "taxonomy_on_reads"]
# modules += ComputeModule.LoadSet(HERE.joinpath("high_throughput_screening"))
wf = lx.Workflow(modules, ref_dir)
wf.Setup('singularity')
44 changes: 44 additions & 0 deletions test/kraken/test.py
@@ -0,0 +1,44 @@
import os, sys
sys.path = list(set([
"../../lib/"
]+sys.path))
from pathlib import Path
import limes_x as lx

modules = []
# modules += lx.LoadComputeModules("../../../Limes-compute-modules/logistics")
modules += lx.LoadComputeModules("../../metagenomics")

WS = Path("./cache/test_workspace")
os.system(f"rm -r {WS}")
wf = lx.Workflow(modules, reference_folder="../../../lx_ref")
wf.Run(
workspace=WS,
targets=[
lx.Item('reads taxonomy table'),
# lx.Item('metagenomic gzipped reads'),
# lx.Item('metagenomic assembly'),
# lx.Item("metagenomic bin"),
# lx.Item("checkm stats"),
# lx.Item('assembly taxonomy table'),
# lx.Item('genomic annotation'),
],
given=[
lx.InputGroup(
group_by=(lx.Item("sra accession"), "SRR10140508"),
children={
lx.Item("username"): "tony",
lx.Item("metagenomic gzipped reads"): [
Path(p) for p in [
"./cache/SRR10140508_1.fastq.gz",
"./cache/SRR10140508_2.fastq.gz",
]
]
},
)
],
executor=lx.Executor(),
params=lx.Params(
mem_gb=14,
)
)
