kraken2

hallamlab · May 11, 2023 · 7d393bb · 7d393bb
1 parent 60e0064
commit 7d393bb
Show file tree

Hide file tree

Showing 5 changed files with 211 additions and 0 deletions.
diff --git a/metagenomics/taxonomy_on_reads/lib/definition.py b/metagenomics/taxonomy_on_reads/lib/definition.py
@@ -0,0 +1,72 @@
+from pathlib import Path
+from limes_x import ModuleBuilder, Item, JobContext, JobResult
+
+SAMPLE      = Item('sra accession') # this is just used for grouping, todo: remove
+READS       = Item('metagenomic gzipped reads')
+
+REPORT      = Item('reads taxonomy table')
+HITS        = Item('gzipped taxonmic hits of reads')
+
+CONTAINER   = 'kraken2.sif'
+_DB_SIZE = 16
+REF_DB      = f'k2_standard_{_DB_SIZE:02}gb_20230314' # 8gb db is named "...08gb..."
+PIGZ        = 'pigz'
+
+def procedure(context: JobContext) -> JobResult:
+    P = context.params
+    M = context.manifest
+    WS = context.output_folder
+    REF = P.reference_folder
+    TEMP = "TEMP"
+
+    sample = M[SAMPLE]
+    assert isinstance(sample, str), f"expected str for sample, got: {sample}"
+
+    _reads = M[READS]
+    input_dir = WS.joinpath(f"{TEMP}.inputs")
+    context.shell(f"mkdir -p {input_dir}")
+    assert not isinstance(_reads, str), f"got string instead of path for reads: {_reads}"
+    if not isinstance(_reads, list): _reads = [_reads]
+    reads = []
+    for r in _reads:
+        assert isinstance(r, Path), f"expected path for reads, got: {r}"
+        context.shell(f"ln -s {r.absolute()} {input_dir.joinpath(r.name)}")
+        reads.append(r.name)
+
+    binds = [
+        f"{REF}/:/ref",
+        f"{input_dir}:/inputs",
+        f"{WS}:/ws",
+    ]
+
+    reports_name = f"{sample}.report.krkn"
+    hits_name = f"{sample}.hits.krkn"
+    hits_zipped = f"{hits_name}.gz"
+
+    context.shell(f"""\
+        singularity run -B {",".join(binds)} {REF.joinpath(CONTAINER)} kraken2 \
+            --threads {P.threads} --gzip-compressed {'--memory-mapping' if P.mem_gb<_DB_SIZE else ""} \
+            --db /ref/{REF_DB} \
+            --use-names --report /ws/{reports_name} \
+            {' '.join([f'/inputs/{r}' for r in reads])} \
+        | {REF.joinpath(PIGZ)} -7 -p {P.threads} >{WS.joinpath(hits_zipped)}
+        rm -r {WS}/{TEMP}*
+    """)
+
+    return JobResult(
+        manifest={
+            REPORT: WS.joinpath(reports_name),
+            HITS: WS.joinpath(hits_zipped),
+        },
+    )
+
+MODULE = ModuleBuilder()\
+    .SetProcedure(procedure)\
+    .AddInput(SAMPLE,       groupby=SAMPLE)\
+    .AddInput(READS,        groupby=SAMPLE)\
+    .PromiseOutput(REPORT)\
+    .PromiseOutput(HITS)\
+    .SuggestedResources(threads=6, memory_gb=20)\
+    .Requires({CONTAINER, REF_DB, PIGZ})\
+    .SetHome(__file__, name=None)\
+    .Build()
diff --git a/metagenomics/taxonomy_on_reads/setup/setup.smk b/metagenomics/taxonomy_on_reads/setup/setup.smk
@@ -0,0 +1,61 @@
+CONTAINER = "kraken2.sif"
+REF_DB = "k2_standard_16gb_20230314"
+REF_DB_FILE = f"{REF_DB}/hash.k2d"
+REF_DB_ZIP = f"{REF_DB}.tar.gz"
+PIGZ = "pigz"
+
+rule singularity:
+    input:
+        PIGZ,
+        CONTAINER,
+        REF_DB_FILE
+
+rule get_image:
+    output:
+        CONTAINER
+    shell:
+        """
+        singularity build {output} docker://staphb/kraken2:2.1.2-no-db
+        """
+
+rule download_refdata:
+    output:
+        REF_DB_ZIP
+    shell:
+        """
+        wget --quiet https://genome-idx.s3.amazonaws.com/kraken/{output}
+        """
+
+rule extract_refdata:
+    input:
+        REF_DB_ZIP
+    output:
+        REF_DB_FILE
+    params:
+        db=REF_DB
+    shell:
+        """
+        mkdir -p {params.db}
+        tar -xf {input} -C {params.db}
+        """
+
+rule get_pigz:
+    output:
+        "pigz.tar.gz"
+    shell:
+        """
+        wget https://zlib.net/pigz/{output}
+        """
+
+rule compile_pigz:
+    output:
+        PIGZ
+    input:
+        "pigz.tar.gz"
+    shell:
+        """
+        tar -xf {input} && mv pigz pigz_lib \
+        && cd pigz_lib && make \
+        && chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
+        && rm {input}
+        """
diff --git a/test/kraken/scratch.sh b/test/kraken/scratch.sh
@@ -0,0 +1,13 @@
+# docker run --rm \
+#     --mount type=bind,source="/home/tony/workspace/python/Limes-all/Limes-compute-modules/scratch/kraken/cache/",target="/ws" \
+#     staphb/kraken2:latest kraken2 \
+#     --threads 6 \
+#     --db /ws/k2_standard_08gb_20230314 \
+#     --gzip-compressed \
+#     --use-names --report /ws/testrep \
+#     --output /ws/testout \
+#     /ws/SRR10140508_1.fastq.gz
+
+docker run --rm \
+    staphb/kraken2:latest kraken2 \
+    --help
diff --git a/test/kraken/setup.py b/test/kraken/setup.py
@@ -0,0 +1,21 @@
+import os, sys
+sys.path = list(set([
+    "../../../Limes-x/src/"
+]+sys.path))
+from pathlib import Path
+import limes_x as lx
+from limes_x import ComputeModule
+
+
+ref_dir = Path("../../../lx_ref")
+os.makedirs(ref_dir, exist_ok=True)
+
+HERE = Path("../../")
+
+modules = []
+# modules += ComputeModule.LoadSet(HERE.joinpath("logistics"))
+modules += ComputeModule.LoadSet(HERE.joinpath("metagenomics"))
+modules = [m for m in modules if m.name == "taxonomy_on_reads"]
+# modules += ComputeModule.LoadSet(HERE.joinpath("high_throughput_screening"))
+wf = lx.Workflow(modules, ref_dir)
+wf.Setup('singularity')
diff --git a/test/kraken/test.py b/test/kraken/test.py
@@ -0,0 +1,44 @@
+import os, sys
+sys.path = list(set([
+    "../../lib/"
+]+sys.path))
+from pathlib import Path
+import limes_x as lx
+
+modules = []
+# modules += lx.LoadComputeModules("../../../Limes-compute-modules/logistics")
+modules += lx.LoadComputeModules("../../metagenomics")
+
+WS = Path("./cache/test_workspace")
+os.system(f"rm -r {WS}")
+wf = lx.Workflow(modules, reference_folder="../../../lx_ref")
+wf.Run(
+    workspace=WS,
+    targets=[
+        lx.Item('reads taxonomy table'),
+        # lx.Item('metagenomic gzipped reads'),
+        # lx.Item('metagenomic assembly'),
+        # lx.Item("metagenomic bin"),
+        # lx.Item("checkm stats"),
+        # lx.Item('assembly taxonomy table'),
+        # lx.Item('genomic annotation'),
+    ],
+    given=[
+        lx.InputGroup(  
+            group_by=(lx.Item("sra accession"), "SRR10140508"), 
+            children={
+                lx.Item("username"): "tony",
+                lx.Item("metagenomic gzipped reads"): [
+                    Path(p) for p in [
+                        "./cache/SRR10140508_1.fastq.gz",
+                        "./cache/SRR10140508_2.fastq.gz",
+                    ]
+                ]
+            },
+        )
+    ],
+    executor=lx.Executor(),
+    params=lx.Params(
+        mem_gb=14,
+    )
+)