Skip to content

Commit

Permalink
kraken2
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-xy-Liu committed May 11, 2023
1 parent 60e0064 commit 7d393bb
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 0 deletions.
72 changes: 72 additions & 0 deletions metagenomics/taxonomy_on_reads/lib/definition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from pathlib import Path
from limes_x import ModuleBuilder, Item, JobContext, JobResult

SAMPLE = Item('sra accession') # this is just used for grouping, todo: remove
READS = Item('metagenomic gzipped reads')

REPORT = Item('reads taxonomy table')
HITS = Item('gzipped taxonmic hits of reads')

CONTAINER = 'kraken2.sif'
_DB_SIZE = 16
REF_DB = f'k2_standard_{_DB_SIZE:02}gb_20230314' # 8gb db is named "...08gb..."
PIGZ = 'pigz'

def procedure(context: JobContext) -> JobResult:
P = context.params
M = context.manifest
WS = context.output_folder
REF = P.reference_folder
TEMP = "TEMP"

sample = M[SAMPLE]
assert isinstance(sample, str), f"expected str for sample, got: {sample}"

_reads = M[READS]
input_dir = WS.joinpath(f"{TEMP}.inputs")
context.shell(f"mkdir -p {input_dir}")
assert not isinstance(_reads, str), f"got string instead of path for reads: {_reads}"
if not isinstance(_reads, list): _reads = [_reads]
reads = []
for r in _reads:
assert isinstance(r, Path), f"expected path for reads, got: {r}"
context.shell(f"ln -s {r.absolute()} {input_dir.joinpath(r.name)}")
reads.append(r.name)

binds = [
f"{REF}/:/ref",
f"{input_dir}:/inputs",
f"{WS}:/ws",
]

reports_name = f"{sample}.report.krkn"
hits_name = f"{sample}.hits.krkn"
hits_zipped = f"{hits_name}.gz"

context.shell(f"""\
singularity run -B {",".join(binds)} {REF.joinpath(CONTAINER)} kraken2 \
--threads {P.threads} --gzip-compressed {'--memory-mapping' if P.mem_gb<_DB_SIZE else ""} \
--db /ref/{REF_DB} \
--use-names --report /ws/{reports_name} \
{' '.join([f'/inputs/{r}' for r in reads])} \
| {REF.joinpath(PIGZ)} -7 -p {P.threads} >{WS.joinpath(hits_zipped)}
rm -r {WS}/{TEMP}*
""")

return JobResult(
manifest={
REPORT: WS.joinpath(reports_name),
HITS: WS.joinpath(hits_zipped),
},
)

MODULE = ModuleBuilder()\
.SetProcedure(procedure)\
.AddInput(SAMPLE, groupby=SAMPLE)\
.AddInput(READS, groupby=SAMPLE)\
.PromiseOutput(REPORT)\
.PromiseOutput(HITS)\
.SuggestedResources(threads=6, memory_gb=20)\
.Requires({CONTAINER, REF_DB, PIGZ})\
.SetHome(__file__, name=None)\
.Build()
61 changes: 61 additions & 0 deletions metagenomics/taxonomy_on_reads/setup/setup.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
CONTAINER = "kraken2.sif"
REF_DB = "k2_standard_16gb_20230314"
REF_DB_FILE = f"{REF_DB}/hash.k2d"
REF_DB_ZIP = f"{REF_DB}.tar.gz"
PIGZ = "pigz"

rule singularity:
input:
PIGZ,
CONTAINER,
REF_DB_FILE

rule get_image:
output:
CONTAINER
shell:
"""
singularity build {output} docker://staphb/kraken2:2.1.2-no-db
"""

rule download_refdata:
output:
REF_DB_ZIP
shell:
"""
wget --quiet https://genome-idx.s3.amazonaws.com/kraken/{output}
"""

rule extract_refdata:
input:
REF_DB_ZIP
output:
REF_DB_FILE
params:
db=REF_DB
shell:
"""
mkdir -p {params.db}
tar -xf {input} -C {params.db}
"""

rule get_pigz:
output:
"pigz.tar.gz"
shell:
"""
wget https://zlib.net/pigz/{output}
"""

rule compile_pigz:
output:
PIGZ
input:
"pigz.tar.gz"
shell:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
13 changes: 13 additions & 0 deletions test/kraken/scratch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# docker run --rm \
# --mount type=bind,source="/home/tony/workspace/python/Limes-all/Limes-compute-modules/scratch/kraken/cache/",target="/ws" \
# staphb/kraken2:latest kraken2 \
# --threads 6 \
# --db /ws/k2_standard_08gb_20230314 \
# --gzip-compressed \
# --use-names --report /ws/testrep \
# --output /ws/testout \
# /ws/SRR10140508_1.fastq.gz

docker run --rm \
staphb/kraken2:latest kraken2 \
--help
21 changes: 21 additions & 0 deletions test/kraken/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os, sys
sys.path = list(set([
"../../../Limes-x/src/"
]+sys.path))
from pathlib import Path
import limes_x as lx
from limes_x import ComputeModule


ref_dir = Path("../../../lx_ref")
os.makedirs(ref_dir, exist_ok=True)

HERE = Path("../../")

modules = []
# modules += ComputeModule.LoadSet(HERE.joinpath("logistics"))
modules += ComputeModule.LoadSet(HERE.joinpath("metagenomics"))
modules = [m for m in modules if m.name == "taxonomy_on_reads"]
# modules += ComputeModule.LoadSet(HERE.joinpath("high_throughput_screening"))
wf = lx.Workflow(modules, ref_dir)
wf.Setup('singularity')
44 changes: 44 additions & 0 deletions test/kraken/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os, sys
sys.path = list(set([
"../../lib/"
]+sys.path))
from pathlib import Path
import limes_x as lx

modules = []
# modules += lx.LoadComputeModules("../../../Limes-compute-modules/logistics")
modules += lx.LoadComputeModules("../../metagenomics")

WS = Path("./cache/test_workspace")
os.system(f"rm -r {WS}")
wf = lx.Workflow(modules, reference_folder="../../../lx_ref")
wf.Run(
workspace=WS,
targets=[
lx.Item('reads taxonomy table'),
# lx.Item('metagenomic gzipped reads'),
# lx.Item('metagenomic assembly'),
# lx.Item("metagenomic bin"),
# lx.Item("checkm stats"),
# lx.Item('assembly taxonomy table'),
# lx.Item('genomic annotation'),
],
given=[
lx.InputGroup(
group_by=(lx.Item("sra accession"), "SRR10140508"),
children={
lx.Item("username"): "tony",
lx.Item("metagenomic gzipped reads"): [
Path(p) for p in [
"./cache/SRR10140508_1.fastq.gz",
"./cache/SRR10140508_2.fastq.gz",
]
]
},
)
],
executor=lx.Executor(),
params=lx.Params(
mem_gb=14,
)
)

0 comments on commit 7d393bb

Please sign in to comment.