diff --git a/metagenomics/taxonomy_on_reads/lib/definition.py b/metagenomics/taxonomy_on_reads/lib/definition.py
new file mode 100755
index 0000000..32cdd73
--- /dev/null
+++ b/metagenomics/taxonomy_on_reads/lib/definition.py
@@ -0,0 +1,72 @@
+from pathlib import Path
+from limes_x import ModuleBuilder, Item, JobContext, JobResult
+
+SAMPLE = Item('sra accession') # this is just used for grouping, todo: remove
+READS = Item('metagenomic gzipped reads')
+
+REPORT = Item('reads taxonomy table')
+HITS = Item('gzipped taxonomic hits of reads')
+
+CONTAINER = 'kraken2.sif'
+_DB_SIZE = 16
+REF_DB = f'k2_standard_{_DB_SIZE:02}gb_20230314' # 8gb db is named "...08gb..."
+PIGZ = 'pigz'
+
+def procedure(context: JobContext) -> JobResult:
+    P = context.params
+    M = context.manifest
+    WS = context.output_folder
+    REF = P.reference_folder
+    TEMP = "TEMP"
+
+    sample = M[SAMPLE]
+    assert isinstance(sample, str), f"expected str for sample, got: {sample}"
+
+    _reads = M[READS]
+    input_dir = WS.joinpath(f"{TEMP}.inputs")
+    context.shell(f"mkdir -p {input_dir}")
+    assert not isinstance(_reads, str), f"got string instead of path for reads: {_reads}"
+    if not isinstance(_reads, list): _reads = [_reads]
+    reads = []
+    for r in _reads:
+        assert isinstance(r, Path), f"expected path for reads, got: {r}"
+        context.shell(f"ln -s {r.absolute()} {input_dir.joinpath(r.name)}")
+        reads.append(r.name)
+
+    binds = [
+        f"{REF}/:/ref",
+        f"{input_dir}:/inputs",
+        f"{WS}:/ws",
+    ]
+
+    reports_name = f"{sample}.report.krkn"
+    hits_name = f"{sample}.hits.krkn"
+    hits_zipped = f"{hits_name}.gz"
+
+    context.shell(f"""\
+        singularity run -B {",".join(binds)} {REF.joinpath(CONTAINER)} kraken2 \
+            --threads {P.threads} --gzip-compressed {'--memory-mapping' if P.mem_gb<_DB_SIZE else ""} \
+            --db /ref/{REF_DB} \
+            --use-names --report /ws/{reports_name} \
+            {' '.join([f'/inputs/{r}' for r in reads])} \
+        | {REF.joinpath(PIGZ)} -7 -p {P.threads} >{WS.joinpath(hits_zipped)}
+        rm -r {WS}/{TEMP}*
+    """)
+
+    return JobResult(
+        manifest={
+            REPORT: WS.joinpath(reports_name),
+            HITS: WS.joinpath(hits_zipped),
+        },
+    )
+
+MODULE = ModuleBuilder()\
+    .SetProcedure(procedure)\
+    .AddInput(SAMPLE, groupby=SAMPLE)\
+    .AddInput(READS, groupby=SAMPLE)\
+    .PromiseOutput(REPORT)\
+    .PromiseOutput(HITS)\
+    .SuggestedResources(threads=6, memory_gb=20)\
+    .Requires({CONTAINER, REF_DB, PIGZ})\
+    .SetHome(__file__, name=None)\
+    .Build()
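For orientation, a minimal standalone sketch of the shell command that procedure() composes, with hypothetical values standing in for context.params, context.output_folder, and the read files; the real paths and numbers come from the workflow at run time. The --memory-mapping flag is only added when the configured mem_gb is below the 16 GB database size, in which case kraken2 maps the index from disk instead of loading it into RAM.

# illustrate_command.py -- hypothetical paths and numbers, for illustration only
from pathlib import Path

REF = Path("/refs")                                    # stands in for P.reference_folder
WS = Path("/data/ws")                                  # stands in for context.output_folder
sample = "SRR10140508"                                 # example accession
reads = ["SRR10140508_1.fastq.gz", "SRR10140508_2.fastq.gz"]
threads, mem_gb, db_size = 6, 14, 16                   # example resources vs. the 16 GB database

binds = [f"{REF}/:/ref", f"{WS}/TEMP.inputs:/inputs", f"{WS}:/ws"]
memory_flag = "--memory-mapping " if mem_gb < db_size else ""   # map the DB when RAM is short

cmd = (
    f'singularity run -B {",".join(binds)} {REF / "kraken2.sif"} kraken2 '
    f'--threads {threads} --gzip-compressed {memory_flag}'
    f'--db /ref/k2_standard_16gb_20230314 '
    f'--use-names --report /ws/{sample}.report.krkn '
    + " ".join(f"/inputs/{r}" for r in reads)
    + f' | {REF / "pigz"} -7 -p {threads} > {WS}/{sample}.hits.krkn.gz'
)
print(cmd)

Per-read classifications stream to stdout and are compressed by the bundled pigz (the HITS item), while the per-taxon summary goes to the --report file (the REPORT item).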
diff --git a/metagenomics/taxonomy_on_reads/setup/setup.smk b/metagenomics/taxonomy_on_reads/setup/setup.smk
new file mode 100644
index 0000000..0ec171d
--- /dev/null
+++ b/metagenomics/taxonomy_on_reads/setup/setup.smk
@@ -0,0 +1,61 @@
+CONTAINER = "kraken2.sif"
+REF_DB = "k2_standard_16gb_20230314"
+REF_DB_FILE = f"{REF_DB}/hash.k2d"
+REF_DB_ZIP = f"{REF_DB}.tar.gz"
+PIGZ = "pigz"
+
+rule singularity:
+    input:
+        PIGZ,
+        CONTAINER,
+        REF_DB_FILE
+
+rule get_image:
+    output:
+        CONTAINER
+    shell:
+        """
+        singularity build {output} docker://staphb/kraken2:2.1.2-no-db
+        """
+
+rule download_refdata:
+    output:
+        REF_DB_ZIP
+    shell:
+        """
+        wget --quiet https://genome-idx.s3.amazonaws.com/kraken/{output}
+        """
+
+rule extract_refdata:
+    input:
+        REF_DB_ZIP
+    output:
+        REF_DB_FILE
+    params:
+        db=REF_DB
+    shell:
+        """
+        mkdir -p {params.db}
+        tar -xf {input} -C {params.db}
+        """
+
+rule get_pigz:
+    output:
+        "pigz.tar.gz"
+    shell:
+        """
+        wget https://zlib.net/pigz/{output}
+        """
+
+rule compile_pigz:
+    output:
+        PIGZ
+    input:
+        "pigz.tar.gz"
+    shell:
+        """
+        tar -xf {input} && mv pigz pigz_lib \
+        && cd pigz_lib && make \
+        && chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
+        && rm {input}
+        """
\ No newline at end of file
diff --git a/test/kraken/scratch.sh b/test/kraken/scratch.sh
new file mode 100755
index 0000000..1dc2f8c
--- /dev/null
+++ b/test/kraken/scratch.sh
@@ -0,0 +1,13 @@
+# docker run --rm \
+#     --mount type=bind,source="/home/tony/workspace/python/Limes-all/Limes-compute-modules/scratch/kraken/cache/",target="/ws" \
+#     staphb/kraken2:latest kraken2 \
+#     --threads 6 \
+#     --db /ws/k2_standard_08gb_20230314 \
+#     --gzip-compressed \
+#     --use-names --report /ws/testrep \
+#     --output /ws/testout \
+#     /ws/SRR10140508_1.fastq.gz
+
+docker run --rm \
+    staphb/kraken2:latest kraken2 \
+    --help
diff --git a/test/kraken/setup.py b/test/kraken/setup.py
new file mode 100755
index 0000000..f744323
--- /dev/null
+++ b/test/kraken/setup.py
@@ -0,0 +1,21 @@
+import os, sys
+sys.path = list(set([
+    "../../../Limes-x/src/"
+]+sys.path))
+from pathlib import Path
+import limes_x as lx
+from limes_x import ComputeModule
+
+
+ref_dir = Path("../../../lx_ref")
+os.makedirs(ref_dir, exist_ok=True)
+
+HERE = Path("../../")
+
+modules = []
+# modules += ComputeModule.LoadSet(HERE.joinpath("logistics"))
+modules += ComputeModule.LoadSet(HERE.joinpath("metagenomics"))
+modules = [m for m in modules if m.name == "taxonomy_on_reads"]
+# modules += ComputeModule.LoadSet(HERE.joinpath("high_throughput_screening"))
+wf = lx.Workflow(modules, ref_dir)
+wf.Setup('singularity')
diff --git a/test/kraken/test.py b/test/kraken/test.py
new file mode 100755
index 0000000..9e39bdc
--- /dev/null
+++ b/test/kraken/test.py
@@ -0,0 +1,44 @@
+import os, sys
+sys.path = list(set([
+    "../../lib/"
+]+sys.path))
+from pathlib import Path
+import limes_x as lx
+
+modules = []
+# modules += lx.LoadComputeModules("../../../Limes-compute-modules/logistics")
+modules += lx.LoadComputeModules("../../metagenomics")
+
+WS = Path("./cache/test_workspace")
+os.system(f"rm -r {WS}")
+wf = lx.Workflow(modules, reference_folder="../../../lx_ref")
+wf.Run(
+    workspace=WS,
+    targets=[
+        lx.Item('reads taxonomy table'),
+        # lx.Item('metagenomic gzipped reads'),
+        # lx.Item('metagenomic assembly'),
+        # lx.Item("metagenomic bin"),
+        # lx.Item("checkm stats"),
+        # lx.Item('assembly taxonomy table'),
+        # lx.Item('genomic annotation'),
+    ],
+    given=[
+        lx.InputGroup(
+            group_by=(lx.Item("sra accession"), "SRR10140508"),
+            children={
+                lx.Item("username"): "tony",
+                lx.Item("metagenomic gzipped reads"): [
+                    Path(p) for p in [
+                        "./cache/SRR10140508_1.fastq.gz",
+                        "./cache/SRR10140508_2.fastq.gz",
+                    ]
+                ]
+            },
+        )
+    ],
+    executor=lx.Executor(),
+    params=lx.Params(
+        mem_gb=14,
+    )
+)
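As a rough sketch of what the 'reads taxonomy table' target above resolves to: kraken2 --report files are tab-separated with six columns (clade percentage, clade read count, directly-assigned read count, rank code, NCBI taxid, indented taxon name). A minimal reader, with a hypothetical file name standing in for wherever the workflow places the report:

# read_report.py -- minimal sketch; the report path below is a hypothetical example
import csv
from pathlib import Path

def load_report(path: Path):
    """Parse a standard 6-column kraken2 --report file into a list of dicts."""
    rows = []
    with open(path, newline="") as f:
        for pct, clade_reads, direct_reads, rank, taxid, name in csv.reader(f, delimiter="\t"):
            rows.append({
                "percent": float(pct),             # % of reads in this clade
                "clade_reads": int(clade_reads),   # reads in the clade, including children
                "direct_reads": int(direct_reads), # reads assigned directly to this taxon
                "rank": rank,                      # U/R/D/K/P/C/O/F/G/S, plus sub-ranks like G1
                "taxid": int(taxid),
                "name": name.strip(),              # leading spaces encode depth in the taxonomy
            })
    return rows

if __name__ == "__main__":
    for row in load_report(Path("SRR10140508.report.krkn"))[:10]:
        print(row)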