Merge pull request #3 from hallamlab/dev
minor fixes
Tony-xy-Liu authored May 17, 2023
2 parents ced7a5c + 3a3802f commit 1a3326a
Showing 14 changed files with 245 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@

__pycache__
cache

/lib/limes_x
2 changes: 1 addition & 1 deletion logistics/extract_mg-reads/setup/setup.smk
@@ -34,6 +34,6 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
2 changes: 1 addition & 1 deletion metagenomics/checkm_on_bin/setup/setup.smk
@@ -7,7 +7,7 @@ CHECKM_DB_REP = CHECKM_DB+"/.dmanifest"
rule singularity:
input:
CONTAINER,
CHECKM_DB,
CHECKM_DB_REP,

rule get_image:
output:
14 changes: 10 additions & 4 deletions metagenomics/metagenomic_assembly/lib/definition.py
@@ -5,6 +5,7 @@
SAMPLE = Item('sra accession')
READS = Item('metagenomic gzipped reads')

ASMBLER = Item('assembler')
ASM = Item('metagenomic assembly')
ASM_WS = Item('metagenomic assembly work')

@@ -41,7 +42,7 @@ def _flye(reads: list[Path]):
flye --meta --threads {params.threads} \
--pacbio-raw {' '.join('/ws/'+str(r) for r in reads)} --out-dir /ws/{ws}
mv {ws}/assembly.fasta {out}
"""
""", "flye"

def _megahit(reads: list[Path]):
ones, twos, singles = [], [], []
@@ -69,7 +70,7 @@ def _megahit(reads: list[Path]):
megahit --num-cpu-threads {params.threads} --memory {params.mem_gb}e9\
{' '.join(read_params)} --out-dir /ws/{ws}
mv {ws}/final.contigs.fa {out}
"""
""", "megahit"

#https://bioinformatics.stackexchange.com/questions/935/fast-way-to-count-number-of-reads-and-number-of-bases-in-a-fastq-file
read_sizes = context.output_folder.joinpath("temp.readcount.txt")
@@ -88,7 +89,10 @@ def _megahit(reads: list[Path]):
if int(nucleotides)/int(num_reads) > 600:
is_short_read = False

exe_cmd = _megahit(reads) if is_short_read else _flye(reads)
if is_short_read:
exe_cmd, assembler = _megahit(reads)
else:
exe_cmd, assembler = _flye(reads)
code = context.shell(f"""\
PYTHONPATH=""
{exe_cmd}
@@ -97,7 +101,8 @@ def _megahit(reads: list[Path]):
return JobResult(
manifest = {
ASM: out,
ASM_WS: ws
ASMBLER: assembler,
ASM_WS: ws,
},
)

@@ -106,6 +111,7 @@ def _megahit(reads: list[Path]):
.AddInput(SAMPLE, groupby=SAMPLE)\
.AddInput(READS, groupby=SAMPLE)\
.PromiseOutput(ASM)\
.PromiseOutput(ASMBLER)\
.SuggestedResources(threads=8, memory_gb=16)\
.Requires({FLYE, MEGAHIT, PIGZ})\
.SetHome(__file__)\
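For orientation, the combined effect of the hunks above: each assembler helper now returns both its shell command and a label, and that label is reported in the job manifest under the new 'assembler' item. A minimal sketch of the selection logic, assuming the ~600 bp mean-read-length cutoff used in the diff (the helpers here are hypothetical stand-ins for the real _megahit/_flye):

# Sketch only: illustrates the (command, label) pattern introduced by this commit.
from pathlib import Path

def _megahit(reads: list[Path]):
    return "megahit ...", "megahit"   # short-read assembler

def _flye(reads: list[Path]):
    return "flye --meta ...", "flye"  # long-read assembler

def pick_assembler(num_reads: int, nucleotides: int, reads: list[Path]):
    # a mean read length above ~600 bp is treated as long-read data
    is_short_read = nucleotides / num_reads <= 600
    exe_cmd, assembler = _megahit(reads) if is_short_read else _flye(reads)
    return exe_cmd, assembler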
2 changes: 1 addition & 1 deletion metagenomics/metagenomic_assembly/setup/setup.smk
@@ -39,6 +39,6 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
14 changes: 7 additions & 7 deletions metagenomics/metagenomic_binning/lib/definition.py
@@ -19,7 +19,10 @@
PIGZ = 'pigz'

def example_procedure(context: JobContext) -> JobResult:
COMPLETION, CONTAMINATION = 50, 5
# https://www.nature.com/articles/nbt.3893
# accepted standard for med. quality bins
COMPLETION, CONTAMINATION = 50, 10

TEMP_PREFIX = "temp"
cache = context.output_folder.joinpath(TEMP_PREFIX)
os.makedirs(cache, exist_ok=True)
@@ -47,10 +50,6 @@ def fail(msg: str):
assert isinstance(r, Path), f"expected path for reads, got {type(r)} for {r}"
zipped_reads.append(r)

# rtype = context.manifest[READ_TYPE]
# assert isinstance(rtype, str), f"invalid read type: {rtype}"
# _, read_type = rtype.split(':')

name = context.manifest[SAMPLE]
assert isinstance(name, str), f"name wasn't a str: {name}"

@@ -77,11 +76,12 @@ def fail(msg: str):
else:
single = True

# https://github.com/bxlab/metaWRAP/issues/254
# does not handle interleaved
special_read_type = " "
if single:
special_read_type = "--single-end"
if paired and single:
# https://github.com/bxlab/metaWRAP/issues/254
reads = [r for r in reads if not str(r).split(".")[-2].endswith("_2")]

#################################################################################
@@ -151,7 +151,7 @@ def get_new_name(bin: str):

TAB = '\t'
stats_file = context.output_folder.joinpath(f"{name}.stats")
with open(refine_out.joinpath("metawrap_50_5_bins.stats")) as original:
with open(refine_out.joinpath(f"metawrap_{COMPLETION}_{CONTAMINATION}_bins.stats")) as original:
with open(stats_file, 'w') as stats:
header = original.readline()
stats.write(header)
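Since metaWRAP names its refinement output after the thresholds it is given (metawrap_<completeness>_<contamination>_bins.stats), relaxing contamination from 5 to 10 also required replacing the hard-coded metawrap_50_5_bins.stats path with one derived from the same constants. A small sketch of that relationship, using the values from this diff:

# Sketch: keep the stats filename in sync with the refinement thresholds.
COMPLETION, CONTAMINATION = 50, 10  # medium-quality bins per nbt.3893 (Bowers et al. 2017)
stats_name = f"metawrap_{COMPLETION}_{CONTAMINATION}_bins.stats"
assert stats_name == "metawrap_50_10_bins.stats"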
2 changes: 1 addition & 1 deletion metagenomics/metagenomic_binning/setup/setup.smk
@@ -34,7 +34,7 @@ rule compile_pigz:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""

10 changes: 9 additions & 1 deletion metagenomics/taxonomy_on_bin/lib/gtdbtk.py
@@ -63,6 +63,7 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C

code = context.shell(f"""\
mkdir -p {temp_dir}
PYTHONPATH=""
singularity run -B {",".join(binds)} {container} \
gtdbtk classify_wf -x {ext} \
--cpus {params.threads} --pplacer_cpus {int(min(params.threads, params.mem_gb//(40+1)))} \
@@ -84,7 +85,7 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C
},
)

summary = context.output_folder.joinpath(f"{sample}-bins.tax.tsv")
summary = context.output_folder.joinpath(f"{sample}.tax.tsv")
written_header = False
with open(summary, "w") as out:
for table in file_candidates:
@@ -94,6 +95,13 @@ def gtdbtk_procedure(context: JobContext, SAMPLE, BINS, GTDBTK_WS, GTDBTK_TAX, C
for l in tsv:
out.write(l)

# todo: return trees as well
# out_tree = context.output_folder.joinpath(f"{sample}.tax.tree")
# file_candidates = os.listdir(classify_out) if classify_out.exists() else []
# file_candidates = [classify_out.joinpath(f) for f in file_candidates if (f.endswith(".tree") and "backbone" not in str(f))]
# with open(out_tree, "w") as out:


#clean up
context.shell(f"""\
cd {context.output_folder} && rm -r {TEMP_PREFIX}*
2 changes: 1 addition & 1 deletion metagenomics/taxonomy_on_bin/setup/setup.smk
@@ -12,7 +12,7 @@ rule get_image:
CONTAINER
shell:
"""
singularity build {output} docker://quay.io/txyliu/gtdbtk
singularity build {output} docker://ecogenomic/gtdbtk:2.2.6
"""

# check url if change version
73 changes: 73 additions & 0 deletions metagenomics/taxonomy_on_reads/lib/definition.py
@@ -0,0 +1,73 @@
from pathlib import Path
from limes_x import ModuleBuilder, Item, JobContext, JobResult

SAMPLE = Item('sra accession') # this is just used for grouping, todo: remove
READS = Item('metagenomic gzipped reads')

REPORT = Item('reads taxonomy table')
HITS = Item('gzipped taxonmic hits of reads')

CONTAINER = 'kraken2.sif'
_DB_SIZE = 16
# _DB_SIZE = 8
REF_DB = f'k2_standard_{_DB_SIZE:02}gb_20230314' # 8gb db is named "...08gb..."
PIGZ = 'pigz'

def procedure(context: JobContext) -> JobResult:
P = context.params
M = context.manifest
WS = context.output_folder
REF = P.reference_folder
TEMP = "TEMP"

sample = M[SAMPLE]
assert isinstance(sample, str), f"expected str for sample, got: {sample}"

_reads = M[READS]
input_dir = WS.joinpath(f"{TEMP}.inputs")
context.shell(f"mkdir -p {input_dir}")
assert not isinstance(_reads, str), f"got string instead of path for reads: {_reads}"
if not isinstance(_reads, list): _reads = [_reads]
reads = []
for r in _reads:
assert isinstance(r, Path), f"expected path for reads, got: {r}"
context.shell(f"cp -L {r.absolute()} {input_dir.joinpath(r.name)}")
reads.append(r.name)

binds = [
f"{REF}/:/ref",
f"{input_dir}:/inputs",
f"{WS}:/ws",
]

reports_name = f"{sample}.report.krkn"
hits_name = f"{sample}.hits.krkn"
hits_zipped = f"{hits_name}.gz"

context.shell(f"""\
singularity run -B {",".join(binds)} {REF.joinpath(CONTAINER)} kraken2 \
--threads {P.threads} --gzip-compressed {'--memory-mapping' if P.mem_gb<_DB_SIZE else ""} \
--db /ref/{REF_DB} \
--use-names --report /ws/{reports_name} \
{' '.join([f'/inputs/{r}' for r in reads])} \
| {REF.joinpath(PIGZ)} -7 -p {P.threads} >{WS.joinpath(hits_zipped)}
rm -r {WS}/{TEMP}*
""")

return JobResult(
manifest={
REPORT: WS.joinpath(reports_name),
HITS: WS.joinpath(hits_zipped),
},
)

MODULE = ModuleBuilder()\
.SetProcedure(procedure)\
.AddInput(SAMPLE, groupby=SAMPLE)\
.AddInput(READS, groupby=SAMPLE)\
.PromiseOutput(REPORT)\
.PromiseOutput(HITS)\
.SuggestedResources(threads=6, memory_gb=20)\
.Requires({CONTAINER, REF_DB, PIGZ})\
.SetHome(__file__, name=None)\
.Build()
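One detail in the shell call above: kraken2's --memory-mapping flag is only added when the job's memory budget is smaller than the 16 GB standard index, letting kraken2 read the database from disk via mmap rather than loading it fully into RAM. A minimal sketch of that decision with illustrative values:

# Sketch of the conditional --memory-mapping flag used in the kraken2 call above.
_DB_SIZE = 16  # GB, size of the k2_standard_16gb_20230314 index

def memory_mapping_flag(mem_gb: float) -> str:
    # mmap the index when it cannot be held entirely in RAM
    return "--memory-mapping" if mem_gb < _DB_SIZE else ""

assert memory_mapping_flag(14) == "--memory-mapping"  # e.g. mem_gb=14 as in test/kraken/test.py
assert memory_mapping_flag(20) == ""                  # e.g. the suggested memory_gb=20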
61 changes: 61 additions & 0 deletions metagenomics/taxonomy_on_reads/setup/setup.smk
@@ -0,0 +1,61 @@
CONTAINER = "kraken2.sif"
REF_DB = "k2_standard_16gb_20230314"
REF_DB_FILE = f"{REF_DB}/hash.k2d"
REF_DB_ZIP = f"{REF_DB}.tar.gz"
PIGZ = "pigz"

rule singularity:
input:
PIGZ,
CONTAINER,
REF_DB_FILE

rule get_image:
output:
CONTAINER
shell:
"""
singularity build {output} docker://staphb/kraken2:2.1.2-no-db
"""

rule download_refdata:
output:
REF_DB_ZIP
shell:
"""
wget --quiet https://genome-idx.s3.amazonaws.com/kraken/{output}
"""

rule extract_refdata:
input:
REF_DB_ZIP
output:
REF_DB_FILE
params:
db=REF_DB
shell:
"""
mkdir -p {params.db}
tar -xf {input} -C {params.db}
"""

rule get_pigz:
output:
"pigz.tar.gz"
shell:
"""
wget https://zlib.net/pigz/{output}
"""

rule compile_pigz:
output:
PIGZ
input:
"pigz.tar.gz"
shell:
"""
tar -xf {input} && mv pigz pigz_lib \
&& cd pigz_lib && make \
&& chmod +x pigz && cp pigz ../ && cd ../ && rm -rf pigz_lib \
&& rm {input}
"""
13 changes: 13 additions & 0 deletions test/kraken/scratch.sh
@@ -0,0 +1,13 @@
# docker run --rm \
# --mount type=bind,source="/home/tony/workspace/python/Limes-all/Limes-compute-modules/scratch/kraken/cache/",target="/ws" \
# staphb/kraken2:latest kraken2 \
# --threads 6 \
# --db /ws/k2_standard_08gb_20230314 \
# --gzip-compressed \
# --use-names --report /ws/testrep \
# --output /ws/testout \
# /ws/SRR10140508_1.fastq.gz

docker run --rm \
staphb/kraken2:latest kraken2 \
--help
21 changes: 21 additions & 0 deletions test/kraken/setup.py
@@ -0,0 +1,21 @@
import os, sys
sys.path = list(set([
"../../../Limes-x/src/"
]+sys.path))
from pathlib import Path
import limes_x as lx
from limes_x import ComputeModule


ref_dir = Path("../../../lx_ref")
os.makedirs(ref_dir, exist_ok=True)

HERE = Path("../../")

modules = []
# modules += ComputeModule.LoadSet(HERE.joinpath("logistics"))
modules += ComputeModule.LoadSet(HERE.joinpath("metagenomics"))
modules = [m for m in modules if m.name == "taxonomy_on_reads"]
# modules += ComputeModule.LoadSet(HERE.joinpath("high_throughput_screening"))
wf = lx.Workflow(modules, ref_dir)
wf.Setup('singularity')
44 changes: 44 additions & 0 deletions test/kraken/test.py
@@ -0,0 +1,44 @@
import os, sys
sys.path = list(set([
"../../lib/"
]+sys.path))
from pathlib import Path
import limes_x as lx

modules = []
# modules += lx.LoadComputeModules("../../../Limes-compute-modules/logistics")
modules += lx.LoadComputeModules("../../metagenomics")

WS = Path("./cache/test_workspace")
os.system(f"rm -r {WS}")
wf = lx.Workflow(modules, reference_folder="../../../lx_ref")
wf.Run(
workspace=WS,
targets=[
lx.Item('reads taxonomy table'),
# lx.Item('metagenomic gzipped reads'),
# lx.Item('metagenomic assembly'),
# lx.Item("metagenomic bin"),
# lx.Item("checkm stats"),
# lx.Item('assembly taxonomy table'),
# lx.Item('genomic annotation'),
],
given=[
lx.InputGroup(
group_by=(lx.Item("sra accession"), "SRR10140508"),
children={
lx.Item("username"): "tony",
lx.Item("metagenomic gzipped reads"): [
Path(p) for p in [
"./cache/SRR10140508_1.fastq.gz",
"./cache/SRR10140508_2.fastq.gz",
]
]
},
)
],
executor=lx.Executor(),
params=lx.Params(
mem_gb=14,
)
)
