From b3ce8db0126fc0ebb8c992c0efa14b71e5e8339d Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 24 Oct 2024 16:06:50 +0200 Subject: [PATCH 01/41] adding singularity image for HapHiC --- .../singularity/haphic/haphic.f8f7451.def | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 workflow/singularity/haphic/haphic.f8f7451.def diff --git a/workflow/singularity/haphic/haphic.f8f7451.def b/workflow/singularity/haphic/haphic.f8f7451.def new file mode 100644 index 0000000..c7df714 --- /dev/null +++ b/workflow/singularity/haphic/haphic.f8f7451.def @@ -0,0 +1,36 @@ +Bootstrap: docker +From: mambaorg/micromamba:1.5.10-alpine3.20 + +%post + apk add git bash + + # Clone HapHiC repository + git clone https://github.com/zengxiaofei/HapHiC.git /opt/HapHiC + cd /opt/HapHiC + git checkout f8f7451b932a80b54cf21a9a7861d5f5c9140da2 + + # Create the conda environment + micromamba create -q -y -n haphic -f /opt/HapHiC/conda_env/environment_py310.yml + + # Create wrapper script for haphic + cat > /usr/local/bin/haphic << 'EOF' +#!/bin/bash +exec micromamba run -n haphic /opt/HapHiC/haphic "$@" +EOF + + chmod +x /usr/local/bin/haphic + + # Clean up + micromamba clean --all --yes + +%runscript + exec "$@" + +%help + This container provides HapHiC with all its dependencies. + + Usage: + singularity run haphic.sif haphic -h + singularity run haphic.sif haphic check + + For more information, visit: https://github.com/zengxiaofei/HapHiC From eb78b000d297a85e58a8b676fd73b48110650929 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 24 Oct 2024 16:08:15 +0200 Subject: [PATCH 02/41] rename ntjoin to scaffolding to generalise with hi-c later --- .../rules/2.scaffolding/{01.ntjoin.smk => 01.scaffolding.smk} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename workflow/rules/2.scaffolding/{01.ntjoin.smk => 01.scaffolding.smk} (100%) diff --git a/workflow/rules/2.scaffolding/01.ntjoin.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk similarity index 100% rename from workflow/rules/2.scaffolding/01.ntjoin.smk rename to workflow/rules/2.scaffolding/01.scaffolding.smk From 44cecee6bed23746f4149c547361312b368401a2 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 09:51:04 +0200 Subject: [PATCH 03/41] implement code to run HapHiC, but not connected to rest of pipeline yet --- .idea/misc.xml | 3 + config/samples.tsv | 8 +- workflow/envs/mapping.yaml | 3 +- .../rules/2.scaffolding/01.scaffolding.smk | 127 ++++++++++++++++++ workflow/rules/common.smk | 9 ++ workflow/schemas/samples.schema.yaml | 8 ++ .../singularity/haphic/haphic.f8f7451.def | 11 +- 7 files changed, 163 insertions(+), 6 deletions(-) diff --git a/.idea/misc.xml b/.idea/misc.xml index 060d2c5..db8786c 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file diff --git a/config/samples.tsv b/config/samples.tsv index 9f37f09..e30d510 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,4 +1,4 @@ -accessionId hifi ont illumina_1 illumina_2 haplotypes speciesName taxId referenceId -accessionA "resources/reads/HiFi/readsA1.fasta;resources/reads/HiFi/readsA2.fasta" "resources/reads/sample1/library1_1.fq.gz" "resources/reads/sample1/library1_2.fq.gz" 1 "speciesSp" 123 referenceA -accessionB "resources/reads/HiFi/readsB.fasta" "resources/reads/ONT/readsB1.fasta;resources/reads/ONT/readsB2.fasta" "" "" 2 "speciesSp" 123 referenceA -accessionC "resources/reads/HiFi/readsC.fasta" "" "" "" 1 "speciesSd" 456 referenceB +accessionId hifi ont illumina_1 illumina_2 hic_1 hic_2 haplotypes speciesName taxId referenceId +accessionA "resources/reads/HiFi/readsA1.fasta;resources/reads/HiFi/readsA2.fasta" "resources/reads/sample1/library1_1.fq.gz" "resources/reads/sample1/library1_2.fq.gz" "" "" 1 "speciesSp" 123 referenceA +accessionB "resources/reads/HiFi/readsB.fasta" "resources/reads/ONT/readsB1.fasta;resources/reads/ONT/readsB2.fasta" "" "" "" "" 2 "speciesSp" 123 referenceA +accessionC "resources/reads/HiFi/readsC.fasta" "" "" "" "" "" 1 "speciesSd" 456 referenceB diff --git a/workflow/envs/mapping.yaml b/workflow/envs/mapping.yaml index 5c21ba1..01d0faa 100644 --- a/workflow/envs/mapping.yaml +++ b/workflow/envs/mapping.yaml @@ -4,4 +4,5 @@ channels: dependencies: - bwa-mem2=2.2.1 - minimap2=2.26 - - samtools=1.19 \ No newline at end of file + - samtools=1.19 + - samblaster=0.1.26 \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index e0cea54..28b61fd 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -28,3 +28,130 @@ rule ntjoin: ntJoin assemble target=$(basename {output.all} | rev | cut -d"." -f7- | rev) references='{wildcards.reference}.fa' target_weight='1' reference_weights='2' n=2 G=10000 agp=True no_cut=True overlap=False k={wildcards.k} w={wildcards.w} mkt=True prefix=$(basename {output.all} | rev | cut -d '.' -f 2- | rev) t={threads} ) &> {log} """ + +rule bwa_index_contigs: + input: + contigs = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa", + output: + index1 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.0123", + index2 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.amb", + index3 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.ann", + index4 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.bwt.2bit.64", + index5 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.pac", + log: + "results/logs/2.scaffolding/bwa_index_contigs/{asmname}.min{minlen}.log" + benchmark: + "results/benchmarks/2.scaffolding/bwa_index_contigs/{asmname}.min{minlen}.txt" + conda: + "../../envs/mapping.yaml" + shell: + "bwa-mem2 index {input.contigs} &> {log}" + +rule map_hic: + input: + contigs = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa", + index1 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.0123", + index2 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.amb", + index3 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.ann", + index4 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.bwt.2bit.64", + index5 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.pac", + forward = get_hic_1, + backward = get_hic_2, + output: + "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.bam", + log: + "results/logs/2.scaffolding/map_hic/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/map_hic/{asmname}.txt" + threads: + 10 + conda: + "../../envs/mapping.yaml" + shell: + "(bwa-mem2 mem -t {threads} -5SP {input.contigs} {input.forward} {input.backward} | samblaster | samtools view - -@ $(({threads}-1)) -S -h -b -F 3340 -o {output}) &> {log}" + +rule filter_hic: + input: + "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.bam", + output: + "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + log: + "results/logs/2.scaffolding/filter_hic/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/filter_hic/{asmname}.txt" + threads: + 10 + singularity: + "workflow/singularity/haphic/haphic.f8f7451.sif" + shell: + "(filter_bam.py {input} 1 --nm 3 --threads {threads} | samtools view - -b -@ $(({threads}-1)) -o {output}) &> {log}" + +rule haphic: + input: + contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", + minlen=config["min_contig_len"], + ), + hic = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + #gfa = "", #TODO: we could look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) + output: + directory("results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/"), + log: + "results/logs/2.scaffolding/haphic/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/haphic/{asmname}.txt" + params: + num_chr = lambda wildcards: len(config["reference_genomes"][get_reference_id(wildcards.asmname)]["chromosomes"]), #the number of chromosomes + threads: + 8 + singularity: + "workflow/singularity/haphic/haphic.f8f7451.sif" + shell: + "haphic pipeline --threads {threads} --outdir {output} --verbose {input.contigs} {input.hic} {params.num_chr} &> {log}" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 00bfcce..abc7c90 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -43,6 +43,15 @@ def get_illumina_1(wildcards): def get_illumina_2(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["illumina_2"].values.item() +def has_hic(wildcards): + return not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hic_1"].isnull().values.item() + +def get_hic_1(wildcards): + return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hic_1"].values.item() + +def get_hic_2(wildcards): + return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hic_2"].values.item() + def get_haplotypes(wildcards): return int(SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["haplotypes"].values.item()) diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index fb78998..160b0ee 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -23,6 +23,14 @@ properties: type: string description: path to the reverse Illumina reads pattern: "^.*\\.[fF](ast)?q(\\.gz)?$" + hic_1: + type: string + description: path to the forward Hi-C reads + pattern: "^.*\\.[fF](ast)?q(\\.gz)?$" + hic_2: + type: string + description: path to the reverse Hi-C reads + pattern: "^.*\\.[fF](ast)?q(\\.gz)?$" haplotypes: type: integer description: expected number of haplotypes in the sample; use 1 for relatively homozygous diploids but 2 for heterozygous diploids diff --git a/workflow/singularity/haphic/haphic.f8f7451.def b/workflow/singularity/haphic/haphic.f8f7451.def index c7df714..b0f4858 100644 --- a/workflow/singularity/haphic/haphic.f8f7451.def +++ b/workflow/singularity/haphic/haphic.f8f7451.def @@ -11,14 +11,23 @@ From: mambaorg/micromamba:1.5.10-alpine3.20 # Create the conda environment micromamba create -q -y -n haphic -f /opt/HapHiC/conda_env/environment_py310.yml + micromamba install -y -n haphic -c conda-forge -c bioconda samtools=1.19.1 # Create wrapper script for haphic cat > /usr/local/bin/haphic << 'EOF' #!/bin/bash exec micromamba run -n haphic /opt/HapHiC/haphic "$@" EOF - chmod +x /usr/local/bin/haphic + + # Create wrapper script for all files in /opt/HapHiC/utils/ (except .jar) + for file in $(ls /opt/HapHiC/utils/ | grep -v ".jar"); do + cat > /usr/local/bin/${file} << EOF +#!/bin/bash +exec micromamba run -n haphic /opt/HapHiC/utils/${file} "\$@" +EOF + chmod +x /usr/local/bin/${file} + done # Clean up micromamba clean --all --yes From f545ba86781f7d87161cea7bdbf3bdb7750c5229 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 09:59:57 +0200 Subject: [PATCH 04/41] fix typo in renamed file --- workflow/rules/2.scaffolding/all.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index b379402..20b94da 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -1,4 +1,4 @@ -include: "01.ntjoin.smk" +include: "01.scaffolding.smk" include: "02.renaming.smk" include: "03.mummer.smk" From 26d9698320aa19f27d60a0b667d8969b2936f804 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 10:04:37 +0200 Subject: [PATCH 05/41] fix missing minlen expansion for hic --- workflow/rules/2.scaffolding/01.scaffolding.smk | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 28b61fd..46488e2 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -49,12 +49,12 @@ rule bwa_index_contigs: rule map_hic: input: - contigs = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa", - index1 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.0123", - index2 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.amb", - index3 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.ann", - index4 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.bwt.2bit.64", - index5 = "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.pac", + contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), + index1 = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.0123", minlen=config["min_contig_len"],), + index2 = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.amb", minlen=config["min_contig_len"],), + index3 = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.ann", minlen=config["min_contig_len"],), + index4 = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.bwt.2bit.64", minlen=config["min_contig_len"],), + index5 = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.pac", minlen=config["min_contig_len"],), forward = get_hic_1, backward = get_hic_2, output: @@ -88,9 +88,7 @@ rule filter_hic: rule haphic: input: - contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", - minlen=config["min_contig_len"], - ), + contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), hic = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", #gfa = "", #TODO: we could look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) output: From 9dd537e77995252e366acabdea3181da65041b63 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 11:54:37 +0200 Subject: [PATCH 06/41] fix argument for filter_bam in haphic --- workflow/rules/2.scaffolding/01.scaffolding.smk | 2 +- workflow/singularity/haphic/haphic.f8f7451.def | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 46488e2..f1efcd5 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -84,7 +84,7 @@ rule filter_hic: singularity: "workflow/singularity/haphic/haphic.f8f7451.sif" shell: - "(filter_bam.py {input} 1 --nm 3 --threads {threads} | samtools view - -b -@ $(({threads}-1)) -o {output}) &> {log}" + "(filter_bam.py {input} 1 --NM 3 --threads {threads} | samtools view - -b -@ $(({threads}-1)) -o {output}) &> {log}" rule haphic: input: diff --git a/workflow/singularity/haphic/haphic.f8f7451.def b/workflow/singularity/haphic/haphic.f8f7451.def index b0f4858..41628a1 100644 --- a/workflow/singularity/haphic/haphic.f8f7451.def +++ b/workflow/singularity/haphic/haphic.f8f7451.def @@ -28,6 +28,13 @@ exec micromamba run -n haphic /opt/HapHiC/utils/${file} "\$@" EOF chmod +x /usr/local/bin/${file} done + + # Create wrapper script for samtools + cat > /usr/local/bin/samtools << 'EOF' +#!/bin/bash +exec micromamba run -n haphic samtools "$@" +EOF + chmod +x /usr/local/bin/samtools # Clean up micromamba clean --all --yes From faaa54a29f4b0c1576c82c478a07638fbb0fb1ce Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 13:43:27 +0200 Subject: [PATCH 07/41] add output of haphic --- .../rules/2.scaffolding/01.scaffolding.smk | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index f1efcd5..f53501f 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -90,9 +90,11 @@ rule haphic: input: contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), hic = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", - #gfa = "", #TODO: we could look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) + #gfa = "", #TODO: we could at some point look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) output: - directory("results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/"), + fa = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.fa", + salse_agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.agp", + yahs_agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", log: "results/logs/2.scaffolding/haphic/{asmname}.log" benchmark: @@ -104,9 +106,31 @@ rule haphic: singularity: "workflow/singularity/haphic/haphic.f8f7451.sif" shell: - "haphic pipeline --threads {threads} --outdir {output} --verbose {input.contigs} {input.hic} {params.num_chr} &> {log}" - + "haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} &> {log}" +rule haphic_plot: + input: + agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", + bam = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + output: + pdf = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/scaffolds.pdf", + pkl = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/contact_matrix.pkl", + log: + "results/logs/2.scaffolding/haphic_plot/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/haphic_plot/{asmname}.txt" + singularity: + "workflow/singularity/haphic/haphic.f8f7451.sif" + shell: + # "haphic plot {input.agp} {input.bam} &> {log}" + """ + ( + agp="$(realpath {input.agp})" + bam="$(realpath {input.bam})" + cd $(dirname {output.pdf}) + haphic plot $agp $bam + ) &> {log} + """ From 642d6a86cd4f786a6ed75f3e155d153f90ad1bec Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 13:47:25 +0200 Subject: [PATCH 08/41] multithread haphic plot --- workflow/rules/2.scaffolding/01.scaffolding.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index f53501f..eef41b0 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -119,6 +119,8 @@ rule haphic_plot: "results/logs/2.scaffolding/haphic_plot/{asmname}.log" benchmark: "results/benchmarks/2.scaffolding/haphic_plot/{asmname}.txt" + threads: + 8 singularity: "workflow/singularity/haphic/haphic.f8f7451.sif" shell: @@ -128,7 +130,7 @@ rule haphic_plot: agp="$(realpath {input.agp})" bam="$(realpath {input.bam})" cd $(dirname {output.pdf}) - haphic plot $agp $bam + haphic plot --threads {threads} $agp $bam ) &> {log} """ From f55e59d29580eae3e2a350dc12ea4912a9ee9aaf Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 14:57:26 +0200 Subject: [PATCH 09/41] adding contact map of hi-c to report --- workflow/report/haphic.rst | 4 ++++ workflow/rules/2.scaffolding/01.scaffolding.smk | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 workflow/report/haphic.rst diff --git a/workflow/report/haphic.rst b/workflow/report/haphic.rst new file mode 100644 index 0000000..1332512 --- /dev/null +++ b/workflow/report/haphic.rst @@ -0,0 +1,4 @@ +This contact map shows the scaffolding effort done by the HapHiC tool. The +scaffolding is based on the Hi-C data and the assembly (reference-free). Please +use manual curation with e.g. Juicer to improve the scaffolding. A bash script +to create the necessary files for Juicer can be found at `results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh`. \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index eef41b0..cf1a74f 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -113,7 +113,11 @@ rule haphic_plot: agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", bam = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", output: - pdf = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/scaffolds.pdf", + pdf = report("results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/contact_map.pdf", + category="HapHiC", + caption="../../report/haphic.rst", + labels={"assembly": "{asmname}"} + ), pkl = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/contact_matrix.pkl", log: "results/logs/2.scaffolding/haphic_plot/{asmname}.log" @@ -124,7 +128,6 @@ rule haphic_plot: singularity: "workflow/singularity/haphic/haphic.f8f7451.sif" shell: - # "haphic plot {input.agp} {input.bam} &> {log}" """ ( agp="$(realpath {input.agp})" From fd70bb8e692b4555ffb87ebf81b432a3f4bc74bc Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:03:15 +0200 Subject: [PATCH 10/41] improve description hic contact map --- workflow/report/haphic.rst | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/workflow/report/haphic.rst b/workflow/report/haphic.rst index 1332512..bcaf264 100644 --- a/workflow/report/haphic.rst +++ b/workflow/report/haphic.rst @@ -1,4 +1,13 @@ This contact map shows the scaffolding effort done by the HapHiC tool. The -scaffolding is based on the Hi-C data and the assembly (reference-free). Please -use manual curation with e.g. Juicer to improve the scaffolding. A bash script -to create the necessary files for Juicer can be found at `results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh`. \ No newline at end of file +scaffolding is based on the Hi-C data and the assembly (reference-free). In the +contact map, the axes represent the groups of scaffolds that were scaffolded +together. The color intensity represents the number of Hi-C contacts between +the groups of scaffolds. These contacts are used to determine the physical +distance between sequences in the assembly. + +Please use manual curation with e.g. Juicer to improve the scaffolding. A bash +script to create the necessary files for Juicer can be found at `results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh` +Ideally, a diagonal should be seen from the left bottom to the top right, with +the centromeres, telomeres and other repeats as off-diagonal signals. In case +other off-diagonal signals are present, these indicate mis-joins during the +scaffolding process (or misassemblies in the assembly). \ No newline at end of file From 98603f6f683e1d1fdca9bf475c091eb8af6ae154 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:07:55 +0200 Subject: [PATCH 11/41] fix typos in report rst for haphic --- workflow/report/haphic.rst | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/workflow/report/haphic.rst b/workflow/report/haphic.rst index bcaf264..7aa0906 100644 --- a/workflow/report/haphic.rst +++ b/workflow/report/haphic.rst @@ -1,12 +1,17 @@ This contact map shows the scaffolding effort done by the HapHiC tool. The -scaffolding is based on the Hi-C data and the assembly (reference-free). In the -contact map, the axes represent the groups of scaffolds that were scaffolded -together. The color intensity represents the number of Hi-C contacts between +scaffolding is based on the Hi-C data and the assembly (thus reference-free). In +the contact map, the axes represent the groups of scaffolds that were scaffolded +together. The colour intensity represents the number of Hi-C contacts between the groups of scaffolds. These contacts are used to determine the physical distance between sequences in the assembly. -Please use manual curation with e.g. Juicer to improve the scaffolding. A bash -script to create the necessary files for Juicer can be found at `results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh` +Please use manual curation with e.g. Juicebox to improve the scaffolding. Run +the following command to create the necessary files for Juicebox: + +.. code-block:: bash + + singularity exec workflow/singularity/haphic/haphic.f8f7451.sif results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh + Ideally, a diagonal should be seen from the left bottom to the top right, with the centromeres, telomeres and other repeats as off-diagonal signals. In case other off-diagonal signals are present, these indicate mis-joins during the From 50c0cbf3155066e206dcb004ced1b7bb3b1f7a76 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:11:24 +0200 Subject: [PATCH 12/41] remove code for juicebox as it doesn't run --- workflow/report/haphic.rst | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/workflow/report/haphic.rst b/workflow/report/haphic.rst index 7aa0906..8fed425 100644 --- a/workflow/report/haphic.rst +++ b/workflow/report/haphic.rst @@ -5,12 +5,9 @@ together. The colour intensity represents the number of Hi-C contacts between the groups of scaffolds. These contacts are used to determine the physical distance between sequences in the assembly. -Please use manual curation with e.g. Juicebox to improve the scaffolding. Run -the following command to create the necessary files for Juicebox: - -.. code-block:: bash - - singularity exec workflow/singularity/haphic/haphic.f8f7451.sif results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh +Please use manual curation with e.g. Juicebox to improve the scaffolding. The +following bash script contains example commands to create the input: +`results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh` Ideally, a diagonal should be seen from the left bottom to the top right, with the centromeres, telomeres and other repeats as off-diagonal signals. In case From 67a0539feef2f86becf9a66ad911b14a04f674d2 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:35:13 +0200 Subject: [PATCH 13/41] try adding YaHS --- workflow/envs/yahs.yaml | 5 ++ .../rules/2.scaffolding/01.scaffolding.smk | 57 ++++++++++++++++--- 2 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 workflow/envs/yahs.yaml diff --git a/workflow/envs/yahs.yaml b/workflow/envs/yahs.yaml new file mode 100644 index 0000000..8a9e43d --- /dev/null +++ b/workflow/envs/yahs.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - yahs=1.2.2 \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index cf1a74f..2347535 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -58,7 +58,7 @@ rule map_hic: forward = get_hic_1, backward = get_hic_2, output: - "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.bam", + "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.bam", log: "results/logs/2.scaffolding/map_hic/{asmname}.log" benchmark: @@ -72,9 +72,9 @@ rule map_hic: rule filter_hic: input: - "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.bam", + "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.bam", output: - "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", log: "results/logs/2.scaffolding/filter_hic/{asmname}.log" benchmark: @@ -89,7 +89,7 @@ rule filter_hic: rule haphic: input: contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), - hic = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + hic = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", #gfa = "", #TODO: we could at some point look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) output: fa = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.fa", @@ -113,12 +113,13 @@ rule haphic_plot: agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", bam = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", output: - pdf = report("results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/contact_map.pdf", + pdf = report("results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf", category="HapHiC", caption="../../report/haphic.rst", - labels={"assembly": "{asmname}"} + labels={"assembly": "{asmname}", + "algorithm": "HapHiC"} ), - pkl = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/contact_matrix.pkl", + pkl = "results/{asmname}/2.scaffolding/01.haphic/contact_matrix.pkl", log: "results/logs/2.scaffolding/haphic_plot/{asmname}.log" benchmark: @@ -137,9 +138,47 @@ rule haphic_plot: ) &> {log} """ +rule yahs: + input: + contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), + hic = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", + output: + fa = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.fa", + agp = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.agp", + log: + "results/logs/2.scaffolding/yahs/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/yahs/{asmname}.txt" + params: + telomere_motif = config["telomere_motif"], + threads: + 8 + conda: + "../../envs/yahs.yaml" + shell: + """ + ( + mkdir -p $(dirname {output.fa}) + yahs --telo-motif {params.telomere_motif} -o $(echo ${output.fa} | sed 's/_scaffolds_final.fa$//g') {input.contigs} {input.hic} + ) &> {log} + """ - - +use rule haphic_plot as yahs_plot with: + input: + agp = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.agp", + bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", + output: + pdf = report("results/{asmname}/2.scaffolding/01.yahs/contact_map.pdf", + category="Yahs", + caption="../../report/yahs.rst", + labels={"assembly": "{asmname}", + "algorithm": "YaHS"} + ), + pkl = "results/{asmname}/2.scaffolding/01.yahs/contact_matrix.pkl", + log: + "results/logs/2.scaffolding/yahs_plot/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/yahs_plot/{asmname}.txt" From 38b8d7e6e714a3a55b504dfc68533f505bc1099e Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:39:16 +0200 Subject: [PATCH 14/41] fix location bam file for hic --- workflow/rules/2.scaffolding/01.scaffolding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 2347535..cd342ed 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -111,7 +111,7 @@ rule haphic: rule haphic_plot: input: agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", - bam = "results/{asmname}/2.scaffolding/01.haphic/{asmname}.hic.sorted.filtered.bam", + bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", output: pdf = report("results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf", category="HapHiC", From ceb06467e36e1a7aaa03305f46d9986cded3b9ab Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:42:24 +0200 Subject: [PATCH 15/41] fix typo --- workflow/rules/2.scaffolding/01.scaffolding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index cd342ed..5b94716 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -159,7 +159,7 @@ rule yahs: """ ( mkdir -p $(dirname {output.fa}) - yahs --telo-motif {params.telomere_motif} -o $(echo ${output.fa} | sed 's/_scaffolds_final.fa$//g') {input.contigs} {input.hic} + yahs --telo-motif {params.telomere_motif} -o $(echo {output.fa} | sed 's/_scaffolds_final.fa$//g') {input.contigs} {input.hic} ) &> {log} """ From 843d205aceb64b586d970a0fd194bc2347e82ed2 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 15:44:09 +0200 Subject: [PATCH 16/41] index contigs prior to yahs --- workflow/rules/1.assembly/02.contigs.smk | 14 ++++++++++++++ workflow/rules/2.scaffolding/01.scaffolding.smk | 1 + 2 files changed, 15 insertions(+) diff --git a/workflow/rules/1.assembly/02.contigs.smk b/workflow/rules/1.assembly/02.contigs.smk index e832541..36916bb 100644 --- a/workflow/rules/1.assembly/02.contigs.smk +++ b/workflow/rules/1.assembly/02.contigs.smk @@ -43,3 +43,17 @@ rule add_prefix: "../../envs/bioawk.yaml" shell: "bioawk -c fastx '{{ print \">{params.prefix}\" ++i; print $seq }}' {input} > {output} 2> {log}" + +rule index_contigs: + input: + "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa" + output: + "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.fai" + log: + "results/logs/1.assembly/index_contigs/{asmname}.min{minlen}.log" + benchmark: + "results/benchmarks/1.assembly/index_contigs/{asmname}.min{minlen}.txt" + conda: + "../../envs/samtools.yaml" + shell: + "samtools faidx {input} &> {log}" \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 5b94716..b0650c8 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -141,6 +141,7 @@ rule haphic_plot: rule yahs: input: contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), + contigs_fai = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.fai", minlen=config["min_contig_len"],), hic = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", output: fa = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.fa", From e5ba4751fc57d12c845b7f77f63f18e73b4eb8bd Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 16:26:47 +0200 Subject: [PATCH 17/41] update hi-c contact maps --- workflow/report/haphic.rst | 15 ---------- workflow/report/hic.rst | 11 +++++++ .../rules/2.scaffolding/01.scaffolding.smk | 30 +++++++++++++++---- 3 files changed, 36 insertions(+), 20 deletions(-) delete mode 100644 workflow/report/haphic.rst create mode 100644 workflow/report/hic.rst diff --git a/workflow/report/haphic.rst b/workflow/report/haphic.rst deleted file mode 100644 index 8fed425..0000000 --- a/workflow/report/haphic.rst +++ /dev/null @@ -1,15 +0,0 @@ -This contact map shows the scaffolding effort done by the HapHiC tool. The -scaffolding is based on the Hi-C data and the assembly (thus reference-free). In -the contact map, the axes represent the groups of scaffolds that were scaffolded -together. The colour intensity represents the number of Hi-C contacts between -the groups of scaffolds. These contacts are used to determine the physical -distance between sequences in the assembly. - -Please use manual curation with e.g. Juicebox to improve the scaffolding. The -following bash script contains example commands to create the input: -`results/{{ snakemake.wildcards.asmname }}/2.scaffolding/01.haphic/{{ snakemake.wildcards.asmname }}_HapHiC/04.build/juicebox.sh` - -Ideally, a diagonal should be seen from the left bottom to the top right, with -the centromeres, telomeres and other repeats as off-diagonal signals. In case -other off-diagonal signals are present, these indicate mis-joins during the -scaffolding process (or misassemblies in the assembly). \ No newline at end of file diff --git a/workflow/report/hic.rst b/workflow/report/hic.rst new file mode 100644 index 0000000..1f0cdbf --- /dev/null +++ b/workflow/report/hic.rst @@ -0,0 +1,11 @@ +This contact map shows the scaffolding effort using Hi-C data. In this contact +map, the axes represent the groups of scaffolds that were scaffolded together. +The colour intensity represents the number of Hi-C contacts between the groups +of scaffolds. These contacts are used to determine the physical distance between +sequences in the assembly. + +Ideally, a diagonal should be seen from the left bottom to the top right, with +the centromeres, telomeres and other repeats as off-diagonal signals. In case +other off-diagonal signals are present, these indicate mis-joins during the +scaffolding process (or misassemblies in the assembly). The scaffolding can be +improved manually using tools like Juicebox or 3D-DNA outside of this pipeline. \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index b0650c8..d264db8 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -114,8 +114,8 @@ rule haphic_plot: bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", output: pdf = report("results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf", - category="HapHiC", - caption="../../report/haphic.rst", + category="Hi-C", + caption="../../report/hic.rst", labels={"assembly": "{asmname}", "algorithm": "HapHiC"} ), @@ -170,8 +170,8 @@ use rule haphic_plot as yahs_plot with: bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", output: pdf = report("results/{asmname}/2.scaffolding/01.yahs/contact_map.pdf", - category="Yahs", - caption="../../report/yahs.rst", + category="Hi-C", + caption="../../report/hic.rst", labels={"assembly": "{asmname}", "algorithm": "YaHS"} ), @@ -181,7 +181,27 @@ use rule haphic_plot as yahs_plot with: benchmark: "results/benchmarks/2.scaffolding/yahs_plot/{asmname}.txt" - +use rule haphic_plot as ntjoin_plot with: + input: + agp = lambda wildcards: expand("results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", + reference=get_reference_id(wildcards.asmname), + minlen=config["min_contig_len"], + k=config["ntjoin_k"], + w=config["ntjoin_w"], + ), + bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", + output: + pdf = report("results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf", + category="Hi-C", + caption="../../report/hic.rst", + labels={"assembly": "{asmname}", + "algorithm": "ntJoin"} + ), + pkl = "results/{asmname}/2.scaffolding/01.ntjoin/contact_matrix.pkl", + log: + "results/logs/2.scaffolding/ntjoin_plot/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/ntjoin_plot/{asmname}.txt" From 7919a3068068d83c6d53722064c07e6f776f810d Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 16:27:52 +0200 Subject: [PATCH 18/41] fix typo in code --- workflow/rules/2.scaffolding/01.scaffolding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index d264db8..aab65b0 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -183,7 +183,7 @@ use rule haphic_plot as yahs_plot with: use rule haphic_plot as ntjoin_plot with: input: - agp = lambda wildcards: expand("results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", + agp = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/01.ntjoin/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", reference=get_reference_id(wildcards.asmname), minlen=config["min_contig_len"], k=config["ntjoin_k"], From 69e9fa54cd916d00b8f0343393dcec3ff5d9d000 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 16:59:25 +0200 Subject: [PATCH 19/41] add html for conversion table --- .../rules/2.scaffolding/01.scaffolding.smk | 1 + workflow/rules/2.scaffolding/02.renaming.smk | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index aab65b0..aa28069 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -190,6 +190,7 @@ use rule haphic_plot as ntjoin_plot with: w=config["ntjoin_w"], ), bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", + table = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.html", #making sure that the table is generated before the plot so that the report can be interpreted output: pdf = report("results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf", category="Hi-C", diff --git a/workflow/rules/2.scaffolding/02.renaming.smk b/workflow/rules/2.scaffolding/02.renaming.smk index 7a95fce..7c7ccb6 100644 --- a/workflow/rules/2.scaffolding/02.renaming.smk +++ b/workflow/rules/2.scaffolding/02.renaming.smk @@ -22,6 +22,29 @@ rule create_renaming_table: ) &> {log} """ +rule visualise_ntjoin_renaming: + input: + table = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/02.renaming/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.conversion.tsv", + reference=get_reference_id(wildcards.asmname), + minlen=config["min_contig_len"], + k=config["ntjoin_k"], + w=config["ntjoin_w"], + ), + output: + report("results/{asmname}/2.scaffolding/02.renaming/{asmname}.html", + category="Hi-C", + labels={"assembly": "{asmname}", + "algorithm": "ntJoin (conversion table)"} + ), + log: + "results/logs/2.scaffolding/visualise_ntjoin_renaming/{asmname}.log" + benchmark: + "results/benchmarks/2.scaffolding/visualise_ntjoin_renaming/{asmname}.txt" + conda: + "../../envs/csvtotable.yaml" + shell: + "csvtotable -d $'\\t' {input} {output} &> {log}" + rule renaming_scaffolds: input: all = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/01.ntjoin/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.fa", From e6fcf6ab7a603ebbbe6f30584440b1a607224225 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 17:38:38 +0200 Subject: [PATCH 20/41] update metadata hi-c plots in report --- workflow/rules/2.scaffolding/01.scaffolding.smk | 9 ++++++--- workflow/rules/2.scaffolding/02.renaming.smk | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index aa28069..8db120b 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -108,7 +108,7 @@ rule haphic: shell: "haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} &> {log}" -rule haphic_plot: +rule haphic_plot_hic: input: agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", @@ -117,6 +117,7 @@ rule haphic_plot: category="Hi-C", caption="../../report/hic.rst", labels={"assembly": "{asmname}", + "stage": "scaffolds", "algorithm": "HapHiC"} ), pkl = "results/{asmname}/2.scaffolding/01.haphic/contact_matrix.pkl", @@ -164,7 +165,7 @@ rule yahs: ) &> {log} """ -use rule haphic_plot as yahs_plot with: +use rule haphic_plot_hic as yahs_plot_hic with: input: agp = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.agp", bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", @@ -173,6 +174,7 @@ use rule haphic_plot as yahs_plot with: category="Hi-C", caption="../../report/hic.rst", labels={"assembly": "{asmname}", + "stage": "scaffolds", "algorithm": "YaHS"} ), pkl = "results/{asmname}/2.scaffolding/01.yahs/contact_matrix.pkl", @@ -181,7 +183,7 @@ use rule haphic_plot as yahs_plot with: benchmark: "results/benchmarks/2.scaffolding/yahs_plot/{asmname}.txt" -use rule haphic_plot as ntjoin_plot with: +use rule haphic_plot_hic as ntjoin_plot_hic with: input: agp = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/01.ntjoin/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", reference=get_reference_id(wildcards.asmname), @@ -196,6 +198,7 @@ use rule haphic_plot as ntjoin_plot with: category="Hi-C", caption="../../report/hic.rst", labels={"assembly": "{asmname}", + "stage": "scaffolds", "algorithm": "ntJoin"} ), pkl = "results/{asmname}/2.scaffolding/01.ntjoin/contact_matrix.pkl", diff --git a/workflow/rules/2.scaffolding/02.renaming.smk b/workflow/rules/2.scaffolding/02.renaming.smk index 7c7ccb6..cb72b33 100644 --- a/workflow/rules/2.scaffolding/02.renaming.smk +++ b/workflow/rules/2.scaffolding/02.renaming.smk @@ -34,6 +34,7 @@ rule visualise_ntjoin_renaming: report("results/{asmname}/2.scaffolding/02.renaming/{asmname}.html", category="Hi-C", labels={"assembly": "{asmname}", + "stage": "scaffolds", "algorithm": "ntJoin (conversion table)"} ), log: From b592c26faf9e0dcc4bec7ad1bdb9f46bf30b096f Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 17:42:18 +0200 Subject: [PATCH 21/41] add header to scaffolding conversion table --- workflow/rules/2.scaffolding/02.renaming.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/02.renaming.smk b/workflow/rules/2.scaffolding/02.renaming.smk index cb72b33..7728fa9 100644 --- a/workflow/rules/2.scaffolding/02.renaming.smk +++ b/workflow/rules/2.scaffolding/02.renaming.smk @@ -44,7 +44,7 @@ rule visualise_ntjoin_renaming: conda: "../../envs/csvtotable.yaml" shell: - "csvtotable -d $'\\t' {input} {output} &> {log}" + "awk 'BEGIN{{FS = OFS = \"\\t\"; print \"chromosome name\",\"scaffold name\";}} {{print;}}' {input.table} | csvtotable -d $'\\t' -o {output} &> {log}" rule renaming_scaffolds: input: From 9dab3ab1b7c6797590096f1ac9d261235777e432 Mon Sep 17 00:00:00 2001 From: worku005 Date: Fri, 25 Oct 2024 17:46:04 +0200 Subject: [PATCH 22/41] fix adding header conversion table --- workflow/rules/2.scaffolding/02.renaming.smk | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/02.renaming.smk b/workflow/rules/2.scaffolding/02.renaming.smk index 7728fa9..1257af5 100644 --- a/workflow/rules/2.scaffolding/02.renaming.smk +++ b/workflow/rules/2.scaffolding/02.renaming.smk @@ -44,7 +44,13 @@ rule visualise_ntjoin_renaming: conda: "../../envs/csvtotable.yaml" shell: - "awk 'BEGIN{{FS = OFS = \"\\t\"; print \"chromosome name\",\"scaffold name\";}} {{print;}}' {input.table} | csvtotable -d $'\\t' -o {output} &> {log}" + """ + ( + awk 'BEGIN{{FS = OFS = \"\\t\"; print \"chromosome name\",\"scaffold name\";}} {{print;}}' {input.table} > {input.table}.tmp + csvtotable -d $'\\t' -o {input.table}.tmp {output} + rm {input.table}.tmp + ) &> {log} + """ rule renaming_scaffolds: input: From bbd901802bed6d0dc5af6b4b964e7c5328b2ad67 Mon Sep 17 00:00:00 2001 From: worku005 Date: Mon, 28 Oct 2024 14:44:08 +0100 Subject: [PATCH 23/41] add hi-c plots to scaffolding output if they exist --- workflow/rules/2.scaffolding/all.smk | 10 ++++++++++ workflow/rules/common.smk | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index 20b94da..9084eed 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -21,10 +21,20 @@ def get_mummerplot_scaffolds(wildcards): filelist.append(f"results/{asmname}/2.scaffolding/03.mummer/{asmname}.vs.{reference}.plot.gp") return filelist +def get_hic_plot(wildcards): + filelist = [] + for asmname in get_all_accessions(): + if has_hic(asmname): + filelist.append(f"results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf") + filelist.append(f"results/{asmname}/2.scaffolding/01.yahs/contact_map.pdf") + filelist.append(f"results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf") + return filelist + rule scaffold: input: expand("final_output/{asmname}.full.fa", asmname = get_all_accessions()), get_mummerplot_scaffolds, + get_hic_plots, #TODO: temporary output: touch("results/scaffolding.done") diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index abc7c90..7287735 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -43,8 +43,8 @@ def get_illumina_1(wildcards): def get_illumina_2(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["illumina_2"].values.item() -def has_hic(wildcards): - return not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hic_1"].isnull().values.item() +def has_hic(asmname): + return not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(asmname)]["hic_1"].isnull().values.item() def get_hic_1(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hic_1"].values.item() From d61a4b30325435dece5973cbdb01d0e671471b39 Mon Sep 17 00:00:00 2001 From: worku005 Date: Mon, 28 Oct 2024 14:45:09 +0100 Subject: [PATCH 24/41] fix typo --- workflow/rules/2.scaffolding/all.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index 9084eed..254fc0e 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -21,7 +21,7 @@ def get_mummerplot_scaffolds(wildcards): filelist.append(f"results/{asmname}/2.scaffolding/03.mummer/{asmname}.vs.{reference}.plot.gp") return filelist -def get_hic_plot(wildcards): +def get_hic_plots(wildcards): filelist = [] for asmname in get_all_accessions(): if has_hic(asmname): From 2aa254a5827d2d0d000fedc3b2b8f248f7437784 Mon Sep 17 00:00:00 2001 From: worku005 Date: Wed, 30 Oct 2024 08:39:41 +0100 Subject: [PATCH 25/41] fix 'existing output dir for haphic' --- workflow/rules/2.scaffolding/01.scaffolding.smk | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 8db120b..0985e00 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -106,7 +106,12 @@ rule haphic: singularity: "workflow/singularity/haphic/haphic.f8f7451.sif" shell: - "haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} &> {log}" + """ + ( + [ -d $(dirname {output.fa}) ] && rm -rd $(dirname {output.fa}) + haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} + ) &> {log} + """ rule haphic_plot_hic: input: From a87b0cf8b9f7876ccb2c3ee72102261415430760 Mon Sep 17 00:00:00 2001 From: worku005 Date: Wed, 30 Oct 2024 08:42:15 +0100 Subject: [PATCH 26/41] fix rm command --- workflow/rules/2.scaffolding/01.scaffolding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index 0985e00..b13c643 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -108,7 +108,7 @@ rule haphic: shell: """ ( - [ -d $(dirname {output.fa}) ] && rm -rd $(dirname {output.fa}) + [ -d $(dirname {output.fa}) ] && rm -rf $(dirname {output.fa}) haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} ) &> {log} """ From d91d7791369b76a7923d2f8441e2b7dd95a16727 Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 09:47:53 +0100 Subject: [PATCH 27/41] include Hi-C for assembly --- workflow/rules/1.assembly/01.assembly.smk | 86 ++++++++++++++++++++++- workflow/rules/1.assembly/02.contigs.smk | 16 ++++- workflow/rules/common.smk | 3 + 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk index c6a076f..8f0ad59 100644 --- a/workflow/rules/1.assembly/01.assembly.smk +++ b/workflow/rules/1.assembly/01.assembly.smk @@ -16,6 +16,26 @@ rule hifiasm: shell: "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) {input.hifi} 2> {log}" +rule hifiasm_with_hic: + input: + hifi = get_hifi, + hic1 = get_hic_1, + hic2 = get_hic_2, + output: + hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.hap1.p_ctg.gfa", + hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.hap2.p_ctg.gfa", + consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.p_ctg.gfa", + log: + "results/logs/1.assembly/hifiasm/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/hifiasm/{asmname}.txt" + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/hifiasm.yaml" + shell: + "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.h2} {input.hifi} 2> {log}" + rule hifiasm_with_ont: input: hifi = get_hifi, @@ -35,6 +55,27 @@ rule hifiasm_with_ont: shell: "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" +rule hifiasm_with_hic_and_ont: + input: + hifi = get_hifi, + hic1 = get_hic_1, + hic2 = get_hic_2, + ont = get_ont, + output: + hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.hap1.p_ctg.gfa", + hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.hap2.p_ctg.gfa", + consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.p_ctg.gfa", + log: + "results/logs/1.assembly/hifiasm_with_hic_and_ont/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/hifiasm_with_hic_and_ont/{asmname}.txt" + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/hifiasm.yaml" + shell: + "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.h2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" + def get_hifiasm_output(wildcards): if get_haplotypes(wildcards) == 1: return "results/{asmname}/1.assembly/01.hifiasm_{ext}/{asmname}.bp.p_ctg.gfa" @@ -73,6 +114,26 @@ rule verkko: shell: "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} &> {log}" +rule verkko_with_hic: + input: + hifi = get_hifi, + hic1 = get_hic_1, + hic2 = get_hic_2, + output: + "results/{asmname}/1.assembly/01.verkko_hifi_and_hic/assembly.fasta", + log: + "results/logs/1.assembly/verkko_hifi_and_hic/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/verkko_hifi_and_hic/{asmname}.txt" + resources: + gbmem=320 #best to make it a multiple of 32 for verkko + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/verkko.yaml" + shell: + "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --hic1 {input.hic1} --hic2 {input.hic2} &> {log}" + rule verkko_with_ont: input: hifi = get_hifi, @@ -83,12 +144,35 @@ rule verkko_with_ont: "results/logs/1.assembly/verkko_hifi_and_ont/{asmname}.log" benchmark: "results/benchmarks/1.assembly/verkko_hifi_and_ont/{asmname}.txt" + resources: + gbmem=320 #best to make it a multiple of 32 for verkko + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/verkko.yaml" + shell: + "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --nano {input.ont} &> {log}" + +rule verkko_with_hic_and_ont: + input: + hifi = get_hifi, + hic1 = get_hic_1, + hic2 = get_hic_2, + ont = get_ont, + output: + "results/{asmname}/1.assembly/01.verkko_hifi_hic_and_ont/assembly.fasta", + log: + "results/logs/1.assembly/verkko_hifi_hic_and_ont/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/verkko_hifi_hic_and_ont/{asmname}.txt" + resources: + gbmem=320 #best to make it a multiple of 32 for verkko threads: min(max(workflow.cores - 1, 1), 50) conda: "../../envs/verkko.yaml" shell: - "verkko --threads {threads} -d $(dirname {output}) --hifi {input.hifi} --nano {input.ont} &> {log}" + "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --hic1 {input.hic1} --hic2 {input.hic2} --nano {input.ont} &> {log}" rule verkko_assembly_minimap2: input: diff --git a/workflow/rules/1.assembly/02.contigs.smk b/workflow/rules/1.assembly/02.contigs.smk index 36916bb..0b35d17 100644 --- a/workflow/rules/1.assembly/02.contigs.smk +++ b/workflow/rules/1.assembly/02.contigs.smk @@ -1,8 +1,18 @@ +def get_assembly(wildcards): + if has_ont(wildcards.asmname): + if has_hic(wildcards.asmname): + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_hic_and_ont/{wildcards.asmname}.bp.p_ctg.gfa" + else: + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_ont/{wildcards.asmname}.bp.p_ctg.gfa" + else: + if has_hic(wildcards.asmname): + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_hic/{wildcards.asmname}.bp.p_ctg.gfa" + else: + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_only/{wildcards.asmname}.bp.p_ctg.gfa" + rule filter_contigs: input: - lambda wildcards: branch(SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["ont"].isnull().values.item(), #check if ont is null - then=f"results/{{asmname}}/1.assembly/01.{config['assembler']}_hifi_only/{{asmname}}.fa", - otherwise=f"results/{{asmname}}/1.assembly/01.{config['assembler']}_hifi_and_ont/{{asmname}}.fa"), + get_assembly, output: "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.fa" log: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7287735..94204da 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -34,6 +34,9 @@ def get_all_accessions_from_asmset(asmset, minimum=2): def get_hifi(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hifi"].values.item().split(";") +def has_ont(asmname): + return not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(asmname)]["ont"].isnull().values.item() + def get_ont(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["ont"].values.item().split(";") From 49669aff921265d4355adad6c72bec0a14861c9b Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 10:46:54 +0100 Subject: [PATCH 28/41] remove hi-c from scaffolding completely but retain hi-C plots --- workflow/envs/yahs.yaml | 5 - .../rules/2.scaffolding/01.scaffolding.smk | 155 ++---------------- workflow/rules/2.scaffolding/02.renaming.smk | 4 +- workflow/rules/2.scaffolding/all.smk | 2 - .../5.quality_assessment/13.statistics.smk | 2 +- workflow/schemas/config.schema.yaml | 1 + 6 files changed, 17 insertions(+), 152 deletions(-) delete mode 100644 workflow/envs/yahs.yaml diff --git a/workflow/envs/yahs.yaml b/workflow/envs/yahs.yaml deleted file mode 100644 index 8a9e43d..0000000 --- a/workflow/envs/yahs.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - yahs=1.2.2 \ No newline at end of file diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.scaffolding.smk index b13c643..c479694 100644 --- a/workflow/rules/2.scaffolding/01.scaffolding.smk +++ b/workflow/rules/2.scaffolding/01.scaffolding.smk @@ -86,50 +86,29 @@ rule filter_hic: shell: "(filter_bam.py {input} 1 --NM 3 --threads {threads} | samtools view - -b -@ $(({threads}-1)) -o {output}) &> {log}" -rule haphic: +rule ntjoin_plot_hic: input: - contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), - hic = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", - #gfa = "", #TODO: we could at some point look into using GFA from hifiasm if hifiasm was used, but this interferes with the current renaming of contigs (HapHiC calls this feature EXPERIMENTAL!) - output: - fa = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.fa", - salse_agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.agp", - yahs_agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", - log: - "results/logs/2.scaffolding/haphic/{asmname}.log" - benchmark: - "results/benchmarks/2.scaffolding/haphic/{asmname}.txt" - params: - num_chr = lambda wildcards: len(config["reference_genomes"][get_reference_id(wildcards.asmname)]["chromosomes"]), #the number of chromosomes - threads: - 8 - singularity: - "workflow/singularity/haphic/haphic.f8f7451.sif" - shell: - """ - ( - [ -d $(dirname {output.fa}) ] && rm -rf $(dirname {output.fa}) - haphic pipeline --threads {threads} --outdir $(dirname $(dirname {output.fa})) --verbose {input.contigs} {input.hic} {params.num_chr} - ) &> {log} - """ - -rule haphic_plot_hic: - input: - agp = "results/{asmname}/2.scaffolding/01.haphic/{asmname}_HapHiC/04.build/scaffolds.raw.agp", + agp = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/01.ntjoin/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", + reference=get_reference_id(wildcards.asmname), + minlen=config["min_contig_len"], + k=config["ntjoin_k"], + w=config["ntjoin_w"], + ), bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", + table = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.html", #making sure that the table is generated before the plot so that the report can be interpreted output: - pdf = report("results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf", + pdf = report("results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf", category="Hi-C", caption="../../report/hic.rst", labels={"assembly": "{asmname}", "stage": "scaffolds", - "algorithm": "HapHiC"} + "algorithm": "ntJoin"} ), - pkl = "results/{asmname}/2.scaffolding/01.haphic/contact_matrix.pkl", + pkl = "results/{asmname}/2.scaffolding/01.ntjoin/contact_matrix.pkl", log: - "results/logs/2.scaffolding/haphic_plot/{asmname}.log" + "results/logs/2.scaffolding/ntjoin_plot/{asmname}.log" benchmark: - "results/benchmarks/2.scaffolding/haphic_plot/{asmname}.txt" + "results/benchmarks/2.scaffolding/ntjoin_plot/{asmname}.txt" threads: 8 singularity: @@ -143,111 +122,3 @@ rule haphic_plot_hic: haphic plot --threads {threads} $agp $bam ) &> {log} """ - -rule yahs: - input: - contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"],), - contigs_fai = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa.fai", minlen=config["min_contig_len"],), - hic = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", - output: - fa = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.fa", - agp = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.agp", - log: - "results/logs/2.scaffolding/yahs/{asmname}.log" - benchmark: - "results/benchmarks/2.scaffolding/yahs/{asmname}.txt" - params: - telomere_motif = config["telomere_motif"], - threads: - 8 - conda: - "../../envs/yahs.yaml" - shell: - """ - ( - mkdir -p $(dirname {output.fa}) - yahs --telo-motif {params.telomere_motif} -o $(echo {output.fa} | sed 's/_scaffolds_final.fa$//g') {input.contigs} {input.hic} - ) &> {log} - """ - -use rule haphic_plot_hic as yahs_plot_hic with: - input: - agp = "results/{asmname}/2.scaffolding/01.yahs/{asmname}_scaffolds_final.agp", - bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", - output: - pdf = report("results/{asmname}/2.scaffolding/01.yahs/contact_map.pdf", - category="Hi-C", - caption="../../report/hic.rst", - labels={"assembly": "{asmname}", - "stage": "scaffolds", - "algorithm": "YaHS"} - ), - pkl = "results/{asmname}/2.scaffolding/01.yahs/contact_matrix.pkl", - log: - "results/logs/2.scaffolding/yahs_plot/{asmname}.log" - benchmark: - "results/benchmarks/2.scaffolding/yahs_plot/{asmname}.txt" - -use rule haphic_plot_hic as ntjoin_plot_hic with: - input: - agp = lambda wildcards: expand("results/{{asmname}}/2.scaffolding/01.ntjoin/{{asmname}}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", - reference=get_reference_id(wildcards.asmname), - minlen=config["min_contig_len"], - k=config["ntjoin_k"], - w=config["ntjoin_w"], - ), - bam = "results/{asmname}/2.scaffolding/01.hic/{asmname}.hic.sorted.filtered.bam", - table = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.html", #making sure that the table is generated before the plot so that the report can be interpreted - output: - pdf = report("results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf", - category="Hi-C", - caption="../../report/hic.rst", - labels={"assembly": "{asmname}", - "stage": "scaffolds", - "algorithm": "ntJoin"} - ), - pkl = "results/{asmname}/2.scaffolding/01.ntjoin/contact_matrix.pkl", - log: - "results/logs/2.scaffolding/ntjoin_plot/{asmname}.log" - benchmark: - "results/benchmarks/2.scaffolding/ntjoin_plot/{asmname}.txt" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/workflow/rules/2.scaffolding/02.renaming.smk b/workflow/rules/2.scaffolding/02.renaming.smk index 1257af5..b999604 100644 --- a/workflow/rules/2.scaffolding/02.renaming.smk +++ b/workflow/rules/2.scaffolding/02.renaming.smk @@ -1,7 +1,7 @@ rule create_renaming_table: input: - agp= "results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", - mxdot= "results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.mx.dot", + agp = "results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.agp", + mxdot = "results/{asmname}/2.scaffolding/01.ntjoin/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.mx.dot", output: "results/{asmname}/2.scaffolding/02.renaming/{asmname}.vs.{reference}.min{minlen}.k{k}.w{w}.n2.all.scaffolds.conversion.tsv" log: diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index 254fc0e..d7b5a0a 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -25,8 +25,6 @@ def get_hic_plots(wildcards): filelist = [] for asmname in get_all_accessions(): if has_hic(asmname): - filelist.append(f"results/{asmname}/2.scaffolding/01.haphic/contact_map.pdf") - filelist.append(f"results/{asmname}/2.scaffolding/01.yahs/contact_map.pdf") filelist.append(f"results/{asmname}/2.scaffolding/01.ntjoin/contact_map.pdf") return filelist diff --git a/workflow/rules/5.quality_assessment/13.statistics.smk b/workflow/rules/5.quality_assessment/13.statistics.smk index c39878c..d956fc5 100644 --- a/workflow/rules/5.quality_assessment/13.statistics.smk +++ b/workflow/rules/5.quality_assessment/13.statistics.smk @@ -16,7 +16,7 @@ rule individual_statistics: benchmark: "results/benchmarks/5.quality_assessment/individual_statistics/{asmname}.txt" params: - inputdata = lambda wildcards: "HiFi+ONT" if not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["ont"].isnull().values.item() else "HiFi only", + inputdata = lambda wildcards: "HiFi+ONT+Hi-C" if has_ont(wildcards.asmname) and has_hic(wildcards.asmname) else "HiFi+ONT" if has_ont(wildcards.asmname) else "HiFi+Hi-C" if has_hic(wildcards.asmname) else "HiFi only", assembler = config["assembler"], conda: "../../envs/seqkit.yaml" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index c0bfc8e..eea3d5b 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -14,6 +14,7 @@ properties: type: string description: assembler to use; supported assemblers are hifiasm, verkko pattern: "^(hifiasm|verkko)$" + default: "hifiasm" min_contig_len: type: integer From 2e754910cf51b9ec31f91f63b582a6728806ce6c Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 10:47:13 +0100 Subject: [PATCH 29/41] update README with optional Hi-C data description --- README.md | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 75d49e9..a5b4c9f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # MoGAAAP (Modular Genome Assembly, Annotation and Assessment Pipeline) This repository contains a Snakemake pipeline for the assembly, scaffolding, analysis, annotation and quality assessment of HiFi-based assemblies. Although developed for a project in lettuce, the pipeline is designed to work with any organism. -The pipeline will work with both HiFi and ONT data, although the former is required. +The pipeline will work with both HiFi and ONT data, although only the former is required. ## Index - [Downloading the pipeline](#downloading-pipeline) @@ -95,17 +95,19 @@ All fields to fill in are well-documented in the provided `config/config.yaml` f The `config/samples.tsv` has the following columns to fill in (one row per sample): -| Column name | Description | -|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `accessionId` | The accession ID of the sample. This name has to be unique. | -| `hifi` | The path to the HiFi reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon. | -| `ont` | The path to the ONT reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon. | -| `illumina_1` | The path to the forward Illumina reads in FASTQ format. | -| `illumina_2` | The path to the reverse Illumina reads in FASTQ format. | -| `haplotypes` | The expected number of haplotypes in the assembly. Use 1 for (near) homozygous accessions and 2 for heterozygous accessions. NB: currently only 1 or 2 is supported. | -| `speciesName` | A name for the species that is used by Helixer to name novel genes. | -| `taxId` | The NCBI taxonomy ID of the species. | -| `referenceId` | A unique identifier for the reference genome for which genome (FASTA), annotation (GFF3) and chromosome names are provided in the `config/config.yaml` file. | +| Column name | Description | +|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `accessionId` | The accession ID of the sample. This name has to be unique. | +| `hifi` | The path to the HiFi reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon. | +| `ont` | OPTIONAL. The path to the ONT reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon. | +| `illumina_1` | OPTIONAL. The path to the forward Illumina reads in FASTQ format. | +| `illumina_2` | OPTIONAL. The path to the reverse Illumina reads in FASTQ format. | +| `hic_1` | OPTIONAL. The path to the forward Hi-C reads in FASTQ format. | +| `hic_2` | OPTIONAL. The path to the reverse Hi-C reads in FASTQ format. | +| `haplotypes` | The expected number of haplotypes in the assembly. Use 1 for (near) homozygous accessions and 2 for heterozygous accessions. NB: currently only 1 or 2 is supported. | +| `speciesName` | A name for the species that is used by Helixer to name novel genes. | +| `taxId` | The NCBI taxonomy ID of the species. | +| `referenceId` | A unique identifier for the reference genome for which genome (FASTA), annotation (GFF3) and chromosome names are provided in the `config/config.yaml` file. | Both `config/config.yaml` and `config/samples.tsv` files validated against a built-in schema that throws an error if the files are not correctly filled in. @@ -205,13 +207,14 @@ graph TD; #### Overview In the assemble module, HiFi reads are assembled using `hifiasm`. If ONT reads are given, these are used in the assembly process using the `--ul` parameter of `hifiasm`. +Also, if Hi-C reads are given, these are used in the assembly process. Since the output of `hifiasm` is a GFA file, we next convert the (consensus) primary contigs GFA to a FASTA file. In case the user has indicated that the accession is heterozygous, the two haplotype assemblies as outputted by `hifiasm` are converted to FASTA files instead. Finally, we produce an alignment of the (contig) assembly against the provided reference genome using `nucmer`. To prevent spurious alignments, we slightly increased the `-l` and `-g` parameter of `nucmer`. -As alternative to `hifiasm`, we also implemented `verkko` as it is known to work well with HiFi and ONT data. -For heterozygous accessions, `hapdup` is used to try separating the haplotypes based on HiFi alignment to the assembly. +As alternative to `hifiasm`, we also implemented `verkko` as it is known to work well with HiFi (and ONT and Hi-C) data. +For heterozygous accessions, `hapdup` is used to try separating the haplotypes based on HiFi alignment to the Verkko assembly. Please bear in mind that the pipeline was developed with `hifiasm` in mind, so although these other assemblers will technically work, the pipeline may not be optimally set up for them. In some preliminary tests, we found that `verkko` doesn't work well with heterozygous accessions, resulting in a partly phased assembly. @@ -232,6 +235,7 @@ In our experience, increasing the value for `ntjoin_w` resolves most issues when After scaffolding, the sequences in the scaffolded assembly are renamed to reflect their actual chromosome names according to the reference genome. Finally, `nucmer` is run again to produce an alignment plot for visual inspection of the scaffolding process. +If Hi-C reads were provided, the Hi-C contact map is also produced for visual inspection of the `ntJoin` scaffolding process. #### Next steps As the assembly as outputted by this module is used as starting point for the analyse, annotate and qa modules, it is crucial it matches the expectations in terms of size and chromosome number. @@ -290,6 +294,12 @@ In this case, the pipeline is not able to accurately discern which reference chr This lack of collinearity should also be visible in the MUMmerplots created by the `assemble` module. The only solution in this case would be to choose another (more closely related) reference genome and re-run the `assemble` module to check if this new reference genome is collinear with the assembly before continuing with the `scaffold` module. +### Q: Pipeline creates scaffolds that are obviously wrong +A: This issue can have multiple causes, but the most common one is that the `ntJoin` parameters are not set correctly. +From our own experience, increasing the value for `ntjoin_w` resolves most issues when no correct scaffolding is produced. +Also, it is important to keep in mind that this pipeline is not meant to create a perfect assembly, but to provide a starting point for human curation. +So feel free to adjust the pipeline or the assembly to your own needs! + ### Q: Pipeline crashes mid-run A: This is intended Snakemake behaviour: if any job fails, the pipeline will only finish current running jobs and then exit. As for the reason of stopping, please check the log file of the job that failed. From 556411a4e5376c8ad8144654dacbe7560dc1ff24 Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 10:49:13 +0100 Subject: [PATCH 30/41] put name of ntjoin smk file back --- .../rules/2.scaffolding/{01.scaffolding.smk => 01.ntjoin.smk} | 0 workflow/rules/2.scaffolding/all.smk | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename workflow/rules/2.scaffolding/{01.scaffolding.smk => 01.ntjoin.smk} (100%) diff --git a/workflow/rules/2.scaffolding/01.scaffolding.smk b/workflow/rules/2.scaffolding/01.ntjoin.smk similarity index 100% rename from workflow/rules/2.scaffolding/01.scaffolding.smk rename to workflow/rules/2.scaffolding/01.ntjoin.smk diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index d7b5a0a..cef85b7 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -1,4 +1,4 @@ -include: "01.scaffolding.smk" +include: "01.ntjoin.smk" include: "02.renaming.smk" include: "03.mummer.smk" From a7823780dd4e8c511a075a242c62686425736877 Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 10:53:31 +0100 Subject: [PATCH 31/41] remove old comment --- workflow/rules/2.scaffolding/all.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/2.scaffolding/all.smk b/workflow/rules/2.scaffolding/all.smk index cef85b7..acec38a 100644 --- a/workflow/rules/2.scaffolding/all.smk +++ b/workflow/rules/2.scaffolding/all.smk @@ -33,6 +33,6 @@ rule scaffold: expand("final_output/{asmname}.full.fa", asmname = get_all_accessions()), get_mummerplot_scaffolds, - get_hic_plots, #TODO: temporary + get_hic_plots, output: touch("results/scaffolding.done") From f6344d0f6dd5566881cb7addc4ccdcdea5cc5c60 Mon Sep 17 00:00:00 2001 From: worku005 Date: Tue, 5 Nov 2024 11:02:39 +0100 Subject: [PATCH 32/41] fix typo in input file names --- workflow/rules/1.assembly/01.assembly.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk index 8f0ad59..7601bc5 100644 --- a/workflow/rules/1.assembly/01.assembly.smk +++ b/workflow/rules/1.assembly/01.assembly.smk @@ -34,7 +34,7 @@ rule hifiasm_with_hic: conda: "../../envs/hifiasm.yaml" shell: - "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.h2} {input.hifi} 2> {log}" + "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} {input.hifi} 2> {log}" rule hifiasm_with_ont: input: @@ -74,7 +74,7 @@ rule hifiasm_with_hic_and_ont: conda: "../../envs/hifiasm.yaml" shell: - "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.h2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" + "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" def get_hifiasm_output(wildcards): if get_haplotypes(wildcards) == 1: From 00877035ee50ef85e5a32ce099467d39930c60be Mon Sep 17 00:00:00 2001 From: worku005 Date: Wed, 6 Nov 2024 09:41:02 +0100 Subject: [PATCH 33/41] fix hifiasm output when using hi-c --- workflow/rules/1.assembly/01.assembly.smk | 17 +++++++++-------- workflow/rules/1.assembly/02.contigs.smk | 8 ++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk index 7601bc5..4e674a4 100644 --- a/workflow/rules/1.assembly/01.assembly.smk +++ b/workflow/rules/1.assembly/01.assembly.smk @@ -22,9 +22,9 @@ rule hifiasm_with_hic: hic1 = get_hic_1, hic2 = get_hic_2, output: - hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.hap1.p_ctg.gfa", - hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.hap2.p_ctg.gfa", - consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.bp.p_ctg.gfa", + hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.hap1.p_ctg.gfa", + hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.hap2.p_ctg.gfa", + consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.p_ctg.gfa", log: "results/logs/1.assembly/hifiasm/{asmname}.log" benchmark: @@ -62,9 +62,9 @@ rule hifiasm_with_hic_and_ont: hic2 = get_hic_2, ont = get_ont, output: - hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.hap1.p_ctg.gfa", - hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.hap2.p_ctg.gfa", - consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.bp.p_ctg.gfa", + hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.hap1.p_ctg.gfa", + hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.hap2.p_ctg.gfa", + consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.p_ctg.gfa", log: "results/logs/1.assembly/hifiasm_with_hic_and_ont/{asmname}.log" benchmark: @@ -77,12 +77,13 @@ rule hifiasm_with_hic_and_ont: "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" def get_hifiasm_output(wildcards): + suffix = "bp" if has_hic(wildcards.asmname) else "bp" if get_haplotypes(wildcards) == 1: - return "results/{asmname}/1.assembly/01.hifiasm_{ext}/{asmname}.bp.p_ctg.gfa" + return f"results/{{asmname}}/1.assembly/01.hifiasm_{{ext}}/{{asmname}}.{suffix}.p_ctg.gfa" else: haplotype = int(wildcards.asmname[-1]) asmname = get_clean_accession_id(wildcards.asmname) - return f"results/{asmname}/1.assembly/01.hifiasm_{wildcards.ext}/{asmname}.bp.hap{haplotype}.p_ctg.gfa" + return f"results/{asmname}/1.assembly/01.hifiasm_{wildcards.ext}/{asmname}.{suffix}.hap{haplotype}.p_ctg.gfa" rule hifiasm_to_fasta: input: diff --git a/workflow/rules/1.assembly/02.contigs.smk b/workflow/rules/1.assembly/02.contigs.smk index 0b35d17..d413cab 100644 --- a/workflow/rules/1.assembly/02.contigs.smk +++ b/workflow/rules/1.assembly/02.contigs.smk @@ -1,14 +1,14 @@ def get_assembly(wildcards): if has_ont(wildcards.asmname): if has_hic(wildcards.asmname): - return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_hic_and_ont/{wildcards.asmname}.bp.p_ctg.gfa" + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_hic_and_ont/{wildcards.asmname}.fa" else: - return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_ont/{wildcards.asmname}.bp.p_ctg.gfa" + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_ont/{wildcards.asmname}.fa" else: if has_hic(wildcards.asmname): - return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_hic/{wildcards.asmname}.bp.p_ctg.gfa" + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_hic/{wildcards.asmname}.fa" else: - return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_only/{wildcards.asmname}.bp.p_ctg.gfa" + return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_only/{wildcards.asmname}.fa" rule filter_contigs: input: From 59562d3552fcb702df538eeb81cf013dbae9de33 Mon Sep 17 00:00:00 2001 From: worku005 Date: Wed, 6 Nov 2024 09:42:37 +0100 Subject: [PATCH 34/41] fix typo --- workflow/rules/1.assembly/01.assembly.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk index 4e674a4..9f7be80 100644 --- a/workflow/rules/1.assembly/01.assembly.smk +++ b/workflow/rules/1.assembly/01.assembly.smk @@ -77,7 +77,7 @@ rule hifiasm_with_hic_and_ont: "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}" def get_hifiasm_output(wildcards): - suffix = "bp" if has_hic(wildcards.asmname) else "bp" + suffix = "hic" if has_hic(wildcards.asmname) else "bp" if get_haplotypes(wildcards) == 1: return f"results/{{asmname}}/1.assembly/01.hifiasm_{{ext}}/{{asmname}}.{suffix}.p_ctg.gfa" else: From c132f4125946c5bd25e21eb22d306a2530df9075 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 09:24:46 +0100 Subject: [PATCH 35/41] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16eebcf..391caaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ All notable changes to MoGAAAP will be documented in this file. +## [Unreleased] + +## Added +- Add optional use of Hi-C during assembly and for visualising ntJoin contact map (#56). + ## [0.1.2] - 2024-10-24 ## Fixed From 9f88be641d58136c706fb710d6ef76025cec5105 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 09:43:55 +0100 Subject: [PATCH 36/41] add K-mer completeness to statistics table --- workflow/report/statistics.rst | 9 ++------- workflow/rules/5.quality_assessment/13.statistics.smk | 6 ++++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/workflow/report/statistics.rst b/workflow/report/statistics.rst index 5573db5..3d40e98 100644 --- a/workflow/report/statistics.rst +++ b/workflow/report/statistics.rst @@ -1,8 +1,3 @@ The overall statistics of a defined set of assemblies as per configuration. Only -HiFi reads are shown here, even if ONT reads were used in the assembly and/or -Illumina reads for the QA. - -One important remark about the (un)assigned sequences is that chromosomes that -were assembled as a single contig are counted among the unassigned sequences -instead of the assigned sequences. Otherwise, assigned sequences contain all -chromosomes. +HiFi reads are shown here for k-mer assessments, even if ONT reads were used in +the assembly and/or Illumina reads for the QA. diff --git a/workflow/rules/5.quality_assessment/13.statistics.smk b/workflow/rules/5.quality_assessment/13.statistics.smk index d956fc5..cdbc760 100644 --- a/workflow/rules/5.quality_assessment/13.statistics.smk +++ b/workflow/rules/5.quality_assessment/13.statistics.smk @@ -2,7 +2,8 @@ rule individual_statistics: input: assembly = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.fa", contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"]), - qv = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/hifi/{{asmname}}_vs_hifi.qv", k=config["k_qa"]), + qv = expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/hifi/{{asmname}}_vs_hifi.qv", k=config["k_qa"]), + kmerstats = expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/hifi/{{asmname}}_vs_hifi.completeness.stats", k=config["k_qa"]), full_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.gff", coding_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.coding.gff", output: @@ -23,7 +24,7 @@ rule individual_statistics: shell: """ ( - printf "Name\\tTotal length\\t#sequences\\tN50\\t#genes (full)\\t#genes (coding)\\t#transcripts (coding)\\t#chromosomes\\tTotal length (chromosomes)\\t#unassigned sequences\\tTotal length (unassigned sequences)\\tTotal QV (HiFi)\\t#contigs\\tContig N50\\tInput data\\tAssembler\\n" > {output.tsv} + printf "Name\\tTotal length\\t#sequences\\tN50\\t#genes (full)\\t#genes (coding)\\t#transcripts (coding)\\t#chromosomes\\tTotal length (chromosomes)\\t#unassigned sequences\\tTotal length (unassigned sequences)\\tTotal QV (HiFi)\\tK-mer completeness (HiFi)\\t#contigs\\tContig N50\\tInput data\\tAssembler\\n" > {output.tsv} printf "{wildcards.asmname}\\t" >> {output.tsv} seqkit stats -abTj1 {input.assembly} > {output.assembly} awk 'BEGIN{{FS = "\\t";}} NR==2{{printf "%s\\t%s\\t%s\\t", $5,$4,$13;}}' {output.assembly} >> {output.tsv} @@ -35,6 +36,7 @@ rule individual_statistics: seqkit grep -vrp "Chr" {input.assembly} | seqkit stats -abTj1 > {output.unassigned} awk 'BEGIN{{FS = "\\t";}} NR==2{{printf "%s\\t%s\\t", $4,$5;}}' {output.unassigned} >> {output.tsv} awk 'BEGIN{{FS = "\\t";}} NR==1{{printf "%s\\t", $4;}}' {input.qv} >> {output.tsv} + awk 'BEGIN{{FS = "\\t";}} NR==1{{printf "%s\\t", $5;}}' {input.kmerstats} >> {output.tsv} seqkit stats -abTj1 {input.contigs} > {output.contigs} awk 'BEGIN{{FS = "\\t";}} NR==2{{printf "%s\\t%s\\t", $4,$13;}}' {output.contigs} >> {output.tsv} printf "{params.inputdata}\\t" >> {output.tsv} From c0039893d4c7393348792e18190269ed70f43b88 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 09:51:24 +0100 Subject: [PATCH 37/41] use illumina for final stats if available, otherwise stick with hifi --- workflow/rules/5.quality_assessment/13.statistics.smk | 7 ++++--- workflow/rules/common.smk | 11 +++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/workflow/rules/5.quality_assessment/13.statistics.smk b/workflow/rules/5.quality_assessment/13.statistics.smk index cdbc760..d32b815 100644 --- a/workflow/rules/5.quality_assessment/13.statistics.smk +++ b/workflow/rules/5.quality_assessment/13.statistics.smk @@ -2,8 +2,8 @@ rule individual_statistics: input: assembly = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.fa", contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"]), - qv = expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/hifi/{{asmname}}_vs_hifi.qv", k=config["k_qa"]), - kmerstats = expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/hifi/{{asmname}}_vs_hifi.completeness.stats", k=config["k_qa"]), + qv = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.qv", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).tolower()), + kmerstats = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.completeness.stats", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).tolower()), full_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.gff", coding_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.coding.gff", output: @@ -19,12 +19,13 @@ rule individual_statistics: params: inputdata = lambda wildcards: "HiFi+ONT+Hi-C" if has_ont(wildcards.asmname) and has_hic(wildcards.asmname) else "HiFi+ONT" if has_ont(wildcards.asmname) else "HiFi+Hi-C" if has_hic(wildcards.asmname) else "HiFi only", assembler = config["assembler"], + best_wgstype = lambda wildcards: get_best_wgstype(wildcards.asmname) conda: "../../envs/seqkit.yaml" shell: """ ( - printf "Name\\tTotal length\\t#sequences\\tN50\\t#genes (full)\\t#genes (coding)\\t#transcripts (coding)\\t#chromosomes\\tTotal length (chromosomes)\\t#unassigned sequences\\tTotal length (unassigned sequences)\\tTotal QV (HiFi)\\tK-mer completeness (HiFi)\\t#contigs\\tContig N50\\tInput data\\tAssembler\\n" > {output.tsv} + printf "Name\\tTotal length\\t#sequences\\tN50\\t#genes (full)\\t#genes (coding)\\t#transcripts (coding)\\t#chromosomes\\tTotal length (chromosomes)\\t#unassigned sequences\\tTotal length (unassigned sequences)\\tTotal QV ({params.best_wgstype})\\tK-mer completeness ({params.best_wgstype})\\t#contigs\\tContig N50\\tInput data\\tAssembler\\n" > {output.tsv} printf "{wildcards.asmname}\\t" >> {output.tsv} seqkit stats -abTj1 {input.assembly} > {output.assembly} awk 'BEGIN{{FS = "\\t";}} NR==2{{printf "%s\\t%s\\t%s\\t", $5,$4,$13;}}' {output.assembly} >> {output.tsv} diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 94204da..02d5fd5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -31,6 +31,14 @@ def get_all_accessions_from_asmset(asmset, minimum=2): accessions = [] return accessions +def get_best_wgstype(asmname): + """ + Return the best wgstype for a given asmname: if Illumina is available, return "Illumina", otherwise return "HiFi". + """ + if has_illumina(asmname): + return "Illumina" + return "HiFi" + def get_hifi(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["hifi"].values.item().split(";") @@ -40,6 +48,9 @@ def has_ont(asmname): def get_ont(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["ont"].values.item().split(";") +def has_illumina(asmname): + return not SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(asmname)]["illumina_1"].isnull().values.item() + def get_illumina_1(wildcards): return SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["illumina_1"].values.item() From 14c19fd690242ac22020578f3646bd578875d57d Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 09:53:03 +0100 Subject: [PATCH 38/41] fix typo --- workflow/rules/5.quality_assessment/13.statistics.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/5.quality_assessment/13.statistics.smk b/workflow/rules/5.quality_assessment/13.statistics.smk index d32b815..bae7c85 100644 --- a/workflow/rules/5.quality_assessment/13.statistics.smk +++ b/workflow/rules/5.quality_assessment/13.statistics.smk @@ -2,8 +2,8 @@ rule individual_statistics: input: assembly = "results/{asmname}/2.scaffolding/02.renaming/{asmname}.fa", contigs = expand("results/{{asmname}}/1.assembly/02.contigs/{{asmname}}.min{minlen}.sorted.renamed.fa", minlen=config["min_contig_len"]), - qv = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.qv", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).tolower()), - kmerstats = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.completeness.stats", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).tolower()), + qv = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.qv", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).lower()), + kmerstats = lambda wildcards: expand("results/{{asmname}}/5.quality_assessment/01.merqury/{k}/{wgstype}/{{asmname}}_vs_{wgstype}.completeness.stats", k=config["k_qa"], wgstype=get_best_wgstype(wildcards.asmname).lower()), full_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.gff", coding_annotation = "results/{asmname}/4.annotation/03.combined/{asmname}.coding.gff", output: From 46c3792b9504b61f18ca86782fa62a3985730616 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 10:02:33 +0100 Subject: [PATCH 39/41] update report text statistics --- workflow/report/statistics.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/report/statistics.rst b/workflow/report/statistics.rst index 3d40e98..80cc1d8 100644 --- a/workflow/report/statistics.rst +++ b/workflow/report/statistics.rst @@ -1,3 +1,3 @@ -The overall statistics of a defined set of assemblies as per configuration. Only -HiFi reads are shown here for k-mer assessments, even if ONT reads were used in -the assembly and/or Illumina reads for the QA. +The overall statistics of a defined set of assemblies as per configuration. If +Illumina data is available, Illumina data is used for k-mer statistics, +otherwise HiFi data is used. From 3a43e431cc210f8f8ffdd8300b76daabc524f5ee Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 10:02:44 +0100 Subject: [PATCH 40/41] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 391caaa..da2b2a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ All notable changes to MoGAAAP will be documented in this file. ## Added - Add optional use of Hi-C during assembly and for visualising ntJoin contact map (#56). +- Add k-mer completeness statistics (#57). +- Use Illumina instead of HiFi for final stats if available (#57). ## [0.1.2] - 2024-10-24 From 380d718d818e30c46d22be0784a31006488c6aa2 Mon Sep 17 00:00:00 2001 From: worku005 Date: Thu, 7 Nov 2024 10:09:50 +0100 Subject: [PATCH 41/41] create release v0.2.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da2b2a6..d621de4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ All notable changes to MoGAAAP will be documented in this file. -## [Unreleased] +## [0.2.0] - 2024-11-07 ## Added - Add optional use of Hi-C during assembly and for visualising ntJoin contact map (#56).