Merge pull request #58 from dirkjanvw/release_v0.2.0

Release v0.2.0
dirkjanvw · Nov 7, 2024 · bd3ebfb · bd3ebfb
2 parents 27c559a + 380d718
commit bd3ebfb
Show file tree

Hide file tree

Showing 17 changed files with 393 additions and 38 deletions.
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 All notable changes to MoGAAAP will be documented in this file.
 
+## [0.2.0] - 2024-11-07
+
+## Added
+- Add optional use of Hi-C during assembly and for visualising ntJoin contact map (#56).
+- Add k-mer completeness statistics (#57).
+- Use Illumina instead of HiFi for final stats if available (#57).
+
 ## [0.1.2] - 2024-10-24
 
 ## Fixed

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # MoGAAAP (Modular Genome Assembly, Annotation and Assessment Pipeline)
 This repository contains a Snakemake pipeline for the assembly, scaffolding, analysis, annotation and quality assessment of HiFi-based assemblies.
 Although developed for a project in lettuce, the pipeline is designed to work with any organism.
-The pipeline will work with both HiFi and ONT data, although the former is required.
+The pipeline will work with both HiFi and ONT data, although only the former is required.
 
 ## Index
 - [Downloading the pipeline](#downloading-pipeline)
@@ -95,17 +95,19 @@ All fields to fill in are well-documented in the provided `config/config.yaml` f
 
 The `config/samples.tsv` has the following columns to fill in (one row per sample):
 
-| Column name     | Description                                                                                                                                                          |
-|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `accessionId`   | The accession ID of the sample. This name has to be unique.                                                                                                          |
-| `hifi`          | The path to the HiFi reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon.                                         |
-| `ont`           | The path to the ONT reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon.                                          |
-| `illumina_1`    | The path to the forward Illumina reads in FASTQ format.                                                                                                              |
-| `illumina_2`    | The path to the reverse Illumina reads in FASTQ format.                                                                                                              |
-| `haplotypes`    | The expected number of haplotypes in the assembly. Use 1 for (near) homozygous accessions and 2 for heterozygous accessions. NB: currently only 1 or 2 is supported. |
-| `speciesName`   | A name for the species that is used by Helixer to name novel genes.                                                                                                  |
-| `taxId`         | The NCBI taxonomy ID of the species.                                                                                                                                 |
-| `referenceId`   | A unique identifier for the reference genome for which genome (FASTA), annotation (GFF3) and chromosome names are provided in the `config/config.yaml` file.         |
+| Column name   | Description                                                                                                                                                          |
+|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `accessionId` | The accession ID of the sample. This name has to be unique.                                                                                                          |
+| `hifi`        | The path to the HiFi reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon.                                         |
+| `ont`         | OPTIONAL. The path to the ONT reads in FASTQ or FASTA format. Multiple libraries can be provided by separating them with a semicolon.                                |
+| `illumina_1`  | OPTIONAL. The path to the forward Illumina reads in FASTQ format.                                                                                                    |
+| `illumina_2`  | OPTIONAL. The path to the reverse Illumina reads in FASTQ format.                                                                                                    |
+| `hic_1`       | OPTIONAL. The path to the forward Hi-C reads in FASTQ format.                                                                                                        |
+| `hic_2`       | OPTIONAL. The path to the reverse Hi-C reads in FASTQ format.                                                                                                        |
+| `haplotypes`  | The expected number of haplotypes in the assembly. Use 1 for (near) homozygous accessions and 2 for heterozygous accessions. NB: currently only 1 or 2 is supported. |
+| `speciesName` | A name for the species that is used by Helixer to name novel genes.                                                                                                  |
+| `taxId`       | The NCBI taxonomy ID of the species.                                                                                                                                 |
+| `referenceId` | A unique identifier for the reference genome for which genome (FASTA), annotation (GFF3) and chromosome names are provided in the `config/config.yaml` file.         |
 
 Both `config/config.yaml` and `config/samples.tsv` files validated against a built-in schema that throws an error if the files are not correctly filled in.
 
@@ -205,13 +207,14 @@ graph TD;
 #### Overview
 In the assemble module, HiFi reads are assembled using `hifiasm`.
 If ONT reads are given, these are used in the assembly process using the `--ul` parameter of `hifiasm`.
+Also, if Hi-C reads are given, these are used in the assembly process.
 Since the output of `hifiasm` is a GFA file, we next convert the (consensus) primary contigs GFA to a FASTA file.
 In case the user has indicated that the accession is heterozygous, the two haplotype assemblies as outputted by `hifiasm` are converted to FASTA files instead.
 Finally, we produce an alignment of the (contig) assembly against the provided reference genome using `nucmer`.
 To prevent spurious alignments, we slightly increased the `-l` and `-g` parameter of `nucmer`.
 
-As alternative to `hifiasm`, we also implemented `verkko` as it is known to work well with HiFi and ONT data.
-For heterozygous accessions, `hapdup` is used to try separating the haplotypes based on HiFi alignment to the assembly.
+As alternative to `hifiasm`, we also implemented `verkko` as it is known to work well with HiFi (and ONT and Hi-C) data.
+For heterozygous accessions, `hapdup` is used to try separating the haplotypes based on HiFi alignment to the Verkko assembly.
 Please bear in mind that the pipeline was developed with `hifiasm` in mind, so although these other assemblers will technically work, the pipeline may not be optimally set up for them.
 In some preliminary tests, we found that `verkko` doesn't work well with heterozygous accessions, resulting in a partly phased assembly.
 
@@ -232,6 +235,7 @@ In our experience, increasing the value for `ntjoin_w` resolves most issues when
 
 After scaffolding, the sequences in the scaffolded assembly are renamed to reflect their actual chromosome names according to the reference genome.
 Finally, `nucmer` is run again to produce an alignment plot for visual inspection of the scaffolding process.
+If Hi-C reads were provided, the Hi-C contact map is also produced for visual inspection of the `ntJoin` scaffolding process.
 
 #### Next steps
 As the assembly as outputted by this module is used as starting point for the analyse, annotate and qa modules, it is crucial it matches the expectations in terms of size and chromosome number.
@@ -290,6 +294,12 @@ In this case, the pipeline is not able to accurately discern which reference chr
 This lack of collinearity should also be visible in the MUMmerplots created by the `assemble` module.
 The only solution in this case would be to choose another (more closely related) reference genome and re-run the `assemble` module to check if this new reference genome is collinear with the assembly before continuing with the `scaffold` module.
 
+### Q: Pipeline creates scaffolds that are obviously wrong
+A: This issue can have multiple causes, but the most common one is that the `ntJoin` parameters are not set correctly.
+From our own experience, increasing the value for `ntjoin_w` resolves most issues when no correct scaffolding is produced.
+Also, it is important to keep in mind that this pipeline is not meant to create a perfect assembly, but to provide a starting point for human curation.
+So feel free to adjust the pipeline or the assembly to your own needs!
+
 ### Q: Pipeline crashes mid-run
 A: This is intended Snakemake behaviour: if any job fails, the pipeline will only finish current running jobs and then exit.
 As for the reason of stopping, please check the log file of the job that failed.

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1,4 +1,4 @@
-accessionId	hifi	ont	illumina_1	illumina_2	haplotypes	speciesName	taxId	referenceId
-accessionA	"resources/reads/HiFi/readsA1.fasta;resources/reads/HiFi/readsA2.fasta"		"resources/reads/sample1/library1_1.fq.gz"	"resources/reads/sample1/library1_2.fq.gz"	1	"speciesSp"	123	referenceA
-accessionB	"resources/reads/HiFi/readsB.fasta"	"resources/reads/ONT/readsB1.fasta;resources/reads/ONT/readsB2.fasta"	""	""	2	"speciesSp"	123	referenceA
-accessionC	"resources/reads/HiFi/readsC.fasta"	""	""	""	1	"speciesSd"	456	referenceB
+accessionId	hifi	ont	illumina_1	illumina_2	hic_1	hic_2	haplotypes	speciesName	taxId	referenceId
+accessionA	"resources/reads/HiFi/readsA1.fasta;resources/reads/HiFi/readsA2.fasta"		"resources/reads/sample1/library1_1.fq.gz"	"resources/reads/sample1/library1_2.fq.gz"	""	""	1	"speciesSp"	123	referenceA
+accessionB	"resources/reads/HiFi/readsB.fasta"	"resources/reads/ONT/readsB1.fasta;resources/reads/ONT/readsB2.fasta"	""	""	""	""	2	"speciesSp"	123	referenceA
+accessionC	"resources/reads/HiFi/readsC.fasta"	""	""	""	""	""	1	"speciesSd"	456	referenceB
diff --git a/workflow/envs/mapping.yaml b/workflow/envs/mapping.yaml
@@ -4,4 +4,5 @@ channels:
 dependencies:
   - bwa-mem2=2.2.1
   - minimap2=2.26
-  - samtools=1.19
+  - samtools=1.19
+  - samblaster=0.1.26
diff --git a/workflow/report/hic.rst b/workflow/report/hic.rst
@@ -0,0 +1,11 @@
+This contact map shows the scaffolding effort using Hi-C data. In this contact
+map, the axes represent the groups of scaffolds that were scaffolded together.
+The colour intensity represents the number of Hi-C contacts between the groups
+of scaffolds. These contacts are used to determine the physical distance between
+sequences in the assembly.
+
+Ideally, a diagonal should be seen from the left bottom to the top right, with
+the centromeres, telomeres and other repeats as off-diagonal signals. In case
+other off-diagonal signals are present, these indicate mis-joins during the
+scaffolding process (or misassemblies in the assembly). The scaffolding can be
+improved manually using tools like Juicebox or 3D-DNA outside of this pipeline.
diff --git a/workflow/report/statistics.rst b/workflow/report/statistics.rst
@@ -1,8 +1,3 @@
-The overall statistics of a defined set of assemblies as per configuration. Only
-HiFi reads are shown here, even if ONT reads were used in the assembly and/or
-Illumina reads for the QA.
-
-One important remark about the (un)assigned sequences is that chromosomes that
-were assembled as a single contig are counted among the unassigned sequences
-instead of the assigned sequences. Otherwise, assigned sequences contain all
-chromosomes.
+The overall statistics of a defined set of assemblies as per configuration. If
+Illumina data is available, Illumina data is used for k-mer statistics,
+otherwise HiFi data is used.
diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk
@@ -16,6 +16,26 @@ rule hifiasm:
     shell:
         "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) {input.hifi} 2> {log}"
 
+rule hifiasm_with_hic:
+    input:
+        hifi = get_hifi,
+        hic1 = get_hic_1,
+        hic2 = get_hic_2,
+    output:
+        hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.hap1.p_ctg.gfa",
+        hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.hap2.p_ctg.gfa",
+        consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_and_hic/{asmname}.hic.p_ctg.gfa",
+    log:
+        "results/logs/1.assembly/hifiasm/{asmname}.log"
+    benchmark:
+        "results/benchmarks/1.assembly/hifiasm/{asmname}.txt"
+    threads:
+        min(max(workflow.cores - 1, 1), 50)
+    conda:
+        "../../envs/hifiasm.yaml"
+    shell:
+        "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} {input.hifi} 2> {log}"
+
 rule hifiasm_with_ont:
     input:
         hifi = get_hifi,
@@ -35,13 +55,35 @@ rule hifiasm_with_ont:
     shell:
         "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}"
 
+rule hifiasm_with_hic_and_ont:
+    input:
+        hifi = get_hifi,
+        hic1 = get_hic_1,
+        hic2 = get_hic_2,
+        ont = get_ont,
+    output:
+        hap1 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.hap1.p_ctg.gfa",
+        hap2 = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.hap2.p_ctg.gfa",
+        consensus = "results/{asmname}/1.assembly/01.hifiasm_hifi_hic_and_ont/{asmname}.hic.p_ctg.gfa",
+    log:
+        "results/logs/1.assembly/hifiasm_with_hic_and_ont/{asmname}.log"
+    benchmark:
+        "results/benchmarks/1.assembly/hifiasm_with_hic_and_ont/{asmname}.txt"
+    threads:
+        min(max(workflow.cores - 1, 1), 50)
+    conda:
+        "../../envs/hifiasm.yaml"
+    shell:
+        "hifiasm -t {threads} -o $(echo {output.consensus} | rev | cut -d '.' -f 4- | rev) --h1 {input.hic1} --h2 {input.hic2} --ul $(echo {input.ont} | sed 's/ /,/g') {input.hifi} 2> {log}"
+
 def get_hifiasm_output(wildcards):
+    suffix = "hic" if has_hic(wildcards.asmname) else "bp"
     if get_haplotypes(wildcards) == 1:
-        return "results/{asmname}/1.assembly/01.hifiasm_{ext}/{asmname}.bp.p_ctg.gfa"
+        return f"results/{{asmname}}/1.assembly/01.hifiasm_{{ext}}/{{asmname}}.{suffix}.p_ctg.gfa"
     else:
         haplotype = int(wildcards.asmname[-1])
         asmname = get_clean_accession_id(wildcards.asmname)
-        return f"results/{asmname}/1.assembly/01.hifiasm_{wildcards.ext}/{asmname}.bp.hap{haplotype}.p_ctg.gfa"
+        return f"results/{asmname}/1.assembly/01.hifiasm_{wildcards.ext}/{asmname}.{suffix}.hap{haplotype}.p_ctg.gfa"
 
 rule hifiasm_to_fasta:
     input:
@@ -73,6 +115,26 @@ rule verkko:
     shell:
         "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} &> {log}"
 
+rule verkko_with_hic:
+    input:
+        hifi = get_hifi,
+        hic1 = get_hic_1,
+        hic2 = get_hic_2,
+    output:
+        "results/{asmname}/1.assembly/01.verkko_hifi_and_hic/assembly.fasta",
+    log:
+        "results/logs/1.assembly/verkko_hifi_and_hic/{asmname}.log"
+    benchmark:
+        "results/benchmarks/1.assembly/verkko_hifi_and_hic/{asmname}.txt"
+    resources:
+        gbmem=320  #best to make it a multiple of 32 for verkko
+    threads:
+        min(max(workflow.cores - 1, 1), 50)
+    conda:
+        "../../envs/verkko.yaml"
+    shell:
+        "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --hic1 {input.hic1} --hic2 {input.hic2} &> {log}"
+
 rule verkko_with_ont:
     input:
         hifi = get_hifi,
@@ -83,12 +145,35 @@ rule verkko_with_ont:
         "results/logs/1.assembly/verkko_hifi_and_ont/{asmname}.log"
     benchmark:
         "results/benchmarks/1.assembly/verkko_hifi_and_ont/{asmname}.txt"
+    resources:
+        gbmem=320  #best to make it a multiple of 32 for verkko
+    threads:
+        min(max(workflow.cores - 1, 1), 50)
+    conda:
+        "../../envs/verkko.yaml"
+    shell:
+        "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --nano {input.ont} &> {log}"
+
+rule verkko_with_hic_and_ont:
+    input:
+        hifi = get_hifi,
+        hic1 = get_hic_1,
+        hic2 = get_hic_2,
+        ont = get_ont,
+    output:
+        "results/{asmname}/1.assembly/01.verkko_hifi_hic_and_ont/assembly.fasta",
+    log:
+        "results/logs/1.assembly/verkko_hifi_hic_and_ont/{asmname}.log"
+    benchmark:
+        "results/benchmarks/1.assembly/verkko_hifi_hic_and_ont/{asmname}.txt"
+    resources:
+        gbmem=320  #best to make it a multiple of 32 for verkko
     threads:
         min(max(workflow.cores - 1, 1), 50)
     conda:
         "../../envs/verkko.yaml"
     shell:
-        "verkko --threads {threads} -d $(dirname {output}) --hifi {input.hifi} --nano {input.ont} &> {log}"
+        "verkko --local-memory {resources.gbmem} --local-cpus {threads} -d $(dirname {output}) --hifi {input.hifi} --hic1 {input.hic1} --hic2 {input.hic2} --nano {input.ont} &> {log}"
 
 rule verkko_assembly_minimap2:
     input:

diff --git a/workflow/rules/1.assembly/02.contigs.smk b/workflow/rules/1.assembly/02.contigs.smk
@@ -1,8 +1,18 @@
+def get_assembly(wildcards):
+    if has_ont(wildcards.asmname):
+        if has_hic(wildcards.asmname):
+            return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_hic_and_ont/{wildcards.asmname}.fa"
+        else:
+            return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_ont/{wildcards.asmname}.fa"
+    else:
+        if has_hic(wildcards.asmname):
+            return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_and_hic/{wildcards.asmname}.fa"
+        else:
+            return f"results/{wildcards.asmname}/1.assembly/01.{config['assembler']}_hifi_only/{wildcards.asmname}.fa"
+
 rule filter_contigs:
     input:
-        lambda wildcards: branch(SAMPLES[SAMPLES["accessionId"] == get_clean_accession_id(wildcards.asmname)]["ont"].isnull().values.item(), #check if ont is null
-                then=f"results/{{asmname}}/1.assembly/01.{config['assembler']}_hifi_only/{{asmname}}.fa",
-                otherwise=f"results/{{asmname}}/1.assembly/01.{config['assembler']}_hifi_and_ont/{{asmname}}.fa"),
+        get_assembly,
     output:
         "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.fa"
     log:
@@ -43,3 +53,17 @@ rule add_prefix:
         "../../envs/bioawk.yaml"
     shell:
         "bioawk -c fastx '{{ print \">{params.prefix}\" ++i; print $seq }}' {input} > {output} 2> {log}"
+
+rule index_contigs:
+    input:
+        "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa"
+    output:
+        "results/{asmname}/1.assembly/02.contigs/{asmname}.min{minlen}.sorted.renamed.fa.fai"
+    log:
+        "results/logs/1.assembly/index_contigs/{asmname}.min{minlen}.log"
+    benchmark:
+        "results/benchmarks/1.assembly/index_contigs/{asmname}.min{minlen}.txt"
+    conda:
+        "../../envs/samtools.yaml"
+    shell:
+        "samtools faidx {input} &> {log}"