Merge pull request #13 from bcgsc/ntjoin_multi_ref_fix

Prepare for v1.0.0 release
bcgsc · Sep 25, 2023 · 33c12ab · 33c12ab
2 parents 606c52f + d348212
commit 33c12ab
Show file tree

Hide file tree

Showing 7 changed files with 229 additions and 153 deletions.
diff --git a/README.md b/README.md
@@ -40,8 +40,25 @@ git clone https://github.com/bcgsc/mtGrasp.git
 * ntCard
 
 ---
+### Installation Instructions for Dependencies
+#### Step 1: 
 
-#### Special Installation Instructions for MITOS
+Recommended (Faster):
+```
+conda create -n mtgrasp python=3.10 mamba
+conda activate mtgrasp
+mamba install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard
+```
+
+Alternative (Slower):
+```
+conda create -n mtgrasp python=3.10
+conda activate mtgrasp
+conda install -c conda-forge -c bioconda snakemake 'blast>=2.10.0' biopython seqtk abyss ntjoin bwa samtools pilon ntcard
+```
+
+
+#### Step 2: Special Installation Instructions for MITOS
 
 
 As MITOS uses an older Python version, please install it in a new conda environment called "mitos" using the instructions below:
@@ -121,16 +138,17 @@ However, if such sequences are unavailable, you can move up the taxonomic hierar
 
 `-b` or `--sealer_k=STRING`: k-mer size(s) used in sealer gap filling ['60,80,100,120']
 
-`-e` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]
+`-sf` or `--end_recov_sealer_fpr=N`: False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]
 
-`-v` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120']
+`-sk` or `--end_recov_sealer_k=STRING`: k-mer size used in sealer flanking end recovery ['60,80,100,120']
 
 `-i` or `--end_recov_p=N`: Merge at most N alternate paths during sealer flanking end recovery; use 'nolimit' for no limit [5]
 
 `-t` or `--threads=N`: number of threads to use [8]
 
 `-ma` or `--mismatch_allowed=N`: Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1]
 
+`-v` or `--version`: Print out the version of mtGrasp and exit
 
 
 ---
@@ -193,28 +211,6 @@ If you are not interested in the standardized mitogenome sequence(s) or MITOS an
 
 
 
-### Summarize mtGrasp results
-
-```
-mtgrasp_summarize.py -i <Input text file> -t <Output tsv file> -l <Output text file>
-```
-Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file `<Input text file>`. The output tsv file `<Output tsv file>` will contain the following columns:
-
-`Assembly`: the name of the assembly output folder along with the k and kc values used for the assembly
-
-`Number of sequences`: the number of mitochondrial sequences generated by mtGrasp
-
-`Scaffold lengths`: the length(s) of the mitochondrial sequence(s) generated by mtGrasp
-
-`Standardization status`: whether the mitochondrial sequence(s) generated by mtGrasp is standardized or not
-
-`Number of gaps (pre-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp before gap filling
-
-`Number of gaps (post-GapFilling)`: the number of gaps in the mitochondrial sequence(s) generated by mtGrasp after gap filling
-
-
-Finally, `<Output text file>` will contain the full path(s) to the assembly output fasta file(s)
-
 ---
 ### Standardize your own mitochondrial sequence(s) using mtGrasp
 If you have your own mitochondrial sequence(s) and would like to standardize it/them using mtGrasp, you can do so by using mtGrasp's `mtgrasp_standardize.py` script. 
@@ -263,5 +259,30 @@ Because mtGrasp annotation uses a third-party tool called [MITOS](https://www.sc
 
 
 
+---
+
+### Summarize mtGrasp results
+
+```
+mtgrasp_summarize.py -i <Input text file> -p <Prefix of the summary files>
+```
+
+Here, this script will summarize the mtGrasp results for all assembly output folders listed in the input text file `<Input text file>`. The output tsv file `{prefix}_mtgrasp_{mtgrasp_version}_assembly_summary.tsv'` will contain the following columns:
+
+`Assembly`: the name of the assembly output folder along with the k and kc values (number of read pairs if `-sub` is enabled) used for the assembly
+
+`Ns per 1000bp`: the number of Ns per 1000bp in the mitochondrial sequence(s) generated by mtGrasp
+
+`Number of Contigs`: the number of mitochondrial sequences generated by mtGrasp
+
+`Total Number of Base Pairs Per Assembly`: the total number of base pairs in the mitochondrial sequence(s) generated by mtGrasp
+
+`Length of the Lonest Contig (bp)`: the length of the longest mitochondrial sequence generated by mtGrasp
+
+`Circular or Linear`: whether the mitochondrial sequence(s) generated by mtGrasp is circular or linear
+
+`Standardization Status`: whether the mitochondrial sequence(s) generated by mtGrasp is start-site standardized and strand standardized
+
 
+Finally, `{prefix}_mtgrasp_{mtgrasp_version}_path_to_output.txt` will contain the complete path(s) to the assembly output fasta file(s)
 
diff --git a/create_references_for_ntjoin.py b/create_references_for_ntjoin.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+# Split a fasta file into multiple files, one per sequence and create a reference config file for ntJoin
+# Usage: split_fasta.py <fasta file> <output directory> <config file>
+
+import sys
+
+ntjoin_reference_weight = 2 #Refer to ntJoin README for more info: https://github.com/bcgsc/ntJoin
+input = sys.argv[1]
+output = sys.argv[2]
+config = sys.argv[3]
+ref_list = []
+with open(input, 'r') as f:
+    for line in f:
+       	line = line.strip()
+        if line.startswith('>'):
+            header = line        
+            filename = header.split(' ')[0].strip('>') + '.fasta'
+        else:
+            sequence = line
+            with open(output + '/' + filename, 'w') as g:
+                g.write(header + '\n')
+                g.write(sequence + '\n')
+            ref_list.append(f'{output}/{filename},{ntjoin_reference_weight}')
+
+with open(config, 'w') as f:
+    f.write('\n'.join(ref_list))
+
+
diff --git a/mtgrasp.py b/mtgrasp.py
@@ -4,6 +4,7 @@
 import argparse
 import subprocess
 import shlex
+mtgrasp_version = 'mtGrasp v1.0.0' # Make sure to edit the version for future releases
 
 parser = argparse.ArgumentParser(description='Usage of mtGrasp')
 parser.add_argument('-r1', '--read1', help='Forward read fastq.gz file', required=True)
@@ -19,15 +20,16 @@
 parser.add_argument('-s', '--sealer_fpr', help='False positive rate for the bloom filter used by sealer during gap filling [0.01]', default = 0.01)
 parser.add_argument('-p', '--gap_filling_p', help='Merge at most N alternate paths during sealer gap filling step [5]', default = 5)
 parser.add_argument('-b', '--sealer_k', help='k-mer size used in sealer gap filling [60,80,100,120]', default = '60,80,100,120')
-parser.add_argument('-e', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01)
-parser.add_argument('-v', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120')
+parser.add_argument('-sf', '--end_recov_sealer_fpr', help='False positive rate for the bloom filter used by sealer during flanking end recovery [0.01]', default = 0.01)
+parser.add_argument('-sk', '--end_recov_sealer_k', help='k-mer size used in sealer flanking end recovery [60,80,100,120]', default = '60,80,100,120')
 parser.add_argument('-i', '--end_recov_p', help='Merge at most N alternate paths during sealer flanking end recovery [5]', default = 5)
 parser.add_argument('-u', '--unlock', help='Remove a lock implemented by snakemake on the working directory', action='store_true')
 parser.add_argument('-ma', '--mismatch_allowed', help='Maximum number of mismatches allowed while determining the overlapping region between the two ends of the mitochondrial assembly [1]', default = 1)
 parser.add_argument('-sub', '--subsample', help='Subsample N read pairs from two paired FASTQ files [2000000]', default = 2000000)
 parser.add_argument('-nsub', '--nosubsample', help='Run mtGrasp using the entire read dataset without subsampling [False]', action='store_true')
 parser.add_argument('-an', '--annotate', help='Run gene annotation on the final assembly output [False]', action='store_true')
 parser.add_argument('-d', '--delete', help='Delete intermediate subdirectories/files once mtGrasp reaches completion [False]', action='store_true')
+parser.add_argument('-v', '--version', action="version", version=mtgrasp_version)
 
 
 
@@ -57,6 +59,7 @@
 
 
 
+
 # get the directory of the mtgrasp.smk script
 string = subprocess.check_output(['which', 'mtgrasp.smk'])
 string = string.decode('utf-8')
@@ -96,6 +99,7 @@
                                {read1_base} {read2_base} {script_dir} {threads} {mt_gen} {kmer} \
                                {kc} {ref_path}  {abyss_fpr} {sealer_fpr} {p} {sealer_k} \
                                {end_recov_sealer_fpr} {end_recov_p} {end_recov_sealer_k} {mismatch_allowed} 'No' "))
+
 else:
     print('Please double check mtGrasp usage information')
     exit(1)

diff --git a/mtgrasp.smk b/mtgrasp.smk
@@ -1,4 +1,7 @@
 # Snakemake file for mtGrasp pipeline
+# Make sure to edit the version for new releases
+mtgrasp_version = 'v1.0.0'
+
 
 import os.path
 import shlex
@@ -19,7 +22,7 @@ else:
 # Start of the pipeline
 rule all:
      input:
-        expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa", library = config["out_dir"], k = config["k"], kc = config["kc"])
+        expand(current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version), library = config["out_dir"], k = config["k"], kc = config["kc"])
 
 
 
@@ -186,8 +189,8 @@ rule create_lists:
      input:
          rules.blast.output
      output:
-         ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt",
-         query_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt"
+         ref_list=current_dir + "{library}/blast/k{k}_kc{kc}-ref.txt",
+         query_list=current_dir + "{library}/blast/k{k}_kc{kc}-query.txt"
      benchmark:
          current_dir + "{library}/benchmark/k{k}_kc{kc}.create_lists.benchmark.txt"
      shell:
@@ -202,14 +205,18 @@ rule extract_seq:
          assemblies=rules.select_length.output
      output:
          query_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-scaffolds.blast-mt_db.fa",
-         ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa"
+         ref_out=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}-ref.fa",
      benchmark:
          current_dir + "{library}/benchmark/k{k}_kc{kc}.extract_seq.benchmark.txt"
      params:
-         ref_fasta=config["ref_path"]
+         ref_fasta=config["ref_path"],
+         ref_outdir=current_dir + "{library}/blast/mito_refs",
+         ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv"
      shell:
          "seqtk subseq {input.assemblies} {input.query} > {output.query_out} ; "
-         "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out}"
+         "seqtk subseq {params.ref_fasta} {input.ref} > {output.ref_out} ;"
+         " mkdir -p {params.ref_outdir} && create_references_for_ntjoin.py {output.ref_out} {params.ref_outdir} {params.ref_config}"
+
 
 
 #Prepare for polishing: ntJoin+Sealer or Sealer
@@ -228,19 +235,18 @@ rule pre_polishing:
         sealer_fpr=config['sealer_fpr'],
         threads=config['threads'],
         p=config['p'],
-        k=config['sealer_k']
-      benchmark:
-        current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt"
-      log:
+        k=config['sealer_k'],
+        ref_config = current_dir + "{library}/blast/k{k}_kc{kc}-ntjoin_ref_config.csv",
         sealer=current_dir + "{library}/prepolishing/k{k}_kc{kc}_sealer.log",
         ntjoin=current_dir + "{library}/mito_filtering_output/k{k}_kc{kc}_ntjoin.log"
-
+      benchmark:
+        current_dir + "{library}/benchmark/k{k}_kc{kc}.pre_polishing.benchmark.txt"
       run: 
           import os
           target = os.path.basename(input.target)
           ref = os.path.basename(input.ref)
-          log_ntjoin = os.path.basename(log.ntjoin)
-          log_sealer = log.sealer
+          log_ntjoin = os.path.basename(params.ntjoin)
+          log_sealer = params.sealer
           bf = bf_sealer(params.r1, params.r2, wildcards.library, params.threads, params.sealer_fpr,params.k)
           count = sum(1 for line in open(input[0]))
           k = k_string_converter(params.k)
@@ -249,7 +255,7 @@ rule pre_polishing:
           if count == 0:
                 print(f"Input file {input.target} is empty, no mitochondrial sequence found.")
                 exit(1)
-          if num_gaps != 0 and count == 2:
+          elif num_gaps != 0 and count == 2:
             print("---Start Sealer Gap Filling---")
             print("One-piece contig found, no ntJoin scaffolding needed")
             shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {input.target} {params.r1} {params.r2} &> {log_sealer}""")
@@ -261,9 +267,15 @@ rule pre_polishing:
 
           # If multiple contigs are found, both ntJoin and Sealer are needed  
           else: 
-               print("Multiple contigs found, ntJoin scaffolding and Gap-filling are both needed")
-               shell("""bash run_ntjoin.sh {params.workdir} {target} {ref} {log_ntjoin} {params.threads}""") 
-               shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out}  {params.r1} {params.r2} &> {log_sealer}""")
+               print("Multiple contigs found, ntJoin scaffolding starts")
+               shell("""run_ntjoin.sh {params.workdir} {target} {params.ref_config} {log_ntjoin} {params.threads}""") 
+               # check gaps need to be filled or not post-ntJoin
+               if check_gaps(params.ntjoin_out) == 0:
+                  print("---No Gaps Found After ntJoin, Gap Filling Not Needed---")
+                  shell("cp {params.ntjoin_out} {output}")
+               else:
+                  print("---Gap Found After ntJoin, Start Sealer Gap Filling---")
+                  shell("""abyss-sealer -b{bf} -j {params.threads} -vv {k} -P {params.p} -o {params.out} -S {params.ntjoin_out}  {params.r1} {params.r2} &> {log_sealer}""")
 
 
 
@@ -348,7 +360,7 @@ rule standardization:
         input:
             rules.end_recovery.output
         output:
-            current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp-assembly.fa"
+            current_dir + "{library}/final_output/{library}_k{k}_kc{kc}/{library}_k{k}_kc{kc}.final-mtgrasp_%s-assembly.fa"%(mtgrasp_version)
         benchmark:
             current_dir + "{library}/benchmark/k{k}_kc{kc}.standardization.benchmark.txt"
         params:

diff --git a/mtgrasp_standardize.py b/mtgrasp_standardize.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Make sure to edit the version for new releases
+mtgrasp_version = 'v1.0.0'
+
 import argparse
 import sys
 import shlex
@@ -187,7 +190,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
 if os.stat(file).st_size == 0:
     print('File is empty, no start-site standardization needed')
     # write an empty fasta file
-    with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f:
+    with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f:
          f.write('')
 
 
@@ -197,7 +200,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
 elif num_of_assemblies > 1 and scenario_in_file == False:
     print('Multiple contigs in the fasta file, standardization cannot be performed')
     # write the original fasta file to the output directory
-    with open('%s/%s.final-mtgrasp-assembly.fa'%(output_dir,sample), 'w') as f:
+    with open('%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir,sample, mtgrasp_version), 'w') as f:
             for record in SeqIO.parse(file, "fasta"):
                 f.write('>Non-Standardized_%s_seq\n' % (sample) + str(record.seq) + '\n')
 # if the file has only one sequence or has 2 sequences (but 'Scenario' is found in file)
@@ -271,7 +274,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
 
 
       standardized_seq = remove_duplicates_in_a_list(standardized_seq)
-      file_name = '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample)
+      file_name = '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version)
 
 
       # Store the results of the function and file reading in variables
@@ -301,7 +304,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
 
 # Run MitoS annotation on the final assembly fasta file if '-a' argument is provided
 if annotate:
-      print('Start Annotating %s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample))
+      print('Start Annotating %s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version))
       if not os.path.exists(f'{output_dir}/annotation_output'):
          cmd1 = f'mkdir -p {output_dir}/annotation_output' 
          cmd_shlex1 = shlex.split(cmd1)
@@ -310,7 +313,7 @@ def run_mitos(env_name, file, code, dir, script_dir):
 
 
       try:
-          output = run_mitos("mitos", '%s/%s.final-mtgrasp-assembly.fa'%(output_dir, sample), mito_gencode,f'{output_dir}/annotation_output', script_dir)
+          output = run_mitos("mitos", '%s/%s.final-mtgrasp_%s-assembly.fa'%(output_dir, sample, mtgrasp_version), mito_gencode,f'{output_dir}/annotation_output', script_dir)
           print(f"Output: {output}")
       except Exception as e:
           print(f"Error: {e}")