[MRG] add a flag to prevent metagenome download from SRA (#157)

* add 'prevent_sra_download' flag * update docs for prevent_sra_download * add test for prevent_sra_download * add parameter to defaults.conf so it'll be in showconf * fix test * add missing file * update config file documentation * add new parameter to config check
dib-lab · Feb 12, 2022 · 72c44c9 · 72c44c9
1 parent deaaa0c
commit 72c44c9
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 1 deletion.
diff --git a/doc/configuring.md b/doc/configuring.md
@@ -20,6 +20,25 @@ Much of the configuration for genome-grist is about where to find more informati
 
 For Genbank genomes, this is easy and genome-grist does it automatically! But if you're providing your own genomes and taxonomy information, it's a bit trickier.
 
+## Analyzing your own metagenomes
+
+You can provide a list of SRA run accessions in the `samples:` list in
+the config file, and genome-grist will automatically download, interleave,
+and trim them for you.
+
+If you want to run genome-grist on your own metagenomes, you need to
+provide one FASTQ file for each sample in the `abundtrim/`
+subdirectory of the output directory; for example, for the output
+directory `outputs.private` and the sample named `podar`, you would
+need to create `outputs.private/abundtrim/podar.abundtrim.fq.gz`.
+This should be an interleaved file of Illumina reads, as generated by
+(for example) `seqtk mergepe`.
+
+Providing correctly-named files will bypass automatic SRA downloads for
+those samples, and genome-grist will still download any remaining samples.
+If you want to prevent automatic downloading from the SRA completely,
+you can set the parameter `prevent_sra_download: true` in the config file.
+
 ## Using Genbank genomes
 
 For Genbank genomes, all the necessary information is available already, or automatically determined by genome-grist.
@@ -49,7 +68,7 @@ For now, we suggest naming the first sequence in each FASTA file with the genome
 
 ### Creating one or more sourmash databases
 
-You can mix local databases with genbank databases without fear! You'll need to provide one or more sourmash databases for any local collections, and you do this as usual via the config paramter `sourmash_databases`, which takes a list of paths to sourmash database locations.
+You can mix local databases with genbank databases without fear! You'll need to provide one or more sourmash databases for any local collections, and you do this as usual via the config parameter `sourmash_databases`, which takes a list of paths to sourmash database locations.
 
 To build your own sourmash databases, you'll need sourmash sketches for each genome. Sketch all your genomes with the following command:
 ```
@@ -312,6 +331,10 @@ taxonomies:
 local_databases_info: 
 - /path/to/local-sourmash-db/database3.info.csv
 
+# prevent_sra_download: turn off download of metagenomes from SRA by sample ID.
+# DEFAULT: false.
+prevent_sra_download: false
+
 # picklist: a --picklist argument to use when searching the sourmash database, to limit which signatures to search.
 # see sourmash command line documentation for more details.
 # EXAMPLE:

diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile
@@ -105,6 +105,7 @@ known_config_keys = { 'samples', 'outdir', 'tempdir',
                       'sourmash_sigtype',
                       'prefetch_memory',
                       'taxonomies',
+                      'prevent_sra_download',
                       'picklist' }
 
 unknown_config_keys = all_config_keys - known_config_keys
@@ -453,11 +454,21 @@ rule download_sra_wc:
         temp_r1 =  f"{base_tempdir}/{{sample}}.d/{{sample}}_1.fastq",
         temp_r2 =  f"{base_tempdir}/{{sample}}.d/{{sample}}_2.fastq",
         temp_unp = f"{base_tempdir}/{{sample}}.d/{{sample}}.fastq",
+    params:
+        do_not_run_me = 1 if config.get("prevent_sra_download", False) else 0,
+        outdir = outdir,
     threads: 6
     conda: "env/sra.yml"
     resources:
         mem_mb=40000,
     shell: '''
+        if [ "{params.do_not_run_me}" = 1 ]; then
+            echo "** genome-grist is trying to download from SRA for sample {wildcards.sample},"
+            echo "** but 'prevent_sra_download' is set to true in config."
+            echo "** Does '{params.outdir}/abundtrim/{wildcards.sample}.abundtrim.fq.gz' exist?"
+            exit -1
+        fi
+
         echo tmp directory: {output.temp_dir}
         echo running fasterq-dump for {wildcards.sample}
 

diff --git a/genome_grist/conf/defaults.conf b/genome_grist/conf/defaults.conf
@@ -11,3 +11,6 @@ tempdir:
 
 # cache genbank genome info in ./genbank_cache/ by default
 genbank_cache: ./genbank_cache/
+
+# false = allow automatic download of metagenomes from SRA; set to true to block it
+prevent_sra_download: false
diff --git a/tests/test-data/test-block-sra.conf b/tests/test-data/test-block-sra.conf
@@ -0,0 +1,7 @@
+samples:
+- SRR5950647
+outdir: outputs.pytest
+sourmash_databases:
+- tests/test-data/SRR5950647.x.gtdb-rs202.matches.zip
+
+prevent_sra_download: true
diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py
@@ -232,3 +232,21 @@ def test_bad_config_4():
                            extra_args=["check"])
 
     assert status != 0
+
+
+@pytest.mark.dependency()
+def test_block_sra_downloads():
+    # run 'smash_reads' with 'prevent_sra_download: true' and a non-existent
+    # metagenome file, & make sure that the run fails instead of downloading.
+    global _tempdir
+
+    conf = utils.relative_file('tests/test-data/test-block-sra.conf')
+
+    extra_args = ["smash_reads"]
+    status = run_snakemake(
+        conf,
+        verbose=True,
+        outdir=_tempdir,
+        extra_args=extra_args,
+    )
+    assert status != 0