From 72c44c9b4a90ed8a4773d816bf3e7055ca7ffd5b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 12 Feb 2022 10:26:23 -0800
Subject: [PATCH] [MRG] add a flag to prevent metagenome download from SRA
 (#157)

* add 'prevent_sra_download' flag

* update docs for prevent_sra_download

* add test for prevent_sra_download

* add parameter to defaults.conf so it'll be in showconf

* fix test

* add missing file

* update config file documentation

* add new parameter to config check
---
 doc/configuring.md                  | 25 ++++++++++++++++++++++++-
 genome_grist/conf/Snakefile         | 11 +++++++++++
 genome_grist/conf/defaults.conf     |  3 +++
 tests/test-data/test-block-sra.conf |  7 +++++++
 tests/test_snakemake.py             | 18 ++++++++++++++++++
 5 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 tests/test-data/test-block-sra.conf

diff --git a/doc/configuring.md b/doc/configuring.md
index b99116c..0b296b3 100644
--- a/doc/configuring.md
+++ b/doc/configuring.md
@@ -20,6 +20,25 @@ Much of the configuration for genome-grist is about where to find more informati
 
 For Genbank genomes, this is easy and genome-grist does it automatically! But if you're providing your own genomes and taxonomy information, it's a bit trickier.
 
+## Analyzing your own metagenomes
+
+You can provide a list of SRA run accessions in the `samples:` list in
+the config file, and genome-grist will automatically download, interleave,
+and trim them for you.
+
+If you want to run genome-grist on your own metagenomes, you need to
+provide one FASTQ file for each sample in the `abundtrim/`
+subdirectory of the output directory; for example, for the output
+directory `outputs.private` and the sample named `podar`, you would
+need to create `outputs.private/abundtrim/podar.abundtrim.fq.gz`.
+This should be an interleaved file of Illumina reads, as generated by
+(for example) `seqtk mergepe`.
+
+Providing correctly-named files will bypass automatic SRA downloads for
+those samples, and genome-grist will download any remaining samples.
+If you want to prevent automatic downloading from the SRA completely,
+you can set the parameter `prevent_sra_download: true` in the config file.
+
 ## Using Genbank genomes
 
 For Genbank genomes, all the necessary information is available already, or automatically determined by genome-grist.
@@ -49,7 +68,7 @@ For now, we suggest naming the first sequence in each FASTA file with the genome
 
 ### Creating one or more sourmash databases
 
-You can mix local databases with genbank databases without fear! You'll need to provide one or more sourmash databases for any local collections, and you do this as usual via the config paramter `sourmash_databases`, which takes a list of paths to sourmash database locations.
+You can mix local databases with genbank databases without fear! You'll need to provide one or more sourmash databases for any local collections, and you do this as usual via the config parameter `sourmash_databases`, which takes a list of paths to sourmash database locations.
 
 To build your own sourmash databases, you'll need sourmash sketches for each genome. Sketch all your genomes with the following command:
 ```
@@ -312,6 +331,10 @@ taxonomies:
 local_databases_info: 
 - /path/to/local-sourmash-db/database3.info.csv
 
+# prevent_sra_download: turn off download of metagenomes from SRA by sample ID.
+# DEFAULT: false.
+prevent_sra_download: false
+
 # picklist: a --picklist argument to use when searching the sourmash database, to limit which signatures to search.
 # see sourmash command line documentation for more details.
 # EXAMPLE:
diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile
index e0c06e8..dd79d30 100755
--- a/genome_grist/conf/Snakefile
+++ b/genome_grist/conf/Snakefile
@@ -105,6 +105,7 @@ known_config_keys = { 'samples', 'outdir', 'tempdir',
                       'sourmash_sigtype',
                       'prefetch_memory',
                       'taxonomies',
+                      'prevent_sra_download',
                       'picklist' }
 
 unknown_config_keys = all_config_keys - known_config_keys
@@ -453,11 +454,21 @@ rule download_sra_wc:
         temp_r1 =  f"{base_tempdir}/{{sample}}.d/{{sample}}_1.fastq",
         temp_r2 =  f"{base_tempdir}/{{sample}}.d/{{sample}}_2.fastq",
         temp_unp = f"{base_tempdir}/{{sample}}.d/{{sample}}.fastq",
+    params:
+        do_not_run_me = 1 if config.get("prevent_sra_download", False) else 0,
+        outdir = outdir,
     threads: 6
     conda: "env/sra.yml"
     resources:
         mem_mb=40000,
     shell: '''
+        if [ "{params.do_not_run_me}" = 1 ]; then  # prevent_sra_download is set: refuse to download
+            echo "** genome-grist is trying to download from SRA for sample {wildcards.sample}," >&2
+            echo "** but 'prevent_sra_download' is set to true in config." >&2
+            echo "** Does '{params.outdir}/abundtrim/{wildcards.sample}.abundtrim.fq.gz' exist?" >&2
+            exit 1  # exit status must be in 0-255; a negative value is not portable
+        fi
+
         echo tmp directory: {output.temp_dir}
         echo running fasterq-dump for {wildcards.sample}
 
diff --git a/genome_grist/conf/defaults.conf b/genome_grist/conf/defaults.conf
index 395006a..8172853 100644
--- a/genome_grist/conf/defaults.conf
+++ b/genome_grist/conf/defaults.conf
@@ -11,3 +11,6 @@ tempdir:
 
 # cache genbank genome info in ./genbank_cache/ by default
 genbank_cache: ./genbank_cache/
+
+# set to true to block automatic metagenome downloads from SRA
+prevent_sra_download: false
diff --git a/tests/test-data/test-block-sra.conf b/tests/test-data/test-block-sra.conf
new file mode 100644
index 0000000..e288b10
--- /dev/null
+++ b/tests/test-data/test-block-sra.conf
@@ -0,0 +1,7 @@
+samples:
+- SRR5950647
+outdir: outputs.pytest
+sourmash_databases:
+- tests/test-data/SRR5950647.x.gtdb-rs202.matches.zip
+
+prevent_sra_download: true
diff --git a/tests/test_snakemake.py b/tests/test_snakemake.py
index 019b560..4ca0065 100644
--- a/tests/test_snakemake.py
+++ b/tests/test_snakemake.py
@@ -232,3 +232,21 @@ def test_bad_config_4():
                            extra_args=["check"])
 
     assert status != 0
+
+
+@pytest.mark.dependency()
+def test_block_sra_downloads():
+    # Run the 'smash_reads' target with 'prevent_sra_download: true' and a
+    # non-existent metagenome file; the run must report a non-zero status.
+    global _tempdir
+
+    conf = utils.relative_file('tests/test-data/test-block-sra.conf')
+
+    extra_args = ["smash_reads"]
+    status = run_snakemake(
+        conf,
+        verbose=True,
+        outdir=_tempdir,
+        extra_args=extra_args,
+    )
+    assert status != 0