dib-lab · camillescott · Oct 27, 2020 · Jan 13, 2021 · Jan 14, 2021 · Jan 22, 2021
diff --git a/dammit/components/run.py b/dammit/components/run.py
@@ -111,7 +111,8 @@ def run_group(config,
         config.core['n_threads'] = n_threads
 
     if not max_threads_per_task:
-        config.core['max_threads_per_task'] = config.core['n_threads']
+        config.core['max_threads_per_task'] = config.core['n_threads'] if config.core['max_threads_per_task'] == 0 \
+                                              else min(config.core['n_threads'], config.core['max_threads_per_task'])
     else:
         config.core['max_threads_per_task'] = min(config.core['n_threads'], max_threads_per_task)
 

diff --git a/dammit/config.yml b/dammit/config.yml
@@ -15,7 +15,7 @@ regex_rename: null
 # n_threads: total threads to pass to snakemake -j
 n_threads: 0
 # max threads to use on a single job
-max_threads_per_task: 1
+max_threads_per_task: 0
 
 verbosity: 0
 
@@ -60,7 +60,7 @@ hmmpress:
 cmscan:
   params:
     evalue: 0.00001
-    extra: "--nohmmonly"
+    extra: "--nohmmonly --rfam --cut_ga"
 
 cmpress:
   params:

diff --git a/dammit/databases.yml b/dammit/databases.yml
@@ -76,11 +76,6 @@ orthodb_genes:
   output_suffix:
     - ""
 busco:
-  url: https://gitlab.com/ezlab/busco/-/raw/4.0.6/config/config.ini
-  access: download
-  fileformat: uncompressed
-  output_suffix:
-    - ""
   # to update lineage list:
     # busco --list-datasets to see all
     # copy in list, do visual selection of lines (ctrl-V G)

diff --git a/dammit/tests/test-data/pom.20.dammit.evalue10.fasta b/dammit/tests/test-data/pom.20.dammit.evalue10.fasta
diff --git a/dammit/tests/test-data/pom.20.dammit.evalue10.gff3 b/dammit/tests/test-data/pom.20.dammit.evalue10.gff3
diff --git a/dammit/tests/test-data/pom.20.udbs.dammit.fasta b/dammit/tests/test-data/pom.20.udbs.dammit.fasta
diff --git a/dammit/tests/test-data/pom.20.udbs.dammit.gff3 b/dammit/tests/test-data/pom.20.udbs.dammit.gff3
diff --git a/dammit/tests/test-data/test-conf.yml b/dammit/tests/test-data/test-conf.yml
@@ -0,0 +1,9 @@
+basename: Splat
+busco:
+  configfile: busco.default.ini
+  params:
+    extra: ''
+global_evalue: 1.0
+busco_groups:
+- saccharomycetes_odb10
+max_threads_per_task: 1
diff --git a/dammit/tests/test_annotate.py b/dammit/tests/test_annotate.py
@@ -11,6 +11,7 @@
 import pandas as pd
 
 from .utils import run
+from dammit.meta import __path__
 
 pd.set_option('display.max_columns', None)
 pd.set_option('display.width', None)
@@ -22,7 +23,7 @@ def compare_gff(fn_a, fn_b):
     df_a.reset_index(inplace=True, drop=True)
     df_b = gff3.GFF3Parser(fn_b).read().sort_values(['seqid', 'start', 'end', 'ID', 'Target']).sort_index(axis=1)
     df_b.reset_index(inplace=True, drop=True)
-    
+
     print('First DF:', df_a, '\n', '=' * 40)
     print('Second DF:', df_b, '\n', '=' * 40)
     return df_a.equals(df_b)
@@ -42,7 +43,8 @@ def test_default(self, tmpdir, datadir, n_threads):
             exp_gff3 = datadir('pom.20.dammit.gff3')
             exp_fasta = datadir('pom.20.dammit.fasta')
 
-            args = ['run', '--busco-group', 'saccharomycetes_odb10', '--n-threads', str(n_threads), 'annotate', transcripts]
+            args = ['run', '--busco-group', 'saccharomycetes_odb10', 
+                    '--n-threads', str(n_threads), 'annotate', transcripts]
             status, out, err = run(*args)
 
             outdir = 'pom.20.dammit'
@@ -65,7 +67,8 @@ def test_evalue(self, tmpdir, datadir):
             exp_gff3 = datadir('pom.20.dammit.evalue10.gff3')
             exp_fasta = datadir('pom.20.dammit.evalue10.fasta')
 
-            args = ['run', '--busco-group', 'saccharomycetes_odb10', 'annotate', transcripts, '--global-evalue', '10.0']
+            args = ['run', '--busco-group', 'saccharomycetes_odb10', 
+                    'annotate', transcripts, '--global-evalue', '10.0']
             status, out, err = run(*args)
 
             outdir = 'pom.20.dammit'
@@ -74,7 +77,7 @@ def test_evalue(self, tmpdir, datadir):
 
             assert compare_gff(gff3_fn, exp_gff3)
             assert open(fasta_fn).read() == open(exp_fasta).read()
-  
+
     @pytest.mark.parametrize('n_threads', (1,4))
     def test_user_database(self, tmpdir, datadir, n_threads):
         '''--n-threads [N] --pipeline quick annotate --user-database [PEP.fa] [INPUT.fa]
@@ -86,7 +89,8 @@ def test_user_database(self, tmpdir, datadir, n_threads):
             exp_gff3 = datadir('pom.20.udb.dammit.gff3')
             exp_fasta = datadir('pom.20.udb.dammit.fasta')
 
-            args = ['run', '--busco-group', 'saccharomycetes_odb10', '--n-threads', str(n_threads), '--pipeline', 'quick', 'annotate',
+            args = ['run', '--busco-group', 'saccharomycetes_odb10', 
+                    '--n-threads', str(n_threads), '--pipeline', 'quick', 'annotate',
                     transcripts, '--user-database', pep]
             status, out, err = run(*args)
 
@@ -97,32 +101,39 @@ def test_user_database(self, tmpdir, datadir, n_threads):
             fasta_fn = os.path.join(outdir, 'pom.20.dammit.fasta')
 
             assert status == 0
-            #assert compare_gff(gff3_fn, exp_gff3)
-            #assert open(fasta_fn).read() == open(exp_fasta).read()
+            assert compare_gff(gff3_fn, exp_gff3)
+            assert open(fasta_fn).read() == open(exp_fasta).read()
 
+    @pytest.mark.parametrize('n_threads', (1,4))
+    def test_annotate_multiple_user_databases(self, tmpdir, datadir, n_threads):
+        '''--pipeline quick annotate --user-database [PEP1.fa] --user-database [PEP2.fa] [INPUT.fa]
+        '''
 
-#    def test_annotate_multiple_user_databases(self, tmpdir, datadir):
-#        '''--pipeline quick annotate --user-database [PEP1.fa] --user-database [PEP2.fa] [INPUT.fa]
-#        '''
-#
-#        with tmpdir.as_cwd():
-#            transcripts = datadir('pom.single.fa')
-#            pep = datadir('pep.fa')
-#            pep2 = datadir('odb_subset.fa')
-#            exp_gff3 = datadir('pom.single.fa.dammit.gff3.udb')
-#            exp_fasta = datadir('pom.single.fa.dammit.fasta.udb')
-#
-#            args = ['annotate', '--quick',
-#                    transcripts, '--user-databases', pep, pep2,
-#                    '--verbosity', '2']
-#            status, out, err = run(args)
-#
-#            outdir = '{0}.dammit'.format(transcripts)
-#
-#            assert status == 0
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+            pep = datadir('pep.fa')
+            pep2 = datadir('odb_subset.fa')
+            exp_gff3 = datadir('pom.20.udbs.dammit.gff3')
+            exp_fasta = datadir('pom.20.udbs.dammit.fasta')
+
+            args = ['run', '--n-threads', str(n_threads),
+                    '--busco-group', 'saccharomycetes_odb10',
+                    '--pipeline', 'quick', 'annotate',
+                    '--user-database', pep,
+                    '--user-database', pep2,
+                    transcripts]
+            status, out, err = run(*args)
 
-    def test_basename(self, tmpdir, datadir):
-        '''--pipeline quick annotate --base-name [NAME] [INPUT.fa]
+            outdir = 'pom.20.dammit'
+            gff3_fn = os.path.join(outdir, 'pom.20.dammit.gff3')
+            fasta_fn = os.path.join(outdir, 'pom.20.dammit.fasta')
+
+            assert status == 0
+            assert compare_gff(gff3_fn, exp_gff3)
+            assert open(fasta_fn).read() == open(exp_fasta).read()
+
+    def test_annotate_basename(self, tmpdir, datadir):
+        '''Test annotate --pipeline quick annotate --base-name [NAME] [INPUT.fa]
         '''
 
         with tmpdir.as_cwd():
@@ -138,9 +149,9 @@ def test_basename(self, tmpdir, datadir):
 
             contents = open(fn).read()
             assert 'Test_0' in contents
-    
+
     def test_multiple_busco_groups(self, tmpdir, datadir):
-        '''--pipeline quick --busco-group bacteria_odb10 --busco-group saccharomycetes_odb10
+        '''--pipeline quick --busco-group saccharomycetes_odb10 --busco-group saccharomycetes_odb10
         '''
 
         with tmpdir.as_cwd():
@@ -170,12 +181,11 @@ def test_regex_rename(self, tmpdir, datadir):
                     '--busco-group', 'saccharomycetes_odb10',
                     'annotate', '--regex-rename', r'(?P<name>^[a-zA-Z0-9\.]+)',
                     transcripts]
-            
+
             status, out, err = run(*args)
             assert status == 0
 
             gff3_fn = os.path.join('pom.20.dammit', 'pom.20.dammit.gff3')
-
             assert compare_gff(gff3_fn, exp_gff3_fn)
 
     def test_norename(self, tmpdir, datadir):
@@ -198,29 +208,113 @@ def test_norename(self, tmpdir, datadir):
 
             assert compare_gff(gff3_fn, exp_gff3_fn)
 
+    def test_annotate_outdir(self, tmpdir, datadir):
+        '''
+        Test output directory option
+        '''
 
-def test_annotate_dbdir_fail(tmpdir, datadir):
-    '''Test annotation with a faulty database directory.
-    '''
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
 
-    with tmpdir.as_cwd():
-        transcripts = datadir('pom.20.fa')
+            outdir = 'test_out'
+            args = ['run', '--pipeline', 'quick', 'annotate',
+                    transcripts, '--output-dir', outdir]
+            status, out, err = run(*args)
+            assert os.path.isfile(os.path.join(outdir, 'pom.20.fasta'))
 
-        args = ['run', '--database-dir', '.', 'annotate', transcripts]
-        status, out, err = run(*args, fail_ok=True)
-
-        assert 'you probably need to install the dammit databases' in err
-        assert status == 1
+    # make sure DAMMIT_DB_DIR is set in your testing env
+    # (export DAMMIT_DB_DIR=/path/to/databases)
+    def test_annotate_dbdir_fail(self, tmpdir, datadir):
+        '''Test annotation with a faulty database directory.
+           dammit run --database-dir [DB_DIR] annotate [INPUT.fa]
+        '''
+
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+
+            args = ['run', '--pipeline', 'quick', '--database-dir', '.', 'annotate', transcripts]
+            status, out, err = run(*args, fail_ok=True)
 
+            assert 'you probably need to install the dammit databases' in err
+            assert status == 1
 
-def test_annotate_dbdir(tmpdir, datadir):
-    '''Test that --database-dir works.
-    '''
 
-    with tmpdir.as_cwd():
-        transcripts = datadir('pom.20.fa')
+    def test_annotate_dbdir(self, tmpdir, datadir):
+        '''Test that --database-dir works.
+           dammit run --database-dir [DB_DIR] annotate [INPUT.fa]
+        '''
 
-        database_dir = os.environ['DAMMIT_DB_DIR']
-        args = ['run', '--busco-group', 'saccharomycetes_odb10', '--database-dir', database_dir, '--pipeline', 'quick', 'annotate',  transcripts]
-        status, out, err = run(*args)
-        assert status == 0
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+            database_dir = os.environ['DAMMIT_DB_DIR']
+
+            args = ['run', '--busco-group', 'saccharomycetes_odb10', '--pipeline', 'quick',
+                    '--database-dir', database_dir, 'annotate', '--dry-run', transcripts]
+            status, out, err = run(*args)
+
+            assert status == 0
+
+
+    def test_temp_dir(self, tmpdir, datadir):
+        '''Test that --temp-dir works.
+        '''
+
+        with tmpdir.as_cwd():
+            dammit_temp_dir = "TEMP"
+            args = ['run', '--pipeline', 'quick', '--temp-dir', dammit_temp_dir, 'databases']
+            status, out, err = run(*args)
+
+            assert status == 0
+            assert any("run.databases" in f for f in os.listdir(dammit_temp_dir))
+
+
+    def test_max_threads_per_task(self, tmpdir, datadir):
+        '''Test that --max_threads_per_task works.
+        '''
+
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+            args = ['run', '--max-threads-per-task', 1, '--busco-group', 'saccharomycetes_odb10',
+                    '--pipeline', 'quick', 'annotate',  '--dry-run', transcripts]
+            status, out, err = run(*args)
+            outdir = 'pom.20.dammit'
+
+            assert status == 0
+            assert "Threads (per-task):       1" in err
+
+
+    def test_user_config_file(self, tmpdir, datadir):
+        '''Test that --config-file works.
+        '''
+
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+            conf = datadir('test-conf.yml')
+            args = ['--config-file', conf, 'run', '--n-threads', '2',
+                    '--pipeline', 'quick', 'annotate', '--dry-run', transcripts]
+            status, out, err = run(*args)
+            outdir = 'pom.20.dammit'
+
+            print(out, err)
+
+            assert status == 0
+            assert "BUSCO groups:             saccharomycetes_odb10" in err
+            assert "E-value Cutoff (global):  1.0" in err
+            assert "Pipeline:                 quick" in err
+            assert "Threads (per-task):       1" in err
+            assert "Threads (total):          2" in err
+
+
+    def test_busco_config_file(self, tmpdir, datadir):
+        '''Test that --busco-config-file works.
+        '''
+
+        with tmpdir.as_cwd():
+            transcripts = datadir('pom.20.fa')
+            busco_conf = os.path.join(__path__, 'busco.default.ini')
+            args = ['run', '--busco-config-file', busco_conf, '--busco-group', 'saccharomycetes_odb10', 
+                    '--pipeline', 'quick', 'annotate', '--dry-run', transcripts]
+            status, out, err = run(*args)
+            outdir = 'pom.20.dammit'
+
+            assert status == 0
diff --git a/dammit/workflows/annotate.snakefile b/dammit/workflows/annotate.snakefile
@@ -183,7 +183,6 @@ rule cmscan:
     log:
         os.path.join(logs_dir, '{transcriptome}.x.{database}.cmscan.log')
     params:
-        evalue_threshold = GLOBAL_EVALUE if GLOBAL_EVALUE is not None else config['cmscan']['params'].get('evalue', 0.00001),
         extra = config['cmscan']['params'].get('extra', ''),
     threads: THREADS_PER_TASK
     wrapper: f'file://{__wrappers__}/infernal/cmscan.wrapper.py'

diff --git a/dammit/wrappers/infernal/cmscan.wrapper.py b/dammit/wrappers/infernal/cmscan.wrapper.py
@@ -11,6 +11,7 @@
 profile = snakemake.input.get("profile")
 profile = profile.rsplit(".i", 1)[0]
 
+assert not (snakemake.params.get('evalue_threshold', '') and snakemake.params.get('score_threshold', ''))
 assert profile.endswith(".cm"), 'your profile file should end with ".cm"'
 
 # direct output to file <f>, not stdout
@@ -29,13 +30,14 @@
 ## default params: enable evalue threshold. If bitscore thresh is provided, use that instead (both not allowed)
 
 # report <= this evalue threshold in output
-evalue_threshold = snakemake.params.get("evalue_threshold", 10)  # use cmscan default
+evalue_threshold = snakemake.params.get("evalue_threshold", '')  # use cmscan default
 # report >= this score threshold in output
-score_threshold = snakemake.params.get("score_threshold", "")
+score_threshold = snakemake.params.get("score_threshold", '')
 
+thresh_cmd = ''
 if score_threshold:
     thresh_cmd = f" -T {float(score_threshold)} "
-else:
+if evalue_threshold:
     thresh_cmd = f" -E {float(evalue_threshold)} "
 
 extra = snakemake.params.get("extra", "")

diff --git a/generate-test-data.sh b/generate-test-data.sh
@@ -21,6 +21,9 @@ dammit run  --busco-group saccharomycetes_odb10  --n-threads 4 annotate --global
 # Passing a user database
 dammit run  --busco-group saccharomycetes_odb10  --n-threads 4 --pipeline quick annotate $DATA_DIR/$TEST_FILE --user-database $DATA_DIR/$TEST_PEP -o $TEST_NAME.dammit.udb 
 
+# multiple user databases
+dammit run --n-threads 4 --busco-group saccharomycetes_odb10 --pipeline quick annotate --user-database $DATA_DIR/pep.fa --user-database $DATA_DIR/odb_subset.fa $DATA_DIR/$TEST_FILE -o $TEST_NAME.dammit.multi-udb
+
 # Passing multiple musco groups
 dammit run --n-threads 4 --pipeline quick --busco-group bacteria_odb10 --busco-group saccharomycetes_odb10 annotate -o pom.256.dammit.busco-multi $DATA_DIR/pom.256.fa 
 
@@ -30,6 +33,7 @@ dammit run --busco-group saccharomycetes_odb10 --pipeline quick annotate --regex
 # Rename with backmapping
 dammit run --busco-group saccharomycetes_odb10 --pipeline quick annotate $DATA_DIR/$TEST_FILE --no-rename -o $TEST_NAME.dammit.norename
 
+
 #
 # Copy the data
 #
@@ -54,7 +58,7 @@ cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammi
 #cp $TEST_NAME.dammit.nr/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.nr
 #cp $TEST_NAME.dammit.nr/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.nr
 
-#cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.norename
-#cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.norename
+cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.norename
+cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.norename