Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized parameters for cmscan with Rfam #232

Open
wants to merge 20 commits into
base: v2_staging
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dammit/components/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ def run_group(config,
config.core['n_threads'] = n_threads

if not max_threads_per_task:
config.core['max_threads_per_task'] = config.core['n_threads']
config.core['max_threads_per_task'] = config.core['n_threads'] if config.core['max_threads_per_task'] == 0 \
else min(config.core['n_threads'], config.core['max_threads_per_task'])
else:
config.core['max_threads_per_task'] = min(config.core['n_threads'], max_threads_per_task)

Expand Down
4 changes: 2 additions & 2 deletions dammit/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ regex_rename: null
# n_threads: total threads to pass to snakemake -j
n_threads: 0
# max threads to use on a single job
max_threads_per_task: 1
max_threads_per_task: 0

verbosity: 0

Expand Down Expand Up @@ -60,7 +60,7 @@ hmmpress:
cmscan:
params:
evalue: 0.00001
extra: "--nohmmonly"
extra: "--nohmmonly --rfam --cut_ga"

cmpress:
params:
Expand Down
5 changes: 0 additions & 5 deletions dammit/databases.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ orthodb_genes:
output_suffix:
- ""
busco:
url: https://gitlab.com/ezlab/busco/-/raw/4.0.6/config/config.ini
access: download
fileformat: uncompressed
output_suffix:
- ""
# to update lineage list:
# busco --list-datasets to see all
# copy in list, do visual selection of lines (ctrl-V G)
Expand Down
42 changes: 21 additions & 21 deletions dammit/tests/test-data/pom.20.dammit.evalue10.fasta

Large diffs are not rendered by default.

110 changes: 0 additions & 110 deletions dammit/tests/test-data/pom.20.dammit.evalue10.gff3

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions dammit/tests/test-data/pom.20.udbs.dammit.fasta

Large diffs are not rendered by default.

216 changes: 216 additions & 0 deletions dammit/tests/test-data/pom.20.udbs.dammit.gff3

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions dammit/tests/test-data/test-conf.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
basename: Splat
busco:
configfile: busco.default.ini
params:
extra: ''
global_evalue: 1.0
busco_groups:
- saccharomycetes_odb10
max_threads_per_task: 1
196 changes: 145 additions & 51 deletions dammit/tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pandas as pd

from .utils import run
from dammit.meta import __path__

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
Expand All @@ -22,7 +23,7 @@ def compare_gff(fn_a, fn_b):
df_a.reset_index(inplace=True, drop=True)
df_b = gff3.GFF3Parser(fn_b).read().sort_values(['seqid', 'start', 'end', 'ID', 'Target']).sort_index(axis=1)
df_b.reset_index(inplace=True, drop=True)

print('First DF:', df_a, '\n', '=' * 40)
print('Second DF:', df_b, '\n', '=' * 40)
return df_a.equals(df_b)
Expand All @@ -42,7 +43,8 @@ def test_default(self, tmpdir, datadir, n_threads):
exp_gff3 = datadir('pom.20.dammit.gff3')
exp_fasta = datadir('pom.20.dammit.fasta')

args = ['run', '--busco-group', 'saccharomycetes_odb10', '--n-threads', str(n_threads), 'annotate', transcripts]
args = ['run', '--busco-group', 'saccharomycetes_odb10',
'--n-threads', str(n_threads), 'annotate', transcripts]
status, out, err = run(*args)

outdir = 'pom.20.dammit'
Expand All @@ -65,7 +67,8 @@ def test_evalue(self, tmpdir, datadir):
exp_gff3 = datadir('pom.20.dammit.evalue10.gff3')
exp_fasta = datadir('pom.20.dammit.evalue10.fasta')

args = ['run', '--busco-group', 'saccharomycetes_odb10', 'annotate', transcripts, '--global-evalue', '10.0']
args = ['run', '--busco-group', 'saccharomycetes_odb10',
'annotate', transcripts, '--global-evalue', '10.0']
status, out, err = run(*args)

outdir = 'pom.20.dammit'
Expand All @@ -74,7 +77,7 @@ def test_evalue(self, tmpdir, datadir):

assert compare_gff(gff3_fn, exp_gff3)
assert open(fasta_fn).read() == open(exp_fasta).read()

@pytest.mark.parametrize('n_threads', (1,4))
def test_user_database(self, tmpdir, datadir, n_threads):
'''--n-threads [N] --pipeline quick annotate --user-database [PEP.fa] [INPUT.fa]
Expand All @@ -86,7 +89,8 @@ def test_user_database(self, tmpdir, datadir, n_threads):
exp_gff3 = datadir('pom.20.udb.dammit.gff3')
exp_fasta = datadir('pom.20.udb.dammit.fasta')

args = ['run', '--busco-group', 'saccharomycetes_odb10', '--n-threads', str(n_threads), '--pipeline', 'quick', 'annotate',
args = ['run', '--busco-group', 'saccharomycetes_odb10',
'--n-threads', str(n_threads), '--pipeline', 'quick', 'annotate',
transcripts, '--user-database', pep]
status, out, err = run(*args)

Expand All @@ -97,32 +101,39 @@ def test_user_database(self, tmpdir, datadir, n_threads):
fasta_fn = os.path.join(outdir, 'pom.20.dammit.fasta')

assert status == 0
#assert compare_gff(gff3_fn, exp_gff3)
#assert open(fasta_fn).read() == open(exp_fasta).read()
assert compare_gff(gff3_fn, exp_gff3)
assert open(fasta_fn).read() == open(exp_fasta).read()

@pytest.mark.parametrize('n_threads', (1,4))
def test_annotate_multiple_user_databases(self, tmpdir, datadir, n_threads):
'''--pipeline quick annotate --user-database [PEP1.fa] --user-database [PEP2.fa] [INPUT.fa]
'''

# def test_annotate_multiple_user_databases(self, tmpdir, datadir):
# '''--pipeline quick annotate --user-database [PEP1.fa] --user-database [PEP2.fa] [INPUT.fa]
# '''
#
# with tmpdir.as_cwd():
# transcripts = datadir('pom.single.fa')
# pep = datadir('pep.fa')
# pep2 = datadir('odb_subset.fa')
# exp_gff3 = datadir('pom.single.fa.dammit.gff3.udb')
# exp_fasta = datadir('pom.single.fa.dammit.fasta.udb')
#
# args = ['annotate', '--quick',
# transcripts, '--user-databases', pep, pep2,
# '--verbosity', '2']
# status, out, err = run(args)
#
# outdir = '{0}.dammit'.format(transcripts)
#
# assert status == 0
with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
pep = datadir('pep.fa')
pep2 = datadir('odb_subset.fa')
exp_gff3 = datadir('pom.20.udbs.dammit.gff3')
exp_fasta = datadir('pom.20.udbs.dammit.fasta')

args = ['run', '--n-threads', str(n_threads),
'--busco-group', 'saccharomycetes_odb10',
'--pipeline', 'quick', 'annotate',
'--user-database', pep,
'--user-database', pep2,
transcripts]
status, out, err = run(*args)

def test_basename(self, tmpdir, datadir):
'''--pipeline quick annotate --base-name [NAME] [INPUT.fa]
outdir = 'pom.20.dammit'
gff3_fn = os.path.join(outdir, 'pom.20.dammit.gff3')
fasta_fn = os.path.join(outdir, 'pom.20.dammit.fasta')

assert status == 0
assert compare_gff(gff3_fn, exp_gff3)
assert open(fasta_fn).read() == open(exp_fasta).read()

def test_annotate_basename(self, tmpdir, datadir):
'''Test annotate --pipeline quick annotate --base-name [NAME] [INPUT.fa]
'''

with tmpdir.as_cwd():
Expand All @@ -138,9 +149,9 @@ def test_basename(self, tmpdir, datadir):

contents = open(fn).read()
assert 'Test_0' in contents

def test_multiple_busco_groups(self, tmpdir, datadir):
'''--pipeline quick --busco-group bacteria_odb10 --busco-group saccharomycetes_odb10
'''--pipeline quick --busco-group saccharomycetes_odb10 --busco-group saccharomycetes_odb10
'''

with tmpdir.as_cwd():
Expand Down Expand Up @@ -170,12 +181,11 @@ def test_regex_rename(self, tmpdir, datadir):
'--busco-group', 'saccharomycetes_odb10',
'annotate', '--regex-rename', r'(?P<name>^[a-zA-Z0-9\.]+)',
transcripts]

status, out, err = run(*args)
assert status == 0

gff3_fn = os.path.join('pom.20.dammit', 'pom.20.dammit.gff3')

assert compare_gff(gff3_fn, exp_gff3_fn)

def test_norename(self, tmpdir, datadir):
Expand All @@ -198,29 +208,113 @@ def test_norename(self, tmpdir, datadir):

assert compare_gff(gff3_fn, exp_gff3_fn)

def test_annotate_outdir(self, tmpdir, datadir):
'''
Test output directory option
'''

def test_annotate_dbdir_fail(tmpdir, datadir):
'''Test annotation with a faulty database directory.
'''
with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
outdir = 'test_out'
args = ['run', '--pipeline', 'quick', 'annotate',
transcripts, '--output-dir', outdir]
status, out, err = run(*args)
assert os.path.isfile(os.path.join(outdir, 'pom.20.fasta'))

args = ['run', '--database-dir', '.', 'annotate', transcripts]
status, out, err = run(*args, fail_ok=True)

assert 'you probably need to install the dammit databases' in err
assert status == 1
# make sure DAMMIT_DB_DIR is set in your testing env
# (export DAMMIT_DB_DIR=/path/to/databases)
def test_annotate_dbdir_fail(self, tmpdir, datadir):
'''Test annotation with a faulty database directory.
dammit run --database-dir [DB_DIR] annotate [INPUT.fa]
'''

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')

args = ['run', '--pipeline', 'quick', '--database-dir', '.', 'annotate', transcripts]
status, out, err = run(*args, fail_ok=True)

assert 'you probably need to install the dammit databases' in err
assert status == 1

def test_annotate_dbdir(tmpdir, datadir):
'''Test that --database-dir works.
'''

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
def test_annotate_dbdir(self, tmpdir, datadir):
'''Test that --database-dir works.
dammit run --database-dir [DB_DIR] annotate [INPUT.fa]
'''

database_dir = os.environ['DAMMIT_DB_DIR']
args = ['run', '--busco-group', 'saccharomycetes_odb10', '--database-dir', database_dir, '--pipeline', 'quick', 'annotate', transcripts]
status, out, err = run(*args)
assert status == 0
with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
database_dir = os.environ['DAMMIT_DB_DIR']

args = ['run', '--busco-group', 'saccharomycetes_odb10', '--pipeline', 'quick',
'--database-dir', database_dir, 'annotate', '--dry-run', transcripts]
status, out, err = run(*args)

assert status == 0


def test_temp_dir(self, tmpdir, datadir):
'''Test that --temp-dir works.
'''

with tmpdir.as_cwd():
dammit_temp_dir = "TEMP"
args = ['run', '--pipeline', 'quick', '--temp-dir', dammit_temp_dir, 'databases']
status, out, err = run(*args)

assert status == 0
assert any("run.databases" in f for f in os.listdir(dammit_temp_dir))


def test_max_threads_per_task(self, tmpdir, datadir):
'''Test that --max_threads_per_task works.
'''

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
args = ['run', '--max-threads-per-task', 1, '--busco-group', 'saccharomycetes_odb10',
'--pipeline', 'quick', 'annotate', '--dry-run', transcripts]
status, out, err = run(*args)
outdir = 'pom.20.dammit'

assert status == 0
assert "Threads (per-task): 1" in err


def test_user_config_file(self, tmpdir, datadir):
'''Test that --config-file works.
'''

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
conf = datadir('test-conf.yml')
args = ['--config-file', conf, 'run', '--n-threads', '2',
'--pipeline', 'quick', 'annotate', '--dry-run', transcripts]
status, out, err = run(*args)
outdir = 'pom.20.dammit'

print(out, err)

assert status == 0
assert "BUSCO groups: saccharomycetes_odb10" in err
assert "E-value Cutoff (global): 1.0" in err
assert "Pipeline: quick" in err
assert "Threads (per-task): 1" in err
assert "Threads (total): 2" in err


def test_busco_config_file(self, tmpdir, datadir):
'''Test that --busco-config-file works.
'''

with tmpdir.as_cwd():
transcripts = datadir('pom.20.fa')
busco_conf = os.path.join(__path__, 'busco.default.ini')
args = ['run', '--busco-config-file', busco_conf, '--busco-group', 'saccharomycetes_odb10',
'--pipeline', 'quick', 'annotate', '--dry-run', transcripts]
status, out, err = run(*args)
outdir = 'pom.20.dammit'

assert status == 0
1 change: 0 additions & 1 deletion dammit/workflows/annotate.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ rule cmscan:
log:
os.path.join(logs_dir, '{transcriptome}.x.{database}.cmscan.log')
params:
evalue_threshold = GLOBAL_EVALUE if GLOBAL_EVALUE is not None else config['cmscan']['params'].get('evalue', 0.00001),
extra = config['cmscan']['params'].get('extra', ''),
threads: THREADS_PER_TASK
wrapper: f'file://{__wrappers__}/infernal/cmscan.wrapper.py'
Expand Down
8 changes: 5 additions & 3 deletions dammit/wrappers/infernal/cmscan.wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
profile = snakemake.input.get("profile")
profile = profile.rsplit(".i", 1)[0]

assert not (snakemake.params.get('evalue_threshold', '') and snakemake.params.get('score_threshold', ''))
assert profile.endswith(".cm"), 'your profile file should end with ".cm"'

# direct output to file <f>, not stdout
Expand All @@ -29,13 +30,14 @@
## default params: enable evalue threshold. If bitscore thresh is provided, use that instead (both not allowed)

# report <= this evalue threshold in output
evalue_threshold = snakemake.params.get("evalue_threshold", 10) # use cmscan default
evalue_threshold = snakemake.params.get("evalue_threshold", '') # use cmscan default
# report >= this score threshold in output
score_threshold = snakemake.params.get("score_threshold", "")
score_threshold = snakemake.params.get("score_threshold", '')

thresh_cmd = ''
if score_threshold:
thresh_cmd = f" -T {float(score_threshold)} "
else:
if evalue_threshold:
thresh_cmd = f" -E {float(evalue_threshold)} "

extra = snakemake.params.get("extra", "")
Expand Down
8 changes: 6 additions & 2 deletions generate-test-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ dammit run --busco-group saccharomycetes_odb10 --n-threads 4 annotate --global
# Passing a user database
dammit run --busco-group saccharomycetes_odb10 --n-threads 4 --pipeline quick annotate $DATA_DIR/$TEST_FILE --user-database $DATA_DIR/$TEST_PEP -o $TEST_NAME.dammit.udb

# multiple user databases
dammit run --n-threads 4 --busco-group saccharomycetes_odb10 --pipeline quick annotate --user-database $DATA_DIR/pep.fa --user-database $DATA_DIR/odb_subset.fa $DATA_DIR/$TEST_FILE -o $TEST_NAME.dammit.multi-udb

# Passing multiple musco groups
dammit run --n-threads 4 --pipeline quick --busco-group bacteria_odb10 --busco-group saccharomycetes_odb10 annotate -o pom.256.dammit.busco-multi $DATA_DIR/pom.256.fa

Expand All @@ -30,6 +33,7 @@ dammit run --busco-group saccharomycetes_odb10 --pipeline quick annotate --regex
# Rename with backmapping
dammit run --busco-group saccharomycetes_odb10 --pipeline quick annotate $DATA_DIR/$TEST_FILE --no-rename -o $TEST_NAME.dammit.norename


#
# Copy the data
#
Expand All @@ -54,7 +58,7 @@ cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammi
#cp $TEST_NAME.dammit.nr/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.nr
#cp $TEST_NAME.dammit.nr/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.nr

#cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.norename
#cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.norename
cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.fasta $DATA_DIR/$TEST_NAME.dammit.fasta.norename
cp $TEST_NAME.dammit.norename/$TEST_NAME.dammit.gff3 $DATA_DIR/$TEST_NAME.dammit.gff3.norename