NOTE: the payloads of replicate_in_python.py and run_small_tests.sh below were edited by hand after this patch was generated.
NOTE: their hunk line counts were updated to match, but the blob ids on their `index` lines still refer to the pre-edit contents.
diff --git a/tests/script_tests_debug15/filenames.txt b/tests/script_tests_debug15/filenames.txt
new file mode 100755
index 0000000..1e4da16
--- /dev/null
+++ b/tests/script_tests_debug15/filenames.txt
@@ -0,0 +1,2 @@
+../Organisms/taxid_28901_877_genomic.fna.gz
+../Organisms/taxid_1192839_4_genomic.fna.gz
diff --git a/tests/script_tests_debug15/python_output/replicate_in_python.py b/tests/script_tests_debug15/python_output/replicate_in_python.py
new file mode 100644
index 0000000..e272109
--- /dev/null
+++ b/tests/script_tests_debug15/python_output/replicate_in_python.py
@@ -0,0 +1,82 @@
+# here I will manually re-create the steps that kmc is supposed to be doing, and compare where the differences are taking place
+
+import os
+import sys
+# The following is for ease of development (so I don't need to keep re-installing the tool)
+try:
+    from CMash import MinHash as MH
+    from CMash.Make import MakeTSTNew
+    from Query import Create
+    from Query import Intersect
+    from Query import Counters
+    from Query import Containment
+    from Query import PostProcess
+except ImportError:
+    try:
+        import MinHash as MH
+        import Create
+        import Intersect
+        import Counters
+        import Containment
+        import PostProcess
+    except ImportError:
+        try:
+            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
+            from CMash import MinHash as MH
+            from CMash.Make import MakeTSTNew
+            from CMash.Query import Create  # fix relative imports
+            from CMash.Query import Intersect
+            from CMash.Query import Counters
+            from CMash.Query import Containment
+            from CMash.Query import PostProcess
+        except Exception as err:
+            # Best-effort: keep going, but report why the import failed instead of hiding it
+            print("Stupid IDE relative imports...")
+            sys.stderr.write("CMash import failed: {!r}\n".format(err))
+from multiprocessing import Pool  # Much faster without dummy (threading)
+import multiprocessing
+from itertools import *
+import argparse
+import khmer
+import marisa_trie as mt
+
+# for IDE REPL testing: run from this script's directory so the relative paths below resolve
+try:
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+except NameError:
+    # __file__ is undefined in an interactive session; fall back to the original checkout location
+    script_dir = "/home/dkoslicki/Desktop/CMash/tests/script_tests_debug15/python_output"
+os.chdir(script_dir)
+
+# Import the database and dump the Kmers
+reads_path = "../../Organisms/taxid_1192839_4_genomic.fna.gz"
+training_path = "../TrainingDatabase.h5"
+input_type = 'fasta'
+threads = 16
+temp_dir = "."
+verbose = True
+I = Intersect(reads_path, training_path, input_type=input_type, threads=threads, temp_dir=temp_dir, verbose=verbose)
+
+
+
+# this is KMC-free, so I can just call Isaac's code
+I.cmashDump = "TrainingDatabase_dump.fa"
+I.dump_training_kmers()
+
+# dump the k-mers using KMC
+I.db_kmers_loc = "TrainingDatabase_dump"
+I.count_training_kmers()
+# FIXME: problem is here: the output of KMC is:
+#Stats:
+#   No. of k-mers below min. threshold : 0
+#   No. of k-mers above max. threshold : 0
+#   No. of unique k-mers : 3 # <-------
+#   No. of unique counted k-mers : 3 # <-------
+#   Total no. of k-mers : 3 # <-------
+#   Total no. of reads : 1 # <-------
+#   Total no. of super-k-mers : 0
+# and:
+# $ kmc_dump TrainingDatabase_dump /dev/fd/1
+# AAAATCGCTC 1
+# AAGTACTGAA 1
+# ATACATAGCA 1
diff --git a/tests/script_tests_debug15/run_small_tests.sh b/tests/script_tests_debug15/run_small_tests.sh
new file mode 100755
index 0000000..b1fea6b
--- /dev/null
+++ b/tests/script_tests_debug15/run_small_tests.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# For manual tests. Run this in the tests/script_tests_debug15 folder
+
+# In case you have multiple versions installed (eg. Metalign as well as CMash), make sure python is looking in the right place:
+export PYTHONPATH="$(dirname "$(dirname "$PWD")")":$PYTHONPATH
+
+#Make sure the correct CMash is being pulled from
+testFile=$(python -c "from CMash import MinHash as MH; print(MH.__file__)")
+# parentDir is two directory levels above the current directory
+parentDir=$(dirname "$(dirname "$PWD")")
+correctFile="${parentDir}/CMash/MinHash.py"
+if [ "$testFile" == "$correctFile" ];
+then
+    echo "Files are correct"
+else
+    echo "Files are not correct"
+    exit 1
+fi
+
+scriptsDir="${parentDir}/scripts"
+testOrganism="../Organisms/taxid_1192839_4_genomic.fna.gz"
+
+# make the training database
+echo "Training on data"
+rm -f TrainingDatabase.h5 TrainingDatabase.tst
+/usr/bin/time python "${scriptsDir}/MakeStreamingDNADatabase.py" filenames.txt TrainingDatabase.h5 -k 10
+# both the .h5 and the .tst file must exist afterwards
+if test -f TrainingDatabase.h5 && test -f TrainingDatabase.tst; then
+    echo "Training file successfully created"
+else
+    echo "SOMETHING WENT WRONG!!!!"
+    exit 1
+fi
+
+echo "Classifying sample, sensitive settings"
+rm -f results.csv
+# run the streaming query on the test organism
+/usr/bin/time python "${scriptsDir}/StreamingQueryDNADatabase.py" "${testOrganism}" TrainingDatabase.h5 results.csv 10-10-1 --sensitive
+if test -f results.csv; then
+    echo "sensitive classify successful"
+    cat results.csv
+else
+    echo "SOMETHING WENT WRONG!!!!"
+    exit 1
+fi
+
+# intersection tests
+
+echo "Classifying sample, sensitive settings, with KMC intersect"
+rm -f results.csv
+# same query, this time with --intersect
+/usr/bin/time python "${scriptsDir}/StreamingQueryDNADatabase.py" "${testOrganism}" TrainingDatabase.h5 results.csv 10-10-1 --sensitive --intersect
+if test -f results.csv; then
+    echo "sensitive classify successful"
+    cat results.csv
+else
+    echo "SOMETHING WENT WRONG!!!!"
+    exit 1
+fi