-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
found that the first KMC issue is in counting the database k-mers. Se…
…e replicate_in_python.py after the FIXME for #15
- Loading branch information
Showing
3 changed files
with
142 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
../Organisms/taxid_28901_877_genomic.fna.gz | ||
../Organisms/taxid_1192839_4_genomic.fna.gz |
75 changes: 75 additions & 0 deletions
75
tests/script_tests_debug15/python_output/replicate_in_python.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# here I will manually re-create the steps that kmc is supposed to be doing, and compare where the differences are taking place | ||
|
||
import os | ||
import sys | ||
# The following is for ease of development (so I don't need to keep re-installing the tool) | ||
try: | ||
from CMash import MinHash as MH | ||
from CMash.Make import MakeTSTNew | ||
from Query import Create | ||
from Query import Intersect | ||
from Query import Counters | ||
from Query import Containment | ||
from Query import PostProcess | ||
except ImportError: | ||
try: | ||
import MinHash as MH | ||
import Create | ||
import Intersect | ||
import Counters | ||
import Containment | ||
import PostProcess | ||
except ImportError: | ||
try: | ||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) | ||
from CMash import MinHash as MH | ||
from CMash.Make import MakeTSTNew | ||
from CMash.Query import Create # fix relative imports | ||
from CMash.Query import Intersect | ||
from CMash.Query import Counters | ||
from CMash.Query import Containment | ||
from CMash.Query import PostProcess | ||
except: | ||
print("Stupid IDE relative imports...") | ||
from multiprocessing import Pool # Much faster without dummy (threading) | ||
import multiprocessing | ||
from itertools import * | ||
import argparse | ||
import khmer | ||
import marisa_trie as mt | ||
|
||
# for IDE REPL testing | ||
os.chdir("/home/dkoslicki/Desktop/CMash/tests/script_tests_debug15/python_output") | ||
|
||
# Import the database and dump the Kmers | ||
reads_path = "../../Organisms/taxid_1192839_4_genomic.fna.gz" | ||
training_path = "../TrainingDatabase.h5" | ||
input_type = 'fasta' | ||
threads = 16 | ||
temp_dir = "." | ||
verbose = True | ||
I = Intersect(reads_path, training_path, input_type=input_type, threads=threads, temp_dir=temp_dir, verbose=verbose) | ||
|
||
|
||
|
||
# this is KMC-free, so I can just call Isaac's code | ||
I.cmashDump = "TrainingDatabase_dump.fa" | ||
I.dump_training_kmers() | ||
|
||
# dump the k-mers using KMC | ||
I.db_kmers_loc = "TrainingDatabase_dump" | ||
I.count_training_kmers() | ||
# FIXME: problem is here: the output of KMC is: | ||
#Stats: | ||
# No. of k-mers below min. threshold : 0 | ||
# No. of k-mers above max. threshold : 0 | ||
# No. of unique k-mers : 3 # <------- | ||
# No. of unique counted k-mers : 3 # <------- | ||
# Total no. of k-mers : 3 # <------- | ||
# Total no. of reads : 1 # <------- | ||
# Total no. of super-k-mers : 0 | ||
# and: | ||
# $ kmc_dump TrainingDatabase_dump /dev/fd/1 | ||
# AAAATCGCTC 1 | ||
# AAGTACTGAA 1 | ||
# ATACATAGCA 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/bin/bash | ||
|
||
# For manual tests. Run this in the tests/script_tests folder | ||
|
||
# In case you have multiple versions installed (eg. Metalign as well as CMash), make sure python is looking in the right place: | ||
export PYTHONPATH="$(dirname $(dirname "`pwd`"))":$PYTHONPATH | ||
|
||
#Make sure the correct CMash is being pulled from | ||
testFile=$(python -c "from CMash import MinHash as MH; print(MH.__file__)") | ||
parentDir=`dirname $PWD` | ||
parentDir=`dirname ${parentDir}` | ||
correctFile="${parentDir}/CMash/MinHash.py" | ||
if [ "$testFile" == "$correctFile" ]; | ||
then | ||
echo "Files are correct" | ||
else | ||
echo "Files are not correct" | ||
exit 1 | ||
fi | ||
|
||
scriptsDir="${parentDir}/scripts" | ||
testOrganism="../Organisms/taxid_1192839_4_genomic.fna.gz" | ||
|
||
# make the training database | ||
echo "Training on data" | ||
rm TrainingDatabase.h5 2> /dev/null | ||
rm TrainingDatabase.tst 2> /dev/null | ||
/usr/bin/time python ${scriptsDir}/MakeStreamingDNADatabase.py filenames.txt TrainingDatabase.h5 -k 10 | ||
if test -f TrainingDatabase.h5; then | ||
if test -f TrainingDatabase.tst; then | ||
echo "Training file successfully created" | ||
else | ||
echo "SOMETHING WENT WRONG!!!!" | ||
exit 1 | ||
fi | ||
else | ||
echo "SOMETHING WENT WRONG!!!!" | ||
exit 1 | ||
fi | ||
|
||
echo "Classifying sample, sensitive settings" | ||
rm results.csv 2> /dev/null | ||
# make a streaming pre-filter | ||
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive | ||
if test -f results.csv; then | ||
echo "sensitive classify successful" | ||
cat results.csv | ||
else | ||
echo "SOMETHING WENT WRONG!!!!" | ||
exit 1 | ||
fi | ||
|
||
# intersection tests | ||
|
||
echo "Classifying sample, sensitive settings, with KMC intersect" | ||
rm results.csv 2> /dev/null | ||
# make a streaming pre-filter | ||
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive --intersect | ||
if test -f results.csv; then | ||
echo "sensitive classify successful" | ||
cat results.csv | ||
else | ||
echo "SOMETHING WENT WRONG!!!!" | ||
exit 1 | ||
fi |