Skip to content

Commit

Permalink
found that the first KMC issue is in counting the database k-mers. Se…
Browse files Browse the repository at this point in the history
…e replicate_in_python.py after the FIXME for #15
  • Loading branch information
dkoslicki committed Apr 1, 2020
1 parent 3dcd8a2 commit dd86529
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tests/script_tests_debug15/filenames.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
../Organisms/taxid_28901_877_genomic.fna.gz
../Organisms/taxid_1192839_4_genomic.fna.gz
75 changes: 75 additions & 0 deletions tests/script_tests_debug15/python_output/replicate_in_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# here I will manually re-create the steps that kmc is supposed to be doing, and compare where the differences are taking place

import os
import sys
# The following is for ease of development (so I don't need to keep re-installing the tool)
try:
from CMash import MinHash as MH
from CMash.Make import MakeTSTNew
from Query import Create
from Query import Intersect
from Query import Counters
from Query import Containment
from Query import PostProcess
except ImportError:
try:
import MinHash as MH
import Create
import Intersect
import Counters
import Containment
import PostProcess
except ImportError:
try:
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
from CMash import MinHash as MH
from CMash.Make import MakeTSTNew
from CMash.Query import Create # fix relative imports
from CMash.Query import Intersect
from CMash.Query import Counters
from CMash.Query import Containment
from CMash.Query import PostProcess
except:
print("Stupid IDE relative imports...")
from multiprocessing import Pool # Much faster without dummy (threading)
import multiprocessing
from itertools import *
import argparse
import khmer
import marisa_trie as mt

# for IDE REPL testing
os.chdir("/home/dkoslicki/Desktop/CMash/tests/script_tests_debug15/python_output")

# Import the database and dump the Kmers
reads_path = "../../Organisms/taxid_1192839_4_genomic.fna.gz"
training_path = "../TrainingDatabase.h5"
input_type = 'fasta'
threads = 16
temp_dir = "."
verbose = True
I = Intersect(reads_path, training_path, input_type=input_type, threads=threads, temp_dir=temp_dir, verbose=verbose)



# this is KMC-free, so I can just call Isaac's code
I.cmashDump = "TrainingDatabase_dump.fa"
I.dump_training_kmers()

# dump the k-mers using KMC
I.db_kmers_loc = "TrainingDatabase_dump"
I.count_training_kmers()
# FIXME: problem is here: the output of KMC is:
#Stats:
# No. of k-mers below min. threshold : 0
# No. of k-mers above max. threshold : 0
# No. of unique k-mers : 3 # <-------
# No. of unique counted k-mers : 3 # <-------
# Total no. of k-mers : 3 # <-------
# Total no. of reads : 1 # <-------
# Total no. of super-k-mers : 0
# and:
# $ kmc_dump TrainingDatabase_dump /dev/fd/1
# AAAATCGCTC 1
# AAGTACTGAA 1
# ATACATAGCA 1
65 changes: 65 additions & 0 deletions tests/script_tests_debug15/run_small_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

# For manual tests. Run this in the tests/script_tests folder

# In case you have multiple versions installed (eg. Metalign as well as CMash), make sure python is looking in the right place:
export PYTHONPATH="$(dirname $(dirname "`pwd`"))":$PYTHONPATH

#Make sure the correct CMash is being pulled from
testFile=$(python -c "from CMash import MinHash as MH; print(MH.__file__)")
parentDir=`dirname $PWD`
parentDir=`dirname ${parentDir}`
correctFile="${parentDir}/CMash/MinHash.py"
if [ "$testFile" == "$correctFile" ];
then
echo "Files are correct"
else
echo "Files are not correct"
exit 1
fi

scriptsDir="${parentDir}/scripts"
testOrganism="../Organisms/taxid_1192839_4_genomic.fna.gz"

# make the training database
echo "Training on data"
rm TrainingDatabase.h5 2> /dev/null
rm TrainingDatabase.tst 2> /dev/null
/usr/bin/time python ${scriptsDir}/MakeStreamingDNADatabase.py filenames.txt TrainingDatabase.h5 -k 10
if test -f TrainingDatabase.h5; then
if test -f TrainingDatabase.tst; then
echo "Training file successfully created"
else
echo "SOMETHING WENT WRONG!!!!"
exit 1
fi
else
echo "SOMETHING WENT WRONG!!!!"
exit 1
fi

echo "Classifying sample, sensitive settings"
rm results.csv 2> /dev/null
# make a streaming pre-filter
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive
if test -f results.csv; then
echo "sensitive classify successful"
cat results.csv
else
echo "SOMETHING WENT WRONG!!!!"
exit 1
fi

# intersection tests

echo "Classifying sample, sensitive settings, with KMC intersect"
rm results.csv 2> /dev/null
# make a streaming pre-filter
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive --intersect
if test -f results.csv; then
echo "sensitive classify successful"
cat results.csv
else
echo "SOMETHING WENT WRONG!!!!"
exit 1
fi

0 comments on commit dd86529

Please sign in to comment.