Skip to content

Commit

Permalink
agrees on read k-mers, problem with kmc_dump atm. See line 155 in rep…
Browse files Browse the repository at this point in the history
…licate_in_python.py #15
  • Loading branch information
dkoslicki committed Apr 1, 2020
1 parent c31b3cc commit a82082e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 11 deletions.
67 changes: 59 additions & 8 deletions tests/script_tests_debug15/python_output/replicate_in_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import khmer
import marisa_trie as mt
import subprocess
import sys

# for IDE REPL testing
os.chdir("/home/dkoslicki/Desktop/CMash/tests/script_tests_debug15/python_output")
Expand All @@ -52,7 +53,8 @@
I = Intersect(reads_path, training_path, input_type=input_type, threads=threads, temp_dir=temp_dir, verbose=verbose)



#####################################################
# dumping database k-mers: count_training_kmers
# this is KMC-free, so I can just call Isaac's code
I.cmashDump = "TrainingDatabase_dump.fa"
I.dump_training_kmers()
Expand Down Expand Up @@ -88,25 +90,74 @@
# which looks much more correct

# check this in python
true_canonical_kmers = set()
python_training_kmers = set()
CEs = MH.import_multiple_from_single_hdf5(training_path)
for CE in CEs:
for kmer in CE._kmers:
kmer_rc = khmer.reverse_complement(kmer)
if kmer < kmer_rc:
true_canonical_kmers.add(kmer)
python_training_kmers.add(kmer)
else:
true_canonical_kmers.add(kmer_rc)
python_training_kmers.add(kmer_rc)

print(f"Python's count of number of k-mers: {len(true_canonical_kmers)}")
print(f"Python's count of number of k-mers: {len(python_training_kmers)}")
#with open("python_training_canonical_kmers.txt", 'w') as fid:
# for kmer in true_canonical_kmers:
# fid.write(f"{kmer}\n")

# dump kmc version of the canonical kmers
result = subprocess.run(f"kmc_dump TrainingDatabase_dump /dev/fd/1", capture_output=True, shell=True)
kmc_canonical_kmers = set(map(lambda x: x.split('\t')[0], result.stdout.decode('utf-8').split('\n')))
kmc_canonical_kmers.remove('')
kmc_training_kmers = set(map(lambda x: x.split('\t')[0], result.stdout.decode('utf-8').split('\n')))
kmc_training_kmers.remove('')

if sorted(list(true_canonical_kmers)) == sorted(list(kmc_canonical_kmers)):
if sorted(list(python_training_kmers)) == sorted(list(kmc_training_kmers)):
print("Yes! Python and KMC agree on the database dumped k-mers")
else:
raise Exception("NO! Python and KMC DO NOT agree on the database dumped k-mers")


#####################################################
# test the counting of input k-mers count_input_kmers()
I.count_input_kmers()

# do it in python
python_read_kmers = set()
fid = khmer.ReadParser(reads_path)
for record in fid:
seq = record.sequence
for kmer in MH.kmers(seq, I.ksize):
kmer_rc = khmer.reverse_complement(kmer)
if kmer < kmer_rc:
python_read_kmers.add(kmer)
else:
python_read_kmers.add(kmer_rc)

# do it with KMC
I.count_input_kmers()
result = subprocess.run(f"kmc_dump {I.reads_kmc_out_file} /dev/fd/1", capture_output=True, shell=True)
kmc_reads_kmers = set(map(lambda x: x.split('\t')[0], result.stdout.decode('utf-8').split('\n')))
kmc_reads_kmers.remove('')
if sorted(list(python_read_kmers)) == sorted(list(kmc_reads_kmers)):
print("Yes! Python and KMC agree on the read dumped k-mers")
else:
raise Exception("NO! Python and KMC DO NOT agree on the read dumped k-mers")


#####################################################
# test the intersection: intersect()

I.intersect()

# do the intersection in python
python_intersection_kmers = python_read_kmers.intersection(python_training_kmers)

# do the intersection with KMC
# FIXME: kmc_dump doesn't like output to stdout
result = subprocess.run(f"kmc_dump {I.intersection_kmc_dump_file} /dev/fd/1", capture_output=True, shell=True)
kmc_intersection_kmers = set(map(lambda x: x.split('\t')[0], result.stdout.decode('utf-8').split('\n')))
kmc_intersection_kmers.remove('')
if sorted(list(python_intersection_kmers)) == sorted(list(kmc_intersection_kmers)):
print("Yes! Python and KMC agree on the intersection k-mers")
else:
raise Exception("NO! Python and KMC DO NOT agree on the intersection k-mers")

6 changes: 3 additions & 3 deletions tests/script_tests_debug15/run_small_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ testOrganism="../Organisms/taxid_1192839_4_genomic.fna.gz"
echo "Training on data"
rm TrainingDatabase.h5 2> /dev/null
rm TrainingDatabase.tst 2> /dev/null
/usr/bin/time python ${scriptsDir}/MakeStreamingDNADatabase.py filenames.txt TrainingDatabase.h5 -k 10
/usr/bin/time python ${scriptsDir}/MakeStreamingDNADatabase.py filenames.txt TrainingDatabase.h5 -k 15
if test -f TrainingDatabase.h5; then
if test -f TrainingDatabase.tst; then
echo "Training file successfully created"
Expand All @@ -41,7 +41,7 @@ fi
echo "Classifying sample, sensitive settings"
rm results.csv 2> /dev/null
# make a streaming pre-filter
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 5-15-2 --sensitive
if test -f results.csv; then
echo "sensitive classify successful"
cat results.csv
Expand All @@ -55,7 +55,7 @@ fi
echo "Classifying sample, sensitive settings, with KMC intersect"
rm results.csv 2> /dev/null
# make a streaming pre-filter
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 10-10-1 --sensitive --intersect
/usr/bin/time python ${scriptsDir}/StreamingQueryDNADatabase.py ${testOrganism} TrainingDatabase.h5 results.csv 5-15-2 --sensitive --intersect
if test -f results.csv; then
echo "sensitive classify successful"
cat results.csv
Expand Down

0 comments on commit a82082e

Please sign in to comment.