Skip to content

Commit

Permalink
finally figured out what's going on. See comments in #15 for description
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Apr 2, 2020
1 parent 62a0d71 commit 6e26050
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 2 deletions.
5 changes: 3 additions & 2 deletions CMash/Query.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ def __init__(self, reads_path, training_path, input_type='fasta', threads=8, tem
# intersection dump file
self.intersection_kmc_out_file = os.path.join(self.temp_dir, str(self.ksize) + 'mers_intersection')
self.intersection_kmc_dump_file = os.path.join(self.temp_dir, str(self.ksize) + 'mers_intersection_dump')
# final output file
self.final_out_file = self.intersection_kmc_dump_file + '.fa'

def get_kmer_size(self):
"""Reads the training database (in HDF5 format)
Expand Down Expand Up @@ -294,11 +296,10 @@ def intersect(self):

#read intersection & rewrite in fasta format
with(open(dump_path, 'r')) as infile:
with(open(dump_path + '.fa', 'w')) as fasta:
with(open(self.final_out_file, 'w')) as fasta:
for line in infile:
seq = line.split()[0]
fasta.write('>seq' + '\n' + seq + '\n')
self.out_file = dump_path + '.fa'

def compute_intersection(self):
self.dump_training_kmers()
Expand Down
80 changes: 80 additions & 0 deletions tests/script_tests_debug15/python_output/replicate_in_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import marisa_trie as mt
import subprocess
import sys
import re

# for IDE REPL testing
os.chdir("/home/dkoslicki/Desktop/CMash/tests/script_tests_debug15/python_output")
Expand Down Expand Up @@ -173,3 +174,82 @@
print("Yes! Python and fasta dump agree on the intersection k-mers")
else:
raise Exception("NO! Python and fasta dump DO NOT agree on the intersection k-mers")


#########################################################
# test the StreamingQueryDNADatabas.py code on the original file vs the kmc reduced file
def parseNumList(input):
"""Thank you stack overflow"""
m = re.match(r'(\d+)(?:-(\d+))?(?:-(\d+))?$', input)
start = int(m.group(1))
end = int(m.group(2))
if m.group(3):
increment = int(m.group(3))
else:
increment = 1
return list(range(start, end+1, increment))

TST_file = "../TrainingDatabase.tst"
k_range_str = '5-15-2'
k_range = parseNumList(k_range_str)
C = Create(training_database_file=training_path, bloom_filter_file="", TST_file=TST_file, k_range=k_range)
C.import_TST()
C.create_BF_prefilter()

# on the original reads
counter = Counters(tree=C.tree, k_range=C.k_range, all_kmers_bf=C.all_kmers_bf)
fid = khmer.ReadParser(reads_path)
to_proc = [record.sequence for record in fid]
fid.close()
res = map(counter.process_seq, to_proc)
flattened_res = [item for sublist in res if sublist for item in sublist]
all_reads_match_tuples = list(set(flattened_res))

# on the kmc dumped reads
counter = Counters(tree=C.tree, k_range=C.k_range, all_kmers_bf=C.all_kmers_bf)
fid = khmer.ReadParser(I.final_out_file)
to_proc = [record.sequence for record in fid]
fid.close()
res = map(counter.process_seq, to_proc)
flattened_res = [item for sublist in res if sublist for item in sublist]
kmc_reads_match_tuples = list(set(flattened_res))

if sorted(all_reads_match_tuples) == sorted(kmc_reads_match_tuples):
print("Yes! match_tuples are the same for original reads or kmc reads")
else:
raise Exception("NO! match_tuples are NOT the same for original reads or kmc reads")

missing_matches = set(all_reads_match_tuples) - set(kmc_reads_match_tuples)
missing_match = list(missing_matches)[0]
ce_index = missing_match[0]
k_range_loc = missing_match[1]
sketch_index = missing_match[2]
print(f"A missing kmer: {CEs[ce_index]._kmers[sketch_index]}")
print(f"A missing kmer_rc: {khmer.reverse_complement(CEs[ce_index]._kmers[sketch_index])}")
print(f"at kmer size: {k_range[k_range_loc]}")
missing_kmer_trunc = CEs[ce_index]._kmers[sketch_index][0:k_range[k_range_loc]]
print(f"kmer trunc: {missing_kmer_trunc}")
print(f"kmer_rc trunc: {khmer.reverse_complement(CEs[ce_index]._kmers[sketch_index])[0:k_range[k_range_loc]]}")
print(f"trunc kmer_rc: {khmer.reverse_complement(CEs[ce_index]._kmers[sketch_index][0:k_range[k_range_loc]])}")

# Look for that k-mer in the query file:
result = subprocess.run(f"zgrep -c -i {missing_kmer_trunc} {reads_path}", capture_output=True, shell=True)
print(f"matches in reads to missing_kmer_trunc: {result.stdout}")

# look for that k-mer in the intersection file
result = subprocess.run(f"zgrep -c -i {missing_kmer_trunc} {I.final_out_file}", capture_output=True, shell=True)
print(f"matches in intersection to missing_kmer_trunc: {result.stdout}")

# look for the k-mer in the reads dump
result = subprocess.run(f"kmc_dump {I.reads_kmc_out_file} /dev/fd/1 | cut -f 1 | grep -c -i {missing_kmer_trunc}", capture_output=True, shell=True)
print(f"matches in kmc reads dump to missing_kmer_trunc: {result.stdout}")
# so indeed, the kmer_trunc is in the reads dump, but it only matches as prefixes to k-mers that aren't in the training database:
# kmc_dump reads_15mers_dump /dev/fd/1 | cut -f 1 | grep -i TGCCCTGTGGC
#ATGCCCTGTGGCTGG
#ATGCTGCCCTGTGGC
#CCCCTGCCCTGTGGC
#CTGCCCTGTGGCAGC
#TGCCCTGTGGCAGCA #<---------------
# grep -c TGCCCTGTGGCAGCA TrainingDatabase_dump.fa
# 0

0 comments on commit 6e26050

Please sign in to comment.