Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] "Overload" cython consume methods with type introspection #1787

Merged
merged 8 commits into from
Sep 17, 2017
11 changes: 6 additions & 5 deletions khmer/_oxli/graphs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from libc.stdint cimport uint8_t, uint32_t, uint64_t, uintptr_t

from khmer._oxli.oxli_types cimport *
from khmer._oxli.hashing cimport Kmer, CpKmer, KmerSet, CpKmerFactory, CpKmerIterator
from khmer._oxli.parsing cimport CpReadParser, CpSequence
from khmer._oxli.parsing cimport CpReadParser, CpSequence, FastxParserPtr
from khmer._oxli.legacy_partitioning cimport (CpSubsetPartition, cp_pre_partition_info,
SubsetPartition)
from khmer._oxli.utils cimport oxli_raise_py_error
Expand Down Expand Up @@ -139,15 +139,15 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:

void consume_seqfile_and_tag[SeqIO](const string &,
unsigned int,
unsigned long long)
unsigned long long)

# Ugly workaround. For some reason, Cython doesn't like *just this*
# templated overload -- it chooses whichever was defined last, breaking
# resolution for either strings or FastxParserPtr. So, we rename it on
# the Cython side and give it a real name substitution for code gen.
void consume_seqfile_and_tag_readparser "consume_seqfile_and_tag" [SeqIO](shared_ptr[CpReadParser[SeqIO]],
unsigned int,
unsigned long long)
unsigned long long)

void consume_sequence_and_tag(const string &,
unsigned long long &)
Expand All @@ -160,7 +160,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
unsigned int &,
unsigned long long &) except +oxli_raise_py_error

uintptr_t trim_on_stoptags(string)
uintptr_t trim_on_stoptags(string)

unsigned int traverse_from_kmer(CpKmer,
uint32_t,
Expand All @@ -177,7 +177,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
void load_stop_tags(string, bool) except +oxli_raise_py_error
void extract_unique_paths(string, uint32_t, float, vector[string])
void calc_connected_graph_size(CpKmer, uint64_t&, KmerSet&,
const uint64_t, bool)
const uint64_t, bool)
uint32_t kmer_degree(HashIntoType, HashIntoType)
uint32_t kmer_degree(const char *)
void find_high_degree_nodes(const char *, set[HashIntoType] &) const
Expand Down Expand Up @@ -246,6 +246,7 @@ cdef class Hashtable:
cdef HashIntoType sanitize_hash_kmer(self, object kmer) except -1
cdef bytes _valid_sequence(self, str sequence)
cdef CpKmer _build_kmer(self, object kmer) except *
cdef FastxParserPtr _get_parser(self, object parser_or_filename) except *
cdef list _get_raw_tables(self, uint8_t **, vector[uint64_t])


Expand Down
189 changes: 87 additions & 102 deletions khmer/_oxli/graphs.pyx

Large diffs are not rendered by default.

10 changes: 3 additions & 7 deletions khmer/_oxli/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ def get_n_primes_near_x(n_primes, x):
if len(primes) != n_primes:
msg = "unable to find {0} prime numbers < {1}".format(n_primes, x)
raise RuntimeError(msg)
return primes
return primes


cdef bytes _bstring(s):
if not isinstance(s, (basestring, bytes)):
raise TypeError("Requires a string-like sequence")
raise TypeError("Requires a string-like sequence, "\
" got {0} of type {1}".format(s, type(s)))

if isinstance(s, unicode):
s = s.encode('utf-8')
Expand All @@ -30,9 +31,6 @@ cdef unicode _ustring(s):
if type(s) is unicode:
# fast path for most common case(s)
return <unicode>s
elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes):
# only accept byte strings in Python 2.x, not in Py3
return (<bytes>s).decode('UTF-8')
elif isinstance(s, unicode):
# an evil cast to <unicode> might work here in some(!) cases,
# depending on what the further processing does. to be safe,
Expand All @@ -58,5 +56,3 @@ cdef void _fill(double * fill_to, object fill_from):
'''UNSAFE fill from flat python iterable to C array.'''
for idx, item in enumerate(fill_from):
fill_to[idx] = <double>item


4 changes: 2 additions & 2 deletions oxli/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ def build_graph(ifilenames, graph, num_threads=1, tags=False):
- tags: should there be tags
"""
if tags:
eat = graph.consume_seqfile_and_tag_with_reads_parser
eat = graph.consume_seqfile_and_tag
else:
eat = graph.consume_seqfile_with_reads_parser
eat = graph.consume_seqfile

for _, ifile in enumerate(ifilenames):
rparser = khmer.ReadParser(ifile)
Expand Down
2 changes: 1 addition & 1 deletion sandbox/count-kmers-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def main():
for _ in range(args.threads):
thread = \
threading.Thread(
target=countgraph.consume_seqfile_with_reads_parser,
target=countgraph.consume_seqfile,
args=(rparser, )
)
threads.append(thread)
Expand Down
2 changes: 1 addition & 1 deletion sandbox/optimal_args_hashbits.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def main():
file=sys.stderr)

htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
target_method = htable.consume_seqfile_with_reads_parser
target_method = htable.consume_seqfile

for _, filename in enumerate(filenames):
rparser = khmer.ReadParser(filename)
Expand Down
4 changes: 2 additions & 2 deletions scripts/abundance-dist-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for _ in range(args.threads):
thread = \
threading.Thread(
target=countgraph.consume_seqfile_with_reads_parser,
target=countgraph.consume_seqfile,
args=(rparser, )
)
threads.append(thread)
Expand All @@ -162,7 +162,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
abundance_lists = []

def __do_abundance_dist__(read_parser):
abundances = countgraph.abundance_distribution_with_reads_parser(
abundances = countgraph.abundance_distribution(
read_parser, tracking)
abundance_lists.append(abundances)

Expand Down
2 changes: 1 addition & 1 deletion scripts/filter-abund-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def main():
for _ in range(args.threads):
cur_thread = \
threading.Thread(
target=graph.consume_seqfile_with_reads_parser,
target=graph.consume_seqfile,
args=(rparser, )
)
threads.append(cur_thread)
Expand Down
2 changes: 1 addition & 1 deletion scripts/load-into-counting.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def main():
for _ in range(args.threads):
cur_thrd = \
threading.Thread(
target=countgraph.consume_seqfile_with_reads_parser,
target=countgraph.consume_seqfile,
args=(rparser, )
)
threads.append(cur_thrd)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_countgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1186,16 +1186,16 @@ def test_consume_absentfasta():
print(str(err))


def test_consume_absentfasta_with_reads_parser():
def test_consume_absentfasta():
countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
countgraph.consume_seqfile_with_reads_parser()
countgraph.consume_seqfile()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
readparser = ReadParser(utils.get_test_data('empty-file'))
countgraph.consume_seqfile_with_reads_parser(readparser)
countgraph.consume_seqfile(readparser)
assert 0, "this should fail"
except OSError as err:
print(str(err))
Expand Down
8 changes: 4 additions & 4 deletions tests/test_nodegraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,16 +905,16 @@ def test_bad_primes_list():
print(str(e))


def test_consume_absentfasta_with_reads_parser():
def test_consume_absentfasta():
nodegraph = khmer.Nodegraph(31, 1, 1)
try:
nodegraph.consume_seqfile_with_reads_parser()
nodegraph.consume_seqfile()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
readparser = ReadParser(utils.get_test_data('empty-file'))
nodegraph.consume_seqfile_with_reads_parser(readparser)
nodegraph.consume_seqfile(readparser)
assert 0, "this should fail"
except OSError as err:
print(str(err))
Expand All @@ -934,7 +934,7 @@ def test_consume_seqfile_and_tag_with_badreads_parser():
nodegraph = khmer.Nodegraph(6, 1e6, 2)
try:
readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
nodegraph.consume_seqfile_and_tag_with_reads_parser(readsparser)
nodegraph.consume_seqfile_and_tag(readsparser)
assert 0, "this should fail"
except OSError as e:
print(str(e))
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tabletype.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def test_consume_seqfile_reads_parser(AnyTabletype):
kh = AnyTabletype(5)
rparser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))

kh.consume_seqfile_with_reads_parser(rparser)
kh.consume_seqfile(rparser)

kh2 = AnyTabletype(5)
for record in screed.open(utils.get_test_data('test-fastq-reads.fq')):
Expand Down Expand Up @@ -460,7 +460,7 @@ def test_abund_dist_A_readparser(AnyTabletype):
tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)

kh.consume_seqfile(A_filename)
dist = kh.abundance_distribution_with_reads_parser(rparser, tracking)
dist = kh.abundance_distribution(rparser, tracking)

print(dist[:10])
assert sum(dist) == 1
Expand Down