Merge pull request #1787 from dib-lab/fix/consume-filename-or-parser

[MRG] "Overload" cython consume methods with type introspection
dib-lab · Sep 17, 2017 · 01e826b · 01e826b
2 parents ae3c90e + f0f61cd
commit 01e826b
Show file tree

Hide file tree

Showing 12 changed files with 113 additions and 131 deletions.
diff --git a/khmer/_oxli/graphs.pxd b/khmer/_oxli/graphs.pxd
@@ -7,7 +7,7 @@ from libc.stdint cimport uint8_t, uint32_t, uint64_t, uintptr_t
 
 from khmer._oxli.oxli_types cimport *
 from khmer._oxli.hashing cimport Kmer, CpKmer, KmerSet, CpKmerFactory, CpKmerIterator
-from khmer._oxli.parsing cimport CpReadParser, CpSequence
+from khmer._oxli.parsing cimport CpReadParser, CpSequence, FastxParserPtr
 from khmer._oxli.legacy_partitioning cimport (CpSubsetPartition, cp_pre_partition_info,
                                    SubsetPartition)
 from khmer._oxli.utils cimport oxli_raise_py_error
@@ -139,15 +139,15 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
 
         void consume_seqfile_and_tag[SeqIO](const string &,
                                    unsigned int,
-                                   unsigned long long) 
+                                   unsigned long long)
 
         # Ugly workaround. For some reason, Cython doesn't like *just this*
         # templated overload -- it chooses whichever was defined last, breaking
         # resolution for either strings or FastxParserPtr. So, we rename it on
         # the Cython side and give it a real name substitution for code gen.
         void consume_seqfile_and_tag_readparser "consume_seqfile_and_tag" [SeqIO](shared_ptr[CpReadParser[SeqIO]],
                                    unsigned int,
-                                   unsigned long long) 
+                                   unsigned long long)
 
         void consume_sequence_and_tag(const string &,
                                       unsigned long long &)
@@ -160,7 +160,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
                                        unsigned int &,
                                        unsigned long long &) except +oxli_raise_py_error
 
-        uintptr_t trim_on_stoptags(string) 
+        uintptr_t trim_on_stoptags(string)
 
         unsigned int traverse_from_kmer(CpKmer,
                                         uint32_t,
@@ -177,7 +177,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
         void load_stop_tags(string, bool) except +oxli_raise_py_error
         void extract_unique_paths(string, uint32_t, float, vector[string])
         void calc_connected_graph_size(CpKmer, uint64_t&, KmerSet&,
-                                       const uint64_t, bool) 
+                                       const uint64_t, bool)
         uint32_t kmer_degree(HashIntoType, HashIntoType)
         uint32_t kmer_degree(const char *)
         void find_high_degree_nodes(const char *, set[HashIntoType] &) const
@@ -246,6 +246,7 @@ cdef class Hashtable:
     cdef HashIntoType sanitize_hash_kmer(self, object kmer) except -1
     cdef bytes _valid_sequence(self, str sequence)
     cdef CpKmer _build_kmer(self, object kmer) except *
+    cdef FastxParserPtr _get_parser(self, object parser_or_filename) except *
     cdef list _get_raw_tables(self, uint8_t **, vector[uint64_t])
 
 

diff --git a/khmer/_oxli/graphs.pyx b/khmer/_oxli/graphs.pyx
diff --git a/khmer/_oxli/utils.pyx b/khmer/_oxli/utils.pyx
@@ -14,12 +14,13 @@ def get_n_primes_near_x(n_primes, x):
     if len(primes) != n_primes:
         msg = "unable to find {0} prime numbers < {1}".format(n_primes, x)
         raise RuntimeError(msg)
-    return primes 
+    return primes
 
 
 cdef bytes _bstring(s):
     if not isinstance(s, (basestring, bytes)):
-        raise TypeError("Requires a string-like sequence")
+        raise TypeError("Requires a string-like sequence, "\
+                        " got {0} of type {1}".format(s, type(s)))
 
     if isinstance(s, unicode):
         s = s.encode('utf-8')
@@ -30,9 +31,6 @@ cdef unicode _ustring(s):
     if type(s) is unicode:
         # fast path for most common case(s)
         return <unicode>s
-    elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes):
-        # only accept byte strings in Python 2.x, not in Py3
-        return (<bytes>s).decode('UTF-8')
     elif isinstance(s, unicode):
         # an evil cast to <unicode> might work here in some(!) cases,
         # depending on what the further processing does.  to be safe,
@@ -58,5 +56,3 @@ cdef void _fill(double * fill_to, object fill_from):
     '''UNSAFE fill from flat python iterable to C array.'''
     for idx, item in enumerate(fill_from):
         fill_to[idx] = <double>item
-
-
diff --git a/oxli/functions.py b/oxli/functions.py
@@ -49,9 +49,9 @@ def build_graph(ifilenames, graph, num_threads=1, tags=False):
     - tags: should there be tags
     """
     if tags:
-        eat = graph.consume_seqfile_and_tag_with_reads_parser
+        eat = graph.consume_seqfile_and_tag
     else:
-        eat = graph.consume_seqfile_with_reads_parser
+        eat = graph.consume_seqfile
 
     for _, ifile in enumerate(ifilenames):
         rparser = khmer.ReadParser(ifile)

diff --git a/sandbox/count-kmers-single.py b/sandbox/count-kmers-single.py
@@ -102,7 +102,7 @@ def main():
     for _ in range(args.threads):
         thread = \
             threading.Thread(
-                target=countgraph.consume_seqfile_with_reads_parser,
+                target=countgraph.consume_seqfile,
                 args=(rparser, )
             )
         threads.append(thread)

diff --git a/sandbox/optimal_args_hashbits.py b/sandbox/optimal_args_hashbits.py
@@ -80,7 +80,7 @@ def main():
           file=sys.stderr)
 
     htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
-    target_method = htable.consume_seqfile_with_reads_parser
+    target_method = htable.consume_seqfile
 
     for _, filename in enumerate(filenames):
         rparser = khmer.ReadParser(filename)

diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
@@ -147,7 +147,7 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
     for _ in range(args.threads):
         thread = \
             threading.Thread(
-                target=countgraph.consume_seqfile_with_reads_parser,
+                target=countgraph.consume_seqfile,
                 args=(rparser, )
             )
         threads.append(thread)
@@ -162,7 +162,7 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
     abundance_lists = []
 
     def __do_abundance_dist__(read_parser):
-        abundances = countgraph.abundance_distribution_with_reads_parser(
+        abundances = countgraph.abundance_distribution(
             read_parser, tracking)
         abundance_lists.append(abundances)
 

diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
@@ -140,7 +140,7 @@ def main():
     for _ in range(args.threads):
         cur_thread = \
             threading.Thread(
-                target=graph.consume_seqfile_with_reads_parser,
+                target=graph.consume_seqfile,
                 args=(rparser, )
             )
         threads.append(cur_thread)

diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py
@@ -148,7 +148,7 @@ def main():
         for _ in range(args.threads):
             cur_thrd = \
                 threading.Thread(
-                    target=countgraph.consume_seqfile_with_reads_parser,
+                    target=countgraph.consume_seqfile,
                     args=(rparser, )
                 )
             threads.append(cur_thrd)

diff --git a/tests/test_countgraph.py b/tests/test_countgraph.py
@@ -1186,16 +1186,16 @@ def test_consume_absentfasta():
         print(str(err))
 
 
-def test_consume_absentfasta_with_reads_parser():
+def test_consume_absentfasta():
     countgraph = khmer.Countgraph(4, 4 ** 4, 4)
     try:
-        countgraph.consume_seqfile_with_reads_parser()
+        countgraph.consume_seqfile()
         assert 0, "this should fail"
     except TypeError as err:
         print(str(err))
     try:
         readparser = ReadParser(utils.get_test_data('empty-file'))
-        countgraph.consume_seqfile_with_reads_parser(readparser)
+        countgraph.consume_seqfile(readparser)
         assert 0, "this should fail"
     except OSError as err:
         print(str(err))

diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py
@@ -905,16 +905,16 @@ def test_bad_primes_list():
         print(str(e))
 
 
-def test_consume_absentfasta_with_reads_parser():
+def test_consume_absentfasta():
     nodegraph = khmer.Nodegraph(31, 1, 1)
     try:
-        nodegraph.consume_seqfile_with_reads_parser()
+        nodegraph.consume_seqfile()
         assert 0, "this should fail"
     except TypeError as err:
         print(str(err))
     try:
         readparser = ReadParser(utils.get_test_data('empty-file'))
-        nodegraph.consume_seqfile_with_reads_parser(readparser)
+        nodegraph.consume_seqfile(readparser)
         assert 0, "this should fail"
     except OSError as err:
         print(str(err))
@@ -934,7 +934,7 @@ def test_consume_seqfile_and_tag_with_badreads_parser():
     nodegraph = khmer.Nodegraph(6, 1e6, 2)
     try:
         readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
-        nodegraph.consume_seqfile_and_tag_with_reads_parser(readsparser)
+        nodegraph.consume_seqfile_and_tag(readsparser)
         assert 0, "this should fail"
     except OSError as e:
         print(str(e))

diff --git a/tests/test_tabletype.py b/tests/test_tabletype.py
@@ -376,7 +376,7 @@ def test_consume_seqfile_reads_parser(AnyTabletype):
     kh = AnyTabletype(5)
     rparser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))
 
-    kh.consume_seqfile_with_reads_parser(rparser)
+    kh.consume_seqfile(rparser)
 
     kh2 = AnyTabletype(5)
     for record in screed.open(utils.get_test_data('test-fastq-reads.fq')):
@@ -460,7 +460,7 @@ def test_abund_dist_A_readparser(AnyTabletype):
     tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)
 
     kh.consume_seqfile(A_filename)
-    dist = kh.abundance_distribution_with_reads_parser(rparser, tracking)
+    dist = kh.abundance_distribution(rparser, tracking)
 
     print(dist[:10])
     assert sum(dist) == 1