From 2a663783e653011a7d4a785cabbc7dd98eb3cef2 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Thu, 20 Jun 2024 14:08:57 +0200 Subject: [PATCH] Amino acid alphabet k-mer support (#4) --- include/builder/build.impl | 4 +- include/builder/parse_file.hpp | 7 +- include/constants.hpp | 4 - include/cover/parse_file.hpp | 10 +- include/dictionary.impl | 6 +- include/dump.impl | 3 +- include/kmer.hpp | 119 ++++++++++++++++-- .../streaming_query_canonical_parsing.hpp | 9 +- .../query/streaming_query_regular_parsing.hpp | 6 +- include/statistics.impl | 3 +- include/util.hpp | 64 +--------- src/bench_utils.hpp | 6 +- src/check_utils.hpp | 9 +- src/permute.cpp | 2 +- test/test_alphabet.cpp | 38 +++--- 15 files changed, 168 insertions(+), 122 deletions(-) diff --git a/include/builder/build.impl b/include/builder/build.impl index 07453eb..dab731c 100644 --- a/include/builder/build.impl +++ b/include/builder/build.impl @@ -22,8 +22,8 @@ void dictionary::build(std::string const& filename, " but got k = " + std::to_string(build_config.k)); } if (build_config.m == 0) throw std::runtime_error("m must be > 0"); - if (build_config.m > constants::max_m) { - throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) + + if (build_config.m > kmer_t::max_m) { + throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) + " but got m = " + std::to_string(build_config.m)); } if (build_config.m > build_config.k) throw std::runtime_error("m must be < k"); diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 1fc31c4..9ba9b95 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -51,7 +51,7 @@ void parse_file(std::istream& is, parse_data& data, assert(end > begin); char const* super_kmer = sequence.data() + begin; uint64_t size = (end - begin) + k - 1; - assert(util::is_valid(super_kmer, size)); + assert(util::is_valid(super_kmer, size)); /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */ uint64_t num_kmers_in_super_kmer = end - begin; @@ -164,12 +164,13 @@ void parse_file(std::istream& is, parse_data& data, while (end != sequence.size() - k + 1) { char const* kmer = sequence.data() + end; - assert(util::is_valid(kmer, k)); + assert(util::is_valid(kmer, k)); kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(k); uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); minimizer = std::min(minimizer, minimizer_rc); } diff --git a/include/constants.hpp b/include/constants.hpp index 3f04546..aa5ccf2 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -3,10 +3,6 @@ #include "kmer.hpp" namespace sshash::constants { - -/* max *odd* size that can be packed into 64 bits */ -constexpr uint64_t max_m = 31; - constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t seed = 1; diff --git a/include/cover/parse_file.hpp b/include/cover/parse_file.hpp index 2b1c4c2..53581c7 100644 --- a/include/cover/parse_file.hpp +++ b/include/cover/parse_file.hpp @@ -198,6 +198,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) { for (auto weight : weights) output.append(std::to_string(weight) + " "); } +template void permute_and_write(std::istream& is, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -255,8 +256,8 @@ void permute_and_write(std::istream& is, std::string const& output_filename, and reverse the weights in header_sequence */ dna_sequence_rc.resize(dna_sequence.size()); header_sequence_r.clear(); - util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), - dna_sequence.size()); + kmer_t::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), + dna_sequence.size()); reverse_header(header_sequence, header_sequence_r, k); dna_sequence.swap(dna_sequence_rc); header_sequence.swap(header_sequence_r); @@ -352,6 +353,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, } } +template void permute_and_write(std::string const& input_filename, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -360,9 +362,9 @@ void permute_and_write(std::string const& input_filename, std::string const& out std::cout << "reading file '" << input_filename << "'..." << std::endl; if (util::ends_with(input_filename, ".gz")) { zip_istream zis(is); - permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); } else { - permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); } is.close(); } diff --git a/include/dictionary.impl b/include/dictionary.impl index d00fa4a..72d41a9 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -24,7 +24,8 @@ lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) template lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); uint64_t bucket_id = m_minimizers.lookup(uint64_t(std::min(minimizer, minimizer_rc))); @@ -79,7 +80,8 @@ lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, auto res = lookup_uint_regular_parsing(uint_kmer); assert(res.kmer_orientation == constants::forward_orientation); if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); res = lookup_uint_regular_parsing(uint_kmer_rc); res.kmer_orientation = constants::backward_orientation; } diff --git a/include/dump.impl b/include/dump.impl index a51973d..a2373fa 100644 --- a/include/dump.impl +++ b/include/dump.impl @@ -31,7 +31,8 @@ void dictionary::dump(std::string const& filename) const { kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = kmer.reverse_complement(m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/kmer.hpp b/include/kmer.hpp index 1200dc2..69affbe 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -87,14 +87,12 @@ struct uint_kmer_t { static constexpr uint16_t uint_kmer_bits = 8 * sizeof(Kmer); static constexpr uint8_t bits_per_char = BitsPerChar; - // max *odd* size that can be packed into uint_kmer_bits bits - static constexpr uint16_t max_k = []() { - uint16_t max_k_any = uint_kmer_bits / bits_per_char; - return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any; - }(); static_assert(uint_kmer_bits % 64 == 0, "Kmer must use 64*k bits"); - static_assert(bits_per_char < 64, "Less than 64 bits per character"); + static_assert(bits_per_char < 64, "BitsPerChar must be less than 64"); + + static constexpr uint16_t max_k = uint_kmer_bits / bits_per_char; + static constexpr uint16_t max_m = 64 / bits_per_char; }; template @@ -104,8 +102,16 @@ struct alpha_kmer_t : uint_kmer_t { static constexpr char const* alphabet = Alphabet; static constexpr uint8_t alphabet_size = std::char_traits::length(Alphabet); + static bool is_valid(char c); static uint64_t char_to_uint(char c); static char uint64_to_char(uint64_t x) { return alphabet[x]; } + + // Revcompl only makes sense for DNA, fallback to noop otherwise + [[maybe_unused]] virtual void reverse_complement_inplace(uint64_t) {} + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { output[i] = input[i]; } + } }; #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING @@ -119,8 +125,10 @@ struct dna_uint_kmer_t : alpha_kmer_t { using base = alpha_kmer_t; using base::uint_kmer_bits; using base::bits_per_char; - using base::max_k; using base::base; + // Use odd lengths for DNA to avoid dealing with self-complements + static constexpr uint16_t max_k = base::max_k - !(base::max_k % 2); + static constexpr uint16_t max_m = base::max_m - !(base::max_m % 2); /* This works with the map: A -> 00; C -> 01; G -> 11; T -> 10. @@ -147,14 +155,13 @@ struct dna_uint_kmer_t : alpha_kmer_t { return res; } - [[maybe_unused]] dna_uint_kmer_t reverse_complement(uint64_t k) { - dna_uint_kmer_t x(*this); + [[maybe_unused]] void reverse_complement_inplace(uint64_t k) override { assert(k <= max_k); - dna_uint_kmer_t res(0); - for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { res.append64(crc64(x.pop64())); } + dna_uint_kmer_t rev(0); + for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { rev.append64(crc64(base::pop64())); } // res is full reverse-complement to x - res.drop(uint_kmer_bits - k * bits_per_char); - return res; + rev.drop(uint_kmer_bits - k * bits_per_char); + *this = rev; } #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING @@ -186,6 +193,92 @@ struct dna_uint_kmer_t : alpha_kmer_t { */ static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } #endif + + /* + Forward character map: + A -> A 65 + C -> C 67 + G -> G 71 + T -> T 84 + a -> a 97 + c -> c 99 + g -> g 103 + t -> t 116 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_forward_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + /* + Reverse character map: + 65 A -> T 84 + 67 C -> G 71 + 71 G -> C 67 + 84 T -> A 65 + 97 a -> t 116 + 99 c -> g 103 + 103 g -> c 99 + 116 t -> a 97 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_reverse_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { + int c = input[i]; + output[size - i - 1] = canonicalize_basepair_reverse_map[c]; + } + } + + static inline bool is_valid(char c) { + return canonicalize_basepair_forward_map[static_cast(c)]; + } +}; + +inline constexpr char amino_acids[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; +template +struct aa_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; + using base::uint_kmer_bits; + using base::bits_per_char; + using base::base; + + static constexpr int8_t char_to_aa[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 25, 23, 24, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + + static bool is_valid(char c) { return ~char_to_aa[static_cast(c)]; } + static uint64_t char_to_uint(char c) { return char_to_aa[static_cast(c)]; } }; // also supports bitpack<__uint128_t, 1>, std::bitset<256>, etc diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index e812e69..a5756e6 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -46,7 +46,8 @@ struct streaming_query_canonical_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { start(); return lookup_result(); @@ -62,7 +63,8 @@ struct streaming_query_canonical_parsing { } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = m_kmer.reverse_complement(m_k); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); @@ -174,7 +176,8 @@ struct streaming_query_canonical_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - kmer_t val_rc = val.reverse_complement(m_k); + kmer_t val_rc = val; + val_rc.reverse_complement_inplace(m_k); uint64_t minimizer = std::min(util::compute_minimizer(val, m_k, m_m, m_seed), util::compute_minimizer(val_rc, m_k, m_m, m_seed)); diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index 360468b..ad58989 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -50,7 +50,8 @@ struct streaming_query_regular_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { m_start = true; return lookup_result(); @@ -66,7 +67,8 @@ struct streaming_query_regular_parsing { } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = m_kmer.reverse_complement(m_k); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); diff --git a/include/statistics.impl b/include/statistics.impl index e9f59fe..c9fef8e 100644 --- a/include/statistics.impl +++ b/include/statistics.impl @@ -28,7 +28,8 @@ void dictionary::compute_statistics() const { kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = kmer.reverse_complement(m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/util.hpp b/include/util.hpp index 18e5a57..0667fda 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -148,66 +148,10 @@ template return str; } -/* - Forward character map: - A -> A 65 - C -> C 67 - G -> G 71 - T -> T 84 - a -> a 97 - c -> c 99 - g -> g 103 - t -> t 116 - All other chars map to zero. -*/ -static const char canonicalize_basepair_forward_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -/* - Reverse character map: - 65 A -> T 84 - 67 C -> G 71 - 71 G -> C 67 - 84 T -> A 65 - 97 a -> t 116 - 99 c -> g 103 - 103 g -> c 99 - 116 t -> a 97 - All other chars map to zero. -*/ -static const char canonicalize_basepair_reverse_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, - uint64_t size) { - for (uint64_t i = 0; i != size; ++i) { - int c = input[i]; - output[size - i - 1] = canonicalize_basepair_reverse_map[c]; - } -} - -static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c]; } - +template [[maybe_unused]] static bool is_valid(char const* str, uint64_t size) { for (uint64_t i = 0; i != size; ++i) { - int c = str[i]; - if (canonicalize_basepair_forward_map[c] == 0) return false; + if (!kmer_t::is_valid(str[i])) { return false; } } return true; } @@ -217,7 +161,7 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] */ template uint64_t compute_minimizer(kmer_t kmer, const uint64_t k, const uint64_t m, const uint64_t seed) { - assert(m <= constants::max_m); + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); kmer_t minimizer = kmer_t(-1); @@ -238,7 +182,7 @@ uint64_t compute_minimizer(kmer_t kmer, const uint64_t k, const uint64_t m, cons template std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { - assert(m <= constants::max_m); + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); kmer_t minimizer = kmer_t(-1); diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index f32dd18..9062b60 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -35,7 +35,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -82,7 +82,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -161,7 +161,7 @@ void perf_test_lookup_weight(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); diff --git a/src/check_utils.hpp b/src/check_utils.hpp index a4d9d10..ab4df99 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -40,7 +40,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& ++num_lines; for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); + assert(util::is_valid(line.data() + i, k)); kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); bool orientation = constants::forward_orientation; @@ -51,7 +51,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& /* transform 50% of the kmers into their reverse complements */ if ((num_kmers & 1) == 0) { - uint_kmer = uint_kmer.reverse_complement(k); + uint_kmer.reverse_complement_inplace(k); orientation = constants::backward_orientation; } @@ -140,7 +140,8 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& // check access dict.access(id, got_kmer_str.data()); kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = got_uint_kmer.reverse_complement(k); + kmer_t got_uint_kmer_rc = got_uint_kmer; + got_uint_kmer_rc.reverse_complement_inplace(k); if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" << expected_kmer_str << "'" << std::endl; @@ -192,7 +193,7 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary(line.data() + i, k)); if (num_kmers != 0 and num_kmers % 5000000 == 0) { std::cout << "checked " << num_kmers << " kmers" << std::endl; } diff --git a/src/permute.cpp b/src/permute.cpp index e716be1..3cb0d5c 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -94,7 +94,7 @@ int permute(int argc, char** argv) { } /* permute and save to output file */ - permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); std::remove(permutation_filename.c_str()); return 0; diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index 701726b..788df29 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) { "c"); for (uint64_t i = 0; i != read.length() - k + 1; ++i) { - bool is_valid = util::is_valid(read.data() + i, k); + bool is_valid = util::is_valid(read.data() + i, k); if (!is_valid) { std::cerr << "ERROR: '" << std::string(read.data() + i, k) << "' is NOT valid!" << std::endl; @@ -79,40 +79,40 @@ int main(int argc, char** argv) { for (uint64_t i = 0; i != 1000; ++i) { // generate a random kmer of length kmer_len random_kmer(kmer.data(), kmer_len); - util::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); + kmer_t::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); kmer_t uint_kmer = util::string_to_uint_kmer(kmer.data(), kmer_len); - uint_kmer = uint_kmer.reverse_complement(kmer_len); + uint_kmer.reverse_complement_inplace(kmer_len); expect(util::uint_kmer_to_string(uint_kmer, kmer_len), rc); } } /****/ - expect(util::canonicalize_basepair_forward_map[int('A')], 'A'); - expect(util::canonicalize_basepair_forward_map[int('a')], 'a'); + expect(kmer_t::canonicalize_basepair_forward_map[int('A')], 'A'); + expect(kmer_t::canonicalize_basepair_forward_map[int('a')], 'a'); - expect(util::canonicalize_basepair_forward_map[int('C')], 'C'); - expect(util::canonicalize_basepair_forward_map[int('c')], 'c'); + expect(kmer_t::canonicalize_basepair_forward_map[int('C')], 'C'); + expect(kmer_t::canonicalize_basepair_forward_map[int('c')], 'c'); - expect(util::canonicalize_basepair_forward_map[int('T')], 'T'); - expect(util::canonicalize_basepair_forward_map[int('t')], 't'); + expect(kmer_t::canonicalize_basepair_forward_map[int('T')], 'T'); + expect(kmer_t::canonicalize_basepair_forward_map[int('t')], 't'); - expect(util::canonicalize_basepair_forward_map[int('G')], 'G'); - expect(util::canonicalize_basepair_forward_map[int('g')], 'g'); + expect(kmer_t::canonicalize_basepair_forward_map[int('G')], 'G'); + expect(kmer_t::canonicalize_basepair_forward_map[int('g')], 'g'); /****/ - expect(util::canonicalize_basepair_reverse_map[int('A')], 'T'); - expect(util::canonicalize_basepair_reverse_map[int('a')], 't'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('A')], 'T'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('a')], 't'); - expect(util::canonicalize_basepair_reverse_map[int('C')], 'G'); - expect(util::canonicalize_basepair_reverse_map[int('c')], 'g'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('C')], 'G'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('c')], 'g'); - expect(util::canonicalize_basepair_reverse_map[int('T')], 'A'); - expect(util::canonicalize_basepair_reverse_map[int('t')], 'a'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('T')], 'A'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('t')], 'a'); - expect(util::canonicalize_basepair_reverse_map[int('G')], 'C'); - expect(util::canonicalize_basepair_reverse_map[int('g')], 'c'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('G')], 'C'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('g')], 'c'); return 0; }