Skip to content

Commit

Permalink
Amino acid alphabet k-mer support (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusta authored Jun 20, 2024
1 parent 6b93c03 commit 2a66378
Show file tree
Hide file tree
Showing 15 changed files with 168 additions and 122 deletions.
4 changes: 2 additions & 2 deletions include/builder/build.impl
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ void dictionary<kmer_t>::build(std::string const& filename,
" but got k = " + std::to_string(build_config.k));
}
if (build_config.m == 0) throw std::runtime_error("m must be > 0");
if (build_config.m > constants::max_m) {
throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) +
if (build_config.m > kmer_t::max_m) {
throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) +
" but got m = " + std::to_string(build_config.m));
}
if (build_config.m > build_config.k) throw std::runtime_error("m must be < k");
Expand Down
7 changes: 4 additions & 3 deletions include/builder/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,
assert(end > begin);
char const* super_kmer = sequence.data() + begin;
uint64_t size = (end - begin) + k - 1;
assert(util::is_valid(super_kmer, size));
assert(util::is_valid<kmer_t>(super_kmer, size));

/* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */
uint64_t num_kmers_in_super_kmer = end - begin;
Expand Down Expand Up @@ -164,12 +164,13 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,

while (end != sequence.size() - k + 1) {
char const* kmer = sequence.data() + end;
assert(util::is_valid(kmer, k));
assert(util::is_valid<kmer_t>(kmer, k));
kmer_t uint_kmer = util::string_to_uint_kmer<kmer_t>(kmer, k);
uint64_t minimizer = util::compute_minimizer<kmer_t>(uint_kmer, k, m, seed);

if (build_config.canonical_parsing) {
kmer_t uint_kmer_rc = uint_kmer.reverse_complement(k);
kmer_t uint_kmer_rc = uint_kmer;
uint_kmer_rc.reverse_complement_inplace(k);
uint64_t minimizer_rc = util::compute_minimizer<kmer_t>(uint_kmer_rc, k, m, seed);
minimizer = std::min(minimizer, minimizer_rc);
}
Expand Down
4 changes: 0 additions & 4 deletions include/constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@
#include "kmer.hpp"

namespace sshash::constants {

/* max *odd* size that can be packed into 64 bits */
constexpr uint64_t max_m = 31;

constexpr uint64_t invalid_uint64 = uint64_t(-1);

constexpr uint64_t seed = 1;
Expand Down
10 changes: 6 additions & 4 deletions include/cover/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) {
for (auto weight : weights) output.append(std::to_string(weight) + " ");
}

template <typename kmer_t>
void permute_and_write(std::istream& is, std::string const& output_filename,
std::string const& tmp_dirname, pthash::compact_vector const& permutation,
pthash::bit_vector const& signs, uint64_t k) {
Expand Down Expand Up @@ -255,8 +256,8 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
and reverse the weights in header_sequence */
dna_sequence_rc.resize(dna_sequence.size());
header_sequence_r.clear();
util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(),
dna_sequence.size());
kmer_t::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(),
dna_sequence.size());
reverse_header(header_sequence, header_sequence_r, k);
dna_sequence.swap(dna_sequence_rc);
header_sequence.swap(header_sequence_r);
Expand Down Expand Up @@ -352,6 +353,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
}
}

template <typename kmer_t>
void permute_and_write(std::string const& input_filename, std::string const& output_filename,
std::string const& tmp_dirname, pthash::compact_vector const& permutation,
pthash::bit_vector const& signs, uint64_t k) {
Expand All @@ -360,9 +362,9 @@ void permute_and_write(std::string const& input_filename, std::string const& out
std::cout << "reading file '" << input_filename << "'..." << std::endl;
if (util::ends_with(input_filename, ".gz")) {
zip_istream zis(is);
permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k);
permute_and_write<kmer_t>(zis, output_filename, tmp_dirname, permutation, signs, k);
} else {
permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k);
permute_and_write<kmer_t>(is, output_filename, tmp_dirname, permutation, signs, k);
}
is.close();
}
Expand Down
6 changes: 4 additions & 2 deletions include/dictionary.impl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ lookup_result dictionary<kmer_t>::lookup_uint_regular_parsing(kmer_t uint_kmer)

template <class kmer_t>
lookup_result dictionary<kmer_t>::lookup_uint_canonical_parsing(kmer_t uint_kmer) const {
kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k);
kmer_t uint_kmer_rc = uint_kmer;
uint_kmer_rc.reverse_complement_inplace(m_k);
auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed);
auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed);
uint64_t bucket_id = m_minimizers.lookup(uint64_t(std::min(minimizer, minimizer_rc)));
Expand Down Expand Up @@ -79,7 +80,8 @@ lookup_result dictionary<kmer_t>::lookup_advanced_uint(kmer_t uint_kmer,
auto res = lookup_uint_regular_parsing(uint_kmer);
assert(res.kmer_orientation == constants::forward_orientation);
if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) {
kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k);
kmer_t uint_kmer_rc = uint_kmer;
uint_kmer_rc.reverse_complement_inplace(m_k);
res = lookup_uint_regular_parsing(uint_kmer_rc);
res.kmer_orientation = constants::backward_orientation;
}
Expand Down
3 changes: 2 additions & 1 deletion include/dump.impl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ void dictionary<kmer_t>::dump(std::string const& filename) const {
kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k);
auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed);
if (m_canonical_parsing) {
kmer_t kmer_rc = kmer.reverse_complement(m_k);
kmer_t kmer_rc = kmer;
kmer_rc.reverse_complement_inplace(m_k);
auto [minimizer_rc, pos_rc] =
util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed);
if (minimizer_rc < minimizer) {
Expand Down
119 changes: 106 additions & 13 deletions include/kmer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,12 @@ struct uint_kmer_t {

static constexpr uint16_t uint_kmer_bits = 8 * sizeof(Kmer);
static constexpr uint8_t bits_per_char = BitsPerChar;
// max *odd* size that can be packed into uint_kmer_bits bits
static constexpr uint16_t max_k = []() {
uint16_t max_k_any = uint_kmer_bits / bits_per_char;
return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any;
}();

static_assert(uint_kmer_bits % 64 == 0, "Kmer must use 64*k bits");
static_assert(bits_per_char < 64, "Less than 64 bits per character");
static_assert(bits_per_char < 64, "BitsPerChar must be less than 64");

static constexpr uint16_t max_k = uint_kmer_bits / bits_per_char;
static constexpr uint16_t max_m = 64 / bits_per_char;
};

template <typename Kmer, uint8_t BitsPerChar, char const* Alphabet>
Expand All @@ -104,8 +102,16 @@ struct alpha_kmer_t : uint_kmer_t<Kmer, BitsPerChar> {
static constexpr char const* alphabet = Alphabet;
static constexpr uint8_t alphabet_size = std::char_traits<char>::length(Alphabet);

static bool is_valid(char c);
static uint64_t char_to_uint(char c);
static char uint64_to_char(uint64_t x) { return alphabet[x]; }

// Revcompl only makes sense for DNA, fallback to noop otherwise
[[maybe_unused]] virtual void reverse_complement_inplace(uint64_t) {}
[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output,
uint64_t size) {
for (uint64_t i = 0; i != size; ++i) { output[i] = input[i]; }
}
};

#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING
Expand All @@ -119,8 +125,10 @@ struct dna_uint_kmer_t : alpha_kmer_t<Kmer, 2, nucleotides> {
using base = alpha_kmer_t<Kmer, 2, nucleotides>;
using base::uint_kmer_bits;
using base::bits_per_char;
using base::max_k;
using base::base;
// Use odd lengths for DNA to avoid dealing with self-complements
static constexpr uint16_t max_k = base::max_k - !(base::max_k % 2);
static constexpr uint16_t max_m = base::max_m - !(base::max_m % 2);
/*
This works with the map:
A -> 00; C -> 01; G -> 11; T -> 10.
Expand All @@ -147,14 +155,13 @@ struct dna_uint_kmer_t : alpha_kmer_t<Kmer, 2, nucleotides> {
return res;
}

[[maybe_unused]] dna_uint_kmer_t reverse_complement(uint64_t k) {
dna_uint_kmer_t x(*this);
[[maybe_unused]] void reverse_complement_inplace(uint64_t k) override {
assert(k <= max_k);
dna_uint_kmer_t res(0);
for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { res.append64(crc64(x.pop64())); }
dna_uint_kmer_t rev(0);
for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { rev.append64(crc64(base::pop64())); }
// res is full reverse-complement to x
res.drop(uint_kmer_bits - k * bits_per_char);
return res;
rev.drop(uint_kmer_bits - k * bits_per_char);
*this = rev;
}

#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING
Expand Down Expand Up @@ -186,6 +193,92 @@ struct dna_uint_kmer_t : alpha_kmer_t<Kmer, 2, nucleotides> {
*/
static uint64_t char_to_uint(char c) { return (c >> 1) & 3; }
#endif

/*
Forward character map:
A -> A 65
C -> C 67
G -> G 71
T -> T 84
a -> a 97
c -> c 99
g -> g 103
t -> t 116
All other chars map to zero.
*/
static constexpr char canonicalize_basepair_forward_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/*
Reverse character map:
65 A -> T 84
67 C -> G 71
71 G -> C 67
84 T -> A 65
97 a -> t 116
99 c -> g 103
103 g -> c 99
116 t -> a 97
All other chars map to zero.
*/
static constexpr char canonicalize_basepair_reverse_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output,
uint64_t size) {
for (uint64_t i = 0; i != size; ++i) {
int c = input[i];
output[size - i - 1] = canonicalize_basepair_reverse_map[c];
}
}

static inline bool is_valid(char c) {
return canonicalize_basepair_forward_map[static_cast<size_t>(c)];
}
};

inline constexpr char amino_acids[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX";
template <typename Kmer>
struct aa_uint_kmer_t : alpha_kmer_t<Kmer, 5, amino_acids> {
using base = alpha_kmer_t<Kmer, 5, amino_acids>;
using base::uint_kmer_bits;
using base::bits_per_char;
using base::base;

static constexpr int8_t char_to_aa[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
25, 23, 24, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};

static bool is_valid(char c) { return ~char_to_aa[static_cast<size_t>(c)]; }
static uint64_t char_to_uint(char c) { return char_to_aa[static_cast<size_t>(c)]; }
};

// also supports bitpack<__uint128_t, 1>, std::bitset<256>, etc
Expand Down
9 changes: 6 additions & 3 deletions include/query/streaming_query_canonical_parsing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ struct streaming_query_canonical_parsing {

lookup_result lookup_advanced(const char* kmer) {
/* 1. validation */
bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]);
bool is_valid =
m_start ? util::is_valid<kmer_t>(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]);
if (!is_valid) {
start();
return lookup_result();
Expand All @@ -62,7 +63,8 @@ struct streaming_query_canonical_parsing {
}
m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start);
assert(m_curr_minimizer == util::compute_minimizer<kmer_t>(m_kmer, m_k, m_m, m_seed));
m_kmer_rc = m_kmer.reverse_complement(m_k);
m_kmer_rc = m_kmer;
m_kmer_rc.reverse_complement_inplace(m_k);
constexpr bool reverse = true;
uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse);
assert(minimizer_rc == util::compute_minimizer<kmer_t>(m_kmer_rc, m_k, m_m, m_seed));
Expand Down Expand Up @@ -174,7 +176,8 @@ struct streaming_query_canonical_parsing {
kmer_t val = m_string_iterator.read(2 * m_k);

if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) {
kmer_t val_rc = val.reverse_complement(m_k);
kmer_t val_rc = val;
val_rc.reverse_complement_inplace(m_k);
uint64_t minimizer =
std::min(util::compute_minimizer(val, m_k, m_m, m_seed),
util::compute_minimizer(val_rc, m_k, m_m, m_seed));
Expand Down
6 changes: 4 additions & 2 deletions include/query/streaming_query_regular_parsing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ struct streaming_query_regular_parsing {

lookup_result lookup_advanced(const char* kmer) {
/* 1. validation */
bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]);
bool is_valid =
m_start ? util::is_valid<kmer_t>(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]);
if (!is_valid) {
m_start = true;
return lookup_result();
Expand All @@ -66,7 +67,8 @@ struct streaming_query_regular_parsing {
}
m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start);
assert(m_curr_minimizer == util::compute_minimizer<kmer_t>(m_kmer, m_k, m_m, m_seed));
m_kmer_rc = m_kmer.reverse_complement(m_k);
m_kmer_rc = m_kmer;
m_kmer_rc.reverse_complement_inplace(m_k);
constexpr bool reverse = true;
m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse);
assert(m_curr_minimizer_rc == util::compute_minimizer<kmer_t>(m_kmer_rc, m_k, m_m, m_seed));
Expand Down
3 changes: 2 additions & 1 deletion include/statistics.impl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ void dictionary<kmer_t>::compute_statistics() const {
kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k);
auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed);
if (m_canonical_parsing) {
kmer_t kmer_rc = kmer.reverse_complement(m_k);
kmer_t kmer_rc = kmer;
kmer_rc.reverse_complement_inplace(m_k);
auto [minimizer_rc, pos_rc] =
util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed);
if (minimizer_rc < minimizer) {
Expand Down
Loading

0 comments on commit 2a66378

Please sign in to comment.