Skip to content

Commit

Permalink
feat: using bits instead of size_t for merging FMIndices
Browse files Browse the repository at this point in the history
  • Loading branch information
SGSSGene committed Jul 31, 2024
1 parent f0b5d13 commit bcfbb40
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 49 deletions.
46 changes: 25 additions & 21 deletions src/fmindex-collection/fmindex/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,37 @@ namespace fmindex_collection {
* creates the R array for interleaving FMIndices
*/
template <typename OccLhs, typename OccRhs, typename value_t = size_t>
auto computeInterleavingR(OccLhs const& lhsOcc, OccRhs const& rhsOcc) -> std::vector<value_t> {
auto computeInterleavingR(OccLhs const& lhsOcc, OccRhs const& rhsOcc) -> std::vector<bool> {
if (rhsOcc.size() > std::numeric_limits<value_t>::max()) {
throw std::runtime_error{"Can not create interleaving R for this value type, index to large"};
}

auto R = std::vector<value_t>{};
R.resize(rhsOcc.size(), 0);
auto R = std::vector<bool>{};
// auto R = std::vector<value_t>{};
R.resize(lhsOcc.size() + rhsOcc.size(), false);

size_t idx1{};
size_t idx2{};

for (size_t i{0}; i < R.size(); ++i) {
for (size_t i{0}; i < rhsOcc.size(); ++i) {
auto c = rhsOcc.symbol(idx2);
idx1 = lhsOcc.rank(idx1, c);
idx2 = rhsOcc.rank(idx2, c);
R[idx2] = idx1;
R[idx1 + idx2] = true;
}
return R;
}



//!TODO swap index1 and index2 if index1 is smaller
template <typename Res = void, typename OccLhs, typename OccRhs, typename TCSA>
auto merge(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const& index2) -> FMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
auto mergeImpl(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const& index2, size_t seqOffset1, size_t seqOffset2) -> FMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
auto R = computeInterleavingR(index1.occ, index2.occ);

// Interleave BWT->R and SA->ssa
auto mergedBWT = std::vector<uint8_t>{};
mergedBWT.reserve(index1.size() + index2.size());

// The right hand index sequences, require adjusted sequence number
size_t seqOffset = index1.occ.rank(index1.size(), 0);

using CSA = decltype(index1.csa);
auto csa = CSA::createJoinedCSA(index1.csa, index2.csa);

Expand All @@ -60,26 +57,33 @@ auto merge(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const& ind
}
};

size_t idx1{};
for (size_t idx2{}; idx2 < R.size(); ++idx2) {
for (; idx1 < R[idx2]; ++idx1) {
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, 0);
addSSAEntry(index1, idx1, seqOffset1);
idx1 += 1;
} else {
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset2);
idx2 += 1;
}
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset);
}
for (; idx1 < index1.size(); ++idx1) {
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, 0);
}
R.clear();


return {mergedBWT, std::move(csa)};
}

//!TODO swap index1 and index2 if index1 is smaller
template <typename Res = void, typename OccLhs, typename OccRhs, typename TCSA>
auto merge(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const& index2) -> FMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
if (index1.size() >= index2.size()) {
return mergeImpl<Res>(index1, index2, 0, index1.occ.rank(index1.size(), 0));
}
return mergeImpl<Res>(index2, index1, index2.occ.rank(index2.size(), 0), 0);
}


template <typename Res = void, typename OccLhs, typename OccRhs, typename TCSA>
auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> const& index2, size_t seqOffset1, size_t seqOffset2) -> BiFMIndex<std::conditional_t<std::is_void_v<Res>, OccLhs, Res>, TCSA> {
auto csa = TCSA::createJoinedCSA(index1.csa, index2.csa);
Expand Down
77 changes: 49 additions & 28 deletions src/test_fmindex-collection/fmindex/checkMerge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,45 +19,66 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {
auto index1 = Index{data1, /*.samplingRate =*/ 1, /*.threadNbr =*/ 1};
auto index2 = Index{data2, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1};

std::cout << "idx: " << "\n";
size_t idx1{};
for (auto _ : data1[0]) {
(void)_;
char c = index1.occ.symbol(idx1);
idx1 = index1.occ.rank(idx1, c);
std::cout << (int)c << " " << idx1 << "\n";
}

std::cout << "Occ index1:\n";
for (size_t i{0}; i < index1.size(); ++i) {
auto idx0 = index1.occ.rank(i, 0);
auto idx1 = index1.occ.rank(i, 1);
auto idx2 = index1.occ.rank(i, 2);

std::cout << idx0 << " " << idx1 << " " << idx2 << "\n";
}

auto index12 = merge(index1, index2);

std::cout << "Occ index12:\n";
auto expectedRanks = std::vector<std::tuple<size_t, size_t, size_t>> {
{ 0, 2, 9},
{ 0, 2, 10},
{ 0, 2, 11},
{ 1, 2, 11},
{ 1, 3, 11},
{ 1, 4, 11},
{ 1, 4, 12},
{ 1, 4, 13},
{ 1, 4, 14},
{ 1, 5, 14},
{ 1, 5, 15},
{ 1, 5, 16},
{ 2, 5, 16},
{ 2, 6, 16},
{ 2, 7, 16},
{ 2, 8, 16},
{ 2, 8, 17},
{ 2, 8, 18},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{1, 8},
{0, 0},
{0, 1},
{0, 2},
{0, 3},
{0, 7},
{1, 0},
{1, 2},
{1, 4},
{1, 6},
{0, 6},
{0, 5},
{0, 4},
{1, 1},
{1, 5},
{1, 4},
{1, 3},
};
for (size_t i{0}; i < index12.size(); ++i) {
auto idx0 = index12.occ.rank(i, 0);
auto idx1 = index12.occ.rank(i, 1);
auto idx2 = index12.occ.rank(i, 2);
auto [seq, pos] = index12.locate(i);

std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n";
INFO(i);
CHECK(std::get<0>(expectedRanks[i]) == idx0);
CHECK(std::get<1>(expectedRanks[i]) == idx1);
CHECK(std::get<2>(expectedRanks[i]) == idx2);
CHECK(std::get<0>(expectedSA[i]) == seq);
CHECK(std::get<1>(expectedSA[i]) == pos);
}


std::cout << "reconstruct:\n";
auto texts = reconstructText(index12);
for (auto const& t : texts) {
for (auto c : t) {
std::cout << (int)c;
}
std::cout << "\n";
}
REQUIRE(texts.size() == 2);
CHECK(texts[0] == std::vector<uint8_t>{0, 2, 1, 2, 1, 2, 1, 2, 2});
CHECK(texts[1] == std::vector<uint8_t>{0, 1, 1, 1, 1, 2, 2, 2, 2});
}

TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
Expand Down

0 comments on commit bcfbb40

Please sign in to comment.