From 70b4b59d49c646b3b31834ecd105fe461403fc0d Mon Sep 17 00:00:00 2001 From: Simon Gene Gottlieb Date: Wed, 31 Jul 2024 16:02:16 +0200 Subject: [PATCH] feat: only using bits for BiFMIndex --- src/fmindex-collection/fmindex/merge.h | 49 ++--- src/fmindex-collection/utils.h | 1 + .../fmindex/checkMerge.cpp | 208 ++++++++++++------ 3 files changed, 167 insertions(+), 91 deletions(-) diff --git a/src/fmindex-collection/fmindex/merge.h b/src/fmindex-collection/fmindex/merge.h index cbf9fd1e..cbfdf4ed 100644 --- a/src/fmindex-collection/fmindex/merge.h +++ b/src/fmindex-collection/fmindex/merge.h @@ -54,7 +54,11 @@ auto mergeImpl(FMIndex const& index1, FMIndex const& if (loc) { auto [seq, pos] = *loc; csa.push_back(std::tuple{seq + seqOffset, pos}); + } else { + csa.push_back(std::nullopt); } + + }; size_t idx1{}, idx2{}; @@ -95,16 +99,6 @@ auto mergeImpl(BiFMIndex const& index1, BiFMIndex co // compute normal forward bwt { auto R = computeInterleavingR(index1.occ, index2.occ); -/* - // The right hand index sequences, require adjusted sequence number - auto [seqOffset1, seqOffset2] = [&]() -> std::tuple { - if (swapOffsets) { - return {index2.occ.rank(index2.size(), 0), 0}; - } - return {0, index1.occ.rank(index1.size(), 0)}; - }(); -*/ - auto addSSAEntry = [&csa](auto const& index, size_t idx, size_t seqOffset) { auto loc = index.csa.value(idx); if (loc) { @@ -114,39 +108,36 @@ auto mergeImpl(BiFMIndex const& index1, BiFMIndex co csa.push_back(std::nullopt); } }; - - size_t idx1{}; - for (size_t idx2{}; idx2 < R.size(); ++idx2) { - for (; idx1 < R[idx2]; ++idx1) { + size_t idx1{}, idx2{}; + for (bool v : R) { + if (!v) { mergedBWT.push_back(index1.occ.symbol(idx1)); addSSAEntry(index1, idx1, seqOffset1); + idx1 += 1; + } else { + mergedBWT.push_back(index2.occ.symbol(idx2)); + addSSAEntry(index2, idx2, seqOffset2); + idx2 += 1; } - mergedBWT.push_back(index2.occ.symbol(idx2)); - addSSAEntry(index2, idx2, seqOffset2); - } - for (; idx1 < index1.size(); ++idx1) { - mergedBWT.push_back(index1.occ.symbol(idx1)); - addSSAEntry(index1, idx1, seqOffset1); } } // Interleave BWT->R and SA->ssa auto mergedBWTRev = std::vector{}; - mergedBWTRev.reserve(index1.size() + index2.size()); + mergedBWTRev.reserve(mergedBWT.size()); // compute reversed bwt { auto R = computeInterleavingR(index1.occRev, index2.occRev); - - size_t idx1{}; - for (size_t idx2{}; idx2 < R.size(); ++idx2) { - for (; idx1 < R[idx2]; ++idx1) { + size_t idx1{}, idx2{}; + for (bool v : R) { + if (!v) { mergedBWTRev.push_back(index1.occRev.symbol(idx1)); + idx1 += 1; + } else { + mergedBWTRev.push_back(index2.occRev.symbol(idx2)); + idx2 += 1; } - mergedBWTRev.push_back(index2.occRev.symbol(idx2)); - } - for (; idx1 < index1.size(); ++idx1) { - mergedBWTRev.push_back(index1.occRev.symbol(idx1)); } } diff --git a/src/fmindex-collection/utils.h b/src/fmindex-collection/utils.h index 383c3c56..338a6e0e 100644 --- a/src/fmindex-collection/utils.h +++ b/src/fmindex-collection/utils.h @@ -226,6 +226,7 @@ auto reconstructText(Index const& index, size_t seqNbr) -> std::vector idx = index.occ.rank(idx, c); r.push_back(c); } while (c != 0); + r.pop_back(); // remove last zero std::ranges::reverse(r); return r; } diff --git a/src/test_fmindex-collection/fmindex/checkMerge.cpp b/src/test_fmindex-collection/fmindex/checkMerge.cpp index ca7ac64c..5fd5f5c2 100644 --- a/src/test_fmindex-collection/fmindex/checkMerge.cpp +++ b/src/test_fmindex-collection/fmindex/checkMerge.cpp @@ -16,7 +16,7 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") { using OccTable = fmindex_collection::occtable::Bitvector<3>; using Index = fmindex_collection::FMIndex; - auto index1 = Index{data1, /*.samplingRate =*/ 1, /*.threadNbr =*/ 1}; + auto index1 = Index{data1, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1}; auto index2 = Index{data2, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1}; auto index12 = merge(index1, index2); @@ -47,7 +47,11 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") { {0, 0}, {0, 1}, {0, 2}, + {1, 1}, + {1, 3}, + {1, 5}, {0, 3}, + {1, 7}, {0, 7}, {1, 0}, {1, 2}, @@ -56,10 +60,6 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") { {0, 6}, {0, 5}, {0, 4}, - {1, 1}, - {1, 5}, - {1, 4}, - {1, 3}, }; for (size_t i{0}; i < index12.size(); ++i) { auto idx0 = index12.occ.rank(i, 0); @@ -77,8 +77,8 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") { auto texts = reconstructText(index12); REQUIRE(texts.size() == 2); - CHECK(texts[0] == std::vector{0, 2, 1, 2, 1, 2, 1, 2, 2}); - CHECK(texts[1] == std::vector{0, 1, 1, 1, 1, 2, 2, 2, 2}); + CHECK(texts[0] == data2[0]); + CHECK(texts[1] == data1[0]); } TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") { @@ -94,64 +94,148 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") { auto index2 = Index{data2, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1}; auto index3 = Index{data3, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1}; - std::cout << "idx: " << "\n"; - size_t idx1{}; - for (auto _ : data1[0]) { - (void)_; - char c = index1.occ.symbol(idx1); - idx1 = index1.occ.rank(idx1, c); - std::cout << (int)c << " " << idx1 << "\n"; - } - - for (size_t i{0}; i < index1.size(); ++i) { - auto idx0 = index1.occ.rank(i, 0); - auto idx1 = index1.occ.rank(i, 1); - auto idx2 = index1.occ.rank(i, 2); - - auto [seq, pos] = index1.locate(i); - - std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n"; - } - - auto index12 = merge(index1, index2); - std::cout << "reconstruct 12:\n"; - for (auto const& t : reconstructText(index12)) { - for (auto c : t) { - std::cout << (int)c; + SECTION("merging index1 and index2 into index12") { + auto index12 = merge(index1, index2); + + auto expectedRanks = std::vector> { + { 0, 2, 9}, + { 0, 2, 10}, + { 0, 2, 11}, + { 1, 2, 11}, + { 1, 3, 11}, + { 1, 4, 11}, + { 1, 4, 12}, + { 1, 4, 13}, + { 1, 4, 14}, + { 1, 5, 14}, + { 1, 5, 15}, + { 1, 5, 16}, + { 2, 5, 16}, + { 2, 6, 16}, + { 2, 7, 16}, + { 2, 8, 16}, + { 2, 8, 17}, + { 2, 8, 18}, + }; + auto expectedSA = std::vector> { + {0, 8}, + {1, 8}, + {0, 0}, + {0, 1}, + {0, 2}, + {1, 1}, + {1, 3}, + {1, 5}, + {0, 3}, + {1, 7}, + {0, 7}, + {1, 0}, + {1, 2}, + {1, 4}, + {1, 6}, + {0, 6}, + {0, 5}, + {0, 4}, + }; + for (size_t i{0}; i < index12.size(); ++i) { + auto idx0 = index12.occ.rank(i, 0); + auto idx1 = index12.occ.rank(i, 1); + auto idx2 = index12.occ.rank(i, 2); + auto [seq, pos] = index12.locate(i); + INFO(i); + CHECK(std::get<0>(expectedRanks[i]) == idx0); + CHECK(std::get<1>(expectedRanks[i]) == idx1); + CHECK(std::get<2>(expectedRanks[i]) == idx2); + CHECK(std::get<0>(expectedSA[i]) == seq); + CHECK(std::get<1>(expectedSA[i]) == pos); } - std::cout << "\n"; - } - for (size_t i{0}; i < index12.size(); ++i) { - auto idx0 = index12.occ.rank(i, 0); - auto idx1 = index12.occ.rank(i, 1); - auto idx2 = index12.occ.rank(i, 2); - - auto [seq, pos] = index12.locate(i); - - std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n"; - } - - auto index123 = merge(index12, index3); - - - std::cout << "reconstruct 123:\n"; - for (auto const& t : reconstructText(index123)) { - for (auto c : t) { - std::cout << (int)c; + auto texts = reconstructText(index12); + REQUIRE(texts.size() == 2); + CHECK(texts[0] == data2[0]); + CHECK(texts[1] == data1[0]); + + SECTION("merging index12 and index3 into index123") { + auto index123 = merge(index12, index3); + + auto expectedRanks = std::vector> { + {0, 3, 14}, + {0, 3, 15}, + {0, 3, 16}, + {0, 3, 17}, + {1, 3, 17}, + {1, 4, 17}, + {1, 4, 18}, + {2, 4, 18}, + {2, 5, 18}, + {2, 5, 19}, + {2, 5, 20}, + {2, 6, 20}, + {2, 6, 21}, + {2, 7, 21}, + {2, 8, 21}, + {2, 8, 22}, + {2, 8, 23}, + {2, 8, 24}, + {2, 8, 25}, + {3, 8, 25}, + {3, 9, 25}, + {3, 10, 25}, + {3, 11, 25}, + {3, 12, 25}, + {3, 12, 26}, + {3, 13, 26}, + {3, 13, 27}, + }; + auto expectedSA = std::vector> { + {0, 8}, + {2, 8}, + {1, 8}, + {0, 0}, + {0, 1}, + {2, 4}, + {2, 0}, + {0, 2}, + {1, 1}, + {1, 3}, + {2, 5}, + {1, 5}, + {2, 1}, + {0, 3}, + {2, 7}, + {1, 7}, + {0, 7}, + {2, 3}, + {1, 0}, + {1, 2}, + {1, 4}, + {2, 6}, + {1, 6}, + {0, 6}, + {2, 2}, + {0, 5}, + {0, 4}, + }; + for (size_t i{0}; i < index123.size(); ++i) { + auto idx0 = index123.occ.rank(i, 0); + auto idx1 = index123.occ.rank(i, 1); + auto idx2 = index123.occ.rank(i, 2); + auto [seq, pos] = index123.locate(i); + INFO(i); + CHECK(std::get<0>(expectedRanks[i]) == idx0); + CHECK(std::get<1>(expectedRanks[i]) == idx1); + CHECK(std::get<2>(expectedRanks[i]) == idx2); + CHECK(std::get<0>(expectedSA[i]) == seq); + CHECK(std::get<1>(expectedSA[i]) == pos); + } + + + auto texts = reconstructText(index123); + REQUIRE(texts.size() == 3); + CHECK(texts[0] == data3[0]); + CHECK(texts[1] == data2[0]); + CHECK(texts[2] == data1[0]); } - std::cout << "\n"; - } - for (size_t i{0}; i < index123.size(); ++i) { - auto idx0 = index123.occ.rank(i, 0); - auto idx1 = index123.occ.rank(i, 1); - auto idx2 = index123.occ.rank(i, 2); - - auto [seq, pos] = index123.locate(i); - - std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n"; } - - }