Skip to content

Commit

Permalink
feat: only using bits for BiFMIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
SGSSGene committed Jul 31, 2024
1 parent 2700686 commit 70b4b59
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 91 deletions.
49 changes: 20 additions & 29 deletions src/fmindex-collection/fmindex/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ auto mergeImpl(FMIndex<OccLhs, TCSA> const& index1, FMIndex<OccRhs, TCSA> const&
if (loc) {
auto [seq, pos] = *loc;
csa.push_back(std::tuple{seq + seqOffset, pos});
} else {
csa.push_back(std::nullopt);
}


};

size_t idx1{}, idx2{};
Expand Down Expand Up @@ -95,16 +99,6 @@ auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> co
// compute normal forward bwt
{
auto R = computeInterleavingR(index1.occ, index2.occ);
/*
// The right hand index sequences, require adjusted sequence number
auto [seqOffset1, seqOffset2] = [&]() -> std::tuple<size_t, size_t> {
if (swapOffsets) {
return {index2.occ.rank(index2.size(), 0), 0};
}
return {0, index1.occ.rank(index1.size(), 0)};
}();
*/

auto addSSAEntry = [&csa](auto const& index, size_t idx, size_t seqOffset) {
auto loc = index.csa.value(idx);
if (loc) {
Expand All @@ -114,39 +108,36 @@ auto mergeImpl(BiFMIndex<OccLhs, TCSA> const& index1, BiFMIndex<OccRhs, TCSA> co
csa.push_back(std::nullopt);
}
};

size_t idx1{};
for (size_t idx2{}; idx2 < R.size(); ++idx2) {
for (; idx1 < R[idx2]; ++idx1) {
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, seqOffset1);
idx1 += 1;
} else {
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset2);
idx2 += 1;
}
mergedBWT.push_back(index2.occ.symbol(idx2));
addSSAEntry(index2, idx2, seqOffset2);
}
for (; idx1 < index1.size(); ++idx1) {
mergedBWT.push_back(index1.occ.symbol(idx1));
addSSAEntry(index1, idx1, seqOffset1);
}
}

// Interleave BWT->R and SA->ssa
auto mergedBWTRev = std::vector<uint8_t>{};
mergedBWTRev.reserve(index1.size() + index2.size());
mergedBWTRev.reserve(mergedBWT.size());

// compute reversed bwt
{
auto R = computeInterleavingR(index1.occRev, index2.occRev);

size_t idx1{};
for (size_t idx2{}; idx2 < R.size(); ++idx2) {
for (; idx1 < R[idx2]; ++idx1) {
size_t idx1{}, idx2{};
for (bool v : R) {
if (!v) {
mergedBWTRev.push_back(index1.occRev.symbol(idx1));
idx1 += 1;
} else {
mergedBWTRev.push_back(index2.occRev.symbol(idx2));
idx2 += 1;
}
mergedBWTRev.push_back(index2.occRev.symbol(idx2));
}
for (; idx1 < index1.size(); ++idx1) {
mergedBWTRev.push_back(index1.occRev.symbol(idx1));
}
}

Expand Down
1 change: 1 addition & 0 deletions src/fmindex-collection/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ auto reconstructText(Index const& index, size_t seqNbr) -> std::vector<uint8_t>
idx = index.occ.rank(idx, c);
r.push_back(c);
} while (c != 0);
r.pop_back(); // remove last zero
std::ranges::reverse(r);
return r;
}
Expand Down
208 changes: 146 additions & 62 deletions src/test_fmindex-collection/fmindex/checkMerge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {
using OccTable = fmindex_collection::occtable::Bitvector<3>;
using Index = fmindex_collection::FMIndex<OccTable>;

auto index1 = Index{data1, /*.samplingRate =*/ 1, /*.threadNbr =*/ 1};
auto index1 = Index{data1, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1};
auto index2 = Index{data2, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1};

auto index12 = merge(index1, index2);
Expand Down Expand Up @@ -47,7 +47,11 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {
{0, 0},
{0, 1},
{0, 2},
{1, 1},
{1, 3},
{1, 5},
{0, 3},
{1, 7},
{0, 7},
{1, 0},
{1, 2},
Expand All @@ -56,10 +60,6 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {
{0, 6},
{0, 5},
{0, 4},
{1, 1},
{1, 5},
{1, 4},
{1, 3},
};
for (size_t i{0}; i < index12.size(); ++i) {
auto idx0 = index12.occ.rank(i, 0);
Expand All @@ -77,8 +77,8 @@ TEST_CASE("checking merging of fmindices", "[FMIndex][merge]") {

auto texts = reconstructText(index12);
REQUIRE(texts.size() == 2);
CHECK(texts[0] == std::vector<uint8_t>{0, 2, 1, 2, 1, 2, 1, 2, 2});
CHECK(texts[1] == std::vector<uint8_t>{0, 1, 1, 1, 1, 2, 2, 2, 2});
CHECK(texts[0] == data2[0]);
CHECK(texts[1] == data1[0]);
}

TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
Expand All @@ -94,64 +94,148 @@ TEST_CASE("checking merging of fmindices", "[BiFMIndex][merge]") {
auto index2 = Index{data2, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1};
auto index3 = Index{data3, /*.samplingRate =*/ 2, /*.threadNbr =*/ 1};

std::cout << "idx: " << "\n";
size_t idx1{};
for (auto _ : data1[0]) {
(void)_;
char c = index1.occ.symbol(idx1);
idx1 = index1.occ.rank(idx1, c);
std::cout << (int)c << " " << idx1 << "\n";
}

for (size_t i{0}; i < index1.size(); ++i) {
auto idx0 = index1.occ.rank(i, 0);
auto idx1 = index1.occ.rank(i, 1);
auto idx2 = index1.occ.rank(i, 2);

auto [seq, pos] = index1.locate(i);

std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n";
}

auto index12 = merge(index1, index2);
std::cout << "reconstruct 12:\n";
for (auto const& t : reconstructText(index12)) {
for (auto c : t) {
std::cout << (int)c;
SECTION("merging index1 and index2 into index12") {
auto index12 = merge(index1, index2);

auto expectedRanks = std::vector<std::tuple<size_t, size_t, size_t>> {
{ 0, 2, 9},
{ 0, 2, 10},
{ 0, 2, 11},
{ 1, 2, 11},
{ 1, 3, 11},
{ 1, 4, 11},
{ 1, 4, 12},
{ 1, 4, 13},
{ 1, 4, 14},
{ 1, 5, 14},
{ 1, 5, 15},
{ 1, 5, 16},
{ 2, 5, 16},
{ 2, 6, 16},
{ 2, 7, 16},
{ 2, 8, 16},
{ 2, 8, 17},
{ 2, 8, 18},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{1, 8},
{0, 0},
{0, 1},
{0, 2},
{1, 1},
{1, 3},
{1, 5},
{0, 3},
{1, 7},
{0, 7},
{1, 0},
{1, 2},
{1, 4},
{1, 6},
{0, 6},
{0, 5},
{0, 4},
};
for (size_t i{0}; i < index12.size(); ++i) {
auto idx0 = index12.occ.rank(i, 0);
auto idx1 = index12.occ.rank(i, 1);
auto idx2 = index12.occ.rank(i, 2);
auto [seq, pos] = index12.locate(i);
INFO(i);
CHECK(std::get<0>(expectedRanks[i]) == idx0);
CHECK(std::get<1>(expectedRanks[i]) == idx1);
CHECK(std::get<2>(expectedRanks[i]) == idx2);
CHECK(std::get<0>(expectedSA[i]) == seq);
CHECK(std::get<1>(expectedSA[i]) == pos);
}
std::cout << "\n";
}
for (size_t i{0}; i < index12.size(); ++i) {
auto idx0 = index12.occ.rank(i, 0);
auto idx1 = index12.occ.rank(i, 1);
auto idx2 = index12.occ.rank(i, 2);

auto [seq, pos] = index12.locate(i);

std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n";
}



auto index123 = merge(index12, index3);


std::cout << "reconstruct 123:\n";
for (auto const& t : reconstructText(index123)) {
for (auto c : t) {
std::cout << (int)c;
auto texts = reconstructText(index12);
REQUIRE(texts.size() == 2);
CHECK(texts[0] == data2[0]);
CHECK(texts[1] == data1[0]);

SECTION("merging index12 and index3 into index123") {
auto index123 = merge(index12, index3);

auto expectedRanks = std::vector<std::tuple<size_t, size_t, size_t>> {
{0, 3, 14},
{0, 3, 15},
{0, 3, 16},
{0, 3, 17},
{1, 3, 17},
{1, 4, 17},
{1, 4, 18},
{2, 4, 18},
{2, 5, 18},
{2, 5, 19},
{2, 5, 20},
{2, 6, 20},
{2, 6, 21},
{2, 7, 21},
{2, 8, 21},
{2, 8, 22},
{2, 8, 23},
{2, 8, 24},
{2, 8, 25},
{3, 8, 25},
{3, 9, 25},
{3, 10, 25},
{3, 11, 25},
{3, 12, 25},
{3, 12, 26},
{3, 13, 26},
{3, 13, 27},
};
auto expectedSA = std::vector<std::tuple<size_t, size_t>> {
{0, 8},
{2, 8},
{1, 8},
{0, 0},
{0, 1},
{2, 4},
{2, 0},
{0, 2},
{1, 1},
{1, 3},
{2, 5},
{1, 5},
{2, 1},
{0, 3},
{2, 7},
{1, 7},
{0, 7},
{2, 3},
{1, 0},
{1, 2},
{1, 4},
{2, 6},
{1, 6},
{0, 6},
{2, 2},
{0, 5},
{0, 4},
};
for (size_t i{0}; i < index123.size(); ++i) {
auto idx0 = index123.occ.rank(i, 0);
auto idx1 = index123.occ.rank(i, 1);
auto idx2 = index123.occ.rank(i, 2);
auto [seq, pos] = index123.locate(i);
INFO(i);
CHECK(std::get<0>(expectedRanks[i]) == idx0);
CHECK(std::get<1>(expectedRanks[i]) == idx1);
CHECK(std::get<2>(expectedRanks[i]) == idx2);
CHECK(std::get<0>(expectedSA[i]) == seq);
CHECK(std::get<1>(expectedSA[i]) == pos);
}


auto texts = reconstructText(index123);
REQUIRE(texts.size() == 3);
CHECK(texts[0] == data3[0]);
CHECK(texts[1] == data2[0]);
CHECK(texts[2] == data1[0]);
}
std::cout << "\n";
}
for (size_t i{0}; i < index123.size(); ++i) {
auto idx0 = index123.occ.rank(i, 0);
auto idx1 = index123.occ.rank(i, 1);
auto idx2 = index123.occ.rank(i, 2);

auto [seq, pos] = index123.locate(i);

std::cout << idx0 << " " << idx1 << " " << idx2 << " : " << seq << " " << pos << "\n";
}


}

0 comments on commit 70b4b59

Please sign in to comment.