Skip to content

Commit

Permalink
refactor: use bv.rank instead of binary search
Browse files Browse the repository at this point in the history
  • Loading branch information
SGSSGene committed Nov 18, 2024
1 parent 0a096b0 commit f8d691f
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 23 deletions.
25 changes: 10 additions & 15 deletions src/fmindex-collection/suffixarray/CSA.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,22 @@ struct CSA {
accInputSizes.emplace_back(accInputSizes.back() + len);
}

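// Construct a bit vector over the concatenated text with a set bit at the first position of each sequence, so that rank() maps a text position to its sequence id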
auto textSeqStart = bitvector::CompactBitvector{};
for (auto sizes : _inputSizes) {
textSeqStart.push_back(true);
for (size_t i{1}; i < sizes; ++i) {
textSeqStart.push_back(false);
}
}

// Construct sampled suffix array
size_t ssaI{}; // Index of the ssa that is inside of sa
bv.reserve((sa.size()+samplingRate-1) / samplingRate);
for (size_t i{0}; i < sa.size(); ++i) {
auto [subjId, subjPos] = [&]() -> std::tuple<size_t, size_t> {
// find subject id
auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]);
size_t subjId = std::distance(accInputSizes.begin(), iter) - 1;
auto subjId = textSeqStart.rank(sa[i]+1) - 1;

// compute subj position
auto subjPos = sa[i] - accInputSizes[subjId];
Expand All @@ -117,21 +125,8 @@ struct CSA {
return {subjId, subjPos};
}();

//bool sample = (sa[i] % samplingRate) == 0;
bool sample = (subjPos % samplingRate) == 0;
if (sample) {
// find subject id
/* auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]);
size_t subjId = std::distance(accInputSizes.begin(), iter) - 1;
auto subjPos = sa[i] - accInputSizes[subjId];
if (reverse) {
auto len = _inputSizes[subjId];
if (subjPos < len-1) {
subjPos = len - subjPos - 1;
} else {
subjPos = len;
}
}*/
sa[ssaI] = subjPos | (subjId << bitsForPosition);
++ssaI;
}
Expand Down
27 changes: 19 additions & 8 deletions src/fmindex-collection/suffixarray/DenseCSA.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ struct DenseCSA {
largestText = std::max(largestText, len);
}

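// Construct a bit vector over the concatenated text with a set bit at the first position of each sequence, so that rank() maps a text position to its sequence id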
auto textSeqStart = bitvector::CompactBitvector{};
for (auto sizes : _inputSizes) {
textSeqStart.push_back(true);
for (size_t i{1}; i < sizes; ++i) {
textSeqStart.push_back(false);
}
}

// Construct sampled suffix array
size_t bitsForPos = std::max(size_t{1}, size_t(std::ceil(std::log2(largestText))));
seqCount = _inputSizes.size();
Expand All @@ -103,11 +112,11 @@ struct DenseCSA {
ssaPos.reserve(sa.size() / samplingRate);
ssaSeq.reserve(sa.size() / samplingRate);
for (size_t i{0}; i < sa.size(); ++i) {
bool sample = (sa[i] % samplingRate) == 0;
if (sample) {
auto [subjId, subjPos] = [&]() -> std::tuple<size_t, size_t> {
// find subject id
auto iter = std::upper_bound(accInputSizes.begin(), accInputSizes.end(), sa[i]);
size_t subjId = std::distance(accInputSizes.begin(), iter) - 1;
auto subjId = textSeqStart.rank(sa[i]+1) - 1;

// compute subj position
auto subjPos = sa[i] - accInputSizes[subjId];
if (reverse) {
auto len = _inputSizes[subjId];
Expand All @@ -117,14 +126,16 @@ struct DenseCSA {
subjPos = len;
}
}
return {subjId, subjPos};
}();

bool sample = (subjPos % samplingRate) == 0;
if (sample) {
ssaSeq.push_back(subjId);
ssaPos.push_back(subjPos);
}
bv.push_back(sample);
}

this->bv = bitvector::CompactBitvector{sa.size(), [&](size_t idx) {
return (sa[idx] % samplingRate) == 0;
}};
}


Expand Down

0 comments on commit f8d691f

Please sign in to comment.