Skip to content

Commit

Permalink
Merge pull request #45 from SGSSGene/feat/merging_fm_indices
Browse files Browse the repository at this point in the history
Feat/merging fm indices
  • Loading branch information
SGSSGene authored Jun 16, 2024
2 parents a47a934 + d3d1257 commit 8ca2e52
Show file tree
Hide file tree
Showing 44 changed files with 942 additions and 292 deletions.
15 changes: 10 additions & 5 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,21 @@ jobs:
- {os: ubuntu-22.04, compiler: gcc14-cpp20-debug-sanitize_undefined}
- {os: ubuntu-22.04, compiler: gcc14-cpp20-debug-sanitize_thread}
- {os: ubuntu-22.04, compiler: gcc14-cpp20-lcov}
- {os: ubuntu-22.04, compiler: gcc14-cpp23-release}
- {os: ubuntu-22.04, compiler: clang16-cpp20-release}
- {os: ubuntu-22.04, compiler: clang17-cpp20-release}
- {os: macos-12, compiler: gcc13-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: macos-12, compiler: clang17-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: windows-2022, compiler: msvc-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: windows-2022, compiler: msvc-cpp20-debug, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: macos-12, compiler: gcc14-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: macos-12, compiler: clang17-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: windows-2022, compiler: msvc-cpp20-release, cmake_flags: "-DFMC_USE_SDSL=NO"}
# not running any tests, since it takes too long
- {os: windows-2022, compiler: msvc-cpp20-debug-notests, cmake_flags: "-DFMC_USE_SDSL=NO"}
- {os: ubuntu-22.04, compiler: emscripten-cpp20-release, cmake_flags: "-DCMAKE_EXE_LINKER_FLAGS='-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=1000000'", cmake_cxx_flags: "-Wno-c++11-narrowing"}
steps:
- name: Standard IV-project testing
uses: iv-project/IVaction@v9.10
uses: iv-project/IVaction@v9.12
with:
compiler: ${{ matrix.compiler }}
threads: 2
cmake_flags: ${{ matrix.cmake_flags }}
cmake_c_flags: ${{ matrix.cmake_c_flags }}
cmake_cxx_flags: ${{ matrix.cmake_cxx_flags }}
2 changes: 1 addition & 1 deletion fmindex_collection-config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ endif()

set(LIBSAIS_USE_OPENMP ${OpenMP_C_FOUND})
set(LIBSAIS_BUILD_SHARED_LIB OFF)
CPMAddPackage("gh:IlyaGrebnov/libsais@2.8.1")
CPMAddPackage("gh:IlyaGrebnov/libsais@2.8.2")

if (FMC_USE_SDSL)
CPMAddPackage(
Expand Down
10 changes: 10 additions & 0 deletions src/example/argp.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ struct Config {
size_t minK{0}, maxK{6}, k_stepSize{1};
bool reverse{true};
bool help{false};
bool partialBuildUp{false};
size_t threads{1};
std::set<std::string> extensions;
bool convertUnknownChar{false};

std::vector<std::string> algorithms;

Expand Down Expand Up @@ -57,6 +60,9 @@ auto loadConfig(int argc, char const* const* argv) {
} else if (argv[i] == std::string{"--queries"} and i+1 < argc) {
++i;
config.maxQueries = std::stod(argv[i]);
} else if (argv[i] == std::string{"--threads"} and i+1 < argc) {
++i;
config.threads = std::stod(argv[i]);
} else if (argv[i] == std::string{"--read_length"} and i+1 < argc) {
++i;
config.readLength = std::stod(argv[i]);
Expand All @@ -76,6 +82,10 @@ auto loadConfig(int argc, char const* const* argv) {
config.reverse = false;
} else if (argv[i] == std::string{"--help"}) {
config.help = true;
} else if (argv[i] == std::string{"--partialBuildUp"}) {
config.partialBuildUp = true;
} else if (argv[i] == std::string{"--convertUnknownChar"}) {
config.convertUnknownChar = true;
} else if (argv[i] == std::string{"--mode"} and i+1 < argc) {
++i;
auto s = std::string{argv[i]};
Expand Down
9 changes: 5 additions & 4 deletions src/example/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ int main(int argc, char const* const* argv) {
, ext, gens);
return 0;
}
auto const [queries, queryInfos] = loadQueries<Sigma>(config.queryPath, config.reverse);
auto const [queries, queryInfos] = loadQueries<Sigma>(config.queryPath, config.reverse, config.convertUnknownChar);

if (!queries.empty()) {
fmt::print("loaded {} queries (incl reverse complements)\n", queries.size());
Expand All @@ -89,7 +89,8 @@ int main(int argc, char const* const* argv) {
}
fmt::print("start loading {} ...", name);
fflush(stdout);
auto index = loadDenseIndex<CSA, Table>(config.indexPath, /*samplingRate*/16, /*threadNbr*/1);
size_t samplingRate = 16;
auto index = loadDenseIndex<CSA, Table>(config.indexPath, samplingRate, config.threads, config.partialBuildUp, config.convertUnknownChar);
fmt::print("done\n");
for (auto const& algorithm : config.algorithms) {
fmt::print("using algorithm {}\n", algorithm);
Expand Down Expand Up @@ -231,7 +232,7 @@ int main(int argc, char const* const* argv) {
if (algorithm.size() == 15 && algorithm.substr(0, 13) == "pseudo_fmtree") {
size_t maxDepth = std::stod(algorithm.substr(13, 2));
for (auto const& [queryId, cursor, e] : resultCursors) {
for (auto [seqId, pos] : LocateFMTree{index, cursor, maxDepth}) {
for (auto [seqId, pos] : LocateFMTree{index, cursor, samplingRate, maxDepth}) {
results.emplace_back(queryId, seqId, pos, e);
}
resultCt += cursor.len;
Expand All @@ -240,7 +241,7 @@ int main(int argc, char const* const* argv) {
for (auto const& [queryId, cursor, e] : resultCursors) {
locateFMTree<16>(index, cursor, [&, &queryId=queryId, &e=e](size_t seqId, size_t pos) {
results.emplace_back(queryId, seqId, pos, e);
});
}, samplingRate);
resultCt += cursor.len;
}

Expand Down
136 changes: 128 additions & 8 deletions src/example/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "utils/utils.h"

#include <fmindex-collection/fmindex-collection.h>
#include <fmindex-collection/fmindex/merge.h>
#include <fmindex-collection/occtable/all.h>
#include <fmindex-collection/suffixarray/DenseCSA.h>

Expand All @@ -24,7 +25,7 @@ struct Query {
{}
};
template <size_t Sigma>
auto loadQueries(std::string path, bool reverse) {
auto loadQueries(std::string path, bool reverse, bool convertUnknownChar) {
std::vector<std::vector<uint8_t>> queries;
std::vector<Query> queryInfos;
if (path.empty() || !std::filesystem::exists(path)) {
Expand Down Expand Up @@ -86,7 +87,15 @@ auto loadQueries(std::string path, bool reverse) {
else if ((*ptr == 'N' || *ptr == 'n') and Sigma == 6) query.push_back(5);
else if (*ptr == '\n') {}
else {
throw std::runtime_error("unknown alphabet");
if (convertUnknownChar) {
if (Sigma == 6) {
query.push_back(5);
} else {
query.push_back(1);
}
} else {
throw std::runtime_error("unknown alphabet");
}
}
++ptr;
}
Expand All @@ -97,12 +106,27 @@ auto loadQueries(std::string path, bool reverse) {
}

template <typename CSA, typename Table>
auto loadIndex(std::string path, size_t samplingRate, size_t threadNbr) {
auto loadIndex(std::string path, size_t samplingRate, size_t threadNbr, bool convertUnknownChar) {
auto sw = StopWatch{};
auto indexPath = path + "." + Table::extension() + ".index";
if (!std::filesystem::exists(indexPath)) {
auto [ref, refInfo] = loadQueries<Table::Sigma>(path, false);
auto index = fmindex_collection::BiFMIndex<Table>{ref, samplingRate, threadNbr};
auto [ref, refInfo] = loadQueries<Table::Sigma>(path, false, convertUnknownChar);
auto index = [&]() {
auto refs = std::vector<std::vector<uint8_t>>{};
refs.resize(1);

auto index = std::optional<fmindex_collection::BiFMIndex<Table>>{};
for (auto& r : ref) {
refs[0] = std::move(r);
auto newIndex = fmindex_collection::BiFMIndex<Table>{refs, samplingRate, threadNbr};
if (!index) {
index = std::move(newIndex);
} else {
index = merge(*index, newIndex);
}
}
return *index;
}();
// save index here
auto ofs = std::ofstream{indexPath, std::ios::binary};
auto archive = cereal::BinaryOutputArchive{ofs};
Expand All @@ -119,12 +143,107 @@ auto loadIndex(std::string path, size_t samplingRate, size_t threadNbr) {
}

template <typename CSA, typename Table>
auto loadDenseIndex(std::string path, size_t samplingRate, size_t threadNbr) {
auto loadDenseIndex(std::string path, size_t samplingRate, size_t threadNbr, bool partialBuildUp, bool convertUnknownChar) {
auto sw = StopWatch{};
auto indexPath = path + "." + Table::extension() + ".dense.index";
if (!std::filesystem::exists(indexPath)) {
auto [ref, refInfo] = loadQueries<Table::Sigma>(path, false);
auto index = fmindex_collection::BiFMIndex<Table, fmindex_collection::DenseCSA>{ref, samplingRate, threadNbr};
auto [ref, refInfo] = loadQueries<Table::Sigma>(path, false, convertUnknownChar);
using Index = fmindex_collection::BiFMIndex<Table, fmindex_collection::DenseCSA>;
// Builds the index either in one shot or, when `partialBuildUp` is set, from
// several partial indices that are merged together. Which merge strategy is
// compiled in is selected by the preprocessor branches below.
auto index = [&]() -> Index {
    // One-shot construction. This path is also taken for an empty `ref`:
    // there is nothing to build up partially, and each merge strategy below
    // would otherwise dereference an empty optional or call back() on an
    // empty vector (undefined behaviour).
    if (!partialBuildUp || ref.empty()) {
        return {ref, samplingRate, threadNbr};
    }
    // Length of the longest single sequence; used as the batch size limit of
    // the active strategy.
    auto longestRef = std::accumulate(ref.begin(), ref.end(), size_t{}, [](size_t a, auto const& v) {
        return std::max(a, v.size());
    });
    std::cout << "longest ref: " << longestRef << "\n";
#if 0
    // Strategy (disabled): index every sequence on its own and merge it
    // straight into the accumulated index.
    auto refs = std::vector<std::vector<uint8_t>>{};
    refs.resize(1);
    auto index = std::optional<Index>{};
    for (auto& r : ref) {
        refs[0] = std::move(r);
        std::cout << "indexing " << refs[0].size() << "\n";
        auto newIndex = Index{refs, samplingRate, threadNbr};
        if (!index) {
            index = std::move(newIndex);
        } else {
            std::cout << "merging " << index->size() << " + " << refs[0].size() << "\n";
            index = merge(*index, newIndex);
        }
    }
    return std::move(*index);
#elif 1
    // Strategy (active): collect consecutive sequences into batches whose
    // total length stays at most `longestRef`, index each batch and merge it
    // into the accumulated index.
    auto refs = std::vector<std::vector<uint8_t>>{};
    auto index = std::optional<Index>{};
    size_t acc = 0; // summed length of the sequences currently in `refs`
    // Indexes the pending batch, merges it into `index` and resets the batch.
    auto makePartialIndex = [&]() {
        if (refs.empty()) return;
        std::cout << "indexing " << acc << "\n";
        auto newIndex = Index{refs, samplingRate, threadNbr};
        if (!index) {
            index = std::move(newIndex);
        } else {
            std::cout << "merging " << index->size() << " + " << acc << "\n";
            index = merge(*index, newIndex);
        }

        acc = 0;
        refs.clear();
    };
    for (size_t i{}; i < ref.size(); ++i) {
        // Flush first if adding this sequence would reach the limit.
        if (ref[i].size() + acc >= longestRef) {
            makePartialIndex();
        }
        refs.emplace_back(std::move(ref[i]));
        acc += refs.back().size();
    }
    // Flush the final batch; `ref` is non-empty here, so `index` gets set.
    makePartialIndex();
    return std::move(*index);

#else
    // Strategy (disabled): keep a list of partial indices sorted by size
    // (largest first) and merge the two smallest while the second smallest
    // is at most twice as large as the smallest.
    auto refs = std::vector<std::vector<uint8_t>>{};
    refs.resize(1);
    auto indices = std::vector<Index>{};

    auto sort = [&]() {
        std::sort(indices.begin(), indices.end(), [](auto const& lhs, auto const& rhs) {
            return lhs.size() > rhs.size();
        });
    };
    for (auto& r : ref) {
        refs[0] = std::move(r);
        std::cout << "indexing " << refs[0].size() << " " << indices.size() << "\n";
        auto newIndex = Index{refs, samplingRate, threadNbr};
        indices.emplace_back(std::move(newIndex));
        sort();

        while (indices.size() > 1) {
            auto const& l1 = *(indices.end()-1);
            auto const& l2 = *(indices.end()-2);
            if (l2.size() > l1.size()*2) {
                break;
            }
            std::cout << "merging " << l2.size() << " + " << l1.size() << " " << indices.size() << "\n";
            // The merged index is created before its two inputs are popped.
            auto newIndex = merge(l2, l1);
            indices.pop_back(); indices.pop_back();
            indices.emplace_back(std::move(newIndex));
            sort();
        }
    }
    // Merge whatever is left, regardless of the size ratio.
    while (indices.size() > 1) {
        auto const& l1 = *(indices.end()-1);
        auto const& l2 = *(indices.end()-2);
        std::cout << "merging " << l2.size() << " + " << l1.size() << " " << indices.size() << "(fin)\n";
        auto newIndex = merge(l2, l1);
        indices.pop_back(); indices.pop_back();
        indices.emplace_back(std::move(newIndex));
        sort();
    }
    return std::move(indices.back());
#endif
}();

// save index here
auto ofs = std::ofstream{indexPath, std::ios::binary};
auto archive = cereal::BinaryOutputArchive{ofs};
Expand All @@ -149,6 +268,7 @@ void visitAllTables(CB cb) {
cb.template operator()<fmindex_collection::occtable::compactBitvectorPrefix::OccTable<Sigma>>();
cb.template operator()<fmindex_collection::occtable::interleaved8::OccTable<Sigma>>();*/
cb.template operator()<fmindex_collection::occtable::Interleaved_16<Sigma>>();
cb.template operator()<fmindex_collection::occtable::L1Bitvector<Sigma>>();
/* cb.template operator()<fmindex_collection::occtable::interleaved32::OccTable<Sigma>>();
cb.template operator()<fmindex_collection::occtable::interleaved8Aligned::OccTable<Sigma>>();
cb.template operator()<fmindex_collection::occtable::interleaved16Aligned::OccTable<Sigma>>();
Expand Down
24 changes: 13 additions & 11 deletions src/example/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ auto benchmarkTable(std::string name, T const& bwt) -> Result {

result.expectedMemory = s;

if (s < 1024ull*1024*1024*8ull) {
static_assert(sizeof(size_t) == 8);

if (s < size_t{1024}*1024*1024*8) {
StopWatch watch;
auto table = Table{bwt};

Expand Down Expand Up @@ -156,9 +158,9 @@ auto benchmarkTable(std::string name, T const& bwt) -> Result {
result.benchV2 = watch.reset();
}
{ // benchmark V3
uint64_t jumps{1};
uint64_t pos = table.rank(0, bwt[0]);
uint64_t a{};
size_t jumps{1};
size_t pos = table.rank(0, bwt[0]);
size_t a{};
while (pos != 0 && jumps/2 < bwt.size()) {
jumps += 1;
pos = table.rank(pos, bwt[pos]);
Expand All @@ -169,9 +171,9 @@ auto benchmarkTable(std::string name, T const& bwt) -> Result {
result.benchV3 = watch.reset();
}
{ // benchmark V4
uint64_t a{};
uint64_t jumps{1};
uint64_t pos = table.rank(0, bwt[0]);
size_t a{};
size_t jumps{1};
size_t pos = table.rank(0, bwt[0]);
while (pos != 0 && jumps/2 < bwt.size()) {
jumps += 1;
a += table.prefix_rank(pos, bwt[pos]);
Expand All @@ -182,7 +184,7 @@ auto benchmarkTable(std::string name, T const& bwt) -> Result {
}
{ // benchmark V5
xorshf96_reset();
uint64_t a{};
size_t a{};
for (size_t i{0}; i < 10'000'000; ++i) {
auto symb = xorshf96() % Table::Sigma;
auto row = xorshf96() % table.size();
Expand All @@ -195,9 +197,9 @@ auto benchmarkTable(std::string name, T const& bwt) -> Result {
result.benchV5 = watch.reset();
}
{ // benchmark V6
uint64_t jumps{1};
uint64_t pos = table.rank(0, bwt[0]);
uint64_t a{};
size_t jumps{1};
size_t pos = table.rank(0, bwt[0]);
size_t a{};
while (pos != 0 && jumps/2 < bwt.size()) {
jumps += 1;
auto [rs, prs] = table.all_ranks(pos);
Expand Down
Loading

0 comments on commit 8ca2e52

Please sign in to comment.