From 4fa45acb2f7d6e956a930de55bb8ddebebac04bc Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:49:28 +0100 Subject: [PATCH 01/18] Initial implementation of NCHG cartesian-product --- src/libnchg/include/nchg/hash.hpp | 40 ++ src/nchg/CMakeLists.txt | 1 + .../cartesian_product/cartesian_product.cpp | 425 ++++++++++++++++++ src/nchg/cli.cpp | 87 +++- src/nchg/include/nchg/tools/cli.hpp | 4 + src/nchg/include/nchg/tools/config.hpp | 12 + src/nchg/include/nchg/tools/tools.hpp | 1 + 7 files changed, 563 insertions(+), 7 deletions(-) create mode 100644 src/libnchg/include/nchg/hash.hpp create mode 100644 src/nchg/cartesian_product/cartesian_product.cpp diff --git a/src/libnchg/include/nchg/hash.hpp b/src/libnchg/include/nchg/hash.hpp new file mode 100644 index 0000000..9f0538d --- /dev/null +++ b/src/libnchg/include/nchg/hash.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: GPL-3.0 +// +// This library is free software: you can redistribute it and/or +// modify it under the terms of the GNU Public License as published +// by the Free Software Foundation; either version 3 of the License, +// or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +// You should have received a copy of the GNU Public License along +// with this library. If not, see +// . + +#pragma once + +#include +#include + +namespace nchg::internal { +// Adapted from: +// https://www.boost.org/doc/libs/1_37_0/doc/html/hash/reference.html#boost.hash_combine + +template +[[nodiscard]] inline std::size_t hash_combine(std::size_t seed, const T &v) { + // NOLINTNEXTLINE(*-avoid-magic-numbers) + seed ^= std::hash{}(v) + 0x9e3779b9 + (seed << 6U) + (seed >> 2U); + return seed; +} +template +[[nodiscard]] inline std::size_t hash_combine(std::size_t seed, const T &v, const Args &...args) { + // NOLINTNEXTLINE(*-avoid-magic-numbers) + seed ^= std::hash{}(v) + 0x9e3779b9 + (seed << 6U) + (seed >> 2U); + return hash_combine(seed, args...); +} +} // namespace nchg::internal diff --git a/src/nchg/CMakeLists.txt b/src/nchg/CMakeLists.txt index acb2823..16e3697 100644 --- a/src/nchg/CMakeLists.txt +++ b/src/nchg/CMakeLists.txt @@ -33,6 +33,7 @@ add_executable(NCHG) target_sources( NCHG PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/cartesian_product/cartesian_product.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/common/io.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/compute/compute.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/expected/expected.cpp" diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp new file mode 100644 index 0000000..925ffdf --- /dev/null +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -0,0 +1,425 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: GPL-3.0 +// +// This library is free software: you can redistribute it and/or +// modify it under the terms of the GNU Public License as published +// by the Free Software Foundation; either version 3 of the License, +// or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +// You should have received a copy of the GNU Public License along +// with this library. If not, see +// . + +// clang-format off +#include "nchg/suppress_warnings.hpp" +NCHG_DISABLE_WARNING_PUSH +NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS +#include +#include +NCHG_DISABLE_WARNING_POP +// clang-format on + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nchg/hash.hpp" +#include "nchg/tools/common.hpp" +#include "nchg/tools/config.hpp" +#include "nchg/tools/io.hpp" +#include "nchg/tools/tools.hpp" + +namespace nchg { + +using Domain = decltype(hictk::GenomicInterval::parse_bed(std::declval())); +using DomainPtr = std::shared_ptr; + +struct DomainHasher { + using is_transparent = void; + + [[nodiscard]] static std::size_t operator()(const Domain& domain) noexcept { + const auto& [chrom, start, end] = domain; + return internal::hash_combine(0, chrom, start, end); + } + + [[nodiscard]] static std::size_t operator()(const DomainPtr& ptr) noexcept { + if (!ptr) [[unlikely]] { + return std::hash>{}(ptr); + } + + return operator()(*ptr); + } +}; + +struct DomainEq { + using is_transparent = void; + + [[nodiscard]] static bool operator()(const Domain& domain1, const Domain& domain2) noexcept { + const auto& [chrom1, start1, end1] = domain1; + const auto& [chrom2, start2, end2] = domain2; + + return chrom1 == chrom2 && start1 == start2 && end1 == end2; + } + + [[nodiscard]] static bool operator()(const DomainPtr& ptr1, const Domain& domain2) noexcept { + if (!ptr1) [[unlikely]] { + return false; + } + + return operator()(*ptr1, domain2); + } + + [[nodiscard]] static bool operator()(const Domain& domain1, const DomainPtr& ptr2) noexcept { + return operator()(ptr2, domain1); + } + + [[nodiscard]] static bool operator()(const DomainPtr& ptr1, const DomainPtr& ptr2) noexcept { + if (!ptr1 || !ptr2) [[unlikely]] { + return ptr1 == ptr2; + } + + return operator()(*ptr1, *ptr2); + } +}; + +[[nodiscard]] static hictk::Reference parse_chrom_sizes(const std::filesystem::path& path) { + if (path.empty()) { + return {}; + } + + auto chroms = hictk::Reference::from_chrom_sizes(path); + if (!chroms.empty()) { + return chroms; + } + + throw std::runtime_error(fmt::format("unable to parse any chromosomes from file \"{}\"", path)); +} + +static void sort_domains(std::vector& domains, + const std::vector>& chroms) { + phmap::flat_hash_map chrom_ranks(chroms.size()); + std::ranges::transform( + chroms, std::inserter(chrom_ranks, chrom_ranks.end()), + [&](const auto& kv) { return std::make_pair(kv.first, chrom_ranks.size()); }); + + assert(!chroms.empty()); + std::ranges::sort(domains, [&](const Domain& lhs, const Domain& rhs) { + const auto& i1 = chrom_ranks.at(std::get<0>(lhs)); + const auto& i2 = chrom_ranks.at(std::get<0>(rhs)); + if (i1 != i2) { + return i1 < i2; + } + + const auto start1 = std::get<1>(lhs); + const auto start2 = std::get<1>(rhs); + if (start1 != start2) { + return start1 < start2; + } + + const auto end1 = std::get<2>(lhs); + const auto end2 = std::get<2>(rhs); + return end1 < end2; + }); +} + +static void sort_domains(std::vector& domains, const hictk::Reference& chroms) { + assert(!chroms.empty()); + std::ranges::sort(domains, [&](const Domain& lhs, const Domain& rhs) { + const auto& chrom1 = chroms.at(std::get<0>(lhs)); + const auto& chrom2 = chroms.at(std::get<0>(rhs)); + if (chrom1 != chrom2) { + return chrom1 < chrom2; + } + + const auto start1 = std::get<1>(lhs); + const auto start2 = std::get<1>(rhs); + if (start1 != start2) { + return start1 < start2; + } + + const auto end1 = std::get<2>(lhs); + const auto end2 = std::get<2>(rhs); + return end1 < end2; + }); +} + +static void detect_unordered_records( + std::string_view chrom_name, std::size_t line_number, + std::vector>& parsed_chromosomes) { + if (parsed_chromosomes.empty()) [[unlikely]] { + assert(line_number == 0); + parsed_chromosomes.emplace_back(std::string{chrom_name}, line_number); + return; + } + + if (parsed_chromosomes.back().first == chrom_name) [[likely]] { + return; + } + + const auto it = std::ranges::find_if(parsed_chromosomes, + [&](const auto& kv) { return kv.first == chrom_name; }); + + if (it == parsed_chromosomes.end()) [[likely]] { + parsed_chromosomes.emplace_back(std::string{chrom_name}, line_number); + return; + } + + std::vector chrom_names{}; + std::transform(it, parsed_chromosomes.end(), std::back_inserter(chrom_names), [](const auto& kv) { + return fmt::format("{} (first seen on line #{})", kv.first, kv.second); + }); + throw std::runtime_error( + fmt::format("domains are not sorted by their genomic coordinates: line #{} " + "references chromosome \"{}\", which was first encountered on line " + "\"{}\" and was followed by the chromosome(s) listed below:\n - {}", + line_number, it->first, it->second, fmt::join(chrom_names, "\n - "))); +} + +enum class ParseStatus : std::uint_fast8_t { PARSED, SKIPPED, DUPLICATE }; + +[[nodiscard]] static ParseStatus parse_record( + std::string_view line, std::size_t line_number, const hictk::Reference& reference, + std::vector>& parsed_chromosomes, + phmap::flat_hash_map& domains) { + auto record = hictk::GenomicInterval::parse_bed(line); + if (!reference.empty() && !reference.contains(std::get<0>(record))) { + SPDLOG_DEBUG("skipping line #{}", line_number); + return ParseStatus::SKIPPED; + } + + if (reference.empty()) { + const auto& chrom = std::get<0>(record); + detect_unordered_records(chrom, line_number, parsed_chromosomes); + } + + if (domains.contains(record)) [[unlikely]] { + SPDLOG_DEBUG("found duplicate record in line #{}", line_number); + return ParseStatus::DUPLICATE; + } + + domains.emplace(std::make_shared(std::move(record)), domains.size()); + return ParseStatus::PARSED; +} + +[[nodiscard]] static std::vector parse_domains(std::istream& stream, + const hictk::Reference& reference) { + // pair + std::vector> parsed_chromosomes{}; + phmap::flat_hash_map unique_domains; + + std::string line; + std::size_t duplicate_records{}; + std::size_t skipped_records{}; + std::size_t i = 0; + + try { + assert(stream.good()); + for (; std::getline(stream, line); ++i) { + if (line.empty()) [[unlikely]] { + continue; + } + const auto status = parse_record(truncate_bed3_record(line, '\t'), i, reference, + parsed_chromosomes, unique_domains); + switch (status) { + using enum ParseStatus; + case SKIPPED: { + ++skipped_records; + break; + } + case DUPLICATE: { + ++duplicate_records; + break; + } + case PARSED: { + break; + } + default: + hictk::unreachable_code(); // TODO fixme + } + } + } catch (const std::exception& e) { + if (!stream.eof()) { + throw std::runtime_error(fmt::format("failed to parse line {}: {}", i, e.what())); + } + } catch (...) { + throw std::runtime_error(fmt::format("failed to parse line {}: unknown error", i)); + } + + if (unique_domains.empty()) { + throw std::runtime_error("unable to parse any domains"); + } + + if (duplicate_records != 0) { + SPDLOG_WARN("parser discarded {} duplicate domains", duplicate_records); + } + + if (skipped_records != 0) { + assert(!reference.empty()); + SPDLOG_WARN( + "parsed skipped {} records because they referred to chromosomes not listed in the given " + "reference genome.", + skipped_records); + } + + std::vector domains(unique_domains.size()); + + for (auto& [domain, j] : unique_domains) { + domains[j] = std::move(*domain); + } + + SPDLOG_INFO("begin sorting domains..."); + if (!reference.empty()) { + sort_domains(domains, reference); + } else { + sort_domains(domains, parsed_chromosomes); + } + SPDLOG_INFO("done sorting!"); + + return domains; +} + +struct ChromIndex { + std::string chrom{}; + std::size_t start_offset{}; + std::size_t end_offset{}; +}; + +[[nodiscard]] std::vector index_chromosomes(const std::vector& domains) { + std::vector index{}; + + for (std::size_t i = 0; i < domains.size(); ++i) { + const auto& chrom = std::get<0>(domains[i]); + const auto i0 = i; + auto i1 = i0; + for (; i1 < domains.size(); ++i1) { + if (chrom != std::get<0>(domains[i1])) { + break; + } + } + index.emplace_back(chrom, i0, i1); + i = i1; + } + + return index; +} + +[[nodiscard]] static std::vector parse_domains(const std::filesystem::path& path, + const hictk::Reference& chroms) { + const auto read_from_stdin = path == "-" || path == "stdin"; + + std::ifstream fs{}; + fs.exceptions(fs.exceptions() | std::ios::badbit | std::ios::failbit); + if (read_from_stdin) { + SPDLOG_INFO("reading domains from stdin..."); + } else { + SPDLOG_INFO("reading domains from file \"{}\"...", path.string()); + fs.open(path); + } + + try { + auto domains = fs.is_open() ? parse_domains(fs, chroms) : parse_domains(std::cin, chroms); + + if (read_from_stdin) { + SPDLOG_INFO("successfully read {} domains from file stdin!", domains.size()); + } else { + SPDLOG_INFO("successfully read {} domains from file \"{}\"!", domains.size(), path.string()); + } + + return domains; + } catch (const std::exception& e) { + if (read_from_stdin) { + throw std::runtime_error(fmt::format("failed to parse domains from stdin: {}", e.what())); + } + throw std::runtime_error( + fmt::format("failed to parse domains from file \"{}\": {}", path.string(), e.what())); + + } catch (...) { + if (read_from_stdin) { + throw std::runtime_error(fmt::format("failed to parse domains from stdin: unknown error")); + } + throw std::runtime_error( + fmt::format("failed to parse domains from file \"{}\": unknown error", path.string())); + } +} + +[[nodiscard]] static std::size_t print_chunk(const std::vector& domains, std::size_t i0, + std::size_t i1, std::size_t j0, std::size_t j1) { + assert(i0 <= i1); + assert(j0 <= j1); + assert(j0 >= i0); + + std::size_t num_records = 0; + for (std::size_t i = i0; i < i1; ++i) { + const auto& [chrom1, start1, end1] = domains[i]; + for (std::size_t j = j0; j < j1; ++j) { + const auto& [chrom2, start2, end2] = domains[j]; + fmt::print(FMT_COMPILE("{}\t{}\t{}\t{}\t{}\t{}\n"), chrom1, start1, end1, chrom2, start2, + end2); + ++num_records; + } + } + + return num_records; +} + +int run_command(const CartesianProductConfig& c) { + assert(c.process_cis || c.process_trans); + + const auto t0 = std::chrono::steady_clock::now(); + + const auto domains = parse_domains(c.path_to_domains, parse_chrom_sizes(c.path_to_chrom_sizes)); + const auto index = index_chromosomes(domains); + + std::size_t records_processed = 0; + for (std::size_t i = 0; i < index.size(); ++i) { + const auto& [chrom1, i0, i1] = index[i]; + for (std::size_t j = i; j < index.size(); ++j) { + const auto& [chrom2, j0, j1] = index[j]; + + if (!c.process_cis && chrom1 == chrom2) { + SPDLOG_DEBUG("skipping domains for {}:{}...", chrom1, chrom2); + continue; + } + if (!c.process_trans && chrom1 != chrom2) { + SPDLOG_DEBUG("skipping domains for {}:{}...", chrom1, chrom2); + continue; + } + + SPDLOG_DEBUG("printing domains for {}:{}...", chrom1, chrom2); + records_processed += print_chunk(domains, i0, i1, j0, j1); + } + } + + const auto t1 = std::chrono::steady_clock::now(); + SPDLOG_INFO("processed {} records in {}!", records_processed, format_duration(t1 - t0)); + + return 0; +} + +} // namespace nchg diff --git a/src/nchg/cli.cpp b/src/nchg/cli.cpp index 48f7cc2..b1e70fc 100644 --- a/src/nchg/cli.cpp +++ b/src/nchg/cli.cpp @@ -59,7 +59,9 @@ auto Cli::parse_arguments() -> Config { _cli.parse(_argc, _argv); using enum subcommand; - if (_cli.get_subcommand("compute")->parsed()) { + if (_cli.get_subcommand("cartesian-product")->parsed()) { + _subcommand = cartesian_product; + } else if (_cli.get_subcommand("compute")->parsed()) { _subcommand = compute; } else if (_cli.get_subcommand("expected")->parsed()) { _subcommand = expected; @@ -103,6 +105,8 @@ int Cli::exit() const noexcept { return _exit_code; } std::string_view Cli::subcommand_to_str(subcommand s) noexcept { switch (s) { + case cartesian_product: + return "cartesian-product"; case compute: return "compute"; case expected: @@ -132,6 +136,7 @@ void Cli::make_cli() { _cli.set_version_flag("-V,--version", "0.0.2"); _cli.require_subcommand(1); + make_cartesian_product_subcommand(); make_compute_subcommand(); make_expected_subcommand(); make_filter_subcommand(); @@ -139,6 +144,61 @@ void Cli::make_cli() { make_view_subcommand(); } +void Cli::make_cartesian_product_subcommand() { + auto &sc = + *_cli.add_subcommand("cartesian-product", + "Compute the cartesian product of domains from a list in BED3 format.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = CartesianProductConfig{}; + }); + + _config = CartesianProductConfig{}; + auto &c = std::get(_config); + + // clang-format off + sc.add_option( + "domains", + c.path_to_domains, + "Path to a BED3+ file with the list of domains to be processed.\n" + "Domains should be sorted by chromosome when --chrom-sizes is not provided.\n" + "Pass \"-\" or \"stdin\" if the domains should be read from stdin.") + ->check(CLI::ExistingFile | CLI::IsMember{{"-", "stdin"}}) + ->required(); + + sc.add_option( + "-c,--chrom-sizes", + c.path_to_chrom_sizes, + "Path to .chrom.sizes file.\n" + "Chromosomes will be used to sort domains prior to processing.") + ->check(CLI::ExistingFile); + + sc.add_flag_function( + "--cis-only", + [&c](auto n) { if (n != 0) {c.process_trans = false;} }, + "Only output pairs of domains corresponding to regions interacting in cis.") + ->capture_default_str(); + + sc.add_flag_function( + "--trans-only", + [&c](auto n) { if (n != 0) {c.process_cis = false; }}, + "Only output pairs of domains corresponding to regions interacting in trans.") + ->capture_default_str(); + + sc.add_option( + "-v,--verbosity", + c.verbosity, + "Set verbosity of output to the console.") + ->check(CLI::Range(1, 4)) + ->capture_default_str(); + // clang-format on + + sc.get_option("--cis-only")->excludes("--trans-only"); + + _config = std::monostate{}; +} + void Cli::make_compute_subcommand() { auto &sc = *_cli.add_subcommand( @@ -633,6 +693,8 @@ void Cli::make_view_subcommand() { void Cli::validate_args() const { switch (get_subcommand()) { + case cartesian_product: + return validate_cartesian_product_subcommand(); // NOLINT case compute: return validate_compute_subcommand(); // NOLINT case expected: @@ -648,6 +710,8 @@ void Cli::validate_args() const { } } +void Cli::validate_cartesian_product_subcommand() const {} + void Cli::validate_compute_subcommand() const { const auto &c = std::get(_config); const auto &sc = *_cli.get_subcommand("compute"); @@ -740,6 +804,8 @@ void Cli::validate_view_subcommand() const {} void Cli::transform_args() { switch (get_subcommand()) { + case cartesian_product: + return transform_args_cartesian_product_subcommand(); // NOLINT case compute: return transform_args_compute_subcommand(); // NOLINT case expected: @@ -755,12 +821,8 @@ void Cli::transform_args() { } } -void Cli::transform_args_expected_subcommand() { - auto &c = std::get(_config); - if (c.chrom1 != "all" && c.chrom2 == "all") { - c.chrom2 = c.chrom1; - } - +void Cli::transform_args_cartesian_product_subcommand() { + auto &c = std::get(_config); // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity <= SPDLOG_LEVEL_CRITICAL); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; @@ -823,6 +885,17 @@ void Cli::transform_args_compute_subcommand() { c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; } +void Cli::transform_args_expected_subcommand() { + auto &c = std::get(_config); + if (c.chrom1 != "all" && c.chrom2 == "all") { + c.chrom2 = c.chrom1; + } + + // in spdlog, high numbers correspond to low log levels + assert(c.verbosity > 0 && c.verbosity <= SPDLOG_LEVEL_CRITICAL); + c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; +} + void Cli::transform_args_filter_subcommand() { auto &c = std::get(_config); // in spdlog, high numbers correspond to low log levels diff --git a/src/nchg/include/nchg/tools/cli.hpp b/src/nchg/include/nchg/tools/cli.hpp index ca79524..3086831 100644 --- a/src/nchg/include/nchg/tools/cli.hpp +++ b/src/nchg/include/nchg/tools/cli.hpp @@ -118,6 +118,7 @@ class Cli { public: enum subcommand : std::uint_fast8_t { help, + cartesian_product, compute, expected, filter, @@ -144,6 +145,7 @@ class Cli { subcommand _subcommand{subcommand::help}; mutable std::vector _warnings{}; + void make_cartesian_product_subcommand(); void make_compute_subcommand(); void make_expected_subcommand(); void make_filter_subcommand(); @@ -151,6 +153,7 @@ class Cli { void make_view_subcommand(); void make_cli(); + void validate_cartesian_product_subcommand() const; void validate_compute_subcommand() const; void validate_expected_subcommand() const; void validate_filter_subcommand() const; @@ -158,6 +161,7 @@ class Cli { void validate_view_subcommand() const; void validate_args() const; + void transform_args_cartesian_product_subcommand(); void transform_args_compute_subcommand(); void transform_args_expected_subcommand(); void transform_args_filter_subcommand(); diff --git a/src/nchg/include/nchg/tools/config.hpp b/src/nchg/include/nchg/tools/config.hpp index 7591d20..6153477 100644 --- a/src/nchg/include/nchg/tools/config.hpp +++ b/src/nchg/include/nchg/tools/config.hpp @@ -26,6 +26,17 @@ #include namespace nchg { + +struct CartesianProductConfig { + std::filesystem::path path_to_domains{}; + std::filesystem::path path_to_chrom_sizes{}; + + bool process_cis{true}; + bool process_trans{true}; + + std::uint8_t verbosity{3}; +}; + struct ComputePvalConfig { std::filesystem::path exec{}; std::filesystem::path path_to_hic{}; @@ -134,6 +145,7 @@ struct ViewConfig { // clang-format off using Config = std::variant< std::monostate, + CartesianProductConfig, ComputePvalConfig, ExpectedConfig, FilterConfig, diff --git a/src/nchg/include/nchg/tools/tools.hpp b/src/nchg/include/nchg/tools/tools.hpp index ea5cb6e..fbf4da2 100644 --- a/src/nchg/include/nchg/tools/tools.hpp +++ b/src/nchg/include/nchg/tools/tools.hpp @@ -22,6 +22,7 @@ namespace nchg { +[[nodiscard]] int run_command(const CartesianProductConfig& c); [[nodiscard]] int run_command(const ComputePvalConfig& c); [[nodiscard]] int run_command(const ExpectedConfig& c); [[nodiscard]] int run_command(const FilterConfig& c); From 601e3b4fb1274d0ed919e8f72b41cfa467df9542 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:38:49 +0100 Subject: [PATCH 02/18] Fix typo --- src/nchg/cartesian_product/cartesian_product.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index 925ffdf..16b22fc 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -346,7 +346,7 @@ struct ChromIndex { auto domains = fs.is_open() ? parse_domains(fs, chroms) : parse_domains(std::cin, chroms); if (read_from_stdin) { - SPDLOG_INFO("successfully read {} domains from file stdin!", domains.size()); + SPDLOG_INFO("successfully read {} domains from stdin!", domains.size()); } else { SPDLOG_INFO("successfully read {} domains from file \"{}\"!", domains.size(), path.string()); } From a1ac243b81eed08f2a4dba640f660281bff83249 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:43:00 +0100 Subject: [PATCH 03/18] Fix macOS builds --- src/nchg/cartesian_product/cartesian_product.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index 16b22fc..a9acd5a 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -322,7 +322,11 @@ struct ChromIndex { break; } } +#if defined(__apple_build_version__) && __apple_build_version__ < 16000000 + index.emplace_back(ChromIndex{chrom, i0, i1}); +#else index.emplace_back(chrom, i0, i1); +#endif i = i1; } From fb367ff5c80f3294d3ef794720f98ae52d8274fb Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:46:16 +0100 Subject: [PATCH 04/18] Fix typos --- src/nchg/cartesian_product/cartesian_product.cpp | 2 +- src/nchg/cli.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index a9acd5a..eddf1ac 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -282,7 +282,7 @@ enum class ParseStatus : std::uint_fast8_t { PARSED, SKIPPED, DUPLICATE }; if (skipped_records != 0) { assert(!reference.empty()); SPDLOG_WARN( - "parsed skipped {} records because they referred to chromosomes not listed in the given " + "parser skipped {} record(s) because they referred to chromosomes not listed in the given " "reference genome.", skipped_records); } diff --git a/src/nchg/cli.cpp b/src/nchg/cli.cpp index b1e70fc..a15da79 100644 --- a/src/nchg/cli.cpp +++ b/src/nchg/cli.cpp @@ -177,13 +177,12 @@ void Cli::make_cartesian_product_subcommand() { sc.add_flag_function( "--cis-only", [&c](auto n) { if (n != 0) {c.process_trans = false;} }, - "Only output pairs of domains corresponding to regions interacting in cis.") + "Only output pair of domains corresponding to regions interacting in cis.") ->capture_default_str(); - sc.add_flag_function( "--trans-only", [&c](auto n) { if (n != 0) {c.process_cis = false; }}, - "Only output pairs of domains corresponding to regions interacting in trans.") + "Only output pair of domains corresponding to regions interacting in trans.") ->capture_default_str(); sc.add_option( From bb9032c73a9553249aeea9057808d67f7d2c0051 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 11 Nov 2024 20:05:26 +0100 Subject: [PATCH 05/18] Add option to support processing domains for a single chromosome pair --- .../cartesian_product/cartesian_product.cpp | 26 +++++++++++++++-- src/nchg/cli.cpp | 28 +++++++++++++++++-- src/nchg/include/nchg/tools/config.hpp | 3 ++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index eddf1ac..25879a8 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -108,7 +108,21 @@ struct DomainEq { } }; -[[nodiscard]] static hictk::Reference parse_chrom_sizes(const std::filesystem::path& path) { +[[nodiscard]] static hictk::Reference parse_chrom_sizes(const std::filesystem::path& path, + const std::optional& chrom1, + const std::optional& chrom2) { + if (chrom1.has_value()) { + assert(chrom2.has_value()); + + const std::vector names{*chrom1, *chrom2}; + const std::vector sizes(2, std::numeric_limits::max()); + if (chrom1 == chrom2) { + return {names.begin(), names.begin() + 1, sizes.begin()}; + } + + return {names.begin(), names.end(), sizes.begin()}; + } + if (path.empty()) { return {}; } @@ -397,14 +411,22 @@ int run_command(const CartesianProductConfig& c) { const auto t0 = std::chrono::steady_clock::now(); - const auto domains = parse_domains(c.path_to_domains, parse_chrom_sizes(c.path_to_chrom_sizes)); + const auto domains = parse_domains(c.path_to_domains, + parse_chrom_sizes(c.path_to_chrom_sizes, c.chrom1, c.chrom2)); + const auto index = index_chromosomes(domains); std::size_t records_processed = 0; for (std::size_t i = 0; i < index.size(); ++i) { const auto& [chrom1, i0, i1] = index[i]; + if (c.chrom1.has_value() && *c.chrom1 != chrom1) { + continue; + } for (std::size_t j = i; j < index.size(); ++j) { const auto& [chrom2, j0, j1] = index[j]; + if (c.chrom2.has_value() && *c.chrom2 != chrom2) { + continue; + } if (!c.process_cis && chrom1 == chrom2) { SPDLOG_DEBUG("skipping domains for {}:{}...", chrom1, chrom2); diff --git a/src/nchg/cli.cpp b/src/nchg/cli.cpp index a15da79..3ba3fa8 100644 --- a/src/nchg/cli.cpp +++ b/src/nchg/cli.cpp @@ -166,14 +166,22 @@ void Cli::make_cartesian_product_subcommand() { "Pass \"-\" or \"stdin\" if the domains should be read from stdin.") ->check(CLI::ExistingFile | CLI::IsMember{{"-", "stdin"}}) ->required(); - sc.add_option( "-c,--chrom-sizes", c.path_to_chrom_sizes, "Path to .chrom.sizes file.\n" "Chromosomes will be used to sort domains prior to processing.") ->check(CLI::ExistingFile); - + sc.add_option( + "--chrom1", + c.chrom1, + "Name of the first chromosome used to filter domains.") + ->capture_default_str(); + sc.add_option( + "--chrom2", + c.chrom2, + "Name of the second chromosome used to filter domains.") + ->capture_default_str(); sc.add_flag_function( "--cis-only", [&c](auto n) { if (n != 0) {c.process_trans = false;} }, @@ -184,7 +192,6 @@ void Cli::make_cartesian_product_subcommand() { [&c](auto n) { if (n != 0) {c.process_cis = false; }}, "Only output pair of domains corresponding to regions interacting in trans.") ->capture_default_str(); - sc.add_option( "-v,--verbosity", c.verbosity, @@ -193,8 +200,16 @@ void Cli::make_cartesian_product_subcommand() { ->capture_default_str(); // clang-format on + sc.get_option("--chrom-sizes")->excludes("--chrom1"); + sc.get_option("--chrom-sizes")->excludes("--chrom2"); + + sc.get_option("--cis-only")->excludes("--chrom1"); + sc.get_option("--cis-only")->excludes("--chrom2"); sc.get_option("--cis-only")->excludes("--trans-only"); + sc.get_option("--trans-only")->excludes("--chrom1"); + sc.get_option("--trans-only")->excludes("--chrom2"); + _config = std::monostate{}; } @@ -822,6 +837,13 @@ void Cli::transform_args() { void Cli::transform_args_cartesian_product_subcommand() { auto &c = std::get(_config); + + if (c.chrom1.has_value()) { + if (!c.chrom2.has_value()) { + c.chrom2 = c.chrom1; + } + } + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity <= SPDLOG_LEVEL_CRITICAL); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; diff --git a/src/nchg/include/nchg/tools/config.hpp b/src/nchg/include/nchg/tools/config.hpp index 6153477..c294739 100644 --- a/src/nchg/include/nchg/tools/config.hpp +++ b/src/nchg/include/nchg/tools/config.hpp @@ -31,6 +31,9 @@ struct CartesianProductConfig { std::filesystem::path path_to_domains{}; std::filesystem::path path_to_chrom_sizes{}; + std::optional chrom1{}; + std::optional chrom2{}; + bool process_cis{true}; bool process_trans{true}; From 7424f75207a124bed2635e0157110903fdb11c13 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:07:38 +0100 Subject: [PATCH 06/18] Properly deal with \r\n line terminations on UNIX systems --- src/nchg/cartesian_product/cartesian_product.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index 25879a8..3618ec5 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -258,6 +258,9 @@ enum class ParseStatus : std::uint_fast8_t { PARSED, SKIPPED, DUPLICATE }; if (line.empty()) [[unlikely]] { continue; } + if (line.back() == '\r') [[unlike]] { + line.resize(line.size() - 1); + } const auto status = parse_record(truncate_bed3_record(line, '\t'), i, reference, parsed_chromosomes, unique_domains); switch (status) { From f2557a0c69f0985f2f8847232fe0bc1e89f02d1f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:43:40 +0100 Subject: [PATCH 07/18] Bugfix --- src/nchg/cartesian_product/cartesian_product.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index 3618ec5..017c35e 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -258,7 +258,7 @@ enum class ParseStatus : std::uint_fast8_t { PARSED, SKIPPED, DUPLICATE }; if (line.empty()) [[unlikely]] { continue; } - if (line.back() == '\r') [[unlike]] { + if (line.back() == '\r') [[unlikely]] { line.resize(line.size() - 1); } const auto status = parse_record(truncate_bed3_record(line, '\t'), i, reference, @@ -398,7 +398,7 @@ struct ChromIndex { std::size_t num_records = 0; for (std::size_t i = i0; i < i1; ++i) { const auto& [chrom1, start1, end1] = domains[i]; - for (std::size_t j = j0; j < j1; ++j) { + for (std::size_t j = std::max(j0, i); j < j1; ++j) { const auto& [chrom2, start2, end2] = domains[j]; fmt::print(FMT_COMPILE("{}\t{}\t{}\t{}\t{}\t{}\n"), chrom1, start1, end1, chrom2, start2, end2); From ec954697dc787aa9735ed091e163aee32050a1f5 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:46:28 +0100 Subject: [PATCH 08/18] Update NCHG compute to accept a list of 2D domains in BEDPE format --- .../cartesian_product/cartesian_product.cpp | 2 +- src/nchg/cli.cpp | 5 +- src/nchg/common/io.cpp | 16 +- src/nchg/common/io_impl.hpp | 17 + src/nchg/compute/compute.cpp | 519 ++++++++++++++---- src/nchg/include/nchg/tools/io.hpp | 3 +- 6 files changed, 438 insertions(+), 124 deletions(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index 017c35e..b7d7d19 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -261,7 +261,7 @@ enum class ParseStatus : std::uint_fast8_t { PARSED, SKIPPED, DUPLICATE }; if (line.back() == '\r') [[unlikely]] { line.resize(line.size() - 1); } - const auto status = parse_record(truncate_bed3_record(line, '\t'), i, reference, + const auto status = parse_record(truncate_record<3>(line, '\t'), i, reference, parsed_chromosomes, unique_domains); switch (status) { using enum ParseStatus; diff --git a/src/nchg/cli.cpp b/src/nchg/cli.cpp index 3ba3fa8..b38fedb 100644 --- a/src/nchg/cli.cpp +++ b/src/nchg/cli.cpp @@ -265,8 +265,9 @@ void Cli::make_compute_subcommand() { sc.add_option( "--domains", c.path_to_domains, - "Path to a BED file with a list of domains whose pair-wise interactions should\n" - "be assessed for significance.") + "Path to a BEDPE file with the list of 2D domains to be processed.\n" + "For each domain, NCHG will first fetch and aggregate interactions overlapping with the given coordinates.\n" + "Then, NCHG will asses the statistical significance of the observed interactions after aggregation.") ->check(CLI::ExistingFile); sc.add_flag_function( "--cis-only", diff --git a/src/nchg/common/io.cpp b/src/nchg/common/io.cpp index 592afb4..012ed75 100644 --- a/src/nchg/common/io.cpp +++ b/src/nchg/common/io.cpp @@ -350,20 +350,6 @@ void RecordBatchBuilder::write(parquet::arrow::FileWriter &writer) { reset(); } -std::string_view truncate_bed3_record(std::string_view record, char sep) { - const auto pos1 = record.find(sep); - if (pos1 == std::string_view::npos) { - throw std::runtime_error("invalid bed record, expected 3 tokens, found 1"); - } - const auto pos2 = record.find('\t', pos1 + 1); - if (pos2 == std::string_view::npos) { - throw std::runtime_error("invalid bed record, expected 3 tokens, found 2"); - } - const auto pos3 = record.find('\t', pos2 + 1); - - return record.substr(0, pos3); -} - phmap::flat_hash_map> parse_bin_mask( const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path) { if (path.empty()) { @@ -391,7 +377,7 @@ phmap::flat_hash_map> parse_bin_mask( } try { - const auto record = truncate_bed3_record(buffer); + const auto record = truncate_record<3>(buffer); auto domain = hictk::GenomicInterval::parse_bed(chroms, record); const auto num_bins = (domain.chrom().size() + bin_size - 1) / bin_size; diff --git a/src/nchg/common/io_impl.hpp b/src/nchg/common/io_impl.hpp index 4431594..ca36f20 100644 --- a/src/nchg/common/io_impl.hpp +++ b/src/nchg/common/io_impl.hpp @@ -247,4 +247,21 @@ inline std::unique_ptr init_parquet_file_writer( return result.MoveValueUnsafe(); } +template +std::string_view truncate_record(std::string_view record, char sep) { + static_assert(NTOKS != 0); + + std::size_t offset{}; + for (std::size_t i = 0; i < NTOKS; ++i) { + const auto pos = record.find(sep, offset + 1); + if (pos == std::string_view::npos && i != NTOKS - 1) [[unlikely]] { + throw std::runtime_error( + fmt::format("invalid record, expected {} tokens, found {}", NTOKS, i)); + } + offset = pos; + } + + return record.substr(0, offset); +} + } // namespace nchg diff --git a/src/nchg/compute/compute.cpp b/src/nchg/compute/compute.cpp index 918a157..c55cef0 100644 --- a/src/nchg/compute/compute.cpp +++ b/src/nchg/compute/compute.cpp @@ -64,56 +64,314 @@ NCHG_DISABLE_WARNING_POP namespace nchg { -[[nodiscard]] static std::vector parse_domains( - const hictk::Reference &chroms, const std::filesystem::path &path, std::string_view chrom1, - std::string_view chrom2) { - SPDLOG_INFO("[{}:{}] reading domains from {}...", chrom1, chrom2, path); - std::vector domains{}; - std::string buffer{}; +using ChromPair = std::pair; +using BG2Domain = std::pair; - std::ifstream fs{}; - fs.exceptions(fs.exceptions() | std::ios::badbit | std::ios::failbit); +using BG2DomainSet = phmap::flat_hash_set; +using ChromosomePairs = phmap::btree_set; - try { - fs.open(path); +class BG2Domains { + std::vector _domains; + std::unique_ptr _mtx{}; - for (std::size_t i = 1; std::getline(fs, buffer); ++i) { - if (buffer.empty()) { - continue; - } + [[nodiscard]] static std::vector parse_domains( + const hictk::Reference &chroms, const std::filesystem::path &path, bool keep_cis, + bool keep_trans, const std::optional &chrom1, + const std::optional &chrom2) { + SPDLOG_INFO("reading domains from file \"{}\"...", path.string()); + const auto t0 = std::chrono::steady_clock::now(); - if (buffer.back() == '\r') { - buffer.resize(buffer.size() - 1); - } + std::ifstream ifs{}; + ifs.exceptions(ifs.exceptions() | std::ios::badbit | std::ios::failbit); + + std::size_t i = 1; + std::string buffer; + std::size_t domains_dropped{}; + std::size_t duplicate_domains{}; + std::size_t dropped_cis{}; + std::size_t dropped_trans{}; + BG2DomainSet domain_set{}; + + if (chrom1.has_value()) { + assert(chrom2.has_value()); + assert(chroms.contains(*chrom1)); + assert(chroms.contains(*chrom2)); + } + + try { + ifs.open(path); - try { - const auto record = truncate_bed3_record(buffer); - auto domain = hictk::GenomicInterval::parse_bed(chroms, record); + for (; std::getline(ifs, buffer); ++i) { + if (buffer.empty()) { + continue; + } + + if (buffer.back() == '\r') { + buffer.resize(buffer.size() - 1); + } + + const auto record = truncate_record<6>(buffer); + const auto domain1 = truncate_record<3>(record); + const auto domain2 = truncate_record<3>(record.substr(domain1.size() + 1)); + + const auto chrom1_parsed = truncate_record<1>(domain1); + const auto chrom2_parsed = truncate_record<1>(domain2); + + if (!chrom1.has_value()) { + assert(!chrom2.has_value()); + if (!chroms.contains(chrom1_parsed) || !chroms.contains(chrom2_parsed)) { + ++domains_dropped; + continue; + } - if (chrom1 != "all") { - assert(chrom2 != "all"); - if (domain.chrom().name() != chrom1 && domain.chrom().name() != chrom2) { + if (!keep_cis && chrom1_parsed == chrom2_parsed) { + ++dropped_cis; continue; } + + if (!keep_trans && chrom1_parsed != chrom2_parsed) { + ++dropped_trans; + continue; + } + } + + if ((chrom1.has_value() && *chrom1 != chrom1_parsed) || + (chrom2.has_value() && *chrom2 != chrom2_parsed)) { + ++domains_dropped; + continue; + } + + auto domain = std::make_pair(hictk::GenomicInterval::parse_bed(chroms, domain1), + hictk::GenomicInterval::parse_bed(chroms, domain2)); + + if (domain.first > domain.second) { + throw std::runtime_error( + fmt::format("domains cannot overlap with the lower triangular matrix: offending " + "domain {:ucsc}; {:ucsc}", + domain.first, domain.second)); } - domains.emplace_back(std::move(domain)); - } catch (const std::exception &e) { + const auto &[_, inserted] = domain_set.emplace(std::move(domain)); + duplicate_domains += static_cast(!inserted); + } + } catch (const std::exception &e) { + if (!ifs.eof()) { throw std::runtime_error( fmt::format("found an invalid record at line {} of file {}: {}", i, path, e.what())); } + } catch (...) { + throw std::runtime_error( + fmt::format("found an invalid record at line {} of file {}: unknown error", i, path)); + } + + if (duplicate_domains != 0) { + SPDLOG_WARN("found {} duplicate domain(s)", domains_dropped); + } + if (domains_dropped != 0) { + if (chrom1.has_value()) { + assert(chrom2.has_value()); + SPDLOG_WARN( + "[{}:{}]: {} domain(s) were dropped because they did not map to the selected " + "chromosomes", + chrom1->name(), chrom2->name(), domains_dropped); + } else { + SPDLOG_WARN("{} domain(s) were dropped because they did not map to any known chromosome", + domains_dropped); + } + } + if (dropped_cis != 0) { + SPDLOG_WARN( + "{} domain(s) were dropped because they overlapped with the cis area of the interaction " + "map", + dropped_cis); + } + if (dropped_trans != 0) { + SPDLOG_WARN( + "{} domain(s) were dropped because they overlapped with the trans area of the " + "interaction map", + dropped_trans); } - } catch (const std::exception &) { - if (!fs.eof()) { - throw; + if (domain_set.empty()) { + throw std::runtime_error( + fmt::format("unable to parse any domain from file \"{}\"", path.string())); } + + std::vector domains(domain_set.size()); + std::ranges::move(domain_set, domains.begin()); + { + BG2DomainSet tmp{}; + std::swap(domain_set, tmp); + } + + // We want to domains to be sorted in a tiled fashion: + // e.g. all chr1:chr1 domains should precede all chr1:chr2 domains. + // Within tiles, domains should be sorted by their genomic coordinates + std::ranges::sort(domains, [](const auto &domain1, const auto &domain2) { + const auto &c1 = domain1.first.chrom(); + const auto &c2 = domain2.first.chrom(); + const auto &c3 = domain1.second.chrom(); + const auto &c4 = domain2.second.chrom(); + + if (c1 != c2) { + return c1 < c2; + } + if (c3 != c4) { + return c3 < c4; + } + + auto pos1 = domain1.first.start(); + auto pos2 = domain2.first.start(); + + if (pos1 != pos2) { + return pos1 < pos2; + } + + auto pos3 = domain1.second.start(); + auto pos4 = domain2.second.start(); + + if (pos3 != pos4) { + return pos3 < pos4; + } + + pos1 = domain1.first.end(); + pos2 = domain2.first.end(); + + if (pos1 != pos2) { + return pos1 < pos2; + } + + pos3 = domain1.second.end(); + pos4 = domain2.second.end(); + + return pos3 < pos4; + }); + + const auto t1 = std::chrono::steady_clock::now(); + SPDLOG_INFO("read {} domains from \"{}\" in {}", domains.size(), path.string(), + format_duration(t1 - t0)); + return domains; + } + [[nodiscard]] auto select_domains(const hictk::Chromosome &chrom1, + const hictk::Chromosome &chrom2) noexcept { + SPDLOG_DEBUG("[{}:{}]: selecting domains...", chrom1.name(), chrom2.name()); + + // The positions here do not matter: see implementation of the comparison operator below + const auto query = + std::make_pair(hictk::GenomicInterval{chrom1, 0, 1}, hictk::GenomicInterval{chrom2, 0, 1}); + return std::equal_range(_domains.begin(), _domains.end(), query, + [&](const auto &domain1, const auto &domain2) { + const auto &c1 = domain1.first.chrom(); + const auto &c2 = domain2.first.chrom(); + const auto &c3 = domain1.second.chrom(); + const auto &c4 = domain2.second.chrom(); + + if (c1 == c2) { + return c3 < c4; + } + return c1 < c2; + }); } - std::ranges::sort(domains); - SPDLOG_INFO("[{}:{}] read {} domains from {}...", chrom1, chrom2, domains.size(), path); - return domains; -} + public: + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; + + BG2Domains() : _mtx(std::make_unique()) {} + BG2Domains(const hictk::Reference &chroms, const std::filesystem::path &path, bool keep_cis, + bool keep_trans, const std::optional &chrom1 = {}, + const std::optional &chrom2 = {}) + : _domains(parse_domains(chroms, path, keep_cis, keep_trans, chrom1, chrom2)), + _mtx(std::make_unique()) {} + + [[nodiscard]] iterator begin() noexcept { return _domains.begin(); } + [[nodiscard]] iterator end() noexcept { return _domains.end(); } + + [[nodiscard]] const_iterator begin() const noexcept { return _domains.begin(); } + [[nodiscard]] const_iterator end() const noexcept { return _domains.end(); } + + [[nodiscard]] const_iterator cbegin() const noexcept { return _domains.cbegin(); } + [[nodiscard]] const_iterator cend() const noexcept { return _domains.cend(); } + + [[nodiscard]] std::vector extract(const hictk::Chromosome &chrom1, + const hictk::Chromosome &chrom2) { + assert(_mtx); + SPDLOG_DEBUG("[{}:{}] extracting domains...", chrom1.name(), chrom2.name()); + + [[maybe_unused]] const auto lck = std::scoped_lock(*_mtx); + auto [first, last] = select_domains(chrom1, chrom2); + std::vector domains(first, last); + _domains.erase(first, last); + _domains.shrink_to_fit(); + SPDLOG_DEBUG("[{}:{}] extracted {} domains!", chrom1.name(), chrom2.name(), domains.size()); + return domains; + } + + [[nodiscard]] std::filesystem::path extact_and_write_to_file( + const std::filesystem::path &dest_dir, const hictk::Chromosome &chrom1, + const hictk::Chromosome &chrom2, bool force) { + const auto dest = dest_dir / fmt::format("domains.{}.{}.bedpe", chrom1.name(), chrom2.name()); + SPDLOG_DEBUG("[{}:{}] writing domains to temporary file \"{}\"...", chrom1.name(), + chrom2.name(), dest.string()); + + const auto t0 = std::chrono::steady_clock::now(); + + if (!force && std::filesystem::exists(dest)) { + throw std::runtime_error( + fmt::format("refusing to overwrite existing temporary file: \"{}\". " + "Pass --force to overwrite.", + dest)); + } + + std::filesystem::remove(dest); // NOLINT; + + [[maybe_unused]] std::size_t domains_processed{}; + std::ofstream fs{}; + fs.exceptions(fs.exceptions() | std::ios::badbit | std::ios::failbit); + + try { + std::filesystem::create_directories(dest_dir); // NOLINT +#ifdef __cpp_lib_ios_noreplace + fs.open(dest, std::ios::out | std::ios::trunc | std::ios::noreplace); +#else + fs.open(dest, std::ios::out | std::ios::trunc); +#endif + + const auto selected_domains = extract(chrom1, chrom2); + if (selected_domains.empty()) { + SPDLOG_WARN("[{}:{}]: no domains were selected!", chrom1.name(), chrom2.name()); + } else if (selected_domains.size() == 1) { + SPDLOG_DEBUG("[{}:{}] selected a single domain ({:ucsc}; {:ucsc})...", chrom1.name(), + chrom2.name(), selected_domains.front().first, + selected_domains.front().second); + } else { + SPDLOG_DEBUG("[{}:{}] selected {} domains ({:ucsc}; {:ucsc} ... {:ucsc}; {:ucsc})...", + chrom1.name(), chrom2.name(), selected_domains.size(), + selected_domains.front().first, selected_domains.front().second, + selected_domains.back().first, selected_domains.back().second); + } + for (const auto &domain : selected_domains) { + fmt::print(fs, "{:bed}\t{:bed}\n", domain.first, domain.second); + ++domains_processed; + } + + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format("failed to write domains for {}:{} to temporary file \"{}\": {}", + chrom1.name(), chrom2.name(), dest.string(), e.what())); + } catch (...) { + throw std::runtime_error( + fmt::format("failed to write domains for {}:{} to temporary file \"{}\": unknown error", + chrom1.name(), chrom2.name(), dest.string())); + } + + const auto t1 = std::chrono::steady_clock::now(); + + SPDLOG_DEBUG("[{}:{}] written {} domains to \"{}\" in {}", chrom1.name(), chrom2.name(), + domains_processed, dest.string(), format_duration(t1 - t0)); + return dest; + } +}; [[nodiscard]] static NCHG init_nchg(const std::shared_ptr &f, const std::optional &expected_values, @@ -187,7 +445,7 @@ static void write_chrom_sizes_to_file(const hictk::Reference &chroms, } [[nodiscard]] static std::size_t process_domains( - const std::shared_ptr &f, + const std::shared_ptr &f, BG2Domains &domains, const std::optional &expected_values, const ComputePvalConfig &c) { assert(std::filesystem::exists(c.path_to_domains)); assert(c.chrom1.has_value()); @@ -200,9 +458,9 @@ static void write_chrom_sizes_to_file(const hictk::Reference &chroms, const auto writer = init_parquet_file_writer( f->chromosomes(), c.output_path, c.force, c.compression_method, c.compression_lvl, c.threads); - const auto domains = parse_domains(f->chromosomes(), c.path_to_domains, *c.chrom1, *c.chrom2); - - if (domains.empty()) { + const auto selected_domains = + domains.extract(f->chromosomes().at(*c.chrom1), f->chromosomes().at(*c.chrom2)); + if (selected_domains.empty()) { return 0; } @@ -212,27 +470,17 @@ static void write_chrom_sizes_to_file(const hictk::Reference &chroms, RecordBatchBuilder builder(f->bins().chromosomes()); std::size_t num_records = 0; - for (std::size_t i = 0; i < domains.size(); ++i) { - for (std::size_t j = i; j < domains.size(); ++j) { - const auto &d1 = domains[i]; - const auto &d2 = domains[j]; + for (const auto &domain : selected_domains) { + const auto s = nchg.compute(domain.first, domain.second, c.bad_bin_fraction); - if (c.chrom1.has_value() && (d1.chrom() != *c.chrom1 || d2.chrom() != *c.chrom2)) { - continue; - } - - const auto s = nchg.compute(d1, d2, c.bad_bin_fraction); - - if (builder.size() == batch_size) [[unlikely]] { - builder.write(*writer); - } - - if (std::isfinite(s.odds_ratio) && s.omega != 0) [[likely]] { - builder.append(s); - } + if (builder.size() == batch_size) [[unlikely]] { + builder.write(*writer); + } - ++num_records; + if (std::isfinite(s.odds_ratio) && s.omega != 0) [[likely]] { + builder.append(s); } + ++num_records; } if (builder.size() != 0) { @@ -288,43 +536,72 @@ static void write_chrom_sizes_to_file(const hictk::Reference &chroms, } [[nodiscard]] static std::size_t run_nchg_compute_worker( - const ComputePvalConfig &c, const std::optional &expected_values = {}) { + const ComputePvalConfig &c, std::optional &domains, + const std::optional &expected_values = {}) { assert(c.chrom1.has_value()); assert(c.chrom2.has_value()); const auto f = std::make_shared(c.path_to_hic.string(), c.resolution); + if (domains.has_value()) { + return process_domains(f, *domains, expected_values, c); + } + if (!c.path_to_domains.empty()) { - return process_domains(f, expected_values, c); + const auto chrom1 = std::make_optional(f->chromosomes().at(*c.chrom1)); + const auto chrom2 = std::make_optional(f->chromosomes().at(*c.chrom2)); + + BG2Domains domains_(f->chromosomes(), c.path_to_domains, true, true, chrom1, chrom2); + return process_domains(f, domains_, expected_values, c); } return process_one_chromosome_pair(f, expected_values, c); } -using ChromosomePairs = std::vector>; - -[[nodiscard]] static ChromosomePairs init_cis_chromosomes(const hictk::Reference &chroms) { +[[nodiscard]] static ChromosomePairs init_cis_chromosomes( + const hictk::Reference &chroms, const std::optional &domains) { ChromosomePairs buffer{}; - for (const auto &chrom : chroms) { - if (chrom.is_all()) [[unlikely]] { - continue; + if (domains.has_value()) { + for (const auto &[domain1, domain2] : *domains) { + assert(chroms.contains(domain1.chrom().name())); + assert(chroms.contains(domain2.chrom().name())); + + if (domain1.chrom() == domain2.chrom()) { + buffer.emplace(domain1.chrom(), domain2.chrom()); + } + } + } else { + for (const auto &chrom : chroms) { + if (chrom.is_all()) [[unlikely]] { + continue; + } + buffer.emplace(chrom, chrom); } - buffer.emplace_back(chrom, chrom); } return buffer; } -[[nodiscard]] static ChromosomePairs init_trans_chromosomes(const hictk::Reference &chroms) { +[[nodiscard]] static ChromosomePairs init_trans_chromosomes( + const hictk::Reference &chroms, const std::optional &domains) { ChromosomePairs buffer{}; - for (const auto &chrom1 : chroms) { - if (chrom1.is_all()) [[unlikely]] { - continue; + if (domains.has_value()) { + for (const auto &[domain1, domain2] : *domains) { + assert(chroms.contains(domain1.chrom().name())); + assert(chroms.contains(domain2.chrom().name())); + + if (domain1.chrom() != domain2.chrom()) { + buffer.emplace(domain1.chrom(), domain2.chrom()); + } } - for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < chroms.size(); ++chrom2_id) { - buffer.emplace_back(chrom1, chroms.at(chrom2_id)); + } else { + for (const auto &chrom : chroms) { + if (chrom.is_all()) [[unlikely]] { + continue; + } + buffer.emplace(chrom, chrom); } } return buffer; @@ -430,6 +707,7 @@ using ChromosomePairs = std::vector &domains, const ComputePvalConfig &config, bool trans_expected_values_avail, std::atomic &early_return) { @@ -438,12 +716,22 @@ using ChromosomePairs = std::vectorextact_and_write_to_file(child_config.tmpdir, chrom1, chrom2, + child_config.force); + child_config.path_to_domains = domain_file; + } else { + assert(child_config.path_to_domains.empty()); + } + if (!trans_expected_values_avail && chrom1 != chrom2) { child_config.path_to_expected_values.clear(); } @@ -451,19 +739,25 @@ using ChromosomePairs = std::vector fp; PARQUET_ASSIGN_OR_THROW(fp, arrow::io::ReadableFile::Open(child_config.output_path)); const auto records_processed = parquet::StreamReader{parquet::ParquetFileReader::Open(fp)}.num_rows(); - SPDLOG_INFO("done processing {}:{} ({} records)!", chrom1.name(), chrom2.name(), - records_processed); + const auto t1 = std::chrono::steady_clock::now(); + SPDLOG_INFO("[{}:{}]: processed {} records in {}", chrom1.name(), chrom2.name(), + records_processed, format_duration(t1 - t0)); return static_cast(records_processed); } catch (const std::exception &e) { @@ -508,22 +802,24 @@ using ChromosomePairs = std::vector> &chrom_pairs, - const std::optional &expected_values, const ComputePvalConfig &c) { +static std::size_t process_queries_mt(BS::thread_pool &tpool, const ChromosomePairs &chrom_pairs, + std::optional &domains, + const std::optional &expected_values, + const ComputePvalConfig &c) { std::atomic early_return{false}; const auto user_provided_expected_values = !c.path_to_expected_values.empty(); const auto base_config = init_base_config(c, expected_values); BS::multi_future workers(chrom_pairs.size()); + auto it = chrom_pairs.begin(); for (std::size_t i = 0; i < workers.size(); ++i) { - workers[i] = tpool.submit_task([&, i] { - const auto &[chrom1, chrom2] = chrom_pairs[i]; + workers[i] = tpool.submit_task([&, i, chrom_pair = *it++] { + const auto &[chrom1, chrom2] = chrom_pair; SPDLOG_DEBUG("submitting task {}/{} ({}:{})...", i + 1, workers.size(), chrom1.name(), chrom2.name()); - return worker_fx(chrom1, chrom2, base_config, user_provided_expected_values, early_return); + return worker_fx(chrom1, chrom2, domains, base_config, user_provided_expected_values, + early_return); }); } @@ -535,9 +831,10 @@ static std::size_t process_queries_mt( return num_records; } -static std::size_t process_queries_st( - const std::vector> &chrom_pairs, - const std::optional &expected_values, const ComputePvalConfig &c) { +static std::size_t process_queries_st(const ChromosomePairs &chrom_pairs, + std::optional &domains, + const std::optional &expected_values, + const ComputePvalConfig &c) { assert(!c.output_prefix.empty()); std::size_t tot_num_records = 0; for (const auto &[chrom1, chrom2] : chrom_pairs) { @@ -557,8 +854,8 @@ static std::size_t process_queries_st( assert(expected_values.has_value()); } const auto num_records = config.chrom1 == config.chrom2 - ? run_nchg_compute_worker(config, expected_values) - : run_nchg_compute_worker(config); + ? run_nchg_compute_worker(config, domains, expected_values) + : run_nchg_compute_worker(config, domains); tot_num_records += num_records; const auto t1 = std::chrono::steady_clock::now(); @@ -600,9 +897,10 @@ static std::optional init_cis_expected_values(const ComputePvalC bin_mask)}; } -static std::size_t process_queries( - const std::vector> &chrom_pairs, - const std::optional &expected_values, const ComputePvalConfig &c) { +static std::size_t process_queries(const ChromosomePairs &chrom_pairs, + std::optional &domains, + const std::optional &expected_values, + const ComputePvalConfig &c) { write_chrom_sizes_to_file(hictk::File(c.path_to_hic, c.resolution).chromosomes(), generate_chrom_sizes_file_name(c.output_prefix), c.force); @@ -611,19 +909,19 @@ static std::size_t process_queries( if (c.threads > chrom_pairs.size()) { num_threads = conditional_static_cast(chrom_pairs.size()); SPDLOG_WARN( - "number of threads specified through --threads exceeds the number of chromosome pairs to " + "number of threads specified through --threads exceeds the number of chromosome pairs " + "to " "be processed: limiting concurrency to {} thread(s)", num_threads); } BS::thread_pool tpool(num_threads); - return process_queries_mt(tpool, chrom_pairs, expected_values, c); + return process_queries_mt(tpool, chrom_pairs, domains, expected_values, c); } - return process_queries_st(chrom_pairs, expected_values, c); + return process_queries_st(chrom_pairs, domains, expected_values, c); } -static void process_file_collisions( - const std::filesystem::path &output_prefix, - const std::vector> &chrom_pairs, bool force) { +static void process_file_collisions(const std::filesystem::path &output_prefix, + const ChromosomePairs &chrom_pairs, bool force) { std::vector collisions{}; std::size_t num_collisions = 0; @@ -674,10 +972,9 @@ static void process_file_collisions( } } -static void validate_expected_values( - const ExpectedValues &expected_values, const std::filesystem::path &path_to_expected_values, - const std::vector> &chrom_pairs, - std::uint32_t resolution) { +static void validate_expected_values(const ExpectedValues &expected_values, + const std::filesystem::path &path_to_expected_values, + const ChromosomePairs &chrom_pairs, std::uint32_t resolution) { if (expected_values.resolution() != resolution) { throw std::runtime_error( fmt::format("mismatch in file resolution: expected values have been computed " @@ -721,8 +1018,9 @@ static void validate_expected_values( [[nodiscard]] static auto generate_execution_plan(const ComputePvalConfig &c) { struct Plan { - std::vector> chrom_pairs{}; + ChromosomePairs chrom_pairs{}; std::optional expected_values{}; + std::optional domains{}; }; Plan plan{}; @@ -733,13 +1031,17 @@ static void validate_expected_values( throw std::runtime_error("only file with uniform bin sizes are supported."); } + if (!c.path_to_domains.empty()) { + plan.domains.emplace(f.chromosomes(), c.path_to_domains, c.compute_cis, c.compute_trans); + } + if (c.compute_cis) { - plan.chrom_pairs = init_cis_chromosomes(f.chromosomes()); + plan.chrom_pairs = init_cis_chromosomes(f.chromosomes(), plan.domains); } if (c.compute_trans) { - std::ranges::copy(init_trans_chromosomes(f.chromosomes()), - std::back_inserter(plan.chrom_pairs)); + std::ranges::move(init_trans_chromosomes(f.chromosomes(), plan.domains), + std::inserter(plan.chrom_pairs, plan.chrom_pairs.end())); } process_file_collisions(c.output_prefix, plan.chrom_pairs, c.force); @@ -766,7 +1068,8 @@ int run_command(const ComputePvalConfig &c) { if (c.chrom1.has_value()) { assert(c.chrom2.has_value()); assert(!c.output_path.empty()); - const auto interactions_processed = run_nchg_compute_worker(c); + std::optional placeholder{}; + const auto interactions_processed = run_nchg_compute_worker(c, placeholder); const auto t1 = std::chrono::steady_clock::now(); SPDLOG_INFO("[{}:{}] processed {} records in {}!", *c.chrom1, *c.chrom2, interactions_processed, format_duration(t1 - t0)); @@ -776,9 +1079,9 @@ int run_command(const ComputePvalConfig &c) { return 0; } - const auto &[chrom_pairs, expected_values] = generate_execution_plan(c); + auto [chrom_pairs, expected_values, domains] = generate_execution_plan(c); - const auto interactions_processed = process_queries(chrom_pairs, expected_values, c); + const auto interactions_processed = process_queries(chrom_pairs, domains, expected_values, c); std::filesystem::remove_all(c.tmpdir); // NOLINT const auto t1 = std::chrono::steady_clock::now(); @@ -788,12 +1091,18 @@ int run_command(const ComputePvalConfig &c) { SPDLOG_INFO("processed {} records in {}!", interactions_processed, format_duration(t1 - t0)); } - SPDLOG_INFO("{} file(s) have been created under prefix \"{}\"", chrom_pairs.size() + 1, + SPDLOG_INFO("{} new file(s) have been created under prefix \"{}\"", chrom_pairs.size() + 1, c.output_prefix); return 0; } catch (...) { - std::filesystem::remove_all(c.tmpdir); // NOLINT + std::error_code ec{}; + std::filesystem::remove_all(c.tmpdir, ec); // NOLINT + + if (!static_cast(ec) && std::filesystem::exists(c.tmpdir)) { + SPDLOG_WARN("failed to remove temporary folder \"{}\"! Please remove the folder manually", + c.tmpdir.string()); + } throw; } } diff --git a/src/nchg/include/nchg/tools/io.hpp b/src/nchg/include/nchg/tools/io.hpp index 2bb9c6e..58896f7 100644 --- a/src/nchg/include/nchg/tools/io.hpp +++ b/src/nchg/include/nchg/tools/io.hpp @@ -172,7 +172,8 @@ template [[nodiscard]] phmap::flat_hash_map> parse_bin_mask( const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path); -[[nodiscard]] std::string_view truncate_bed3_record(std::string_view record, char sep = '\t'); +template +[[nodiscard]] std::string_view truncate_record(std::string_view record, char sep = '\t'); namespace internal { From 058275c4c2f48f8befa062306c021a913cdd3c4b Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:49:32 +0100 Subject: [PATCH 09/18] Update script to generate reference data for the integration tests --- ...ate_integration_test_reference_datasets.py | 238 +++++++++++++----- 1 file changed, 175 insertions(+), 63 deletions(-) diff --git a/test/scripts/generate_integration_test_reference_datasets.py b/test/scripts/generate_integration_test_reference_datasets.py index 9cc4e8a..73713c4 100755 --- a/test/scripts/generate_integration_test_reference_datasets.py +++ b/test/scripts/generate_integration_test_reference_datasets.py @@ -27,7 +27,9 @@ import subprocess as sp import sys import tempfile -from typing import List +from typing import Any, Dict, List + +import pandas as pd def existing_file(path: str) -> pathlib.Path: @@ -105,6 +107,48 @@ def add_common_flags(parser): ) +def make_cross_product_sc(main_parser): + sc: argparse.ArgumentParser = main_parser.add_parser( + "cross-product", + help="Generate the reference dataset for NCHG cross-product.", + ) + + sc.add_argument( + "1d-domains", + type=existing_file, + help="Path to a BED3+ files with the list of domains to be processed.", + ) + + sc.add_argument( + "out-path", + type=pathlib.Path, + help="Output path where to store the resulting domains.", + ) + + sc.add_argument( + "--type", + type=str, + choices={"gw", "cis", "trans"}, + default="gw", + help="Type of domains to be generated.", + ) + + sc.add_argument( + "--verbosity", + choices={"debug", "info", "warnings", "error", "critical"}, + type=str, + default="info", + help="Tweak log verbosity.", + ) + + sc.add_argument( + "--force", + action="store_true", + default=False, + help="Overwrite existing file(s).", + ) + + def make_compute_sc(main_parser): sc: argparse.ArgumentParser = main_parser.add_parser( "compute", @@ -286,6 +330,52 @@ def run_nchg_command(nchg_bin: pathlib.Path, subcmd: str, *args, **kwargs): raise RuntimeError(f"{cmd} returned with exit code {res.returncode}") +def generate_2d_domains( + domains: pathlib.Path, + output_path: pathlib.Path, + domain_type: str, + force: bool, +) -> pathlib.Path: + if output_path.exists() and not force: + raise RuntimeError(f'refusing to overwrite file "{output_path}": pass --force to overwrite.') + + parent_dir = output_path.parent + parent_dir.mkdir(exist_ok=True) + + logging.info("reading domains from %s...", domains) + + df = pd.read_table(domains, names=["chrom", "start", "end"], usecols=[0, 1, 2]) + chroms = {name: i for i, name in enumerate(df["chrom"].unique())} + + logging.info("parsed %d chromosomes...", len(chroms)) + df["id"] = df["chrom"].map(chroms) + + df = df.merge(df, how="cross", suffixes=("1", "2")) + # Drop domain pairs overlapping with the lower-triangular matrix + df = df[(df["id1"] < df["id2"]) | ((df["id1"] == df["id2"]) & (df["start1"] <= df["start2"]))] + + if domain_type != "cis": + df = df.sort_values(["id1", "id2", "start1", "start2", "end1", "end2"]) + + df = df.reset_index(drop=True).drop(columns=["id1", "id2"]) + + size = len(df) + logging.info("generated %d domain pairs...", size) + if domain_type == "cis": + logging.info("dropping trans domains...") + df = df[df["chrom1"] == df["chrom2"]] + logging.info("dropped %d domains...", size - len(df)) + elif domain_type == "trans": + logging.info("dropping cis domains...") + df = df[df["chrom1"] != df["chrom2"]] + logging.info("dropped %d domains...", size - len(df)) + + logging.info("writing 2D domains to %s...", output_path) + df.to_csv(output_path, sep="\t", index=False, header=False) + + return output_path + + def run_nchg_compute( nchg_bin: pathlib.Path, uri: str, @@ -395,6 +485,80 @@ def run_nchg_expected( return output_h5 +def generate_all_files(nchg_bin: pathlib.Path, force: bool, timeout: float, args: Dict[str, Any]): + suffix = pathlib.Path(args["cool-uri"].partition("::")[0]).stem + + for dom_type in ["gw", "cis", "trans"]: + output = args["output-dir"] / "cross_product" / f"{suffix}.{dom_type}-domains.bedpe" + generate_2d_domains(args["domains"], output, dom_type, args["force"]) + + domains = args["output-dir"] / "cross_product" / f"{suffix}.gw-domains.bedpe" + outprefix = args["output-dir"] / "compute_with_domains" / suffix + run_nchg_compute( + nchg_bin, + args["cool-uri"], + outprefix, + domains, + args["nproc"], + force, + timeout, + ) + + outprefix = args["output-dir"] / "compute" / suffix + run_nchg_compute( + nchg_bin, + args["cool-uri"], + outprefix, + None, + args["nproc"], + force, + timeout, + ) + + output = args["output-dir"] / "merge" / f"{suffix}.parquet" + output.parent.mkdir(parents=True, exist_ok=True) + run_nchg_merge( + nchg_bin, + outprefix, + output, + force, + timeout, + ) + + input = output + output = args["output-dir"] / "filter" / f"{suffix}.parquet" + output.parent.mkdir(parents=True, exist_ok=True) + run_nchg_filter( + nchg_bin, + input, + output, + force, + timeout, + ) + + input = output + output = args["output-dir"] / "view" / f"{suffix}.tsv" + output.parent.mkdir(parents=True, exist_ok=True) + run_nchg_view( + nchg_bin, + input, + output, + force, + timeout, + ) + + output = args["output-dir"] / "expected" / f"{suffix}.h5" + output.parent.mkdir(parents=True, exist_ok=True) + run_nchg_expected( + nchg_bin, + args["cool-uri"], + output, + [], + force, + timeout, + ) + + def find_nchg_exec(path: pathlib.Path | None) -> pathlib.Path: if path: if shutil.which(path): @@ -421,7 +585,14 @@ def main(): cmd = args["command"] - if cmd == "compute": + if cmd == "cross-product": + generate_2d_domains( + args["domains"], + args["out-path"], + args["type"], + args["force"], + ) + elif cmd == "compute": run_nchg_compute( nchg_bin, args["cool-uri"], @@ -465,70 +636,11 @@ def main(): timeout, ) elif not cmd: - suffix = pathlib.Path(args["cool-uri"].partition("::")[0]).stem - outprefix = args["output-dir"] / "compute_with_domains" / suffix - run_nchg_compute( + generate_all_files( nchg_bin, - args["cool-uri"], - outprefix, - args["domains"], - args["nproc"], - force, - timeout, - ) - - outprefix = args["output-dir"] / "compute" / suffix - run_nchg_compute( - nchg_bin, - args["cool-uri"], - outprefix, - None, - args["nproc"], - force, - timeout, - ) - - output = args["output-dir"] / "merge" / f"{suffix}.parquet" - output.parent.mkdir(parents=True, exist_ok=True) - run_nchg_merge( - nchg_bin, - outprefix, - output, - force, - timeout, - ) - - input = output - output = args["output-dir"] / "filter" / f"{suffix}.parquet" - output.parent.mkdir(parents=True, exist_ok=True) - run_nchg_filter( - nchg_bin, - input, - output, - force, - timeout, - ) - - input = output - output = args["output-dir"] / "view" / f"{suffix}.tsv" - output.parent.mkdir(parents=True, exist_ok=True) - run_nchg_view( - nchg_bin, - input, - output, - force, - timeout, - ) - - output = args["output-dir"] / "expected" / f"{suffix}.h5" - output.parent.mkdir(parents=True, exist_ok=True) - run_nchg_expected( - nchg_bin, - args["cool-uri"], - output, - [], force, timeout, + args, ) else: raise NotImplementedError From 4618352841f648998dfbe110db98f7b3c3faf242 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:28:30 +0100 Subject: [PATCH 10/18] Update validate_nchg_output.py to cover NCHG cartesian-product --- test/scripts/validate_nchg_output.py | 74 +++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/test/scripts/validate_nchg_output.py b/test/scripts/validate_nchg_output.py index 728a4ee..00f1522 100755 --- a/test/scripts/validate_nchg_output.py +++ b/test/scripts/validate_nchg_output.py @@ -48,6 +48,25 @@ def add_common_flags(parser): ) +def make_cartesian_product_sc(main_parser): + sc: argparse.ArgumentParser = main_parser.add_parser( + "cartesian-product", + help="Validate the output produced by NCHG cartesian-product.", + ) + sc.add_argument( + "test-bedpe", + type=pathlib.Path, + help="Path to the BEDPE file to be tested.", + ) + sc.add_argument( + "ref-bedpe", + type=pathlib.Path, + help="Path to the BEDPE file to be used as ground truth.", + ) + + add_common_flags(sc) + + def make_compute_sc(main_parser): sc: argparse.ArgumentParser = main_parser.add_parser( "compute", @@ -151,6 +170,7 @@ def make_cli() -> argparse.ArgumentParser: title="subcommands", dest="command", required=True, help="List of available subcommands:" ) + make_cartesian_product_sc(sub_parser) make_compute_sc(sub_parser) make_merge_sc(sub_parser) make_filter_sc(sub_parser) @@ -176,6 +196,10 @@ def import_table(path: pathlib.Path) -> pd.DataFrame: raise RuntimeError(f"failed to read data from {path}: file is not in .tsv or .parquet format") from e +def import_bedpe(path: pathlib.Path) -> pd.DataFrame: + return pd.read_table(path, names=["chrom1", "start1", "end1", "chrom2", "start2", "end2"]) + + def validate_columns(expected: pd.DataFrame, found: pd.DataFrame, path: pathlib.Path): logging.debug("validating columns for %s...", path) @@ -215,7 +239,7 @@ def validate_chrom_sizes(expected: pd.DataFrame, found: pd.DataFrame, path: path logging.debug("%s validation was successful!", path) -def validate_data(expected: pd.DataFrame, found: pd.DataFrame, path: pathlib.Path): +def validate_pvalue_tables(expected: pd.DataFrame, found: pd.DataFrame, path: pathlib.Path): logging.debug("validating %s...", path) if len(expected) != len(found): @@ -383,7 +407,7 @@ def validate_expected_profile_h5(expected: pathlib.Path, found: pathlib.Path): compare_h5_nodes_recursive(ref["/"], tgt["/"]) -def validate_table(expected_path: pathlib.Path, found_path: pathlib.Path): +def validate_pvalue_table(expected_path: pathlib.Path, found_path: pathlib.Path): assert expected_path.is_file() try: if not found_path.is_file(): @@ -393,11 +417,45 @@ def validate_table(expected_path: pathlib.Path, found_path: pathlib.Path): found = import_table(found_path) validate_columns(expected, found, found_path) - validate_data(expected, found, found_path) + validate_pvalue_tables(expected, found, found_path) except RuntimeError as e: raise RuntimeError(f"failed to validate table {found_path.stem}: {e}") +def validate_nchg_cartesian_product(test_file: pathlib.Path, ref_file: pathlib.Path) -> int: + logging.info(f"### NCHG cartesian-product: validating {test_file}...") + if test_file.resolve() == ref_file.resolve(): + raise RuntimeError(f"test-bedpe and ref-bedpe point to the same file: {ref_file}") + + expected = import_bedpe(ref_file) + found = import_bedpe(test_file) + + ok = True + if (expected.columns != found.columns).any(): + logging.error("table is not correct: expected %s columns, found %s", expected.columns, found.columns) + ok = False + + if len(expected) != len(found): + logging.error("table is not correct: expected %d rows, found %d", len(expected), len(found)) + ok = False + + if not ok: + return 1 + + errors = [] + for col in expected.columns: + num_mismatches = (expected[col] != found[col]).sum() + if num_mismatches != 0: + errors.append(f'Found {num_mismatches} mismatched values while comparing column "{col}".') + + if len(errors) != 0: + logging.error("data validation failed:\n - " + "\n - ".join(errors)) + return 1 + + logging.debug("%s validation was successful", test_file) + return 0 + + def validate_nchg_compute(test_prefix: pathlib.Path, ref_prefix: pathlib.Path) -> int: logging.info(f"### NCHG compute: validating {test_prefix}...") if test_prefix == ref_prefix: @@ -430,7 +488,7 @@ def validate_nchg_compute(test_prefix: pathlib.Path, ref_prefix: pathlib.Path) - logging.debug("processing table %s...", prefix.lstrip(".")) try: path2 = pathlib.Path(f"{test_prefix}{prefix}.parquet") - validate_table(path1, path2) + validate_pvalue_table(path1, path2) except RuntimeError as e: ok = False logging.error(e) @@ -446,7 +504,7 @@ def validate_nchg_merge(test_file: pathlib.Path, ref_file: pathlib.Path) -> int: raise RuntimeError(f"test-parquet and ref-parquet point to the same file: {ref_file}") try: - validate_table(ref_file, test_file) + validate_pvalue_table(ref_file, test_file) return 0 except RuntimeError as e: logging.error(e) @@ -459,7 +517,7 @@ def validate_nchg_filter(test_file: pathlib.Path, ref_file: pathlib.Path) -> int raise RuntimeError(f"test-parquet and ref-parquet point to the same file: {ref_file}") try: - validate_table(ref_file, test_file) + validate_pvalue_table(ref_file, test_file) return 0 except RuntimeError as e: logging.error(e) @@ -472,7 +530,7 @@ def validate_nchg_view(test_file: pathlib.Path, ref_file: pathlib.Path) -> int: raise RuntimeError(f"test-tsv and ref-tsv point to the same file: {ref_file}") try: - validate_table(ref_file, test_file) + validate_pvalue_table(ref_file, test_file) return 0 except RuntimeError as e: logging.error(e) @@ -496,6 +554,8 @@ def main() -> int: setup_logger(args["verbosity"].upper()) cmd = args["command"] + if cmd == "cartesian-product": + return validate_nchg_cartesian_product(args["test-bedpe"], args["ref-bedpe"]) if cmd == "compute": return validate_nchg_compute(args["test-prefix"], args["ref-prefix"]) From c68f8cadee466cb2ea55d94a0d74b1f522761ed7 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:28:38 +0100 Subject: [PATCH 11/18] Bugfix --- src/nchg/cartesian_product/cartesian_product.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nchg/cartesian_product/cartesian_product.cpp b/src/nchg/cartesian_product/cartesian_product.cpp index b7d7d19..e5d26dd 100644 --- a/src/nchg/cartesian_product/cartesian_product.cpp +++ b/src/nchg/cartesian_product/cartesian_product.cpp @@ -333,7 +333,7 @@ struct ChromIndex { for (std::size_t i = 0; i < domains.size(); ++i) { const auto& chrom = std::get<0>(domains[i]); const auto i0 = i; - auto i1 = i0; + auto i1 = i0 + 1; for (; i1 < domains.size(); ++i1) { if (chrom != std::get<0>(domains[i1])) { break; @@ -344,7 +344,7 @@ struct ChromIndex { #else index.emplace_back(chrom, i0, i1); #endif - i = i1; + i = i1 - 1; } return index; From c1252440e194ef97deb730f6a9ca3952d29dfe84 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:31:01 +0100 Subject: [PATCH 12/18] Fix typos --- .../generate_integration_test_reference_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/scripts/generate_integration_test_reference_datasets.py b/test/scripts/generate_integration_test_reference_datasets.py index 73713c4..8c421a4 100755 --- a/test/scripts/generate_integration_test_reference_datasets.py +++ b/test/scripts/generate_integration_test_reference_datasets.py @@ -107,7 +107,7 @@ def add_common_flags(parser): ) -def make_cross_product_sc(main_parser): +def make_cartesian_product_sc(main_parser): sc: argparse.ArgumentParser = main_parser.add_parser( "cross-product", help="Generate the reference dataset for NCHG cross-product.", @@ -489,10 +489,10 @@ def generate_all_files(nchg_bin: pathlib.Path, force: bool, timeout: float, args suffix = pathlib.Path(args["cool-uri"].partition("::")[0]).stem for dom_type in ["gw", "cis", "trans"]: - output = args["output-dir"] / "cross_product" / f"{suffix}.{dom_type}-domains.bedpe" + output = args["output-dir"] / "cartesian_product" / f"{suffix}.{dom_type}-domains.bedpe" generate_2d_domains(args["domains"], output, dom_type, args["force"]) - domains = args["output-dir"] / "cross_product" / f"{suffix}.gw-domains.bedpe" + domains = args["output-dir"] / "cartesian_product" / f"{suffix}.gw-domains.bedpe" outprefix = args["output-dir"] / "compute_with_domains" / suffix run_nchg_compute( nchg_bin, @@ -585,7 +585,7 @@ def main(): cmd = args["command"] - if cmd == "cross-product": + if cmd == "cartesian-product": generate_2d_domains( args["domains"], args["out-path"], From c242b45daea7403ed8a916ac4441eaec58af5879 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:39:04 +0100 Subject: [PATCH 13/18] Bugfix --- src/nchg/compute/compute.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/nchg/compute/compute.cpp b/src/nchg/compute/compute.cpp index c55cef0..96dba23 100644 --- a/src/nchg/compute/compute.cpp +++ b/src/nchg/compute/compute.cpp @@ -597,11 +597,18 @@ static void write_chrom_sizes_to_file(const hictk::Reference &chroms, } } } else { - for (const auto &chrom : chroms) { - if (chrom.is_all()) [[unlikely]] { + for (std::uint32_t chrom1_id = 0; chrom1_id < chroms.size(); ++chrom1_id) { + const auto &chrom1 = chroms.at(chrom1_id); + if (chrom1.is_all()) [[unlikely]] { continue; } - buffer.emplace(chrom, chrom); + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < chroms.size(); ++chrom2_id) { + const auto &chrom2 = chroms.at(chrom2_id); + if (chrom2.is_all()) [[unlikely]] { + continue; + } + buffer.emplace(chrom1, chrom2); + } } } return buffer; From 6bb932f873e9d6bfb35c7db3af02ea8d3e71b77f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:49:59 +0100 Subject: [PATCH 14/18] Update test dataset --- cmake/FetchTestDataset.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 547214b..d74cc2f 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -18,8 +18,8 @@ # gersemi: off file( - DOWNLOAD https://zenodo.org/records/14069775/files/nchg_test_data.tar.zst?download=1 - EXPECTED_HASH SHA256=097eb7d6228f470000291ad4b83e0d8b7fd916ecfab74749833c01f13e8a882b + DOWNLOAD https://zenodo.org/records/14133677/files/nchg_test_data.tar.zst?download=1 + EXPECTED_HASH SHA256=17d89d3d79996eadd5c9e60ed4411a5351abb87af8e0ab214856ea3bc9fab362 "${PROJECT_SOURCE_DIR}/test/data/nchg_test_data.tar.zst" ) # gersemi: on From 59f115e09fbd2b67bd51a02e867e9ad56a110801 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:50:47 +0100 Subject: [PATCH 15/18] Update Ubuntu CI --- .github/workflows/ubuntu-ci.yml | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 92a43ba..f2002ca 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -499,6 +499,66 @@ jobs: echo "DATA_DIR=$PWD/test/data/integration_tests" | tee -a "$GITHUB_ENV" echo "OUT_PREFIX=ENCFF447ERX.1000000" | tee -a "$GITHUB_ENV" + - name: Test NCHG cartesian-product (gw) + run: | + sudo -u devel \ + bin/NCHG cartesian-product \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > "$OUT_PREFIX.bedpe" + + test/scripts/validate_nchg_output.py cartesian-product \ + "$OUT_PREFIX.bedpe" \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" + + rm "$OUT_PREFIX"* + + - name: Test NCHG cartesian-product (cis) + run: | + sudo -u devel \ + bin/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.cis-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (trans) + run: | + sudo -u devel \ + bin/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (chr1:chr3) + run: | + sudo -u devel \ + bin/NCHG cartesian-product \ + --chrom1 chr1 \ + --chrom1 chr3 \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + grep -E '\bchr1\b.*\bchr3\b' \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" \ + > expected.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + expected.bedpe + + rm *.bedpe + - name: Test NCHG compute (mt) run: | sudo -u devel \ From 58acacf37d4f5c16e106888cf5d349a55f23da07 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:53:22 +0100 Subject: [PATCH 16/18] Update codecov CI --- .github/workflows/codecov.yml | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 77473ea..85e4dcb 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -379,6 +379,78 @@ jobs: echo "DATA_DIR=$PWD/test/data/integration_tests" | tee -a "$GITHUB_ENV" echo "OUT_PREFIX=ENCFF447ERX.1000000" | tee -a "$GITHUB_ENV" + - name: Test NCHG cartesian-product (gw) + run: | + LLVM_PROFILE_FILE="$PWD/nchg.cartesian-product-gw.profraw" + + sudo -u devel \ + -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ + build/src/NCHG cartesian-product \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > "$OUT_PREFIX.bedpe" + + test/scripts/validate_nchg_output.py cartesian-product \ + "$OUT_PREFIX.bedpe" \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" + + rm "$OUT_PREFIX"* + + - name: Test NCHG cartesian-product (cis) + run: | + LLVM_PROFILE_FILE="$PWD/nchg.cartesian-product-cis.profraw" + + sudo -u devel \ + -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ + build/src/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.cis-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (trans) + run: | + LLVM_PROFILE_FILE="$PWD/nchg.cartesian-product-trans.profraw" + + sudo -u devel \ + -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ + build/src/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (chr1:chr3) + run: | + LLVM_PROFILE_FILE="$PWD/nchg.cartesian-product-chr1-chr3.profraw" + + sudo -u devel \ + -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ + build/src/NCHG cartesian-product \ + --chrom1 chr1 \ + --chrom1 chr3 \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + grep -E '\bchr1\b.*\bchr3\b' \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" \ + > expected.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + expected.bedpe + + rm *.bedpe + - name: Test NCHG compute (mt) run: | LLVM_PROFILE_FILE="$PWD/nchg.compute-mt.%$(nproc)m.profraw" From 1f145db52c2269cc05d4c47b1502e439655128cf Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:54:33 +0100 Subject: [PATCH 17/18] Update macOS CI --- .github/workflows/macos-ci.yml | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index 5f738ee..c44a4fb 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -455,6 +455,62 @@ jobs: echo "DATA_DIR=$PWD/test/data/integration_tests" | tee -a "$GITHUB_ENV" echo "OUT_PREFIX=ENCFF447ERX.1000000" | tee -a "$GITHUB_ENV" + - name: Test NCHG cartesian-product (gw) + run: | + bin/NCHG cartesian-product \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > "$OUT_PREFIX.bedpe" + + test/scripts/validate_nchg_output.py cartesian-product \ + "$OUT_PREFIX.bedpe" \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" + + rm "$OUT_PREFIX"* + + - name: Test NCHG cartesian-product (cis) + run: | + bin/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.cis-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (trans) + run: | + bin/NCHG cartesian-product \ + --cis-only \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" + + rm *.bedpe + + - name: Test NCHG cartesian-product (chr1:chr3) + run: | + bin/NCHG cartesian-product \ + --chrom1 chr1 \ + --chrom1 chr3 \ + test/data/ENCFF447ERX.1000000.compartments.bed \ + > domains.bedpe + + grep -E '\bchr1\b.*\bchr3\b' \ + "$DATA_DIR/cartesian_product/$OUT_PREFIX.trans-domains.bedpe" \ + > expected.bedpe + + test/scripts/validate_nchg_output.py cartesian-product \ + domains.bedpe \ + expected.bedpe + + rm *.bedpe + - name: Test NCHG compute (mt) run: | bin/NCHG compute \ From 098f3e3ba5210acb6a4beb91435f027f9ec240d4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:58:09 +0100 Subject: [PATCH 18/18] Fix CI --- .github/workflows/codecov.yml | 23 +++++++---------------- .github/workflows/macos-ci.yml | 6 +++--- .github/workflows/ubuntu-ci.yml | 6 +++--- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 85e4dcb..bc08211 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -127,15 +127,6 @@ jobs: cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_PREFIX_PATH="$PWD/conan-env" \ - -DENABLE_DEVELOPER_MODE=ON \ - -DOPT_ENABLE_INTERPROCEDURAL_OPTIMIZATION=OFF \ - -DOPT_ENABLE_SANITIZER_ADDRESS=OFF \ - -DOPT_ENABLE_SANITIZER_LEAK=OFF \ - -DOPT_ENABLE_SANITIZER_POINTER_COMPARE=OFF \ - -DOPT_ENABLE_SANITIZER_POINTER_SUBTRACT=OFF \ - -DOPT_ENABLE_SANITIZER_UNDEFINED_BEHAVIOR=OFF \ - -DOPT_ENABLE_CPPCHECK=OFF \ - -DOPT_ENABLE_CLANG_TIDY=OFF \ -DNCHG_ENABLE_TESTING=ON \ -DNCHG_DOWNLOAD_TEST_DATASET=OFF \ -DCMAKE_C_FLAGS="$cov_flags" \ @@ -385,7 +376,7 @@ jobs: sudo -u devel \ -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ - build/src/NCHG cartesian-product \ + build/src/nchg/NCHG cartesian-product \ test/data/ENCFF447ERX.1000000.compartments.bed \ > "$OUT_PREFIX.bedpe" @@ -401,7 +392,7 @@ jobs: sudo -u devel \ -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ - build/src/NCHG cartesian-product \ + build/src/nchg/NCHG cartesian-product \ --cis-only \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -418,8 +409,8 @@ jobs: sudo -u devel \ -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ - build/src/NCHG cartesian-product \ - --cis-only \ + build/src/nchg/NCHG cartesian-product \ + --trans-only \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -435,9 +426,9 @@ jobs: sudo -u devel \ -E env "LLVM_PROFILE_FILE=$LLVM_PROFILE_FILE" \ - build/src/NCHG cartesian-product \ + build/src/nchg/NCHG cartesian-product \ --chrom1 chr1 \ - --chrom1 chr3 \ + --chrom2 chr3 \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -492,7 +483,7 @@ jobs: test/data/ENCFF447ERX.1000000.cool \ "$OUT_PREFIX" \ --threads "$(nproc)" \ - --domains test/data/ENCFF447ERX.1000000.compartments.bed + --domains "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" test/scripts/validate_nchg_output.py compute \ "$OUT_PREFIX" \ diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index c44a4fb..0fa1dff 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -483,7 +483,7 @@ jobs: - name: Test NCHG cartesian-product (trans) run: | bin/NCHG cartesian-product \ - --cis-only \ + --trans-only \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -497,7 +497,7 @@ jobs: run: | bin/NCHG cartesian-product \ --chrom1 chr1 \ - --chrom1 chr3 \ + --chrom2 chr3 \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -540,7 +540,7 @@ jobs: test/data/ENCFF447ERX.1000000.cool \ "$OUT_PREFIX" \ --threads "$NPROC" \ - --domains test/data/ENCFF447ERX.1000000.compartments.bed + --domains "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" test/scripts/validate_nchg_output.py compute \ "$OUT_PREFIX" \ diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index f2002ca..60587c8 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -530,7 +530,7 @@ jobs: run: | sudo -u devel \ bin/NCHG cartesian-product \ - --cis-only \ + --trans-only \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -545,7 +545,7 @@ jobs: sudo -u devel \ bin/NCHG cartesian-product \ --chrom1 chr1 \ - --chrom1 chr3 \ + --chrom2 chr3 \ test/data/ENCFF447ERX.1000000.compartments.bed \ > domains.bedpe @@ -591,7 +591,7 @@ jobs: test/data/ENCFF447ERX.1000000.cool \ "$OUT_PREFIX" \ --threads "$(nproc)" \ - --domains test/data/ENCFF447ERX.1000000.compartments.bed + --domains "$DATA_DIR/cartesian_product/$OUT_PREFIX.gw-domains.bedpe" test/scripts/validate_nchg_output.py compute \ "$OUT_PREFIX" \