From 202bfa8345f2d5c8606274e56340d267c068b33e Mon Sep 17 00:00:00 2001
From: Roberto Rossini <71787608+robomics@users.noreply.github.com>
Date: Wed, 13 Nov 2024 14:10:55 +0100
Subject: [PATCH 1/9] Split libnchg into a few smaller libraries
---
src/CMakeLists.txt | 26 +++++++-
src/common/CMakeLists.txt | 47 ++++++++++++++
.../include/nchg/common.hpp | 0
.../include/nchg/concepts.hpp | 0
src/{libnchg => common}/include/nchg/hash.hpp | 0
.../include/nchg/median.hpp | 0
.../include/nchg/suppress_warnings.hpp | 0
.../include/nchg/type_traits.hpp | 0
src/descriptive_stats/CMakeLists.txt | 62 +++++++++++++++++++
.../expected_matrix.cpp | 0
.../expected_matrix_impl.hpp | 0
.../expected_values.cpp | 0
.../expected_values_aggregator.cpp | 0
.../expected_values_aggregator_impl.hpp | 0
.../expected_values_impl.hpp | 0
.../nchg/chromosome_pairs_generator.hpp | 0
.../include/nchg/expected_matrix.hpp | 0
.../include/nchg/expected_values.hpp | 0
.../nchg/expected_values_aggregator.hpp | 0
.../include/nchg/matrix_stats.hpp | 0
.../include/nchg/observed_matrix.hpp | 0
.../matrix_stats_impl.hpp | 0
.../observed_matrix.cpp | 0
.../observed_matrix_impl.hpp | 0
.../CMakeLists.txt | 53 +++-------------
.../fdr_impl.hpp | 0
.../include/nchg/fdr.hpp | 0
.../include/nchg/nchg.hpp | 2 +-
src/{libnchg => hypothesis_testing}/nchg.cpp | 0
.../nchg_impl.hpp | 0
src/io/CMakeLists.txt | 39 ++++++++++++
src/{libnchg => io}/include/nchg/k_merger.hpp | 0
src/{libnchg => io}/k_merger_impl.hpp | 0
src/nchg/CMakeLists.txt | 5 +-
src/preproc/CMakeLists.txt | 48 ++++++++++++++
.../include/nchg/mad_max_filter.hpp | 0
src/{libnchg => preproc}/mad_max_filter.cpp | 3 +
.../mad_max_filter_impl.hpp | 0
test/units/expected_matrix/CMakeLists.txt | 2 +-
test/units/expected_values/CMakeLists.txt | 2 +-
test/units/fdr/CMakeLists.txt | 2 +-
test/units/nchg/CMakeLists.txt | 2 +-
test/units/observed_matrix/CMakeLists.txt | 2 +-
43 files changed, 243 insertions(+), 52 deletions(-)
create mode 100644 src/common/CMakeLists.txt
rename src/{libnchg => common}/include/nchg/common.hpp (100%)
rename src/{libnchg => common}/include/nchg/concepts.hpp (100%)
rename src/{libnchg => common}/include/nchg/hash.hpp (100%)
rename src/{libnchg => common}/include/nchg/median.hpp (100%)
rename src/{libnchg => common}/include/nchg/suppress_warnings.hpp (100%)
rename src/{libnchg => common}/include/nchg/type_traits.hpp (100%)
create mode 100644 src/descriptive_stats/CMakeLists.txt
rename src/{libnchg => descriptive_stats}/expected_matrix.cpp (100%)
rename src/{libnchg => descriptive_stats}/expected_matrix_impl.hpp (100%)
rename src/{libnchg => descriptive_stats}/expected_values.cpp (100%)
rename src/{libnchg => descriptive_stats}/expected_values_aggregator.cpp (100%)
rename src/{libnchg => descriptive_stats}/expected_values_aggregator_impl.hpp (100%)
rename src/{libnchg => descriptive_stats}/expected_values_impl.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/chromosome_pairs_generator.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/expected_matrix.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/expected_values.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/expected_values_aggregator.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/matrix_stats.hpp (100%)
rename src/{libnchg => descriptive_stats}/include/nchg/observed_matrix.hpp (100%)
rename src/{libnchg => descriptive_stats}/matrix_stats_impl.hpp (100%)
rename src/{libnchg => descriptive_stats}/observed_matrix.cpp (100%)
rename src/{libnchg => descriptive_stats}/observed_matrix_impl.hpp (100%)
rename src/{libnchg => hypothesis_testing}/CMakeLists.txt (58%)
rename src/{libnchg => hypothesis_testing}/fdr_impl.hpp (100%)
rename src/{libnchg => hypothesis_testing}/include/nchg/fdr.hpp (100%)
rename src/{libnchg => hypothesis_testing}/include/nchg/nchg.hpp (99%)
rename src/{libnchg => hypothesis_testing}/nchg.cpp (100%)
rename src/{libnchg => hypothesis_testing}/nchg_impl.hpp (100%)
create mode 100644 src/io/CMakeLists.txt
rename src/{libnchg => io}/include/nchg/k_merger.hpp (100%)
rename src/{libnchg => io}/k_merger_impl.hpp (100%)
create mode 100644 src/preproc/CMakeLists.txt
rename src/{libnchg => preproc}/include/nchg/mad_max_filter.hpp (100%)
rename src/{libnchg => preproc}/mad_max_filter.cpp (97%)
rename src/{libnchg => preproc}/mad_max_filter_impl.hpp (100%)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c0f2c0..2dba654 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,5 +16,29 @@
# with this library. If not, see
# <https://www.gnu.org/licenses/>.
-add_subdirectory(libnchg)
+include(FetchContent)
+
+set(HICTK_ENABLE_TESTING OFF)
+set(HICTK_BUILD_EXAMPLES OFF)
+set(HICTK_BUILD_BENCHMARKS OFF)
+set(HICTK_WITH_EIGEN OFF)
+set(HICTK_BUILD_TOOLS OFF)
+set(HICTK_INSTALL OFF)
+set(HICTK_ENABLE_GIT_VERSION_TRACKING OFF)
+
+FetchContent_Declare(
+ hictk
+ GIT_REPOSITORY https://github.com/paulsengroup/hictk.git
+ GIT_TAG v2.0.1
+ EXCLUDE_FROM_ALL
+ OVERRIDE_FIND_PACKAGE
+ SYSTEM
+)
+
+add_subdirectory(common)
+add_subdirectory(descriptive_stats)
+add_subdirectory(hypothesis_testing)
+add_subdirectory(io)
+add_subdirectory(preproc)
+
add_subdirectory(nchg)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
new file mode 100644
index 0000000..cb57fde
--- /dev/null
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,47 @@
+# Copyright (C) 2024 Roberto Rossini
+#
+# SPDX-License-Identifier: GPL-3.0
+#
+# This library is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Public License along
+# with this library. If not, see
+# <https://www.gnu.org/licenses/>.
+
+find_package(hictk REQUIRED)
+
+add_library(nchg_common INTERFACE)
+add_library(nchg::common ALIAS nchg_common)
+
+target_sources(
+ nchg_common
+ INTERFACE
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/nchg/common.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/nchg/concepts.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/nchg/hash.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/nchg/suppress_warnings.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/include/nchg/type_traits.hpp"
+)
+
+target_include_directories(
+ nchg_common
+ INTERFACE
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" # headers as seen from the build tree
+ "$<INSTALL_INTERFACE:include>" # headers as seen from an installed package
+)
+
+target_link_libraries(
+ nchg_common
+ INTERFACE
+ nchg_project_options
+ nchg_project_warnings
+ hictk::libhictk
+)
diff --git a/src/libnchg/include/nchg/common.hpp b/src/common/include/nchg/common.hpp
similarity index 100%
rename from src/libnchg/include/nchg/common.hpp
rename to src/common/include/nchg/common.hpp
diff --git a/src/libnchg/include/nchg/concepts.hpp b/src/common/include/nchg/concepts.hpp
similarity index 100%
rename from src/libnchg/include/nchg/concepts.hpp
rename to src/common/include/nchg/concepts.hpp
diff --git a/src/libnchg/include/nchg/hash.hpp b/src/common/include/nchg/hash.hpp
similarity index 100%
rename from src/libnchg/include/nchg/hash.hpp
rename to src/common/include/nchg/hash.hpp
diff --git a/src/libnchg/include/nchg/median.hpp b/src/common/include/nchg/median.hpp
similarity index 100%
rename from src/libnchg/include/nchg/median.hpp
rename to src/common/include/nchg/median.hpp
diff --git a/src/libnchg/include/nchg/suppress_warnings.hpp b/src/common/include/nchg/suppress_warnings.hpp
similarity index 100%
rename from src/libnchg/include/nchg/suppress_warnings.hpp
rename to src/common/include/nchg/suppress_warnings.hpp
diff --git a/src/libnchg/include/nchg/type_traits.hpp b/src/common/include/nchg/type_traits.hpp
similarity index 100%
rename from src/libnchg/include/nchg/type_traits.hpp
rename to src/common/include/nchg/type_traits.hpp
diff --git a/src/descriptive_stats/CMakeLists.txt b/src/descriptive_stats/CMakeLists.txt
new file mode 100644
index 0000000..8e291f9
--- /dev/null
+++ b/src/descriptive_stats/CMakeLists.txt
@@ -0,0 +1,62 @@
+# Copyright (C) 2024 Roberto Rossini
+#
+# SPDX-License-Identifier: GPL-3.0
+#
+# This library is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Public License along
+# with this library. If not, see
+# <https://www.gnu.org/licenses/>.
+
+find_package(FMT REQUIRED)
+find_package(hictk REQUIRED)
+find_package(HighFive REQUIRED)
+find_package(phmap REQUIRED)
+find_package(spdlog REQUIRED)
+
+add_library(nchg_descriptive_stats STATIC)
+add_library(nchg::descriptive_stats ALIAS nchg_descriptive_stats)
+
+target_sources(
+ nchg_descriptive_stats
+ PRIVATE
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_matrix.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_matrix_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_values.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_aggregator.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_aggregator_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/matrix_stats_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/observed_matrix.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/observed_matrix_impl.hpp"
+)
+
+target_include_directories(
+ nchg_descriptive_stats
+ PUBLIC
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" # headers as seen from the build tree
+ "$<INSTALL_INTERFACE:include>" # headers as seen from an installed package
+)
+
+target_link_libraries(
+ nchg_descriptive_stats
+ PRIVATE
+ nchg_project_options
+ nchg_project_warnings
+ fmt::fmt-header-only
+ spdlog::spdlog_header_only
+ PUBLIC
+ nchg::common
+ nchg::preproc
+ hictk::libhictk
+ HighFive
+ phmap
+)
diff --git a/src/libnchg/expected_matrix.cpp b/src/descriptive_stats/expected_matrix.cpp
similarity index 100%
rename from src/libnchg/expected_matrix.cpp
rename to src/descriptive_stats/expected_matrix.cpp
diff --git a/src/libnchg/expected_matrix_impl.hpp b/src/descriptive_stats/expected_matrix_impl.hpp
similarity index 100%
rename from src/libnchg/expected_matrix_impl.hpp
rename to src/descriptive_stats/expected_matrix_impl.hpp
diff --git a/src/libnchg/expected_values.cpp b/src/descriptive_stats/expected_values.cpp
similarity index 100%
rename from src/libnchg/expected_values.cpp
rename to src/descriptive_stats/expected_values.cpp
diff --git a/src/libnchg/expected_values_aggregator.cpp b/src/descriptive_stats/expected_values_aggregator.cpp
similarity index 100%
rename from src/libnchg/expected_values_aggregator.cpp
rename to src/descriptive_stats/expected_values_aggregator.cpp
diff --git a/src/libnchg/expected_values_aggregator_impl.hpp b/src/descriptive_stats/expected_values_aggregator_impl.hpp
similarity index 100%
rename from src/libnchg/expected_values_aggregator_impl.hpp
rename to src/descriptive_stats/expected_values_aggregator_impl.hpp
diff --git a/src/libnchg/expected_values_impl.hpp b/src/descriptive_stats/expected_values_impl.hpp
similarity index 100%
rename from src/libnchg/expected_values_impl.hpp
rename to src/descriptive_stats/expected_values_impl.hpp
diff --git a/src/libnchg/include/nchg/chromosome_pairs_generator.hpp b/src/descriptive_stats/include/nchg/chromosome_pairs_generator.hpp
similarity index 100%
rename from src/libnchg/include/nchg/chromosome_pairs_generator.hpp
rename to src/descriptive_stats/include/nchg/chromosome_pairs_generator.hpp
diff --git a/src/libnchg/include/nchg/expected_matrix.hpp b/src/descriptive_stats/include/nchg/expected_matrix.hpp
similarity index 100%
rename from src/libnchg/include/nchg/expected_matrix.hpp
rename to src/descriptive_stats/include/nchg/expected_matrix.hpp
diff --git a/src/libnchg/include/nchg/expected_values.hpp b/src/descriptive_stats/include/nchg/expected_values.hpp
similarity index 100%
rename from src/libnchg/include/nchg/expected_values.hpp
rename to src/descriptive_stats/include/nchg/expected_values.hpp
diff --git a/src/libnchg/include/nchg/expected_values_aggregator.hpp b/src/descriptive_stats/include/nchg/expected_values_aggregator.hpp
similarity index 100%
rename from src/libnchg/include/nchg/expected_values_aggregator.hpp
rename to src/descriptive_stats/include/nchg/expected_values_aggregator.hpp
diff --git a/src/libnchg/include/nchg/matrix_stats.hpp b/src/descriptive_stats/include/nchg/matrix_stats.hpp
similarity index 100%
rename from src/libnchg/include/nchg/matrix_stats.hpp
rename to src/descriptive_stats/include/nchg/matrix_stats.hpp
diff --git a/src/libnchg/include/nchg/observed_matrix.hpp b/src/descriptive_stats/include/nchg/observed_matrix.hpp
similarity index 100%
rename from src/libnchg/include/nchg/observed_matrix.hpp
rename to src/descriptive_stats/include/nchg/observed_matrix.hpp
diff --git a/src/libnchg/matrix_stats_impl.hpp b/src/descriptive_stats/matrix_stats_impl.hpp
similarity index 100%
rename from src/libnchg/matrix_stats_impl.hpp
rename to src/descriptive_stats/matrix_stats_impl.hpp
diff --git a/src/libnchg/observed_matrix.cpp b/src/descriptive_stats/observed_matrix.cpp
similarity index 100%
rename from src/libnchg/observed_matrix.cpp
rename to src/descriptive_stats/observed_matrix.cpp
diff --git a/src/libnchg/observed_matrix_impl.hpp b/src/descriptive_stats/observed_matrix_impl.hpp
similarity index 100%
rename from src/libnchg/observed_matrix_impl.hpp
rename to src/descriptive_stats/observed_matrix_impl.hpp
diff --git a/src/libnchg/CMakeLists.txt b/src/hypothesis_testing/CMakeLists.txt
similarity index 58%
rename from src/libnchg/CMakeLists.txt
rename to src/hypothesis_testing/CMakeLists.txt
index 7dc7c00..3de3c77 100644
--- a/src/libnchg/CMakeLists.txt
+++ b/src/hypothesis_testing/CMakeLists.txt
@@ -16,25 +16,6 @@
# with this library. If not, see
# <https://www.gnu.org/licenses/>.
-include(FetchContent)
-
-set(HICTK_ENABLE_TESTING OFF)
-set(HICTK_BUILD_EXAMPLES OFF)
-set(HICTK_BUILD_BENCHMARKS OFF)
-set(HICTK_WITH_EIGEN OFF)
-set(HICTK_BUILD_TOOLS OFF)
-set(HICTK_INSTALL OFF)
-set(HICTK_ENABLE_GIT_VERSION_TRACKING OFF)
-
-FetchContent_Declare(
- hictk
- GIT_REPOSITORY https://github.com/paulsengroup/hictk.git
- GIT_TAG v2.0.1
- EXCLUDE_FROM_ALL
- OVERRIDE_FIND_PACKAGE
- SYSTEM
-)
-
FetchContent_Declare(
stocc
URL
@@ -56,52 +37,36 @@ FetchContent_MakeAvailable(stocc)
set(BUILD_SHARED_LIBS "${NCHG_BUILD_SHARED_LIBS}")
unset(NCHG_BUILD_SHARED_LIBS)
-add_library(libnchg OBJECT)
-add_library(nchg::libnchg ALIAS libnchg)
+add_library(nchg_hypothesis_testing STATIC)
+add_library(nchg::hypothesis_testing ALIAS nchg_hypothesis_testing)
target_sources(
- libnchg
+ nchg_hypothesis_testing
PRIVATE
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_matrix.cpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_matrix_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_aggregator.cpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_aggregator_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_values.cpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/expected_values_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/fdr_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/k_merger_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/mad_max_filter.cpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/mad_max_filter_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/matrix_stats_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/nchg.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/nchg_impl.hpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/observed_matrix.cpp"
- "${CMAKE_CURRENT_SOURCE_DIR}/observed_matrix_impl.hpp"
)
target_include_directories(
- libnchg
+ nchg_hypothesis_testing
PUBLIC
"$"
"$"
)
target_link_libraries(
- libnchg
+ nchg_hypothesis_testing
PRIVATE
nchg_project_options
nchg_project_warnings
fmt::fmt-header-only
spdlog::spdlog_header_only
+ stocc
PUBLIC
+ nchg::common
+ nchg::descriptive_stats
+ nchg::preproc
hictk::libhictk
phmap
- stocc
-)
-
-set_target_properties(
- libnchg
- PROPERTIES
- OUTPUT_NAME
- libnchg
)
diff --git a/src/libnchg/fdr_impl.hpp b/src/hypothesis_testing/fdr_impl.hpp
similarity index 100%
rename from src/libnchg/fdr_impl.hpp
rename to src/hypothesis_testing/fdr_impl.hpp
diff --git a/src/libnchg/include/nchg/fdr.hpp b/src/hypothesis_testing/include/nchg/fdr.hpp
similarity index 100%
rename from src/libnchg/include/nchg/fdr.hpp
rename to src/hypothesis_testing/include/nchg/fdr.hpp
diff --git a/src/libnchg/include/nchg/nchg.hpp b/src/hypothesis_testing/include/nchg/nchg.hpp
similarity index 99%
rename from src/libnchg/include/nchg/nchg.hpp
rename to src/hypothesis_testing/include/nchg/nchg.hpp
index c9f782b..37eb060 100644
--- a/src/libnchg/include/nchg/nchg.hpp
+++ b/src/hypothesis_testing/include/nchg/nchg.hpp
@@ -84,7 +84,7 @@ class NCHG {
iterator>;
using Params = ExpectedValues::Params;
- static constexpr auto& DefaultParams = ExpectedValues::DefaultParams;
+ static constexpr auto DefaultParams = ExpectedValues::DefaultParams;
explicit NCHG(std::shared_ptr f, const hictk::Chromosome& chrom1,
const hictk::Chromosome& chrom2, const Params& params);
NCHG(std::shared_ptr f, const hictk::Chromosome& chrom1,
diff --git a/src/libnchg/nchg.cpp b/src/hypothesis_testing/nchg.cpp
similarity index 100%
rename from src/libnchg/nchg.cpp
rename to src/hypothesis_testing/nchg.cpp
diff --git a/src/libnchg/nchg_impl.hpp b/src/hypothesis_testing/nchg_impl.hpp
similarity index 100%
rename from src/libnchg/nchg_impl.hpp
rename to src/hypothesis_testing/nchg_impl.hpp
diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt
new file mode 100644
index 0000000..883c86c
--- /dev/null
+++ b/src/io/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Copyright (C) 2024 Roberto Rossini
+#
+# SPDX-License-Identifier: GPL-3.0
+#
+# This library is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Public License along
+# with this library. If not, see
+# <https://www.gnu.org/licenses/>.
+
+find_package(hictk REQUIRED)
+
+add_library(nchg_io INTERFACE)
+add_library(nchg::io ALIAS nchg_io)
+
+target_sources(nchg_io INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/k_merger_impl.hpp")
+
+target_include_directories(
+ nchg_io
+ INTERFACE
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" # headers as seen from the build tree
+ "$<INSTALL_INTERFACE:include>" # headers as seen from an installed package
+)
+
+target_link_libraries(
+ nchg_io
+ INTERFACE
+ nchg_project_options
+ nchg_project_warnings
+ hictk::libhictk
+)
diff --git a/src/libnchg/include/nchg/k_merger.hpp b/src/io/include/nchg/k_merger.hpp
similarity index 100%
rename from src/libnchg/include/nchg/k_merger.hpp
rename to src/io/include/nchg/k_merger.hpp
diff --git a/src/libnchg/k_merger_impl.hpp b/src/io/k_merger_impl.hpp
similarity index 100%
rename from src/libnchg/k_merger_impl.hpp
rename to src/io/k_merger_impl.hpp
diff --git a/src/nchg/CMakeLists.txt b/src/nchg/CMakeLists.txt
index 16e3697..6c527ed 100644
--- a/src/nchg/CMakeLists.txt
+++ b/src/nchg/CMakeLists.txt
@@ -50,6 +50,10 @@ target_link_libraries(
PRIVATE
nchg_project_options
nchg_project_warnings
+ nchg::common
+ nchg::descriptive_stats
+ nchg::hypothesis_testing
+ nchg::io
Boost::headers
Boost::filesystem # Required by boost::process::v2
bshoshany-thread-pool::bshoshany-thread-pool
@@ -59,7 +63,6 @@ target_link_libraries(
readerwriterqueue::readerwriterqueue
spdlog::spdlog_header_only
PUBLIC
- nchg::libnchg
Arrow::arrow_$,shared,static>
hictk::libhictk
HighFive
diff --git a/src/preproc/CMakeLists.txt b/src/preproc/CMakeLists.txt
new file mode 100644
index 0000000..de7fb93
--- /dev/null
+++ b/src/preproc/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (C) 2024 Roberto Rossini
+#
+# SPDX-License-Identifier: GPL-3.0
+#
+# This library is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Public License as published
+# by the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Public License along
+# with this library. If not, see
+# <https://www.gnu.org/licenses/>.
+
+find_package(FMT REQUIRED)
+find_package(hictk REQUIRED)
+find_package(phmap REQUIRED)
+find_package(spdlog REQUIRED)
+
+add_library(preproc STATIC)
+add_library(nchg::preproc ALIAS preproc)
+
+target_sources(
+ preproc
+ PRIVATE
+ "${CMAKE_CURRENT_SOURCE_DIR}/mad_max_filter.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/mad_max_filter_impl.hpp"
+)
+
+target_include_directories(
+ preproc
+ PUBLIC
+ "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" # headers as seen from the build tree
+ "$<INSTALL_INTERFACE:include>" # headers as seen from an installed package
+)
+
+target_link_libraries(
+ preproc
+ PRIVATE
+ nchg_project_options
+ nchg_project_warnings
+ PUBLIC
+ nchg::common
+)
diff --git a/src/libnchg/include/nchg/mad_max_filter.hpp b/src/preproc/include/nchg/mad_max_filter.hpp
similarity index 100%
rename from src/libnchg/include/nchg/mad_max_filter.hpp
rename to src/preproc/include/nchg/mad_max_filter.hpp
diff --git a/src/libnchg/mad_max_filter.cpp b/src/preproc/mad_max_filter.cpp
similarity index 97%
rename from src/libnchg/mad_max_filter.cpp
rename to src/preproc/mad_max_filter.cpp
index 6e2017e..0a19527 100644
--- a/src/libnchg/mad_max_filter.cpp
+++ b/src/preproc/mad_max_filter.cpp
@@ -21,8 +21,11 @@
#include
#include
#include
+#include
#include
+#include "nchg/median.hpp"
+
namespace nchg {
std::vector mad_max_filtering(std::vector& margs, double mad_max) {
diff --git a/src/libnchg/mad_max_filter_impl.hpp b/src/preproc/mad_max_filter_impl.hpp
similarity index 100%
rename from src/libnchg/mad_max_filter_impl.hpp
rename to src/preproc/mad_max_filter_impl.hpp
diff --git a/test/units/expected_matrix/CMakeLists.txt b/test/units/expected_matrix/CMakeLists.txt
index 072fde3..be3f163 100644
--- a/test/units/expected_matrix/CMakeLists.txt
+++ b/test/units/expected_matrix/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
nchg_project_warnings
nchg_project_options
PUBLIC
- nchg::libnchg
+ nchg::descriptive_stats
)
target_link_system_libraries(
diff --git a/test/units/expected_values/CMakeLists.txt b/test/units/expected_values/CMakeLists.txt
index 5ff7636..7c94bf4 100644
--- a/test/units/expected_values/CMakeLists.txt
+++ b/test/units/expected_values/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
nchg_project_warnings
nchg_project_options
PUBLIC
- nchg::libnchg
+ nchg::descriptive_stats
)
target_link_system_libraries(
diff --git a/test/units/fdr/CMakeLists.txt b/test/units/fdr/CMakeLists.txt
index 163fec8..17080c5 100644
--- a/test/units/fdr/CMakeLists.txt
+++ b/test/units/fdr/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
nchg_project_warnings
nchg_project_options
PUBLIC
- nchg::libnchg
+ nchg::hypothesis_testing
)
target_link_system_libraries(
diff --git a/test/units/nchg/CMakeLists.txt b/test/units/nchg/CMakeLists.txt
index 37143c4..375dc7f 100644
--- a/test/units/nchg/CMakeLists.txt
+++ b/test/units/nchg/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
nchg_project_warnings
nchg_project_options
PUBLIC
- nchg::libnchg
+ nchg::hypothesis_testing
)
target_link_system_libraries(
diff --git a/test/units/observed_matrix/CMakeLists.txt b/test/units/observed_matrix/CMakeLists.txt
index 3b10a0a..fe950b2 100644
--- a/test/units/observed_matrix/CMakeLists.txt
+++ b/test/units/observed_matrix/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
nchg_project_warnings
nchg_project_options
PUBLIC
- nchg::libnchg
+ nchg::descriptive_stats
)
target_link_system_libraries(
From b446792288acc72e8114bb46ede0bb60b7ecbadd Mon Sep 17 00:00:00 2001
From: Roberto Rossini <71787608+robomics@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:54:21 +0100
Subject: [PATCH 2/9] Collect all code related to file IO into the nchg_io lib
---
src/descriptive_stats/CMakeLists.txt | 2 +
src/io/CMakeLists.txt | 28 +-
src/io/include/nchg/parquet_helpers.hpp | 49 ++++
.../include/nchg/parquet_stats_file.hpp} | 69 +----
src/io/include/nchg/record_batch_builder.hpp | 101 +++++++
src/io/include/nchg/text.hpp | 47 ++++
src/io/parquet_helpers.cpp | 112 ++++++++
.../io.cpp => io/parquet_stats_file.cpp} | 250 +-----------------
src/io/parquet_stats_file_impl.hpp | 147 ++++++++++
src/io/record_batch_builder.cpp | 166 ++++++++++++
.../record_batch_builder_impl.hpp} | 126 +--------
src/io/text.cpp | 108 ++++++++
src/io/text_impl.hpp | 46 ++++
src/nchg/CMakeLists.txt | 5 -
.../cartesian_product/cartesian_product.cpp | 2 +-
src/nchg/compute/compute.cpp | 3 +-
src/nchg/expected/expected.cpp | 2 +-
src/nchg/filter/filter.cpp | 4 +-
src/nchg/merge/merge.cpp | 4 +-
src/nchg/view/view.cpp | 2 +-
20 files changed, 823 insertions(+), 450 deletions(-)
create mode 100644 src/io/include/nchg/parquet_helpers.hpp
rename src/{nchg/include/nchg/tools/io.hpp => io/include/nchg/parquet_stats_file.hpp} (66%)
create mode 100644 src/io/include/nchg/record_batch_builder.hpp
create mode 100644 src/io/include/nchg/text.hpp
create mode 100644 src/io/parquet_helpers.cpp
rename src/{nchg/common/io.cpp => io/parquet_stats_file.cpp} (51%)
create mode 100644 src/io/parquet_stats_file_impl.hpp
create mode 100644 src/io/record_batch_builder.cpp
rename src/{nchg/common/io_impl.hpp => io/record_batch_builder_impl.hpp} (56%)
create mode 100644 src/io/text.cpp
create mode 100644 src/io/text_impl.hpp
diff --git a/src/descriptive_stats/CMakeLists.txt b/src/descriptive_stats/CMakeLists.txt
index 8e291f9..c9c72b4 100644
--- a/src/descriptive_stats/CMakeLists.txt
+++ b/src/descriptive_stats/CMakeLists.txt
@@ -18,6 +18,7 @@
find_package(FMT REQUIRED)
find_package(hictk REQUIRED)
+find_package(HDF5 REQUIRED QUIET COMPONENTS C)
find_package(HighFive REQUIRED)
find_package(phmap REQUIRED)
find_package(spdlog REQUIRED)
@@ -57,6 +58,7 @@ target_link_libraries(
nchg::common
nchg::preproc
hictk::libhictk
+ HDF5::HDF5
HighFive
phmap
)
diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt
index 883c86c..8077165 100644
--- a/src/io/CMakeLists.txt
+++ b/src/io/CMakeLists.txt
@@ -16,24 +16,44 @@
# with this library. If not, see
# <https://www.gnu.org/licenses/>.
+find_package(Arrow REQUIRED)
+find_package(FMT REQUIRED)
find_package(hictk REQUIRED)
+find_package(spdlog REQUIRED)
-add_library(nchg_io INTERFACE)
+add_library(nchg_io STATIC)
add_library(nchg::io ALIAS nchg_io)
-target_sources(nchg_io INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/k_merger_impl.hpp")
+target_sources(
+ nchg_io
+ PRIVATE
+ "${CMAKE_CURRENT_SOURCE_DIR}/k_merger_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/parquet_helpers.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/parquet_stats_file.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/parquet_stats_file_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/record_batch_builder.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/record_batch_builder_impl.hpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/text.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/text_impl.hpp"
+)
target_include_directories(
nchg_io
- INTERFACE
+ PUBLIC
"$"
"$"
)
target_link_libraries(
nchg_io
- INTERFACE
+ PRIVATE
nchg_project_options
nchg_project_warnings
+ nchg::common
+ fmt::fmt-header-only
+ spdlog::spdlog_header_only
+ PUBLIC
+ Arrow::arrow_$<IF:$<BOOL:${BUILD_SHARED_LIBS}>,shared,static> # pick the Arrow flavour matching BUILD_SHARED_LIBS
hictk::libhictk
+ Parquet::parquet_$<IF:$<BOOL:${BUILD_SHARED_LIBS}>,shared,static> # same selection for Parquet
)
diff --git a/src/io/include/nchg/parquet_helpers.hpp b/src/io/include/nchg/parquet_helpers.hpp
new file mode 100644
index 0000000..86669cb
--- /dev/null
+++ b/src/io/include/nchg/parquet_helpers.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#pragma once
+
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+#include
+#include
+#include
+
+namespace nchg {
+
+// Member-detection trait, see https://stackoverflow.com/a/16000226: true_type iff T has a member named pval_corrected
+template <typename T, typename = int>
+struct has_pval_corrected : std::false_type {};  // primary template: chosen when the specialization below is not viable
+
+template <typename T>
+struct has_pval_corrected<T, decltype((void)T::pval_corrected, 0)> : std::true_type {};  // viable only if T::pval_corrected exists
+
+[[nodiscard]] std::shared_ptr get_schema(const hictk::Reference &chroms);
+[[nodiscard]] std::shared_ptr get_schema_padj(const hictk::Reference &chroms);
+
+[[nodiscard]] parquet::Compression::type parse_parquet_compression(std::string_view method);
+
+} // namespace nchg
diff --git a/src/nchg/include/nchg/tools/io.hpp b/src/io/include/nchg/parquet_stats_file.hpp
similarity index 66%
rename from src/nchg/include/nchg/tools/io.hpp
rename to src/io/include/nchg/parquet_stats_file.hpp
index 58896f7..99846e6 100644
--- a/src/nchg/include/nchg/tools/io.hpp
+++ b/src/io/include/nchg/parquet_stats_file.hpp
@@ -35,27 +35,11 @@ NCHG_DISABLE_WARNING_POP
#include
#include
#include
-#include
#include
#include
-#include
-#include
namespace nchg {
-// https://stackoverflow.com/a/16000226
-template
-struct has_pval_corrected : std::false_type {};
-
-template
-struct has_pval_corrected : std::true_type {};
-
-template
-struct has_log_ratio : std::false_type {};
-
-template
-struct has_log_ratio : std::true_type {};
-
class ParquetStatsFile {
public:
enum class RecordType : std::uint_fast8_t { infer, NCHGCompute, NCHGFilter };
@@ -121,57 +105,6 @@ class ParquetStatsFile {
};
};
-class RecordBatchBuilder {
- std::size_t _i{};
-
- hictk::Reference _chroms{};
-
- arrow::StringDictionary32Builder _chrom1{};
- arrow::UInt32Builder _start1{};
- arrow::UInt32Builder _end1{};
-
- arrow::StringDictionary32Builder _chrom2{};
- arrow::UInt32Builder _start2{};
- arrow::UInt32Builder _end2{};
-
- arrow::DoubleBuilder _pvalue{};
- arrow::DoubleBuilder _pvalue_corrected{};
- arrow::UInt64Builder _observed{};
- arrow::DoubleBuilder _expected{};
- arrow::DoubleBuilder _log_ratio{};
- arrow::DoubleBuilder _odds{};
- arrow::DoubleBuilder _omega{};
-
- public:
- RecordBatchBuilder(hictk::Reference chroms);
-
- [[nodiscard]] std::size_t size() const noexcept;
- [[nodiscard]] std::size_t capacity() const noexcept;
-
- template
- void append(const Stats &s);
- void reset();
-
- [[nodiscard]] std::shared_ptr get();
-
- void write(parquet::arrow::FileWriter &writer);
-
- private:
- template
- void append(ArrayBuilder &builder, const T &data);
-
- template
- [[nodiscard]] std::shared_ptr finish(ArrayBuilder &builder);
-};
-
-template
-[[nodiscard]] std::unique_ptr init_parquet_file_writer(
- const hictk::Reference &chroms, const std::filesystem::path &path, bool force,
- std::string_view compression_method, std::uint8_t compression_lvl, std::size_t threads);
-
-[[nodiscard]] phmap::flat_hash_map> parse_bin_mask(
- const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path);
-
template
[[nodiscard]] std::string_view truncate_record(std::string_view record, char sep = '\t');
@@ -186,4 +119,4 @@ namespace internal {
} // namespace nchg
-#include "../../../common/io_impl.hpp"
+#include "../../parquet_stats_file_impl.hpp"
diff --git a/src/io/include/nchg/record_batch_builder.hpp b/src/io/include/nchg/record_batch_builder.hpp
new file mode 100644
index 0000000..990179d
--- /dev/null
+++ b/src/io/include/nchg/record_batch_builder.hpp
@@ -0,0 +1,101 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#pragma once
+
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+#include
+#include
+#include
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nchg {
+// NOTE(review): this declaration is repeated verbatim after RecordBatchBuilder; one copy is redundant.
+template
+[[nodiscard]] std::unique_ptr init_parquet_file_writer(
+ const hictk::Reference &chroms, const std::filesystem::path &path, bool force,
+ std::string_view compression_method, std::uint8_t compression_lvl, std::size_t threads);
+// Holds one Arrow array builder per output column and turns the accumulated values into record batches.
+class RecordBatchBuilder {
+ std::size_t _i{};  // row counter: returned by size(), zeroed by reset()
+
+ hictk::Reference _chroms{};
+
+ arrow::StringDictionary32Builder _chrom1{};
+ arrow::UInt32Builder _start1{};
+ arrow::UInt32Builder _end1{};
+
+ arrow::StringDictionary32Builder _chrom2{};
+ arrow::UInt32Builder _start2{};
+ arrow::UInt32Builder _end2{};
+
+ arrow::DoubleBuilder _pvalue{};
+ arrow::DoubleBuilder _pvalue_corrected{};
+ arrow::UInt64Builder _observed{};
+ arrow::DoubleBuilder _expected{};
+ arrow::DoubleBuilder _log_ratio{};
+ arrow::DoubleBuilder _odds{};
+ arrow::DoubleBuilder _omega{};
+
+ public:
+ RecordBatchBuilder(hictk::Reference chroms);
+
+ [[nodiscard]] std::size_t size() const noexcept;
+ [[nodiscard]] std::size_t capacity() const noexcept;
+
+ template
+ void append(const Stats &s);
+ void reset();
+ // Finishes the column builders and assembles them into a single record batch.
+ [[nodiscard]] std::shared_ptr get();
+ // Writes the batch produced by get() through writer, then resets the builder.
+ void write(parquet::arrow::FileWriter &writer);
+
+ private:
+ template
+ void append(ArrayBuilder &builder, const T &data);
+
+ template
+ [[nodiscard]] std::shared_ptr finish(ArrayBuilder &builder);
+};
+// NOTE(review): repeats the declaration already made above the class; one of the two is redundant.
+template
+[[nodiscard]] std::unique_ptr init_parquet_file_writer(
+ const hictk::Reference &chroms, const std::filesystem::path &path, bool force,
+ std::string_view compression_method, std::uint8_t compression_lvl, std::size_t threads);
+// NOTE(review): nchg/text.hpp declares this same function; presumably only one declaration is needed.
+[[nodiscard]] phmap::flat_hash_map> parse_bin_mask(
+ const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path);
+
+} // namespace nchg
+
+#include "../../record_batch_builder_impl.hpp"
diff --git a/src/io/include/nchg/text.hpp b/src/io/include/nchg/text.hpp
new file mode 100644
index 0000000..5bb49bf
--- /dev/null
+++ b/src/io/include/nchg/text.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#pragma once
+
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nchg {
+// Declaration only (sep defaults to tab); presumably trims a sep-delimited record - confirm in text_impl.hpp.
+template
+[[nodiscard]] constexpr std::string_view truncate_record(std::string_view record, char sep = '\t');
+// Builds the bin mask from the file at path for the given bin_size (definition not visible in this header).
+[[nodiscard]] phmap::flat_hash_map> parse_bin_mask(
+ const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path);
+
+} // namespace nchg
+
+#include "../../text_impl.hpp"
diff --git a/src/io/parquet_helpers.cpp b/src/io/parquet_helpers.cpp
new file mode 100644
index 0000000..7bac7c0
--- /dev/null
+++ b/src/io/parquet_helpers.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#include "nchg/parquet_helpers.hpp"
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+#include
+#include
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nchg {
+// Collects chromosome name -> size (formatted as text) pairs, skipping entries where is_all() is true.
+static std::shared_ptr generate_schema_metadata(
+ const hictk::Reference &chroms) {
+ std::vector keys{};
+ std::vector values{};
+ for (const auto &chrom : chroms) {
+ if (!chrom.is_all()) {
+ keys.emplace_back(chrom.name());
+ values.emplace_back(fmt::to_string(chrom.size()));
+ }
+ }
+
+ return std::make_shared(std::move(keys), std::move(values));
+}
+// Schema without a corrected p-value: 12 fields; only chrom1/chrom2 carry the chromosome metadata.
+std::shared_ptr get_schema(const hictk::Reference &chroms) {
+ const auto chrom_dtype = arrow::dictionary(arrow::int32(), arrow::utf8());
+ const auto metadata = generate_schema_metadata(chroms);
+
+ return arrow::schema({
+ // clang-format off
+ arrow::field("chrom1", chrom_dtype, false, metadata),
+ arrow::field("start1", arrow::uint32(), false),
+ arrow::field("end1", arrow::uint32(), false),
+ arrow::field("chrom2", chrom_dtype, false, metadata),
+ arrow::field("start2", arrow::uint32(), false),
+ arrow::field("end2", arrow::uint32(), false),
+ arrow::field("pvalue", arrow::float64(), false),
+ arrow::field("observed_count", arrow::uint64(), false),
+ arrow::field("expected_count", arrow::float64(), false),
+ arrow::field("log_ratio", arrow::float64(), false),
+ arrow::field("odds_ratio", arrow::float64(), false),
+ arrow::field("omega", arrow::float64(), false)
+ // clang-format on
+ });
+}
+// Same layout as get_schema() plus a "pvalue_corrected" field right after "pvalue" (13 fields in total).
+std::shared_ptr get_schema_padj(const hictk::Reference &chroms) {
+ const auto chrom_dtype = arrow::dictionary(arrow::int32(), arrow::utf8());
+ const auto metadata = generate_schema_metadata(chroms);
+
+ return arrow::schema({
+ // clang-format off
+ arrow::field("chrom1", chrom_dtype, false, metadata),
+ arrow::field("start1", arrow::uint32(), false),
+ arrow::field("end1", arrow::uint32(), false),
+ arrow::field("chrom2", chrom_dtype, false, metadata),
+ arrow::field("start2", arrow::uint32(), false),
+ arrow::field("end2", arrow::uint32(), false),
+ arrow::field("pvalue", arrow::float64(), false),
+ arrow::field("pvalue_corrected", arrow::float64(), false),
+ arrow::field("observed_count", arrow::uint64(), false),
+ arrow::field("expected_count", arrow::float64(), false),
+ arrow::field("log_ratio", arrow::float64(), false),
+ arrow::field("odds_ratio", arrow::float64(), false),
+ arrow::field("omega", arrow::float64(), false)
+ // clang-format on
+ });
+}
+// Maps "zstd" / "lz4" to the matching parquet codec; any other name throws std::runtime_error.
+parquet::Compression::type parse_parquet_compression(std::string_view method) {
+ if (method == "zstd") {
+ return parquet::Compression::ZSTD;
+ }
+ if (method == "lz4") {
+ return parquet::Compression::LZ4;
+ }
+ throw std::runtime_error(fmt::format("unrecognized compression method \"{}\"", method));
+}
+
+} // namespace nchg
diff --git a/src/nchg/common/io.cpp b/src/io/parquet_stats_file.cpp
similarity index 51%
rename from src/nchg/common/io.cpp
rename to src/io/parquet_stats_file.cpp
index 012ed75..c3fd09b 100644
--- a/src/nchg/common/io.cpp
+++ b/src/io/parquet_stats_file.cpp
@@ -16,32 +16,24 @@
// with this library. If not, see
// .
-// clang-format off
-#include "nchg/nchg.hpp"
-
+#include "nchg/parquet_stats_file.hpp"
+// clang-format off
#include "nchg/suppress_warnings.hpp"
NCHG_DISABLE_WARNING_PUSH
NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
#include
#include
-#include
#include
#include
#include
#include
-#include
-#include
#include
#include
-#include
NCHG_DISABLE_WARNING_POP
// clang-format on
-#include "nchg/tools/io.hpp"
-
#include
-#include
#include
#include
@@ -50,39 +42,18 @@ NCHG_DISABLE_WARNING_POP
#include
#include
#include
-#include
-#include
#include
#include
-#include
#include
-#include
#include
#include
#include
#include
#include
-namespace nchg {
-
-static std::shared_ptr make_chrom_dict(const hictk::Reference &chroms) {
- arrow::StringBuilder builder{};
- for (const auto &chrom : chroms) {
- if (!chrom.is_all()) {
- const auto status = builder.Append(std::string{chrom.name()});
- if (!status.ok()) {
- throw std::runtime_error(status.ToString());
- }
- }
- }
-
- auto result = builder.Finish();
- if (!result.status().ok()) {
- throw std::runtime_error(result.status().ToString());
- }
+#include "nchg/common.hpp"
- return result.MoveValueUnsafe();
-}
+namespace nchg {
[[nodiscard]] static std::shared_ptr open_parquet_file(
const std::filesystem::path &path) {
@@ -270,217 +241,4 @@ std::shared_ptr ParquetStatsFile::chromosomes() const no
return _chroms;
}
-RecordBatchBuilder::RecordBatchBuilder(hictk::Reference chroms) : _chroms(std::move(chroms)) {
- const auto dict = make_chrom_dict(_chroms);
- auto status = _chrom1.InsertMemoValues(*dict);
- if (!status.ok()) {
- throw std::runtime_error(status.ToString());
- }
-
- status = _chrom2.InsertMemoValues(*dict);
- if (!status.ok()) {
- throw std::runtime_error(status.ToString());
- }
-}
-
-std::size_t RecordBatchBuilder::size() const noexcept { return _i; }
-std::size_t RecordBatchBuilder::capacity() const noexcept {
- return static_cast(_chrom1.capacity());
-}
-
-void RecordBatchBuilder::reset() {
- _chrom1.Reset();
- _start1.Reset();
- _end1.Reset();
-
- _chrom2.Reset();
- _start2.Reset();
- _end2.Reset();
-
- _pvalue.Reset();
- _pvalue_corrected.Reset();
- _observed.Reset();
- _expected.Reset();
- _log_ratio.Reset();
- _odds.Reset();
- _omega.Reset();
-
- _i = 0;
-}
-
-std::shared_ptr RecordBatchBuilder::get() {
- std::vector> columns{};
- columns.reserve(13);
-
- columns.emplace_back(finish(_chrom1));
- columns.emplace_back(finish(_start1));
- columns.emplace_back(finish(_end1));
-
- columns.emplace_back(finish(_chrom2));
- columns.emplace_back(finish(_start2));
- columns.emplace_back(finish(_end2));
-
- columns.emplace_back(finish(_pvalue));
-
- if (_pvalue_corrected.length() != 0) {
- columns.emplace_back(finish(_pvalue_corrected));
- }
-
- columns.emplace_back(finish(_observed));
- columns.emplace_back(finish(_expected));
-
- columns.emplace_back(finish(_log_ratio));
- columns.emplace_back(finish(_odds));
- columns.emplace_back(finish(_omega));
-
- if (columns.size() == 13) {
- return arrow::RecordBatch::Make(internal::get_schema_padj(_chroms),
- static_cast(size()), columns);
- }
- return arrow::RecordBatch::Make(internal::get_schema(_chroms), static_cast(size()),
- columns);
-}
-
-void RecordBatchBuilder::write(parquet::arrow::FileWriter &writer) {
- const auto batch = get();
- const auto status = writer.WriteRecordBatch(*batch);
- if (!status.ok()) {
- throw std::runtime_error(status.ToString());
- }
- reset();
-}
-
-phmap::flat_hash_map> parse_bin_mask(
- const hictk::Reference &chroms, std::uint32_t bin_size, const std::filesystem::path &path) {
- if (path.empty()) {
- return {};
- }
-
- SPDLOG_INFO("reading the user-provided bin mask from {}...", path);
- phmap::flat_hash_map> mask{};
- std::string buffer{};
-
- std::ifstream fs{};
- fs.exceptions(fs.exceptions() | std::ios::badbit | std::ios::failbit);
-
- std::size_t i = 1;
- try {
- fs.open(path);
-
- for (; std::getline(fs, buffer); ++i) {
- if (buffer.empty()) {
- continue;
- }
-
- if (buffer.back() == '\r') {
- buffer.resize(buffer.size() - 1);
- }
-
- try {
- const auto record = truncate_record<3>(buffer);
- auto domain = hictk::GenomicInterval::parse_bed(chroms, record);
-
- const auto num_bins = (domain.chrom().size() + bin_size - 1) / bin_size;
- auto match = mask.try_emplace(domain.chrom(), std::vector(num_bins, false));
-
- const std::size_t j0 = domain.start() / bin_size;
- const std::size_t j1 = (domain.end() / bin_size) + 1;
-
- for (std::size_t j = j0; j < j1; ++j) {
- match.first->second[j] = true;
- }
-
- } catch (const std::exception &e) {
- throw std::runtime_error(
- fmt::format("found an invalid record at line {} of file {}: {}", i, path, e.what()));
- }
- }
-
- } catch (const std::exception &) {
- if (!fs.eof()) {
- throw;
- }
- }
-
- std::size_t num_bad_bins = 0;
- for (const auto &[_, v] : mask) {
- num_bad_bins += std::accumulate(v.begin(), v.end(), 0uz);
- }
-
- SPDLOG_INFO("masked {} bad bins based on {} domains read from {}...", num_bad_bins, i - 1, path);
- return mask;
-}
-
-namespace internal {
-static std::shared_ptr generate_schema_metadata(
- const hictk::Reference &chroms) {
- std::vector keys{};
- std::vector values{};
- for (const auto &chrom : chroms) {
- if (!chrom.is_all()) {
- keys.emplace_back(chrom.name());
- values.emplace_back(fmt::to_string(chrom.size()));
- }
- }
-
- return std::make_shared(std::move(keys), std::move(values));
-}
-
-std::shared_ptr get_schema(const hictk::Reference &chroms) {
- const auto chrom_dtype = arrow::dictionary(arrow::int32(), arrow::utf8());
- const auto metadata = generate_schema_metadata(chroms);
-
- return arrow::schema({
- // clang-format off
- arrow::field("chrom1", chrom_dtype, false, metadata),
- arrow::field("start1", arrow::uint32(), false),
- arrow::field("end1", arrow::uint32(), false),
- arrow::field("chrom2", chrom_dtype, false, metadata),
- arrow::field("start2", arrow::uint32(), false),
- arrow::field("end2", arrow::uint32(), false),
- arrow::field("pvalue", arrow::float64(), false),
- arrow::field("observed_count", arrow::uint64(), false),
- arrow::field("expected_count", arrow::float64(), false),
- arrow::field("log_ratio", arrow::float64(), false),
- arrow::field("odds_ratio", arrow::float64(), false),
- arrow::field("omega", arrow::float64(), false)
- // clang-format on
- });
-}
-
-std::shared_ptr get_schema_padj(const hictk::Reference &chroms) {
- const auto chrom_dtype = arrow::dictionary(arrow::int32(), arrow::utf8());
- const auto metadata = generate_schema_metadata(chroms);
-
- return arrow::schema({
- // clang-format off
- arrow::field("chrom1", chrom_dtype, false, metadata),
- arrow::field("start1", arrow::uint32(), false),
- arrow::field("end1", arrow::uint32(), false),
- arrow::field("chrom2", chrom_dtype, false, metadata),
- arrow::field("start2", arrow::uint32(), false),
- arrow::field("end2", arrow::uint32(), false),
- arrow::field("pvalue", arrow::float64(), false),
- arrow::field("pvalue_corrected", arrow::float64(), false),
- arrow::field("observed_count", arrow::uint64(), false),
- arrow::field("expected_count", arrow::float64(), false),
- arrow::field("log_ratio", arrow::float64(), false),
- arrow::field("odds_ratio", arrow::float64(), false),
- arrow::field("omega", arrow::float64(), false)
- // clang-format on
- });
-}
-
-parquet::Compression::type parse_parquet_compression(std::string_view method) {
- if (method == "zstd") {
- return parquet::Compression::ZSTD;
- }
- if (method == "lz4") {
- return parquet::Compression::LZ4;
- }
- throw std::runtime_error(fmt::format("unrecognized compression method \"{}\"", method));
-}
-
-} // namespace internal
-
} // namespace nchg
diff --git a/src/io/parquet_stats_file_impl.hpp b/src/io/parquet_stats_file_impl.hpp
new file mode 100644
index 0000000..c777006
--- /dev/null
+++ b/src/io/parquet_stats_file_impl.hpp
@@ -0,0 +1,147 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#pragma once
+
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+#include
+#include
+#include
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "nchg/parquet_helpers.hpp"
+
+namespace nchg {
+// begin(): iterator built with init_value = true, so its constructor may load a record right away.
+template
+inline auto ParquetStatsFile::begin() -> iterator {
+ return {_chroms, _sr, true};
+}
+template
+inline auto ParquetStatsFile::end() -> iterator {
+ return iterator::at_end(_chroms, _sr);  // end(): iterator whose offset equals the reader's num_rows()
+}
+// Stores the handles; if init_value is set and rows remain, loads one record or turns into end on EOF.
+template
+inline ParquetStatsFile::iterator::iterator(std::shared_ptr chroms,
+ std::shared_ptr sr,
+ bool init_value)
+ : _chroms(std::move(chroms)), _sr(std::move(sr)), _buffer(std::make_shared()) {
+ if (init_value && _sr->current_row() != _sr->num_rows()) {
+ if (_sr->eof()) {
+ *this = at_end(_chroms, _sr);
+ } else {
+ read_pixel();
+ }
+ }
+}
+// Builds an iterator without reading any record and sets its offset to the reader's row count.
+template
+inline auto ParquetStatsFile::iterator::at_end(
+ std::shared_ptr chroms, std::shared_ptr sr)
+ -> iterator {
+ iterator it{std::move(chroms), std::move(sr), false};
+ it._offset = it._sr->num_rows();
+
+ return it;
+}
+// Equal when both iterators hold the same reader pointer and the same offset.
+template
+inline bool ParquetStatsFile::iterator::operator==(const iterator &other) const noexcept {
+ return _sr == other._sr && _offset == other._offset;
+}
+// Negation of operator==.
+template
+inline bool ParquetStatsFile::iterator::operator!=(const iterator &other) const noexcept {
+ return !(*this == other);
+}
+// Returns the cached record (_value); nothing is read from the file here.
+template
+inline auto ParquetStatsFile::iterator::operator*() const noexcept -> const_reference {
+ return _value;
+}
+// Returns the address of the cached record (_value).
+template
+inline auto ParquetStatsFile::iterator::operator->() const noexcept -> const_pointer {
+ return &_value;
+}
+// Advances: becomes the end iterator once the reader reports EOF, otherwise loads the next record.
+template
+inline auto ParquetStatsFile::iterator::operator++() -> iterator & {
+ if (_sr->eof()) [[unlikely]] {
+ *this = at_end(_chroms, _sr);
+ return *this;
+ }
+
+ read_pixel();
+ return *this;
+}
+// Streams one row from the reader, column by column, into _value; asserts that the reader is not at EOF.
+template
+inline void ParquetStatsFile::iterator::read_pixel() {
+ assert(!_sr->eof());
+ std::uint32_t start1{};
+ std::uint32_t end1{};
+ std::uint32_t start2{};
+ std::uint32_t end2{};
+ std::uint64_t observed_count{};
+ // When _chroms is null, a stand-in Chromosome{0, name, 1} is built from the name just read.
+ *_sr >> *_buffer;
+ const auto chrom1 = !!_chroms ? _chroms->at(*_buffer) : hictk::Chromosome{0, *_buffer, 1};
+ *_sr >> start1;
+ *_sr >> end1;
+
+ *_sr >> *_buffer;
+ const auto chrom2 = !!_chroms ? _chroms->at(*_buffer) : hictk::Chromosome{0, *_buffer, 1};
+ *_sr >> start2;
+ *_sr >> end2;
+ // pval_corrected is only read when has_pval_corrected() holds at compile time.
+ *_sr >> _value.pval;
+ if constexpr (has_pval_corrected()) {
+ *_sr >> _value.pval_corrected;
+ }
+ *_sr >> observed_count;
+ *_sr >> _value.expected;
+ *_sr >> _value.log_ratio;
+
+ *_sr >> _value.odds_ratio;
+ *_sr >> _value.omega;
+ *_sr >> parquet::EndRow;
+
+ _value.pixel = hictk::Pixel{chrom1, start1, end1, chrom2, start2, end2, observed_count};
+}
+
+} // namespace nchg
diff --git a/src/io/record_batch_builder.cpp b/src/io/record_batch_builder.cpp
new file mode 100644
index 0000000..b9961b3
--- /dev/null
+++ b/src/io/record_batch_builder.cpp
@@ -0,0 +1,166 @@
+// Copyright (C) 2024 Roberto Rossini
+//
+// SPDX-License-Identifier: GPL-3.0
+//
+// This library is free software: you can redistribute it and/or
+// modify it under the terms of the GNU Public License as published
+// by the Free Software Foundation; either version 3 of the License,
+// or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU Public License along
+// with this library. If not, see
+// <https://www.gnu.org/licenses/>.
+
+#include "nchg/record_batch_builder.hpp"
+
+// clang-format off
+#include "nchg/suppress_warnings.hpp"
+NCHG_DISABLE_WARNING_PUSH
+NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+NCHG_DISABLE_WARNING_POP
+// clang-format on
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "nchg/common.hpp"
+#include "nchg/parquet_stats_file.hpp"
+
+namespace nchg {
+// Builds an Arrow string array of chromosome names (is_all() entries skipped); throws on Arrow errors.
+static std::shared_ptr make_chrom_dict(const hictk::Reference &chroms) {
+ arrow::StringBuilder builder{};
+ for (const auto &chrom : chroms) {
+ if (!chrom.is_all()) {
+ const auto status = builder.Append(std::string{chrom.name()});
+ if (!status.ok()) {
+ throw std::runtime_error(status.ToString());
+ }
+ }
+ }
+
+ auto result = builder.Finish();
+ if (!result.status().ok()) {
+ throw std::runtime_error(result.status().ToString());
+ }
+
+ return result.MoveValueUnsafe();
+}
+// Seeds both chromosome dictionary builders with the same name dictionary; throws if Arrow reports an error.
+RecordBatchBuilder::RecordBatchBuilder(hictk::Reference chroms) : _chroms(std::move(chroms)) {
+ const auto dict = make_chrom_dict(_chroms);
+ auto status = _chrom1.InsertMemoValues(*dict);
+ if (!status.ok()) {
+ throw std::runtime_error(status.ToString());
+ }
+
+ status = _chrom2.InsertMemoValues(*dict);
+ if (!status.ok()) {
+ throw std::runtime_error(status.ToString());
+ }
+}
+// size(): current value of the row counter _i; capacity(): capacity reported by the _chrom1 builder.
+std::size_t RecordBatchBuilder::size() const noexcept { return _i; }
+std::size_t RecordBatchBuilder::capacity() const noexcept {
+ return static_cast(_chrom1.capacity());
+}
+// Calls Reset() on every column builder and zeroes the row counter.
+void RecordBatchBuilder::reset() {
+ _chrom1.Reset();
+ _start1.Reset();
+ _end1.Reset();
+
+ _chrom2.Reset();
+ _start2.Reset();
+ _end2.Reset();
+
+ _pvalue.Reset();
+ _pvalue_corrected.Reset();
+ _observed.Reset();
+ _expected.Reset();
+ _log_ratio.Reset();
+ _odds.Reset();
+ _omega.Reset();
+
+ _i = 0;
+}
+// Finishes the builders into columns; 13 columns (pvalue_corrected present) select the padj schema.
+std::shared_ptr RecordBatchBuilder::get() {
+ std::vector> columns{};
+ columns.reserve(13);
+
+ columns.emplace_back(finish(_chrom1));
+ columns.emplace_back(finish(_start1));
+ columns.emplace_back(finish(_end1));
+
+ columns.emplace_back(finish(_chrom2));
+ columns.emplace_back(finish(_start2));
+ columns.emplace_back(finish(_end2));
+
+ columns.emplace_back(finish(_pvalue));
+ // The corrected p-value column is emitted only when its builder holds at least one value.
+ if (_pvalue_corrected.length() != 0) {
+ columns.emplace_back(finish(_pvalue_corrected));
+ }
+
+ columns.emplace_back(finish(_observed));
+ columns.emplace_back(finish(_expected));
+
+ columns.emplace_back(finish(_log_ratio));
+ columns.emplace_back(finish(_odds));
+ columns.emplace_back(finish(_omega));
+
+ if (columns.size() == 13) {
+ return arrow::RecordBatch::Make(get_schema_padj(_chroms), static_cast(size()),
+ columns);
+ }
+ return arrow::RecordBatch::Make(get_schema(_chroms), static_cast(size()), columns);
+}
+// Writes the batch returned by get() through writer, throws std::runtime_error on failure, then resets.
+void RecordBatchBuilder::write(parquet::arrow::FileWriter &writer) {
+ const auto batch = get();
+ const auto status = writer.WriteRecordBatch(*batch);
+ if (!status.ok()) {
+ throw std::runtime_error(status.ToString());
+ }
+ reset();
+}
+
+} // namespace nchg
diff --git a/src/nchg/common/io_impl.hpp b/src/io/record_batch_builder_impl.hpp
similarity index 56%
rename from src/nchg/common/io_impl.hpp
rename to src/io/record_batch_builder_impl.hpp
index ca36f20..5ba960d 100644
--- a/src/nchg/common/io_impl.hpp
+++ b/src/io/record_batch_builder_impl.hpp
@@ -30,6 +30,8 @@ NCHG_DISABLE_WARNING_DEPRECATED_DECLARATIONS
NCHG_DISABLE_WARNING_POP
// clang-format on
+#include
+
#include
#include
#include
@@ -42,106 +44,9 @@ NCHG_DISABLE_WARNING_POP
#include