Skip to content

Commit

Permalink
Support fetching subsets of bin tables
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Aug 21, 2024
1 parent e2298c4 commit 864f3da
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 14 deletions.
2 changes: 1 addition & 1 deletion benchmarks/bin_table_merge_bin_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@

import numpy as np
import numpy.typing as npt
import pandas as pd

import hictkpy
import pandas as pd


def make_cli() -> argparse.ArgumentParser:
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bin_table_merge_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@

import numpy as np
import numpy.typing as npt
import pandas as pd

import hictkpy
import pandas as pd


def make_cli() -> argparse.ArgumentParser:
Expand Down
9 changes: 4 additions & 5 deletions src/hictkpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,15 @@ static void declare_pixel_class(nb::module_ &m, const std::string &suffix) {
}

static void declare_bin_table_class(nb::module_ &m) {
auto bt =
nb::class_<BinTable>(m, "BinTable", "Class representing a table of genomic bins.");
auto bt = nb::class_<BinTable>(m, "BinTable", "Class representing a table of genomic bins.");

bt.def(nb::init<nb::dict, std::uint32_t>(), nb::arg("chroms"), nb::arg("resolution"),
"Construct a table of bins given a dictionary mapping chromosomes to their sizes and a "
"resolution");

bt.def("__repr__", &BinTable::repr);

bt.def("chromosomes", &get_chromosomes_from_object<BinTable>,
nb::arg("include_all") = false,
bt.def("chromosomes", &get_chromosomes_from_object<BinTable>, nb::arg("include_all") = false,
"Get chromosomes sizes as a dictionary mapping names to sizes.");

bt.def("bin_size", &BinTable::resolution,
Expand Down Expand Up @@ -130,7 +128,8 @@ static void declare_bin_table_class(nb::module_ &m) {
"Bin identifiers should be provided as a pandas DataFrame with columns \"bin1_id\" and "
"\"bin2_id\"");

bt.def("to_df", &BinTable::to_df, "Convert the bin table to a pandas DataFrame");
bt.def("to_df", &BinTable::to_df, nb::arg("range") = "", nb::arg("query_type") = "UCSC",
"Convert the bin table to a pandas DataFrame");
}

static void declare_pixel_selector_class(nb::module_ &m) {
Expand Down
32 changes: 26 additions & 6 deletions src/hictkpy_bin_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,24 +201,44 @@ nb::iterator BinTable::make_iterable() const {
_bins.get());
}

nb::object BinTable::to_df() const {
// Return the number of bins of `bins` selected by `query`.
//
// When `query` evaluates to false, the size of the whole table is returned.
// Otherwise the concrete table obtained through `bins.get()` is visited and the
// result is the length of the iterator range returned by find_overlap(query).
static auto compute_num_bins(const hictk::BinTable& bins, const hictk::GenomicInterval& query) {
  // Generic visitor: measures the distance between the two iterators delimiting the overlap.
  const auto count_overlapping = [&query](const auto& table) {
    const auto [overlap_begin, overlap_end] = table.find_overlap(query);
    return std::distance(overlap_begin, overlap_end);
  };

  return !query ? bins.size()
                : static_cast<std::size_t>(std::visit(count_overlapping, bins.get()));
}

nb::object BinTable::to_df(std::string_view range, std::string_view query_type) const {
auto pd = nb::module_::import_("pandas");

using Buffer64T = nb::ndarray<nb::numpy, nb::shape<nb::any>, std::int64_t>;
const auto qt =
query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED;
const auto query =
range.empty() ? hictk::GenomicInterval{}
: hictk::GenomicInterval::parse(_bins.chromosomes(), std::string{range}, qt);

const auto n = _bins.size();
const auto n = compute_num_bins(_bins, query);

Dynamic1DA<std::int64_t> bin_ids(n);
Dynamic1DA<std::uint32_t> chrom_ids(n);
Dynamic1DA<std::int32_t> starts(n);
Dynamic1DA<std::int32_t> ends(n);

std::visit(
[&](const auto& bins) {
for (const auto& bin : bins) {
const auto [first_bin, last_bin] =
range.empty() ? std::make_pair(bins.begin(), bins.end()) : bins.find_overlap(query);
std::for_each(first_bin, last_bin, [&](const auto& bin) {
bin_ids.push_back(static_cast<std::int64_t>(bin.id()));
chrom_ids.push_back(bin.chrom().id());
starts.push_back(static_cast<std::int32_t>(bin.start()));
ends.push_back(static_cast<std::int32_t>(bin.end()));
}
});
},
_bins.get());

Expand All @@ -230,7 +250,7 @@ nb::object BinTable::to_df() const {
py_bins_dict["start"] = starts();
py_bins_dict["end"] = ends();

auto df = pd.attr("DataFrame")(py_bins_dict, "copy"_a = false);
auto df = pd.attr("DataFrame")(py_bins_dict, "index"_a = bin_ids(), "copy"_a = false);
return df;
}

Expand Down
2 changes: 1 addition & 1 deletion src/include/hictkpy/bin_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class BinTable {

[[nodiscard]] nanobind::iterator make_iterable() const;

[[nodiscard]] nanobind::object to_df() const;
[[nodiscard]] nanobind::object to_df(std::string_view range, std::string_view query_type) const;
};

} // namespace hictkpy
5 changes: 5 additions & 0 deletions test/test_bin_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,8 @@ def test_to_df(self):
bins = hictkpy.BinTable(chroms, 100)

assert len(bins.to_df()) == len(bins)
assert len(bins.to_df("chr1")) == 10
assert len(bins.to_df("chr2:0-200")) == 2
assert len(bins.to_df("chr2\t0\t200", "BED")) == 2
with pytest.raises(RuntimeError):
bins.to_df("chr0")

0 comments on commit 864f3da

Please sign in to comment.