Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Hail-like AC, AN, and AF computation to variant stats V3 #731

Merged
merged 33 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
40ffd12
read more fields from variant stats
awenocur May 18, 2024
67ea052
make use of V3 additions in variant stats reader, cache non-refs
awenocur May 30, 2024
6a832fd
make variant stats reader track entering/exiting ref blocks
awenocur Jun 6, 2024
751ba99
fix sorting logic for ref blocks
awenocur Jun 6, 2024
da6b087
update IAF with ref blocks for variant stats V3
awenocur Jun 11, 2024
850390d
fix AF computation for variant stats V3
awenocur Jun 12, 2024
417833d
fix data frame cardinality for variant stats reads
awenocur Jun 12, 2024
4f45ef5
add TILEDB_IAC, TILEDB_IAN to data query
awenocur Jun 19, 2024
a3d1479
update variant stats reader for v3, fix variant stats timing
awenocur Jun 20, 2024
5104ef5
fix progressive VCF stats computation, update tests
awenocur Jun 20, 2024
a3110cf
fix variant stats v3 writer: correct ordering and END
awenocur Jun 20, 2024
d7f263a
read in max_length when reading variant stats
awenocur Jun 20, 2024
7722df8
Refactor inclusion/exclusion of positions, add non-ref extension
awenocur Jun 20, 2024
f981273
info_TILEDB_IAN is pared down to a single encapsulated element
awenocur Jun 20, 2024
56966a4
fix type mismatch
awenocur Jun 20, 2024
31a6869
make tiledb_IAF_IAN a scalar value
awenocur Jun 20, 2024
6581dbc
minor fix to comparison
awenocur Jun 20, 2024
ea67e86
isolate variant stats v3 feature from v2
awenocur Jun 20, 2024
1dcd762
refactored stats v3 writer in-place sorting to include samples
awenocur Jun 20, 2024
a547d33
simplify in-place sort loop in variant stats writer
awenocur Jun 21, 2024
0d58261
clean up extra string comparison
awenocur Jun 22, 2024
8a9f8ed
move AF computation to method file
awenocur Jun 22, 2024
3100992
variant stats writer sorting fix
awenocur Jun 22, 2024
f73632e
fix IAC/IAF computation error for variant stats v3
awenocur Jul 19, 2024
94bcb60
refactor
awenocur Jul 19, 2024
c256a0c
fix AF computations
awenocur Jul 19, 2024
6e6c22a
fix test
awenocur Jul 20, 2024
81eda28
restructure writer for normalizing alleles
awenocur Jul 31, 2024
4021654
add normalization for variant stats v3
awenocur Jul 31, 2024
c7be7cf
change normalization of variant stats V3 to pairwise
awenocur Aug 1, 2024
568f0d5
reimplement variant stats V3 pairwise normalization, including ref
awenocur Aug 1, 2024
b2fd97a
address nits and add documentation
awenocur Aug 1, 2024
ba49c2d
non-functional assignment change for variant stats end
awenocur Aug 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions apis/python/tests/test_tiledbvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,7 +1241,7 @@ def test_ingest_with_stats_v3(tmp_path):
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
set_af_filter="<0.2",
)
assert data_frame.shape == (3, 8)
assert data_frame.shape == (1, 8)
assert data_frame.query("sample_name == 'second'")["qual"].iloc[0] == pytest.approx(
343.73
)
Expand All @@ -1262,17 +1262,16 @@ def test_ingest_with_stats_v3(tmp_path):
)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
assert df.shape == (13, 5)
df = tiledbvcf.allele_frequency.read_allele_frequency(
os.path.join(tmp_path, "stats_test"), "chr1:1-10000"
)
assert df.pos.is_monotonic_increasing
df["an_check"] = (df.ac / df.af).round(0).astype("int32")
assert df.an_check.equals(df.an)
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
assert df.shape == (13, 5)
df = df.to_pandas()
assert sum(sum((df[df["alleles"] == "nr"] == (9, "nr", 4, 4, 1.0)).values)) == 5
df = ds.read_allele_count("chr1:1-10000")
assert df.shape == (7, 6)
df = df.to_pandas()
Expand Down Expand Up @@ -1343,15 +1342,15 @@ def test_ingest_with_stats_v2(tmp_path):
)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
assert df.shape == (13, 5)
df = tiledbvcf.allele_frequency.read_allele_frequency(
os.path.join(tmp_path, "stats_test"), "chr1:1-10000"
)
assert df.pos.is_monotonic_increasing
df["an_check"] = (df.ac / df.af).round(0).astype("int32")
assert df.an_check.equals(df.an)
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
assert df.shape == (13, 5)
df = df.to_pandas()
df = ds.read_allele_count("chr1:1-10000")
assert df.shape == (7, 6)
Expand Down
37 changes: 30 additions & 7 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -764,14 +764,35 @@ void TileDBVCFDataset::load_field_type_maps_v4(
}

if (add_iaf) {
int header_appended = bcf_hdr_append(
// TODO: do something better than promoting this pointer type;
// perhaps the header should be duplicated and later modified
const_cast<bcf_hdr_t*>(hdr),
"##INFO=<ID=TILEDB_IAF,Number=R,Type=Float,Description=\"Internal "
"Allele Frequency, computed over dataset by TileDB\">");
if (header_appended < 0) {
throw std::runtime_error(
"Error appending to header for internal allele frequency.");
}
if (bcf_hdr_append(
// TODO: do something better than promoting this pointer type;
// perhaps the header should be duplicated and later modified
const_cast<bcf_hdr_t*>(hdr),
"##INFO=<ID=TILEDB_IAF,Number=R,Type=Float,Description=\"Internal "
"Allele Frequency, computed over dataset by TileDB\">") < 0) {
"##INFO=<ID=TILEDB_IAC,Number=R,Type=Integer,Description="
"\"Internal "
awenocur marked this conversation as resolved.
Show resolved Hide resolved
"Allele Count, computed over dataset by TileDB\">") < 0) {
throw std::runtime_error(
"Error appending to header for internal allele frequency.");
"Error appending to header for internal allele count.");
}
if (bcf_hdr_append(
awenocur marked this conversation as resolved.
Show resolved Hide resolved
// TODO: do something better than promoting this pointer type;
// perhaps the header should be duplicated and later modified
const_cast<bcf_hdr_t*>(hdr),
"##INFO=<ID=TILEDB_IAN,Number=R,Type=Integer,Description="
"\"Internal "
"Allele Number, computed over dataset by TileDB\">") < 0) {
throw std::runtime_error(
"Error appending to header for internal allele number.");
}
info_iaf_field_type_added_ = true;
if (bcf_hdr_sync(const_cast<bcf_hdr_t*>(hdr)) < 0) {
Expand Down Expand Up @@ -949,8 +970,8 @@ void TileDBVCFDataset::delete_samples(
throw std::runtime_error("Sample not found in dataset: " + sample);
}

// If a stats array exists, read the data with the delete exporter, which
// adds negative counts to the stats arrays
// If a stats array exists, read the data with the delete exporter,
// which adds negative counts to the stats arrays
if (stats_array_exists) {
ExportParams args;
args.tiledb_config = tiledb_config;
Expand Down Expand Up @@ -1132,7 +1153,8 @@ std::unordered_map<uint32_t, SafeBCFHdr> TileDBVCFDataset::fetch_vcf_headers_v4(
// first sample.
if (first_sample && num_rows == 0) {
LOG_DEBUG(
"[fetch_vcf_headers_v4] the first sample was deleted, resubmitting");
"[fetch_vcf_headers_v4] the first sample was deleted, "
"resubmitting");
mq = std::make_unique<ManagedQuery>(
vcf_header_array_, "fetch_vcf_headers_v4");
mq->select_columns({"sample", "header"});
Expand Down Expand Up @@ -2002,7 +2024,8 @@ const char* TileDBVCFDataset::materialized_attribute_name(

bool TileDBVCFDataset::is_attribute_materialized(
const std::string& attr) const {
if (attr == "info_TILEDB_IAF") {
if (attr == "info_TILEDB_IAF" || attr == "info_TILEDB_IAC" ||
attr == "info_TILEDB_IAN") {
return true;
}

Expand Down
34 changes: 29 additions & 5 deletions libtiledbvcf/src/read/in_memory_exporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
*/

#include "read/in_memory_exporter.h"
#include <stdexcept>
#include "enums/attr_datatype.h"

namespace tiledb {
Expand Down Expand Up @@ -594,7 +595,8 @@ bool InMemoryExporter::fixed_len_attr(const std::string& attr) {
"fmt_PS",
"fmt_PQ",
"fmt_MQ",
"fmt_MIN_DP"};
"fmt_MIN_DP",
"info_TILEDB_IAN"};
return fixed_len.count(attr) > 0;
}

Expand Down Expand Up @@ -975,13 +977,35 @@ bool InMemoryExporter::copy_info_fmt_value(
const std::string& field_name = dest->info_fmt_field_name;
const bool is_gt = field_name == "GT";
const bool is_iaf = field_name == "TILEDB_IAF";
const bool is_iac = field_name == "TILEDB_IAC";
const bool is_ian = field_name == "TILEDB_IAN";

const void* src = nullptr;
uint64_t nbytes = 0, nelts = 0;
if (is_iaf && !curr_query_results_->af_values.empty()) {
auto& af_values = curr_query_results_->af_values;
auto& ac_values = curr_query_results_->ac_values;
uint32_t an_value = curr_query_results_->an_value;
if ((is_iaf || is_iac || is_ian) && !af_values.empty()) {
if (af_values.size() != ac_values.size()) {
throw std::runtime_error(
"inconsistent cardinality of IAC/IAN/IAF vectors in query results");
}
// assign source pointer, nbytes, and nelts from vector
src = curr_query_results_->af_values.data();
nelts = curr_query_results_->af_values.size();
nbytes = nelts * sizeof(decltype(curr_query_results_->af_values.at(0)));
if (is_iaf) {
src = af_values.data();
nelts = af_values.size();
nbytes = nelts * sizeof(decltype(af_values.at(0)));
} else {
if (is_iac) {
src = ac_values.data();
nelts = ac_values.size();
nbytes = nelts * sizeof(decltype(ac_values.at(0)));
} else {
src = &an_value;
nelts = 1;
nbytes = nelts * sizeof(decltype(an_value));
}
}
} else {
get_info_fmt_value(dest, cell_idx, &src, &nbytes, &nelts);
}
Expand Down
2 changes: 2 additions & 0 deletions libtiledbvcf/src/read/read_query_results.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ struct ReadQueryResults {

/** TileDB Internal Allele Frequency values*/
std::vector<float> af_values;
std::vector<uint32_t> ac_values;
uint32_t an_value;

private:
/** Pointer to buffer set holding the actual data. */
Expand Down
14 changes: 12 additions & 2 deletions libtiledbvcf/src/read/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ void Reader::init_af_filter() {
Group group(*ctx_, dataset_->root_uri(), TILEDB_READ);
af_filter_ = std::make_unique<VariantStatsReader>(ctx_, group);
af_filter_->set_condition(params_.af_filter);
// TODO: enable sorting only if needed for GVCF IAF feature
gspowley marked this conversation as resolved.
Show resolved Hide resolved
if (af_filter_->array_version() > 2) {
params_.sort_real_start_pos = true;
}
}
}

Expand Down Expand Up @@ -948,7 +952,7 @@ void Reader::prepare_variant_stats() {
"Error preparing variant stats: there should be exactly one region "
"specified");
}
af_filter_->add_region(params_.regions[0]);
af_filter_->add_region(params_.regions[0], true);
af_filter_->compute_af();
}

Expand Down Expand Up @@ -1372,10 +1376,13 @@ bool Reader::process_query_results_v4() {
}

read_state_.query_results.af_values.clear();
read_state_.query_results.ac_values.clear();
read_state_.query_results.an_value = 0;
int allele_index = 0;
bool is_ref = true;
uint32_t an = 0;
for (auto&& allele : alleles) {
auto [allele_passes, af] = af_filter_->pass(
auto [allele_passes, af, ac, allele_an] = af_filter_->pass(
real_start,
is_ref ? "ref" : alleles[0] + "," + allele,
params_.scan_all_samples,
Expand All @@ -1399,10 +1406,13 @@ bool Reader::process_query_results_v4() {
// - build vector of AFs matching the order of the VCF record
// - all allele AF values are required, so do not exit this loop early
read_state_.query_results.af_values.push_back(af);
read_state_.query_results.ac_values.push_back(ac);
an = allele_an;

LOG_TRACE(" pass = {}", pass);
is_ref = false;
}
read_state_.query_results.an_value = an;

// If all alleles do not pass the af filter, continue
if (!pass) {
Expand Down
Loading
Loading