Skip to content

Commit

Permalink
reimplement variant stats V3 pairwise normalization, including ref
Browse files Browse the repository at this point in the history
This reverts commit 81eda28.
  • Loading branch information
awenocur committed Aug 1, 2024
1 parent c7be7cf commit 3ef786b
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 87 deletions.
10 changes: 6 additions & 4 deletions libtiledbvcf/src/read/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1382,13 +1382,15 @@ bool Reader::process_query_results_v4() {
int allele_index = 0;
bool is_ref = true;
uint32_t an = 0;
if (af_filter_->array_version() > 2) {
normalize(alleles);
}
for (auto&& allele : alleles) {
std::string normalized_ref = alleles[0];
std::string normalized_allele = allele;
if (af_filter_->array_version() > 2) {
normalize(normalized_ref, normalized_allele);
}
auto [allele_passes, af, ac, allele_an] = af_filter_->pass(
real_start,
is_ref ? "ref" : alleles[0] + "," + allele,
is_ref ? "ref" : normalized_ref + "," + normalized_allele,
params_.scan_all_samples,
num_samples);

Expand Down
66 changes: 31 additions & 35 deletions libtiledbvcf/src/stats/variant_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -587,13 +587,11 @@ inline void VariantStats::process_v3(

// Determine homozygosity, generalized over n genotypes:
bool homozygous = true;
std::vector<std::pair<std::string, bool>> normalized_alleles;
{
int first_gt = 0;
bool first_gt_found = false;
for (int i = 0; i < ngt; i++) {
if (!gt_missing[i]) {
normalized_alleles.push_back({rec->d.allele[gt[i]], gt[i] == 0});
if (first_gt_found) {
homozygous = homozygous && gt[i] == first_gt;
} else {
Expand All @@ -604,10 +602,6 @@ inline void VariantStats::process_v3(
homozygous = false;
}
}
normalized_alleles.push_back({ref, true});
normalize(normalized_alleles);
normalized_alleles.pop_back();
// done
}

bool is_nr_block;
Expand All @@ -627,35 +621,38 @@ inline void VariantStats::process_v3(
}

bool already_added_homozygous = false;
std::string ref_key = "ref";
if (is_nr_block) {
// ref block
ref_key = "nr";
}

for (std::pair<std::string, bool>& allele : normalized_alleles) {
// update allele count for GT[i]

if (allele.second) { // alt
values_[ref_key].ac += count_delta_;
values_[ref_key].an = ngt * count_delta_;
values_[ref_key].end = end_;
values_[ref_key].max_length = max_length_;
if (homozygous) {
values_[ref_key].n_hom += count_delta_;
for (int i = 0; i < ngt; i++) {
// If not missing, update allele count for GT[i]
if (!gt_missing[i]) {
auto alt = alt_string(ref, rec->d.allele[gt[i]]);
std::string ref_key = "ref";
if (is_nr_block) {
// ref block
ref_key = "nr";
}

if (gt[i] == 0) {
values_[ref_key].ac += count_delta_;
values_[ref_key].an = ngt * count_delta_;
values_[ref_key].end = end_;
values_[ref_key].max_length = max_length_;
} else {
values_[alt].ac += count_delta_;
values_[alt].an = ngt * count_delta_;
values_[alt].end = end_;
values_[alt].max_length = max_length_;
}
} else { // alt
auto formatted_alt = alt_string(ref, allele.first);
values_[formatted_alt].ac += count_delta_;
values_[formatted_alt].an = ngt * count_delta_;
values_[formatted_alt].end = end_;
values_[formatted_alt].max_length = max_length_;

// Update homozygote count
if (homozygous && !already_added_homozygous) {
values_[formatted_alt].n_hom += count_delta_;
if (gt[i] == 0) {
values_[ref_key].n_hom += count_delta_;
} else {
values_[alt].n_hom += count_delta_;
}
already_added_homozygous = true;
}
}
already_added_homozygous = true;
}
}

Expand Down Expand Up @@ -904,11 +901,10 @@ void VariantStats::update_results() {
}

std::string VariantStats::alt_string(char* ref, char* alt) {
return std::string(ref) + "," + std::string(alt);
}

std::string VariantStats::alt_string(std::string ref, std::string alt) {
return ref + "," + alt;
std::string normalized_ref = ref;
std::string normalized_alt = alt;
normalize(normalized_ref, normalized_alt);
return normalized_ref + "," + normalized_alt;
}

} // namespace tiledb::vcf
9 changes: 0 additions & 9 deletions libtiledbvcf/src/stats/variant_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -367,15 +367,6 @@ class VariantStats {
* @return std::string ALT string
*/
std::string alt_string(char* ref, char* alt);

/**
* @brief Create an ALT string from the reference and alternate alleles.
*
* @param ref Reference allele
* @param alt Alternate allele
* @return std::string ALT string
*/
std::string alt_string(std::string ref, std::string alt);
};

} // namespace tiledb::vcf
Expand Down
42 changes: 5 additions & 37 deletions libtiledbvcf/src/utils/normalize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,49 +33,17 @@

#include <utils/normalize.h>

static inline std::string& get_allele(std::pair<std::string, bool>& allele) {
return allele.first;
}

static inline std::string& get_allele(std::string& allele) {
return allele;
}

static inline std::string& ref_allele(
std::vector<std::pair<std::string, bool>>& alleles) {
return alleles.back().first;
}

static inline std::string& ref_allele(std::vector<std::string>& alleles) {
return alleles[0];
}

template <typename T>
static inline bool is_ref(std::string& allele, std::vector<T>& alleles) {
return &allele == &ref_allele(alleles);
}

namespace tiledb::vcf {
template <typename T>
void normalize(std::vector<T>& alleles) {
if (alleles.size() == 0) {
return;
}
for (T& allele_wrapper : alleles) {
void normalize(std::string &ref, std::string &alt) {
size_t min_length = UINT64_MAX;
min_length = std::min(min_length, get_allele(allele_wrapper).length());
min_length = std::min(min_length, alt.length());
size_t i = 1;
std::string& allele = get_allele(allele_wrapper);
std::string& ref = ref_allele(alleles);
if (!is_ref(allele, alleles) &&
allele[allele.length() - i] == ref[ref.length() - i]) {
if (alt[alt.length() - i] == ref[ref.length() - i]) {
i++;
}
i--;
allele.resize(allele.size() - i);
}
alt.resize(alt.size() - i);
ref.resize(ref.size() - i);
}

template void normalize(std::vector<std::string>&);
template void normalize(std::vector<std::pair<std::string, bool>>&);
} // namespace tiledb::vcf
3 changes: 1 addition & 2 deletions libtiledbvcf/src/utils/normalize.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@
namespace tiledb {
namespace vcf {

template <typename T>
void normalize(std::vector<T>& alleles);
void normalize(std::string &ref, std::string &alt);

} // namespace vcf
} // namespace tiledb
Expand Down

0 comments on commit 3ef786b

Please sign in to comment.