Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add switch for variant stats array version #720

Merged
merged 5 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apis/python/src/tiledbvcf/binding/libtiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,6 @@ PYBIND11_MODULE(libtiledbvcf, m) {
.def("set_enable_variant_stats", &Writer::set_enable_variant_stats)
.def("set_enable_sample_stats", &Writer::set_enable_sample_stats)
.def("set_compress_sample_dim", &Writer::set_compress_sample_dim)
.def("set_compression_level", &Writer::set_compression_level);
.def("set_compression_level", &Writer::set_compression_level)
.def("set_variant_stats_version", &Writer::set_variant_stats_version);
}
6 changes: 6 additions & 0 deletions apis/python/src/tiledbvcf/binding/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -358,4 +358,10 @@ void Writer::set_compression_level(int level) {
check_error(writer, tiledb_vcf_writer_set_compression_level(writer, level));
}

void Writer::set_variant_stats_version(uint8_t version) {
auto writer = ptr.get();
check_error(
writer, tiledb_vcf_writer_set_variant_stats_version(writer, version));
}

} // namespace tiledbvcfpy
5 changes: 5 additions & 0 deletions apis/python/src/tiledbvcf/binding/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,11 @@ class Writer {
*/
void set_compression_level(int level);

/**
Set variant stats array version
*/
void set_variant_stats_version(uint8_t version);

private:
/** Helper function to free a C writer instance */
static void deleter(tiledb_vcf_writer_t* w);
Expand Down
6 changes: 6 additions & 0 deletions apis/python/src/tiledbvcf/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,7 @@ def create_dataset(
enable_sample_stats: bool = True,
compress_sample_dim: bool = True,
compression_level: int = 4,
variant_stats_version: int = 2,
):
"""
Create a new dataset.
Expand Down Expand Up @@ -633,6 +634,8 @@ def create_dataset(
Enable compression on the sample dimension.
compression_level
Compression level for zstd compression.
variant_stats_version
Version of the variant stats array.
"""
if self.mode != "w":
raise Exception("Dataset not open in write mode")
Expand Down Expand Up @@ -675,6 +678,9 @@ def create_dataset(
if compression_level is not None:
self.writer.set_compression_level(compression_level)

if variant_stats_version is not None:
self.writer.set_variant_stats_version(variant_stats_version)

# This call throws an exception if the dataset already exists.
self.writer.create_dataset()

Expand Down
87 changes: 85 additions & 2 deletions apis/python/tests/test_tiledbvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,8 @@ def test_ingest_mode_merged(tmp_path):
and shutil.which("bcftools") is None,
reason="no bcftools",
)
def test_ingest_with_stats(tmp_path):
def test_ingest_with_stats_v3(tmp_path):
# tiledbvcf.config_logging("debug")
tmp_path_contents = os.listdir(tmp_path)
if "stats" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "stats"))
Expand All @@ -1194,7 +1195,9 @@ def test_ingest_with_stats(tmp_path):
shutil.rmtree(os.path.join(tmp_path, "stats_test"))
# tiledbvcf.config_logging("trace")
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="w")
ds.create_dataset(enable_variant_stats=True, enable_allele_count=True)
ds.create_dataset(
enable_variant_stats=True, enable_allele_count=True, variant_stats_version=3
)
ds.ingest_samples(bgzipped_inputs)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
sample_names = [os.path.basename(file).split(".")[0] for file in bgzipped_inputs]
Expand Down Expand Up @@ -1242,6 +1245,86 @@ def test_ingest_with_stats(tmp_path):
assert sum(df["count"] == (8, 5, 3, 4, 2, 2, 1)) == 7


# Ok to skip is missing bcftools in Windows CI job
@pytest.mark.skipif(
os.environ.get("CI") == "true"
and platform.system() == "Windows"
and shutil.which("bcftools") is None,
reason="no bcftools",
)
def test_ingest_with_stats_v2(tmp_path):
# tiledbvcf.config_logging("debug")
tmp_path_contents = os.listdir(tmp_path)
if "stats" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "stats"))
shutil.copytree(
os.path.join(TESTS_INPUT_DIR, "stats"), os.path.join(tmp_path, "stats")
)
raw_inputs = glob.glob(os.path.join(tmp_path, "stats", "*.vcf"))
# print(f"raw inputs: {raw_inputs}")
for vcf_file in raw_inputs:
subprocess.run(
"bcftools view --no-version -Oz -o " + vcf_file + ".gz " + vcf_file,
shell=True,
check=True,
)
bgzipped_inputs = glob.glob(os.path.join(tmp_path, "stats", "*.gz"))
# print(f"bgzipped inputs: {bgzipped_inputs}")
for vcf_file in bgzipped_inputs:
assert subprocess.run("bcftools index " + vcf_file, shell=True).returncode == 0
if "outputs" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
if "stats_test" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "stats_test"))
# tiledbvcf.config_logging("trace")
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="w")
ds.create_dataset(enable_variant_stats=True, enable_allele_count=True)
ds.ingest_samples(bgzipped_inputs)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
sample_names = [os.path.basename(file).split(".")[0] for file in bgzipped_inputs]
data_frame = ds.read(
samples=sample_names,
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
set_af_filter="<0.2",
)
assert data_frame.shape == (1, 8)
assert data_frame.query("sample_name == 'second'")["qual"].iloc[0] == pytest.approx(
343.73
)
assert (
data_frame[data_frame["sample_name"] == "second"]["info_TILEDB_IAF"].iloc[0][0]
== 0.9375
)
data_frame = ds.read(
samples=sample_names,
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
scan_all_samples=True,
)
assert (
data_frame[
(data_frame["sample_name"] == "second") & (data_frame["pos_start"] == 4)
]["info_TILEDB_IAF"].iloc[0][0]
== 0.9375
)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
df = tiledbvcf.allele_frequency.read_allele_frequency(
os.path.join(tmp_path, "stats_test"), "chr1:1-10000"
)
assert df.pos.is_monotonic_increasing
df["an_check"] = (df.ac / df.af).round(0).astype("int32")
assert df.an_check.equals(df.an)
df = ds.read_variant_stats("chr1:1-10000")
assert df.shape == (12, 5)
df = df.to_pandas()
df = ds.read_allele_count("chr1:1-10000")
assert df.shape == (7, 6)
df = df.to_pandas()
assert sum(df["pos"] == (0, 1, 1, 2, 2, 2, 3)) == 7
assert sum(df["count"] == (8, 5, 3, 4, 2, 2, 1)) == 7


# Ok to skip is missing bcftools in Windows CI job
@pytest.mark.skipif(
os.environ.get("CI") == "true"
Expand Down
12 changes: 12 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1811,6 +1811,18 @@ int32_t tiledb_vcf_writer_set_compression_level(
return TILEDB_VCF_OK;
}

int32_t tiledb_vcf_writer_set_variant_stats_version(
tiledb_vcf_writer_t* writer, uint8_t version) {
if (sanity_check(writer) == TILEDB_VCF_ERR)
return TILEDB_VCF_ERR;

if (SAVE_ERROR_CATCH(
writer, writer->writer_->set_variant_stats_array_version(version)))
return TILEDB_VCF_ERR;

return TILEDB_VCF_OK;
}

/* ********************************* */
/* ERROR */
/* ********************************* */
Expand Down
8 changes: 8 additions & 0 deletions libtiledbvcf/src/c_api/tiledbvcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1698,6 +1698,14 @@ TILEDBVCF_EXPORT int32_t tiledb_vcf_writer_set_compress_sample_dim(
TILEDBVCF_EXPORT int32_t
tiledb_vcf_writer_set_compression_level(tiledb_vcf_writer_t* writer, int level);

/**
* Sets variant stats array version
* @param writer VCF writer object
* @param version variant stats array version
*/
TILEDBVCF_EXPORT int32_t tiledb_vcf_writer_set_variant_stats_version(
tiledb_vcf_writer_t* writer, uint8_t version);

/* ********************************* */
/* ERROR */
/* ********************************* */
Expand Down
4 changes: 4 additions & 0 deletions libtiledbvcf/src/cli/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,10 @@ void add_create(CLI::App& app) {
args->checksum,
"Checksum to use for dataset validation on read and writes.")
->transform(CLI::CheckedTransformer(filter_map));
cmd->add_option(
"--variant-stats-version",
args->variant_stats_array_version,
"Select which schema version of variant stats to use: '2', or '3' ");

cmd->option_defaults()->group("Debug options");
add_logging_options(cmd, args->log_level, args->log_file);
Expand Down
1 change: 1 addition & 0 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ void TileDBVCFDataset::create(const CreationParams& params) {
AlleleCount::create(ctx, params.uri, params.checksum);
}
if (params.enable_variant_stats) {
VariantStats::set_array_version(params.variant_stats_array_version);
VariantStats::create(ctx, params.uri, params.checksum);
}
if (params.enable_sample_stats) {
Expand Down
1 change: 1 addition & 0 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ struct CreationParams {
bool enable_sample_stats = true;
bool compress_sample_dim = true;
int compression_level = 4;
uint8_t variant_stats_array_version = 2;
};

/** Arguments/params for dataset registration. */
Expand Down
Loading
Loading