NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. The leading whitespace on context lines is a
best-effort reconstruction; confirm it against the target files before applying.

diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py
index b930263a..8167474a 100644
--- a/tests/test_sgkit.py
+++ b/tests/test_sgkit.py
@@ -600,6 +600,27 @@ def test_ploidy1_unphased(self, tmp_path):
         sgkit.save_dataset(ds, path)
         tsinfer.SgkitSampleData(path)
 
+    def test_duplicate_positions(self, tmp_path):
+        """Two variants sharing one position must raise on construction."""
+        path = tmp_path / "data.zarr"
+        ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
+        ds["variant_position"][2] = ds["variant_position"][1]
+        sgkit.save_dataset(ds, path)
+        with pytest.raises(ValueError, match="duplicate or out-of-order values"):
+            tsinfer.SgkitSampleData(path)
+
+    def test_bad_order_positions(self, tmp_path):
+        """A position that decreases along the array must raise on construction."""
+        path = tmp_path / "data.zarr"
+        ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, phased=True)
+        # Move the first position past the last one using a whole-number offset:
+        # a fractional offset would be truncated if the position array is
+        # integer-typed, giving a duplicate instead of an out-of-order value.
+        ds["variant_position"][0] = ds["variant_position"][2] + 1
+        sgkit.save_dataset(ds, path)
+        with pytest.raises(ValueError, match="duplicate or out-of-order values"):
+            tsinfer.SgkitSampleData(path)
+
     def test_empty_alleles_not_at_end(self, tmp_path):
         path = tmp_path / "data.zarr"
         ds = sgkit.simulate_genotype_call_dataset(n_variant=3, n_sample=3, n_ploidy=1)
diff --git a/tsinfer/formats.py b/tsinfer/formats.py
index 5ae575b7..f578f435 100644
--- a/tsinfer/formats.py
+++ b/tsinfer/formats.py
@@ -2309,6 +2309,13 @@ def __init__(self, path):
                     " sgkit dataset, indicating that all the genotypes are"
                     " unphased"
                 )
+        # A non-positive step between neighbouring positions means a repeat or a decrease.
+        if np.any(np.diff(self.sites_position) <= 0):
+            raise ValueError(
+                "Values taken from the variant_position array are not strictly "
+                "increasing (i.e. have duplicate or out-of-order values). "
+                "These must be masked out to run tsinfer."
+            )
 
     @functools.cached_property
     def format_name(self):