Skip to content

Commit

Permalink
Better formatted error message for ancestral alleles
Browse files: browse the repository at this point in the history
  • Loading branch information
hyanwong authored and mergify[bot] committed Jan 23, 2024
1 parent 7db1d38 commit 6b9099a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 15 deletions.
20 changes: 9 additions & 11 deletions tests/test_sgkit.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -527,6 +527,7 @@ def test_ancestral_missingness(tmp_path):
ancestral_allele = ds.variant_ancestral_allele.values
ancestral_allele[0] = "N"
ancestral_allele[11] = "-"
ancestral_allele[12] = "💩"
ancestral_allele[15] = "💩"
ds = ds.drop_vars(["variant_ancestral_allele"])
sgkit.save_dataset(ds, str(zarr_path) + ".tmp")
Expand All @@ -538,19 +539,16 @@ def test_ancestral_missingness(tmp_path):
)
ds = sgkit.load_dataset(str(zarr_path) + ".tmp")
sd = tsinfer.SgkitSampleData(str(zarr_path) + ".tmp")
with pytest.warns(UserWarning, match="The following alleles were not found"):
with pytest.warns(
UserWarning,
match=r"not found in the variant_allele array for the 4 [\s\S]*'💩': 2",
):
inf_ts = tsinfer.infer(sd)
for i, (
inf_var,
var,
) in enumerate(zip(inf_ts.variants(), ts.variants())):
assert inf_var.site.ancestral_state == var.site.ancestral_state or i in [
0,
11,
15,
]
if i in [0, 11, 15]:
for i, (inf_var, var) in enumerate(zip(inf_ts.variants(), ts.variants())):
if i in [0, 11, 12, 15]:
assert inf_var.site.metadata == {"inference_type": "parsimony"}
else:
assert inf_var.site.ancestral_state == var.site.ancestral_state


@pytest.mark.skipif(sys.platform == "win32", reason="File permission errors on Windows")
Expand Down
17 changes: 13 additions & 4 deletions tsinfer/formats.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2405,11 +2405,20 @@ def sites_ancestral_allele(self):
except IndexError:
unknown_alleles[allele] += 1
ret[i] = allele_index
if sum(unknown_alleles.values()) > 0:
tot = sum(unknown_alleles.values())
if tot > 0:
num_sites = len(string_allele)
frac_bad = tot / num_sites
frac_bad_per_type = [v / num_sites for v in unknown_alleles.values()]
summarise_unknown = [
f"'{k}': {v} ({frac * 100:.2f}% of sites)" # Summarise per allele type
for (k, v), frac in zip(unknown_alleles.items(), frac_bad_per_type)
]
warnings.warn(
"The following alleles were not found in the variant_allele array "
"and will be treated as unknown:\n"
f"{unknown_alleles}"
"An ancestral allele was not found in the variant_allele array for "
+ f"the {tot} sites ({frac_bad * 100 :.2f}%) listed below. "
+ "They will be treated as of unknown ancestral state:\n "
+ "\n ".join(summarise_unknown)
)
return ret

Expand Down

0 comments on commit 6b9099a

Please sign in to comment.