Skip to content

Commit

Permalink
Generate genotypes
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jul 18, 2024
1 parent 5003c65 commit 5b3b0c5
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 2 deletions.
53 changes: 51 additions & 2 deletions hypothesis_vcf/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,28 @@ class Field:
vcf_key: str
vcf_type: str
vcf_number: str
description: str = "Generated field"

def get_header(self):
return (
f"##{self.category}=<"
f"ID={self.vcf_key},"
f"Type={self.vcf_type},"
f"Number={self.vcf_number},"
f'Description="{self.category},Type={self.vcf_type},Number={self.vcf_number}">'
f'Description="{self.description}">'
)


# GT is a special case, since it has a special syntax, and must be listed as the first
# format field (if present)
GT = Field(
category="FORMAT",
vcf_key="GT",
vcf_type="String",
vcf_number="1",
description="Genotype",
)

# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf

# [Table 1: Reserved INFO keys]
Expand Down Expand Up @@ -133,7 +144,7 @@ def vcf_numbers(category, max_number):
def vcf_fields(category, max_number):
# info flag fields must have number 0
# non-flag fields can't have number 0
return builds(
general_fields = builds(
Field,
category=just(category),
vcf_key=vcf_field_keys(category),
Expand All @@ -143,6 +154,11 @@ def vcf_fields(category, max_number):
lambda field: (field.vcf_type == "Flag" and field.vcf_number == "0")
or (field.vcf_type != "Flag" and field.vcf_number != "0")
)
if category == "INFO":
return general_fields
else:
# FORMAT: GT special case
return one_of(just(GT), general_fields)


# [1.6.1 Fixed fields]
Expand Down Expand Up @@ -183,8 +199,31 @@ def qualities():
)


# [1.6.2 Genotype fields]


def genotypes(alleles, ploidy):
def gt_str(allele_indexes, phased):
sep = "|" if phased else "/"
return sep.join(
[str(idx) if idx is not None else "." for idx in allele_indexes]
)

return builds(
gt_str,
lists(
one_of(integers(0, alleles - 1), none()), min_size=ploidy, max_size=ploidy
),
booleans(),
)


@composite
def vcf_values(draw, field, *, max_number, alt_alleles, ploidy):
# GT special case
if field is GT:
return [draw(genotypes(alleles=alt_alleles + 1, ploidy=ploidy))]

# [1.3 Data types]
if field.vcf_type == "Integer":
# some integer values at lower end of range are not allowed
Expand Down Expand Up @@ -231,6 +270,15 @@ def vcf_number_to_ints(vcf_number, *, max_number, alt_alleles, ploidy):
raise ValueError(f"Number '{vcf_number}' is not supported.")


def ensure_gt_first(format_fields):
# GT must be the first field if present [1.6.2 Genotype fields]
try:
i = format_fields.index(GT)
format_fields.insert(0, format_fields.pop(i))
except ValueError:
pass


@composite
def vcf(
draw,
Expand Down Expand Up @@ -290,6 +338,7 @@ def vcf(
unique_by=lambda f: f.vcf_key.lower(),
)
)
ensure_gt_first(format_fields)
sample_ids = draw(
lists(
text(alphabet=ALPHANUMERIC, min_size=1), max_size=max_samples, unique=True
Expand Down
14 changes: 14 additions & 0 deletions tests/test_strategies.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import pysam
from hypothesis import HealthCheck, given, note, settings
from hypothesis.strategies import data
Expand All @@ -7,6 +9,7 @@
RESERVED_FORMAT_KEYS,
RESERVED_INFO_KEYS,
Field,
genotypes,
vcf_field_keys,
vcf_fields,
vcf_values,
Expand Down Expand Up @@ -40,6 +43,17 @@ def test_format_fields(data):
assert field.vcf_number != "0"


@given(data=data())
def test_genotypes(data):
alleles = 3
ploidy = 2
gt = data.draw(genotypes(alleles=alleles, ploidy=ploidy))
allele_index_strs = re.split("[/|]", gt)
assert len(allele_index_strs) == ploidy
allele_indexes = [int(idx) if idx != "." else None for idx in allele_index_strs]
assert all(0 <= idx < alleles for idx in allele_indexes if idx is not None)


@given(data=data())
def test_vcf_values(data):
field = Field("INFO", "I1", "Integer", "1")
Expand Down

0 comments on commit 5b3b0c5

Please sign in to comment.