Skip to content

Commit

Permalink
Update sample_vcf_to_zarr.py (#94)
Browse files Browse the repository at this point in the history
* Update sample_vcf_to_zarr.py

Allow contigs to be read from VCF by default, preserving backwards compatibility with existing scripts

* Update sample_vcf_to_zarr.py

Remove accidental keypresses

* read contigs from header instead of from content

* add error handling on faulty file

* implemented Alistairs suggestions

`str.endswith()` can take a tuple of values, so I've combined both `endswith` suggestions

* update logs to be more explicit

* import gzip module

* add extra test to verify new contigs-in-header feature

* update fixtures to include contig header

Co-authored-by: Guus van de Steeg <[email protected]>
  • Loading branch information
GvandeSteeg and Guus van de Steeg authored Sep 10, 2021
1 parent 9e10e8e commit 66d0f6d
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 5 deletions.
Binary file modified dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz
Binary file not shown.
Binary file modified dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi
Binary file not shown.
36 changes: 31 additions & 5 deletions dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import gzip
import sys
import allel
import zarr
Expand Down Expand Up @@ -58,10 +59,10 @@ def main():
required=True,
help="Sample identifier.")
parser.add_argument("--contig",
required=True,
required=False,
action='append',
dest='contigs',
help="Contig to extract. Multiple values may be provided.")
help="Contig to extract. Multiple values may be provided. Reads VCF contigs by default.")
parser.add_argument("--field",
required=True,
action='append',
Expand Down Expand Up @@ -120,7 +121,7 @@ def main():
chunk_length = args.chunk_length
chunk_width = args.chunk_width
do_zip = args.zip
contigs = args.contigs
contigs = args.contigs or [] # If no contigs provided, read from vcf file
fields = args.fields
log = args.log.strip()

Expand All @@ -132,9 +133,34 @@ def main():
else:
log_file = open(log, "w")
log_file_needs_closing = True


if not contigs:
if input_vcf_path.endswith((".gz", ".bgz")):
vcf_opener = gzip.open
else:
vcf_opener = open

with vcf_opener(input_vcf_path, mode="rt") as vcf_open:
for line in vcf_open:
# Assuming vcf header follows standard vcf convention
if line.startswith("#"):
if "contig=<ID=" in line:
chrom = line.split("<ID=")[1].split(",")[0]
contigs.append(chrom)
else:
# Header is finished, no need to read the rest
break

try:

if not contigs:
# If there still aren't any contigs present, the vcf was faulty
log_file.write(f"contigs argument is empty and provided VCF does not have a valid contig header line\n\n"
f"either supply contigs with the '--contig' option, "
f"or ensure the VCF header contains a contig header as specified in the VCF specification: "
f"https://samtools.github.io/hts-specs/VCFv4.2.pdf\n"
f"example: '##contig=<ID=20,length=62435964>'\n")
sys.exit(1)

for contig in contigs:

allel.vcf_to_zarr(
Expand Down
15 changes: 15 additions & 0 deletions dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,21 @@ def test_conversion_zip(self):
callset = zarr.open("output/example.zarr.zip")
self.check_example_callset(callset)

def test_conversion_without_contigs_argument(self):
cmd = ["python", "sample_vcf_to_zarr.py",
"--input", "fixture/example.vcf.gz",
"--output", "output/example.zarr",
"--sample", "NA00001",
"--field", "variants/DP",
"--field", "calldata/GT",
"--field", "calldata/GQ",
]
result = subprocess.run(cmd,
check=True,
capture_output=True)
print(result.stdout.decode())
callset = zarr.open("output/example.zarr")
self.check_example_callset(callset)

if __name__ == '__main__':
unittest.main()

0 comments on commit 66d0f6d

Please sign in to comment.