diff --git a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz index 00f8a725..d73c435b 100644 Binary files a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz and b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz differ diff --git a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi index 1a63a1ad..a26472fb 100644 Binary files a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi and b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi differ diff --git a/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py b/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py index 1c23bc10..1d48d500 100644 --- a/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py +++ b/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py @@ -1,3 +1,4 @@ +import gzip import sys import allel import zarr @@ -58,10 +59,10 @@ def main(): required=True, help="Sample identifier.") parser.add_argument("--contig", - required=True, + required=False, action='append', dest='contigs', - help="Contig to extract. Multiple values may be provided.") + help="Contig to extract. Multiple values may be provided. Reads VCF contigs by default.") parser.add_argument("--field", required=True, action='append', @@ -120,7 +121,7 @@ def main(): chunk_length = args.chunk_length chunk_width = args.chunk_width do_zip = args.zip - contigs = args.contigs + contigs = args.contigs or [] # If no contigs provided, read from vcf file fields = args.fields log = args.log.strip() @@ -132,9 +133,34 @@ def main(): else: log_file = open(log, "w") log_file_needs_closing = True - + + if not contigs: + if input_vcf_path.endswith((".gz", ".bgz")): + vcf_opener = gzip.open + else: + vcf_opener = open + + with vcf_opener(input_vcf_path, mode="rt") as vcf_open: + for line in vcf_open: + # Assuming vcf header follows standard vcf convention + if line.startswith("#"): + if "contig='\n") + sys.exit(1) + for contig in contigs: allel.vcf_to_zarr( diff --git a/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py b/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py index 9df0fc0f..21535b9f 100644 --- a/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py +++ b/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py @@ -62,6 +62,21 @@ def test_conversion_zip(self): callset = zarr.open("output/example.zarr.zip") self.check_example_callset(callset) + def test_conversion_without_contigs_argument(self): + cmd = ["python", "sample_vcf_to_zarr.py", + "--input", "fixture/example.vcf.gz", + "--output", "output/example.zarr", + "--sample", "NA00001", + "--field", "variants/DP", + "--field", "calldata/GT", + "--field", "calldata/GQ", + ] + result = subprocess.run(cmd, + check=True, + capture_output=True) + print(result.stdout.decode()) + callset = zarr.open("output/example.zarr") + self.check_example_callset(callset) if __name__ == '__main__': unittest.main()