From 66d0f6d6ea532d64c850eed3c65de4f519d936cc Mon Sep 17 00:00:00 2001 From: Guus van de Steeg <45566691+GvandeSteeg@users.noreply.github.com> Date: Fri, 10 Sep 2021 14:28:19 +0100 Subject: [PATCH] Update sample_vcf_to_zarr.py (#94) * Update sample_vcf_to_zarr.py Allow contigs to be read from VCF by default, preserving backwards compatibility with existing scripts * Update sample_vcf_to_zarr.py Remove accidental keypresses * read contigs from header instead of from content * add error handling on faulty file * implemented Alistairs suggestions `str.endswith()` can take a tuple of values, so I've combined both `endswith` suggestions * update logs to be more explicit * import gzip module * add extra test to verify new contigs-in-header feature * update fixtures to include contig header Co-authored-by: Guus van de Steeg --- .../SampleVcfToZarr/fixture/example.vcf.gz | Bin 954 -> 982 bytes .../fixture/example.vcf.gz.tbi | Bin 185 -> 186 bytes .../SampleVcfToZarr/sample_vcf_to_zarr.py | 36 +++++++++++++++--- .../test_sample_vcf_to_zarr.py | 15 ++++++++ 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz index 00f8a725f71596daf33d8ad1a948c47bce28eccf..d73c435bb59810bb51fa33d68d41c303a7dd4413 100644 GIT binary patch delta 967 zcmV;&133J;2i6CFABzYC000000RIL6LPG)oxdV+XXDiK*-{U|iq@&qB#h3G4&hZD=*DrCRU-B|lCV1)(UtNb@ zUDt;LWpuzP?b2K7!C95>%e4;Dw@F&;jWThb&C7gSs`N>JI1CC({h~{qK@Fpnj&zo% zx|+vHZvLG0?}a;#ZFamwhanZ znph=pGMNn@>%!TJ%=YQCE(1oQw?YS#%;>Ey$v{`@GA^w9;LD)|=P!7r(jw6n{yR21 z44tblU(e}(!r5o&qTHwiXD3pXB-)b38D87<25(^~2JlsjzTeP0zo8FnI=s&JnX#NF zGJ@d&u~zFH4yVykme>-r)3{Qp#(B8`PpZO6Tp7K&lpoAL-`zkLX4i5*Y`GsMxiY`> zJ}mY3Jw)WK#Spd(tV{_H+oFJERi*?F2eMWVcMS!9ZA_>Z!RC22o8wg5s%jS(1V8WN zWP=&+UUH53|8k90@t_KR##U1dC)4PDS(8*Ay@X74Rl$R(414qs8;|Okxt)5|c++I= zjN&_MQ7rZ9ctKA`#I3QY!Rc|SxzgcjezDLOnPvZ+Fc2GJ~pKSs)qUw-_G zE7nDfG31~S^-Nc&GY7ey*8D^*@S!gc@Y3^%Pn$8i$GO?ge5?1)?|RTi8n4T|Qij1h zjY2XC`v`_|A=b58qh2@*{9f1(1YsvW-TK`f>B5&w){&5x8`x4QjvC&356&?{){1p! pORxU1(JHG5{(KQgha`+fq;C&v3x|5E0tkNfA5A748Bvp)10U6e)0zMP delta 939 zcmV;c162Ih2f7D;ABzYC000000RIL6LPG)oodb_s{yw}N=PON9U;w+-{XW(2%{OEVms%%oZ~OKzkiEj^_J#|EWOwM@WV36 z-Fv|%1sIdxq4H%+CdE^gam4r zCaRc4aa#U8?LUsW12x%J7SZnA%Vb_gGKQ1!bok=^Gah);r7_(nE0ufH2oIU^#=BCz ztK1ln-*t;ZfGmCFNG(LM95Q`35}6 z0>@ENs?DYRV0L?V1C^EAmiuAL{V+~t`AhG^Tz%ietsYtoe#^kJ%;92N6cC1@%)vuV z*6QK0p&*S3*&^7imeUzdl&&haQD)%PK8iP(vhF2+*NFX*Yb3L0nXxmriefmPgini# zq_D{)WTJ`!E=2CIP5#pHD2|!i$`6G%P3F!hzS9=Pd@tjuJm8gz(;t|U8t}4s_QMJg zgg4AFIL77*JijFc4H_T4v{`P;(n872S(MK3qmv8v8$66ZzP173uRT0X#PgHzCtoj7 z14xa3Ug_b(OAOmY?aB*mfBJd}>ZjSE`KCW)9?2EtI9KoRQUm*=#mh6Ay)4mqfEG_f z1Vm{5)t{jHB9TrNsvAU8AO2;89-r*^A=cFK`2IZM1B$hF`)Ko zMG$$Qf*VH|t6%|w^J^6O1&R?wE$$MO7Y?z1EoQrb_7Ur%nZI13>C)r9!OWvp&mVfk zw0ncc-iT@9U@iu;hTiH3Dr{=l6w!LyX|^C_abXM7Fe^2eb8RjJ6^a}oegF_qePVXCbN{Q2Ts%|X8ZCg*n2$AES`eBt8skJ!g1~DDS?fJfc z7eaFv%Je4_j0#6EZZH=yP-X%q*fAJl4@_`dSSMx13FB_nnTag7H9uYnT=iuGUf7P| zkYD_-iT6qz|&liETZ}?$g^z~t7p{~~|fM9q3 N1C*=?O$3v-10TF{)G+`6 diff --git a/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi b/dockerfiles/SampleVcfToZarr/fixture/example.vcf.gz.tbi index 1a63a1ad1c043db7996911fb629b865210cddeff..a26472fb6a969210cfccf314fe057ff7eebb31d5 100644 GIT binary patch delta 132 zcmV-~0DJ$r0lEPZABzYC000000RIL6LPG)ooskhDXKkzy`Oi?=YZ62+x;V@{7(mv` ziew&4eIgswJg7K~zaJ(JwFjnNnFB(zq1n>|)eqBW$qu2XK*eEvn0_BDZWyIUX%N^? mAq`7w$oU7B?id($fwCVwU89RLGN1xfy$}EZvkVH_k(nSS;W1_a delta 131 zcmV-}0DS+t0l5JYABzYC000000RIL6LPG)ooRJYCW|piF`DIYrYZ62+x;V@{7(mv` ziew&4y$BoBJg7K~Uj!3}+5=O6jU7U>q1n>|)eqD621-wXio^IY{XSUSFiMZoAW%dh l4NGgt`3IKn7#QY(vL8HMql+^#paN9A5C8zCR)1=dnjjoeGLrxR diff --git a/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py b/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py index 1c23bc10..1d48d500 100644 --- a/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py +++ b/dockerfiles/SampleVcfToZarr/sample_vcf_to_zarr.py @@ -1,3 +1,4 @@ +import gzip import sys import allel import zarr @@ -58,10 +59,10 @@ def main(): required=True, help="Sample identifier.") parser.add_argument("--contig", - required=True, + required=False, action='append', dest='contigs', - help="Contig to extract. Multiple values may be provided.") + help="Contig to extract. Multiple values may be provided. Reads VCF contigs by default.") parser.add_argument("--field", required=True, action='append', @@ -120,7 +121,7 @@ def main(): chunk_length = args.chunk_length chunk_width = args.chunk_width do_zip = args.zip - contigs = args.contigs + contigs = args.contigs or [] # If no contigs provided, read from vcf file fields = args.fields log = args.log.strip() @@ -132,9 +133,34 @@ def main(): else: log_file = open(log, "w") log_file_needs_closing = True - + + if not contigs: + if input_vcf_path.endswith((".gz", ".bgz")): + vcf_opener = gzip.open + else: + vcf_opener = open + + with vcf_opener(input_vcf_path, mode="rt") as vcf_open: + for line in vcf_open: + # Assuming vcf header follows standard vcf convention + if line.startswith("#"): + if "contig='\n") + sys.exit(1) + for contig in contigs: allel.vcf_to_zarr( diff --git a/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py b/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py index 9df0fc0f..21535b9f 100644 --- a/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py +++ b/dockerfiles/SampleVcfToZarr/test_sample_vcf_to_zarr.py @@ -62,6 +62,21 @@ def test_conversion_zip(self): callset = zarr.open("output/example.zarr.zip") self.check_example_callset(callset) + def test_conversion_without_contigs_argument(self): + cmd = ["python", "sample_vcf_to_zarr.py", + "--input", "fixture/example.vcf.gz", + "--output", "output/example.zarr", + "--sample", "NA00001", + "--field", "variants/DP", + "--field", "calldata/GT", + "--field", "calldata/GQ", + ] + result = subprocess.run(cmd, + check=True, + capture_output=True) + print(result.stdout.decode()) + callset = zarr.open("output/example.zarr") + self.check_example_callset(callset) if __name__ == '__main__': unittest.main()