From 640850c40ff8e7303f18299b84434aee68355157 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 1 Mar 2022 16:29:43 +0100 Subject: [PATCH 01/12] Check for non-ACGTN REF allele when atomizing, otherwise faulty VCFs can cause an abort via the assert bcftools: abuf.c:154: _atomize_allele: Assertion `atom' failed. Resolves #1668 --- abuf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/abuf.c b/abuf.c index a97332a9f..8ffe3db3b 100644 --- a/abuf.c +++ b/abuf.c @@ -3,17 +3,17 @@ Copyright (c) 2021 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -117,7 +117,7 @@ void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) if ( key==INFO_TAG ) { buf->split.info_tag = *((char**)value); - bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); return; } if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } @@ -141,7 +141,7 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; int Mlen = rlen > alen ? rlen : alen; - atom_t *atom = NULL; + atom_t *atom = NULL; int i; for (i=0; itmp2+num_size,missing_ptr,num_size); else memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size); - if ( type==BCF_HT_INT && mode==M_SUM ) + if ( type==BCF_HT_INT && mode==M_SUM ) { uint8_t *tbl = buf->split.tbl + iout*buf->split.nori; for (i=iori; isplit.nori; i++) @@ -542,7 +542,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo { int star_allele = _has_star_allele(buf,iout); bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; - int ret = 0; + int ret = 0; if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); else if ( len==BCF_VL_A && type!=BCF_HT_STR ) @@ -707,7 +707,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) buf->vcf[j] = bcf_dup(rec); return; } - for (i=1; in_allele; i++) + for (i=0; in_allele; i++) { if ( _is_acgtn(rec->d.allele[i]) ) continue; rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); From 594e9b2a7f587741726b34db80d3c854f38bc66c Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 3 Mar 2022 15:10:50 +0000 Subject: [PATCH 02/12] Document supported consequence types. Resolves #1671 --- doc/bcftools.txt | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index aac53f531..5e2fbf4c6 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1425,6 +1425,32 @@ output VCF and are ignored for the prediction analysis. BCSQ=*missense|PER3|ENST00000361923|+|1028M>1028T|7890117T>C ---- +*Supported consequence types* +---- +3_prime_utr +5_prime_utr +coding_sequence +feature_elongation +frameshift +inframe_altering +inframe_deletion +inframe_insertion +intergenic +intron +missense +non_coding +splice_acceptor +splice_donor +splice_region +start_lost +start_retained +stop_gained +stop_lost +stop_retained +synonymous +---- +See also https://ensembl.org/info/genome/variation/prediction/predicted_data.html + [[filter]] From d68fe68551367809770f4128a4d9e3fc7fb0032c Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 8 Mar 2022 14:31:14 +0000 Subject: [PATCH 03/12] Support for filling in FORMAT tags Extend generalized functions so that FORMAT tags can be filled as well, e.g. with bcftools +fill-tags in.bcf -o out.bcf -- -t 'FORMAT/DP:1=int(smpl_sum(FORMAT/AD))' Resolves https://github.com/samtools/bcftools/issues/87#issuecomment-1058675939 --- plugins/fill-tags.c | 126 +++++++++++++----- test/fill-tags-AD.1.out | 9 ++ test/{fill-tags-AD.out => fill-tags-AD.2.out} | 7 +- test/fill-tags-AD.3.out | 9 ++ test/fill-tags-AD.vcf | 5 +- test/test.pl | 4 +- 6 files changed, 117 insertions(+), 43 deletions(-) create mode 100644 test/fill-tags-AD.1.out rename test/{fill-tags-AD.out => fill-tags-AD.2.out} (59%) create mode 100644 test/fill-tags-AD.3.out diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index 414808214..5794e0d63 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -3,17 +3,17 @@ Copyright (c) 2015-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -65,6 +65,7 @@ struct _ftf_t int32_t *ival; int nfval, nival; int type, len, cnt; // VCF type (e.g. BCF_HT_REAL); length (BCF_VL_FIXED); count (e.g. 1) + int info; // filling INFO or FORMAT? }; typedef struct @@ -108,7 +109,7 @@ const char *about(void) const char *usage(void) { - return + return "\n" "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" " FORMAT tag VAF, custom INFO/TAG=func(FMT/TAG).\n" @@ -137,7 +138,10 @@ const char *usage(void) " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -S sample-group.txt -t HWE\n" "\n" " # Calculate total read depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" - " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP:1=int(sum(DP))'\n" + " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP:1=int(sum(FORMAT/DP))'\n" + "\n" + " # Calculate per-sample read depth (FORMAT/DP) from per-sample allelic depths (FORMAT/AD)\n" + " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'FORMAT/DP:1=int(smpl_sum(FORMAT/AD))'\n" "\n" " # Annotate with allelic fraction\n" " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t FORMAT/VAF\n" @@ -172,7 +176,7 @@ void parse_samples(args_t *args, char *fname) smpl = str.s; int ismpl = bcf_hdr_id2int(args->in_hdr,BCF_DT_SAMPLE,smpl); - if ( ismpl<0 ) + if ( ismpl<0 ) { fprintf(stderr,"Warning: The sample not present in the VCF: %s\n",smpl); continue; @@ -265,7 +269,7 @@ void ftf_destroy(args_t *args) } int ftf_filter_expr(args_t *args, bcf1_t *rec, ftf_t *ftf) { - int i,j, nval, nval1; + int i,j,k, nval, nval1; for (i=0; inpop; i++) { args->str.l = 0; @@ -274,23 +278,54 @@ int ftf_filter_expr(args_t *args, bcf1_t *rec, ftf_t *ftf) filter_test(args->pop[i].filter, rec, NULL); const double *val = filter_get_doubles(args->pop[i].filter,&nval,&nval1); - int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval; - int ncopy = nval < nfill ? nval : nfill; int ret; - - if ( ftf->type==BCF_HT_REAL ) + if ( ftf->info ) { - hts_expand(float,nfill,ftf->nfval,ftf->fval); - for (j=0; jfval[j] = val[j]; - for (; jfval[j]); - ret = bcf_update_info_float(args->out_hdr,rec,args->str.s,ftf->fval,nfill); + int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval; + int ncopy = nval < nfill ? nval : nfill; // number of values available, the rest is filled with missing values + if ( ftf->type==BCF_HT_REAL ) + { + hts_expand(float,nfill,ftf->nfval,ftf->fval); + for (j=0; jfval[j] = val[j]; + for (; jfval[j]); + ret = bcf_update_info_float(args->out_hdr,rec,args->str.s,ftf->fval,nfill); + } + else + { + hts_expand(int32_t,nfill,ftf->nival,ftf->ival); + for (j=0; jival[j] = (int)val[j]; + for (; jival[j] = bcf_int32_missing; + ret = bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->ival,nfill); + } } else { - hts_expand(int32_t,nfill,ftf->nival,ftf->ival); - for (j=0; jival[j] = (int)val[j]; - for (; jival[j] = bcf_int32_missing; - ret = bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->ival,nfill); + int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval1; + int ncopy = nval1 < nfill ? nval1 : nfill; + if ( ftf->type==BCF_HT_REAL ) + { + hts_expand(float,nfill*rec->n_sample,ftf->nfval,ftf->fval); + for (k=0; kn_sample; k++) + { + float *dst = ftf->fval + k*nval1; + const double *src = val + k*nval1; + for (j=0; jout_hdr,rec,args->str.s,ftf->fval,nfill*rec->n_sample); + } + else + { + hts_expand(int32_t,nfill*rec->n_sample,ftf->nival,ftf->ival); + for (k=0; kn_sample; k++) + { + int32_t *dst = ftf->ival + k*nval1; + const double *src = val + k*nval1; + for (j=0; jout_hdr,rec,args->str.s,ftf->ival,nfill*rec->n_sample); + } } if ( ret ) error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); @@ -312,10 +347,15 @@ int parse_func(args_t *args, char *tag_expr, char *expr) ftf->type = BCF_HT_REAL; ftf->len = BCF_VL_VAR; ftf->cnt = 0; + ftf->info = 1; char *end = strchr(tag_expr,'='); - ftf->dst_tag = (char*)calloc(end-tag_expr+1,1); - memcpy(ftf->dst_tag, tag_expr, end-tag_expr); + char *beg = tag_expr; + if ( !strncasecmp("info/",tag_expr,5) ) beg += 5; + else if ( !strncasecmp("format/",tag_expr,7) ) { beg += 7; ftf->info = 0; } + else if ( !strncasecmp("fmt/",tag_expr,4) ) { beg += 4; ftf->info = 0; } + ftf->dst_tag = (char*)calloc(end-beg+1,1); + memcpy(ftf->dst_tag, beg, end-beg); char *tmp, *cnt = strchr(ftf->dst_tag,':'); if ( cnt ) { @@ -346,19 +386,31 @@ int parse_func(args_t *args, char *tag_expr, char *expr) args->str.l = 0; ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); - if ( bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) + int update_hdr = 1; + if ( ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_INFO,id) ) { - if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=ftf->len ) + if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_INFO,id)!=ftf->len ) error("Error: the field INFO/%s already exists and its Number definition is different\n",args->str.s); - if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=ftf->cnt ) + if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_INFO,id)!=ftf->cnt ) error("Error: the field INFO/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) error("Error: the field INFO/%s already exists and its Type definition is different\n",args->str.s); + update_hdr = 0; } - else + else if ( !ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) + { + if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=ftf->len ) + error("Error: the field FORMAT/%s already exists and its Number definition is different\n",args->str.s); + if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=ftf->cnt ) + error("Error: the field FORMAT/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) + error("Error: the field FORMAT/%s already exists and its Type definition is different\n",args->str.s); + update_hdr = 0; + } + if ( update_hdr ) { args->str.l = 0; - ksprintf(&args->str, "##INFO=dst_tag,args->pop[i].suffix); + ksprintf(&args->str, "##%s=info?"INFO":"FORMAT",ftf->dst_tag,args->pop[i].suffix); if ( ftf->len==BCF_VL_FIXED ) kputw(ftf->cnt, &args->str); else kputc('.', &args->str); kputs(",Type=", &args->str); @@ -474,7 +526,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int c; while ((c = getopt_long(argc, argv, "?ht:dS:l",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'l': list_tags(); break; case 'd': args->drop_missing = 1; break; @@ -515,8 +567,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) return 0; } -/* - Wigginton 2005, PMID: 15789306 +/* + Wigginton 2005, PMID: 15789306 nref .. number of reference alleles nalt .. number of alt alleles @@ -696,7 +748,7 @@ static void process_fmt(bcf1_t *rec) { pop_t *pop = &args->pop[i]; int32_t an = 0; - for (j=0; jn_allele; j++) + for (j=0; jn_allele; j++) an += pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; args->str.l = 0; @@ -713,7 +765,7 @@ static void process_fmt(bcf1_t *rec) if ( rec->n_allele > 1 ) { pop_t *pop = &args->pop[i]; - for (j=0; jn_allele; j++) + for (j=0; jn_allele; j++) { args->farr[j] = pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; an += args->farr[j]; @@ -747,7 +799,7 @@ static void process_fmt(bcf1_t *rec) if ( rec->n_allele > 1 ) { pop_t *pop = &args->pop[i]; - for (j=0; jn_allele; j++) + for (j=0; jn_allele; j++) args->iarr[j] = pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; } args->str.l = 0; @@ -764,7 +816,7 @@ static void process_fmt(bcf1_t *rec) { pop_t *pop = &args->pop[i]; memset(args->iarr, 0, sizeof(*args->iarr)*(rec->n_allele-1)); - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) args->iarr[j-1] += pop->counts[j].nhet; } args->str.l = 0; @@ -781,7 +833,7 @@ static void process_fmt(bcf1_t *rec) { pop_t *pop = &args->pop[i]; memset(args->iarr, 0, sizeof(*args->iarr)*(rec->n_allele-1)); - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) args->iarr[j-1] += pop->counts[j].nhom; } args->str.l = 0; @@ -798,7 +850,7 @@ static void process_fmt(bcf1_t *rec) { pop_t *pop = &args->pop[i]; memset(args->iarr, 0, sizeof(*args->iarr)*(rec->n_allele-1)); - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) args->iarr[j-1] += pop->counts[j].nhemi; } args->str.l = 0; @@ -819,7 +871,7 @@ static void process_fmt(bcf1_t *rec) memset(args->farr, 0, sizeof(*args->farr)*(2*rec->n_allele)); int nref_tot = pop->counts[0].nhom; for (j=0; jn_allele; j++) nref_tot += pop->counts[j].nhet; // NB this neglects multiallelic genotypes - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) { int nref = nref_tot - pop->counts[j].nhet; int nalt = pop->counts[j].nhet + pop->counts[j].nhom; @@ -956,7 +1008,7 @@ bcf1_t *process(bcf1_t *rec) void destroy(void) { - int i; + int i; for (i=0; inpop; i++) { free(args->pop[i].name); diff --git a/test/fill-tags-AD.1.out b/test/fill-tags-AD.1.out new file mode 100644 index 000000000..6de46619c --- /dev/null +++ b/test/fill-tags-AD.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##FORMAT= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C +chr1 10153 . AC A . . AD=3,1;DP=66 AD 10,1 20,2 30,3 +chr1 10153 . AC A . . AD=3,1;DP=11 AD 10,1 . . diff --git a/test/fill-tags-AD.out b/test/fill-tags-AD.2.out similarity index 59% rename from test/fill-tags-AD.out rename to test/fill-tags-AD.2.out index 7109dbe3d..20a711eef 100644 --- a/test/fill-tags-AD.out +++ b/test/fill-tags-AD.2.out @@ -1,8 +1,9 @@ ##fileformat=VCFv4.2 ##FILTER= +##INFO= ##FORMAT= ##contig= -##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C -chr1 10153 . AC A . . DP=66 AD 10,1 20,2 30,3 -chr1 10153 . AC A . . DP=11 AD 10,1 . . +chr1 10153 . AC A . . AD=3,1;DP=4 AD 10,1 20,2 30,3 +chr1 10153 . AC A . . AD=3,1;DP=4 AD 10,1 . . diff --git a/test/fill-tags-AD.3.out b/test/fill-tags-AD.3.out new file mode 100644 index 000000000..d5ef64000 --- /dev/null +++ b/test/fill-tags-AD.3.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##FORMAT= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C +chr1 10153 . AC A . . AD=3,1 AD:DP 10,1:11 20,2:22 30,3:33 +chr1 10153 . AC A . . AD=3,1 AD:DP 10,1:11 .:. .:. diff --git a/test/fill-tags-AD.vcf b/test/fill-tags-AD.vcf index 6b9f8e42e..be8b52091 100644 --- a/test/fill-tags-AD.vcf +++ b/test/fill-tags-AD.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.2 +##INFO= ##FORMAT= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C -chr1 10153 . AC A . . . AD 10,1 20,2 30,3 -chr1 10153 . AC A . . . AD 10,1 . . +chr1 10153 . AC A . . AD=3,1 AD 10,1 20,2 30,3 +chr1 10153 . AC A . . AD=3,1 AD 10,1 . . diff --git a/test/test.pl b/test/test.pl index 1de8ef7d6..f82dcbbad 100755 --- a/test/test.pl +++ b/test/test.pl @@ -520,7 +520,9 @@ test_vcf_plugin($opts,in=>'fill-tags-hwe',out=>'fill-tags-func.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'XX:1=F_PASS(GT="alt")']); test_vcf_plugin($opts,in=>'fill-tags-AN0',out=>'fill-tags-AN0.out',cmd=>'+fill-tags --no-version',args=>'-- -t all,END,TYPE,F_MISSING'); test_vcf_plugin($opts,in=>'fill-tags-VAF',out=>'fill-tags-VAF.out',cmd=>'+fill-tags --no-version',args=>'-- -t VAF,VAF1'); -test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'DP:1=int(sum(AD))']); +test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'INFO/DP:1=int(sum(FMT/AD))']); +test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.2.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'INFO/DP:1=int(sum(INFO/AD))']); +test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.3.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'FORMAT/DP:1=int(smpl_sum(FMT/AD))']); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.out',cmd=>'+GTisec',args=>' | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.H.out',cmd=>'+GTisec',args=>'-- -H | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hm.out',cmd=>'+GTisec',args=>'-- -Hm | grep -v bcftools'); From 1e7632a66804087f3e443deddfbfdb3d405feb1d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 11 Mar 2022 09:49:38 +0000 Subject: [PATCH 04/12] New `-H, --header-line` option This is a convenience option that allows to pass a header line on command line, and complements the existing `-h, --header-lines` option which requires a file with header lines --- Makefile | 3 +- NEWS | 7 +++ dbuf.h | 71 ++++++++++++++++++++++++++++ test/test.pl | 3 +- vcfannotate.c | 128 +++++++++++++++++++++++++++++--------------------- 5 files changed, 156 insertions(+), 56 deletions(-) create mode 100644 dbuf.h diff --git a/Makefile b/Makefile index c221be2a4..e15f2cf4f 100644 --- a/Makefile +++ b/Makefile @@ -230,12 +230,13 @@ prob1_h = prob1.h $(htslib_vcf_h) $(call_h) smpl_ilist_h = smpl_ilist.h $(htslib_vcf_h) vcfbuf_h = vcfbuf.h $(htslib_vcf_h) abuf_h = abuf.h $(htslib_vcf_h) +dbuf_h = dbuf.h $(htslib_vcf_h) bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h) bam_sample_h = bam_sample.h $(htslib_sam_h) str_finder.o: str_finder.h utlist.h main.o: main.c $(htslib_hts_h) config.h version.h $(bcftools_h) -vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) regidx.h $(htslib_khash_h) +vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) regidx.h $(htslib_khash_h) $(dbuf_h) vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h) regidx.h $(vcfbuf_h) vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h) diff --git a/NEWS b/NEWS index 88d1569ea..119ebb643 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,12 @@ ## Release a.b +* bcftools annotate + + - New `-H, --header-line` convenience option to pass a header line on command line, + this complements the existing `-h, --header-lines` option which requires a file + with header lines + + ## Release 1.15 (21st February 2022) diff --git a/dbuf.h b/dbuf.h new file mode 100644 index 000000000..80b5958ef --- /dev/null +++ b/dbuf.h @@ -0,0 +1,71 @@ +/* The MIT License + + Copyright (c) 2022 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +/* + Simple data buffer +*/ + +#ifndef __DBUF_H__ +#define __DBUF_H__ + +#include + +typedef struct +{ + size_t n,m; + void **dat; +} +dbuf_t; + +static inline dbuf_t *dbuf_push(dbuf_t *buf, void *ptr) +{ + if ( !buf ) buf = calloc(1,sizeof(dbuf_t)); + buf->n++; + hts_expand(void*,buf->n,buf->m,buf->dat); + buf->dat[buf->n-1] = ptr; + return buf; +} + +static inline void *dbuf_ith(dbuf_t *buf, int i) +{ + return buf->dat[i]; +} + +static inline size_t dbuf_n(dbuf_t *buf) +{ + return buf->n; +} + +static inline void dbuf_destroy_free(dbuf_t *buf) +{ + int i; + for (i=0; in; i++) free(buf->dat[i]); + free(buf->dat); + free(buf); +} + +#endif + diff --git a/test/test.pl b/test/test.pl index f82dcbbad..678eba4e8 100755 --- a/test/test.pl +++ b/test/test.pl @@ -450,6 +450,7 @@ test_vcf_annotate($opts,in=>'annotate14',out=>'annotate25.out',args=>'-x FILTER/XX,INFO/XX --force'); test_vcf_annotate($opts,in=>'annotate15',tab=>'annotate15',out=>'annotate26.out',args=>'-s SAMPLE1 -c CHROM,FROM,TO,FMT/FOO,BAR'); test_vcf_annotate($opts,in=>'annotate15',tab=>'annotate15',out=>'annotate27.out',args=>'-s SAMPLE2 -c CHROM,FROM,TO,FMT/FOO,BAR'); +test_vcf_annotate($opts,in=>'annotate15',tab=>'annotate15',out=>'annotate27.out',args=>q[-s SAMPLE2 -c CHROM,FROM,TO,FMT/FOO,BAR -H '##FORMAT=' -H '##INFO=']); test_vcf_annotate($opts,in=>'annotate16',out=>'annotate28.out',args=>'-x FILTER'); test_vcf_annotate($opts,in=>'annotate17.1',tab=>'annotate17.1',out=>'annotate17.1.out',args=>'-c CHROM,BEG,END,A,B -l A:append,B:append'); test_vcf_annotate($opts,in=>'annotate17.2',tab=>'annotate17.1',out=>'annotate17.2.out',args=>'-c CHROM,BEG,END,A,B -l A:append,B:append'); @@ -1520,7 +1521,7 @@ sub test_vcf_annotate bgzip_tabix($opts,file=>$args{tab},suffix=>'tab',args=>'-s1 -b2 -e2'); $annot_fname = "-a $$opts{tmp}/$args{tab}.tab.gz"; $in_fname = "$$opts{path}/$args{in}.vcf"; - $hdr = -e "$$opts{path}/$args{in}.hdr" ? "-h $$opts{path}/$args{in}.hdr" : ''; + $hdr = (-e "$$opts{path}/$args{in}.hdr" && !($args{args}=~/-H/) && !($args{args}=~/--header-line\s/)) ? "-h $$opts{path}/$args{in}.hdr" : ''; } elsif ( exists($args{vcf}) ) { diff --git a/vcfannotate.c b/vcfannotate.c index 42cab68ca..b5e45d426 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -45,6 +45,7 @@ THE SOFTWARE. */ #include "convert.h" #include "smpl_ilist.h" #include "regidx.h" +#include "dbuf.h" struct _args_t; @@ -166,6 +167,7 @@ typedef struct _args_t kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; int columns_is_file, has_append_mode, pair_logic; + dbuf_t *header_lines; } args_t; @@ -400,7 +402,7 @@ static void init_remove_annots(args_t *args) if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT); } else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; - else if ( !strcasecmp("INFO",str.s) ) + else if ( !strcasecmp("INFO",str.s) ) { if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt); tag->handler = remove_info; @@ -453,7 +455,7 @@ static void init_remove_annots(args_t *args) rm_tag_t *tag = &args->rm[args->nrm-1]; if ( hrec->type==BCF_HL_INFO ) tag->handler = remove_info_tag; else if ( hrec->type==BCF_HL_FMT ) tag->handler = remove_format_tag; - else + else { tag->handler = remove_filter; tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]); @@ -469,16 +471,31 @@ static void init_remove_annots(args_t *args) } static void init_header_lines(args_t *args) { - htsFile *file = hts_open(args->header_fname, "rb"); - if ( !file ) error("Error reading %s\n", args->header_fname); - kstring_t str = {0,0,0}; - while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + if ( args->header_fname ) { - if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); - bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) + htsFile *file = hts_open(args->header_fname, "rb"); + if ( !file ) error("Error reading %s\n", args->header_fname); + kstring_t str = {0,0,0}; + while ( hts_getline(file, KS_SEP_LINE, &str) > 0 ) + { + if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); + bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) + } + if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname); + free(str.s); + } + if ( args->header_lines ) + { + int i, n = dbuf_n(args->header_lines); + for (i=0; iheader_lines,i); + if ( bcf_hdr_append(args->hdr_out,line) ) error("Could not parse the header line: %s\n", line); + bcf_hdr_append(args->hdr,line); // the input file may not have the header line if run with -H (and nothing else) + } + dbuf_destroy_free(args->header_lines); + args->header_lines = NULL; } - if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->header_fname); - free(str.s); if (bcf_hdr_sync(args->hdr_out) < 0) error_errno("[%s] Failed to update output header", __func__); if (bcf_hdr_sync(args->hdr) < 0) @@ -486,7 +503,7 @@ static void init_header_lines(args_t *args) } static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { - return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); + return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); } static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr) { @@ -538,9 +555,9 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat if ( !(col->replace & REPLACE_MISSING) ) { bcf_update_filter(args->hdr_out,line,NULL,0); - return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } - + // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) @@ -765,7 +782,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d { if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && - col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND && col->merge_method!=MM_APPEND_MISSING ) error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } @@ -804,7 +821,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - args->tmpi[ntmpi-1] = strtol(str, &end, 10); + args->tmpi[ntmpi-1] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); str = end+1; @@ -859,7 +876,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -875,7 +892,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( ntmpi < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi); if ( col->replace & REPLACE_MISSING ) @@ -962,7 +979,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * hts_expand(float,ntmpf,args->mtmpf,args->tmpf); if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) { - if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) + if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) ) bcf_float_set_missing(args->tmpf[ntmpf-1]); else ntmpf--; @@ -1042,7 +1059,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = col->mm_dbl_ndat = 0; } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1059,7 +1076,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( ntmpf < 0 ) return 0; // nothing to add - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf); if ( col->replace & REPLACE_MISSING ) @@ -1074,7 +1091,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) { assert( col->merge_method==MM_FIRST ); - + int nsrc = 1, lsrc = 0; while ( args->tmps[lsrc] ) { @@ -1092,7 +1109,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in // fill in any missing values in the target VCF (or all, if not present) int i, empty = 0, nstr, mstr = args->tmpks.m; - nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); + nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr); args->tmpks.m = mstr; if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) ) { @@ -1205,7 +1222,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d hts_expand(char,len+1,args->mtmps,args->tmps); memcpy(args->tmps,tab->cols[col->icol],len+1); - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } @@ -1223,7 +1240,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ntmps < 0 ) return 0; // nothing to add } - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) + if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele); if ( col->replace & REPLACE_MISSING ) @@ -1256,8 +1273,8 @@ static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_ for (i=0; il - plen > blen ) @@ -1319,7 +1336,7 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo } return bcf_update_genotypes(args->hdr_out,line,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nsrc ) + else if ( ndst >= nsrc ) { for (i=0; ihdr_out); i++) { @@ -1364,7 +1381,7 @@ static int count_vals(annot_line_t *tab, int icol_beg, int icol_end) for (i=icol_beg; icols[i], *end = str; - if ( str[0]=='.' && !str[1] ) + if ( str[0]=='.' && !str[1] ) { // missing value if ( !nmax ) nmax = 1; @@ -1407,7 +1424,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, } return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { @@ -1422,7 +1439,7 @@ static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING // x . . -TAG .. REPLACE_NON_MISSING - if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; } for (j=0; jhdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out)); } - else if ( ndst >= nvals ) + else if ( ndst >= nvals ) { for (i=0; ihdr_out); i++) { if ( args->sample_map[i]==-1 ) continue; float *src = vals + nvals*args->sample_map[i]; float *dst = args->tmpf2 + ndst*i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; } for (j=0; jtmps2; for (i=0; itmpp2[i] = tmp; tmp += 2; @@ -1549,7 +1566,7 @@ static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **src = vals + args->sample_map[i]; char **dst = args->tmpp2 + i; - if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } + if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; } else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; } else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; } *dst = *src; @@ -1561,7 +1578,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); @@ -1583,7 +1600,7 @@ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void } char *end = str; - ptr[ival] = strtol(str, &end, 10); + ptr[ival] = strtol(str, &end, 10); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1600,7 +1617,7 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); @@ -1616,14 +1633,14 @@ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void { if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value { - bcf_float_set_missing(ptr[ival]); + bcf_float_set_missing(ptr[ival]); ival++; str += str[1] ? 2 : 1; continue; } char *end = str; - ptr[ival] = strtod(str, &end); + ptr[ival] = strtod(str, &end); if ( end==str ) error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); @@ -1640,7 +1657,7 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) + if ( col->icol+args->nsmpl_annot > tab->ncols ) error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); int ismpl; @@ -1664,12 +1681,12 @@ static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int if ( has_value ) { if ( j==ndip ) - { + { smpl[i] = 2; - max_ploidy = 2; + max_ploidy = 2; } else if ( j==nals ) - { + { smpl[i] = 1; if ( !max_ploidy ) max_ploidy = 1; } @@ -1880,7 +1897,7 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, if ( k>=0 ) { if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]); - else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); + else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); else ptr_dst[k] = ptr_src[j]; } } @@ -1933,7 +1950,7 @@ static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, v if ( ndst1 < n ) ndst1 = n; } assert( ndst1 ); - + int ndst = ndst1*nsmpl_dst; hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated @@ -2266,7 +2283,7 @@ static void init_columns(args_t *args) col->hdr_key_dst = strdup(str.s); col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5); int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src); - if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) + if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname); if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) error("Only Type=String tags can be used to annotate the ID column\n"); @@ -2766,7 +2783,7 @@ static void init_data(args_t *args) args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt); } if ( args->remove_annots ) init_remove_annots(args); - if ( args->header_fname ) init_header_lines(args); + if ( args->header_fname || args->header_lines ) init_header_lines(args); if ( args->targets_fname && args->tgts_is_vcf ) { // reading annots from a VCF @@ -2932,7 +2949,7 @@ static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) } if ( args->ref_idx != -1 ) { - if ( args->ref_idx >= tmp->ncols ) + if ( args->ref_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); if ( args->alt_idx >= tmp->ncols ) error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); @@ -3256,6 +3273,7 @@ static void usage(args_t *args) fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " --force Continue despite parsing error (at your own risk!)\n"); + fprintf(stderr, " -H, --header-line STR Header line which should be appended to the VCF header, can be given multiple times\n"); fprintf(stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n"); fprintf(stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); @@ -3325,6 +3343,7 @@ int main_vcfannotate(int argc, char *argv[]) {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, + {"header-line",required_argument,NULL,'H'}, {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"single-overlaps",no_argument,NULL,10}, @@ -3334,18 +3353,18 @@ int main_vcfannotate(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:H:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; case 'k': args->keep_sites = 1; break; - case 'm': + case 'm': args->mark_sites_logic = MARK_LISTED; if ( optarg[0]=='+' ) args->mark_sites = optarg+1; else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } - else args->mark_sites = optarg; + else args->mark_sites = optarg; break; - case 'l': + case 'l': if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str); kputs(optarg,&args->merge_method_str); break; @@ -3384,6 +3403,7 @@ int main_vcfannotate(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'h': args->header_fname = optarg; break; + case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break; case 1 : args->rename_chrs = optarg; break; case 2 : if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; @@ -3426,7 +3446,7 @@ int main_vcfannotate(int argc, char *argv[]) } if ( args->targets_fname ) { - htsFile *fp = hts_open(args->targets_fname,"r"); + htsFile *fp = hts_open(args->targets_fname,"r"); if ( !fp ) error("Failed to open %s\n", args->targets_fname); htsFormat type = *hts_get_format(fp); hts_close(fp); @@ -3467,7 +3487,7 @@ int main_vcfannotate(int argc, char *argv[]) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); continue; From 0ce834b4e4a02173473bac2d4be9077bd13882b1 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 16 Mar 2022 12:32:20 +0000 Subject: [PATCH 05/12] Switch to https for htslib git clone Due to deprecation of the unencrypted git protocol at GitHub. See https://github.blog/2021-09-01-improving-git-protocol-security-github --- .ci_helpers/clone | 2 +- .cirrus.yml | 2 +- INSTALL | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci_helpers/clone b/.ci_helpers/clone index 9913e7719..a01071d8b 100755 --- a/.ci_helpers/clone +++ b/.ci_helpers/clone @@ -11,7 +11,7 @@ branch=$3 ref='' [ -n "$branch" ] && ref=$(git ls-remote --heads "$repository" "$branch" 2>/dev/null) -[ -z "$ref" ] && repository='git://github.com/samtools/htslib.git' +[ -z "$ref" ] && repository='https://github.com/samtools/htslib.git' set -x git clone --recurse-submodules --shallow-submodules --depth=1 ${ref:+--branch="$branch"} "$repository" "$localdir" diff --git a/.cirrus.yml b/.cirrus.yml index e9c44d427..90cd56939 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -21,7 +21,7 @@ timeout_in: 10m # clone with our own commands too. clone_template: &HTSLIB_CLONE htslib_clone_script: | - .ci_helpers/clone "git://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" + .ci_helpers/clone "https://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" #-------------------------------------------------- diff --git a/INSTALL b/INSTALL index c4af072a8..bcdd2f4e3 100644 --- a/INSTALL +++ b/INSTALL @@ -3,8 +3,8 @@ For the impatient The latest source code can be downloaded from github and compiled using: - git clone --recurse-submodules git://github.com/samtools/htslib.git - git clone git://github.com/samtools/bcftools.git + git clone --recurse-submodules https://github.com/samtools/htslib.git + git clone https://github.com/samtools/bcftools.git cd bcftools # The following is optional: # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters From 0159b96aa70a3770884fa7549588b0c8e0f311c4 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 21 Mar 2022 17:13:16 +0100 Subject: [PATCH 06/12] Allow multiple custom functions in a single run Previously the program would silently go with the last one, assigning the same values to all. Fixes #1684 --- NEWS | 9 ++ plugins/fill-tags.c | 263 +++++++++++++++++++++------------------- test/fill-tags-AD.4.out | 10 ++ test/test.pl | 1 + 4 files changed, 157 insertions(+), 126 deletions(-) create mode 100644 test/fill-tags-AD.4.out diff --git a/NEWS b/NEWS index 119ebb643..bff2847c5 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,15 @@ this complements the existing `-h, --header-lines` option which requires a file with header lines +* bcftools +fill-tags + + - Extend generalized functions so that FORMAT tags can be filled as well, for example: + + bcftools +fill-tags in.bcf -o out.bcf -- -t 'FORMAT/DP:1=int(smpl_sum(FORMAT/AD))' + + - Allow multiple custom functions in a single run. Previously the program would silently + go with the last one, assigning the same values to all (#1684) + ## Release 1.15 (21st February 2022) diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index 5794e0d63..73d57bea3 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -56,7 +56,8 @@ typedef struct _args_t args_t; typedef struct _ftf_t ftf_t; -typedef int (*fill_tag_f)(args_t *, bcf1_t *, ftf_t *); +typedef struct _pop_t pop_t; +typedef int (*fill_tag_f)(args_t *, bcf1_t *, pop_t *, ftf_t *); struct _ftf_t { char *src_tag, *dst_tag; @@ -66,6 +67,7 @@ struct _ftf_t int nfval, nival; int type, len, cnt; // VCF type (e.g. BCF_HT_REAL); length (BCF_VL_FIXED); count (e.g. 1) int info; // filling INFO or FORMAT? + filter_t *filter; }; typedef struct @@ -74,16 +76,16 @@ typedef struct } counts_t; -typedef struct +struct _pop_t { int ns; int ncounts, mcounts; counts_t *counts; char *name, *suffix; int nsmpl, *smpl; - filter_t *filter; -} -pop_t; + ftf_t *ftf; + int nftf; +}; struct _args_t { @@ -96,8 +98,6 @@ struct _args_t int mhwe_probs; kstring_t str; kbitset_t *bset; - ftf_t *ftf; - int nftf; }; static args_t *args; @@ -254,82 +254,81 @@ void init_pops(args_t *args) } } -void ftf_destroy(args_t *args) +void ftf_destroy(pop_t *pop) { int i; - for (i=0; inftf; i++) + for (i=0; inftf; i++) { - ftf_t *ftf = &args->ftf[i]; + ftf_t *ftf = &pop->ftf[i]; free(ftf->src_tag); free(ftf->dst_tag); free(ftf->fval); free(ftf->ival); + if ( ftf->filter ) filter_destroy(ftf->filter); } - free(args->ftf); + free(pop->ftf); } -int ftf_filter_expr(args_t *args, bcf1_t *rec, ftf_t *ftf) +int ftf_filter_expr(args_t *args, bcf1_t *rec, pop_t *pop, ftf_t *ftf) { - int i,j,k, nval, nval1; - for (i=0; inpop; i++) - { - args->str.l = 0; - ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); + int j,k, nval, nval1; - filter_test(args->pop[i].filter, rec, NULL); - const double *val = filter_get_doubles(args->pop[i].filter,&nval,&nval1); + args->str.l = 0; + ksprintf(&args->str, "%s%s", ftf->dst_tag,pop->suffix); - int ret; - if ( ftf->info ) + filter_test(ftf->filter, rec, NULL); + const double *val = filter_get_doubles(ftf->filter,&nval,&nval1); + + int ret; + if ( ftf->info ) + { + int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval; + int ncopy = nval < nfill ? nval : nfill; // number of values available, the rest is filled with missing values + if ( ftf->type==BCF_HT_REAL ) { - int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval; - int ncopy = nval < nfill ? nval : nfill; // number of values available, the rest is filled with missing values - if ( ftf->type==BCF_HT_REAL ) - { - hts_expand(float,nfill,ftf->nfval,ftf->fval); - for (j=0; jfval[j] = val[j]; - for (; jfval[j]); - ret = bcf_update_info_float(args->out_hdr,rec,args->str.s,ftf->fval,nfill); - } - else - { - hts_expand(int32_t,nfill,ftf->nival,ftf->ival); - for (j=0; jival[j] = (int)val[j]; - for (; jival[j] = bcf_int32_missing; - ret = bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->ival,nfill); - } + hts_expand(float,nfill,ftf->nfval,ftf->fval); + for (j=0; jfval[j] = val[j]; + for (; jfval[j]); + ret = bcf_update_info_float(args->out_hdr,rec,args->str.s,ftf->fval,nfill); } else { - int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval1; - int ncopy = nval1 < nfill ? nval1 : nfill; - if ( ftf->type==BCF_HT_REAL ) + hts_expand(int32_t,nfill,ftf->nival,ftf->ival); + for (j=0; jival[j] = (int)val[j]; + for (; jival[j] = bcf_int32_missing; + ret = bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->ival,nfill); + } + } + else + { + int nfill = ftf->len==BCF_VL_FIXED ? ftf->cnt : nval1; + int ncopy = nval1 < nfill ? nval1 : nfill; + if ( ftf->type==BCF_HT_REAL ) + { + hts_expand(float,nfill*rec->n_sample,ftf->nfval,ftf->fval); + for (k=0; kn_sample; k++) { - hts_expand(float,nfill*rec->n_sample,ftf->nfval,ftf->fval); - for (k=0; kn_sample; k++) - { - float *dst = ftf->fval + k*nval1; - const double *src = val + k*nval1; - for (j=0; jout_hdr,rec,args->str.s,ftf->fval,nfill*rec->n_sample); + float *dst = ftf->fval + k*nval1; + const double *src = val + k*nval1; + for (j=0; jout_hdr,rec,args->str.s,ftf->fval,nfill*rec->n_sample); + } + else + { + hts_expand(int32_t,nfill*rec->n_sample,ftf->nival,ftf->ival); + for (k=0; kn_sample; k++) { - hts_expand(int32_t,nfill*rec->n_sample,ftf->nival,ftf->ival); - for (k=0; kn_sample; k++) - { - int32_t *dst = ftf->ival + k*nval1; - const double *src = val + k*nval1; - for (j=0; jout_hdr,rec,args->str.s,ftf->ival,nfill*rec->n_sample); + int32_t *dst = ftf->ival + k*nval1; + const double *src = val + k*nval1; + for (j=0; jout_hdr,rec,args->str.s,ftf->ival,nfill*rec->n_sample); } - if ( ret ) - error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); } + if ( ret ) + error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); return 0; } void hdr_append(args_t *args, char *fmt) @@ -338,11 +337,11 @@ void hdr_append(args_t *args, char *fmt) for (i=0; inpop; i++) bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); } -int parse_func(args_t *args, char *tag_expr, char *expr) +int parse_func_pop(args_t *args, pop_t *pop, char *tag_expr, char *expr) { - args->nftf++; - args->ftf = (ftf_t *)realloc(args->ftf,sizeof(*args->ftf)*args->nftf); - ftf_t *ftf = &args->ftf[ args->nftf - 1 ]; + pop->nftf++; + pop->ftf = (ftf_t *)realloc(pop->ftf,sizeof(*pop->ftf)*pop->nftf); + ftf_t *ftf = &pop->ftf[ pop->nftf - 1 ]; memset(ftf,0,sizeof(ftf_t)); ftf->type = BCF_HT_REAL; ftf->len = BCF_VL_VAR; @@ -380,71 +379,76 @@ int parse_func(args_t *args, char *tag_expr, char *expr) ftf->func = ftf_filter_expr; uint8_t *samples = (uint8_t*)calloc(bcf_hdr_nsamples(args->in_hdr),1); - int i,j; - for (i=0; inpop; i++) + args->str.l = 0; + ksprintf(&args->str, "%s%s", ftf->dst_tag,pop->suffix); + int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); + int update_hdr = 1; + if ( ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_INFO,id) ) + { + if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_INFO,id)!=ftf->len ) + error("Error: the field INFO/%s already exists and its Number definition is different\n",args->str.s); + if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_INFO,id)!=ftf->cnt ) + error("Error: the field INFO/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) + error("Error: the field INFO/%s already exists and its Type definition is different\n",args->str.s); + update_hdr = 0; + } + else if ( !ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) + { + if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=ftf->len ) + error("Error: the field FORMAT/%s already exists and its Number definition is different\n",args->str.s); + if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=ftf->cnt ) + error("Error: the field FORMAT/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); + if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) + error("Error: the field FORMAT/%s already exists and its Type definition is different\n",args->str.s); + update_hdr = 0; + } + if ( update_hdr ) { args->str.l = 0; - ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); - int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); - int update_hdr = 1; - if ( ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_INFO,id) ) + ksprintf(&args->str, "##%s=info?"INFO":"FORMAT",ftf->dst_tag,pop->suffix); + if ( ftf->len==BCF_VL_FIXED ) kputw(ftf->cnt, &args->str); + else kputc('.', &args->str); + kputs(",Type=", &args->str); + if ( ftf->type==BCF_HT_INT ) kputs("Integer", &args->str); + else kputs("Float", &args->str); + kputs(",Description=\"Added by +fill-tags expression ", &args->str); + // escape quotes + char *tmp = tag_expr; + while ( *tmp ) { - if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_INFO,id)!=ftf->len ) - error("Error: the field INFO/%s already exists and its Number definition is different\n",args->str.s); - if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_INFO,id)!=ftf->cnt ) - error("Error: the field INFO/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); - if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) - error("Error: the field INFO/%s already exists and its Type definition is different\n",args->str.s); - update_hdr = 0; - } - else if ( !ftf->info && bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) - { - if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=ftf->len ) - error("Error: the field FORMAT/%s already exists and its Number definition is different\n",args->str.s); - if ( ftf->len==BCF_VL_FIXED && bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=ftf->cnt ) - error("Error: the field FORMAT/%s already exists and its Number definition is different from %d\n",args->str.s,ftf->cnt); - if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=ftf->type ) - error("Error: the field FORMAT/%s already exists and its Type definition is different\n",args->str.s); - update_hdr = 0; - } - if ( update_hdr ) - { - args->str.l = 0; - ksprintf(&args->str, "##%s=info?"INFO":"FORMAT",ftf->dst_tag,args->pop[i].suffix); - if ( ftf->len==BCF_VL_FIXED ) kputw(ftf->cnt, &args->str); - else kputc('.', &args->str); - kputs(",Type=", &args->str); - if ( ftf->type==BCF_HT_INT ) kputs("Integer", &args->str); - else kputs("Float", &args->str); - kputs(",Description=\"Added by +fill-tags expression ", &args->str); - // escape quotes - char *tmp = tag_expr; - while ( *tmp ) - { - if ( *tmp=='"' ) kputc('\\', &args->str); - kputc(*tmp, &args->str); - tmp++; - } - if ( *args->pop[i].name ) - { - kputs(" in ", &args->str); - kputs(args->pop[i].name, &args->str); - } - kputs("\">", &args->str); - bcf_hdr_append(args->out_hdr, args->str.s); + if ( *tmp=='"' ) kputc('\\', &args->str); + kputc(*tmp, &args->str); + tmp++; } - args->pop[i].filter = filter_init(args->in_hdr, filter); - memset(samples,0,sizeof(*samples)*bcf_hdr_nsamples(args->in_hdr)); - if ( *args->pop[i].name ) + if ( *pop->name ) { - for (j=0; jpop[i].nsmpl; j++) samples[ args->pop[i].smpl[j] ] = 1; - filter_set_samples(args->pop[i].filter, samples); + kputs(" in ", &args->str); + kputs(pop->name, &args->str); } + kputs("\">", &args->str); + bcf_hdr_append(args->out_hdr, args->str.s); + } + ftf->filter = filter_init(args->in_hdr, filter); + args->unpack |= filter_max_unpack(ftf->filter); + memset(samples,0,sizeof(*samples)*bcf_hdr_nsamples(args->in_hdr)); + if ( *pop->name ) + { + int j; + for (j=0; jnsmpl; j++) samples[ pop->smpl[j] ] = 1; + filter_set_samples(ftf->filter, samples); } free(samples); free(filter); return SET_FUNC; } +int parse_func(args_t *args, char *tag_expr, char *expr) +{ + int i, ret = 0; + for (i=0; inpop; i++) + ret |= parse_func_pop(args,&args->pop[i],tag_expr,expr); + return ret; +} int parse_tags(args_t *args, const char *str) { if ( !args->in_hdr ) error("%s", usage()); @@ -564,6 +568,12 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) if ( args->tags & SET_VAF ) bcf_hdr_append(args->out_hdr, "##FORMAT="); if ( args->tags & SET_VAF1 ) bcf_hdr_append(args->out_hdr, "##FORMAT="); + int i, max_unpack_bit = 0; + for (i=0; i<=3; i++) + if ( args->unpack & (1<unpack |= 1<in_hdr);; - for (i=0; inftf; i++) - args->ftf[i].func(args, rec, &args->ftf[i]); - bcf_fmt_t *fmt_gt = NULL; for (i=0; in_fmt; i++) if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } @@ -972,6 +977,13 @@ static void process_vaf_vaf1(bcf1_t *rec) bcf1_t *process(bcf1_t *rec) { + int i,j; + + bcf_unpack(rec, args->unpack); + for (i=0; inpop; i++) + for (j=0; jpop[i].nftf; j++) + args->pop[i].ftf[j].func(args, rec, &args->pop[i], &args->pop[i].ftf[j]); + if ( args->unpack & BCF_UN_FMT ) { process_fmt(rec); @@ -1015,7 +1027,7 @@ void destroy(void) free(args->pop[i].suffix); free(args->pop[i].smpl); free(args->pop[i].counts); - if ( args->pop[i].filter ) filter_destroy(args->pop[i].filter); + ftf_destroy(&args->pop[i]); } kbs_destroy(args->bset); free(args->str.s); @@ -1024,7 +1036,6 @@ void destroy(void) free(args->iarr); free(args->farr); free(args->hwe_probs); - ftf_destroy(args); free(args); } diff --git a/test/fill-tags-AD.4.out b/test/fill-tags-AD.4.out new file mode 100644 index 000000000..19a7f7c1e --- /dev/null +++ b/test/fill-tags-AD.4.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##FORMAT= +##contig= +##INFO= +##INFO=10)"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C +chr1 10153 . AC A . . AD=3,1;XX=1;YY=2 AD 10,1 20,2 30,3 +chr1 10153 . AC A . . AD=3,1;XX=1;YY=0 AD 10,1 . . diff --git a/test/test.pl b/test/test.pl index 678eba4e8..549df70c7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -524,6 +524,7 @@ test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'INFO/DP:1=int(sum(FMT/AD))']); test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.2.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'INFO/DP:1=int(sum(INFO/AD))']); test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.3.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'FORMAT/DP:1=int(smpl_sum(FMT/AD))']); +test_vcf_plugin($opts,in=>'fill-tags-AD',out=>'fill-tags-AD.4.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'XX=N_PASS(FMT/AD[:0]<=10)','YY=N_PASS(FMT/AD[:0]>10)']); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.out',cmd=>'+GTisec',args=>' | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.H.out',cmd=>'+GTisec',args=>'-- -H | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hm.out',cmd=>'+GTisec',args=>'-- -Hm | grep -v bcftools'); From 22f0d909aab74f758173c99fc09d571e8bc06aea Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 28 Mar 2022 12:06:09 +0100 Subject: [PATCH 07/12] Sanitize VEP field names that do not form valid INFO tags. Invalid characters are replaced with underscores, uniqueness of newly formed tag names is currently not checked for (a possible future todo). Fixes #1686 --- plugins/split-vep.c | 62 ++++++++++++++++++++++++++++++++++--------- test/split-vep.10.vcf | 4 +++ test/split-vep.25.out | 1 + test/test.pl | 2 ++ 4 files changed, 57 insertions(+), 12 deletions(-) create mode 100644 test/split-vep.10.vcf create mode 100644 test/split-vep.25.out diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 86fa92148..08955970a 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2019-2021 Genome Research Ltd. + Copyright (c) 2019-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -180,7 +180,7 @@ static const char *default_column_types(void) } static const char *usage_text(void) { - return + return "\n" "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n" " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n" @@ -384,14 +384,52 @@ static int query_has_field(char *fmt, char *field, kstring_t *str) } return 1; } +/** + The valid_tag array was generated with + perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 +*/ +static const uint8_t valid_tag[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static void sanitize_field_name(char *fmt) +{ + while ( *fmt ) + { + if ( !valid_tag[(uint8_t)*fmt] ) *fmt = '_'; + fmt++; + } +} char *strdup_annot_prefix(args_t *args, const char *str) { - if ( !args->annot_prefix ) return strdup(str); + char *out; + if ( !args->annot_prefix ) + { + out = strdup(str); + sanitize_field_name(out); + return out; + } int str_len = strlen(str); int prefix_len = strlen(args->annot_prefix); - char *out = calloc(str_len+prefix_len+1,1); + out = calloc(str_len+prefix_len+1,1); memcpy(out,args->annot_prefix,prefix_len); memcpy(out+prefix_len,str,str_len); + sanitize_field_name(out); return out; } static void init_data(args_t *args) @@ -587,7 +625,7 @@ static void init_data(args_t *args) if ( keep ) ksprintf(&str,",%s",ep+1); free(args->column_str); args->column_str = str.s; - ep = str.s; + ep = str.s; continue; } char *tmp = strdup_annot_prefix(args, bp); @@ -637,7 +675,7 @@ static void init_data(args_t *args) ep++; continue; } - else + else error("No such column: \"%s\"\n", bp); } } @@ -957,7 +995,7 @@ static void process_record(args_t *args, bcf1_t *rec) int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); if ( len<=0 ) { - if ( !args->drop_sites ) + if ( !args->drop_sites ) { annot_reset(args->annot, args->nannot); filter_and_output(args,rec,1,1); @@ -1015,7 +1053,7 @@ static void process_record(args_t *args, bcf1_t *rec) else annot_append(ann, "."); // missing value } - + if ( args->duplicate ) { filter_and_output(args, rec, severity_pass, all_missing); @@ -1071,7 +1109,7 @@ int run(int argc, char **argv) char *tmp; while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:xu",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 2 : args->record_cmd_line = 0; break; case 1 : args->column_types = optarg; break; diff --git a/test/split-vep.10.vcf b/test/split-vep.10.vcf new file mode 100644 index 000000000..02ee41bf7 --- /dev/null +++ b/test/split-vep.10.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.2 +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +9 94487044 . C T . . CSQ=T|missense_variant|MODERATE|ROR2|ENSG00000169071|Transcript|ENST00000375708|protein_coding|9/9||ENST00000375708.3:c.1732G>A|ENSP00000364860.3:p.Asp578Asn|1931|1732|578|D/N|Gat/Aat|rs139802697||-1||SNV|HGNC|10257|YES|||||CCDS6691.1|ENSP00000364860|ROR2_HUMAN||UPI000013E8CA||1|tolerated(0.13)|probably_damaging(0.999)|Superfamily:SSF56112&PIRSF:PIRSF000624&Gene3D:1.10.510.10&Pfam:PF07714&PANTHER:PTHR24416:SF132&PANTHER:PTHR24416&PROSITE_profiles:PS50011|||||||||0.000227|0|3.192e-05|6.165e-05|5.789e-05|0|0|0|4.416e-05|0|0|0.000227|AA||||||||||0.46|23.5|2.953273|D|0.293271| diff --git a/test/split-vep.25.out b/test/split-vep.25.out new file mode 100644 index 000000000..1801f45b9 --- /dev/null +++ b/test/split-vep.25.out @@ -0,0 +1 @@ +D 0.293271 diff --git a/test/test.pl b/test/test.pl index 549df70c7..8b7ca65b7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -613,6 +613,8 @@ test_vcf_plugin($opts,in=>'split-vep.8',out=>'split-vep.22.out',cmd=>'+split-vep',args=>qq[-a BCSQ -s worst -f '%POS\\t%Consequence\\t%amino_acid_change\\t%BCSQ\\n' | grep -v ^#]); test_vcf_plugin($opts,in=>'split-vep.9',out=>'split-vep.23.out',cmd=>'+split-vep',args=>qq[-a BCSQ -f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%BCSQ\\n' -d -A tab | grep -v ^#]); test_vcf_plugin($opts,in=>'split-vep.9',out=>'split-vep.24.out',cmd=>'+split-vep',args=>qq[-a BCSQ -f '%xPOS %xConsequence\\n' -p x | grep -v ^#]); +test_vcf_plugin($opts,in=>'split-vep.10',out=>'split-vep.25.out',cmd=>'+split-vep',args=>qq[-a CSQ -f '%M_CAP_pred %M_CAP_score\\n' | grep -v ^#]); +test_vcf_plugin($opts,in=>'split-vep.10',out=>'split-vep.25.out',cmd=>'+split-vep',args=>qq[-a CSQ -f '%xM_CAP_pred %xM_CAP_score\\n' -p x | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]); From ae2bb074228b579ae97a891a6ba8ad32bd9e424d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 29 Mar 2022 13:41:39 +0100 Subject: [PATCH 08/12] Fix the loss of phasing in half-missing genotypes in variant atomization Resolves #1689 --- NEWS | 10 ++++++++++ abuf.c | 5 ++++- test/atomize.split.3.1.out | 7 +++++++ test/atomize.split.3.vcf | 5 +++++ test/test.pl | 1 + 5 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 test/atomize.split.3.1.out create mode 100644 test/atomize.split.3.vcf diff --git a/NEWS b/NEWS index bff2847c5..e7a1a80de 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,16 @@ - Allow multiple custom functions in a single run. Previously the program would silently go with the last one, assigning the same values to all (#1684) +* bcftools norm + + - Fix the loss of phasing in half-missing genotypes in variant atomization (#1689) + +* bcftools +split-vep + + - VEP fields with characters disallowed in VCF tag names by the specification (such as '-' + in 'M-CAP') couldn't be queried. This has been fixed, the program now sanitizes the field + names, replacing invalid characters with underscore (#1686) + ## Release 1.15 (21st February 2022) diff --git a/abuf.c b/abuf.c index 8ffe3db3b..78682d667 100644 --- a/abuf.c +++ b/abuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2021 Genome Research Ltd. + Copyright (c) 2021-2022 Genome Research Ltd. Author: Petr Danecek @@ -466,7 +466,10 @@ static void _split_table_set_gt(abuf_t *buf) error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); int ial = _split_table_get_ial(buf,iout,iori); if ( ial==2 && !star_allele ) + { dst[j] = bcf_gt_missing; + if ( bcf_gt_is_phased(src[j]) ) dst[j] |= 1; + } else dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial); } diff --git a/test/atomize.split.3.1.out b/test/atomize.split.3.1.out new file mode 100644 index 000000000..3ed695234 --- /dev/null +++ b/test/atomize.split.3.1.out @@ -0,0 +1,7 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 50 . G C . . . GT .|1 +1 50 . G T . . . GT 1|. diff --git a/test/atomize.split.3.vcf b/test/atomize.split.3.vcf new file mode 100644 index 000000000..c2b90ed41 --- /dev/null +++ b/test/atomize.split.3.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 50 . G C,T . . . GT 2|1 diff --git a/test/test.pl b/test/test.pl index 8b7ca65b7..bbc1d94bd 100755 --- a/test/test.pl +++ b/test/test.pl @@ -251,6 +251,7 @@ test_vcf_norm($opts,in=>'atomize.split.1',out=>'atomize.split.1.2.out',args=>'--atomize --atom-overlaps . --old-rec-tag OLD_REC'); test_vcf_norm($opts,in=>'atomize.split.2',out=>'atomize.split.2.1.out',args=>'--atomize --old-rec-tag OLD_REC'); test_vcf_norm($opts,in=>'atomize.split.2',out=>'atomize.split.2.2.out',args=>'--atomize --atom-overlaps . --old-rec-tag OLD_REC'); +test_vcf_norm($opts,in=>'atomize.split.3',out=>'atomize.split.3.1.out',args=>'--atomize --atom-overlaps .'); test_vcf_norm($opts,in=>'norm.4',out=>'norm.4.1.out',args=>'-m +both'); test_vcf_norm($opts,in=>'norm.4',out=>'norm.4.2.out',args=>'-m +any'); test_vcf_view($opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>''); From b866bd74f3ccf89e036ae74663519339035e83a9 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 29 Mar 2022 15:36:43 +0100 Subject: [PATCH 09/12] Prevent further segfaults with the combination of --atomize and --check-ref The atomic buffer creates a copy of bcf records and thus requires unpacking. Following the 640850c fix of #1668, this resolves #1674 --- test/norm.2.out | 4 +--- test/norm.2.vcf | 4 +--- test/test.pl | 2 +- vcfmerge.c | 36 ++++++++++++++++++------------------ vcfnorm.c | 19 ++++++++++--------- 5 files changed, 31 insertions(+), 34 deletions(-) diff --git a/test/norm.2.out b/test/norm.2.out index df7a86386..dc5d4c71d 100644 --- a/test/norm.2.out +++ b/test/norm.2.out @@ -1,11 +1,9 @@ ##fileformat=VCFv4.2 ##FILTER= ##contig= -##INFO= -##INFO= -##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 1 10 . G . . . . 1 10 . g gTC . . . 1 10 . g gtC . . . 1 13 . T . . . . +1 14 . a aGC . . . diff --git a/test/norm.2.vcf b/test/norm.2.vcf index 54b62cc0d..e9d694dac 100644 --- a/test/norm.2.vcf +++ b/test/norm.2.vcf @@ -1,10 +1,8 @@ ##fileformat=VCFv4.2 ##contig= -##INFO= -##INFO= -##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 1 10 . G . . . . 1 11 . T TCT . . . 1 12 . C CTC . . . 1 13 . T . . . . +1 14 . B BGC . . . diff --git a/test/test.pl b/test/test.pl index bbc1d94bd..2e1d08198 100755 --- a/test/test.pl +++ b/test/test.pl @@ -244,7 +244,7 @@ test_vcf_norm($opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d any'); test_vcf_norm($opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d both'); test_vcf_norm($opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d snps'); -test_vcf_norm($opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>''); +test_vcf_norm($opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>'-c s -a'); test_vcf_norm($opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s'); test_vcf_norm($opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s'); test_vcf_norm($opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>'--atomize --old-rec-tag OLD_REC'); diff --git a/vcfmerge.c b/vcfmerge.c index 92aaa8030..60a80c9fa 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -387,7 +387,7 @@ static void info_rules_init(args_t *args) rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); - else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); + else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type); if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1; @@ -1074,8 +1074,8 @@ static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *t /* * copy_string_field() - copy a comma-separated field * @param src: source string - * @param isrc: index of the field to copy - * @param src_len: length of source string (excluding the terminating \0) + * @param isrc: index of the field to copy + * @param src_len: length of source string (excluding the terminating \0) * @param dst: destination kstring (must be initialized with missing values, e.g. as ".") * @param idst: index of the destination field */ @@ -1334,7 +1334,7 @@ void merge_info(args_t *args, bcf1_t *out) out->d.info[out->n_info].vptr_off = inf->vptr_off; out->d.info[out->n_info].vptr_len = inf->vptr_len; out->d.info[out->n_info].vptr_free = 1; - out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); + out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off); memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off); out->d.info[out->n_info].vptr += inf->vptr_off; if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) ) @@ -1435,7 +1435,7 @@ void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL) if ( line->n_allele <= args->local_alleles + 1 ) { - // sort to the output order, insertion sort, ascending + // sort to the output order, insertion sort, ascending int *map = ma->buf[i].rec[ma->buf[i].cur].map; int *k2k = ma->k2k; int tmp; @@ -1746,7 +1746,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int iori,inew; for (iori=ifrom; iorin_allele; iori++) { - inew = ma->buf[i].rec[irec].map[iori] - ifrom; + inew = ma->buf[i].rec[irec].map[iori] - ifrom; int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } @@ -2310,7 +2310,7 @@ void gvcf_set_alleles(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -2373,11 +2373,11 @@ void gvcf_write_block(args_t *args, int start, int end) for (i=0; ifiles->nreaders; i++) { if ( !gaux[i].active ) continue; - if ( gaux[i].end < start ) - { - gaux[i].active = 0; + if ( gaux[i].end < start ) + { + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } gaux[i].line->d.allele[0][0] = ref; if ( min > gaux[i].end ) min = gaux[i].end; @@ -2430,9 +2430,9 @@ void gvcf_write_block(args_t *args, int start, int end) if ( !gaux[i].active ) continue; if ( gaux[i].end < end ) { - gaux[i].active = 0; + gaux[i].active = 0; maux->buf[i].cur = -1; - continue; + continue; } // next min END position bigger than the current one if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1; @@ -2455,7 +2455,7 @@ void gvcf_write_block(args_t *args, int start, int end) 3 END=5 A B C 6 END=7 A B . 8 END=10 A . . - + */ void gvcf_flush(args_t *args, int done) { @@ -2589,7 +2589,7 @@ void gvcf_stage(args_t *args, int pos) if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1; } else - maux->gvcf_break = line->pos; // must break the gvcf block + maux->gvcf_break = line->pos; // must break the gvcf block } maux->ntmp_arr = nend * sizeof(int32_t); maux->tmp_arr = end; @@ -2710,7 +2710,7 @@ int can_merge(args_t *args) char *id = NULL, ref = 'N'; int i,j,k, ntodo = 0; - for (i=0; inals; i++) + for (i=0; inals; i++) { free(maux->als[i]); maux->als[i] = NULL; @@ -3186,7 +3186,7 @@ int main_vcfmerge(int argc, char *argv[]) if ( args->local_alleles < 1 ) error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg); break; - case 'F': + case 'F': if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD; else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE; else error("Filter logic not recognised: %s\n", optarg); diff --git a/vcfnorm.c b/vcfnorm.c index 460a757a5..38c5de4e5 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -142,6 +142,7 @@ static void seq_to_upper(char *seq, int len) static void fix_ref(args_t *args, bcf1_t *line) { + bcf_unpack(line, BCF_UN_STR); int reflen = strlen(line->d.allele[0]); int i,j, maxlen = reflen, len; for (i=1; in_allele; i++) @@ -160,10 +161,10 @@ static void fix_ref(args_t *args, bcf1_t *line) if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } // is the REF allele missing? - if ( reflen==1 && line->d.allele[0][0]=='.' ) - { - line->d.allele[0][0] = ref[0]; - args->nref.set++; + if ( reflen==1 && line->d.allele[0][0]=='.' ) + { + line->d.allele[0][0] = ref[0]; + args->nref.set++; free(ref); bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; @@ -235,7 +236,7 @@ static void fix_ref(args_t *args, bcf1_t *line) for (j=1; jn_allele; j++) { kputc(',',&str); - if ( j==i ) + if ( j==i ) kputs(line->d.allele[0],&str); else kputs(line->d.allele[j],&str); @@ -1774,7 +1775,7 @@ static int cmpals_match(cmpals_t *ca, bcf1_t *rec) } khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash; - for (j=1; jn_allele; j++) + for (j=1; jn_allele; j++) if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break; if ( jn_allele ) continue; return 1; @@ -1863,7 +1864,7 @@ static void init_data(args_t *args) args->out_hdr = bcf_hdr_dup(args->hdr); if ( args->old_rec_tag ) - bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); @@ -1880,7 +1881,7 @@ static void init_data(args_t *args) } if ( args->atomize==SPLIT ) { - args->abuf = abuf_init(args->hdr, SPLIT); + args->abuf = abuf_init(args->hdr, SPLIT); abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); if ( args->old_rec_tag ) abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); @@ -2226,7 +2227,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); + fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; From 339914054cf1d231e6210db6e9eccfe29ba57fdd Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 30 Mar 2022 08:34:34 +0100 Subject: [PATCH 10/12] Fix endless loop or incorrect AF estimates This problemm occurs would occur when missing genotypes were present and the `--estimate-AF -` option was used. Fixes #1687 --- NEWS | 5 +++++ vcfroh.c | 62 ++++++++++++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/NEWS b/NEWS index e7a1a80de..46ec574f1 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,11 @@ - Fix the loss of phasing in half-missing genotypes in variant atomization (#1689) +* bcftools roh + + - Fix a bug that could result in an endless loop or incorrect AF estimate when + missing genotypes are present and the `--estimate-AF -` option was used (#1687) + * bcftools +split-vep - VEP fields with characters disallowed in VCF tag names by the specification (such as '-' diff --git a/vcfroh.c b/vcfroh.c index 8ca0a9e24..a0802db7e 100644 --- a/vcfroh.c +++ b/vcfroh.c @@ -153,7 +153,7 @@ static void init_data(args_t *args) args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"); if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) ) error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n"); - if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) + if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT ) error("Error: The FORMAT/PL tag not defined as Integer in the header\n"); } @@ -279,15 +279,15 @@ static void init_data(args_t *args) MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ; - MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; args->hmm = hmm_init(2, tprob, 10000); - if ( args->genmap_fname ) + if ( args->genmap_fname ) hmm_set_tprob_func(args->hmm, set_tprob_genmap, args); else if ( args->rec_rate > 0 ) hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header @@ -509,7 +509,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( !args->vi_training ) // single viterbi pass { - hmm_restore(args->hmm, smpl->snapshot); + hmm_restore(args->hmm, smpl->snapshot); int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites; if ( end < smpl->nsites ) smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->sites[smpl->nsites - args->nbuf_olap - 1]); @@ -535,7 +535,7 @@ static void flush_viterbi(args_t *args, int ismpl) if ( args->output_type & OUTPUT_RG ) { - if ( state!=smpl->rg.state ) + if ( state!=smpl->rg.state ) { if ( !state ) // the region ends, flush { @@ -599,7 +599,7 @@ static void flush_viterbi(args_t *args, int ismpl) MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ; MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW; MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ; - MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; + MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; hmm_set_tprob(args->hmm, tprob_arr, 10000); int niter = 0; @@ -627,14 +627,14 @@ static void flush_viterbi(args_t *args, int ismpl) delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev); niter++; args->str.l = 0; - ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", + ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n", name,niter,deltaz,delthw, 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW), 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ)); if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno)); } while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th ); - + // output the results for (i=0; inrid; i++) { @@ -671,7 +671,7 @@ int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) char *tmp, *str = tgt->line.s; i = 0; - while ( *str && i<3 ) + while ( *str && i<3 ) { if ( *str=='\t' ) i++; str++; @@ -722,7 +722,11 @@ int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq) int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr); while ( gt < end ) { - if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue; + if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) + { + gt += 2; + continue; + } if ( bcf_gt_allele(gt[0]) ) nalt++; else nref++; @@ -746,7 +750,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial); if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields - + if ( args->af_smpl ) // subset samples for AF estimate { #define BRANCH(type_t) \ @@ -838,7 +842,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); } - else if ( args->af_fname ) + else if ( args->af_fname ) { // Read AF from a file ret = read_AF(args->files->targets, line, &alt_freq); @@ -875,9 +879,9 @@ int process_line(args_t *args, bcf1_t *line, int ial) if ( ret>0 ) AC = args->itmp[0]; } - if ( AN<=0 || AC<0 ) + if ( AN<=0 || AC<0 ) ret = -1; - else + else alt_freq = (double) AC/AN; } @@ -962,12 +966,12 @@ int process_line(args_t *args, bcf1_t *line, int ial) smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2); if ( !smpl->eprob ) error("Error: failed to alloc %"PRIu64" bytes\n", (uint64_t)(sizeof(*smpl->eprob)*smpl->msites*2)); } - + // Calculate emission probabilities P(D|AZ) and P(D|HW) double *eprob = &smpl->eprob[2*smpl->nsites]; eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq; eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq; - + smpl->sites[smpl->nsites] = line->pos; smpl->nsites++; @@ -994,12 +998,12 @@ static void vcfroh(args_t *args, bcf1_t *line) // Are we done? if ( !line ) - { + { for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i); - return; + return; } - // Skip unwanted lines, for simplicity we consider only biallelic sites + // Skip unwanted lines, for simplicity we consider only biallelic sites if ( line->rid == args->skip_rid ) return; // This can be raw callable VCF with the symbolic unseen allele <*> @@ -1043,7 +1047,7 @@ static void vcfroh(args_t *args, bcf1_t *line) args->prev_pos = line->pos; skip_rid = load_genmap(args, bcf_seqname(args->hdr,line)); } - else if ( args->prev_pos == line->pos ) + else if ( args->prev_pos == line->pos ) { args->ndup++; return; // skip duplicate positions @@ -1161,7 +1165,7 @@ int main_vcfroh(int argc, char *argv[]) switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; - case 2: + case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; @@ -1173,7 +1177,7 @@ int main_vcfroh(int argc, char *argv[]) args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; - case 'O': + case 'O': if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST; if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG; if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ; @@ -1183,10 +1187,10 @@ int main_vcfroh(int argc, char *argv[]) case 'i': args->skip_homref = 1; break; case 'I': args->snps_only = 1; break; case 'G': - args->fake_PLs = 1; + args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); - args->unseen_PL = pow(10,-args->unseen_PL/10.); + args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': @@ -1216,12 +1220,12 @@ int main_vcfroh(int argc, char *argv[]) if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 'V': - args->vi_training = 1; - args->baum_welch_th = strtod(optarg,&tmp); + case 'V': + args->vi_training = 1; + args->baum_welch_th = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg); break; - case 'h': + case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } From ed550b53b0c6143bbc471df2cafdad0dd0074c50 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 1 Apr 2022 16:27:06 +0200 Subject: [PATCH 11/12] Minor usage page update --- plugins/af-dist.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/plugins/af-dist.c b/plugins/af-dist.c index 66a01db1f..b11733652 100644 --- a/plugins/af-dist.c +++ b/plugins/af-dist.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2016-2017 Genome Research Ltd. + Copyright (c) 2016-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -53,10 +53,12 @@ const char *about(void) const char *usage(void) { - return + return "\n" - "About: Collect AF deviation stats and GT probability distribution\n" - " given AF and assuming HWE\n" + "About: Collect AF deviation stats and GT probability distribution given AF and\n" + " assuming HWE. Only non-reference genotypes with known allele frequency\n" + " at the site are considered, their probabilities are 2*AF*(1-AF) for RA\n" + " and AF**2 for the AA genotype.\n" "Usage: bcftools +af-dist [General Options] -- [Plugin Options]\n" "Options:\n" " run \"bcftools plugin\" for a list of common options\n" @@ -71,7 +73,7 @@ const char *usage(void) " -d: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1\n" " -p: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1\n" "Example:\n" - " bcftools +af-tag file.bcf -- -t EUR_AF -p bins.txt\n" + " bcftools +af-dist file.bcf -- -t EUR_AF -p bins.txt\n" "\n"; } @@ -94,9 +96,9 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int c; while ((c = getopt_long(argc, argv, "?ht:d:p:l:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { - case 'l': + case 'l': { char *a,*b; args->list_min = strtod(optarg,&a); From c2c26c9b8563a2072a01512d003a7686cb49d86e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 28 Mar 2022 16:40:36 +0100 Subject: [PATCH 12/12] Add NEWS updates --- NEWS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/NEWS b/NEWS index 46ec574f1..94faf229e 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,11 @@ this complements the existing `-h, --header-lines` option which requires a file with header lines +* bcftools csq + + - A list of consequence types supported by `bcftools csq` has been added to + the manual page. (#1671) + * bcftools +fill-tags - Extend generalized functions so that FORMAT tags can be filled as well, for example: @@ -17,6 +22,11 @@ * bcftools norm + - Fix an assertion failure triggered when a faulty VCF file with a '-' + character in the REF allele was used with `bcftools norm --atomize`. This + option now checks that the REF allele only includes the allowed characters + A, C, G, T and N. (#1668) + - Fix the loss of phasing in half-missing genotypes in variant atomization (#1689) * bcftools roh