Skip to content

Commit

Permalink
Make the mpileup BAM_CREF_SKIP filter optional.
Browse files Browse the repository at this point in the history
Mpileup removes alignments using the cigar ref skip operator ("N").
This was originally added in 2011 in samtools/samtools#d1643d6 with
the commit message of "fixed a bug in indel calling related to
unmapped and refskip reads".

Unfortunately I don't know what that bug was, but removing the code
shows it still works (at least for some data!).  We need better
understanding of what's going on and why it was added, but this PR
makes it optional, keeping the default as before.  Note there appears
to be no filtering of BAM_CREF_SKIP in indels-2.0 so the option would
be a nop there.

This is an alternative PR to samtools#2281.  I've leave it to the project
maintainer as to what is preferable: removing the (no longer needed?)
filtering, or keeping the default behaviour identical and adding a new
option instead (which is safer, but possibly leads to accidental bad
calls due to not noticing a new option has appeared).

Fixes samtools#2277
  • Loading branch information
jkbonfield committed Sep 17, 2024
1 parent 9d31418 commit 0f3cb9e
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 16 deletions.
2 changes: 1 addition & 1 deletion bam2bcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ plp_cd_t;


typedef struct __bcf_callaux_t {
int fmt_flag, ambig_reads;
int fmt_flag, ambig_reads, ref_skip_reads;
int capQ, min_baseQ, max_baseQ, delta_baseQ;
int openQ, extQ, tandemQ; // for indels
uint32_t min_support, max_support; // for collecting indel candidates
Expand Down
17 changes: 10 additions & 7 deletions bam2bcf_edlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -1559,18 +1559,21 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
}
}

int qbeg, qpos, qend, tbeg, tend, kk;
int qbeg, qpos, qend, tbeg, tend;
uint8_t *seq = bam_get_seq(p->b);
uint32_t *cigar = bam_get_cigar(p->b);
if (p->b->core.flag & BAM_FUNMAP) continue;

// FIXME: the following loop should be better moved outside;
// nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
break;
if (kk < p->b->core.n_cigar)
continue;
if (!bca->ref_skip_reads) {
uint32_t *cigar = bam_get_cigar(p->b);
int kk;
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
break;
if (kk < p->b->core.n_cigar)
continue;
}

// determine the start and end of sequences for alignment
int left2 = left, right2 = right;
Expand Down
17 changes: 10 additions & 7 deletions bam2bcf_indel.c
Original file line number Diff line number Diff line change
Expand Up @@ -845,18 +845,21 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
}
}

int qbeg, qpos, qend, tbeg, tend, kk;
int qbeg, qpos, qend, tbeg, tend;
uint8_t *seq = bam_get_seq(p->b);
uint32_t *cigar = bam_get_cigar(p->b);
if (p->b->core.flag & BAM_FUNMAP) continue;

// FIXME: the following loop should be better moved outside;
// nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
break;
if (kk < p->b->core.n_cigar)
continue;
if (!bca->ref_skip_reads) {
uint32_t *cigar = bam_get_cigar(p->b);
int kk;
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
break;
if (kk < p->b->core.n_cigar)
continue;
}

// determine the start and end of sequences for alignment
// FIXME: loops over CIGAR multiple times
Expand Down
8 changes: 8 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2503,6 +2503,14 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for
exclude from calling and increment the first value of the AD counter
('incAD0') ['drop']

*--ref-skip-reads*::
Include reads with CIGAR 'N' (ref-skip) operators in their alignment.
By default these are filtered out. Enabling this option may be
necessary for calling on some RNASeq pipelines. This works with
the default caller and --indels-cns. The experimental -indels-2.0
option does not filter out alignments with ref-skips so this option
is unnecessary there.

*-e, --ext-prob* 'INT'::
Phred-scaled gap extension sequencing error probability. Reducing 'INT'
leads to longer indels [20]
Expand Down
10 changes: 9 additions & 1 deletion mpileup.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t;
// Data shared by all bam files
typedef struct {
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
max_indel_depth, max_read_len, ambig_reads;
max_indel_depth, max_read_len, ambig_reads, ref_skip_reads;
uint32_t fmt_flag;
int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
Expand Down Expand Up @@ -882,6 +882,7 @@ static int mpileup(mplp_conf_t *conf)
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
conf->bca->fmt_flag = conf->fmt_flag;
conf->bca->ambig_reads = conf->ambig_reads;
conf->bca->ref_skip_reads = conf->ref_skip_reads;
conf->bca->indel_win_size = conf->indel_win_size;
conf->bca->indels_v20 = conf->indels_v20;
conf->bca->edlib = conf->edlib;
Expand Down Expand Up @@ -1291,6 +1292,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR Comma separated list of platforms for indels [all]\n"
" --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
fprintf(fp,
" --ref-skip-reads Use reads with N CIGAR op [discard by default]\n");
fprintf(fp,
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
fprintf(fp,
Expand Down Expand Up @@ -1381,6 +1384,7 @@ int main_mpileup(int argc, char *argv[])
mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD;
mplp.max_read_len = 500;
mplp.ambig_reads = B2B_DROP;
mplp.ref_skip_reads = 0;
mplp.indel_win_size = 110;
mplp.poly_mqual = 0;
mplp.seqQ_offset = 120;
Expand Down Expand Up @@ -1458,6 +1462,7 @@ int main_mpileup(int argc, char *argv[])
{"seed", required_argument, NULL, 13},
{"ambig-reads", required_argument, NULL, 14},
{"ar", required_argument, NULL, 14},
{"ref-skip-reads", no_argument, NULL, 29},
{"write-index",optional_argument,NULL,'W'},
{"del-bias", required_argument, NULL, 23},
{"poly-mqual", no_argument, NULL, 24},
Expand Down Expand Up @@ -1620,6 +1625,9 @@ int main_mpileup(int argc, char *argv[])
if (mplp.seqQ_offset > 200)
mplp.seqQ_offset = 200;
break;
case 29:
mplp.ref_skip_reads = 1;
break;
case 23: mplp.del_bias = atof(optarg); break;
case 24: mplp.poly_mqual = 1; break;
case 26: mplp.poly_mqual = 0; break;
Expand Down

0 comments on commit 0f3cb9e

Please sign in to comment.