From 662227a8496fee91270ebc0445f0aaddf1724fbd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 21 Apr 2021 12:21:22 +0100 Subject: [PATCH] Make mpileup's overlap removal choose a random sequence. Currently it always chooses the second sequence (except for the circumstance of differing base calls). This is essentially random strand and random coordinate in most library strategies, but some targetted sequencing methods have a very strong strand bias (first is + strand, second is - strand) or positional bias (eg PCR amplicons). Given SNPs near the end of sequences can give rise to poor BAQ scores, both position and strand bias are detrimental. This change makes it select either read 'a' or 'b' based on a hash of the read name. Unlike using a traditional random number generator, this gives it consistent behaviour regardless of how many sequences have gone before. An example from SynDip region 1:185M-200M: No overlap removal: SNP Q>0 / Filtered SNP TP 18830 / 18803 SNP FP 264 / 238 SNP GT 56 / 53 SNP FN 459 / 486 InDel TP 2788 / 2697 InDel FP 1022 / 86 InDel GT 353 / 345 InDel FN 596 / 687 Old removal strategy: SNP Q>0 / Filtered SNP TP 18841 / 18813 SNP FP 270 / 243 SNP GT 56 / 54 SNP FN 448 / 476 InDel TP 2754 / 2663 InDel FP 985 / 83 InDel GT 413 / 404 InDel FN 630 / 721 This PR: SNP Q>0 / Filtered SNP TP 18841 / 18814 SNP FP 272 / 242 SNP GT 55 / 53 SNP FN 448 / 475 InDel TP 2765 / 2679 InDel FP 996 / 85 InDel GT 382 / 375 InDel FN 619 / 705 The CPU cost on bcftools mpileup | bcftools call between the latter two tests was 0.4% (which may also just be random fluctuation). Vs the old removal system, this is a marginal improvement for SNPs and, oddly, a significant improvement to Indels. (It's still behind no overlap removal for indels, but I'm unsure on the veracity of small indels in that truth set). Fixes samtools/bcftools#1459 --- htscodecs | 2 +- sam.c | 121 ++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 46 deletions(-) diff --git a/htscodecs b/htscodecs index d7e357946..30bc9fdca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84 +Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 diff --git a/sam.c b/sam.c index 8bda92384..4332cfb02 100644 --- a/sam.c +++ b/sam.c @@ -4840,10 +4840,18 @@ static inline int cigar_iref2iseq_next(const uint32_t **cigar, return -1; } +// Given overlapping read 'a' (left) and 'b' (right) on the same +// template, adjust quality values to zero for either a or b. +// Note versions 1.12 and earlier always removed quality from 'b' for +// matching bases. Now we select a or b semi-randomly based on name hash. +// Returns 0 on success, +// -1 on failure static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { - const uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; - const uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + const uint32_t *a_cigar = bam_get_cigar(a), + *a_cigar_max = a_cigar + a->core.n_cigar; + const uint32_t *b_cigar = bam_get_cigar(b), + *b_cigar_max = b_cigar + b->core.n_cigar; hts_pos_t a_icig = 0, a_iseq = 0; hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); @@ -4852,69 +4860,92 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) hts_pos_t iref = b->core.pos; hts_pos_t a_iref = iref - a->core.pos; hts_pos_t b_iref = iref - b->core.pos; - int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error - int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error - #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, - a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); - #endif + int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) + // no overlap or error + return a_ret<-1 ? -1:0; + + int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, + &b_icig, &b_iseq, &b_iref); + if ( b_ret<0 ) + // no overlap or error + return b_ret<-1 ? -1:0; + + // Determine which seq is the one getting modified qualities. + uint8_t amul, bmul; + if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { + amul = 1; + bmul = 0; + } else { + amul = 0; + bmul = 1; + } + // Loop over the overlapping region nulling qualities in either + // seq a or b. int err = 0; while ( 1 ) { - // Increment reference position + // Step to next matching reference position in a and b while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) - a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; } // done - if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; + a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) { // done + err = a_ret<-1?-1:0; + break; + } + if ( iref < a_iref + a->core.pos ) + iref = a_iref + a->core.pos; while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) - b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; } // done - if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; + b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, + &b_iseq, &b_iref); + if ( b_ret<0 ) { // done + err = b_ret<-1?-1:0; + break; + } + if ( iref < b_iref + b->core.pos ) + iref = b_iref + b->core.pos; iref++; - if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + + if ( a_iref+a->core.pos != b_iref+b->core.pos ) + // only CMATCH positions, don't know what to do with indels + continue; if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) - return -1; // Fell off end of sequence, bad CIGAR? + // Fell off end of sequence, bad CIGAR? + return -1; - if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) - { - #if DBG - fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]); - #endif - // we are very confident about this base + // We're finally at the same ref base in both a and b. + // Check if the bases match (confident) or mismatch + // (not so confident). + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { + // We are very confident about this base. Use sum of quals int qual = a_qual[a_iseq] + b_qual[b_iseq]; - a_qual[a_iseq] = qual>200 ? 200 : qual; - b_qual[b_iseq] = 0; - } - else - { - if ( a_qual[a_iseq] >= b_qual[b_iseq] ) - { - #if DBG - fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)])); - #endif - a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch + a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); + b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; + } else { + // Not so confident about anymore given the mismatch. + // Reduce qual for lowest quality base. + if ( a_qual[a_iseq] > b_qual[b_iseq] ) { + // A highest qual base; keep + a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; b_qual[b_iseq] = 0; - } - else - { - #if DBG - fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]); - #endif + } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { + // B highest qual base; keep b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; a_qual[a_iseq] = 0; + } else { + // Both equal, so pick randomly + a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; + b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; } } } - #if DBG - fprintf(stderr,"\n"); - #endif + return err; }