From 662227a8496fee91270ebc0445f0aaddf1724fbd Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 21 Apr 2021 12:21:22 +0100
Subject: [PATCH] Make mpileup's overlap removal choose a random sequence.

Currently it always chooses the second sequence (except for the
circumstance of differing base calls).  This is essentially random
strand and random coordinate in most library strategies, but some
targetted sequencing methods have a very strong strand bias (first is
+ strand, second is - strand) or positional bias (eg PCR amplicons).
Given SNPs near the end of sequences can give rise to poor BAQ scores,
both position and strand bias are detrimental.

This change makes it select either read 'a' or 'b' based on a hash of
the read name.  Unlike using a traditional random number generator,
this gives it consistent behaviour regardless of how many sequences
have gone before.

An example from SynDip region 1:185M-200M:

No overlap removal:
SNP          Q>0 /   Filtered
SNP   TP   18830 /     18803
SNP   FP     264 /       238
SNP   GT      56 /        53
SNP   FN     459 /       486

InDel TP    2788 /      2697
InDel FP    1022 /        86
InDel GT     353 /       345
InDel FN     596 /       687

Old removal strategy:
SNP          Q>0 /   Filtered
SNP   TP   18841 /     18813
SNP   FP     270 /       243
SNP   GT      56 /        54
SNP   FN     448 /       476

InDel TP    2754 /      2663
InDel FP     985 /        83
InDel GT     413 /       404
InDel FN     630 /       721

This PR:
SNP          Q>0 /   Filtered
SNP   TP   18841 /     18814
SNP   FP     272 /       242
SNP   GT      55 /        53
SNP   FN     448 /       475

InDel TP    2765 /      2679
InDel FP     996 /        85
InDel GT     382 /       375
InDel FN     619 /       705

The CPU cost on bcftools mpileup | bcftools call between the latter
two tests was 0.4% (which may also just be random fluctuation).
Vs the old removal system, this is a marginal improvement for SNPs
and, oddly, a significant improvement to Indels.  (It's still behind
no overlap removal for indels, but I'm unsure on the veracity of
small indels in that truth set).

Fixes samtools/bcftools#1459
---
 htscodecs |   2 +-
 sam.c     | 121 ++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 77 insertions(+), 46 deletions(-)

diff --git a/htscodecs b/htscodecs
index d7e357946..30bc9fdca 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84
+Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5
diff --git a/sam.c b/sam.c
index 8bda92384..4332cfb02 100644
--- a/sam.c
+++ b/sam.c
@@ -4840,10 +4840,18 @@ static inline int cigar_iref2iseq_next(const uint32_t **cigar,
     return -1;
 }
 
+// Given overlapping read 'a' (left) and 'b' (right) on the same
+// template, adjust quality values to zero for either a or b.
+// Note versions 1.12 and earlier always removed quality from 'b' for
+// matching bases.  Now we select a or b semi-randomly based on name hash.
+// Returns 0 on success,
+//        -1 on failure
 static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
 {
-    const uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
-    const uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
+    const uint32_t *a_cigar = bam_get_cigar(a),
+        *a_cigar_max = a_cigar + a->core.n_cigar;
+    const uint32_t *b_cigar = bam_get_cigar(b),
+        *b_cigar_max = b_cigar + b->core.n_cigar;
     hts_pos_t a_icig = 0, a_iseq = 0;
     hts_pos_t b_icig = 0, b_iseq = 0;
     uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
@@ -4852,69 +4860,92 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
     hts_pos_t iref   = b->core.pos;
     hts_pos_t a_iref = iref - a->core.pos;
     hts_pos_t b_iref = iref - b->core.pos;
-    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
-    if ( a_ret<0 ) return a_ret<-1 ? -1:0;  // no overlap or error
-    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
-    if ( b_ret<0 ) return b_ret<-1 ? -1:0;  // no overlap or error
 
-    #if DBG
-        fprintf(stderr,"tweak %s  n_cigar=%d %d  .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
-            a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
-    #endif
+    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
+                                    &a_icig, &a_iseq, &a_iref);
+    if ( a_ret<0 )
+        // no overlap or error
+        return a_ret<-1 ? -1:0;
+
+    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
+                                    &b_icig, &b_iseq, &b_iref);
+    if ( b_ret<0 )
+        // no overlap or error
+        return b_ret<-1 ? -1:0;
+
+    // Determine which seq is the one getting modified qualities.
+    uint8_t amul, bmul;
+    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
+        amul = 1;
+        bmul = 0;
+    } else {
+        amul = 0;
+        bmul = 1;
+    }
 
+    // Loop over the overlapping region nulling qualities in either
+    // seq a or b.
     int err = 0;
     while ( 1 )
     {
-        // Increment reference position
+        // Step to next matching reference position in a and b
         while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
-            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
-        if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; }   // done
-        if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
+            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
+                                         &a_icig, &a_iseq, &a_iref);
+        if ( a_ret<0 ) { // done
+            err = a_ret<-1?-1:0;
+            break;
+        }
+        if ( iref < a_iref + a->core.pos )
+            iref = a_iref + a->core.pos;
 
         while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
-            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
-        if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; }  // done
-        if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
+            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
+                                         &b_iseq, &b_iref);
+        if ( b_ret<0 ) { // done
+            err = b_ret<-1?-1:0;
+            break;
+        }
+        if ( iref < b_iref + b->core.pos )
+            iref = b_iref + b->core.pos;
 
         iref++;
-        if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue;   // only CMATCH positions, don't know what to do with indels
+
+        if ( a_iref+a->core.pos != b_iref+b->core.pos )
+            // only CMATCH positions, don't know what to do with indels
+            continue;
 
         if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
-            return -1;  // Fell off end of sequence, bad CIGAR?
+            // Fell off end of sequence, bad CIGAR?
+            return -1;
 
-        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
-        {
-            #if DBG
-                fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
-            #endif
-            // we are very confident about this base
+        // We're finally at the same ref base in both a and b.
+        // Check if the bases match (confident) or mismatch
+        // (not so confident).
+        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
+            // We are very confident about this base.  Use sum of quals
             int qual = a_qual[a_iseq] + b_qual[b_iseq];
-            a_qual[a_iseq] = qual>200 ? 200 : qual;
-            b_qual[b_iseq] = 0;
-        }
-        else
-        {
-            if ( a_qual[a_iseq] >= b_qual[b_iseq] )
-            {
-                #if DBG
-                    fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
-                #endif
-                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];  // not so confident about a_qual anymore given the mismatch
+            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
+            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
+        } else {
+            // Not so confident about anymore given the mismatch.
+            // Reduce qual for lowest quality base.
+            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
+                // A highest qual base; keep
+                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
                 b_qual[b_iseq] = 0;
-            }
-            else
-            {
-                #if DBG
-                    fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
-                #endif
+            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
+                // B highest qual base; keep
                 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
                 a_qual[a_iseq] = 0;
+            } else {
+                // Both equal, so pick randomly
+                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
+                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
             }
         }
     }
-    #if DBG
-        fprintf(stderr,"\n");
-    #endif
+
     return err;
 }