Skip to content

Commit

Permalink
Add a soft-clip MWU test.
Browse files Browse the repository at this point in the history
This looks for REF / ALT bias in the numbers of soft-clipped bases
near variants.
  • Loading branch information
jkbonfield committed Jan 28, 2021
1 parent 84f4f5f commit df273e4
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 54 deletions.
78 changes: 58 additions & 20 deletions bam2bcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,12 @@ void bcf_call_destroy(bcf_callaux_t *bca)
}

// position in the sequence with respect to the aligned part of the read
static int get_position(const bam_pileup1_t *p, int *len)
static int get_position(const bam_pileup1_t *p, int *len,
int *sc_len, int *sc_dist)
{
int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
int sc_left = 0, sc_right = 0;
int sc_left_dist = -1, sc_right_dist = -1;
for (icig=0; icig<p->b->core.n_cigar; icig++)
{
int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
Expand All @@ -98,6 +101,13 @@ static int get_position(const bam_pileup1_t *p, int *len)
}
if ( cig==BAM_CSOFT_CLIP )
{
if (n_tot_bases) {
sc_right += ncig;
sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
} else {
sc_left += ncig;
sc_left_dist = p->qpos+1 - sc_left;
}
iread += ncig;
if ( iread<=p->qpos ) edist -= ncig;
continue;
Expand All @@ -110,6 +120,20 @@ static int get_position(const bam_pileup1_t *p, int *len)
assert(0);
}
*len = n_tot_bases;

// Distance to nearest soft-clips and length of that clip.
if (sc_left_dist >= 0) {
if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
*sc_len = sc_left;
*sc_dist = sc_left_dist;
}
} else if (sc_right_dist >= 0) {
*sc_len = sc_right;
*sc_dist = sc_right_dist;
} else {
*sc_len = 0;
*sc_dist = 0;
}
return edist;
}

Expand All @@ -127,6 +151,8 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
memset(bca->ref_scl, 0, 100*sizeof(int));
memset(bca->alt_scl, 0, 100*sizeof(int));
}

/*
Expand Down Expand Up @@ -229,28 +255,39 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
// collect for bias tests
if ( baseQ > 59 ) baseQ = 59;
if ( mapQ > 59 ) mapQ = 59;
int len, epos = 0;
if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) )
int len, epos = 0, sc_len = 0, sc_dist = 0;
if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
{
int pos = get_position(p, &len);
int pos = get_position(p, &len, &sc_len, &sc_dist);
epos = (double)pos/(len+1) * bca->npos;

if (sc_len) {
sc_len = 20.0*sc_len / sc_dist;
if (sc_len > 99) sc_len = 99;
}
}

int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;

if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
else bca->fwd_mqs[imq]++;
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
else
bca->fwd_mqs[imq]++;

if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
bca->ref_scl[sc_len]++;
}
else
{
bca->alt_pos[epos]++;
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
bca->alt_scl[sc_len]++;
}
}
r->ori_depth = ori_depth;
Expand Down Expand Up @@ -532,42 +569,37 @@ double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
if (na+nb <= 1)
return HUGE_VAL;

double U, m, sd;
double U, m;
U = l + e*0.5; // Mann-Whitney U score
m = na*nb / 2;
m = na*nb / 2.0;

// With ties adjustment
double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
// var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
if (var2 < 0)
if (var2 <= 0)
return HUGE_VAL;

if (do_Z) {
// S.D. normalised Z-score
//Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
return var2 ? (U - m) / sqrt(var2) : 0;
return (U - m) / sqrt(var2);
}

// Else U score, which can be asymmetric for some data types.
if (left_only && U > m)
return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.

if (var2 <= 0)
return HUGE_VAL;

if ( na==2 || nb==2 ) {
// Linear approximation
return U > m ? (2.0*m-U)/m : U/m;
}

if (na >= 8 || nb >= 8) {
// Normal approximation, very good for na>=8 && nb>=8 and
// reasonable if na<8 or nb<8
return exp(-0.5*(U-m)*(U-m)/var2);
}

// Exact calculation
return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
if (na==1 || nb == 1)
return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
else
return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
}

static inline double logsumexp2(double a, double b)
Expand Down Expand Up @@ -827,6 +859,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,1);
call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,1);
call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,1);
if ( bca->fmt_flag & B2B_INFO_SCB )
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1);
#else
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
Expand All @@ -846,7 +880,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
#endif

if ( bca->fmt_flag & B2B_INFO_VDB )
if ( bca->fmt_flag & B2B_INFO_VDB )
call->vdb = calc_vdb(bca->alt_pos, bca->npos);

return 0;
Expand Down Expand Up @@ -939,13 +973,17 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
if ( bc->mwu_sc != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
#else
if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
#endif

if ( bc->strand_bias != HUGE_VAL )
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);

#if CDF_MWU_TESTS
if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
Expand Down
7 changes: 5 additions & 2 deletions bam2bcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ DEALINGS IN THE SOFTWARE. */

// Use calc_mwu_biasZ as a sd-normalised score centred on 0 instead of the
// old method with values in the range 0 to 1.
//#define MWU_ZSCORE
#define MWU_ZSCORE

#define B2B_INDEL_NULL 10000

Expand All @@ -64,6 +64,7 @@ DEALINGS IN THE SOFTWARE. */
#define B2B_INFO_VDB (1<<14)
#define B2B_INFO_RPB (1<<15)
#define B2B_FMT_QS (1<<16)
#define B2B_INFO_SCB (1<<17)

#define B2B_MAX_ALLELES 5

Expand All @@ -90,6 +91,7 @@ typedef struct __bcf_callaux_t {
float max_frac; // for collecting indel candidates
int per_sample_flt; // indel filtering strategy
int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
int ref_scl[100], alt_scl[100]; // soft-clip length bias
// for internal uses
int max_bases;
int indel_types[4]; // indel lengths
Expand Down Expand Up @@ -135,11 +137,12 @@ typedef struct {
int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
uint8_t *fmt_arr;
float vdb; // variant distance bias
float mwu_pos, mwu_mq, mwu_bq, mwu_mqs;
float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
#if CDF_MWU_TESTS
float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
#endif
float seg_bias;
float strand_bias; // phred-scaled fisher-exact test
kstring_t tmp;
} bcf_call_t;

Expand Down
38 changes: 6 additions & 32 deletions mpileup.c
Original file line number Diff line number Diff line change
Expand Up @@ -394,16 +394,6 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc
*
* NB: this may sadly realign after we've already used the data. Hmm...
*/
static void bump_mq(int n, int *n_plp, const bam_pileup1_t **plp, int mx) {
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n_plp[i]; j++) {
bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
bam1_t *b = p->b;
b->core.qual += b->core.qual/2 < mx ? b->core.qual/2 : mx;
}
}
}
static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
char *ref, int ref_len, int pos) {
int i, j, has_indel = 0, has_clip = 0, nt = 0;
Expand Down Expand Up @@ -444,7 +434,6 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
// Don't bother realigning the most minor of cases.
// No indels, or low indel rate combined with low soft-clipping rate
if (has_indel == 0 || ((has_indel < 0.1*nt || has_indel == 1) && has_clip < 0.2*nt)) {
//bump_mq(n, n_plp, plp, 2);
return;
}

Expand Down Expand Up @@ -502,18 +491,15 @@ if (nt >= 20) { // Can be more liberal with BAQ on deeper data
}
}
if (nsz == 1) {
//bump_mq(n, n_plp, plp, 5);
return;
}
if (nsz == 2) {
double d = (n1+.01)/(n1+n2+.01);
if (d > 0.4 && d < 0.6) {
//bump_mq(n, n_plp, plp, 5);
return;
}
}
if (nsz > 2 && (double)indel_sz[0]/nt > 0.95) {
//bump_mq(n, n_plp, plp, 5);
return;
}
}
Expand All @@ -537,9 +523,6 @@ if (nt >= 20) { // Can be more liberal with BAQ on deeper data
continue;
b->core.flag |= 32768;

static int n_pos = 0, n_realn = 0;
n_pos++;

// if (b->core.qual == 0)
// continue; // no need to do BAQ as already considered poor

Expand All @@ -555,14 +538,7 @@ if (nt >= 20) { // Can be more liberal with BAQ on deeper data
//
// This rescues some of the false negatives that are caused by
// systematic reduction in quality due to sample vs ref alignment.
int pstart = b->core.pos;
int pend = b->core.pos + bam_cigar2rlen(b->core.n_cigar,
bam_get_cigar(b))-1;
#define REALN_DIST 50
// if (pos - pstart >= REALN_DIST && pend - (pos+dlen) >= REALN_DIST)
// continue;
// fprintf(stderr, "pos=%d+%d pstart=%d pend=%d => L %d / R %d, clip=%d\n",
// pos, dlen, pstart, pend, pos-pstart, pend - (pos+dlen), has_clip);
uint32_t *cig = bam_get_cigar(b);
int ncig = b->core.n_cigar;

Expand Down Expand Up @@ -594,13 +570,6 @@ if (nt >= 20) { // Can be more liberal with BAQ on deeper data
// fixme, honour conf->flag & MPLP_REDO_BAQ
//fprintf(stderr, "Realn %s\n", bam_get_qname(b));
sam_prob_realn(b, ref, ref_len, 3);

n_realn++;
if (n_realn % 1000 == 0) {
// approx 75% of indels, about 30% of total reads
fprintf(stderr, "Realigned %d of %d (%f %%)\n",
n_realn, n_pos, 100.0*n_realn / n_pos);
}
}
}

Expand Down Expand Up @@ -845,13 +814,16 @@ static int mpileup(mplp_conf_t *conf)
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
if ( conf->fmt_flag&B2B_INFO_SCB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
#else
if ( conf->fmt_flag&B2B_INFO_RPB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#endif
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
#if CDF_MWU_TESTS
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
Expand Down Expand Up @@ -1112,6 +1084,7 @@ int parse_format_flag(const char *str)
else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
else
{
fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
Expand Down Expand Up @@ -1251,7 +1224,8 @@ int main_mpileup(int argc, char *argv[])
mplp.record_cmd_line = 1;
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag()
// the default to be changed in future, see also parse_format_flag()
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB;

static const struct option lopts[] =
{
Expand Down

0 comments on commit df273e4

Please sign in to comment.