From a482b9121555a1d80b81fea2f0c191321ea7ce0c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14 May 2020 17:35:31 +0100 Subject: [PATCH] Speed up of VCF parsing / writing with FORMAT fields. If we're reading and writing VCF without intervening change to BCF or use of bcf_unpack with BCF_UN_FMT (so no querying and/or modification of the per-sample FORMAT columns), then we keep the data in string form instead of binary bcf form. The bcf binary form is held in kstring b->indiv. We also hold the string format there too, but use b->unpacked & VCF_UN_FMT as a bit-flag to indicate whether this is encoding as VCF or BCF. The benefit is reading is considerably faster for many-sample files (~4 fold for a 2.5k sample 1000 genomes test) and similarly (~3 fold) for a read/write loop to VCF as we don't have to re-encode the data from uBCF to VCF either. If we're converting to BCF, or doing a query, we still need the call to vcf_parse_format as before, which reformats b->indiv from VCF to BCF internally. This is done on the fly. There is some nastiness to how this works sadly as bcf_unpack just takes the bcf record and has no access to the header, but this is needed for parsing. Therefore we stash a copy of it in the kstring along with the text itself, but this assumes that no one is doing header oddities such as reading the file, freeing the header, and then attempting to unpack the contents. It feels like a safe assumption, although it's not something that has ever been explicitly spelt out before. Note this causes a few bcftools test failures due to fixups that happen when doing a full conversion to BCF. A similar case happens here in test/noroundtrip.vcf, which has an underspecified FORMAT of "GT:GT 0/1". The test checks it comes out as "GT:GT 2,4:.". This won't happen with pure VCF as we're not parsing that data, but the test fix is to force conversion to BCF and back again to test that code path. A similar fix would need to be made to the failing bcftools tests. --- htslib/vcf.h | 4 +- test/test.pl | 2 +- vcf.c | 171 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 136 insertions(+), 41 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 45a7adc16..92f0c2bd4 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -216,7 +216,8 @@ typedef struct { float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; - kstring_t shared, indiv; + kstring_t shared; + kstring_t indiv; // Per sample data. unpacked & VCF_UN_FMT => VCF / BCF bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work @@ -389,6 +390,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define BCF_UN_FMT 8 // unpack format and each sample #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything + #define VCF_UN_FMT 16 // indiv.s is VCF string HTSLIB_EXPORT int bcf_unpack(bcf1_t *b, int which); diff --git a/test/test.pl b/test/test.pl index 2c4a1b1f4..601025ceb 100755 --- a/test/test.pl +++ b/test/test.pl @@ -822,7 +822,7 @@ sub test_vcf_various test_cmd($opts, %args, out => "formatcols.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatcols.vcf"); test_cmd($opts, %args, out => "noroundtrip-out.vcf", - cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf"); + cmd => "$$opts{path}/test_view -b $$opts{path}/noroundtrip.vcf | $$opts{bin}/htsfile -c -"); test_cmd($opts, %args, out => "formatmissing-out.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); } diff --git a/vcf.c b/vcf.c index 62f7741aa..66a1f26d4 100644 --- a/vcf.c +++ b/vcf.c @@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_IS_64BIT (1<<30) +static int vcf_parse_format_partial(bcf1_t *v); static char *find_chrom_header_line(char *s) { @@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) { + if (rec->unpacked & VCF_UN_FMT) { + if (vcf_parse_format_partial(rec) < 0) + return -1; + } + if ( !hdr->keep_samples ) return 0; if ( !bcf_hdr_nsamples(hdr) ) { @@ -1734,6 +1740,16 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) if ( h->dirty ) { if (bcf_hdr_sync(h) < 0) return -1; } + + if ( hfp->format.format == vcf || hfp->format.format == text_format ) + return vcf_write(hfp,h,v); + + if (v->unpacked & VCF_UN_FMT) { + // slow, but round trip test + if (vcf_parse_format_partial(v) < 0) + return -1; + } + if ( bcf_hdr_nsamples(h)!=v->n_sample ) { hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", @@ -1741,9 +1757,6 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) return -1; } - if ( hfp->format.format == vcf || hfp->format.format == text_format ) - return vcf_write(hfp,h,v); - if ( v->errcode ) { // vcf_parse1() encountered a new contig or tag, undeclared in the @@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return 0; } +// The indiv.s kstring is the textual VCF representation for FORMAT +// column and all the subsequent SAMPLE columns. +// +// We also need the header, and this cannot be passed in as bcf_unpack calls +// this and it doesn't have the header available. +// So we also cache a pointer to the header in the first bytes of the +// kstring too. +// +// This packing ensures the data is still in kstring format and amenable +// to freeing / reuse. I.e.: +// +// s.s p q s.l s.m +// | | | | | +// ......... +// +// Returns 0 on success, +// <0 on failure. +static int vcf_parse_format_partial(bcf1_t *v) { + if (!(v->unpacked & VCF_UN_FMT)) + return 0; + kstring_t s = v->indiv; + const bcf_hdr_t *h = *(const bcf_hdr_t **)s.s; + char *p = s.s + sizeof(const bcf_hdr_t *); + char *q = p + strlen(p); + + v->indiv.s = NULL; + v->indiv.l = v->indiv.m = 0; + + int ret; + if ((ret = vcf_parse_format(&s, h, v, p, q) == 0)) { + v->unpacked &= ~VCF_UN_FMT; + free(s.s); + return ret; + } else { + v->indiv = s; + return ret; + } +} + static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has // been already printed, but will enable tools like vcfcheck to proceed. @@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; } else if (i == 8) {// FORMAT - return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; + // Consider complete copy of s, obtained via ks_release, + // and cache of p/q pointers. This is then a generalised + // parse delay that works for any max_unpack value. + + kstring_t *iv = &v->indiv; + ks_clear(iv); + if (kputsn((char *)&h, sizeof(&h), iv) < 0 || + kputsn(p, s->s + s->l - p, iv) < 0) + goto err; + + v->unpacked |= VCF_UN_FMT; + + // We can't accurately judge n_sample and some things may check + // this without doing an explicit bcf_unpack call, but we + // set it to bcf_hdr_nsamples(h) as a starting point. + // (vcf_parse_format validates this and it's an error if it + // mismatches, so this is initial value is incorrect if the + // data itself is incorrect, and we'll spot that on an explict + // bcf_unpack call.) + v->n_sample = bcf_hdr_nsamples(h); + + return 0; + //return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; } } @@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which) ptr = bcf_unpack_info_core1(ptr, &d->info[i]); b->unpacked |= BCF_UN_INFO; } + if ((which & BCF_UN_FMT) && (b->unpacked & VCF_UN_FMT)) { + if (vcf_parse_format_partial(b) < 0) + return -1; + } if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT ptr = (uint8_t*)b->indiv.s; hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt); @@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which) int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) { int i; - bcf_unpack((bcf1_t*)v, BCF_UN_ALL); + bcf_unpack((bcf1_t*)v, (v->unpacked & VCF_UN_FMT) + ? BCF_UN_SHR : BCF_UN_ALL); kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM kputc('\t', s); kputll(v->pos + 1, s); // POS kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID @@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if ( first ) kputc('.', s); } else kputc('.', s); // FORMAT and individual information - if (v->n_sample) - { - int i,j; - if ( v->n_fmt) + if (v->unpacked & VCF_UN_FMT) { + size_t l = strlen(v->indiv.s + sizeof(bcf_hdr_t *)); + kputc('\t', s); + kputsn(v->indiv.s + sizeof(bcf_hdr_t *), l, s); + kputc('\t', s); + kputsn(v->indiv.s + sizeof(bcf_hdr_t *) + l+1, + v->indiv.l - (sizeof(bcf_hdr_t *) + l+1), s); + } else { + if (v->n_sample) { - int gt_i = -1; - bcf_fmt_t *fmt = v->d.fmt; - int first = 1; - for (i = 0; i < (int)v->n_fmt; ++i) { - if ( !fmt[i].p ) continue; - kputc(!first ? ':' : '\t', s); first = 0; - if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) - { - hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1); - abort(); - } - kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); - if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; - } - if ( first ) kputs("\t.", s); - for (j = 0; j < v->n_sample; ++j) { - kputc('\t', s); - first = 1; + int i,j; + if ( v->n_fmt) + { + int gt_i = -1; + bcf_fmt_t *fmt = v->d.fmt; + int first = 1; for (i = 0; i < (int)v->n_fmt; ++i) { - bcf_fmt_t *f = &fmt[i]; - if ( !f->p ) continue; - if (!first) kputc(':', s); - first = 0; - if (gt_i == i) - bcf_format_gt(f,j,s); - else - bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); + if ( !fmt[i].p ) continue; + kputc(!first ? ':' : '\t', s); first = 0; + if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) + { + hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1); + abort(); + } + kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); + if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; + } + if ( first ) kputs("\t.", s); + for (j = 0; j < v->n_sample; ++j) { + kputc('\t', s); + first = 1; + for (i = 0; i < (int)v->n_fmt; ++i) { + bcf_fmt_t *f = &fmt[i]; + if ( !f->p ) continue; + if (!first) kputc(':', s); + first = 0; + if (gt_i == i) + bcf_format_gt(f,j,s); + else + bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); + } + if ( first ) kputc('.', s); } - if ( first ) kputc('.', s); } + else + for (j=0; j<=v->n_sample; j++) + kputs("\t.", s); } - else - for (j=0; j<=v->n_sample; j++) - kputs("\t.", s); } kputc('\n', s); return 0; @@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) { + if (v->unpacked & VCF_UN_FMT) { + if (vcf_parse_format_partial(v) < 0) + return -1; + } + kstring_t ind; ind.s = 0; ind.l = ind.m = 0; if (n) {