Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up of VCF parsing / writing with FORMAT fields. #1081

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ typedef struct {
float qual; // QUAL
uint32_t n_info:16, n_allele:16;
uint32_t n_fmt:8, n_sample:24;
kstring_t shared, indiv;
kstring_t shared;
kstring_t indiv; // Per sample data. unpacked & VCF_UN_FMT => VCF / BCF
bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
Expand Down Expand Up @@ -389,6 +390,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
#define BCF_UN_FMT 8 // unpack format and each sample
#define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT
#define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
#define VCF_UN_FMT 16 // indiv.s is VCF string
HTSLIB_EXPORT
int bcf_unpack(bcf1_t *b, int which);

Expand Down
2 changes: 1 addition & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ sub test_vcf_various
test_cmd($opts, %args, out => "formatcols.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatcols.vcf");
test_cmd($opts, %args, out => "noroundtrip-out.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf");
cmd => "$$opts{path}/test_view -b $$opts{path}/noroundtrip.vcf | $$opts{bin}/htsfile -c -");
test_cmd($opts, %args, out => "formatmissing-out.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf");
}
Expand Down
171 changes: 132 additions & 39 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N

#define BCF_IS_64BIT (1<<30)

static int vcf_parse_format_partial(bcf1_t *v);

static char *find_chrom_header_line(char *s)
{
Expand Down Expand Up @@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
{
if (rec->unpacked & VCF_UN_FMT) {
if (vcf_parse_format_partial(rec) < 0)
return -1;
}

if ( !hdr->keep_samples ) return 0;
if ( !bcf_hdr_nsamples(hdr) )
{
Expand Down Expand Up @@ -1734,16 +1740,23 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
if ( h->dirty ) {
if (bcf_hdr_sync(h) < 0) return -1;
}

if ( hfp->format.format == vcf || hfp->format.format == text_format )
return vcf_write(hfp,h,v);

if (v->unpacked & VCF_UN_FMT) {
// slow, but round trip test
if (vcf_parse_format_partial(v) < 0)
return -1;
}

if ( bcf_hdr_nsamples(h)!=v->n_sample )
{
hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
return -1;
}

if ( hfp->format.format == vcf || hfp->format.format == text_format )
return vcf_write(hfp,h,v);

if ( v->errcode )
{
// vcf_parse1() encountered a new contig or tag, undeclared in the
Expand Down Expand Up @@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
return 0;
}

// The indiv.s kstring is the textual VCF representation for FORMAT
// column and all the subsequent SAMPLE columns.
//
// We also need the header, and this cannot be passed in as bcf_unpack calls
// this and it doesn't have the header available.
// So we also cache a pointer to the header in the first bytes of the
// kstring too.
//
// This packing ensures the data is still in kstring format and amenable
// to freeing / reuse. I.e.:
//
// s.s p q s.l s.m
// | | | | |
// <HDR_PTR><FORMAT><SAMPLE\tSAMPLE>.........
//
// Returns 0 on success,
// <0 on failure.
static int vcf_parse_format_partial(bcf1_t *v) {
if (!(v->unpacked & VCF_UN_FMT))
return 0;
kstring_t s = v->indiv;
const bcf_hdr_t *h = *(const bcf_hdr_t **)s.s;
char *p = s.s + sizeof(const bcf_hdr_t *);
char *q = p + strlen(p);

v->indiv.s = NULL;
v->indiv.l = v->indiv.m = 0;

int ret;
if ((ret = vcf_parse_format(&s, h, v, p, q) == 0)) {
v->unpacked &= ~VCF_UN_FMT;
free(s.s);
return ret;
} else {
v->indiv = s;
return ret;
}
}

static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
// been already printed, but will enable tools like vcfcheck to proceed.
Expand Down Expand Up @@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
}
if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
} else if (i == 8) {// FORMAT
return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
// Consider complete copy of s, obtained via ks_release,
// and cache of p/q pointers. This is then a generalised
// parse delay that works for any max_unpack value.

kstring_t *iv = &v->indiv;
ks_clear(iv);
if (kputsn((char *)&h, sizeof(&h), iv) < 0 ||
kputsn(p, s->s + s->l - p, iv) < 0)
goto err;

v->unpacked |= VCF_UN_FMT;

// We can't accurately judge n_sample and some things may check
// this without doing an explicit bcf_unpack call, but we
// set it to bcf_hdr_nsamples(h) as a starting point.
// (vcf_parse_format validates this and it's an error if it
// mismatches, so this is initial value is incorrect if the
// data itself is incorrect, and we'll spot that on an explict
// bcf_unpack call.)
v->n_sample = bcf_hdr_nsamples(h);

return 0;
//return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
}
}

Expand Down Expand Up @@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which)
ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
b->unpacked |= BCF_UN_INFO;
}
if ((which & BCF_UN_FMT) && (b->unpacked & VCF_UN_FMT)) {
if (vcf_parse_format_partial(b) < 0)
return -1;
}
if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
ptr = (uint8_t*)b->indiv.s;
hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
Expand All @@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which)
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
{
int i;
bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
bcf_unpack((bcf1_t*)v, (v->unpacked & VCF_UN_FMT)
? BCF_UN_SHR : BCF_UN_ALL);
kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
kputc('\t', s); kputll(v->pos + 1, s); // POS
kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
Expand Down Expand Up @@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
if ( first ) kputc('.', s);
} else kputc('.', s);
// FORMAT and individual information
if (v->n_sample)
{
int i,j;
if ( v->n_fmt)
if (v->unpacked & VCF_UN_FMT) {
size_t l = strlen(v->indiv.s + sizeof(bcf_hdr_t *));
kputc('\t', s);
kputsn(v->indiv.s + sizeof(bcf_hdr_t *), l, s);
kputc('\t', s);
kputsn(v->indiv.s + sizeof(bcf_hdr_t *) + l+1,
v->indiv.l - (sizeof(bcf_hdr_t *) + l+1), s);
} else {
if (v->n_sample)
{
int gt_i = -1;
bcf_fmt_t *fmt = v->d.fmt;
int first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
if ( !fmt[i].p ) continue;
kputc(!first ? ':' : '\t', s); first = 0;
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
{
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
abort();
}
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
}
if ( first ) kputs("\t.", s);
for (j = 0; j < v->n_sample; ++j) {
kputc('\t', s);
first = 1;
int i,j;
if ( v->n_fmt)
{
int gt_i = -1;
bcf_fmt_t *fmt = v->d.fmt;
int first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
bcf_fmt_t *f = &fmt[i];
if ( !f->p ) continue;
if (!first) kputc(':', s);
first = 0;
if (gt_i == i)
bcf_format_gt(f,j,s);
else
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
if ( !fmt[i].p ) continue;
kputc(!first ? ':' : '\t', s); first = 0;
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
{
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
abort();
}
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
}
if ( first ) kputs("\t.", s);
for (j = 0; j < v->n_sample; ++j) {
kputc('\t', s);
first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
bcf_fmt_t *f = &fmt[i];
if ( !f->p ) continue;
if (!first) kputc(':', s);
first = 0;
if (gt_i == i)
bcf_format_gt(f,j,s);
else
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
}
if ( first ) kputc('.', s);
}
if ( first ) kputc('.', s);
}
else
for (j=0; j<=v->n_sample; j++)
kputs("\t.", s);
}
else
for (j=0; j<=v->n_sample; j++)
kputs("\t.", s);
}
kputc('\n', s);
return 0;
Expand Down Expand Up @@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)

int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
{
if (v->unpacked & VCF_UN_FMT) {
if (vcf_parse_format_partial(v) < 0)
return -1;
}

kstring_t ind;
ind.s = 0; ind.l = ind.m = 0;
if (n) {
Expand Down