Skip to content

Commit

Permalink
update to work with vcf 4.4 prefixed phasing info
Browse files Browse the repository at this point in the history
  • Loading branch information
vasudeva8 committed Nov 26, 2024
1 parent a662866 commit 5177190
Show file tree
Hide file tree
Showing 5 changed files with 203 additions and 4 deletions.
108 changes: 108 additions & 0 deletions htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1501,6 +1501,34 @@ static inline int bcf_float_is_vector_end(float f)
return u.i==bcf_float_vector_end ? 1 : 0;
}

typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version;
/**
* bcf_get_version - get the version as bcf_version enumeration
* @param hdr - bcf header, to get version
* @param ipver - pointer to return version
* Returns 0 on success and -1 on failure
*/
static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver)
{
const char *version = NULL;

if (!hdr || !ver) {
return -1;
}

version = bcf_hdr_get_version(hdr);
if (!strcmp("VCFv4.1", version)) {
*ver = v41;
} else if (!strcmp("VCFv4.2", version)) {
*ver = v42;
} else if (!strcmp("VCFv4.3", version)) {
*ver = v43;
} else {
*ver = v44;
}
return 0;
}

static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
{
uint32_t e = 0;
Expand Down Expand Up @@ -1528,6 +1556,86 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
return e == 0 ? 0 : -1;
}

/**
* bcf_format_gt1 - formats GT information on a string
* @param hdr - bcf header, to get version
* @param fmt - pointer to bcf format data
* @param isample - position of interested sample in data
* @param str - pointer to output string
* Returns 0 on success and -1 on failure
* This method is extended from bcf_format_gt to output phasing information
* in accordance with v4.4 format, which supports explicit / prefixed phasing
* for 1st allele.
* Explicit / prefixed phasing for 1st allele is used only when it is a must to
* correctly express phasing.
*/
static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
{
uint32_t e = 0;
bcf_version ver = v42;
int ploidy = 1, anyunphased = 0;
int32_t val0 = 0;
kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE;

if (bcf_get_version(hdr, &ver)) {
hts_log_error("Failed to get version information");
return -1;
}
#define BRANCH(type_t, convert, missing, vector_end) { \
uint8_t *ptr = fmt->p + isample*fmt->size; \
int i; \
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
{ \
type_t val = convert(ptr); \
if ( val == vector_end ) break; \
if (!i) { val0 = val; } \
if (i) { \
e |= kputc("/|"[val & 1], &tmp1) < 0; \
anyunphased |= !(val & 1); \
} \
if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \
else e |= kputw((val >> 1) - 1, &tmp1) < 0; \
} \
if (i == 0) e |= kputc('.', &tmp1) < 0; \
ploidy = i; \
}
switch (fmt->type) {
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break;
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
}
#undef BRANCH

if (ver >= v44) { //output which supports prefixed phasing
/* update 1st allele's phasing if required and append rest to it.
use prefixed phasing only when it is a must. i.e. without which the
inferred value will be incorrect */
if (val0 & 1) {
/* 1st one is phased, if ploidy is > 1 and an unphased allele exists
need to specify explicitly */
e |= (ploidy > 1 && anyunphased) ?
(kputc('|', &tmp2) < 0) :
0;
} else {
/* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
ploidy > 1 and no other unphased allele exist, need to specify
explicitly */
e |= ((ploidy <= 1) || (ploidy > 1 && !anyunphased)) ?
(kputc('/', &tmp2) < 0) :
0;
}
e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one
ks_free(&tmp1);
tmp1 = tmp2;
}
//updated v44 string or <v44 without any update
e |= kputsn(tmp1.s, tmp1.l, str) < 0;
ks_free(&tmp1);
return e == 0 ? 0 : -1;
}

static inline int bcf_enc_size(kstring_t *s, int size, int type)
{
// Most common case is first
Expand Down
10 changes: 10 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
run_test('test_bcf2vcf',$opts);
run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out');
run_test('test_vcf_various',$opts);
run_test('test_vcf_44', $opts);
run_test('test_bcf_sr_sort',$opts);
run_test('test_bcf_sr_no_index',$opts);
run_test('test_bcf_sr_range', $opts);
Expand Down Expand Up @@ -1159,6 +1160,15 @@ sub test_vcf_various
cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2");
}

sub test_vcf_44
{
my ($opts, %args) = @_;

# vcf4.4 with implicit and explicit phasing info combinations
test_cmd($opts, %args, out => "vcf44_1.expected",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf");
}

sub write_multiblock_bgzf {
my ($name, $frags) = @_;

Expand Down
27 changes: 27 additions & 0 deletions test/vcf44_1.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
##fileformat=VCFv4.4
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1,length=1000>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##failue="test file on explicit and implicit phasing markers in 4.4"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
1 61482 rs56992752 T A 100 PASS . GT /0 /1
1 61483 rs56992752 T A 100 PASS . GT 0 1
1 61484 rs56992752 T A 100 PASS . GT 0 /1
1 61485 rs56992752 T A 100 PASS . GT 0 1
1 61486 rs56992752 T A 100 PASS . GT 0 1
1 61487 rs56992752 T A 100 PASS . GT 0 1
1 61488 rs56992752 T A 100 PASS . GT 0 /1
1 61489 rs56992752 T A 100 PASS . GT /0 1
1 61490 rs56992752 T A 100 PASS . GT /0 1
1 61491 rs56992752 T A 100 PASS . GT /0 /1
1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61496 rs56992752 T A 100 PASS . GT . .
1 61497 rs56992752 T A 100 PASS . GT ./1 .|1
1 61498 rs56992752 T A 100 PASS . GT 1/. 1|.
1 61499 rs56992752 T A 100 PASS . GT ./. .|.
26 changes: 26 additions & 0 deletions test/vcf44_1.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##fileformat=VCFv4.4
##contig=<ID=1,length=1000>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##failue="test file on explicit and implicit phasing markers in 4.4"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
1 61482 rs56992752 T A 100 PASS . GT /0 /1
1 61483 rs56992752 T A 100 PASS . GT 0 1
1 61484 rs56992752 T A 100 PASS . GT 0 /1
1 61485 rs56992752 T A 100 PASS . GT 0 |1
1 61486 rs56992752 T A 100 PASS . GT |0 1
1 61487 rs56992752 T A 100 PASS . GT |0 |1
1 61488 rs56992752 T A 100 PASS . GT |0 /1
1 61489 rs56992752 T A 100 PASS . GT /0 1
1 61490 rs56992752 T A 100 PASS . GT /0 |1
1 61491 rs56992752 T A 100 PASS . GT /0 /1
1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0
1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61496 rs56992752 T A 100 PASS . GT . .
1 61497 rs56992752 T A 100 PASS . GT ./1 .|1
1 61498 rs56992752 T A 100 PASS . GT 1/. 1|.
1 61499 rs56992752 T A 100 PASS . GT ./. .|.
36 changes: 32 additions & 4 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -3061,8 +3061,14 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
const char *t = q + 1;
int m = 0; // m: sample id
const int nsamples = bcf_hdr_nsamples(h);

bcf_version ver = v42;
const char *end = s->s + s->l;

if (bcf_get_version(h, &ver)) {
hts_log_error("Failed to get version information");
return -1;
}

while ( t<end )
{
// can we skip some samples?
Expand Down Expand Up @@ -3099,13 +3105,25 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
int l;
if (z->is_gt) {
// Genotypes.
// <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
//([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
int32_t is_phased = 0;
uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
uint32_t unreadable = 0;
uint32_t max = 0;
int overflow = 0;
int overflow = 0, ploidy = 0, anyunphased = 0, \
phasingprfx = 0;

/* with prefixed phasing, it is explicitly given for 1st one
with non-prefixed, set based on ploidy and phasing of other
alleles. */
if (ver >= v44 && (*t == '|' || *t == '/')) {
// cache prefix and phasing status
is_phased = *t++ == '|';
phasingprfx = 1;
}

for (l = 0;; ++t) {
ploidy++;
if (*t == '.') {
++t, x[l++] = is_phased;
} else {
Expand All @@ -3125,9 +3143,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
if (max < val) max = val;
x[l++] = (val + 1) << 1 | is_phased;
}
anyunphased |= (ploidy != 1) && !is_phased;
is_phased = (*t == '|');
if (*t != '|' && *t != '/') break;
}
if (ver >= v44 && !phasingprfx) {
/* no explicit phasing for 1st allele, set based on
other alleles and ploidy */
if (ploidy == 1) { //implicitly phased
x[0]|= 1;
} else { //set by other unphased alleles
x[0] |= anyunphased ? 0 : 1;
}
}
// Possibly check max against v->n_allele instead?
if (overflow || max > (INT32_MAX >> 1) - 1) {
hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
Expand Down Expand Up @@ -4187,7 +4215,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
if (!first) kputc_(':', s);
first = 0;
if (gt_i == i) {
bcf_format_gt(f,j,s);
bcf_format_gt1(h, f,j,s);
break;
}
else if (f->n == 1)
Expand Down

0 comments on commit 5177190

Please sign in to comment.