Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update to work with vcf 4.4 prefixed phasing info #1861

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1501,6 +1501,34 @@ static inline int bcf_float_is_vector_end(float f)
return u.i==bcf_float_vector_end ? 1 : 0;
}

typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version;
/**
* bcf_get_version - get the version as bcf_version enumeration
* @param hdr - bcf header, to get version
* @param ipver - pointer to return version
* Returns 0 on success and -1 on failure
*/
static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver)
{
const char *version = NULL;

if (!hdr || !ver) {
return -1;
}

version = bcf_hdr_get_version(hdr);
if (!strcmp("VCFv4.1", version)) {
*ver = v41;
} else if (!strcmp("VCFv4.2", version)) {
*ver = v42;
} else if (!strcmp("VCFv4.3", version)) {
*ver = v43;
} else {
*ver = v44;
}
return 0;
}

static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
{
uint32_t e = 0;
Expand Down Expand Up @@ -1528,6 +1556,88 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
return e == 0 ? 0 : -1;
}

/**
* bcf_format_gt1 - formats GT information on a string
* @param hdr - bcf header, to get version
* @param fmt - pointer to bcf format data
* @param isample - position of interested sample in data
* @param str - pointer to output string
* Returns 0 on success and -1 on failure
* This method is extended from bcf_format_gt to output phasing information
* in accordance with v4.4 format, which supports explicit / prefixed phasing
* for 1st allele.
* Explicit / prefixed phasing for 1st allele is used only when it is a must to
* correctly express phasing.
*/
static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
Copy link
Member

@pd3 pd3 Nov 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The suffix 1, as used here, is confusing and somewhat conflicting with the convention used throughout the rest of the code. Can you rename so that the 'phase' keyword is included? Or, even better, create an extended function where various flags can be turned on, starting with PHASE_1ST_ALLELE, or something like that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the name should not be made more specific, and if possible made more generic!

An implementation which supports the newer format should be the 1st choice going forward.
Best would have been to update the current method itself but not possible as it would break the compatibility due to signature change.
Hence chose a name which is in sync with current method. Also we have a few others like index_load/index_build/ etc in same lines.

{
uint32_t e = 0;
bcf_version ver = v42;
int ploidy = 1, anyunphased = 0;
int32_t val0 = 0;
kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE;

if (bcf_get_version(hdr, &ver)) {
hts_log_error("Failed to get version information");
return -1;
}
#define BRANCH(type_t, convert, missing, vector_end) { \
uint8_t *ptr = fmt->p + isample*fmt->size; \
int i; \
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
{ \
type_t val = convert(ptr); \
if ( val == vector_end ) break; \
if (!i) { val0 = val; } \
if (i) { \
e |= kputc("/|"[val & 1], &tmp1) < 0; \
anyunphased |= !(val & 1); \
} \
if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \
else e |= kputw((val >> 1) - 1, &tmp1) < 0; \
} \
if (i == 0) e |= kputc('.', &tmp1) < 0; \
ploidy = i; \
}
switch (fmt->type) {
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break;
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
}
#undef BRANCH

if (ver >= v44) { //output which supports prefixed phasing
/* update 1st allele's phasing if required and append rest to it.
use prefixed phasing only when it is a must. i.e. without which the
inferred value will be incorrect */
if (val0 & 1) {
/* 1st one is phased, if ploidy is > 1 and an unphased allele exists
need to specify explicitly */
e |= (ploidy > 1 && anyunphased) ?
(kputc('|', &tmp2) < 0) :
(ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
(kputc('|', &tmp2) < 0) :
0);
} else {
/* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
ploidy > 1 and no other unphased allele exist, need to specify
explicitly */
e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
(kputc('/', &tmp2) < 0) :
0;
}
e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one
ks_free(&tmp1);
tmp1 = tmp2;
}
//updated v44 string or <v44 without any update
e |= kputsn(tmp1.s, tmp1.l, str) < 0;
ks_free(&tmp1);
return e == 0 ? 0 : -1;
}

static inline int bcf_enc_size(kstring_t *s, int size, int type)
{
// Most common case is first
Expand Down
10 changes: 10 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
run_test('test_bcf2vcf',$opts);
run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out');
run_test('test_vcf_various',$opts);
run_test('test_vcf_44', $opts);
run_test('test_bcf_sr_sort',$opts);
run_test('test_bcf_sr_no_index',$opts);
run_test('test_bcf_sr_range', $opts);
Expand Down Expand Up @@ -1159,6 +1160,15 @@ sub test_vcf_various
cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2");
}

sub test_vcf_44
{
my ($opts, %args) = @_;

# vcf4.4 with implicit and explicit phasing info combinations
test_cmd($opts, %args, out => "vcf44_1.expected",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf");
}

sub write_multiblock_bgzf {
my ($name, $frags) = @_;

Expand Down
35 changes: 35 additions & 0 deletions test/vcf44_1.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
##fileformat=VCFv4.4
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1,length=1000>
##reference=file://test
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##failue="test file on explicit and implicit phasing markers in 4.4"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
1 61482 rs56992752 T A 100 PASS . GT /0 /1
1 61483 rs56992752 T A 100 PASS . GT 0 1
1 61484 rs56992752 T A 100 PASS . GT 0 /1
1 61485 rs56992752 T A 100 PASS . GT 0 1
1 61486 rs56992752 T A 100 PASS . GT 0 1
1 61487 rs56992752 T A 100 PASS . GT 0 1
1 61488 rs56992752 T A 100 PASS . GT 0 /1
1 61489 rs56992752 T A 100 PASS . GT /0 1
1 61490 rs56992752 T A 100 PASS . GT /0 1
1 61491 rs56992752 T A 100 PASS . GT /0 /1
1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61496 rs56992752 T A 100 PASS . GT . .
1 61497 rs56992752 T A 100 PASS . GT . |.
1 61498 rs56992752 T A 100 PASS . GT ./1 .|1
1 61499 rs56992752 T A 100 PASS . GT ./1 .|1
1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1
1 61501 rs56992752 T A 100 PASS . GT 1/. 1|.
1 61502 rs56992752 T A 100 PASS . GT 1/. /1|.
1 61503 rs56992752 T A 100 PASS . GT |1/. 1|.
1 61504 rs56992752 T A 100 PASS . GT ./. .|.
1 61505 rs56992752 T A 100 PASS . GT ./. .|.
1 61506 rs56992752 T A 100 PASS . GT |./. /.|.
34 changes: 34 additions & 0 deletions test/vcf44_1.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
##fileformat=VCFv4.4
##contig=<ID=1,length=1000>
##reference=file://test
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##failue="test file on explicit and implicit phasing markers in 4.4"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
1 61482 rs56992752 T A 100 PASS . GT /0 /1
1 61483 rs56992752 T A 100 PASS . GT 0 1
1 61484 rs56992752 T A 100 PASS . GT 0 /1
1 61485 rs56992752 T A 100 PASS . GT 0 |1
1 61486 rs56992752 T A 100 PASS . GT |0 1
1 61487 rs56992752 T A 100 PASS . GT |0 |1
1 61488 rs56992752 T A 100 PASS . GT |0 /1
1 61489 rs56992752 T A 100 PASS . GT /0 1
1 61490 rs56992752 T A 100 PASS . GT /0 |1
1 61491 rs56992752 T A 100 PASS . GT /0 /1
1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0
1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
1 61496 rs56992752 T A 100 PASS . GT . .
1 61497 rs56992752 T A 100 PASS . GT /. |.
1 61498 rs56992752 T A 100 PASS . GT ./1 .|1
1 61499 rs56992752 T A 100 PASS . GT /./1 |.|1
1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1
1 61501 rs56992752 T A 100 PASS . GT 1/. 1|.
1 61502 rs56992752 T A 100 PASS . GT /1/. /1|.
1 61503 rs56992752 T A 100 PASS . GT |1/. |1|.
1 61504 rs56992752 T A 100 PASS . GT ./. .|.
1 61505 rs56992752 T A 100 PASS . GT /./. |.|.
1 61506 rs56992752 T A 100 PASS . GT |./. /.|.
41 changes: 37 additions & 4 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -3061,8 +3061,14 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
const char *t = q + 1;
int m = 0; // m: sample id
const int nsamples = bcf_hdr_nsamples(h);

bcf_version ver = v42;
const char *end = s->s + s->l;

if (bcf_get_version(h, &ver)) {
hts_log_error("Failed to get version information");
return -1;
}

while ( t<end )
{
// can we skip some samples?
Expand Down Expand Up @@ -3099,15 +3105,30 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
int l;
if (z->is_gt) {
// Genotypes.
// <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
//([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
int32_t is_phased = 0;
uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
uint32_t unreadable = 0;
uint32_t max = 0;
int overflow = 0;
int overflow = 0, ploidy = 0, anyunphased = 0, \
phasingprfx = 0, unknown1 = 0;

/* with prefixed phasing, it is explicitly given for 1st one
with non-prefixed, set based on ploidy and phasing of other
alleles. */
if (ver >= v44 && (*t == '|' || *t == '/')) {
// cache prefix and phasing status
is_phased = *t++ == '|';
phasingprfx = 1;
}

for (l = 0;; ++t) {
ploidy++;
if (*t == '.') {
++t, x[l++] = is_phased;
if (l==1) { //for 1st allele only
unknown1 = 1;
}
} else {
const char *tt = t;
uint32_t val;
Expand All @@ -3125,9 +3146,21 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
if (max < val) max = val;
x[l++] = (val + 1) << 1 | is_phased;
}
anyunphased |= (ploidy != 1) && !is_phased;
is_phased = (*t == '|');
if (*t != '|' && *t != '/') break;
}
if (ver >= v44 && !phasingprfx) {
/* no explicit phasing for 1st allele, set based on
other alleles and ploidy */
if (ploidy == 1) { //implicitly phased
if (!unknown1) {
x[0] |= 1;
}
} else { //set by other unphased alleles
x[0] |= (anyunphased)? 0 : 1;
}
}
// Possibly check max against v->n_allele instead?
if (overflow || max > (INT32_MAX >> 1) - 1) {
hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
Expand Down Expand Up @@ -4187,7 +4220,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
if (!first) kputc_(':', s);
first = 0;
if (gt_i == i) {
bcf_format_gt(f,j,s);
bcf_format_gt1(h, f,j,s);
break;
}
else if (f->n == 1)
Expand Down