Skip to content

Commit

Permalink
Fix sam_hdr_dup to cope with long refs.
Browse files Browse the repository at this point in the history
The h->sdict hash is used to track references that are > 4Gb in size.
The dup code didn't copy this.  This manifested itself as CRAM SQ
headers being truncated (read SAM hdr, dup, write as CRAM hdr).

To fix this a function was written that creates or updates the sdict
from the hrecs parsed header structs.  It's possible this should be
called directly from the sam_hdr_create function (part of the SAM
format parser) instead of manually keeping track of sdict itself,
however doing so would require initialising the new header structs so
I haven't done this.

This is a general utility, so perhaps should be made a public part of
the header API.  However IMO the new header API should hide this
nuance away and just return the correct data, also ensuring that
header updates work correctly and honour the text form.

Since c83c9e2 the header API also was using the 32-bit capped
target_len in preference to the parsed text from SQ LN fields when
they differed.  I am assuming this was a decision in what takes
priority in BAM where the sequence names and lengths exist in both
text and binary form.  This commit reverses this and makes the text
form always take priority.  As this is at least required in some
scenarios (long references) it seems easier to simply make it apply in
all scenarios.
  • Loading branch information
jkbonfield committed Nov 15, 2019
1 parent 7e3234e commit 3c079af
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 28 deletions.
32 changes: 4 additions & 28 deletions header.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ typedef khash_t(rm) rmhash_t;
static int sam_hdr_link_pg(sam_hdr_t *bh);

static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);


#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
Expand Down Expand Up @@ -182,14 +181,10 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
if (hrecs->ref[nref].ty == NULL) {
// Attach header line to existing stub entry.
hrecs->ref[nref].ty = h_type;
// Check lengths match; correct if not.
if (len != hrecs->ref[nref].len) {
char tmp[32];
snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
hrecs->ref[nref].len);
if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
return -1;
}
// The old hrecs length may be incorrect as it is initially
// populated from h->target_len which has a max 32-bit size.
// We always take our latest data as canonical.
hrecs->ref[nref].len = len;
if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
return -1;

Expand Down Expand Up @@ -2485,25 +2480,6 @@ int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
return 0;
}

/*
* Adds or updates tag key,value pairs in a header line.
* Eg for adding M5 tags to @SQ lines or updating sort order for the
* @HD line.
*
* Specify multiple key,value pairs ending in NULL.
*
* Returns 0 on success
* -1 on failure
*/
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
va_list args;
int res;
va_start(args, type);
res = sam_hrecs_vupdate(hrecs, type, args);
va_end(args);
return res;
}

/*
* Looks for a specific key in a single sam header line identified by *type.
* If prev is non-NULL it also fills this out with the previous tag, to
Expand Down
55 changes: 55 additions & 0 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,53 @@ void sam_hdr_destroy(sam_hdr_t *bh)
free(bh);
}

/*
* Create or update the header sdict field from the new
* parsed hrecs data. This is important for sam_hdr_dup or
* when reading / creating a new header.
*
* Returns 0 on success,
* -1 on failure
*/
static int sam_hdr_update_sdict(sam_hdr_t *h) {
int i;
khash_t(s2i) *long_refs = h->sdict;

if (!h->hrecs)
return 0;

for (i = 0; i < h->n_targets; i++) {
hts_pos_t len = sam_hdr_tid2len(h, i);
const char *name = sam_hdr_tid2name(h, i);
khint_t k;
if (len < UINT32_MAX) {
if (!long_refs)
continue;

// Check if we have an old length, if so remove it.
k = kh_get(s2i, long_refs, name);
if (k < kh_end(long_refs))
kh_del(s2i, long_refs, k);
}

if (!long_refs) {
if (!(h->sdict = long_refs = kh_init(s2i)))
goto err;
}

// Add / update length
int absent;
k = kh_put(s2i, long_refs, name, &absent);
if (absent < 0)
goto err;
kh_val(long_refs, k) = len;
}
return 0;

err:
return -1;
}

sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
{
if (h0 == NULL) return NULL;
Expand Down Expand Up @@ -179,6 +226,14 @@ sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
h->text[h->l_text] = '\0';
}

if (h0->sdict) {
// We have a reason to use new API, so build it so we
// can replicate sdict.
if (sam_hdr_fill_hrecs(h) < 0 ||
sam_hdr_update_sdict(h) < 0)
goto fail;
}

return h;

fail:
Expand Down
6 changes: 6 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,12 @@ sub test_view
testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz";
testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_";

# CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for
# checking SQ headers only.)
# testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam";
# testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram";
# testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_";

# Build index and compare with on-the-fly one made earlier.
test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1;

Expand Down

0 comments on commit 3c079af

Please sign in to comment.