Skip to content

Commit

Permalink
First stage of supporting large chromosomes.
Browse files Browse the repository at this point in the history
The in-memory data structures are 64-bit for pos, mate-pos and insert
size, along with iterators.  The code that fills these out is still
all 32-bit so this is basically a place-holder for ABI purposes.

The exception to this is SAM support, which being purely textual has
the minimal changes necessary to read and write 64-bit values.

Split the hts_parse_reg API to 32-bit and 64-bit variants (although
64 bit version is only used internally at the moment).  To much
code uses this with addresses of 32-bit quantities, so for
compatibility hts_parse_reg() cannot change.

64 bit parse_reg uses a slightly tweaked value for the end for
chromosomes with no range (eg "chr1").  Using INT64_MAX would
yield -1 when cast into int.  We now have nearly 64-bit max which
when truncated to 32-bit is still INT_MAX.

The only change needed in samtools to pass tests is fixing cur5
and pre5 in bam_mate.c.
  • Loading branch information
jkbonfield authored and daviesrob committed Sep 25, 2019
1 parent 13f28be commit 418a183
Show file tree
Hide file tree
Showing 17 changed files with 220 additions and 134 deletions.
2 changes: 1 addition & 1 deletion cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -3065,7 +3065,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
// This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
// a small amount, if appropriate options set.
if ((bam_ins_size(b) &&
abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) ||
llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) ||
(!bam_ins_size(b) && !fd->tlen_zero))
goto detached;

Expand Down
4 changes: 2 additions & 2 deletions cram/cram_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -661,8 +661,8 @@ typedef struct cram_index {

typedef struct {
int refid;
int start;
int end;
int64_t start;
int64_t end;
} cram_range;

/*-----------------------------------------------------------------------------
Expand Down
82 changes: 58 additions & 24 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
Expand Down Expand Up @@ -1517,7 +1518,7 @@ KHASH_MAP_INIT_INT(bin, bins_t)
typedef khash_t(bin) bidx_t;

typedef struct {
int32_t n, m;
int64_t n, m;
uint64_t *offset;
} lidx_t;

Expand All @@ -1532,7 +1533,8 @@ struct __hts_idx_t {
int tbi_n, last_tbi_tid;
struct {
uint32_t last_bin, save_bin;
int last_coor, last_tid, save_tid, finished;
int64_t last_coor;
int last_tid, save_tid, finished;
uint64_t last_off, save_off;
uint64_t off_beg, off_end;
uint64_t n_mapped, n_unmapped;
Expand Down Expand Up @@ -1578,7 +1580,8 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)

static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
{
int i, beg, end;
int i;
int64_t beg, end;
beg = _beg >> min_shift;
end = (_end - 1) >> min_shift;
if (l->m < end + 1) {
Expand Down Expand Up @@ -1732,7 +1735,7 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
return ret;
}

int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped)
{
int bin;
int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
Expand Down Expand Up @@ -1773,12 +1776,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int
idx->z.last_tid = tid;
idx->z.last_bin = 0xffffffffu;
} else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1);
hts_log_error("Unsorted positions on sequence #%d: %"PRId64" followed by %"PRId64, tid+1, idx->z.last_coor+1, beg+1);
return -1;
}
else if (end < beg) {
// Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
hts_log_error("Invalid record on sequence #%d: end %d < begin %d", tid+1, end, beg+1);
hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
return -1;
}
if ( tid>=0 )
Expand Down Expand Up @@ -1828,14 +1831,14 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int
}

if (idx->fmt == HTS_FMT_CSI) {
hts_log_error("Region %d..%d cannot be stored in a csi index "
hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a csi index "
"with min_shift = %d, n_lvls = %d. Try using "
"min_shift = 14, n_lvls >= %d",
beg, end,
idx->min_shift, idx->n_lvls,
n_lvls);
} else {
hts_log_error("Region %d..%d cannot be stored in a %s index. "
hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a %s index. "
"Try using a csi index with min_shift = 14, "
"n_lvls >= %d",
beg, end, idx_format_name(idx->fmt),
Expand Down Expand Up @@ -2275,7 +2278,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi
if (beg >= end) return 0;
if (end >= 1LL<<s) end = 1LL<<s;
for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
int b, e, n, i;
int64_t b, e;
int n, i;
b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
if (itr->bins.n + n > itr->bins.m) {
itr->bins.m = itr->bins.n + n;
Expand Down Expand Up @@ -2396,7 +2400,7 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
return off0;
}

hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec)
{
int i, n_off, l, bin;
hts_pair64_max_t *off;
Expand Down Expand Up @@ -2527,7 +2531,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
khint_t k;
bidx_t *bidx;
uint64_t min_off, max_off, t_off = (uint64_t)-1;
int tid, beg, end;
int tid;
int64_t beg, end;
hts_reglist_t *curr_reg;

if (!idx || !iter || !iter->multi)
Expand Down Expand Up @@ -2646,7 +2651,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
{
const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
int tid, beg, end, i, j, l, n_off = 0;
int tid, i, j, l, n_off = 0;
int64_t beg, end;
hts_reglist_t *curr_reg;
hts_pair32_t *curr_intv;
hts_pair64_max_t *off = NULL;
Expand Down Expand Up @@ -2700,10 +2706,10 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
off[n_off].max = (uint64_t)tid<<32 | end;
n_off++;
} else {
hts_log_warning("Could not set offset end for region %d:%d-%d. Skipping", tid, beg, end);
hts_log_warning("Could not set offset end for region %d:%"PRId64"-%"PRId64". Skipping", tid, beg, end);
}
} else {
hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end);
hts_log_warning("No index entry for region %d:%"PRId64"-%"PRId64"", tid, beg, end);
}
}
} else {
Expand Down Expand Up @@ -2856,6 +2862,11 @@ static void *hts_memrchr(const void *s, int c, size_t n) {
return NULL;
}

// Almost INT64_MAX, but when cast into a 32-bit int it's
// also INT_MAX instead of -1. This avoids bugs with old code
// using the new data types.
#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX)

/*
* A variant of hts_parse_reg which is reference-id aware. It uses
* the iterator name2id callbacks to validate the region tokenisation works.
Expand Down Expand Up @@ -2957,7 +2968,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end

// No colon is simplest case; just check and return.
if (colon == NULL) {
*beg = 0; *end = INT64_MAX;
*beg = 0; *end = INT64_32_MAX;
kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
if (!ks.s) {
*tid = -2;
Expand All @@ -2972,7 +2983,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end

// Has a colon, but check whole name first.
if (!quoted) {
*beg = 0; *end = INT64_MAX;
*beg = 0; *end = INT64_32_MAX;
kputsn(s, s_len, &ks); // convert to nul terminated string
if (!ks.s) {
*tid = -2;
Expand Down Expand Up @@ -3023,7 +3034,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end
if (*beg < 0) {
if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
// interpret chr:-100 as chr:1-100
*end = *beg==-1 ? INT64_MAX : -(*beg+1);
*end = *beg==-1 ? INT64_32_MAX : -(*beg+1);
*beg = 0;
return s_end;
} else if (*hyphen == '-') {
Expand All @@ -3035,7 +3046,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end
}

if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
*end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX;
*end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_32_MAX;
} else if (*hyphen == '-') {
*end = hts_parse_decimal(hyphen+1, &hyphen, flags);
if (*hyphen != '\0' && *hyphen != ',') {
Expand All @@ -3048,7 +3059,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end
}

if (*end == 0)
*end = INT64_MAX; // interpret chr:100- as chr:100-<end>
*end = INT64_32_MAX; // interpret chr:100- as chr:100-<end>

if (*beg >= *end) return NULL;

Expand All @@ -3057,26 +3068,47 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end

// Next release we should mark this as deprecated?
// Use hts_parse_region above instead.
const char *hts_parse_reg(const char *s, int *beg, int *end)
const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end)
{
char *hyphen;
const char *colon = strrchr(s, ':');
if (colon == NULL) {
*beg = 0; *end = INT_MAX;
*beg = 0; *end = INT64_32_MAX;
return s + strlen(s);
}

*beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
if (*beg < 0) *beg = 0;

if (*hyphen == '\0') *end = INT_MAX;
if (*hyphen == '\0') *end = INT64_32_MAX;
else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
else return NULL;

if (*beg >= *end) return NULL;
return colon;
}

const char *hts_parse_reg(const char *s, int *beg, int *end)
{
int64_t beg64 = 0, end64 = 0;
const char *colon = hts_parse_reg_(s, &beg64, &end64);
if (beg64 > INT_MAX) {
hts_log_error("Position %"PRId64" too large", beg64);
return NULL;
}
if (end64 > INT_MAX) {
if (end64 == INT64_32_MAX) {
end64 = INT_MAX;
} else {
hts_log_error("Position %"PRId64" too large", end64);
return NULL;
}
}
*beg = beg64;
*end = end64;
return colon;
}

hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
{
int tid;
Expand Down Expand Up @@ -3152,7 +3184,8 @@ hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int cou

int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
{
int ret, tid, beg, end;
int ret, tid;
int64_t beg, end;
if (iter == NULL || iter->finished) return -1;
if (iter->read_rest) {
if (iter->curr_off) { // seek to the start
Expand Down Expand Up @@ -3196,7 +3229,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
{
void *fp;
int ret, tid, beg, end, i, cr, ci;
int ret, tid, i, cr, ci;
int64_t beg, end;
hts_reglist_t *found_reg;

if (iter == NULL || iter->finished) return -1;
Expand Down
19 changes: 11 additions & 8 deletions htslib/hts.h
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,8 @@ When REST or NONE is used, idx is also ignored and may be NULL.
#define HTS_FMT_CRAI 3

typedef struct {
uint32_t beg, end;
//uint32_t beg, end;
uint64_t beg, end; // sorry for the bad naming: FIXME!
} hts_pair32_t;

typedef struct {
Expand All @@ -595,18 +596,20 @@ typedef struct {
hts_pair32_t *intervals;
int tid;
uint32_t count;
uint32_t min_beg, max_end;
uint64_t min_beg, max_end;
} hts_reglist_t;

typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end);
typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int64_t *beg, int64_t *end);
typedef int hts_seek_func(void *fp, int64_t offset, int where);
typedef int64_t hts_tell_func(void *fp);

typedef struct {
uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27;
int tid, beg, end, n_off, i, n_reg;
int tid, n_off, i, n_reg;
int64_t beg, end;
hts_reglist_t *reg_list;
int curr_tid, curr_beg, curr_end, curr_reg, curr_intv;
int curr_tid, curr_reg, curr_intv;
int64_t curr_beg, curr_end;
uint64_t curr_off, nocoor_off;
hts_pair64_max_t *off;
hts_readrec_func *readrec;
Expand Down Expand Up @@ -658,7 +661,7 @@ void hts_idx_destroy(hts_idx_t *idx);
The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts
stored in the meta-data bin.
*/
int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped);
int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped);

/// Finish building an index
/** @param idx Index
Expand Down Expand Up @@ -940,14 +943,14 @@ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *e
@param readrec Callback to read a record from the input file
@return An iterator on success; NULL on failure
*/
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec);

/// Free an iterator
/** @param iter Iterator to free
*/
void hts_itr_destroy(hts_itr_t *iter);

typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec);

/// Create a single-region iterator from a text region specification
/** @param idx Index
Expand Down
8 changes: 6 additions & 2 deletions htslib/kstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,11 @@ static inline int kputw(int c, kstring_t *s)
return kputuw(x, s);
}

static inline int kputl(long c, kstring_t *s)
static inline int kputll(long long c, kstring_t *s)
{
char buf[32];
int i, l = 0;
unsigned long x = c;
unsigned long long x = c;
if (c < 0) x = -x;
do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
if (c < 0) buf[l++] = '-';
Expand All @@ -369,6 +369,10 @@ static inline int kputl(long c, kstring_t *s)
return 0;
}

static inline int kputl(long c, kstring_t *s) {
return kputll(c, s);
}

/*
* Returns 's' split by delimiter, with *n being the number of components;
* NULL on failue.
Expand Down
Loading

0 comments on commit 418a183

Please sign in to comment.