From 418a183ef842a966afb00e783c09ac7fd7eaeb62 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 31 May 2018 16:28:24 +0100 Subject: [PATCH] First stage of supporting large chromosomes. The in-memory data structures are 64-bit for pos, mate-pos and insert size, along with iterators. The code that fills these out is still all 32-bit so this is basically a place-holder for ABI purposes. The exception to this is SAM support, which being purely textual has the minimal changes necessary to read and write 64-bit values. Split the hts_parse_reg API to 32-bit and 64-bit variants (although 64 bit version is only used internally at the moment). To much code uses this with addresses of 32-bit quantities, so for compatibility hts_parse_reg() cannot change. 64 bit parse_reg uses a slightly tweaked value for the end for chromosomes with no range (eg "chr1"). Using INT64_MAX would yield -1 when cast into int. We now have nearly 64-bit max which when truncated to 32-bit is still INT_MAX. The only change needed in samtools to pass tests is fixing cur5 and pre5 in bam_mate.c. --- cram/cram_encode.c | 2 +- cram/cram_structs.h | 4 +-- hts.c | 82 ++++++++++++++++++++++++++++++------------- htslib/hts.h | 19 +++++----- htslib/kstring.h | 8 +++-- htslib/sam.h | 30 ++++++++++++---- htslib/tbx.h | 2 +- htslib/vcf.h | 6 ++-- region.c | 2 ++ sam.c | 49 +++++++++++++++----------- synced_bcf_reader.c | 3 +- tbx.c | 2 +- test/sam.c | 6 ++-- test/test-bcf-sr.c | 3 +- test/test-parse-reg.c | 24 +++++++------ vcf.c | 59 +++++++++++++++++++------------ vcfutils.c | 53 ++++++++++++++-------------- 17 files changed, 220 insertions(+), 134 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 35b701a80..42d82f6b9 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3065,7 +3065,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. if ((bam_ins_size(b) && - abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || (!bam_ins_size(b) && !fd->tlen_zero)) goto detached; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 2cde6cfef..a1b7d8e7b 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -661,8 +661,8 @@ typedef struct cram_index { typedef struct { int refid; - int start; - int end; + int64_t start; + int64_t end; } cram_range; /*----------------------------------------------------------------------------- diff --git a/hts.c b/hts.c index 6133f893b..ecd46ee48 100644 --- a/hts.c +++ b/hts.c @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -1517,7 +1518,7 @@ KHASH_MAP_INIT_INT(bin, bins_t) typedef khash_t(bin) bidx_t; typedef struct { - int32_t n, m; + int64_t n, m; uint64_t *offset; } lidx_t; @@ -1532,7 +1533,8 @@ struct __hts_idx_t { int tbi_n, last_tbi_tid; struct { uint32_t last_bin, save_bin; - int last_coor, last_tid, save_tid, finished; + int64_t last_coor; + int last_tid, save_tid, finished; uint64_t last_off, save_off; uint64_t off_beg, off_end; uint64_t n_mapped, n_unmapped; @@ -1578,7 +1580,8 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) { - int i, beg, end; + int i; + int64_t beg, end; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -1732,7 +1735,7 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) return ret; } -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) +int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped) { int bin; int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); @@ -1773,12 +1776,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int idx->z.last_tid = tid; idx->z.last_bin = 0xffffffffu; } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order - hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1); + hts_log_error("Unsorted positions on sequence #%d: %"PRId64" followed by %"PRId64, tid+1, idx->z.last_coor+1, beg+1); return -1; } else if (end < beg) { // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.) - hts_log_error("Invalid record on sequence #%d: end %d < begin %d", tid+1, end, beg+1); + hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1); return -1; } if ( tid>=0 ) @@ -1828,14 +1831,14 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %d..%d cannot be stored in a csi index " + hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a csi index " "with min_shift = %d, n_lvls = %d. Try using " "min_shift = 14, n_lvls >= %d", beg, end, idx->min_shift, idx->n_lvls, n_lvls); } else { - hts_log_error("Region %d..%d cannot be stored in a %s index. " + hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a %s index. " "Try using a csi index with min_shift = 14, " "n_lvls >= %d", beg, end, idx_format_name(idx->fmt), @@ -2275,7 +2278,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi if (beg >= end) return 0; if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; if (itr->bins.n + n > itr->bins.m) { itr->bins.m = itr->bins.n + n; @@ -2396,7 +2400,7 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { return off0; } -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) { int i, n_off, l, bin; hts_pair64_max_t *off; @@ -2527,7 +2531,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) khint_t k; bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; - int tid, beg, end; + int tid; + int64_t beg, end; hts_reglist_t *curr_reg; if (!idx || !iter || !iter->multi) @@ -2646,7 +2651,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; - int tid, beg, end, i, j, l, n_off = 0; + int tid, i, j, l, n_off = 0; + int64_t beg, end; hts_reglist_t *curr_reg; hts_pair32_t *curr_intv; hts_pair64_max_t *off = NULL; @@ -2700,10 +2706,10 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) off[n_off].max = (uint64_t)tid<<32 | end; n_off++; } else { - hts_log_warning("Could not set offset end for region %d:%d-%d. Skipping", tid, beg, end); + hts_log_warning("Could not set offset end for region %d:%"PRId64"-%"PRId64". Skipping", tid, beg, end); } } else { - hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end); + hts_log_warning("No index entry for region %d:%"PRId64"-%"PRId64"", tid, beg, end); } } } else { @@ -2856,6 +2862,11 @@ static void *hts_memrchr(const void *s, int c, size_t n) { return NULL; } +// Almost INT64_MAX, but when cast into a 32-bit int it's +// also INT_MAX instead of -1. This avoids bugs with old code +// using the new data types. +#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) + /* * A variant of hts_parse_reg which is reference-id aware. It uses * the iterator name2id callbacks to validate the region tokenisation works. @@ -2957,7 +2968,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = INT64_32_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -2972,7 +2983,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Has a colon, but check whole name first. if (!quoted) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = INT64_32_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3023,7 +3034,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end if (*beg < 0) { if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? INT64_MAX : -(*beg+1); + *end = *beg==-1 ? INT64_32_MAX : -(*beg+1); *beg = 0; return s_end; } else if (*hyphen == '-') { @@ -3035,7 +3046,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { - *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX; + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_32_MAX; } else if (*hyphen == '-') { *end = hts_parse_decimal(hyphen+1, &hyphen, flags); if (*hyphen != '\0' && *hyphen != ',') { @@ -3048,7 +3059,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*end == 0) - *end = INT64_MAX; // interpret chr:100- as chr:100- + *end = INT64_32_MAX; // interpret chr:100- as chr:100- if (*beg >= *end) return NULL; @@ -3057,19 +3068,19 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Next release we should mark this as deprecated? // Use hts_parse_region above instead. -const char *hts_parse_reg(const char *s, int *beg, int *end) +const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); if (colon == NULL) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = INT64_32_MAX; return s + strlen(s); } *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; - if (*hyphen == '\0') *end = INT_MAX; + if (*hyphen == '\0') *end = INT64_32_MAX; else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); else return NULL; @@ -3077,6 +3088,27 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return colon; } +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + int64_t beg64 = 0, end64 = 0; + const char *colon = hts_parse_reg_(s, &beg64, &end64); + if (beg64 > INT_MAX) { + hts_log_error("Position %"PRId64" too large", beg64); + return NULL; + } + if (end64 > INT_MAX) { + if (end64 == INT64_32_MAX) { + end64 = INT_MAX; + } else { + hts_log_error("Position %"PRId64" too large", end64); + return NULL; + } + } + *beg = beg64; + *end = end64; + return colon; +} + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { int tid; @@ -3152,7 +3184,8 @@ hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int cou int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { - int ret, tid, beg, end; + int ret, tid; + int64_t beg, end; if (iter == NULL || iter->finished) return -1; if (iter->read_rest) { if (iter->curr_off) { // seek to the start @@ -3196,7 +3229,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) { void *fp; - int ret, tid, beg, end, i, cr, ci; + int ret, tid, i, cr, ci; + int64_t beg, end; hts_reglist_t *found_reg; if (iter == NULL || iter->finished) return -1; diff --git a/htslib/hts.h b/htslib/hts.h index 85847a8fe..08c357815 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -578,7 +578,8 @@ When REST or NONE is used, idx is also ignored and may be NULL. #define HTS_FMT_CRAI 3 typedef struct { - uint32_t beg, end; + //uint32_t beg, end; + uint64_t beg, end; // sorry for the bad naming: FIXME! } hts_pair32_t; typedef struct { @@ -595,18 +596,20 @@ typedef struct { hts_pair32_t *intervals; int tid; uint32_t count; - uint32_t min_beg, max_end; + uint64_t min_beg, max_end; } hts_reglist_t; -typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int64_t *beg, int64_t *end); typedef int hts_seek_func(void *fp, int64_t offset, int where); typedef int64_t hts_tell_func(void *fp); typedef struct { uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27; - int tid, beg, end, n_off, i, n_reg; + int tid, n_off, i, n_reg; + int64_t beg, end; hts_reglist_t *reg_list; - int curr_tid, curr_beg, curr_end, curr_reg, curr_intv; + int curr_tid, curr_reg, curr_intv; + int64_t curr_beg, curr_end; uint64_t curr_off, nocoor_off; hts_pair64_max_t *off; hts_readrec_func *readrec; @@ -658,7 +661,7 @@ void hts_idx_destroy(hts_idx_t *idx); The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts stored in the meta-data bin. */ -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); +int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped); /// Finish building an index /** @param idx Index @@ -940,14 +943,14 @@ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *e @param readrec Callback to read a record from the input file @return An iterator on success; NULL on failure */ -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); /// Free an iterator /** @param iter Iterator to free */ void hts_itr_destroy(hts_itr_t *iter); -typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification /** @param idx Index diff --git a/htslib/kstring.h b/htslib/kstring.h index c440cd5e9..f817ed5dc 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -354,11 +354,11 @@ static inline int kputw(int c, kstring_t *s) return kputuw(x, s); } -static inline int kputl(long c, kstring_t *s) +static inline int kputll(long long c, kstring_t *s) { char buf[32]; int i, l = 0; - unsigned long x = c; + unsigned long long x = c; if (c < 0) x = -x; do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); if (c < 0) buf[l++] = '-'; @@ -369,6 +369,10 @@ static inline int kputl(long c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) { + return kputll(c, s); +} + /* * Returns 's' split by delimiter, with *n being the number of components; * NULL on failue. diff --git a/htslib/sam.h b/htslib/sam.h index 8e1615e61..a4086ca1a 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -169,6 +169,22 @@ extern const int8_t bam_cigar_table[256]; *** Alignment records *** *************************/ +/* + * Assumptions made here. While pos can be 64-bit, no sequence + * itself is that long, but due to ref skip CIGAR fields it + * may span more than that. (CIGAR itself is 28-bit len + 4 bit + * type, but in theory we can combine multiples together.) + * + * Mate position and insert size also need to be 64-bit, but + * we won't accept more than 32-bit for tid. + * + * The bam_core_t structure is the *in memory* layout and not + * the same as the on-disk format. 64-bit changes here permit + * SAM to work with very long chromosomes and permit BAM and CRAM + * to seamlessly update in the future without further API/ABI + * revisions. + */ + /*! @typedef @abstract Structure for core alignment information. @field tid chromosome ID, defined by sam_hdr_t @@ -185,8 +201,8 @@ extern const int8_t bam_cigar_table[256]; */ typedef struct { int32_t tid; - int32_t pos; - uint16_t bin; + int64_t pos; + uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; uint8_t l_extranul; uint16_t flag; @@ -194,8 +210,8 @@ typedef struct { uint32_t n_cigar; int32_t l_qseq; int32_t mtid; - int32_t mpos; - int32_t isize; + int64_t mpos; + int64_t isize; } bam1_core_t; /*! @typedef @@ -946,7 +962,7 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); operations in @p cigar (these are the operations that "consume" reference bases). All other operations (including invalid ones) are ignored. */ -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); +int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); /*! @abstract Calculate the rightmost base position of an alignment on the @@ -959,7 +975,7 @@ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); For an unmapped read (either according to its flags or if it has no cigar string), we return b->core.pos + 1 by convention. */ -int32_t bam_endpos(const bam1_t *b); +int64_t bam_endpos(const bam1_t *b); int bam_str2flag(const char *str); /** returns negative value on error */ char *bam_flag2str(int flag); /** The string must be freed by the user */ @@ -1084,7 +1100,7 @@ When using one of these values, @p beg and @p end are ignored. When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. */ -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end); /// Create a SAM/BAM/CRAM iterator /** @param idx Index diff --git a/htslib/tbx.h b/htslib/tbx.h index 9119ab8a3..52f103b11 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -64,7 +64,7 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa /* Internal helper function used by tbx_itr_next() */ BGZF *hts_get_bgzfp(htsFile *fp); - int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end); tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); /* diff --git a/htslib/vcf.h b/htslib/vcf.h index 31720d7f1..7116a1229 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -209,8 +209,8 @@ typedef struct { */ typedef struct { int32_t rid; // CHROM - int32_t pos; // POS - int32_t rlen; // length of REF + int64_t pos; // POS + int64_t rlen; // length of REF float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; @@ -427,7 +427,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ - int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int64_t *beg, int64_t *end); diff --git a/region.c b/region.c index d9679f79f..d6680b8a0 100644 --- a/region.c +++ b/region.c @@ -37,6 +37,8 @@ typedef struct reglist KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; +const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end); + static int compare_uint64 (const void * a, const void * b) { if (*(uint64_t *)a < *(uint64_t *)b) return -1; diff --git a/sam.c b/sam.c index 5d794de68..7948d1724 100644 --- a/sam.c +++ b/sam.c @@ -455,16 +455,17 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) return l; } -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) { - int k, l; + int k; + int64_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) l += bam_cigar_oplen(cigar[k]); return l; } -int32_t bam_endpos(const bam1_t *b) +int64_t bam_endpos(const bam1_t *b) { if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); @@ -556,12 +557,12 @@ int bam_read1(BGZF *fp, bam1_t *b) if (fp->is_be) { for (i = 0; i < 8; ++i) ed_swap_4p(x + i); } - c->tid = x[0]; c->pos = x[1]; + c->tid = x[0]; c->pos = (int32_t)x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; new_l_data = block_len - 32 + c->l_extranul; if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; @@ -608,6 +609,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) return -1; } if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR + if (c->pos > INT_MAX || + c->mpos > INT_MAX || + c->isize < INT_MIN || c->isize > INT_MAX) { + hts_log_error("Positional data is too large for BAM format"); + return -1; + } x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); @@ -828,7 +835,7 @@ int sam_idx_save(htsFile *fp) { return 0; } -static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -843,7 +850,7 @@ static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, i } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. -static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -852,7 +859,7 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *b return ret; } -static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = fpv; bam1_t *b = bv; @@ -975,7 +982,7 @@ hts_idx_t *sam_index_load(htsFile *fp, const char *fn) return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); } -static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); @@ -1032,7 +1039,7 @@ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end return iter; } -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; if (idx == NULL) @@ -2759,7 +2766,7 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputs(h->target_name[c->tid] , str); r |= kputc_('\t', str); } else r |= kputsn_("*\t", 2, str); - r |= kputw(c->pos + 1, str); r |= kputc_('\t', str); // pos + r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual if (c->n_cigar) { // cigar uint32_t *cigar = bam_get_cigar(b); @@ -2775,8 +2782,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputs(h->target_name[c->mtid], str); r |= kputc_('\t', str); } - r |= kputw(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos - r |= kputw(c->isize, str); r |= kputc_('\t', str); // template len + r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos + r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len if (c->l_qseq) { // seq and qual uint8_t *s = bam_get_seq(b); if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; @@ -3659,7 +3666,7 @@ static cstate_t g_cstate_null = { -1, 0, 0, 0 }; typedef struct __linkbuf_t { bam1_t b; - int32_t beg, end; + int64_t beg, end; cstate_t s; struct __linkbuf_t *next; bam_pileup_cd cd; @@ -3957,7 +3964,7 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. */ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) { int pos = *iref; if ( pos < 0 ) return -1; @@ -3992,7 +3999,7 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) { while ( *cigar < cigar_max ) { @@ -4026,16 +4033,16 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); - int iref = b->core.pos; - int a_iref = iref - a->core.pos; - int b_iref = iref - b->core.pos; + int64_t iref = b->core.pos; + int64_t a_iref = iref - a->core.pos; + int64_t b_iref = iref - b->core.pos; int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRId64"-%"PRId64"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); #endif @@ -4105,7 +4112,7 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) // no overlap possible, unless some wild cigar if ( node->b.core.tid != node->b.core.mtid - || (abs(node->b.core.isize) >= 2*node->b.core.l_qseq + || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq && node->b.core.mpos >= node->end) // for those wild cigars ) return 0; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 6b65e3133..425fae1ce 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include "htslib/synced_bcf_reader.h" @@ -383,7 +384,7 @@ void debug_buffer(FILE *fp, bcf_sr_t *reader) for (j=0; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; - fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + fprintf(fp,"\t%p\t%s%s\t%s:%"PRId64"\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); int k; for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); fprintf(fp,"\n"); diff --git a/tbx.c b/tbx.c index 2e0f3499e..d5e3c3be5 100644 --- a/tbx.c +++ b/tbx.c @@ -172,7 +172,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ * -1 on EOF * <= -2 on error */ -int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end) { tbx_t *tbx = (tbx_t *) tbxv; kstring_t *s = (kstring_t *) sv; diff --git a/test/sam.c b/test/sam.c index 7cbfd179e..bf429bf98 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1148,12 +1148,12 @@ static void samrecord_layout(void) size_t bam1_t_size, bam1_t_size2; - bam1_t_size = (36 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) + bam1_t_size = (56 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) + sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms - if (sizeof (bam1_core_t) != 36) - fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t)); + if (sizeof (bam1_core_t) != 56) + fail("sizeof bam1_core_t is %zu, expected 56", sizeof (bam1_core_t)); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) fail("sizeof bam1_t is %zu, expected either %zu or %zu", diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ebe93904a..23ee1d3e0 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -31,6 +31,7 @@ #include #include #include +#include #include void error(const char *format, ...) @@ -103,7 +104,7 @@ int main(int argc, char *argv[]) { if ( !bcf_sr_has_line(sr,i) ) continue; bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%d", bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); + printf("%s:%"PRId64, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); break; } diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c index 404e98ddf..74bb3187f 100644 --- a/test/test-parse-reg.c +++ b/test/test-parse-reg.c @@ -47,6 +47,10 @@ #include #include +#ifndef INT64_32_MAX +#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#endif + void reg_expected(sam_hdr_t *hdr, const char *reg, int flags, char *reg_exp, int tid_exp, int64_t beg_exp, int64_t end_exp) { const char *reg_out; @@ -87,26 +91,26 @@ int reg_test(char *fn) { // 5 chr1,chr3 // Check range extensions. - reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_MAX); - reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_32_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); - reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); // Check quoting fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); - reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_MAX); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_32_MAX); reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); - reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_MAX); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_32_MAX); // Check numerics - reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_MAX); - reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_32_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_32_MAX); reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); @@ -114,11 +118,11 @@ int reg_test(char *fn) { reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); // Check list mode - reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_MAX); + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_32_MAX); fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous - reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_MAX); - reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_MAX); + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_32_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_32_MAX); // incorrect usage; first reg is valid (but not what user expects). reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); diff --git a/vcf.c b/vcf.c index 1ace26b26..c53e3a2db 100644 --- a/vcf.c +++ b/vcf.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include "htslib/vcf.h" @@ -1180,21 +1181,27 @@ void bcf_destroy(bcf1_t *v) static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) { - uint32_t x[8]; + union { + uint32_t i; + float f; + } x[8]; ssize_t ret; if ((ret = bgzf_read(fp, x, 32)) != 32) { if (ret == 0) return -1; return -2; } bcf_clear1(v); - if (x[0] < 24) return -2; - x[0] -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, x[0]) != 0) return -2; - if (ks_resize(&v->indiv, x[1]) != 0) return -2; - memcpy(v, x + 2, 16); - v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; - v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; - v->shared.l = x[0], v->indiv.l = x[1]; + if (x[0].i < 24) return -2; + x[0].i -= 24; // to exclude six 32-bit integers + if (ks_resize(&v->shared, x[0].i) != 0) return -2; + if (ks_resize(&v->indiv, x[1].i) != 0) return -2; + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = x[6].i>>16; v->n_info = x[6].i&0xffff; + v->n_fmt = x[7].i>>24; v->n_sample = x[7].i&0xffffff; + v->shared.l = x[0].i, v->indiv.l = x[1].i; // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; @@ -1436,7 +1443,7 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) return bcf_subset_format(h,v); } -int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int64_t *beg, int64_t *end) { bcf1_t *v = (bcf1_t *) vv; int ret; @@ -1684,7 +1691,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { - hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Broken VCF record, the number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); return -1; } @@ -1704,12 +1711,18 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) bcf1_sync(v); // check if the BCF record was modified BGZF *fp = hfp->fp.bgzf; - uint32_t x[8]; - x[0] = v->shared.l + 24; // to include six 32-bit integers - x[1] = v->indiv.l; - memcpy(x + 2, v, 16); - x[6] = (uint32_t)v->n_allele<<16 | v->n_info; - x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; + union { + uint32_t i; + float f; + } x[8]; + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; if ( bgzf_write(fp, x, 32) != 32 ) return -1; if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; @@ -2132,7 +2145,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p char *end = s->s + s->l; if ( q>=end ) { - hts_log_error("FORMAT column with no sample columns starting at %s:%d", s->s, v->pos+1); + hts_log_error("FORMAT column with no sample columns starting at %s:%"PRId64"", s->s, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; } @@ -2148,7 +2161,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle", + hts_log_error("FORMAT column at %s:%"PRId64" lists more identifiers than htslib can handle", bcf_seqname(h,v), v->pos+1); return -1; } @@ -2220,7 +2233,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p j++; if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%d", + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRId64"", h->id[BCF_DT_CTG][v->rid].key, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; @@ -2327,7 +2340,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { char buffer[8]; - hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d", + hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRId64"", dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_CHAR; return -1; @@ -2386,14 +2399,14 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( v->n_sample!=bcf_hdr_nsamples(h) ) { - hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); v->errcode |= BCF_ERR_NCOLS; return -1; } if ( v->indiv.l > 0xffffffff ) { - hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h,v), v->pos+1); + hts_log_error("The FORMAT at %s:%"PRId64" is too long", bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed diff --git a/vcfutils.c b/vcfutils.c index 008dbe6f5..3e96c286a 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -23,6 +23,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include "htslib/vcfutils.h" #include "htslib/kbitset.h" @@ -64,12 +65,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; - default: hts_log_error("Unexpected type %d at %s:%d", ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRId64, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + hts_log_error("Incorrect AN/AC counts at %s:%"PRId64, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); } ac[0] = an - nac; @@ -98,7 +99,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( p[ial]>>1 > line->n_allele ) \ { \ - hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRId64, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ exit(1); \ } \ ac[(p[ial]>>1)-1]++; \ @@ -109,7 +110,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRId64, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; @@ -188,7 +189,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) { \ - hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Allele index is out of bounds at %s:%"PRId64, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ ret = -1; \ goto clean; \ } \ @@ -200,7 +201,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected GT %d at %s:%d", + default: hts_log_error("Unexpected GT %d at %s:%"PRId64, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1); goto clean; } @@ -265,7 +266,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nR_new = line->n_allele-nrm; if ( nR_new<=0 ) // should not be able to remove reference allele { - hts_log_error("Cannot remove reference allele at %s:%d [%d]", + hts_log_error("Cannot remove reference allele at %s:%"PRId64" [%d]", bcf_seqname(header,line), line->pos+1, nR_new); goto err; } @@ -296,7 +297,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access INFO/%s at %s:%d [%d]", + hts_log_error("Could not access INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -334,7 +335,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' ) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); goto err; } @@ -365,7 +366,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( n==1 && s == '.' ) continue; // missing if ( n!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n); goto err; } @@ -374,7 +375,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -406,7 +407,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nA_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret); goto err; } @@ -418,7 +419,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nR_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret); goto err; } @@ -450,7 +451,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret); goto err; } @@ -484,7 +485,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -510,7 +511,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int al = bcf_gt_allele(ptr[j]); if ( !( al=0 ) ) { - hts_log_error("Problem updating genotypes at %s:%d [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", + hts_log_error("Problem updating genotypes at %s:%"PRId64" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } @@ -521,7 +522,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/GT at %s:%d [%d]", + hts_log_error("Could not update FORMAT/GT at %s:%"PRId64" [%d]", bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -548,7 +549,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not access FORMAT/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -589,7 +590,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( k_src==1 && s == '.' ) continue; // missing if ( k_src!=nexp ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); goto err; } @@ -614,7 +615,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( nexp==1 && s == '.' ) continue; // missing if ( nexp!=nG_ori && nexp!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(diploid) or %d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp); goto err; } @@ -659,7 +660,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } if ( k_src!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src); goto err; } @@ -671,7 +672,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -707,7 +708,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nA_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori); goto err; } @@ -719,7 +720,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori); goto err; } @@ -755,7 +756,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nG_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori); goto err; } @@ -808,7 +809,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; }