Skip to content

Commit

Permalink
Merge branch 'CW-2013--bamstats-single-pass' into 'dev'
Browse files Browse the repository at this point in the history
CW-2013 -- bamstats without index

See merge request epi2melabs/fastcat!21
  • Loading branch information
julibeg committed May 23, 2023
2 parents eb40b8a + aaa8f8e commit b78b02f
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 92 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [v0.11.0]
### Changed
- Bamstats can now be run without a BAM index.
- `fastcat -H` now wraps all known header fields into SAM tags regardless of whether the header was "valid" (i.e. all expected fields were present) or not.

## [v0.10.2]
Expand Down
61 changes: 32 additions & 29 deletions src/bamstats/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,8 @@ int main(int argc, char *argv[]) {
write_header(args.sample);

htsFile *fp = hts_open(args.bam[0], "rb");
hts_idx_t *idx = sam_index_load(fp, args.bam[0]);
sam_hdr_t *hdr = sam_hdr_read(fp);
if (hdr == 0 || idx == 0 || fp == 0) {
if (hdr == 0 || fp == 0) {
fprintf(stderr, "Failed to read .bam file '%s'.\n", args.bam[0]);
exit(EXIT_FAILURE);
}
Expand All @@ -108,37 +107,40 @@ int main(int argc, char *argv[]) {
}

FILE* flagstats = NULL;
flag_stats* flag_counts = NULL;
if (args.flagstats != NULL) {
flagstats = fopen(args.flagstats, "w");
write_stats_header(flagstats, args.sample);
flag_counts = create_flag_stats(
args.region == NULL ? hdr->n_targets : 1, args.unmapped
);
}

size_t* flag_counts = xalloc(8, sizeof(size_t), "counts");
if (args.region == NULL) {
// process all regions
for (int i=0; i < hdr->n_targets; ++i) {
const char* chr = sam_hdr_tid2name(hdr, i);
size_t ref_length = (size_t)sam_hdr_tid2len(hdr, i);
process_bams(
fp, idx, hdr, args.sample,
chr, 0, ref_length, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, chr, args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
}
// Also do unplaced reads
if (args.unmapped) {
process_bams(
fp, idx, hdr, args.sample,
"*", 0, INT64_MAX, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, "*", args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
// iterate over the entire file
process_bams(
fp, NULL, hdr, args.sample,
NULL, 0, INT64_MAX, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);

// write flagstat counts if requested
if (flag_counts != NULL) {
for (int i=0; i < hdr->n_targets; ++i) {
const char* chr = sam_hdr_tid2name(hdr, i);
write_stats(flag_counts->counts[i], chr, args.sample, flagstats);
}
if (args.unmapped) {
write_stats(flag_counts->unmapped, "*", args.sample, flagstats);
}
}
} else {
// process given region
hts_idx_t *idx = sam_index_load(fp, args.bam[0]);
if (idx == 0){
fprintf(stderr, "Cannot find index file for '%s', which is required for processing by region.\n", args.bam[0]);
exit(EXIT_FAILURE);
}
int start, end;
char *chr = xalloc(strlen(args.region) + 1, sizeof(char), "chr");
strcpy(chr, args.region);
Expand All @@ -162,18 +164,19 @@ int main(int argc, char *argv[]) {
chr, start, end, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, chr, args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
if (flag_counts != NULL) {
write_stats(flag_counts->counts[0], chr, args.sample, flagstats);
}
free(chr);
hts_idx_destroy(idx);
}
free(flag_counts);


if (flagstats != NULL) {
fclose(flagstats);
}

if (flag_counts != NULL) destroy_flag_stats(flag_counts);
sam_hdr_destroy(hdr);
hts_idx_destroy(idx);
hts_close(fp);
if (p.pool) { // must be after fp
hts_tpool_destroy(p.pool);
Expand Down
176 changes: 115 additions & 61 deletions src/bamstats/readstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,48 @@
#define bam_nt16_rev_table seq_nt16_str
#define bam_nt16_table seq_nt16_table


static const int NOTPRIMARY = BAM_FSUPPLEMENTARY | BAM_FSECONDARY | BAM_FUNMAP;
// counting alignment flags
// total, primary, ..., unused
static const size_t FLAG_MASK[8] = {
0, 0, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, 0
};


/** Initialise flagstat counts struct for a BAM file.
*
* @param n_refs number of reference sequences.
*
*/
flag_stats* create_flag_stats(size_t n_refs, bool store_unmapped) {
flag_stats* stats = xalloc(1, sizeof(flag_stats), "flagstat");
stats->n_refs = n_refs;
stats->counts = xalloc(n_refs, sizeof(size_t*), "flagstat");
stats->unmapped = store_unmapped ? xalloc(8, sizeof(size_t), "flagstat") : NULL;

for (size_t i = 0; i < n_refs; i++) {
stats->counts[i] = xalloc(8, sizeof(size_t), "flagstat");
}

return stats;
}

/** Clean up flagstat counts.
*
* @param stats flagstat counts structure to clean.
*
*/
void destroy_flag_stats(flag_stats* stats) {
for (size_t i = 0; i < stats->n_refs; i++) {
free(stats->counts[i]);
}
free(stats->counts);
free(stats->unmapped);
free(stats);
}


// Count number of each cigar operation in an alignment
inline size_t* create_cigar_stats (bam1_t* b) {
static const size_t NCODES = 10;
Expand Down Expand Up @@ -81,6 +123,15 @@ inline size_t get_query_end(bam1_t* b) {
}


static inline void process_flagstat_counts(bam1_t* b, size_t* counts) {
counts[0] += 1;
counts[1] += ((b->core.flag & (NOTPRIMARY)) == 0);
for (size_t i=2; i<6; ++i){
counts[i] += ((b->core.flag & FLAG_MASK[i]) != 0);
}
}


/** Generates alignment stats from a region of a bam.
*
* @param fp htsFile pointer
Expand All @@ -94,7 +145,7 @@ inline size_t get_query_end(bam1_t* b) {
* @param read_group by which to filter alignments.
* @param tag_name by which to filter alignments.
* @param tag_value associated with tag_name.
* @param flag_counts size_t flag_counts[8] for output, will be cleared before use.
* @param flag_counts flag_stats pointer.
* @param unmapped bool include unmapped reads in output.
* @returns void. Prints output to stdout.
*
Expand All @@ -103,7 +154,7 @@ void process_bams(
htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
const char *read_group, const char tag_name[2], const int tag_value,
size_t *flag_counts, bool unmapped) {
flag_stats *flag_counts, bool unmapped) {
if (chr != NULL) {
if (strcmp(chr, "*") == 0) {
fprintf(stderr, "Processing: Unplaced reads\n");
Expand All @@ -112,12 +163,6 @@ void process_bams(
}
}

// counting alignment flags
// total, primary, ..., unused
const size_t flag_mask[8] = {
0, 0, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, 0};
memset(flag_counts, 0, 8 * sizeof(size_t));

// setup bam reading - reuse our pileup structure, but actually just need iterator
mplp_data* bam = create_bam_iter_data(
fp, idx, hdr,
Expand All @@ -128,60 +173,58 @@ void process_bams(
int res;
bam1_t *b = bam_init1();
uint8_t *tag;
// find the target length for query below
size_t ref_length = 0;
if (strcmp(chr, "*") != 0) {
int tid = sam_hdr_name2tid(hdr, chr);
if (tid < 0) {
fprintf(stderr, "Failed to find reference sequence '%s' in bam.\n", chr);
exit(EXIT_FAILURE);
}
ref_length = (size_t)sam_hdr_tid2len(hdr, tid);
}

const int NOTPRIMARY = BAM_FSUPPLEMENTARY | BAM_FSECONDARY | BAM_FUNMAP;

while ((res = read_bam(bam, b) >= 0)) {
flag_counts[0] += 1;
flag_counts[1] += ((b->core.flag & (NOTPRIMARY)) == 0);
for (size_t i=2; i<6; ++i){
flag_counts[i] += ((b->core.flag & flag_mask[i]) != 0);
}
// write a record for unmapped/unplaced
if ((b->core.flag & BAM_FUNMAP) && unmapped) {
// an unmapped read can still have a RNAME and POS, but we
// ignore that here, because its not a thing we care about
char* qname = bam_get_qname(b);
uint32_t read_length = b->core.l_qseq;
float mean_quality = mean_qual_from_bam(bam_get_qual(b), read_length);
if (sample == NULL) {
fprintf(stdout,
"%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
} else {
fprintf(stdout,
"%s\t%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, sample, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
if (b->core.flag & BAM_FUNMAP){
if (unmapped) {
// an unmapped read can still have a RNAME and POS, but we
// ignore that here, because its not a thing we care about
char* qname = bam_get_qname(b);
uint32_t read_length = b->core.l_qseq;
float mean_quality = mean_qual_from_bam(bam_get_qual(b), read_length);
if (sample == NULL) {
fprintf(stdout,
"%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
} else {
fprintf(stdout,
"%s\t%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, sample, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
}
// add to flagstat counts if required
if (flag_counts != NULL) {
process_flagstat_counts(b, flag_counts->unmapped);
}
}
continue;
}

if (flag_counts != NULL) {
// when we have a target region (as opposed to looping over the whole file),
// `flag_counts` will only contain one (dynamic) array of counts; otherwise
// there will be as many dynamic arrays as references in the BAM header
size_t* counts = (chr != NULL) ? flag_counts->counts[0]
: flag_counts->counts[b->core.tid];
process_flagstat_counts(b, counts);
}

// only take "good" primary alignments for further processing
if (b->core.flag & (NOTPRIMARY | BAM_FQCFAIL | BAM_FDUP)) continue;
char* qname = bam_get_qname(b);
Expand All @@ -194,8 +237,14 @@ void process_bams(
}
int NM = bam_aux2i(tag);
if (errno == EINVAL) {
fprintf(stderr, "Read '%s' contains non-integer 'NM' tag.\n", qname);
exit(EXIT_FAILURE);
// `get_int_aux_val` returns 0 if setting errno, preventing us from
// distinguishing between a non-int tag type and an intentional zero. We'll
// have to check the type ourselves.
char type = *tag++;
if (type != 'i' && type != 'I') {
fprintf(stderr, "Read '%s' contains non-integer 'NM' tag type '%c'\n", qname, type);
exit(EXIT_FAILURE);
}
}
size_t* stats = create_cigar_stats(b);
size_t match, ins, delt;
Expand All @@ -218,26 +267,31 @@ void process_bams(
size_t rstart = b->core.pos;
size_t rend = bam_endpos(b);
size_t aligned_ref_len = rend - rstart;
size_t ref_length = sam_hdr_tid2len(hdr, b->core.tid);
float ref_cover = 100 * ((float)(aligned_ref_len)) / ref_length;
char direction = "+-"[bam_is_rev(b)];

if (sample == NULL) {
fprintf(stdout,
"%s\t%s\t%.4f\t%.4f\t" \
"%s\t%s\t" \
"%.4f\t%.4f\t" \
"%lu\t%lu\t%lu\t%lu\t" \
"%lu\t%c\t%lu\t%u\t%.3f\t" \
"%lu\t%lu\t%lu\t%lu\t%.3f\t%.3f\n",
qname, chr, coverage, ref_cover,
qname, (chr != NULL) ? chr : sam_hdr_tid2name(hdr, b->core.tid),
coverage, ref_cover,
qstart, qend, rstart, rend,
aligned_ref_len, direction, length, read_length, mean_quality,
match, ins, delt, sub, iden, acc);
} else {
fprintf(stdout,
"%s\t%s\t%s\t%.4f\t%.4f\t" \
"%s\t%s\t%s\t" \
"%.4f\t%.4f\t" \
"%lu\t%lu\t%lu\t%lu\t" \
"%lu\t%c\t%lu\t%u\t%.3f\t" \
"%lu\t%lu\t%lu\t%lu\t%.3f\t%.3f\n",
qname, sample, chr, coverage, ref_cover,
qname, sample, (chr != NULL) ? chr : sam_hdr_tid2name(hdr, b->core.tid),
coverage, ref_cover,
qstart, qend, rstart, rend,
aligned_ref_len, direction, length, read_length, mean_quality,
match, ins, delt, sub, iden, acc);
Expand Down
Loading

0 comments on commit b78b02f

Please sign in to comment.