Skip to content

Commit

Permalink
Merge branch 'bamstats-add-sample-option' into 'dev'
Browse files Browse the repository at this point in the history
add option to specify sample name to bamstats

See merge request epi2melabs/fastcat!16
  • Loading branch information
julibeg committed Jan 23, 2023
2 parents caafec8 + b7a18a6 commit 816f149
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 49 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.8.0]
### Added
- Option to bamstats to add 'sample_name' column equivalent to fastcat.

## [v0.7.0]
### Added
- Option to report unmapped alignments in per read and summary files.
Expand Down
6 changes: 6 additions & 0 deletions src/bamstats/args.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ static struct argp_option options[] = {
"Genomic region to process."},
{"threads", 't', "THREADS", 0,
"Number of threads for BAM processing."},
{"sample", 's',"SAMPLE NAME", 0,
"Sample name (if given, adds a 'sample_name' column)."},
{"flagstats", 'f', "FLAGSTATS", 0,
"File for outputting alignment flag counts."},
{0, 0, 0, 0,
Expand Down Expand Up @@ -59,6 +61,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
case 'f':
arguments->flagstats = arg;
break;
case 's':
arguments->sample = arg;
break;
case 'u':
arguments->unmapped = true;
break;
Expand Down Expand Up @@ -110,6 +115,7 @@ arguments_t parse_arguments(int argc, char** argv) {
arguments_t args;
args.bam = NULL;
args.flagstats = NULL;
args.sample = NULL;
args.ref = NULL;
args.region = NULL;
args.unmapped = false;
Expand Down
1 change: 1 addition & 0 deletions src/bamstats/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
typedef struct arguments {
const char** bam;
char* flagstats;
char *sample;
char* ref;
char* region;
char* read_group;
Expand Down
61 changes: 40 additions & 21 deletions src/bamstats/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,48 @@
#include "common.h"


void write_header() {
fprintf(stdout,
"name\tref\tcoverage\tref_coverage\t"\
"qstart\tqend\trstart\trend\t"\
"aligned_ref_len\tdirection\tlength\tread_length\tmean_quality\t"\
"match\tins\tdel\tsub\tiden\tacc\n");
void write_header(const char* sample) {
if (sample == NULL) {
fprintf(stdout,
"name\tref\tcoverage\tref_coverage\t"\
"qstart\tqend\trstart\trend\t"\
"aligned_ref_len\tdirection\tlength\tread_length\tmean_quality\t"\
"match\tins\tdel\tsub\tiden\tacc\n");
} else {
fprintf(stdout,
"name\tsample_name\tref\tcoverage\tref_coverage\t"\
"qstart\tqend\trstart\trend\t"\
"aligned_ref_len\tdirection\tlength\tread_length\tmean_quality\t"\
"match\tins\tdel\tsub\tiden\tacc\n");
}
}

// stats array should have 8 entries
// total, primary, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, unused
// note: HTS spec makes a distinction between "unmapped" (flag & 4) and "unplaced". Unplaced
// are not necessarily unmapped but lack definitive coords, this is mainly for paired-end
// but we'll keep the distinction here.
void write_stats_header(FILE* fh) {
fprintf(fh, "ref\ttotal\tprimary\tsecondary\tsupplementary\t\tunmapped\tqcfail\tduplicate\n");
void write_stats_header(FILE* fh, const char* sample) {
if (sample == NULL) {
fprintf(fh, "ref\ttotal\tprimary\tsecondary\tsupplementary\tunmapped\tqcfail\tduplicate\n");
} else {
fprintf(fh, "ref\tsample_name\ttotal\tprimary\tsecondary\tsupplementary\tunmapped\tqcfail\tduplicate\n");
}
}

inline void write_stats(size_t *stats, const char* chr, FILE* fh) {
static inline void write_stats(size_t *stats, const char* chr, const char* sample, FILE* fh) {
if (fh != NULL) {
fprintf(fh,
"%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n",
chr, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6]
);
if (sample == NULL) {
fprintf(fh,
"%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n",
chr, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6]
);
} else {
fprintf(fh,
"%s\t%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n",
chr, sample, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6]
);
}
}
}

Expand Down Expand Up @@ -72,7 +91,7 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "WARNING: Results from multiple files will not be coordinate sorted.\n");
}

write_header();
write_header(args.sample);

htsFile *fp = hts_open(args.bam[0], "rb");
hts_idx_t *idx = sam_index_load(fp, args.bam[0]);
Expand All @@ -91,7 +110,7 @@ int main(int argc, char *argv[]) {
FILE* flagstats = NULL;
if (args.flagstats != NULL) {
flagstats = fopen(args.flagstats, "w");
write_stats_header(flagstats);
write_stats_header(flagstats, args.sample);
}

size_t* flag_counts = xalloc(8, sizeof(size_t), "counts");
Expand All @@ -101,21 +120,21 @@ int main(int argc, char *argv[]) {
const char* chr = sam_hdr_tid2name(hdr, i);
size_t ref_length = (size_t)sam_hdr_tid2len(hdr, i);
process_bams(
fp, idx, hdr,
fp, idx, hdr, args.sample,
chr, 0, ref_length, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, chr, flagstats);
write_stats(flag_counts, chr, args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
}
// Also do unplaced reads
if (args.unmapped) {
process_bams(
fp, idx, hdr,
fp, idx, hdr, args.sample,
"*", 0, INT64_MAX, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, "*", flagstats);
write_stats(flag_counts, "*", args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
}
} else {
Expand All @@ -139,11 +158,11 @@ int main(int argc, char *argv[]) {
size_t ref_length = (size_t)sam_hdr_tid2len(hdr, tid);
end = min(end, ref_length);
process_bams(
fp, idx, hdr,
fp, idx, hdr, args.sample,
chr, start, end, true,
args.read_group, args.tag_name, args.tag_value,
flag_counts, args.unmapped);
write_stats(flag_counts, chr, flagstats);
write_stats(flag_counts, chr, args.sample, flagstats);
memset(flag_counts, 0, 8 * sizeof(size_t));
free(chr);
}
Expand Down
69 changes: 48 additions & 21 deletions src/bamstats/readstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ inline size_t get_query_end(bam1_t* b) {
* @param fp htsFile pointer
* @param idx hts_idx_t pointer
* @param hdr sam_hdr_t pointer
* @param sample sample name.
* @param chr bam target name.
* @param start start position of chr to consider.
* @param end end position of chr to consider.
Expand All @@ -99,7 +100,7 @@ inline size_t get_query_end(bam1_t* b) {
*
*/
void process_bams(
htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr,
htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
const char *read_group, const char tag_name[2], const int tag_value,
size_t *flag_counts, bool unmapped) {
Expand Down Expand Up @@ -153,17 +154,31 @@ void process_bams(
char* qname = bam_get_qname(b);
uint32_t read_length = b->core.l_qseq;
float mean_quality = mean_qual_from_bam(bam_get_qual(b), read_length);
fprintf(stdout,
"%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
if (sample == NULL) {
fprintf(stdout,
"%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
} else {
fprintf(stdout,
"%s\t%s\t*\tnan\tnan\t" \
"nan\tnan\tnan\tnan\t" \
"0\t*\t0\t%u\t%.3f\t" \
"0\t0\t0\t0\tnan\tnan\n",
qname, sample, //chr, coverage, ref_cover,
//qstart, qend, rstart, rend,
//aligned_ref_len, direction, length,
read_length, mean_quality
//match, ins, delt, sub, iden, acc
);
}
continue;
}

Expand Down Expand Up @@ -206,15 +221,27 @@ void process_bams(
float ref_cover = 100 * ((float)(aligned_ref_len)) / ref_length;
char direction = "+-"[bam_is_rev(b)];

fprintf(stdout,
"%s\t%s\t%.4f\t%.4f\t" \
"%lu\t%lu\t%lu\t%lu\t" \
"%lu\t%c\t%lu\t%u\t%.3f\t" \
"%lu\t%lu\t%lu\t%lu\t%.3f\t%.3f\n",
qname, chr, coverage, ref_cover,
qstart, qend, rstart, rend,
aligned_ref_len, direction, length, read_length, mean_quality,
match, ins, delt, sub, iden, acc);
if (sample == NULL) {
fprintf(stdout,
"%s\t%s\t%.4f\t%.4f\t" \
"%lu\t%lu\t%lu\t%lu\t" \
"%lu\t%c\t%lu\t%u\t%.3f\t" \
"%lu\t%lu\t%lu\t%lu\t%.3f\t%.3f\n",
qname, chr, coverage, ref_cover,
qstart, qend, rstart, rend,
aligned_ref_len, direction, length, read_length, mean_quality,
match, ins, delt, sub, iden, acc);
} else {
fprintf(stdout,
"%s\t%s\t%s\t%.4f\t%.4f\t" \
"%lu\t%lu\t%lu\t%lu\t" \
"%lu\t%c\t%lu\t%u\t%.3f\t" \
"%lu\t%lu\t%lu\t%lu\t%.3f\t%.3f\n",
qname, sample, chr, coverage, ref_cover,
qstart, qend, rstart, rend,
aligned_ref_len, direction, length, read_length, mean_quality,
match, ins, delt, sub, iden, acc);
}
free(stats);
}

Expand Down
11 changes: 6 additions & 5 deletions src/bamstats/readstats.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,24 @@

/** Generates alignment stats from a region of a bam.
*
* @param fps htsFile pointer
* @param idxs hts_idx_t pointer
* @param hdrs sam_hdr_t pointer
* @param fp htsFile pointer
* @param idx hts_idx_t pointer
* @param hdr sam_hdr_t pointer
* @param sample sample name.
* @param chr bam target name.
* @param start start position of chr to consider.
* @param end end position of chr to consider.
* @param overlap_start whether reads overhanging start should be included.
* @param read_group by which to filter alignments.
* @param tag_name by which to filter alignments.
* @param tag_value associated with tag_name.
* @param flag_counts size_t flag_counts[8] for output.
* @param flag_counts size_t flag_counts[8] for output, will be cleared before use.
* @param unmapped bool include unmapped reads in output.
* @returns void. Prints output to stdout.
*
*/
void process_bams(
htsFile *fps, hts_idx_t *idxs, sam_hdr_t *hdrs,
htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
const char *read_group, const char tag_name[2], const int tag_value,
size_t *flag_counts, bool unmapped);
Expand Down
2 changes: 1 addition & 1 deletion src/fastcat/args.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ static struct argp_option options[] = {
{"file", 'f', "FILE SUMMARY", 0,
"Per-file summary output"},
{"sample", 's',"SAMPLE NAME", 0,
"Sample name (if given adds a 'sample_name' column)."},
"Sample name (if given, adds a 'sample_name' column)."},
{"demultiplex", 'd', "OUT DIR", 0,
"Separate barcoded samples using fastq header information. Option value is top-level output directory."},
{"min_length", 'a', "MIN READ LENGTH", 0,
Expand Down
2 changes: 1 addition & 1 deletion src/version.h
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

const char *argp_program_version = "0.7.0";
const char *argp_program_version = "0.8.0";

0 comments on commit 816f149

Please sign in to comment.