/** Globally replace a char in a char*
 *
 *  The string is modified in place; its length never changes.
 *
 *  @param str NUL-terminated, writable source string (may be NULL)
 *  @param orig original character
 *  @param rep replacement
 *  @returns number of times replacement made; 0 when `str` is NULL or
 *           `orig` is the NUL terminator (nothing is modified in either case)
 *
 */
int replace_char(char *str, char orig, char rep) {
    // strchr() treats the terminating NUL as part of the string, so searching
    // for '\0' would "find" the terminator; overwriting it would unterminate
    // the string and let the scan run past the end of the buffer.
    if (str == NULL || orig == '\0') {
        return 0;
    }

    int n = 0;
    // Resume each search one past the previous hit so every occurrence is
    // visited exactly once, even when `rep` equals `orig`.
    for (char *ix = strchr(str, orig); ix != NULL; ix = strchr(ix + 1, orig)) {
        *ix = rep;
        n++;
    }
    return n;
}
<sys/types.h> #include "writer.h" +#include "common.h" #include "../fastqcomments.h" char* strip_path(char* input) { @@ -73,12 +74,28 @@ void _write_read(writer writer, kseq_t* seq, read_meta meta, void* handle) { static const char* wcomment_fmt = "@%s %s\n%s\n+\n%s\n"; static const char* nocomment_fmt = "@%s\n%s\n+\n%s\n"; - if (writer->reheader && meta->valid) { - (*write)( - handle, reheader_fmt, - seq->name.s, meta->runid, meta->barcode, meta->barcode_alias, - meta->flow_cell_id, meta->start_time, meta->read_number, meta->channel, - seq->seq.s, seq->qual.s); + if (writer->reheader) { + // If we have all items, write individual tags, else just a generic comment tag. + // We could write out the explicit tags we have but a) thats effort b) any + // files from a device will have all these fields c) doesn't clearly indicate + // something went wrong d) allows us to pass on arbitrary information (so + // is probably how we should have done this in the first place). + if (meta->valid) { + (*write)( + handle, reheader_fmt, + seq->name.s, meta->runid, meta->barcode, meta->barcode_alias, + meta->flow_cell_id, meta->start_time, meta->read_number, meta->channel, + seq->seq.s, seq->qual.s); + } + else { + char* buf = xalloc(seq->comment.l + 6, sizeof(char), "Temporary buffer"); + strncpy(buf, "CO:Z:", 5); + strncat(buf, seq->comment.s, seq->comment.l); + // remove tabs to space (because multi-tags are tab delimited), this is fine because the existing comments should only contain space as delimiter (not information). 
+ replace_char(buf, '\t', ' '); + (*write)(handle, wcomment_fmt, seq->name.s, buf, seq->seq.s, seq->qual.s); + free(buf); + } } else if (seq->comment.l > 0) { (*write)(handle, wcomment_fmt, seq->name.s, seq->comment.s, seq->seq.s, seq->qual.s); diff --git a/src/version.h b/src/version.h index cba0579..b7f046c 100644 --- a/src/version.h +++ b/src/version.h @@ -1,2 +1,2 @@ -const char *argp_program_version = "0.8.0"; +const char *argp_program_version = "0.9.0"; diff --git a/test/data/bcMangled.fastq.gz b/test/data/bcMangled.fastq.gz index a235d34..7d2b7ae 100644 Binary files a/test/data/bcMangled.fastq.gz and b/test/data/bcMangled.fastq.gz differ