Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FASTQ_OPT_NAME2 option for parsing of NCBI's SRA data. #1325

Merged
merged 2 commits into from
Sep 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -1035,6 +1035,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) {
strcmp(o->arg, "FASTQ_CASAVA") == 0)
o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;

else if (strcmp(o->arg, "fastq_name2") == 0 ||
strcmp(o->arg, "FASTQ_NAME2") == 0)
o->opt = FASTQ_OPT_NAME2, o->val.i = 1;

else {
hts_log_error("Unknown option '%s'", o->arg);
free(o->arg);
Expand Down Expand Up @@ -1606,6 +1610,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {

case FASTQ_OPT_CASAVA:
case FASTQ_OPT_RNUM:
case FASTQ_OPT_NAME2:
if (fp->format.format == fastq_format ||
fp->format.format == fasta_format)
return fastq_state_set(fp, opt);
Expand Down
5 changes: 5 additions & 0 deletions htslib/hts.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,11 @@ enum hts_fmt_option {
// Two character string.
// Barcode aux tag for CASAVA; defaults to "BC".
FASTQ_OPT_BARCODE,

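// Process SRA and ENA read names which pointlessly move the original
// name to the second field and insert a constructed <run>.<number>
// name in its place.  When this option is set, the second
// whitespace-separated field (if present) is used as the read name.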
FASTQ_OPT_NAME2,
};

// Profile options for encoding; primarily used at present in CRAM
Expand Down
27 changes: 24 additions & 3 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -3597,6 +3597,7 @@ typedef struct {
char BC[3]; // aux tag ID for barcode
khash_t(tag) *tags; // which aux tags to use (if empty, use all).
char nprefix;
int sra_names;
} fastq_state;

// Initialise fastq state.
Expand Down Expand Up @@ -3640,6 +3641,10 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
x->casava = 1;
break;

case FASTQ_OPT_NAME2:
x->sra_names = 1;
break;

case FASTQ_OPT_AUX: {
va_start(args, opt);
x->aux = 1;
Expand Down Expand Up @@ -3710,11 +3715,27 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
}

// Name

if (*x->name.s != x->nprefix)
return -2;

i = 0; l = x->name.l;
char *s = x->name.s;
// Reverse the SRA strangeness of putting the run_name.number before
// the read name.
i = 0;
char *name = x->name.s+1;
if (x->sra_names) {
char *cp = strpbrk(x->name.s, " \t");
if (cp) {
while (*cp == ' ' || *cp == '\t')
cp++;
*--cp = '@';
i = cp - x->name.s;
name = cp+1;
}
}

l = x->name.l;
char *s = x->name.s + i;
while (i < l && !isspace_c(s[i]))
i++;
if (i < l) {
Expand Down Expand Up @@ -3774,7 +3795,7 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {

// Convert to BAM
ret = bam_set1(b,
x->name.l-1, x->name.s+1,
x->name.s + x->name.l - name, name,
flag,
-1, -1, 0, // ref '*', pos, mapq,
0, NULL, // no cigar,
Expand Down
4 changes: 4 additions & 0 deletions test/fastq/fastq.tst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ P r2.sam $tview -i fastq_aux r2.fq
P r1-q.sam $tview -i fastq_aux r1.fa
P r2-q.sam $tview -i fastq_aux r2.fa

# Simple tests for the FASTQ_NAME2 option.
P name2.sam $tview -i fastq_name2 name2.fq
P name2-q.sam $tview -i fastq_name2 name2.fa

# --------------------
# Writing

Expand Down
4 changes: 4 additions & 0 deletions test/fastq/name2-q.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name_001 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG *
name_002 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA *
name_003 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA *
name_004 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA *
8 changes: 8 additions & 0 deletions test/fastq/name2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>ignore_001 name_001
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG
>ignore_002 name_002
TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA
>ignore_003 name_003
ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA
>name_004
GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA
16 changes: 16 additions & 0 deletions test/fastq/name2.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@ignore_001 name_001
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG
+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FG<GGHACEFGH
@ignore_002 name_002
TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA
+
CABEFGFFGFHGGGGJGGFFGKIHHJFIEHHHGIEGGEHJGHDHFGHIGICIJEFIFGIF8GGHKFHGGFEI6GGGFIGHGGIE>EFCFHGGGHEJEAJE
@ignore_003 name_003
ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA
+
BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIG<C>F,CGD66?7EFI5EEG>EGGGGD5=HH6E
@name_004
GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA
+
CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE
4 changes: 4 additions & 0 deletions test/fastq/name2.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name_001 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FG<GGHACEFGH
name_002 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA CABEFGFFGFHGGGGJGGFFGKIHHJFIEHHHGIEGGEHJGHDHFGHIGICIJEFIFGIF8GGHKFHGGFEI6GGGFIGHGGIE>EFCFHGGGHEJEAJE
name_003 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIG<C>F,CGD66?7EFI5EEG>EGGGGD5=HH6E
name_004 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE