diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f66329..5f9256b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # NAF Changelog +## 1.2.0 - 2020-09-01 +- Added `--sequences` option to _unnaf_. +- Added `--binary-stdout` option to _unnaf_. +- Added `--binary-stderr` option to both _ennaf_ and _unnaf_. +- Updated zstd to v1.4.5. +- Improved compatibility with MinGW. + ## 1.1.0 - 2019-10-01 - Added support for RNA, protein and text sequences, enabled with `--rna`, `--protein` and `--text` switches. - Added report for number of unknown characters at the end of compression. diff --git a/Compress.md b/Compress.md index 8791993..dc1853d 100644 --- a/Compress.md +++ b/Compress.md @@ -78,6 +78,8 @@ where they would be otherwise not created due to data being small. **--no-mask** - Don't store sequence mask (lower/upper characters). Converts the sequences to upper case before compression. +**--binary-stderr** - Set stderr stream to binary mode. Mainly useful for running test suite on Windows. + **-h**, **--help** - Show usage help. **-V**, **--version** - Show version. @@ -195,4 +197,4 @@ you have to switch to text mode (`--text`). Since both `--dna` and `--text` modes can be used for DNA data, which is better? Short answer: `--dna` is faster and has stronger compression. -For details, see this benchmark page. +For details, see [this benchmark page](http://kirill-kryukov.com/study/naf/benchmark-text-vs-dna-Spur.html). diff --git a/Decompress.md b/Decompress.md index 8b8681a..041a925 100644 --- a/Decompress.md +++ b/Decompress.md @@ -23,7 +23,9 @@ Only one of these options should be specified: **--fastq** - FASTQ format. Will fail if input has no qualities. -**--seq** - All sequences concatenated into one, without names or line breaks. +**--sequences** - One sequence per line, without names or qualities. + +**--seq** - All sequences concatenated into one, without names, qualities, or line breaks. **--number** - Number of sequences. 
@@ -59,9 +61,13 @@ Only one of these options should be specified: **--line-length N** - Divide sequences into lines of N bp, ignoring line length stored in the NAF file. Effective only for `--fasta` output. Line length of 0 means unlimited lines, i.e., each sequence printed in single line. -**--no-mask** - Ignore mask, useful only for `--fasta` and `--seq` outputs. +**--no-mask** - Ignore mask, useful only for `--fasta`, `--sequences` and `--seq` outputs. Supported only for DNA and RNA sequences. +**--binary-stderr** - Set stderr stream to binary mode. Mainly useful for running test suite on Windows. + +**--binary-stdout** - Set stdout stream to binary mode. Useful for piping decompressed sequences to md5sum on Windows. + **-h**, **--help** - Show usage help. **-V**, **--version** - Show version. diff --git a/LICENSE b/LICENSE index 57518b7..22ac5c2 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2018-2019 Kirill Kryukov +Copyright (c) 2018-2020 Kirill Kryukov This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/README.md b/README.md index 004a579..597b95b 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ It's based on [zstd](http://www.zstd.net/), and features strong compression and It can store DNA, RNA, protein or text sequences, with or without qualities. It supports FASTA and FASTQ-formatted sequences, ambiguous IUPAC codes, masked sequence, and has no limit on sequence length or number of sequences. +It supports Unix pipes which allows easy integration into pipelines. See [NAF homepage](http://kirill-kryukov.com/study/naf/) for details. | Example benchmark: SILVA 132 LSURef database (610 MB): | @@ -12,6 +13,11 @@ See [NAF homepage](http://kirill-kryukov.com/study/naf/) for details. | | | From [Sequence Compression Benchmark](http://kirr.dyndns.org/sequence-compression-benchmark/) project - visit for details and more benchmarks. 
| +More examples: +* [Compactness on DNA data](http://kirr.dyndns.org/sequence-compression-benchmark/?d=Mitochondrion+%28245+MB%29&d=Influenza+%281.22+GB%29&d=Helicobacter+%282.76+GB%29&doagg=1&agg=average&cs=1&cg=1&com=yes&src=all&nt=4&only-best=1&bn=1&bm=ratio&sm=same&tn=10&bs=100&rr=gzip-9&tm0=name&tm1=size&tm2=ratio&tm3=ctime&tm4=dtime&tm5=cdtime&tm6=tdtime&tm7=empty&gm=same&cyl=lin&ccw=1500&cch=500&sxm=ratio&sxmin=0&sxmax=0&sxl=lin&sym=dspeed&symin=0&symax=0&syl=lin&button=Show+column+chart) +* [Compactness vs decompression speed, on human genome](http://kirr.dyndns.org/sequence-compression-benchmark/?d=Homo+sapiens+GCA_000001405.28+(3.31+GB)&doagg=1&agg=sum&cs=1&cg=1&com=yes&src=all&nt=4&bn=1&bm=tdspeed&sm=same&tn=10&bs=100&rr=gzip-9&tm0=name&tm1=size&tm2=ratio&tm3=ctime&tm4=dtime&tm5=cdtime&tm6=tdtime&tm7=empty&gm=same&cyl=lin&ccw=1500&cch=500&sxm=ratio&sxmin=0&sxmax=0&sxl=lin&sym=dspeed&symin=0&symax=0&syl=lin&button=Show+scatterplot) + + ## Format specification NAF specification is in public domain: [NAFv2.pdf](NAFv2.pdf) @@ -78,4 +84,9 @@ If you use NAF, please cite: [Bioinformatics, 35(19), 3826-3828](https://academic.oup.com/bioinformatics/article/35/19/3826/5364265), doi: [10.1093/bioinformatics/btz144](https://doi.org/10.1093/bioinformatics/btz144). -Previous preprint: bioRxiv 501130; http://biorxiv.org/cgi/content/short/501130v2, doi: [10.1101/501130](https://doi.org/10.1101/501130). +For compressor benchmark, please cite: + + * Kirill Kryukov, Mahoko Takahashi Ueda, So Nakagawa, Tadashi Imanishi (2020) +**"Sequence Compression Benchmark (SCB) database — A comprehensive evaluation of reference-free compressors for FASTA-formatted sequences"** +[GigaScience, 9(7), giaa072](https://academic.oup.com/gigascience/article/9/7/giaa072/5867695), +doi: [10.1093/gigascience/giaa072](https://doi.org/10.1093/gigascience/giaa072). 
diff --git a/ennaf/src/compressor.c b/ennaf/src/compressor.c index 28d0332..c9fcaca 100644 --- a/ennaf/src/compressor.c +++ b/ennaf/src/compressor.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/encoders.c b/ennaf/src/encoders.c index 2c3e233..a69632f 100644 --- a/ennaf/src/encoders.c +++ b/ennaf/src/encoders.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/encoders.h b/ennaf/src/encoders.h index f865b9f..3db35df 100644 --- a/ennaf/src/encoders.h +++ b/ennaf/src/encoders.h @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/ennaf.c b/ennaf/src/ennaf.c index 07b954a..e498b3d 100644 --- a/ennaf/src/ennaf.c +++ b/ennaf/src/ennaf.c @@ -1,12 +1,12 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ -#define VERSION "1.1.0" -#define DATE "2019-10-01" -#define COPYRIGHT_YEARS "2018-2019" +#define VERSION "1.2.0" +#define DATE "2020-09-01" +#define COPYRIGHT_YEARS "2018-2020" #include "platform.h" #include "encoders.h" @@ -18,6 +18,7 @@ static const unsigned char naf_magic_number[3] = { 0x01u, 0xF9u, 0xECu }; static bool verbose = false; +static bool binary_stderr = false; static bool keep_temp_files = false; static bool no_mask = false; @@ -351,6 +352,7 @@ static void parse_command_line(int argc, char **argv) if (!strcmp(argv[i], "--help")) { show_help(); exit(0); } if (!strcmp(argv[i], "--version")) { print_version = true; continue; } if (!strcmp(argv[i], "--verbose")) { verbose = true; continue; } + if 
(!strcmp(argv[i], "--binary-stderr")) { if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; } if (!strcmp(argv[i], "--keep-temp-files")) { keep_temp_files = true; continue; } if (!strcmp(argv[i], "--no-mask")) { no_mask = true; continue; } if (!strcmp(argv[i], "--fasta")) { set_input_format_from_command_line("fasta"); continue; } @@ -518,7 +520,7 @@ int main(int argc, char **argv) fputc_or_die(' ', OUT); unsigned long long out_line_length = line_length_is_specified ? requested_line_length : longest_line_length; - if (verbose) { msg("Output line length: %" PRINT_ULL "\n", out_line_length); } + if (verbose) { msg("Output line length: %llu\n", out_line_length); } write_variable_length_encoded_number(OUT, out_line_length); write_variable_length_encoded_number(OUT, n_sequences); @@ -558,7 +560,7 @@ int main(int argc, char **argv) if (!assume_well_formed_input) { report_unexpected_input_char_stats(); } - if (verbose) { msg("Processed %" PRINT_ULL " sequences\n", n_sequences); } + if (verbose) { msg("Processed %llu sequences\n", n_sequences); } success = true; return 0; diff --git a/ennaf/src/files.c b/ennaf/src/files.c index db463bd..2230715 100644 --- a/ennaf/src/files.c +++ b/ennaf/src/files.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -28,6 +28,16 @@ static void open_input_file(void) } +static void change_stderr_to_binary(void) +{ +#ifdef __MINGW32__ + if (_setmode(_fileno(stderr), O_BINARY) == -1) { die("can't set error stream to binary mode\n"); } +#else + if (!freopen(NULL, "wb", stderr)) { die("can't set error stream to binary mode\n"); } +#endif +} + + static void open_output_file(void) { assert(OUT == NULL); diff --git a/ennaf/src/platform.h b/ennaf/src/platform.h index 45275ef..7f9d8e6 100644 --- a/ennaf/src/platform.h +++ b/ennaf/src/platform.h @@ -1,3 +1,8 @@ +/* + * NAF compressor + * 
Copyright (c) 2018-2020 Kirill Kryukov + * See README.md and LICENSE files of this repository + */ #ifndef ENNAF_PLATFORM_H #define ENNAF_PLATFORM_H @@ -5,14 +10,16 @@ #define NDEBUG +#define __USE_MINGW_ANSI_STDIO 1 + #include +#include #include #include #include #include #include #include -#include #include #include @@ -25,16 +32,6 @@ -#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_WIN32) || defined(_WIN64) || defined(WIN32) || defined(WIN64) -#define PRINT_ULL "I64u" -#define PRINT_SIZE_T "Iu" -#else -#define PRINT_ULL "llu" -#define PRINT_SIZE_T "zu" -#endif - - - #if defined(__MINGW32__) || defined(__MINGW64__) #define HAVE_NO_CHMOD #define HAVE_NO_CHOWN diff --git a/ennaf/src/process.c b/ennaf/src/process.c index 4a7a098..c1aeab7 100644 --- a/ennaf/src/process.c +++ b/ennaf/src/process.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository * * The FASTA/Q parser was originally based on Heng Li's kseq.h. 
@@ -78,11 +78,11 @@ static void report_unexpected_char_stats(unsigned long long *n, const char *seq_ for (unsigned i = 0; i < 257; i++) { total += n[i]; } if (total > 0) { - msg("input has %" PRINT_ULL " unexpected %s characters:\n", total, seq_type_name); - for (unsigned i = 0; i < 32; i++) { if (n[i] != 0) { msg(" '\\x%02X': %" PRINT_ULL "\n", i, n[i]); } } - for (unsigned i = 32; i < 127; i++) { if (n[i] != 0) { msg(" '%c': %" PRINT_ULL "\n", (unsigned char)i, n[i]); } } - for (unsigned i = 127; i < 256; i++) { if (n[i] != 0) { msg(" '\\x%02X': %" PRINT_ULL "\n", i, n[i]); } } - if (n[256] != 0) { msg(" EOF: %" PRINT_ULL "\n", n[256]); } + msg("input has %llu unexpected %s characters:\n", total, seq_type_name); + for (unsigned i = 0; i < 32; i++) { if (n[i] != 0) { msg(" '\\x%02X': %llu\n", i, n[i]); } } + for (unsigned i = 32; i < 127; i++) { if (n[i] != 0) { msg(" '%c': %llu\n", (unsigned char)i, n[i]); } } + for (unsigned i = 127; i < 256; i++) { if (n[i] != 0) { msg(" '\\x%02X': %llu\n", i, n[i]); } } + if (n[256] != 0) { msg(" EOF: %llu\n", n[256]); } } } @@ -101,7 +101,7 @@ static void unexpected_id_char(unsigned c) { if (abort_on_unexpected_code) { - die("unexpected character '%c' in ID of sequence %" PRINT_ULL "\n", (unsigned char)c, n_sequences + 1); + die("unexpected character '%c' in ID of sequence %llu\n", (unsigned char)c, n_sequences + 1); } else { n_unexpected_id_characters[c]++; } } @@ -112,7 +112,7 @@ static void unexpected_comment_char(unsigned c) { if (abort_on_unexpected_code) { - die("unexpected character '%c' in comment of sequence %" PRINT_ULL "\n", (unsigned char)c, n_sequences + 1); + die("unexpected character '%c' in comment of sequence %llu\n", (unsigned char)c, n_sequences + 1); } else { n_unexpected_comment_characters[c]++; } } @@ -123,7 +123,7 @@ static void unexpected_input_char(unsigned c) { if (abort_on_unexpected_code) { - die("unexpected %s code '%c' in sequence %" PRINT_ULL "\n", in_seq_type_name, (unsigned char)c, n_sequences 
+ 1); + die("unexpected %s code '%c' in sequence %llu\n", in_seq_type_name, (unsigned char)c, n_sequences + 1); } else { n_unexpected_seq_characters[c]++; } } @@ -134,7 +134,7 @@ static void unexpected_quality_char(unsigned c) { if (abort_on_unexpected_code) { - die("unexpected quality code '%c' in sequence %" PRINT_ULL "\n", (unsigned char)c, n_sequences + 1); + die("unexpected quality code '%c' in sequence %llu\n", (unsigned char)c, n_sequences + 1); } else { n_unexpected_qual_characters[c]++; } } @@ -438,7 +438,7 @@ static void process_well_formed_fastq(void) c = in_get_until_specific_char('\n', &qual); if (QUAL.uncompressed_size + qual.length - old_len != read_length) { - die("quality length of sequence %" PRINT_ULL " doesn't match sequence length\n", n_sequences + 1); + die("quality length of sequence %llu doesn't match sequence length\n", n_sequences + 1); } add_length(read_length); @@ -491,7 +491,7 @@ static void process_non_well_formed_fastq(void) do { c = in_get_char(); } while (is_eol_arr[c]); if (c == INEOF) { die("truncated FASTQ input: last sequence has no quality\n"); } - if (c != '+') { die("invalid FASTQ input: can't find '+' line of sequence %" PRINT_ULL "\n", n_sequences + 1); } + if (c != '+') { die("invalid FASTQ input: can't find '+' line of sequence %llu\n", n_sequences + 1); } c = in_skip_until(is_eol_arr); if (c == INEOF) { die("truncated FASTQ input: last sequence has no quality\n"); } @@ -510,7 +510,7 @@ static void process_non_well_formed_fastq(void) unsigned long long qual_length = QUAL.uncompressed_size + qual.length - old_len; if (qual_length != read_length) { - die("quality length of sequence %" PRINT_ULL " (%" PRINT_ULL ") doesn't match sequence length (%" PRINT_ULL ")\n", + die("quality length of sequence %llu (%llu) doesn't match sequence length (%llu)\n", n_sequences + 1, qual_length, read_length); } @@ -519,7 +519,7 @@ static void process_non_well_formed_fastq(void) do { c = in_get_char(); } while (is_eol_arr[c]); if (c == INEOF) 
{ break; } - if (c != '@') { die("invalid FASTQ input: Can't find '@' after sequence %" PRINT_ULL "\n", n_sequences); } + if (c != '@') { die("invalid FASTQ input: Can't find '@' after sequence %llu\n", n_sequences); } } } diff --git a/ennaf/src/tables.c b/ennaf/src/tables.c index f9b9519..324f6d9 100644 --- a/ennaf/src/tables.c +++ b/ennaf/src/tables.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/utils.c b/ennaf/src/utils.c index e1870d3..23371a7 100644 --- a/ennaf/src/utils.c +++ b/ennaf/src/utils.c @@ -1,11 +1,11 @@ /* * NAF compressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) static void msg(const char *format, ...) { va_list argptr; @@ -16,7 +16,7 @@ static void msg(const char *format, ...) __attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) static void warn(const char *format, ...) { fputs("ennaf warning: ", stderr); @@ -28,7 +28,7 @@ static void warn(const char *format, ...) __attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) static void err(const char *format, ...) { fputs("ennaf error: ", stderr); @@ -40,7 +40,7 @@ static void err(const char *format, ...) __attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) __attribute__ ((noreturn)) static void die(const char *format, ...) 
{ @@ -57,7 +57,7 @@ __attribute__ ((cold)) __attribute__ ((noreturn)) static void out_of_memory(const size_t size) { - die("can't allocate %" PRINT_SIZE_T " bytes\n", size); + die("can't allocate %zu bytes\n", size); } diff --git a/tests/alphabet/a-sequences-dna-masked.e.err-ref b/tests/alphabet/a-sequences-dna-masked.e.err-ref new file mode 100644 index 0000000..a9d328a --- /dev/null +++ b/tests/alphabet/a-sequences-dna-masked.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected DNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'U': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 'u': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + 
'\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-dna-masked.out-ref b/tests/alphabet/a-sequences-dna-masked.out-ref new file mode 100644 index 0000000..35d1c6e --- /dev/null +++ b/tests/alphabet/a-sequences-dna-masked.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSTNVWNYNNNNNNNabcdNNghNNkNmnNNNrstNvwNyNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-dna-masked.test b/tests/alphabet/a-sequences-dna-masked.test new file mode 100644 index 0000000..7af0f02 --- /dev/null +++ b/tests/alphabet/a-sequences-dna-masked.test @@ -0,0 +1 @@ +ennaf --dna {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-dna-masked.u.err-ref b/tests/alphabet/a-sequences-dna-masked.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-dna-no-mask-1.e.err-ref 
b/tests/alphabet/a-sequences-dna-no-mask-1.e.err-ref new file mode 100644 index 0000000..a9d328a --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-1.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected DNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'U': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 'u': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + 
'\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-dna-no-mask-1.out-ref b/tests/alphabet/a-sequences-dna-no-mask-1.out-ref new file mode 100644 index 0000000..5ceb72f --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-1.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSTNVWNYNNNNNNNABCDNNGHNNKNMNNNNRSTNVWNYNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-dna-no-mask-1.test b/tests/alphabet/a-sequences-dna-no-mask-1.test new file mode 100644 index 0000000..84b040b --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-1.test @@ -0,0 +1 @@ +ennaf --dna --no-mask {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-dna-no-mask-1.u.err-ref b/tests/alphabet/a-sequences-dna-no-mask-1.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-dna-no-mask-2.e.err-ref b/tests/alphabet/a-sequences-dna-no-mask-2.e.err-ref new file mode 100644 index 0000000..a9d328a --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-2.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected DNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 
1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'U': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 'u': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + 
'\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-dna-no-mask-2.out-ref b/tests/alphabet/a-sequences-dna-no-mask-2.out-ref new file mode 100644 index 0000000..5ceb72f --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-2.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSTNVWNYNNNNNNNABCDNNGHNNKNMNNNNRSTNVWNYNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-dna-no-mask-2.test b/tests/alphabet/a-sequences-dna-no-mask-2.test new file mode 100644 index 0000000..76e7d0b --- /dev/null +++ b/tests/alphabet/a-sequences-dna-no-mask-2.test @@ -0,0 +1 @@ +ennaf --dna {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences --no-mask >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-dna-no-mask-2.u.err-ref b/tests/alphabet/a-sequences-dna-no-mask-2.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-protein-masked.e.err-ref b/tests/alphabet/a-sequences-protein-masked.e.err-ref new file mode 100644 index 0000000..b96f148 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-masked.e.err-ref @@ -0,0 +1,197 @@ +input has 196 unexpected protein characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + 
'"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-protein-masked.out-ref 
b/tests/alphabet/a-sequences-protein-masked.out-ref new file mode 100644 index 0000000..2980cd7 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-masked.out-ref @@ -0,0 +1 @@ +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX*XX-XXXXXXXXXXXXXXXXXXXABCDEFGHIJKLMNOPQRSTUVWXYZXXXXXXabcdefghijklmnopqrstuvwxyzXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/tests/alphabet/a-sequences-protein-masked.test b/tests/alphabet/a-sequences-protein-masked.test new file mode 100644 index 0000000..0e5a11f --- /dev/null +++ b/tests/alphabet/a-sequences-protein-masked.test @@ -0,0 +1 @@ +ennaf --protein {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-protein-masked.u.err-ref b/tests/alphabet/a-sequences-protein-masked.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-protein-no-mask-1.e.err-ref b/tests/alphabet/a-sequences-protein-no-mask-1.e.err-ref new file mode 100644 index 0000000..b96f148 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-1.e.err-ref @@ -0,0 +1,197 @@ +input has 196 unexpected protein characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + 
'\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-protein-no-mask-1.out-ref b/tests/alphabet/a-sequences-protein-no-mask-1.out-ref new file mode 100644 index 0000000..2b2cee0 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-1.out-ref @@ -0,0 +1 @@ +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX*XX-XXXXXXXXXXXXXXXXXXXABCDEFGHIJKLMNOPQRSTUVWXYZXXXXXXABCDEFGHIJKLMNOPQRSTUVWXYZXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git 
a/tests/alphabet/a-sequences-protein-no-mask-1.test b/tests/alphabet/a-sequences-protein-no-mask-1.test new file mode 100644 index 0000000..8dd00b4 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-1.test @@ -0,0 +1 @@ +ennaf --protein --no-mask {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-protein-no-mask-1.u.err-ref b/tests/alphabet/a-sequences-protein-no-mask-1.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-protein-no-mask-2.e.err-ref b/tests/alphabet/a-sequences-protein-no-mask-2.e.err-ref new file mode 100644 index 0000000..b96f148 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-2.e.err-ref @@ -0,0 +1,197 @@ +input has 196 unexpected protein characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + 
'\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-protein-no-mask-2.out-ref b/tests/alphabet/a-sequences-protein-no-mask-2.out-ref new file mode 100644 index 0000000..2b2cee0 --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-2.out-ref @@ -0,0 +1 @@ +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX*XX-XXXXXXXXXXXXXXXXXXXABCDEFGHIJKLMNOPQRSTUVWXYZXXXXXXABCDEFGHIJKLMNOPQRSTUVWXYZXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/tests/alphabet/a-sequences-protein-no-mask-2.test b/tests/alphabet/a-sequences-protein-no-mask-2.test new file mode 100644 index 0000000..702547d --- /dev/null +++ b/tests/alphabet/a-sequences-protein-no-mask-2.test @@ -0,0 +1 @@ +ennaf --protein {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences --no-mask >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-protein-no-mask-2.u.err-ref 
b/tests/alphabet/a-sequences-protein-no-mask-2.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-rna-masked.e.err-ref b/tests/alphabet/a-sequences-rna-masked.e.err-ref new file mode 100644 index 0000000..9a2e509 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-masked.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected RNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'T': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 't': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + 
'\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-rna-masked.out-ref b/tests/alphabet/a-sequences-rna-masked.out-ref new file mode 100644 index 0000000..0c7c180 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-masked.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSNUVWNYNNNNNNNabcdNNghNNkNmnNNNrsNuvwNyNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-rna-masked.test b/tests/alphabet/a-sequences-rna-masked.test new file mode 100644 index 0000000..2b56e63 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-masked.test @@ -0,0 +1 @@ +ennaf --rna {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-rna-masked.u.err-ref b/tests/alphabet/a-sequences-rna-masked.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-rna-no-mask-1.e.err-ref b/tests/alphabet/a-sequences-rna-no-mask-1.e.err-ref new file mode 100644 index 0000000..9a2e509 --- /dev/null +++ 
b/tests/alphabet/a-sequences-rna-no-mask-1.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected RNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'T': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 't': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 
+ '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + '\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-rna-no-mask-1.out-ref b/tests/alphabet/a-sequences-rna-no-mask-1.out-ref new file mode 100644 index 0000000..1c4a009 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-no-mask-1.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSNUVWNYNNNNNNNABCDNNGHNNKNMNNNNRSNUVWNYNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-rna-no-mask-1.test b/tests/alphabet/a-sequences-rna-no-mask-1.test new file mode 100644 index 0000000..4702656 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-no-mask-1.test @@ -0,0 +1 @@ +ennaf --rna --no-mask {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-rna-no-mask-1.u.err-ref b/tests/alphabet/a-sequences-rna-no-mask-1.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-rna-no-mask-2.e.err-ref b/tests/alphabet/a-sequences-rna-no-mask-2.e.err-ref new file mode 100644 index 0000000..9a2e509 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-no-mask-2.e.err-ref @@ -0,0 +1,220 @@ +input has 219 unexpected RNA characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + 
'\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '!': 1 + '"': 1 + '#': 1 + '$': 1 + '%': 1 + '&': 1 + ''': 1 + '(': 1 + ')': 1 + '*': 1 + '+': 1 + ',': 1 + '.': 1 + '/': 1 + '0': 1 + '1': 1 + '2': 1 + '3': 1 + '4': 1 + '5': 1 + '6': 1 + '7': 1 + '8': 1 + '9': 1 + ':': 1 + ';': 1 + '<': 1 + '=': 1 + '>': 1 + '?': 1 + '@': 1 + 'E': 1 + 'F': 1 + 'I': 1 + 'J': 1 + 'L': 1 + 'O': 1 + 'P': 1 + 'Q': 1 + 'T': 1 + 'X': 1 + 'Z': 1 + '[': 1 + '\': 1 + ']': 1 + '^': 1 + '_': 1 + '`': 1 + 'e': 1 + 'f': 1 + 'i': 1 + 'j': 1 + 'l': 1 + 'o': 1 + 'p': 1 + 'q': 1 + 't': 1 + 'x': 1 + 'z': 1 + '{': 1 + '|': 1 + '}': 1 + '~': 1 + '\x7F': 1 + '\x80': 1 + '\x81': 1 + '\x82': 1 + '\x83': 1 + '\x84': 1 + '\x85': 1 + '\x86': 1 + '\x87': 1 + '\x88': 1 + '\x89': 1 + '\x8A': 1 + '\x8B': 1 + '\x8C': 1 + '\x8D': 1 + '\x8E': 1 + '\x8F': 1 + '\x90': 1 + '\x91': 1 + '\x92': 1 + '\x93': 1 + '\x94': 1 + '\x95': 1 + '\x96': 1 + '\x97': 1 + '\x98': 1 + '\x99': 1 + '\x9A': 1 + '\x9B': 1 + '\x9C': 1 + '\x9D': 1 + '\x9E': 1 + '\x9F': 1 + '\xA0': 1 + '\xA1': 1 + '\xA2': 1 + '\xA3': 1 + '\xA4': 1 + '\xA5': 1 + '\xA6': 1 + '\xA7': 1 + '\xA8': 1 + '\xA9': 1 + '\xAA': 1 + '\xAB': 1 + '\xAC': 1 + '\xAD': 1 + '\xAE': 1 + '\xAF': 1 + '\xB0': 1 + '\xB1': 1 + '\xB2': 1 + '\xB3': 1 + '\xB4': 1 + '\xB5': 1 + '\xB6': 1 + '\xB7': 1 + '\xB8': 1 + '\xB9': 1 + '\xBA': 1 + '\xBB': 1 + '\xBC': 1 + '\xBD': 1 + '\xBE': 1 + '\xBF': 1 + '\xC0': 1 + '\xC1': 1 + '\xC2': 1 + '\xC3': 1 + '\xC4': 1 + '\xC5': 1 + '\xC6': 1 + '\xC7': 1 + '\xC8': 1 + '\xC9': 1 + '\xCA': 1 + '\xCB': 1 + '\xCC': 1 + '\xCD': 1 + '\xCE': 1 + '\xCF': 1 + '\xD0': 1 + '\xD1': 1 + '\xD2': 1 + '\xD3': 1 + '\xD4': 1 + '\xD5': 1 + '\xD6': 1 + '\xD7': 1 + '\xD8': 1 + '\xD9': 1 + '\xDA': 1 + '\xDB': 1 + '\xDC': 1 + '\xDD': 1 + '\xDE': 1 + '\xDF': 1 + '\xE0': 1 + '\xE1': 1 + '\xE2': 1 + '\xE3': 1 + '\xE4': 1 + '\xE5': 1 + '\xE6': 1 + '\xE7': 1 + '\xE8': 1 + 
'\xE9': 1 + '\xEA': 1 + '\xEB': 1 + '\xEC': 1 + '\xED': 1 + '\xEE': 1 + '\xEF': 1 + '\xF0': 1 + '\xF1': 1 + '\xF2': 1 + '\xF3': 1 + '\xF4': 1 + '\xF5': 1 + '\xF6': 1 + '\xF7': 1 + '\xF8': 1 + '\xF9': 1 + '\xFA': 1 + '\xFB': 1 + '\xFC': 1 + '\xFD': 1 + '\xFE': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-rna-no-mask-2.out-ref b/tests/alphabet/a-sequences-rna-no-mask-2.out-ref new file mode 100644 index 0000000..1c4a009 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-no-mask-2.out-ref @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-NNNNNNNNNNNNNNNNNNNABCDNNGHNNKNMNNNNRSNUVWNYNNNNNNNABCDNNGHNNKNMNNNNRSNUVWNYNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/alphabet/a-sequences-rna-no-mask-2.test b/tests/alphabet/a-sequences-rna-no-mask-2.test new file mode 100644 index 0000000..49889a6 --- /dev/null +++ b/tests/alphabet/a-sequences-rna-no-mask-2.test @@ -0,0 +1 @@ +ennaf --rna {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences --no-mask >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-rna-no-mask-2.u.err-ref b/tests/alphabet/a-sequences-rna-no-mask-2.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-text-masked.e.err-ref b/tests/alphabet/a-sequences-text-masked.e.err-ref new file mode 100644 index 0000000..a74b248 --- /dev/null +++ b/tests/alphabet/a-sequences-text-masked.e.err-ref @@ -0,0 +1,30 @@ +input has 29 unexpected text characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '\x7F': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-text-masked.out-ref b/tests/alphabet/a-sequences-text-masked.out-ref new 
file mode 100644 index 0000000..4e77131 --- /dev/null +++ b/tests/alphabet/a-sequences-text-masked.out-ref @@ -0,0 +1 @@ +???????????????????????????!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~?�������������������������������������������������������������������������������������������������������������������������������? diff --git a/tests/alphabet/a-sequences-text-masked.test b/tests/alphabet/a-sequences-text-masked.test new file mode 100644 index 0000000..a32be04 --- /dev/null +++ b/tests/alphabet/a-sequences-text-masked.test @@ -0,0 +1 @@ +ennaf --text {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-text-masked.u.err-ref b/tests/alphabet/a-sequences-text-masked.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-text-no-mask-1.e.err-ref b/tests/alphabet/a-sequences-text-no-mask-1.e.err-ref new file mode 100644 index 0000000..a74b248 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-1.e.err-ref @@ -0,0 +1,30 @@ +input has 29 unexpected text characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '\x7F': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-text-no-mask-1.out-ref b/tests/alphabet/a-sequences-text-no-mask-1.out-ref new file mode 100644 index 0000000..18aa1b9 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-1.out-ref @@ -0,0 +1 @@ +???????????????????????????!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~?�������������������������������������������������������������������������������������������������������������������������������? 
diff --git a/tests/alphabet/a-sequences-text-no-mask-1.test b/tests/alphabet/a-sequences-text-no-mask-1.test new file mode 100644 index 0000000..e44a493 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-1.test @@ -0,0 +1 @@ +ennaf --text --no-mask {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-text-no-mask-1.u.err-ref b/tests/alphabet/a-sequences-text-no-mask-1.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/alphabet/a-sequences-text-no-mask-2.e.err-ref b/tests/alphabet/a-sequences-text-no-mask-2.e.err-ref new file mode 100644 index 0000000..a74b248 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-2.e.err-ref @@ -0,0 +1,30 @@ +input has 29 unexpected text characters: + '\x00': 1 + '\x01': 1 + '\x02': 1 + '\x03': 1 + '\x04': 1 + '\x05': 1 + '\x06': 1 + '\x07': 1 + '\x08': 1 + '\x0E': 1 + '\x0F': 1 + '\x10': 1 + '\x11': 1 + '\x12': 1 + '\x13': 1 + '\x14': 1 + '\x15': 1 + '\x16': 1 + '\x17': 1 + '\x18': 1 + '\x19': 1 + '\x1A': 1 + '\x1B': 1 + '\x1C': 1 + '\x1D': 1 + '\x1E': 1 + '\x1F': 1 + '\x7F': 1 + '\xFF': 1 diff --git a/tests/alphabet/a-sequences-text-no-mask-2.out-ref b/tests/alphabet/a-sequences-text-no-mask-2.out-ref new file mode 100644 index 0000000..18aa1b9 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-2.out-ref @@ -0,0 +1 @@ +???????????????????????????!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~?�������������������������������������������������������������������������������������������������������������������������������? 
diff --git a/tests/alphabet/a-sequences-text-no-mask-2.test b/tests/alphabet/a-sequences-text-no-mask-2.test new file mode 100644 index 0000000..40b27c8 --- /dev/null +++ b/tests/alphabet/a-sequences-text-no-mask-2.test @@ -0,0 +1 @@ +ennaf --text {GROUP}.fa 2>{TEST}.e.err | unnaf --sequences --no-mask >{TEST}.out 2>{TEST}.u.err diff --git a/tests/alphabet/a-sequences-text-no-mask-2.u.err-ref b/tests/alphabet/a-sequences-text-no-mask-2.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/interface/ennaf-version.err-ref b/tests/interface/ennaf-version.err-ref index b058eb6..3073b16 100644 --- a/tests/interface/ennaf-version.err-ref +++ b/tests/interface/ennaf-version.err-ref @@ -1,2 +1,2 @@ -ennaf - NAF compressor, version 1.1.0, 2019-10-01 -Copyright (c) 2018-2019 Kirill Kryukov +ennaf - NAF compressor, version 1.2.0, 2020-09-01 +Copyright (c) 2018-2020 Kirill Kryukov diff --git a/tests/interface/unnaf-version.err-ref b/tests/interface/unnaf-version.err-ref index d8db25a..628c12d 100644 --- a/tests/interface/unnaf-version.err-ref +++ b/tests/interface/unnaf-version.err-ref @@ -1,2 +1,2 @@ -unnaf - NAF decompressor, version 1.1.0, 2019-10-01 -Copyright (c) 2018-2019 Kirill Kryukov +unnaf - NAF decompressor, version 1.2.0, 2020-09-01 +Copyright (c) 2018-2020 Kirill Kryukov diff --git a/tests/small/1-sequences.e.err-ref b/tests/small/1-sequences.e.err-ref new file mode 100644 index 0000000..7009c6c --- /dev/null +++ b/tests/small/1-sequences.e.err-ref @@ -0,0 +1,2 @@ +input has 1 unexpected DNA characters: + 'Z': 1 diff --git a/tests/small/1-sequences.out-ref b/tests/small/1-sequences.out-ref new file mode 100644 index 0000000..15ae76c --- /dev/null +++ b/tests/small/1-sequences.out-ref @@ -0,0 +1,2 @@ +actgACGTnN +a-tN-MY diff --git a/tests/small/1-sequences.test b/tests/small/1-sequences.test new file mode 100644 index 0000000..e0632f3 --- /dev/null +++ b/tests/small/1-sequences.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf 
--sequences >{TEST}.out 2>{TEST}.u.err diff --git a/tests/small/1-sequences.u.err-ref b/tests/small/1-sequences.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/test-runner.pl b/tests/test-runner.pl index bd716bf..f6f33e2 100755 --- a/tests/test-runner.pl +++ b/tests/test-runner.pl @@ -53,8 +53,8 @@ sub run_test { s/[\x0D\x0A]+$//; my $cmd = $_; - $cmd =~ s/ennaf/..\/ennaf\/ennaf/g; - $cmd =~ s/unnaf/..\/unnaf\/unnaf/g; + $cmd =~ s/ennaf/..\/ennaf\/ennaf --binary-stderr/g; + $cmd =~ s/unnaf/..\/unnaf\/unnaf --binary-stderr --binary-stdout/g; $cmd =~ s/\{TEST\}/$test_prefix/g; $cmd =~ s/\{GROUP\}/$group_prefix/g; push @cmds, $cmd; diff --git a/unnaf/src/files.c b/unnaf/src/files.c index 77590ed..a975153 100644 --- a/unnaf/src/files.c +++ b/unnaf/src/files.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -26,6 +26,16 @@ static void open_input_file(void) } +static void change_stderr_to_binary(void) +{ +#ifdef __MINGW32__ + if (_setmode(_fileno(stderr), O_BINARY) == -1) { die("can't set error stream to binary mode\n"); } +#else + if (!freopen(NULL, "wb", stderr)) { die("can't set error stream to binary mode\n"); } +#endif +} + + static void open_output_file(void) { assert(OUT == NULL); @@ -61,7 +71,7 @@ static void open_output_file(void) OUT = stdout; } - if (out_type == FOUR_BIT && force_stdout) + if ( binary_stdout || (out_type == FOUR_BIT && force_stdout) ) { #ifdef __MINGW32__ if (_setmode(_fileno(stdout), O_BINARY) == -1) { die("can't set output stream to binary mode\n"); } diff --git a/unnaf/src/input.c b/unnaf/src/input.c index 002592d..dd94368 100644 --- a/unnaf/src/input.c +++ b/unnaf/src/input.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -164,7 +164,7 @@ static 
void load_ids(void) for (unsigned long long i = 1; i < N; i++) { char *ep = strchr(ids[i-1], 0); - if (ep >= ids_buffer + ids_size - 1) { die("currupted ids - can't read id %" PRINT_ULL "\n", i); } + if (ep >= ids_buffer + ids_size - 1) { die("currupted ids - can't read id %llu\n", i); } ids[i] = ep + 1; } } @@ -192,7 +192,7 @@ static void load_names(void) for (unsigned long long i = 1; i < N; i++) { char *ep = strchr(names[i-1], 0); - if (ep >= names_buffer + names_size - 1) { die("corrupted names - can't read name %" PRINT_ULL "\n", i); } + if (ep >= names_buffer + names_size - 1) { die("corrupted names - can't read name %llu\n", i); } names[i] = ep + 1; } } diff --git a/unnaf/src/output-fastq.c b/unnaf/src/output-fastq.c index 049684d..50b924c 100644 --- a/unnaf/src/output-fastq.c +++ b/unnaf/src/output-fastq.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/unnaf/src/output-sequences.c b/unnaf/src/output-sequences.c new file mode 100644 index 0000000..9e632e9 --- /dev/null +++ b/unnaf/src/output-sequences.c @@ -0,0 +1,116 @@ +/* + * NAF decompressor + * Copyright (c) 2018-2020 Kirill Kryukov + * See README.md and LICENSE files of this repository + */ + +static inline void print_dna_buffer_as_sequences(int masking) +{ + unsigned long long n_bp_to_print = dna_buffer_pos; + if (n_bp_to_print > total_seq_n_bp_remaining) { n_bp_to_print = total_seq_n_bp_remaining; } + + if (masking) { mask_dna_buffer(dna_buffer, (unsigned)n_bp_to_print); } + + unsigned char *pos = dna_buffer; + + while (n_bp_to_print >= cur_seq_len_n_bp_remaining) + { + if (cur_seq_len_n_bp_remaining > 0) + { + fwrite(pos, 1, cur_seq_len_n_bp_remaining, OUT); + pos += cur_seq_len_n_bp_remaining; + n_bp_to_print -= cur_seq_len_n_bp_remaining; + total_seq_n_bp_remaining -= cur_seq_len_n_bp_remaining; + } + + if (lengths_buffer[cur_seq_len_index] != 
4294967295u) + { + fputc('\n', OUT); + cur_seq_index++; + } + + cur_seq_len_index++; + if (cur_seq_len_index >= n_lengths) { break; } + + cur_seq_len_n_bp_remaining = lengths_buffer[cur_seq_len_index]; + } + + if (n_bp_to_print > 0) + { + fwrite(pos, 1, n_bp_to_print, OUT); + cur_seq_len_n_bp_remaining -= n_bp_to_print; + total_seq_n_bp_remaining -= n_bp_to_print; + } + + dna_buffer_pos = 0; +} + + +static inline void write_4bit_as_sequences(unsigned char *buffer, size_t size, int masking) +{ + for (unsigned int i = 0; i < size; i++) + { + *(unsigned short *)(&dna_buffer[dna_buffer_pos]) = codes_to_nucs[buffer[i]]; + dna_buffer_pos += 2; + } + + if (dna_buffer_pos > dna_buffer_flush_size) { print_dna_buffer_as_sequences(masking); } +} + + +static void print_sequences(int masking) +{ + if (!has_data) { return; } + + skip_ids(); + skip_names(); + load_lengths(); + + if (masking) { load_mask(); } + else { skip_mask(); } + + total_seq_length = read_number(IN); + compressed_seq_size = read_number(IN); + total_seq_n_bp_remaining = total_seq_length; + cur_seq_len_n_bp_remaining = lengths_buffer[0]; + + size_t bytes_to_read = initialize_input_decompression(); + size_t input_size; + + if (in_seq_type < seq_type_protein) + { + while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) + { + ZSTD_inBuffer in = { in_buffer, input_size, 0 }; + while (in.pos < in.size) + { + ZSTD_outBuffer out = { out_buffer, out_buffer_size, 0 }; + bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); + if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } + write_4bit_as_sequences((unsigned char *)out_buffer, out.pos, masking); + } + } + } + else + { + while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) + { + ZSTD_inBuffer in = { in_buffer, input_size, 0 }; + while (in.pos < in.size) + { + ZSTD_outBuffer out = { dna_buffer, 
dna_buffer_size, 0 }; + bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); + if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } + dna_buffer_pos = (unsigned)out.pos; + if (!use_mask) { uppercase_dna_buffer(); } + print_dna_buffer_as_sequences(masking); + } + } + } + + if (total_seq_n_bp_remaining > 0) + { + if (in_seq_type >= seq_type_protein && !use_mask) { uppercase_dna_buffer(); } + print_dna_buffer_as_sequences(masking); + } +} diff --git a/unnaf/src/output.c b/unnaf/src/output.c index ccbd8c1..5854586 100644 --- a/unnaf/src/output.c +++ b/unnaf/src/output.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -587,69 +587,68 @@ static void print_charcount(int masking) count_dna_buffer_sequence_characters(counts, masking); } - for (unsigned i = 0; i < 33; i++) { if (counts[i] != 0) { fprintf(OUT, "\\x%02X\t%" PRINT_ULL "\n", i, counts[i]); } } - for (unsigned i = 33; i < 127; i++) { if (counts[i] != 0) { fprintf(OUT, "%c\t%" PRINT_ULL "\n", (unsigned char)i, counts[i]); } } - for (unsigned i = 127; i < 256; i++) { if (counts[i] != 0) { fprintf(OUT, "\\x%02X\t%" PRINT_ULL "\n", i, counts[i]); } } + for (unsigned i = 0; i < 33; i++) { if (counts[i] != 0) { fprintf(OUT, "\\x%02X\t%llu\n", i, counts[i]); } } + for (unsigned i = 33; i < 127; i++) { if (counts[i] != 0) { fprintf(OUT, "%c\t%llu\n", (unsigned char)i, counts[i]); } } + for (unsigned i = 127; i < 256; i++) { if (counts[i] != 0) { fprintf(OUT, "\\x%02X\t%llu\n", i, counts[i]); } } } static void print_fasta(int masking) { - if (has_data) - { - load_ids(); - load_names(); - load_lengths(); + if (!has_data) { return; } - if (masking) { load_mask(); } - else { skip_mask(); } + load_ids(); + load_names(); + load_lengths(); - total_seq_length = read_number(IN); - compressed_seq_size = 
read_number(IN); - total_seq_n_bp_remaining = total_seq_length; - cur_seq_len_n_bp_remaining = lengths_buffer[0]; + if (masking) { load_mask(); } + else { skip_mask(); } - print_fasta_name(0); - cur_line_n_bp_remaining = max_line_length; + total_seq_length = read_number(IN); + compressed_seq_size = read_number(IN); + total_seq_n_bp_remaining = total_seq_length; + cur_seq_len_n_bp_remaining = lengths_buffer[0]; - size_t bytes_to_read = initialize_input_decompression(); - size_t input_size; + print_fasta_name(0); + cur_line_n_bp_remaining = max_line_length; - if (in_seq_type < seq_type_protein) + size_t bytes_to_read = initialize_input_decompression(); + size_t input_size; + + if (in_seq_type < seq_type_protein) + { + while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) { - while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) + ZSTD_inBuffer in = { in_buffer, input_size, 0 }; + while (in.pos < in.size) { - ZSTD_inBuffer in = { in_buffer, input_size, 0 }; - while (in.pos < in.size) - { - ZSTD_outBuffer out = { out_buffer, out_buffer_size, 0 }; - bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); - if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } - write_4bit_as_fasta((unsigned char *)out_buffer, out.pos, masking); - } + ZSTD_outBuffer out = { out_buffer, out_buffer_size, 0 }; + bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); + if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } + write_4bit_as_fasta((unsigned char *)out_buffer, out.pos, masking); } } - else + } + else + { + while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) { - while ( total_seq_n_bp_remaining > 0 && (input_size = read_next_chunk(in_buffer, bytes_to_read)) ) + ZSTD_inBuffer in = { 
in_buffer, input_size, 0 }; + while (in.pos < in.size) { - ZSTD_inBuffer in = { in_buffer, input_size, 0 }; - while (in.pos < in.size) - { - ZSTD_outBuffer out = { dna_buffer, dna_buffer_size, 0 }; - bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); - if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } - dna_buffer_pos = (unsigned)out.pos; - if (!use_mask) { uppercase_dna_buffer(); } - print_dna_buffer_as_fasta(masking); - } + ZSTD_outBuffer out = { dna_buffer, dna_buffer_size, 0 }; + bytes_to_read = ZSTD_decompressStream(input_decompression_stream, &out, &in); + if (ZSTD_isError(bytes_to_read)) { die("can't decompress sequence: %s\n", ZSTD_getErrorName(bytes_to_read)); } + dna_buffer_pos = (unsigned)out.pos; + if (!use_mask) { uppercase_dna_buffer(); } + print_dna_buffer_as_fasta(masking); } } + } - if (total_seq_n_bp_remaining > 0) - { - if (in_seq_type >= seq_type_protein && !use_mask) { uppercase_dna_buffer(); } - print_dna_buffer_as_fasta(masking); - } + if (total_seq_n_bp_remaining > 0) + { + if (in_seq_type >= seq_type_protein && !use_mask) { uppercase_dna_buffer(); } + print_dna_buffer_as_fasta(masking); } } diff --git a/unnaf/src/platform.h b/unnaf/src/platform.h index 3f66887..d44e44d 100644 --- a/unnaf/src/platform.h +++ b/unnaf/src/platform.h @@ -1,3 +1,8 @@ +/* + * NAF decompressor + * Copyright (c) 2018-2020 Kirill Kryukov + * See README.md and LICENSE files of this repository + */ #ifndef ENNAF_PLATFORM_H #define ENNAF_PLATFORM_H @@ -5,6 +10,8 @@ #define NDEBUG +#define __USE_MINGW_ANSI_STDIO 1 + #include #include #include @@ -12,7 +19,6 @@ #include #include #include -#include #include #include @@ -25,16 +31,6 @@ -#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_WIN32) || defined(_WIN64) || defined(WIN32) || defined(WIN64) -#define PRINT_ULL "I64u" -#define PRINT_SIZE_T "Iu" -#else -#define PRINT_ULL "llu" -#define PRINT_SIZE_T "zu" -#endif - - - #if 
defined(__MINGW32__) || defined(__MINGW64__) #define HAVE_NO_CHMOD #define HAVE_NO_CHOWN diff --git a/unnaf/src/unnaf.c b/unnaf/src/unnaf.c index a83c3e3..eba28be 100644 --- a/unnaf/src/unnaf.c +++ b/unnaf/src/unnaf.c @@ -1,12 +1,12 @@ /* * NAF decompressor - * Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ -#define VERSION "1.1.0" -#define DATE "2019-10-01" -#define COPYRIGHT_YEARS "2018-2019" +#define VERSION "1.2.0" +#define DATE "2020-09-01" +#define COPYRIGHT_YEARS "2018-2020" #include "platform.h" @@ -17,7 +17,7 @@ typedef enum { UNDECIDED, FORMAT_NAME, PART_LIST, PART_SIZES, NUMBER_OF_SEQUENCE TITLE, IDS, NAMES, LENGTHS, TOTAL_LENGTH, MASK, TOTAL_MASK_LENGTH, FOUR_BIT, DNA, MASKED_DNA, UNMASKED_DNA, - SEQ, CHARCOUNT, + SEQ, SEQUENCES, CHARCOUNT, FASTA, MASKED_FASTA, UNMASKED_FASTA, FASTQ } OUTPUT_TYPE; @@ -29,6 +29,7 @@ static int in_seq_type = seq_type_dna; static const char *in_seq_type_name = "DNA"; static bool verbose = false; +static bool binary_stderr = false; static bool use_mask = true; static char *in_file_path = NULL; @@ -40,6 +41,7 @@ static char *out_file_path = NULL; static char *out_file_path_auto = NULL; static FILE *OUT = NULL; static bool force_stdout = false; +static bool binary_stdout = false; static bool created_output_file = false; static unsigned char format_version = 1; @@ -142,6 +144,7 @@ static bool success = false; #include "files.c" #include "input.c" #include "output.c" +#include "output-sequences.c" #include "output-fastq.c" @@ -259,6 +262,7 @@ static void show_help(void) " --mask - Masked region lengths\n" " --4bit - 4bit-encoded nucleotide sequence (binary data)\n" " --seq - Continuous concatenated sequence\n" + " --sequences - One sequence per line, no names\n" " --fasta - FASTA-formatted sequences\n" " --fastq - FASTQ-formatted sequences\n" "Other options:\n" @@ -266,6 +270,7 @@ static void show_help(void) " -c - Write to standard 
output\n" " --line-length N - Use lines of width N for FASTA output\n" " --no-mask - Ignore mask\n" + " --binary-stdout - Binary output (no 0D 0A on Windows)\n" " -h, --help - Show help\n" " -V, --version - Show version\n" ); @@ -299,10 +304,13 @@ static void parse_command_line(int argc, char **argv) if (!strcmp(argv[i], "--total-mask-length")) { set_out_type(TOTAL_MASK_LENGTH ); continue; } if (!strcmp(argv[i], "--4bit" )) { set_out_type(FOUR_BIT ); continue; } if (!strcmp(argv[i], "--seq" )) { set_out_type(SEQ ); continue; } + if (!strcmp(argv[i], "--sequences" )) { set_out_type(SEQUENCES ); continue; } if (!strcmp(argv[i], "--charcount" )) { set_out_type(CHARCOUNT ); continue; } if (!strcmp(argv[i], "--fasta" )) { set_out_type(FASTA ); continue; } if (!strcmp(argv[i], "--fastq" )) { set_out_type(FASTQ ); continue; } if (!strcmp(argv[i], "--no-mask")) { use_mask = false; continue; } + if (!strcmp(argv[i], "--binary-stdout")) { binary_stdout = true; continue; } + if (!strcmp(argv[i], "--binary-stderr")) { if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; } if (!strcmp(argv[i], "--help")) { show_help(); exit(0); } if (!strcmp(argv[i], "--verbose")) { verbose = true; continue; } if (!strcmp(argv[i], "--version")) { print_version = true; continue; } @@ -419,6 +427,7 @@ int main(int argc, char **argv) else if (out_type == MASKED_DNA) { print_dna(use_mask && has_mask); } else if (out_type == UNMASKED_DNA) { print_dna(0); } else if (out_type == CHARCOUNT) { print_charcount(use_mask && has_mask); } + else if (out_type == SEQUENCES) { print_sequences(use_mask && has_mask); } else if (out_type == FASTA) { print_fasta(use_mask && has_mask); } else if (out_type == MASKED_FASTA) { print_fasta(use_mask && has_mask); } else if (out_type == UNMASKED_FASTA) { print_fasta(0); } diff --git a/unnaf/src/utils.c b/unnaf/src/utils.c index 7833d06..0a7f6c5 100644 --- a/unnaf/src/utils.c +++ b/unnaf/src/utils.c @@ -1,11 +1,11 @@ /* * NAF decompressor - *
Copyright (c) 2018-2019 Kirill Kryukov + * Copyright (c) 2018-2020 Kirill Kryukov * See README.md and LICENSE files of this repository */ -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) static void msg(const char *format, ...) { va_list argptr; @@ -15,20 +15,8 @@ static void msg(const char *format, ...) } -/*__attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) -static void warn(const char *format, ...) -{ - fputs("unnaf warning: ", stderr); - va_list argptr; - va_start(argptr, format); - vfprintf(stderr, format, argptr); - va_end(argptr); -}*/ - - __attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) static void err(const char *format, ...) { fputs("unnaf error: ", stderr); @@ -40,7 +28,7 @@ static void err(const char *format, ...) __attribute__ ((cold)) -__attribute__ ((format (printf, 1, 2))) +//__attribute__ ((format (printf, 1, 2))) __attribute__ ((noreturn)) static void die(const char *format, ...) { @@ -65,7 +53,7 @@ __attribute__ ((cold)) __attribute__ ((noreturn)) static void out_of_memory(const size_t size) { - die("can't allocate %" PRINT_SIZE_T " bytes\n", size); + die("can't allocate %zu bytes\n", size); } diff --git a/zstd b/zstd index a3d655d..b706286 160000 --- a/zstd +++ b/zstd @@ -1 +1 @@ -Subproject commit a3d655d2255481333e09ecca9855f1b37f757c52 +Subproject commit b706286adbba780006a47ef92df0ad7a785666b6