-
Notifications
You must be signed in to change notification settings - Fork 41
/
vcf_v44.ragel
299 lines (243 loc) · 14.6 KB
/
vcf_v44.ragel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/**
* Copyright 2014-2024 EMBL - European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
%%{
# Ragel specification of the state machine named vcf_v44.
machine vcf_v44;
# Import the definitions of the machine named vcf from vcf.ragel.
# Names used below that are not defined in this file (for example token_begin,
# identifier, url, NL, CS) are presumably supplied by that include -- confirm against vcf.ragel.
include vcf "vcf.ragel";
#############################################
# Actions definition #
#############################################
########## Incorrect metadata and header actions ##########
# Fileformat line
# Error action embedded on fileformat_name through $err.
# It passes a newly allocated FileformatError (built from n_lines and the message)
# to ErrorPolicy::handle_error, then uses fhold so the current character is not
# advanced over, and jumps to the meta_section_skip recovery machine.
# NOTE(review): ownership of the object created with new is not visible here;
# presumably handle_error takes it over -- confirm.
action fileformat_error {
ErrorPolicy::handle_error(*this,
new FileformatError{n_lines, "The fileformat declaration is not 'fileformat=VCFv4.4'"});
fhold; fgoto meta_section_skip;
}
# FORMAT metadata
# Error action embedded on the Number value of a FORMAT entry (see meta_format).
# It hands a MetaSectionError (n_lines plus the message) to ErrorPolicy::handle_error,
# holds the current character with fhold and resumes in meta_section_skip.
action meta_format_number_err {
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "FORMAT metadata Number is not a number, A, R, G, P or dot"});
fhold; fgoto meta_section_skip;
}
# INFO metadata
# Error action embedded on the Number value of an INFO entry (see meta_info).
# It hands a MetaSectionError (n_lines plus the message) to ErrorPolicy::handle_error,
# holds the current character with fhold and resumes in meta_section_skip.
# Unlike the FORMAT variant, the message does not list P, matching meta_field_info_num.
action meta_info_number_err {
ErrorPolicy::handle_error(*this, new MetaSectionError{n_lines, "INFO metadata Number is not a number, A, R, G or dot"});
fhold; fgoto meta_section_skip;
}
########## Incorrect records actions ##########
# Format
# Error action embedded on the FORMAT column of a record (see record).
# It hands a FormatBodyError (n_lines plus the message) to ErrorPolicy::handle_error,
# holds the current character with fhold and resumes in body_section_skip.
# NOTE(review): format_value also accepts the percent sign after the first character,
# which the message text does not mention -- confirm whether the message should say so.
action format_error {
ErrorPolicy::handle_error(*this, new FormatBodyError{n_lines, "Format does not start with a letter/underscore followed by alphanumeric/underscore/dot characters"});
fhold; fgoto body_section_skip;
}
# EOF action embedded through $eof on the header line, on the INFO column and on
# sample columns: it runs when the input ends while one of those is being parsed.
# It hands a BodySectionError (n_lines plus the message) to ErrorPolicy::handle_error,
# then executes fhold and jumps to body_section_skip.
action no_newline_at_eof_error {
ErrorPolicy::handle_error(*this, new BodySectionError{n_lines, "There is no newline at the end of the file"});
fhold; fgoto body_section_skip;
}
#################################################
# Machine definition #
#################################################
## A contig must be a sequence name allowed by the SAM format ( regex [!-)+-<>-~][!-~]* ) excluding the characters <>[]:*
## A chromosome must be a string with no white-spaces, and may be surrounded by < > symbols (for contigs)
## NOTE(review): the rule below removes < > [ ] * = and comma from punct. The colon is NOT removed,
## although the comment above lists it, and = and comma are removed although the comment does not
## mention them -- confirm which is intended.
## meta_contig_char: one alphanumeric character, or one punctuation character outside the removed set.
meta_contig_char= alnum | ( punct - ( '<' | '>' | '[' | ']' | '*' | '=' | ',' ) ) ;
## chrom_basic: one or more meta_contig_char; the first character additionally may not be a hash sign.
chrom_basic = (meta_contig_char - '#') (meta_contig_char)* ;
## chrom_contig: a chrom_basic enclosed in angle brackets.
chrom_contig = '<' chrom_basic '>' ;
## chromosome: either of the two forms above (used by the breakend forms of record_alt_sv).
chromosome = chrom_basic | chrom_contig ;
# File format line
# fileformat_name matches exactly the literal VCFv4.4.
# Ragel embedding operators used on it:
#   >action       runs when the expression is entered
#   $action       runs on every transition of the expression
#   %action       runs when the expression is left
#   $err(action)  error action for all states of the expression
# so any mismatch inside the version literal triggers fileformat_error.
# token_begin, token_middle and fileformat_end are not defined in this file.
fileformat_name = 'VCFv4.4' >token_begin $token_middle %fileformat_end $err(fileformat_error) ;
# The complete declaration: the fixed prefix followed by the version literal.
fileformat = '##fileformat=' fileformat_name ;
# Meta-data
# meta_typeid: identifier of a generic meta entry. One or more printable characters other
# than the equals sign, minus the exact type ids that have a dedicated rule in meta_entry.
meta_typeid = ( ( print - '=' )+ - ('ALT' | 'FILTER' | 'FORMAT' | 'INFO' | 'assembly' | 'contig' | 'META' | 'SAMPLE' | 'PEDIGREE' | 'pedigreeDB') ) >token_begin @token_middle %meta_typeid_generic ;
# meta_key: one or more alphanumerics, dots, underscores or hyphens, minus keys that consist
# of punctuation characters only.
meta_key = ( (alnum | '.' | '_' | '-' )+ - (punct)+ ) >token_begin @token_middle %meta_key_generic;
## A value can be expressed in multiple ways:
## - The line is a single key-value pair (meta_value): the value cannot start with an angle bracket, quote or linebreak, but it is free text thereafter
## - One of multiple key-value pairs, non-quoted (meta_field_value): the value cannot contain a comma -end of field-, or a closing angle bracket -end of entry-, or quote
## - One of multiple key-value pairs, quoted (meta_field_desc): the value can contain any character except for non-escaped quotes
meta_value = ((print - ('<' | '"' | NL)) (print)*) >token_begin $token_middle %token_end ;
meta_field_value= (print - (',' | '>' | '"'))+ >token_begin $token_middle %token_end ;
# meta_field_desc may be empty: zero or more of either a backslash followed by any printable
# character, or a printable character that is not a double quote.
meta_field_desc = (('\\' print) | (print - '"'))* >token_begin $token_middle %token_end ;
# A field value is either a quoted description or an unquoted value.
meta_values = ('"' meta_field_desc '"') | meta_field_value ;
meta_field = meta_key '=' meta_values ;
# Accepted Number values: INFO takes digits, A, R, G or a dot; FORMAT additionally takes P.
meta_field_info_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' ) >token_begin @token_middle %token_end ;
meta_field_format_num = ( (digit)+ | 'A' | 'R' | 'G' | '.' | 'P' ) >token_begin @token_middle %token_end ;
# Type values are only checked here to be one or more letters.
meta_field_type = (alpha)+ >token_begin @token_middle %token_end ;
# Bodies of the structured meta entries, i.e. the text between the angle brackets
# (or after the equals sign for assembly). Each part carries its own $err action.
# meta_alt: ID= followed by alt_id (not defined in this file), then zero or more
# additional fields of the form ,identifier="description".
meta_alt = 'ID=' %meta_id alt_id >token_begin @token_middle %token_end $err(meta_alt_id_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_assembly: a url (not defined in this file).
meta_assembly = url $err(meta_url_err) ;
# meta_contig: ID= followed by a chrom_basic, then zero or more ,identifier=value fields
# whose values may be quoted or unquoted.
meta_contig = 'ID=' %meta_id chrom_basic >token_begin $token_middle %token_end $err(meta_id_err)
(',' identifier '=' meta_values)* ;
# meta_filter: ID= followed by an identifier, then zero or more ,identifier="description" fields.
meta_filter = 'ID=' %meta_id identifier $err(meta_id_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_format: ID, Number, Type and Description in this fixed order, then zero or more
# ,identifier="description" fields. Number is checked against meta_field_format_num.
# NOTE(review): the Type field reports through meta_info_type_err, the same action
# meta_info uses -- confirm this sharing is intended.
meta_format = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_format_num $err(meta_format_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
# meta_info: same layout as meta_format, with Number checked against meta_field_info_num.
meta_info = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number meta_field_info_num $err(meta_info_number_err)
',Type=' %meta_type meta_field_type $err(meta_info_type_err)
',Description=' %meta_description '"' meta_field_desc '"' $err(meta_desc_err)
(',' identifier $err(meta_id_err) '="' meta_field_desc '"' $err(meta_desc_err))* ;
## 3 different ways to define a PEDIGREE tag now:
## ID=TumourSample,Original=GermlineID
## ID=ChildID,Father=FatherID,Mother=MotherID
## ID=SampleID,Name_1=Ancestor_1,...,Name_N=Ancestor_N
## meta_pedigree_name: the literal Name_ followed by one or more digits.
meta_pedigree_name = ('Name_' (digit)+) >token_begin $token_middle %token_end;
## meta_pedigree: ID= and an identifier, followed by exactly one of the three alternatives:
## a single Original field, a Father field followed by a Mother field (in that order),
## or one or more Name_N=identifier fields.
meta_pedigree = 'ID=' %meta_id identifier $err(meta_id_err)
(
( ',Original=' %meta_original identifier $err(meta_pedigree_original_err) ) |
( ',Father=' %meta_father identifier $err(meta_pedigree_parents_err)
',Mother=' %meta_mother identifier $err(meta_pedigree_parents_err) ) |
(',' meta_pedigree_name $err(meta_pedigree_names_err) '=' identifier $err(meta_pedigree_names_err))+
) ;
## meta_pedigreeDB: a url (not defined in this file).
meta_pedigreeDB = url $err(meta_url_err) ;
## META values don't seem to have any particular restrictions so just applying those critical for parsing
## meta_meta_value: one or more printable characters other than comma, angle brackets and whitespace.
meta_meta_value = (print - (',' | '<' | '>' | space))+ ;
## meta_meta_values: comma-separated list of values; one optional whitespace character is
## accepted after each comma. The whole list is captured as a single token.
meta_meta_values = (meta_meta_value (',' (space)? meta_meta_value)*) >token_begin $token_middle %token_end ;
## meta_meta: ID, Number, Type and Values in this fixed order. Number must be the literal dot,
## Type must be the literal String, and the values list is enclosed in square brackets.
meta_meta = 'ID=' %meta_id identifier $err(meta_id_err)
',Number=' %meta_number '.' $err(meta_meta_number_err)
',Type=' %meta_type 'String' $err(meta_meta_type_err)
',Values=' %meta_values '[' meta_meta_values ']' $err(meta_meta_values_err) ;
## Multiple ways to describe a sample now
## meta_sample: ID= and an identifier, then zero or more ,identifier=value fields
## whose values may be quoted or unquoted.
meta_sample = 'ID=' %meta_id identifier $err(meta_id_err)
(',' identifier '=' meta_values)* ;
# meta_entry: one meta-information line without its terminating newline.
# After the leading double hash, the line is one of:
#  - a reserved type id with its dedicated body rule; each alternative has a leaving action
#    on the type id (meta_typeid_*) and its own error action over the whole alternative;
#  - any other type id (meta_typeid) followed by an equals sign and one of: a bracketed list
#    of key=value fields, a quoted string, a bracketed quoted string, or free text (meta_value).
# meta_entry_end runs as a leaving action once the entry has been matched.
meta_entry = '##' (
('ALT' %meta_typeid_alt '=<' meta_alt '>' ) $err(meta_alt_err) |
('FILTER' %meta_typeid_filter '=<' meta_filter '>' ) $err(meta_filter_err) |
('FORMAT' %meta_typeid_format '=<' meta_format '>' ) $err(meta_format_err) |
('INFO' %meta_typeid_info '=<' meta_info '>' ) $err(meta_info_err) |
('assembly' %meta_typeid_assembly '=' meta_assembly ) $err(meta_assembly_err) |
('contig' %meta_typeid_contig '=<' meta_contig '>' ) $err(meta_contig_err) |
('META' %meta_typeid_meta '=<' meta_meta '>' ) $err(meta_meta_err) |
('SAMPLE' %meta_typeid_sample '=<' meta_sample '>' ) $err(meta_sample_err) |
('PEDIGREE' %meta_typeid_pedigree '=<' meta_pedigree '>' ) $err(meta_pedigree_err) |
('pedigreeDB' %meta_typeid_pedigreedb '=<' meta_pedigreeDB '>' ) $err(meta_pedigreedb_err) |
(meta_typeid '='
(
('<' meta_field (',' meta_field)* '>') |
('"' meta_field_desc '"') |
('<"' meta_field_desc '">') |
meta_value
)
)
) %meta_entry_end;
# Header between meta and records
# sample_name: one or more printable characters other than NL and CS
# (both defined outside this file).
sample_name = (print - (NL | CS))+ >token_begin @token_middle %sample_name_end;
# header: the eight fixed column names separated by CS, optionally followed by a FORMAT
# column and at least one sample name. Reaching end of input inside either part runs
# no_newline_at_eof_error; an error inside the fixed prefix runs header_prefix_err.
# header_end is a leaving action embedded on the optional FORMAT/sample group.
header = ('#CHROM' CS 'POS' CS 'ID' CS 'REF' CS 'ALT' CS 'QUAL' CS 'FILTER' CS 'INFO' )
$eof(no_newline_at_eof_error) $err(header_prefix_err)
(CS 'FORMAT' (CS sample_name)+ )? $eof(no_newline_at_eof_error) %header_end ;
# Records
# position: a nat_number (defined outside this file).
position = nat_number ;
# record_chrom: a chrom_basic, bare or enclosed in angle brackets. The token actions are
# embedded on the chrom_basic part only, not on the surrounding brackets.
record_chrom = chrom_basic >token_begin $token_middle %token_end |
'<' chrom_basic >token_begin $token_middle %token_end '>' ;
record_position = position >token_begin @token_middle %token_end ;
## ID must be a (list of) string with no white-spaces or semi-colons
## record_id_value and record_empty_field are defined outside this file.
record_id = (record_id_value (';' record_id_value)*) | record_empty_field ;
## record_ref: bases (defined outside this file), captured as one token.
record_ref = bases >token_begin @token_middle %token_end ;
## Structural variants follow forms like:
## ]1:1234]ATG or ]<contig_1>:1234]ATG : paired breakends
## .AGT, AGT.: single breakends
## The four paired forms place chromosome:position between two equal brackets, either before
## or after the bases; the two single forms place a dot before or after the bases.
record_alt_sv = ']' chromosome ':' position ']' bases |
'[' chromosome ':' position '[' bases |
bases ']' chromosome ':' position ']' |
bases '[' chromosome ':' position '[' |
'.' bases |
bases '.' ;
## Unspecified alleles and REF-only blocks (gVCF)
record_alt_gvcf = '<*>' ;
## Main alternate allele rule
## record_alt_snv, record_alt_indel and record_alt_other are defined outside this file.
## Each alternate allele is captured as its own token.
record_alt_data = ( record_alt_snv |
record_alt_indel |
record_alt_sv |
record_alt_gvcf |
record_alt_other ) >token_begin $token_middle %token_end ;
## record_alt: a comma-separated list of alleles, or the empty-field marker.
record_alt = (record_alt_data (',' record_alt_data)*) | record_empty_field ;
# record_qual: any_number (defined outside this file) or a single dot.
record_qual = (any_number | '.') >token_begin $token_middle %token_end ;
# record_filter: semicolon-separated record_fil_value items (defined outside this file),
# or the empty-field marker.
record_filter = (record_fil_value (';' record_fil_value)*) | record_empty_field ;
# info_key: a letter or underscore followed by letters, digits, underscores or dots,
# or the exact key 1000G (the only accepted key that starts with a digit).
info_key = (( (alpha | '_') (alpha | digit | '_' | '.')*) | '1000G' ) ;
# info_value: one or more printable characters other than the semicolon. The comma is
# part of this class, so individual list items are not separated at this level.
info_value = (print - ';')+ ;
info_value_list = info_value (',' info_value)* ;
# info_entry: key=value-list, or a key on its own. Each entry is captured as one token.
info_entry = (
info_key $err(info_key_error) '=' info_value_list $err(info_value_error) |
info_key $err(info_key_error)
) >token_begin $token_middle %token_end ;
# record_info: semicolon-separated entries, or a single dot captured as a token.
record_info = ( info_entry (';' info_entry)* ) |
( '.' >token_begin @token_middle %token_end ) ;
## Accepting non-alphanumeric characters is an addition in v4.3, but widely used in already existing files
## format_value: a letter or underscore, followed by alphanumerics, underscores, dots or percent signs.
format_value = ( (alpha | '_') (alnum | '_' | '.' | '%')* ) >token_begin $token_middle %token_end ;
## record_format: colon-separated list of format keys, each captured as its own token.
record_format = format_value (':' format_value)* ;
## NOTE: The genotype definition shows the expected format, but it is not enforced by Ragel
## because sample_values is a superset of genotype and other characters. Invalid genotypes therefore
## pass the Ragel checks; genotype validation takes place in the C++ code, not in Ragel.
## The same applies to the older versions as well.
## In a sample, if a genotype is present it must be the first field
## genotype and sample_values are defined outside this file.
record_sample = (
genotype $eof(no_newline_at_eof_error) $err(genotype_error) (':' sample_values)? |
sample_values
) >token_begin $token_middle %token_end ;
# record: one data line without its terminating newline.
# The eight mandatory columns appear in fixed order, separated by CS, optionally followed
# by a FORMAT column and one or more sample columns. Every column runs record_field_end
# as a leaving action and has its own error action. Reaching end of input inside the INFO
# column or a sample column runs no_newline_at_eof_error. record_end runs as a leaving
# action on the whole record.
record = (
record_chrom %record_field_end $err(chrom_error)
CS record_position %record_field_end $err(pos_error)
CS record_id %record_field_end $err(id_error)
CS record_ref %record_field_end $err(ref_error)
CS record_alt %record_field_end $err(alt_error)
CS record_qual %record_field_end $err(qual_error)
CS record_filter %record_field_end $err(filter_error)
CS record_info %record_field_end $eof(no_newline_at_eof_error) $err(info_error)
(CS record_format %record_field_end $err(format_error)
(CS record_sample %record_field_end $eof(no_newline_at_eof_error) $err(sample_error))+ )?
) %record_end;
# The four sections of a file, each with a section-level error action.
# fileformat_section: the fileformat declaration followed by NL.
fileformat_section = (fileformat NL) $err(fileformat_section_error);
# meta_section: zero or more meta entries, each terminated by NL.
meta_section = (meta_entry NL)* $err(meta_section_error);
# header_section: the column header line followed by NL.
header_section = (header NL) $err(header_section_error);
# body_section: optional; records are separated by NL, so the last record carries no NL here.
body_section = (record (NL record)*)? $err(body_section_error);
# Machine start (fileformat, then optional meta, header, then optional records)
# meta_section_end is embedded as an entering action on body_section.
# Any number of NL after the body is accepted.
main := fileformat_section
meta_section
header_section
body_section >meta_section_end
(NL)* ;
# Error recovery machines that skip until the next line and restart
# the most appropriate section state.
# Each one consumes every character up to and including the next NL and then jumps
# back into the named section. They are the targets of the fgoto statements in the
# error actions defined at the top of this specification.
meta_section_skip := [^\n]* NL @{ fgoto meta_section; };
body_section_skip := [^\n]* NL @{ fgoto body_section; };
}%%
// Anonymous namespace: whatever Ragel emits for the "write data" directive below
// (the static data of the vcf_v44 machine) is local to this translation unit.
namespace
{
%%{
write data;
}%%
}
namespace ebi
{
namespace vcf
{
// Constructor: passes source and additionalChecks on to ParserImpl, then executes the
// code Ragel generates for "write init", which initialises the state of the vcf_v44 machine.
template <typename Configuration>
ParserImpl_v44<Configuration>::ParserImpl_v44(std::shared_ptr<Source> source, AdditionalChecks additionalChecks)
: ParserImpl{source, additionalChecks}
{
%%{
write init;
}%%
}
// Runs the generated vcf_v44 machine over one buffer; the body consists entirely of the
// code Ragel generates for "write exec". The parameter names are the variables that
// generated code expects:
//   p   - first character to process
//   pe  - one past the last character of the buffer
//   eof - per the Ragel user guide, equal to pe when this is the last buffer and null
//         otherwise; NOTE(review): confirm that callers follow this convention.
template <typename Configuration>
void ParserImpl_v44<Configuration>::parse_buffer(char const * p, char const * pe, char const * eof)
{
%%{
write exec;
}%%
}
}
}