-
Notifications
You must be signed in to change notification settings - Fork 181
/
parsemultipart.jl
256 lines (212 loc) · 9.47 KB
/
parsemultipart.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
module MultiPartParsing
import ..access_threaded, ..Request, ..Multipart, ..payload
using ..Parsers
export parse_multipart_form
# Named byte values for the ASCII characters referenced while scanning a body.
const CR_BYTE = UInt8('\r')
const LF_BYTE = UInt8('\n')
const DASH_BYTE = UInt8('-')
const HTAB_BYTE = UInt8('\t')
const SPACE_BYTE = UInt8(' ')
const SEMICOLON_BYTE = UInt8(';')
# The byte sequence that separates the header of a part from its content.
const CRLFCRLF = (CR_BYTE, LF_BYTE, CR_BYTE, LF_BYTE)
"compare byte buffer `a` from index `i` to index `j` with `b` and check if they are byte-equal"
function byte_buffers_eq(a, i, j, b)
l = 1
@inbounds for k = i:j
a[k] == b[l] || return false
l += 1
end
return true
end
"""
find_multipart_boundary(bytes, boundaryDelimiter; start::Int=1)
Find the first and last index of the next boundary delimiting a part, and if
the discovered boundary is the terminating boundary.
"""
function find_multipart_boundary(bytes::AbstractVector{UInt8}, boundaryDelimiter::AbstractVector{UInt8}; start::Int=1)
# The boundary delimiter line is prepended with two '-' characters
# The boundary delimiter line starts on a new line, so must be preceded by a \r\n.
# The boundary delimiter line ends with \r\n, and can have "optional linear whitespace" between
# the end of the boundary delimiter, and the \r\n.
# The last boundary delimiter line has an additional '--' at the end of the boundary delimiter
# [RFC2046 5.1.1](https://tools.ietf.org/html/rfc2046#section-5.1.1)
i = start
end_index = i + length(boundaryDelimiter) + 1
while end_index <= length(bytes)
if bytes[i] == DASH_BYTE && bytes[i + 1] == DASH_BYTE && byte_buffers_eq(bytes, i + 2, end_index, boundaryDelimiter)
# boundary delimiter line start on a new line ...
if i > 1
(i == 2 || bytes[i-2] != CR_BYTE || bytes[i-1] != LF_BYTE) && error("boundary delimiter found, but it was not the start of a line")
# the CRLF preceding the boundary delimiter is "conceptually attached
# to the boundary", so account for this with the index
i -= 2
end
# need to check if there are enough characters for the CRLF or for two dashes
end_index < length(bytes)-1 || error("boundary delimiter found, but did not end with new line")
is_terminating_delimiter = bytes[end_index+1] == DASH_BYTE && bytes[end_index+2] == DASH_BYTE
is_terminating_delimiter && (end_index += 2)
# ... there can be arbitrary SP and HTAB space between the boundary delimiter ...
while end_index < length(bytes) && (bytes[end_index+1] in (HTAB_BYTE, SPACE_BYTE))
end_index += 1
end
# ... and ends with a new line
newlineEnd = end_index < length(bytes)-1 &&
bytes[end_index+1] == CR_BYTE &&
bytes[end_index+2] == LF_BYTE
if !newlineEnd
error("boundary delimiter found, but did not end with new line")
end
end_index += 2
return (is_terminating_delimiter, i, end_index)
end
i += 1
end_index += 1
end
error("boundary delimiter not found")
end
"""
find_multipart_boundaries(bytes, boundary; start=1)
Find the start and end indexes of all the parts of the multipart object. Ultimately this method is
looking for the data between the boundary delimiters in the byte array. A vector containing all
the start/end pairs is returned.
"""
function find_multipart_boundaries(bytes::AbstractVector{UInt8}, boundary::AbstractVector{UInt8}; start=1)
idxs = Tuple{Int, Int}[]
while true
(is_terminating_delimiter, i, end_index) = find_multipart_boundary(bytes, boundary; start = start)
push!(idxs, (i, end_index))
is_terminating_delimiter && break
start = end_index + 1
end
return idxs
end
"""
find_header_boundary(bytes)
Find the end of the multipart header in the byte array. Returns a Tuple with the
start index(1) and the end index. Headers are separated from the body by CRLFCRLF.
[RFC2046 5.1](https://tools.ietf.org/html/rfc2046#section-5.1)
[RFC822 3.1](https://tools.ietf.org/html/rfc822#section-3.1)
"""
function find_header_boundary(bytes::AbstractVector{UInt8})
length(CRLFCRLF) > length(bytes) && return nothing
l = length(bytes) - length(CRLFCRLF) + 1
i = 1
end_index = length(CRLFCRLF)
while (i <= l)
byte_buffers_eq(bytes, i, end_index, CRLFCRLF) && return (1, end_index)
i += 1
end_index += 1
end
error("no delimiter found separating header from multipart body")
end
# Cache of compiled regexes; `__init__` sizes it to `Threads.nthreads()` entries and
# `access_threaded` is used together with `content_disposition_regex_f` to read it.
const content_disposition_regex = Parsers.RegexAndMatchData[]

"""
    content_disposition_regex_f()

Build and initialise the regex that recognises a `Content-Disposition: form-data;`
header line and captures the remainder of the line (the disposition parameters)
in group 1.

The match is case-insensitive, like the `Content-Type` regex of this module,
because header field names are not case-sensitive.
"""
function content_disposition_regex_f()
    # NOTE(review): the pattern is anchored with `^` and has no multi-line flag, so it
    # presumably only matches when Content-Disposition is the first header of the part.
    r = Parsers.RegexAndMatchData(r"(?i)^Content-Disposition:[ \t]*form-data;[ \t]*(.*)\r\n"x)
    return Parsers.init!(r)
end
# Cache of compiled regexes; `__init__` sizes it to `Threads.nthreads()` entries.
const content_disposition_flag_regex = Parsers.RegexAndMatchData[]

"""
    content_disposition_flag_regex_f()

Build and initialise the regex that matches a bare token (a disposition parameter
without a value) at the start of the input, optionally followed by `;`. The token
is captured in group 1.
"""
function content_disposition_flag_regex_f()
    pattern = r"""^
[ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+);?
"""x
    return Parsers.init!(Parsers.RegexAndMatchData(pattern))
end
# Cache of compiled regexes; `__init__` sizes it to `Threads.nthreads()` entries.
const content_disposition_pair_regex = Parsers.RegexAndMatchData[]

"""
    content_disposition_pair_regex_f()

Build and initialise the regex that matches a `key="value"` disposition parameter
at the start of the input, optionally followed by `;`. The key is captured in
group 1 and the text between the double quotes in group 2.
"""
function content_disposition_pair_regex_f()
    pattern = r"""^
[ \t]*([!#$%&'*+\-.^_`|~[:alnum:]]+)[ \t]*=[ \t]*"(.*?)";?
"""x
    return Parsers.init!(Parsers.RegexAndMatchData(pattern))
end
# Cache of compiled regexes; `__init__` sizes it to `Threads.nthreads()` entries.
const content_type_regex = Parsers.RegexAndMatchData[]

"""
    content_type_regex_f()

Build and initialise the case-insensitive regex that finds a `Content-Type:`
header and captures the media type (up to, but excluding, any `;` or whitespace)
in group 1.
"""
function content_type_regex_f()
    # The regex is compiled with the `x` (extended) flag, under which literal
    # whitespace in the pattern is ignored. The separator after the colon therefore
    # has to be spelled as a character class, otherwise `Content-Type: foo` (with a
    # space) would never match.
    r = Parsers.RegexAndMatchData(r"(?i)Content-Type:[ \t]*(\S*[^;\s])"x)
    return Parsers.init!(r)
end
"""
parse_multipart_chunk(chunk)
Parse a single multi-part chunk into a Multipart object. This will decode
the header and extract the contents from the byte array.
"""
function parse_multipart_chunk(chunk)
startIndex, end_index = find_header_boundary(chunk)
header = SubString(unsafe_string(pointer(chunk, startIndex), end_index - startIndex + 1))
content = view(chunk, end_index+1:lastindex(chunk))
# find content disposition
re = access_threaded(content_disposition_regex_f, content_disposition_regex)
if !Parsers.exec(re, header)
@warn "Content disposition is not specified dropping the chunk." String(chunk)
return nothing # Specifying content disposition is mandatory
end
content_disposition = Parsers.group(1, re, header)
re_flag = access_threaded(content_disposition_flag_regex_f, content_disposition_flag_regex)
re_pair = access_threaded(content_disposition_pair_regex_f, content_disposition_pair_regex)
name = nothing
filename = nothing
while !isempty(content_disposition)
if Parsers.exec(re_pair, content_disposition)
key = Parsers.group(1, re_pair, content_disposition)
value = Parsers.group(2, re_pair, content_disposition)
if key == "name"
name = value
elseif key == "filename"
filename = value
else
# do stuff with other content disposition key-value pairs
end
content_disposition = Parsers.nextbytes(re_pair, content_disposition)
elseif Parsers.exec(re_flag, content_disposition)
# do stuff with content disposition flags
content_disposition = Parsers.nextbytes(re_flag, content_disposition)
else
break
end
end
name === nothing && return
re_ct = access_threaded(content_type_regex_f, content_type_regex)
contenttype = Parsers.exec(re_ct, header) ? Parsers.group(1, re_ct, header) : "text/plain"
return Multipart(filename, IOBuffer(content), contenttype, "", name)
end
"""
parse_multipart_body(body, boundary)::Vector{Multipart}
Parse the multipart body received from the client breaking it into the various
chunks which are returned as an array of Multipart objects.
"""
function parse_multipart_body(body::AbstractVector{UInt8}, boundary::AbstractString)::Vector{Multipart}
multiparts = Multipart[]
idxs = find_multipart_boundaries(body, codeunits(boundary))
length(idxs) > 1 || (return multiparts)
for i in 1:length(idxs)-1
chunk = view(body, idxs[i][2]+1:idxs[i+1][1]-1)
push!(multiparts, parse_multipart_chunk(chunk))
end
return multiparts
end
"""
parse_multipart_form(req::Request)::Vector{Multipart}
Parse the full mutipart form submission from the client returning and
array of Multipart objects containing all the data.
The order of the multipart form data in the request should be preserved.
[RFC7578 5.2](https://tools.ietf.org/html/rfc7578#section-5.2).
The boundary delimiter MUST NOT appear inside any of the encapsulated parts. Note
that the boundary delimiter does not need to have '-' characters, but a line using
the boundary delimiter will start with '--' and end in \r\n.
[RFC2046 5.1](https://tools.ietf.org/html/rfc2046#section-5.1.1)
"""
function parse_multipart_form(req::Request)::Union{Vector{Multipart}, Nothing}
# parse boundary from Content-Type
m = match(r"multipart/form-data; boundary=(.*)$", req["Content-Type"])
m === nothing && return nothing
boundary_delimiter = m[1]
# [RFC2046 5.1.1](https://tools.ietf.org/html/rfc2046#section-5.1.1)
length(boundary_delimiter) > 70 && error("boundary delimiter must not be greater than 70 characters")
return parse_multipart_body(payload(req), boundary_delimiter)
end
# Module initialiser: empty every regex cache and resize it to one slot per thread.
# NOTE(review): the slots are not assigned here; presumably `access_threaded` fills
# them on first use — confirm against its definition in the parent module.
function __init__()
    caches = (
        content_disposition_regex,
        content_disposition_flag_regex,
        content_disposition_pair_regex,
        content_type_regex,
    )
    for cache in caches
        empty!(cache)
        resize!(cache, Threads.nthreads())
    end
    return
end
end # module MultiPartParsing