Skip to content

Commit

Permalink
fix #31521, make regexes thread-safe (#32381)
Browse files Browse the repository at this point in the history
  • Loading branch information
JeffBezanson authored Jun 24, 2019
1 parent f6049d6 commit cef655a
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 59 deletions.
4 changes: 4 additions & 0 deletions base/client.jl
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ MainInclude.include
function _start()
empty!(ARGS)
append!(ARGS, Core.ARGS)
if ccall(:jl_generating_output, Cint, ()) != 0 && JLOptions().incremental == 0
# clear old invalid pointers
PCRE.__init__()
end
try
exec_options(JLOptions())
catch
Expand Down
73 changes: 51 additions & 22 deletions base/pcre.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,36 @@ include(string(length(Core.ARGS) >= 2 ? Core.ARGS[2] : "", "pcre_h.jl")) # incl

const PCRE_LIB = "libpcre2-8"

const JIT_STACK = RefValue{Ptr{Cvoid}}(C_NULL)
const MATCH_CONTEXT = RefValue{Ptr{Cvoid}}(C_NULL)
function create_match_context()
JIT_STACK_START_SIZE = 32768
JIT_STACK_MAX_SIZE = 1048576
jit_stack = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Cvoid},
(Cint, Cint, Ptr{Cvoid}),
JIT_STACK_START_SIZE, JIT_STACK_MAX_SIZE, C_NULL)
ctx = ccall((:pcre2_match_context_create_8, PCRE_LIB),
Ptr{Cvoid}, (Ptr{Cvoid},), C_NULL)
ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Cvoid,
(Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}), ctx, C_NULL, jit_stack)
return ctx
end

function __init__()
try
JIT_STACK_START_SIZE = 32768
JIT_STACK_MAX_SIZE = 1048576
JIT_STACK[] = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Cvoid},
(Cint, Cint, Ptr{Cvoid}),
JIT_STACK_START_SIZE, JIT_STACK_MAX_SIZE, C_NULL)
MATCH_CONTEXT[] = ccall((:pcre2_match_context_create_8, PCRE_LIB),
Ptr{Cvoid}, (Ptr{Cvoid},), C_NULL)
ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Cvoid,
(Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}), MATCH_CONTEXT[], C_NULL, JIT_STACK[])
catch ex
Base.showerror_nostdio(ex,
"WARNING: Error during initialization of module PCRE")
const THREAD_MATCH_CONTEXTS = Ptr{Cvoid}[C_NULL]

_tid() = Int(ccall(:jl_threadid, Int16, ())+1)
_nth() = Int(unsafe_load(cglobal(:jl_n_threads, Cint)))

function get_local_match_context()
tid = _tid()
ctx = @inbounds THREAD_MATCH_CONTEXTS[tid]
if ctx == C_NULL
@inbounds THREAD_MATCH_CONTEXTS[tid] = ctx = create_match_context()
end
return ctx
end

function __init__()
resize!(THREAD_MATCH_CONTEXTS, _nth())
fill!(THREAD_MATCH_CONTEXTS, C_NULL)
end

# supported options for different use cases
Expand Down Expand Up @@ -87,12 +99,16 @@ function info(regex::Ptr{Cvoid}, what::Integer, ::Type{T}) where T
buf[]
end

function get_ovec(match_data)
ptr = ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t},
(Ptr{Cvoid},), match_data)
function ovec_length(match_data)
n = ccall((:pcre2_get_ovector_count_8, PCRE_LIB), UInt32,
(Ptr{Cvoid},), match_data)
unsafe_wrap(Array, ptr, 2n, own = false)
return 2n
end

function ovec_ptr(match_data)
ptr = ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t},
(Ptr{Cvoid},), match_data)
return ptr
end

function compile(pattern::AbstractString, options::Integer)
Expand Down Expand Up @@ -132,15 +148,28 @@ function err_message(errno)
GC.@preserve buffer unsafe_string(pointer(buffer))
end

function exec(re,subject,offset,options,match_data)
function exec(re, subject, offset, options, match_data)
rc = ccall((:pcre2_match_8, PCRE_LIB), Cint,
(Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, Cuint, Ptr{Cvoid}, Ptr{Cvoid}),
re, subject, sizeof(subject), offset, options, match_data, MATCH_CONTEXT[])
re, subject, sizeof(subject), offset, options, match_data, get_local_match_context())
# rc == -1 means no match, -2 means partial match.
rc < -2 && error("PCRE.exec error: $(err_message(rc))")
rc >= 0
end

function exec_r(re, subject, offset, options)
match_data = create_match_data(re)
ans = exec(re, subject, offset, options, match_data)
free_match_data(match_data)
return ans
end

function exec_r_data(re, subject, offset, options)
match_data = create_match_data(re)
ans = exec(re, subject, offset, options, match_data)
return ans, match_data
end

function create_match_data(re)
ccall((:pcre2_match_data_create_from_pattern_8, PCRE_LIB),
Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}), re, C_NULL)
Expand Down
91 changes: 54 additions & 37 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ mutable struct Regex
compile_options::UInt32
match_options::UInt32
regex::Ptr{Cvoid}
extra::Ptr{Cvoid}
ovec::Vector{Csize_t}
match_data::Ptr{Cvoid}

function Regex(pattern::AbstractString, compile_options::Integer,
match_options::Integer)
Expand All @@ -37,11 +34,9 @@ mutable struct Regex
if (match_options & ~PCRE.EXECUTE_MASK) !=0
throw(ArgumentError("invalid regex match options: $match_options"))
end
re = compile(new(pattern, compile_options, match_options, C_NULL,
C_NULL, Csize_t[], C_NULL))
re = compile(new(pattern, compile_options, match_options, C_NULL))
finalizer(re) do re
re.regex == C_NULL || PCRE.free_re(re.regex)
re.match_data == C_NULL || PCRE.free_match_data(re.match_data)
end
re
end
Expand All @@ -68,8 +63,6 @@ function compile(regex::Regex)
if regex.regex == C_NULL
regex.regex = PCRE.compile(regex.pattern, regex.compile_options)
PCRE.jit_compile(regex.regex)
regex.match_data = PCRE.create_match_data(regex.regex)
regex.ovec = PCRE.get_ovec(regex.match_data)
end
regex
end
Expand Down Expand Up @@ -164,14 +157,12 @@ getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)]

function occursin(r::Regex, s::AbstractString; offset::Integer=0)
compile(r)
return PCRE.exec(r.regex, String(s), offset, r.match_options,
r.match_data)
return PCRE.exec_r(r.regex, String(s), offset, r.match_options)
end

function occursin(r::Regex, s::SubString; offset::Integer=0)
compile(r)
return PCRE.exec(r.regex, s, offset, r.match_options,
r.match_data)
return PCRE.exec_r(r.regex, s, offset, r.match_options)
end

"""
Expand All @@ -198,14 +189,12 @@ true
"""
function startswith(s::AbstractString, r::Regex)
compile(r)
return PCRE.exec(r.regex, String(s), 0, r.match_options | PCRE.ANCHORED,
r.match_data)
return PCRE.exec_r(r.regex, String(s), 0, r.match_options | PCRE.ANCHORED)
end

function startswith(s::SubString, r::Regex)
compile(r)
return PCRE.exec(r.regex, s, 0, r.match_options | PCRE.ANCHORED,
r.match_data)
return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ANCHORED)
end

"""
Expand All @@ -232,14 +221,12 @@ true
"""
function endswith(s::AbstractString, r::Regex)
compile(r)
return PCRE.exec(r.regex, String(s), 0, r.match_options | PCRE.ENDANCHORED,
r.match_data)
return PCRE.exec_r(r.regex, String(s), 0, r.match_options | PCRE.ENDANCHORED)
end

function endswith(s::SubString, r::Regex)
compile(r)
return PCRE.exec(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED,
r.match_data)
return PCRE.exec_r(r.regex, s, 0, r.match_options | PCRE.ENDANCHORED)
end

"""
Expand Down Expand Up @@ -274,36 +261,52 @@ function match end
function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, add_opts::UInt32=UInt32(0))
compile(re)
opts = re.match_options | add_opts
if !PCRE.exec(re.regex, str, idx-1, opts, re.match_data)
matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts)
if !matched
PCRE.free_match_data(data)
return nothing
end
ovec = re.ovec
n = div(length(ovec),2) - 1
mat = SubString(str, ovec[1]+1, prevind(str, ovec[2]+1))
cap = Union{Nothing,SubString{String}}[ovec[2i+1] == PCRE.UNSET ? nothing :
SubString(str, ovec[2i+1]+1,
prevind(str, ovec[2i+2]+1)) for i=1:n]
off = Int[ ovec[2i+1]+1 for i=1:n ]
RegexMatch(mat, cap, ovec[1]+1, off, re)
n = div(PCRE.ovec_length(data), 2) - 1
p = PCRE.ovec_ptr(data)
mat = SubString(str, unsafe_load(p, 1)+1, prevind(str, unsafe_load(p, 2)+1))
cap = Union{Nothing,SubString{String}}[unsafe_load(p,2i+1) == PCRE.UNSET ? nothing :
SubString(str, unsafe_load(p,2i+1)+1,
prevind(str, unsafe_load(p,2i+2)+1)) for i=1:n]
off = Int[ unsafe_load(p,2i+1)+1 for i=1:n ]
result = RegexMatch(mat, cap, unsafe_load(p,1)+1, off, re)
PCRE.free_match_data(data)
return result
end

match(r::Regex, s::AbstractString) = match(r, s, firstindex(s))
match(r::Regex, s::AbstractString, i::Integer) = throw(ArgumentError(
"regex matching is only available for the String type; use String(s) to convert"
))

findnext(re::Regex, str::Union{String,SubString}, idx::Integer) = _findnext_re(re, str, idx, C_NULL)

# TODO: return only start index and update deprecation
function findnext(re::Regex, str::Union{String,SubString}, idx::Integer)
function _findnext_re(re::Regex, str::Union{String,SubString}, idx::Integer, match_data::Ptr{Cvoid})
if idx > nextind(str,lastindex(str))
throw(BoundsError())
end
opts = re.match_options
compile(re)
if PCRE.exec(re.regex, str, idx-1, opts, re.match_data)
(Int(re.ovec[1])+1):prevind(str,Int(re.ovec[2])+1)
alloc = match_data == C_NULL
if alloc
matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts)
else
matched = PCRE.exec(re.regex, str, idx-1, opts, match_data)
data = match_data
end
if matched
p = PCRE.ovec_ptr(data)
ans = (Int(unsafe_load(p,1))+1):prevind(str,Int(unsafe_load(p,2))+1)
else
nothing
ans = nothing
end
alloc && PCRE.free_match_data(data)
return ans
end
findnext(r::Regex, s::AbstractString, idx::Integer) = throw(ArgumentError(
"regex search is only available for the String type; use String(s) to convert"
Expand Down Expand Up @@ -384,9 +387,23 @@ julia> replace(msg, r"#(.+)# from (?<from>\\w+)" => s"FROM: \\g<from>; MESSAGE:
"""
macro s_str(string) SubstitutionString(string) end

# replacement

struct RegexAndMatchData
re::Regex
match_data::Ptr{Cvoid}
RegexAndMatchData(re::Regex) = (compile(re); new(re, PCRE.create_match_data(re.regex)))
end

findnext(pat::RegexAndMatchData, str, i) = _findnext_re(pat.re, str, i, pat.match_data)

_pat_replacer(r::Regex) = RegexAndMatchData(r)

_free_pat_replacer(r::RegexAndMatchData) = PCRE.free_match_data(r.match_data)

replace_err(repl) = error("Bad replacement string: $repl")

function _write_capture(io, re, group)
function _write_capture(io, re::RegexAndMatchData, group)
len = PCRE.substring_length_bynumber(re.match_data, group)
ensureroom(io, len+1)
PCRE.substring_copy_bynumber(re.match_data, group,
Expand All @@ -395,7 +412,7 @@ function _write_capture(io, re, group)
io.size = max(io.size, io.ptr - 1)
end

function _replace(io, repl_s::SubstitutionString, str, r, re)
function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData)
SUB_CHAR = '\\'
GROUP_CHAR = 'g'
LBRACKET = '<'
Expand Down Expand Up @@ -439,8 +456,8 @@ function _replace(io, repl_s::SubstitutionString, str, r, re)
if all(isdigit, groupname)
_write_capture(io, re, parse(Int, groupname))
else
group = PCRE.substring_number_from_name(re.regex, groupname)
group < 0 && replace_err("Group $groupname not found in regex $re")
group = PCRE.substring_number_from_name(re.re.regex, groupname)
group < 0 && replace_err("Group $groupname not found in regex $(re.re)")
_write_capture(io, re, group)
end
i = nextind(repl, i)
Expand Down
5 changes: 5 additions & 0 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -426,13 +426,17 @@ replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{<:AbstractChar}},
count::Integer=typemax(Int)) =
replace(str, in(first(pat_repl)) => last(pat_repl), count=count)

_pat_replacer(x) = x
_free_pat_replacer(x) = nothing

function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int))
pattern, repl = pat_repl
count == 0 && return str
count < 0 && throw(DomainError(count, "`count` must be non-negative."))
n = 1
e = lastindex(str)
i = a = firstindex(str)
pattern = _pat_replacer(pattern)
r = something(findnext(pattern,str,i), 0)
j, k = first(r), last(r)
out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str)))
Expand All @@ -453,6 +457,7 @@ function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int))
j, k = first(r), last(r)
n += 1
end
_free_pat_replacer(pattern)
write(out, SubString(str,i))
String(take!(out))
end
Expand Down

1 comment on commit cef655a

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

Please sign in to comment.