From 14a02da08df84807e2c674b0736527c80d7997d2 Mon Sep 17 00:00:00 2001
From: Jon Malmaud <malmaud@gmail.com>
Date: Wed, 3 Jun 2015 15:11:37 -0600
Subject: [PATCH 1/4] define getindex on regex matches to return captures.

---
 base/pcre.jl  | 19 +++++++++++++++++++
 base/regex.jl | 23 ++++++++++++++++++++---
 test/regex.jl |  4 ++++
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/base/pcre.jl b/base/pcre.jl
index 39a9e8da693f6..edf4530cb52de 100644
--- a/base/pcre.jl
+++ b/base/pcre.jl
@@ -140,4 +140,23 @@ function substring_number_from_name(re, name)
         (Ptr{Void}, Cstring), re, name)
 end
 
+function capture_names(re)
+    name_count = info(re, INFO_NAMECOUNT, UInt32)
+    name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
+    nametable_ptr = info(re, INFO_NAMETABLE, Ptr{UInt8})
+    names = Dict{Int, ASCIIString}()
+    for i=1:name_count
+        offset = (i-1)*name_entry_size + 1
+        # Entry 'i' of the name table begins with its capture group number, read
+        # here as two bytes, most significant first (unsafe_load is 1-based).
+        high_byte = UInt16(unsafe_load(nametable_ptr, offset))
+        low_byte = UInt16(unsafe_load(nametable_ptr, offset+1))
+        idx = (high_byte << 8) | low_byte
+        # The null-terminated name follows those two bytes; pointer arithmetic is
+        # 0-based, so nametable_ptr+offset+1 is the third byte of the entry.
+        names[idx] = bytestring(nametable_ptr+offset+1)
+    end
+    names
+end
+
 end # module
diff --git a/base/regex.jl b/base/regex.jl
index d6b229f922494..d5a7bc859737c 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -15,6 +15,8 @@ type Regex
     extra::Ptr{Void}
     ovec::Vector{Csize_t}
     match_data::Ptr{Void}
+    capture_name_to_idx::Dict{ASCIIString, Int}
+    idx_to_capture_name::Dict{Int, ASCIIString}
 
 
     function Regex(pattern::AbstractString, compile_options::Integer,
@@ -29,7 +31,8 @@ type Regex
             throw(ArgumentError("invalid regex match options: $match_options"))
         end
         re = compile(new(pattern, compile_options, match_options, C_NULL,
-                         C_NULL, Csize_t[], C_NULL))
+                         C_NULL, Csize_t[], C_NULL,
+                         Dict{ASCIIString, Int}(), Dict{Int, ASCIIString}()))
         finalizer(re, re->begin
                               re.regex == C_NULL || PCRE.free_re(re.regex)
                               re.match_data == C_NULL || PCRE.free_match_data(re.match_data)
@@ -57,6 +60,10 @@ function compile(regex::Regex)
         PCRE.jit_compile(regex.regex)
         regex.match_data = PCRE.create_match_data(regex.regex)
         regex.ovec = PCRE.get_ovec(regex.match_data)
+        regex.idx_to_capture_name = PCRE.capture_names(regex.regex)
+        for (i, name) in regex.idx_to_capture_name
+            regex.capture_name_to_idx[name] = i
+        end
     end
     regex
 end
@@ -92,6 +99,7 @@ immutable RegexMatch
     captures::Vector{Union(Void,SubString{UTF8String})}
     offset::Int
     offsets::Vector{Int}
+    regex::Regex
 end
 
 function show(io::IO, m::RegexMatch)
@@ -100,7 +108,10 @@ function show(io::IO, m::RegexMatch)
     if !isempty(m.captures)
         print(io, ", ")
         for i = 1:length(m.captures)
-            print(io, i, "=")
+            # If the capture group is named, show the name.
+            # Otherwise show its index.
+            capture_name = get(m.regex.idx_to_capture_name, i, i)
+            print(io, capture_name, "=")
             show(io, m.captures[i])
             if i < length(m.captures)
                 print(io, ", ")
@@ -110,6 +121,12 @@ function show(io::IO, m::RegexMatch)
     print(io, ")")
 end
 
+# Capture group extraction
+getindex(m::RegexMatch, idx::Int) = m.captures[idx]
+function getindex(m::RegexMatch, name::AbstractString)
+    m[m.regex.capture_name_to_idx[name]]
+end
+
 function ismatch(r::Regex, s::AbstractString, offset::Integer=0)
     compile(r)
     return PCRE.exec(r.regex, bytestring(s), offset, r.match_options,
@@ -136,7 +153,7 @@ function match(re::Regex, str::UTF8String, idx::Integer, add_opts::UInt32=UInt32
     cap = Union(Void,SubString{UTF8String})[
             ovec[2i+1] == PCRE.UNSET ? nothing : SubString(str, ovec[2i+1]+1, ovec[2i+2]) for i=1:n ]
     off = Int[ ovec[2i+1]+1 for i=1:n ]
-    RegexMatch(mat, cap, ovec[1]+1, off)
+    RegexMatch(mat, cap, ovec[1]+1, off, re)
 end
 
 match(re::Regex, str::Union(ByteString,SubString), idx::Integer, add_opts::UInt32=UInt32(0)) =
diff --git a/test/regex.jl b/test/regex.jl
index 938abd8df70c6..fed738270dae9 100644
--- a/test/regex.jl
+++ b/test/regex.jl
@@ -37,3 +37,7 @@ show(buf, r"")
 # regex match / search string must be a ByteString
 @test_throws ArgumentError match(r"test", utf32("this is a test"))
 @test_throws ArgumentError search(utf32("this is a test"), r"test")
+
+# Named subpatterns
+m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
+@test (m["a"], m[2], m["b"]) == ("x", "y", "z")

From 0479e7a9a8ba612ce8c141e0a777b6eece09e3ed Mon Sep 17 00:00:00 2001
From: Jon Malmaud <malmaud@gmail.com>
Date: Wed, 3 Jun 2015 16:16:06 -0600
Subject: [PATCH 2/4] Store capture names as symbols instead of strings

---
 base/regex.jl | 15 ++++++++-------
 test/regex.jl |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/base/regex.jl b/base/regex.jl
index d5a7bc859737c..150d7abd912b7 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -15,8 +15,8 @@ type Regex
     extra::Ptr{Void}
     ovec::Vector{Csize_t}
     match_data::Ptr{Void}
-    capture_name_to_idx::Dict{ASCIIString, Int}
-    idx_to_capture_name::Dict{Int, ASCIIString}
+    capture_name_to_idx::Dict{Symbol, Int}
+    idx_to_capture_name::Dict{Int, Symbol}
 
 
     function Regex(pattern::AbstractString, compile_options::Integer,
@@ -32,7 +32,7 @@ type Regex
         end
         re = compile(new(pattern, compile_options, match_options, C_NULL,
                          C_NULL, Csize_t[], C_NULL,
-                         Dict{ASCIIString, Int}(), Dict{Int, ASCIIString}()))
+                         Dict{Symbol, Int}(), Dict{Int, Symbol}()))
         finalizer(re, re->begin
                               re.regex == C_NULL || PCRE.free_re(re.regex)
                               re.match_data == C_NULL || PCRE.free_match_data(re.match_data)
@@ -60,9 +60,9 @@ function compile(regex::Regex)
         PCRE.jit_compile(regex.regex)
         regex.match_data = PCRE.create_match_data(regex.regex)
         regex.ovec = PCRE.get_ovec(regex.match_data)
-        regex.idx_to_capture_name = PCRE.capture_names(regex.regex)
-        for (i, name) in regex.idx_to_capture_name
-            regex.capture_name_to_idx[name] = i
+        for (idx, name) in PCRE.capture_names(regex.regex)
+            regex.capture_name_to_idx[Symbol(name)] = idx
+            regex.idx_to_capture_name[idx] = Symbol(name)
         end
     end
     regex
@@ -123,9 +123,10 @@ end
 
 # Capture group extraction
 getindex(m::RegexMatch, idx::Int) = m.captures[idx]
-function getindex(m::RegexMatch, name::AbstractString)
+function getindex(m::RegexMatch, name::Symbol)
     m[m.regex.capture_name_to_idx[name]]
 end
+getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)]
 
 function ismatch(r::Regex, s::AbstractString, offset::Integer=0)
     compile(r)
diff --git a/test/regex.jl b/test/regex.jl
index fed738270dae9..e76776b9692ae 100644
--- a/test/regex.jl
+++ b/test/regex.jl
@@ -40,4 +40,4 @@ show(buf, r"")
 
 # Named subpatterns
 m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
-@test (m["a"], m[2], m["b"]) == ("x", "y", "z")
+@test (m[:a], m[2], m["b"]) == ("x", "y", "z")

From 1b0127cd7af78b5e9a769a667bb17739c19641fd Mon Sep 17 00:00:00 2001
From: Jon Malmaud <malmaud@gmail.com>
Date: Wed, 24 Jun 2015 10:54:21 -0400
Subject: [PATCH 3/4] Don't cache capture names in regex object

---
 base/regex.jl | 19 +++++++------------
 test/regex.jl |  1 +
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/base/regex.jl b/base/regex.jl
index 150d7abd912b7..3e7e2cd93ac31 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -15,9 +15,6 @@ type Regex
     extra::Ptr{Void}
     ovec::Vector{Csize_t}
     match_data::Ptr{Void}
-    capture_name_to_idx::Dict{Symbol, Int}
-    idx_to_capture_name::Dict{Int, Symbol}
-
 
     function Regex(pattern::AbstractString, compile_options::Integer,
                    match_options::Integer)
@@ -31,8 +28,7 @@ type Regex
             throw(ArgumentError("invalid regex match options: $match_options"))
         end
         re = compile(new(pattern, compile_options, match_options, C_NULL,
-                         C_NULL, Csize_t[], C_NULL,
-                         Dict{Symbol, Int}(), Dict{Int, Symbol}()))
+                         C_NULL, Csize_t[], C_NULL))
         finalizer(re, re->begin
                               re.regex == C_NULL || PCRE.free_re(re.regex)
                               re.match_data == C_NULL || PCRE.free_match_data(re.match_data)
@@ -60,10 +56,6 @@ function compile(regex::Regex)
         PCRE.jit_compile(regex.regex)
         regex.match_data = PCRE.create_match_data(regex.regex)
         regex.ovec = PCRE.get_ovec(regex.match_data)
-        for (idx, name) in PCRE.capture_names(regex.regex)
-            regex.capture_name_to_idx[Symbol(name)] = idx
-            regex.idx_to_capture_name[idx] = Symbol(name)
-        end
     end
     regex
 end
@@ -105,12 +97,13 @@ end
 function show(io::IO, m::RegexMatch)
     print(io, "RegexMatch(")
     show(io, m.match)
+    idx_to_capture_name = PCRE.capture_names(m.regex.regex)
     if !isempty(m.captures)
         print(io, ", ")
         for i = 1:length(m.captures)
             # If the capture group is named, show the name.
             # Otherwise show its index.
-            capture_name = get(m.regex.idx_to_capture_name, i, i)
+            capture_name = get(idx_to_capture_name, i, i)
             print(io, capture_name, "=")
             show(io, m.captures[i])
             if i < length(m.captures)
@@ -122,9 +115,11 @@ function show(io::IO, m::RegexMatch)
 end
 
 # Capture group extraction
-getindex(m::RegexMatch, idx::Int) = m.captures[idx]
+getindex(m::RegexMatch, idx::Integer) = m.captures[idx]
 function getindex(m::RegexMatch, name::Symbol)
-    m[m.regex.capture_name_to_idx[name]]
+    idx = PCRE.substring_number_from_name(m.regex.regex, name)
+    idx <= 0 && error("no capture group named $name found in regex")
+    m[idx]
 end
 getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)]
 
diff --git a/test/regex.jl b/test/regex.jl
index e76776b9692ae..aaf8eafa72a39 100644
--- a/test/regex.jl
+++ b/test/regex.jl
@@ -41,3 +41,4 @@ show(buf, r"")
 # Named subpatterns
 m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
 @test (m[:a], m[2], m["b"]) == ("x", "y", "z")
+@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"

From 1b8d47aec9b6d3faae2a31c2f119c11ca2892dc4 Mon Sep 17 00:00:00 2001
From: Jon Malmaud <malmaud@gmail.com>
Date: Thu, 2 Jul 2015 15:34:49 -0400
Subject: [PATCH 4/4] Added manual section on accessing groups

---
 doc/manual/strings.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
index 5e24be59235ee..d43cefaeb1153 100644
--- a/doc/manual/strings.rst
+++ b/doc/manual/strings.rst
@@ -697,6 +697,16 @@ use destructuring syntax to bind them to local variables::
     julia> first, second, third = m.captures; first
     "a"
 
+Captures can also be accessed by indexing the :obj:`RegexMatch` object with
+the number of the capture group or its name (as a symbol or a string)::
+
+    julia> m=match(r"(?P<hour>\d+):(?P<minute>\d+)","12:45")
+    RegexMatch("12:45", hour="12", minute="45")
+    julia> m[:minute]
+    "45"
+    julia> m[2]
+    "45"
+
 You can modify the behavior of regular expressions by some combination
 of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double
 quote mark. These flags have the same meaning as they do in Perl, as