From c5aad8e3cdcb927ae2b20c0a64fe8376a1c593d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Sun, 18 Jun 2023 17:58:55 +0200
Subject: [PATCH 1/6] Expose captures

---
 src/utf32.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
diff --git a/src/utf32.rs b/src/utf32.rs
index 4c8718a..2acbd76 100644
--- a/src/utf32.rs
+++ b/src/utf32.rs
@@ -1,5 +1,6 @@
 use crate::ffi::CodeUnitWidth32;
 pub use crate::regex_impl::Match as MatchImpl;
+pub use crate::regex_impl::Captures as CapturesImpl;
 use crate::regex_impl::{Regex as RegexImpl, RegexBuilder as RegexBuilderImpl};
 
 /// A compiled PCRE2 regular expression for matching sequences of Rust chars.
@@ -17,6 +18,19 @@ pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth32>;
 /// of the subject string.
 pub type Match<'s> = MatchImpl<'s, CodeUnitWidth32>;
 
+/// Captures represents a group of captured substrings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched substring is *also* available via the
+/// `name` method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are indices in `char`s, not bytes.
+///
+/// `'s` is the lifetime of the matched subject string.
+pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth32>;
+
 #[cfg(test)]
 mod tests {
     use super::{CodeUnitWidth32, Regex, RegexBuilder};

From e216d5b388719195c4ccf155a4ccc1ebc452d061 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Wed, 21 Jun 2023 18:29:34 +0200
Subject: [PATCH 2/6] Add very basic replacement support

---
 src/ffi.rs        | 116 ++++++++++++++++++++++++++++++++++++++++++++++
 src/regex_impl.rs | 113 +++++++++++++++++++++++++++++++++++++++++++-
 src/utf32.rs      |   2 +-
 3 files changed, 228 insertions(+), 3 deletions(-)

diff --git a/src/ffi.rs b/src/ffi.rs
index 8bb9902..a7c6246 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -153,6 +153,20 @@ pub trait CodeUnitWidth: std::fmt::Debug {
 
     unsafe fn pcre2_get_ovector_pointer(arg1: *mut Self::pcre2_match_data) -> *mut usize;
     unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32;
+
+    unsafe fn pcre2_substitute(
+        code: *const Self::pcre2_code,
+        subject: Self::PCRE2_SPTR,
+        length: usize,
+        startoffset: usize,
+        options: u32,
+        match_data: *mut Self::pcre2_match_data,
+        mcontext: *mut Self::pcre2_match_context,
+        replacement: Self::PCRE2_SPTR,
+        rlength: usize,
+        outputbuffer: *mut Self::PCRE2_CHAR,
+        outputlengthptr: *mut usize,
+    ) -> ::libc::c_int;
 }
 
 #[cfg(feature = "utf8")]
@@ -282,6 +296,33 @@ impl CodeUnitWidth for CodeUnitWidth8 {
     unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32 {
         pcre2_get_ovector_count_8(arg1)
     }
+    unsafe fn pcre2_substitute(
+        code: *const Self::pcre2_code,
+        subject: Self::PCRE2_SPTR,
+        length: usize,
+        startoffset: usize,
+        options: u32,
+        match_data: *mut Self::pcre2_match_data,
+        mcontext: *mut Self::pcre2_match_context,
+        replacement: Self::PCRE2_SPTR,
+        rlength: usize,
+        outputbuffer: *mut Self::PCRE2_CHAR,
+        outputlengthptr: *mut usize,
+    ) -> ::libc::c_int {
+        pcre2_substitute_8(
+            code,
+            subject,
+            length,
+            startoffset,
+            options,
+            match_data,
+            mcontext,
+            replacement,
+            rlength,
+            outputbuffer,
+            outputlengthptr,
+        )
+    }
 }
 
 #[cfg(feature = "utf32")]
@@ -415,6 +456,34 @@ impl CodeUnitWidth for CodeUnitWidth32 {
     unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32 {
         pcre2_get_ovector_count_32(arg1)
     }
+
+    unsafe fn pcre2_substitute(
+        code: *const Self::pcre2_code,
+        subject: Self::PCRE2_SPTR,
+        length: usize,
+        startoffset: usize,
+        options: u32,
+        match_data: *mut Self::pcre2_match_data,
+        mcontext: *mut Self::pcre2_match_context,
+        replacement: Self::PCRE2_SPTR,
+        rlength: usize,
+        outputbuffer: *mut Self::PCRE2_CHAR,
+        outputlengthptr: *mut usize,
+    ) -> ::libc::c_int {
+        pcre2_substitute_32(
+            code,
+            subject,
+            length,
+            startoffset,
+            options,
+            match_data,
+            mcontext,
+            replacement,
+            rlength,
+            outputbuffer,
+            outputlengthptr,
+        )
+    }
 }
 
 /// Returns true if and only if PCRE2 believes that JIT is available.
@@ -613,6 +682,53 @@ impl<W: CodeUnitWidth> Code<W> {
             Ok(1 + count as usize)
         }
     }
+
+    pub unsafe fn substitute(
+        &self,
+        mut subject: &[W::SubjectChar],
+        mut replacement: &[W::SubjectChar],
+        start: usize,
+        options: u32,
+        output: &mut [W::PCRE2_CHAR],
+        output_len: &mut usize,
+    ) -> Result<usize, Error> {
+        // When the subject is empty, we use an empty slice
+        // with a known valid pointer. Otherwise, slices derived
+        // from, e.g., an empty `Vec<u8>` may not have a valid
+        // pointer, since creating an empty `Vec` is guaranteed
+        // to not allocate.
+        if subject.is_empty() {
+            subject = &[];
+        }
+        if replacement.is_empty() {
+            replacement = &[];
+        }
+        let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject);
+        let (repl_ptr, repl_len) = W::subject_to_sptr_len(replacement);
+
+        // SAFETY: caller must ensure `*output_len <= output.len()` and that `options` are valid.
+        let rc = unsafe {
+            W::pcre2_substitute(
+                self.code,
+                subj_ptr,
+                subj_len,
+                start,
+                options,
+                ptr::null_mut(),
+                // should probably not be null for performance reasons?
+                ptr::null_mut(),
+                repl_ptr,
+                repl_len,
+                output.as_mut_ptr() as *mut W::PCRE2_CHAR,
+                output_len as *mut usize,
+            )
+        };
+        if rc >= 0 {
+            return Ok(rc as usize);
+        }
+        // this might warrant a new error type
+        Err(Error::info(rc))
+    }
 }
 
 /// A low level representation of PCRE2's compilation context.
diff --git a/src/regex_impl.rs b/src/regex_impl.rs
index dd60ae0..1a0f9eb 100644
--- a/src/regex_impl.rs
+++ b/src/regex_impl.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::cell::RefCell;
 use std::collections::HashMap;
 use std::fmt;
@@ -6,8 +7,10 @@ use std::sync::Arc;
 
 use log::debug;
 use pcre2_sys::{
-    PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_NEVER_UTF,
-    PCRE2_NEWLINE_ANYCRLF, PCRE2_NO_UTF_CHECK, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF,
+    PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_ERROR_NOMEMORY, PCRE2_EXTENDED, PCRE2_MULTILINE,
+    PCRE2_NEVER_UTF, PCRE2_NEWLINE_ANYCRLF, PCRE2_NO_UTF_CHECK, PCRE2_SUBSTITUTE_EXTENDED,
+    PCRE2_SUBSTITUTE_GLOBAL, PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, PCRE2_SUBSTITUTE_UNSET_EMPTY,
+    PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF,
 };
 use thread_local::ThreadLocal;
 
@@ -492,6 +495,112 @@ impl<W: CodeUnitWidth> Regex<W> {
     pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> {
         &self.capture_names_idx
     }
+
+    /// Replace the first instance of
+    pub fn replace<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        replacement: &'s [W::SubjectChar],
+        extended: bool,
+    ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
+    where
+        [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
+        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
+        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
+            std::fmt::Debug,
+    {
+        self.replace_impl(subject, replacement, false, extended)
+    }
+
+    pub fn replace_all<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        replacement: &'s [W::SubjectChar],
+        extended: bool,
+    ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
+    where
+        [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
+        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
+        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
+            std::fmt::Debug,
+    {
+        self.replace_impl(subject, replacement, true, extended)
+    }
+
+    #[inline]
+    fn replace_impl<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        replacement: &'s [W::SubjectChar],
+        replace_all: bool,
+        extended: bool,
+    ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
+    where
+        [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
+        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
+        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
+            std::fmt::Debug,
+    {
+        let mut options: u32 = 0;
+        options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
+        // TODO: this should probably be configurabe from user-side
+        options |= PCRE2_SUBSTITUTE_UNSET_EMPTY;
+        if extended {
+            options |= PCRE2_SUBSTITUTE_EXTENDED;
+        }
+        if replace_all {
+            options |= PCRE2_SUBSTITUTE_GLOBAL;
+        }
+
+        // TODO: we can use MaybeUninit to avoid allocation
+        let mut capacity = 256;
+        let mut output: Vec<W::PCRE2_CHAR> = Vec::with_capacity(capacity);
+        capacity = output.capacity();
+        let mut saved_capacity = capacity;
+
+        let mut rc = unsafe {
+            self.code
+                .substitute(subject, replacement, 0, options, &mut output, &mut capacity)
+        };
+
+        if let Err(e) = &rc {
+            if e.code() == PCRE2_ERROR_NOMEMORY {
+                if output.try_reserve(capacity - output.capacity()).is_err() {
+                    return Err(rc.unwrap_err());
+                }
+                capacity = output.capacity();
+                saved_capacity = capacity;
+                rc = unsafe {
+                    self.code.substitute(
+                        subject,
+                        replacement,
+                        0,
+                        options,
+                        &mut output,
+                        &mut capacity,
+                    )
+                };
+            }
+        }
+
+        Ok(match rc? {
+            0 => Cow::Borrowed(subject),
+            _ => {
+                // +1 to account for null terminator
+                let result = unsafe {
+                    Vec::from_raw_parts(output.as_mut_ptr(), capacity + 1, saved_capacity)
+                };
+                std::mem::forget(output);
+                let x: Vec<W::SubjectChar> = result
+                    .into_iter()
+                    .map(W::PCRE2_CHAR::try_into)
+                    .collect::<Result<Vec<W::SubjectChar>, _>>()
+                    .expect("PCRE2 returned invalid characters");
+
+                Cow::Owned(x)
+            }
+        })
+    }
 }
 
 /// Advanced or  "lower level" search methods.
diff --git a/src/utf32.rs b/src/utf32.rs
index 2acbd76..44cd39c 100644
--- a/src/utf32.rs
+++ b/src/utf32.rs
@@ -1,6 +1,6 @@
 use crate::ffi::CodeUnitWidth32;
-pub use crate::regex_impl::Match as MatchImpl;
 pub use crate::regex_impl::Captures as CapturesImpl;
+pub use crate::regex_impl::Match as MatchImpl;
 use crate::regex_impl::{Regex as RegexImpl, RegexBuilder as RegexBuilderImpl};
 
 /// A compiled PCRE2 regular expression for matching sequences of Rust chars.

From 76c92f9a657c58ba6b6201af9c00e32dc759e29f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Thu, 22 Jun 2023 19:45:05 +0200
Subject: [PATCH 3/6] Avoid including null terminators

---
 src/regex_impl.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/regex_impl.rs b/src/regex_impl.rs
index 1a0f9eb..f8e7e19 100644
--- a/src/regex_impl.rs
+++ b/src/regex_impl.rs
@@ -556,7 +556,6 @@ impl<W: CodeUnitWidth> Regex<W> {
         let mut capacity = 256;
         let mut output: Vec<W::PCRE2_CHAR> = Vec::with_capacity(capacity);
         capacity = output.capacity();
-        let mut saved_capacity = capacity;
 
         let mut rc = unsafe {
             self.code
@@ -569,7 +568,6 @@ impl<W: CodeUnitWidth> Regex<W> {
                     return Err(rc.unwrap_err());
                 }
                 capacity = output.capacity();
-                saved_capacity = capacity;
                 rc = unsafe {
                     self.code.substitute(
                         subject,
@@ -583,23 +581,25 @@ impl<W: CodeUnitWidth> Regex<W> {
             }
         }
 
-        Ok(match rc? {
+        let s = match rc? {
             0 => Cow::Borrowed(subject),
             _ => {
                 // +1 to account for null terminator
-                let result = unsafe {
-                    Vec::from_raw_parts(output.as_mut_ptr(), capacity + 1, saved_capacity)
-                };
-                std::mem::forget(output);
-                let x: Vec<W::SubjectChar> = result
+                unsafe { output.set_len(capacity + 1) }; 
+
+                // this is really just a type cast
+                let x: Vec<W::SubjectChar> = output
                     .into_iter()
                     .map(W::PCRE2_CHAR::try_into)
+                    // we don't want to return the null terminator
+                    .take(capacity)
                     .collect::<Result<Vec<W::SubjectChar>, _>>()
                     .expect("PCRE2 returned invalid characters");
 
                 Cow::Owned(x)
             }
-        })
+        };
+        Ok(s)
     }
 }
 

From 515382f681ade31c783cbbc0f092415d4b663b77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Mon, 26 Jun 2023 01:57:36 +0200
Subject: [PATCH 4/6] Loosen type-constraints, fix typos, add test

---
 src/ffi.rs        |  2 +-
 src/regex_impl.rs | 29 ++++++++++++++---------------
 src/utf32.rs      | 16 ++++++++++++++++
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/ffi.rs b/src/ffi.rs
index a7c6246..25a90ab 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -90,7 +90,7 @@ pub trait CodeUnitWidth: std::fmt::Debug {
     type pcre2_match_context;
     type pcre2_match_data;
     type pcre2_jit_stack;
-    type PCRE2_CHAR;
+    type PCRE2_CHAR: TryInto<Self::SubjectChar>;
     type PCRE2_SPTR;
     type name_table_entry: NameTableEntry;
     type SubjectChar: Copy;
diff --git a/src/regex_impl.rs b/src/regex_impl.rs
index f8e7e19..e71180b 100644
--- a/src/regex_impl.rs
+++ b/src/regex_impl.rs
@@ -496,7 +496,8 @@ impl<W: CodeUnitWidth> Regex<W> {
         &self.capture_names_idx
     }
 
-    /// Replace the first instance of
+    /// Replace the first match in the subject string with the replacement.
+    /// If `extended` is true, enable PCRE2's extended replacement syntax.
     pub fn replace<'s>(
         &self,
         subject: &'s [W::SubjectChar],
@@ -505,13 +506,12 @@ impl<W: CodeUnitWidth> Regex<W> {
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
     where
         [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
-        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
-        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
-            std::fmt::Debug,
     {
         self.replace_impl(subject, replacement, false, extended)
     }
 
+    /// Replace all non-overlapping matches in the subject string with the replacement.
+    /// If `extended` is true, enable PCRE2's extended replacement syntax.
     pub fn replace_all<'s>(
         &self,
         subject: &'s [W::SubjectChar],
@@ -520,9 +520,6 @@ impl<W: CodeUnitWidth> Regex<W> {
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
     where
         [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
-        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
-        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
-            std::fmt::Debug,
     {
         self.replace_impl(subject, replacement, true, extended)
     }
@@ -537,13 +534,10 @@ impl<W: CodeUnitWidth> Regex<W> {
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
     where
         [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
-        W::PCRE2_CHAR: TryInto<W::SubjectChar>,
-        <<W as CodeUnitWidth>::PCRE2_CHAR as TryInto<<W as CodeUnitWidth>::SubjectChar>>::Error:
-            std::fmt::Debug,
     {
         let mut options: u32 = 0;
         options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
-        // TODO: this should probably be configurabe from user-side
+        // TODO: this should probably be configurable from user-side
         options |= PCRE2_SUBSTITUTE_UNSET_EMPTY;
         if extended {
             options |= PCRE2_SUBSTITUTE_EXTENDED;
@@ -585,16 +579,21 @@ impl<W: CodeUnitWidth> Regex<W> {
             0 => Cow::Borrowed(subject),
             _ => {
                 // +1 to account for null terminator
-                unsafe { output.set_len(capacity + 1) }; 
+                unsafe { output.set_len(capacity + 1) };
+
+                // All inputs contained valid chars, so we expect all outputs to as well.
+                let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar {
+                    c.try_into()
+                        .unwrap_or_else(|_| panic!("all output expected to be valid chars"))
+                };
 
                 // this is really just a type cast
                 let x: Vec<W::SubjectChar> = output
                     .into_iter()
-                    .map(W::PCRE2_CHAR::try_into)
+                    .map(to_char)
                     // we don't want to return the null terminator
                     .take(capacity)
-                    .collect::<Result<Vec<W::SubjectChar>, _>>()
-                    .expect("PCRE2 returned invalid characters");
+                    .collect::<Vec<W::SubjectChar>>();
 
                 Cow::Owned(x)
             }
diff --git a/src/utf32.rs b/src/utf32.rs
index 44cd39c..2708705 100644
--- a/src/utf32.rs
+++ b/src/utf32.rs
@@ -33,6 +33,8 @@ pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth32>;
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
+
     use super::{CodeUnitWidth32, Regex, RegexBuilder};
     use crate::is_jit_available;
 
@@ -114,6 +116,20 @@ mod tests {
         assert!(re.is_match(&b("foo\nabc\nbar")).unwrap());
     }
 
+    #[test]
+    fn replace() {
+        let re = RegexBuilder::new().build(b(".")).unwrap();
+        let s = b("abc");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("bc"));
+    }
+
     #[test]
     fn ucp() {
         let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap();

From 7d78cb07b1a8b538030ab884f43440b1a9346dd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?=
 <36937807+henrikhorluck@users.noreply.github.com>
Date: Mon, 26 Jun 2023 02:20:35 +0200
Subject: [PATCH 5/6] Fix weird lifetime

- There is no reason for the replacement to share lifetime with the
  subject, because the replacement is not present in the return value,
  even if no replacement occurred
---
 src/regex_impl.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/regex_impl.rs b/src/regex_impl.rs
index e71180b..a9d591a 100644
--- a/src/regex_impl.rs
+++ b/src/regex_impl.rs
@@ -501,7 +501,7 @@ impl<W: CodeUnitWidth> Regex<W> {
     pub fn replace<'s>(
         &self,
         subject: &'s [W::SubjectChar],
-        replacement: &'s [W::SubjectChar],
+        replacement: &[W::SubjectChar],
         extended: bool,
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
     where
@@ -515,7 +515,7 @@ impl<W: CodeUnitWidth> Regex<W> {
     pub fn replace_all<'s>(
         &self,
         subject: &'s [W::SubjectChar],
-        replacement: &'s [W::SubjectChar],
+        replacement: &[W::SubjectChar],
         extended: bool,
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
     where
@@ -528,7 +528,7 @@ impl<W: CodeUnitWidth> Regex<W> {
     fn replace_impl<'s>(
         &self,
         subject: &'s [W::SubjectChar],
-        replacement: &'s [W::SubjectChar],
+        replacement: &[W::SubjectChar],
         replace_all: bool,
         extended: bool,
     ) -> Result<Cow<'s, [W::SubjectChar]>, Error>

From 813a4267546e5ca8ff349c9c67d65e52a82172d2 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Thu, 29 Jun 2023 11:44:20 -0700
Subject: [PATCH 6/6] Correct substitute behavior on reallocation and add tests

This fixes the following issue in replace_impl: the call to
try_reserve passed in a difference of capacities, but try_reserve
expects a difference between the desired capacity and the length.
Because the initial capacity was nonzero but the length was zero, this
caused us to reserve less capacity than we ought to have, leading to an
OOB write.

Fix this by reworking replace_impl to have less unsafe code. Now we zero
initialize the buffer, but we also prefer a stack buffer so we may save
an allocation - probably a wash overall.

Add a test for this case.
---
 src/ffi.rs        |   2 +-
 src/regex_impl.rs |  49 ++++++++--------
 src/utf32.rs      | 139 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 25 deletions(-)

diff --git a/src/ffi.rs b/src/ffi.rs
index 25a90ab..67edb5e 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -90,7 +90,7 @@ pub trait CodeUnitWidth: std::fmt::Debug {
     type pcre2_match_context;
     type pcre2_match_data;
     type pcre2_jit_stack;
-    type PCRE2_CHAR: TryInto<Self::SubjectChar>;
+    type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>;
     type PCRE2_SPTR;
     type name_table_entry: NameTableEntry;
     type SubjectChar: Copy;
diff --git a/src/regex_impl.rs b/src/regex_impl.rs
index a9d591a..8102f4a 100644
--- a/src/regex_impl.rs
+++ b/src/regex_impl.rs
@@ -546,31 +546,38 @@ impl<W: CodeUnitWidth> Regex<W> {
             options |= PCRE2_SUBSTITUTE_GLOBAL;
         }
 
-        // TODO: we can use MaybeUninit to avoid allocation
-        let mut capacity = 256;
-        let mut output: Vec<W::PCRE2_CHAR> = Vec::with_capacity(capacity);
-        capacity = output.capacity();
+        // We prefer to allocate on the stack but fall back to the heap.
+        // Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
+        //   - We supply the initial output buffer size in `capacity`. This should have sufficient
+        //     capacity for the terminating NUL character.
+        //   - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also
+        //     including the terminating NUL character.
+        //   - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT
+        //     including the terminating NUL character.
+        // Example: our initial capacity is 256. If the returned string needs to be of length 512,
+        // then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in
+        // a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result.
+        let mut stack_storage: [W::PCRE2_CHAR; 256] = [W::PCRE2_CHAR::default(); 256];
+        let mut heap_storage = Vec::new();
+        let mut output = stack_storage.as_mut();
+        let mut capacity = output.len();
 
         let mut rc = unsafe {
             self.code
-                .substitute(subject, replacement, 0, options, &mut output, &mut capacity)
+                .substitute(subject, replacement, 0, options, output, &mut capacity)
         };
 
         if let Err(e) = &rc {
             if e.code() == PCRE2_ERROR_NOMEMORY {
-                if output.try_reserve(capacity - output.capacity()).is_err() {
+                if heap_storage.try_reserve_exact(capacity).is_err() {
                     return Err(rc.unwrap_err());
                 }
-                capacity = output.capacity();
+                heap_storage.resize(capacity, W::PCRE2_CHAR::default());
+                output = &mut heap_storage;
+                capacity = output.len();
                 rc = unsafe {
-                    self.code.substitute(
-                        subject,
-                        replacement,
-                        0,
-                        options,
-                        &mut output,
-                        &mut capacity,
-                    )
+                    self.code
+                        .substitute(subject, replacement, 0, options, output, &mut capacity)
                 };
             }
         }
@@ -578,8 +585,8 @@ impl<W: CodeUnitWidth> Regex<W> {
         let s = match rc? {
             0 => Cow::Borrowed(subject),
             _ => {
-                // +1 to account for null terminator
-                unsafe { output.set_len(capacity + 1) };
+                // capacity has been updated with the length of the result (excluding nul terminator).
+                let output = &output[..capacity];
 
                 // All inputs contained valid chars, so we expect all outputs to as well.
                 let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar {
@@ -588,13 +595,7 @@ impl<W: CodeUnitWidth> Regex<W> {
                 };
 
                 // this is really just a type cast
-                let x: Vec<W::SubjectChar> = output
-                    .into_iter()
-                    .map(to_char)
-                    // we don't want to return the null terminator
-                    .take(capacity)
-                    .collect::<Vec<W::SubjectChar>>();
-
+                let x: Vec<W::SubjectChar> = output.iter().copied().map(to_char).collect();
                 Cow::Owned(x)
             }
         };
diff --git a/src/utf32.rs b/src/utf32.rs
index 2708705..5395d9e 100644
--- a/src/utf32.rs
+++ b/src/utf32.rs
@@ -130,6 +130,145 @@ mod tests {
         assert_eq!(replaced, &*b("bc"));
     }
 
+    #[test]
+    fn replace_no_match() {
+        let re = RegexBuilder::new().build(b("d")).unwrap();
+        let s = b("abc");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Borrowed(_)),
+            "when there is no match, the original string should be returned"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("abc"));
+    }
+
+    #[test]
+    fn replace_with_replacement() {
+        let re = RegexBuilder::new().build(b("b")).unwrap();
+        let s = b("abc");
+        let r = b("d");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("adc"));
+    }
+
+    #[test]
+    fn replace_first_occurrence() {
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let s = b("aaa");
+        let r = b("b");
+        let replaced = re.replace(&s, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("baa"));
+    }
+
+    #[test]
+    fn replace_multiple_occurrences() {
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let s = b("aaa");
+        let r = b("b");
+        let replaced = re.replace_all(&s, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("bbb"));
+    }
+
+    #[test]
+    fn replace_empty_string() {
+        let re = RegexBuilder::new().build(b("")).unwrap();
+        let s = b("abc");
+        let r = b("d");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("dabc"));
+    }
+
+    #[test]
+    fn replace_empty_with_empty() {
+        let re = RegexBuilder::new().build(b("")).unwrap();
+        let s = b("");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap().into_owned();
+        assert_eq!(replaced, &*b(""));
+    }
+
+    #[test]
+    fn replace_long_string() {
+        let long_string = vec!['a'; 1024]; // 1024 chars: exceeds the 256-element stack buffer
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("b");
+        let replaced = re.replace(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let mut expected = long_string.clone();
+        expected[0] = 'b';
+        assert_eq!(replaced, expected);
+    }
+
+    #[test]
+    fn replace_long_string_all() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("b");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let all_b = vec!['b'; 1024];
+        assert_eq!(replaced, all_b);
+    }
+
+    #[test]
+    fn replace_long_string_all_elongating() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("bx");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let mut all_bx = Vec::new();
+        for _ in long_string {
+            all_bx.push('b');
+            all_bx.push('x');
+        }
+        assert_eq!(replaced, all_bx);
+    }
+
+    #[test]
+    fn replace_long_string_all_disappearing() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &[]);
+    }
+
     #[test]
     fn ucp() {
         let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap();