Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add (very basic) support for replace #1

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 117 additions & 1 deletion src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ pub trait CodeUnitWidth: std::fmt::Debug {
type pcre2_match_context;
type pcre2_match_data;
type pcre2_jit_stack;
type PCRE2_CHAR;
type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>;
type PCRE2_SPTR;
type name_table_entry: NameTableEntry;
type SubjectChar: Copy;
Expand Down Expand Up @@ -153,6 +153,20 @@ pub trait CodeUnitWidth: std::fmt::Debug {

/// Width-generic hook taking a raw match-data pointer and returning a raw `*mut usize`.
///
/// NOTE(review): presumably each implementation forwards to the width-specific
/// `pcre2_get_ovector_pointer_*` function, like the sibling methods do — the
/// implementations are not visible here, confirm against them.
unsafe fn pcre2_get_ovector_pointer(arg1: *mut Self::pcre2_match_data) -> *mut usize;
/// Width-generic hook taking a raw match-data pointer and returning a `u32` count.
///
/// Implementations forward `arg1` unchanged to the width-specific
/// `pcre2_get_ovector_count_*` function.
unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32;

/// Width-generic entry point for PCRE2's substitute operation.
///
/// Implementations forward every argument, unchanged and in the same order,
/// to the width-specific `pcre2_substitute_*` function and return its
/// integer result as-is.
///
/// # Safety
///
/// This is a raw FFI hook: all pointers are handed straight to PCRE2, so the
/// caller must satisfy whatever the underlying `pcre2_substitute_*` function
/// requires of them.
unsafe fn pcre2_substitute(
code: *const Self::pcre2_code,
subject: Self::PCRE2_SPTR,
length: usize,
startoffset: usize,
options: u32,
match_data: *mut Self::pcre2_match_data,
mcontext: *mut Self::pcre2_match_context,
replacement: Self::PCRE2_SPTR,
rlength: usize,
outputbuffer: *mut Self::PCRE2_CHAR,
outputlengthptr: *mut usize,
) -> ::libc::c_int;
}

#[cfg(feature = "utf8")]
Expand Down Expand Up @@ -282,6 +296,33 @@ impl CodeUnitWidth for CodeUnitWidth8 {
unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32 {
// Thin forwarder to the 8-bit PCRE2 function; `arg1` is passed through untouched.
pcre2_get_ovector_count_8(arg1)
}
/// 8-bit implementation: forwards every argument, unchanged and in order,
/// to `pcre2_substitute_8` and returns its result.
///
/// # Safety
///
/// Nothing is validated here, so the caller is responsible for whatever
/// `pcre2_substitute_8` requires of the pointers and lengths.
unsafe fn pcre2_substitute(
code: *const Self::pcre2_code,
subject: Self::PCRE2_SPTR,
length: usize,
startoffset: usize,
options: u32,
match_data: *mut Self::pcre2_match_data,
mcontext: *mut Self::pcre2_match_context,
replacement: Self::PCRE2_SPTR,
rlength: usize,
outputbuffer: *mut Self::PCRE2_CHAR,
outputlengthptr: *mut usize,
) -> ::libc::c_int {
pcre2_substitute_8(
code,
subject,
length,
startoffset,
options,
match_data,
mcontext,
replacement,
rlength,
outputbuffer,
outputlengthptr,
)
}
}

#[cfg(feature = "utf32")]
Expand Down Expand Up @@ -415,6 +456,34 @@ impl CodeUnitWidth for CodeUnitWidth32 {
unsafe fn pcre2_get_ovector_count(arg1: *mut Self::pcre2_match_data) -> u32 {
// Thin forwarder to the 32-bit PCRE2 function; `arg1` is passed through untouched.
pcre2_get_ovector_count_32(arg1)
}

/// 32-bit implementation: forwards every argument, unchanged and in order,
/// to `pcre2_substitute_32` and returns its result.
///
/// # Safety
///
/// Nothing is validated here, so the caller is responsible for whatever
/// `pcre2_substitute_32` requires of the pointers and lengths.
unsafe fn pcre2_substitute(
code: *const Self::pcre2_code,
subject: Self::PCRE2_SPTR,
length: usize,
startoffset: usize,
options: u32,
match_data: *mut Self::pcre2_match_data,
mcontext: *mut Self::pcre2_match_context,
replacement: Self::PCRE2_SPTR,
rlength: usize,
outputbuffer: *mut Self::PCRE2_CHAR,
outputlengthptr: *mut usize,
) -> ::libc::c_int {
pcre2_substitute_32(
code,
subject,
length,
startoffset,
options,
match_data,
mcontext,
replacement,
rlength,
outputbuffer,
outputlengthptr,
)
}
}

/// Returns true if and only if PCRE2 believes that JIT is available.
Expand Down Expand Up @@ -613,6 +682,53 @@ impl<W: CodeUnitWidth> Code<W> {
Ok(1 + count as usize)
}
}

/// Runs PCRE2's substitute operation on `subject`, writing the result into `output`.
///
/// * `subject` / `replacement` - the text to search and the replacement template.
/// * `start` - offset in `subject` handed to PCRE2 as the start offset.
/// * `options` - raw PCRE2 option bits, forwarded unchanged.
/// * `output` - buffer PCRE2 writes the result into.
/// * `output_len` - in/out length pointer handed to PCRE2 together with `output`;
///   on entry it must describe how many code units of `output` PCRE2 may use.
///
/// No match data block and no match context are supplied (both are null).
///
/// # Errors
///
/// Returns `Ok(rc)` when PCRE2's return code is non-negative, and
/// `Err(Error::info(rc))` for any negative return code.
///
/// # Safety
///
/// * `*output_len` must not exceed `output.len()`; PCRE2 trusts this value and
///   would otherwise write past the end of the buffer. This is only checked in
///   debug builds.
/// * `options` is passed to PCRE2 verbatim, so the caller must only request
///   option bits whose preconditions hold for `subject` and `replacement`.
pub unsafe fn substitute(
    &self,
    mut subject: &[W::SubjectChar],
    mut replacement: &[W::SubjectChar],
    start: usize,
    options: u32,
    output: &mut [W::PCRE2_CHAR],
    output_len: &mut usize,
) -> Result<usize, Error> {
    // Catch contract violations early in debug builds: PCRE2 has no other way
    // of knowing how large the Rust-side buffer really is.
    debug_assert!(
        *output_len <= output.len(),
        "output_len must not exceed the length of the output buffer"
    );

    // When the subject is empty, we use an empty slice
    // with a known valid pointer. Otherwise, slices derived
    // from, e.g., an empty `Vec<u8>` may not have a valid
    // pointer, since creating an empty `Vec` is guaranteed
    // to not allocate.
    if subject.is_empty() {
        subject = &[];
    }
    if replacement.is_empty() {
        replacement = &[];
    }
    let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject);
    let (repl_ptr, repl_len) = W::subject_to_sptr_len(replacement);

    // SAFETY: the subject and replacement pointers/lengths come from live
    // slices, and `output` is a live, exclusively borrowed buffer. The caller
    // guarantees that `*output_len` fits inside `output` and that `options`
    // is valid for these inputs (see `# Safety`).
    let rc = unsafe {
        W::pcre2_substitute(
            self.code,
            subj_ptr,
            subj_len,
            start,
            options,
            // no match data block
            ptr::null_mut(),
            // no match context;
            // should probably not be null for performance reasons?
            ptr::null_mut(),
            repl_ptr,
            repl_len,
            output.as_mut_ptr(),
            output_len as *mut usize,
        )
    };

    if rc >= 0 {
        Ok(rc as usize)
    } else {
        // this might warrant a new error type
        Err(Error::info(rc))
    }
}
}

/// A low level representation of PCRE2's compilation context.
Expand Down
113 changes: 111 additions & 2 deletions src/regex_impl.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::cell::RefCell;
use std::collections::HashMap;
use std::fmt;
Expand All @@ -6,8 +7,10 @@ use std::sync::Arc;

use log::debug;
use pcre2_sys::{
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_NEVER_UTF,
PCRE2_NEWLINE_ANYCRLF, PCRE2_NO_UTF_CHECK, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF,
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_ERROR_NOMEMORY, PCRE2_EXTENDED, PCRE2_MULTILINE,
PCRE2_NEVER_UTF, PCRE2_NEWLINE_ANYCRLF, PCRE2_NO_UTF_CHECK, PCRE2_SUBSTITUTE_EXTENDED,
PCRE2_SUBSTITUTE_GLOBAL, PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, PCRE2_SUBSTITUTE_UNSET_EMPTY,
PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF,
};
use thread_local::ThreadLocal;

Expand Down Expand Up @@ -492,6 +495,112 @@ impl<W: CodeUnitWidth> Regex<W> {
pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> {
// Hands out a shared borrow of the stored map; nothing is cloned or recomputed here.
// NOTE(review): presumably maps capture-group names to their indices — confirm where the map is built.
&self.capture_names_idx
}

/// Replaces the first match in `subject` with `replacement`.
///
/// If `extended` is true, PCRE2's extended replacement syntax is enabled.
///
/// Returns `Cow::Borrowed(subject)` when the underlying substitution reports
/// zero replacements, and an owned buffer holding the substituted text otherwise.
///
/// # Errors
///
/// Returns an error if the underlying PCRE2 substitution fails.
///
/// # Panics
///
/// Panics if a code unit produced by PCRE2 cannot be converted back into
/// `W::SubjectChar`.
pub fn replace<'s>(
&self,
subject: &'s [W::SubjectChar],
replacement: &[W::SubjectChar],
extended: bool,
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
where
[<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
{
// `false`: do not request global replacement, only the first match is replaced.
self.replace_impl(subject, replacement, false, extended)
}

/// Replaces all non-overlapping matches in `subject` with `replacement`.
///
/// If `extended` is true, PCRE2's extended replacement syntax is enabled.
///
/// Returns `Cow::Borrowed(subject)` when the underlying substitution reports
/// zero replacements, and an owned buffer holding the substituted text otherwise.
///
/// # Errors
///
/// Returns an error if the underlying PCRE2 substitution fails.
///
/// # Panics
///
/// Panics if a code unit produced by PCRE2 cannot be converted back into
/// `W::SubjectChar`.
pub fn replace_all<'s>(
&self,
subject: &'s [W::SubjectChar],
replacement: &[W::SubjectChar],
extended: bool,
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
where
[<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
{
// `true`: request global replacement (sets `PCRE2_SUBSTITUTE_GLOBAL` in `replace_impl`).
self.replace_impl(subject, replacement, true, extended)
}

/// Shared implementation behind `replace` and `replace_all`.
///
/// `replace_all` selects global replacement and `extended` enables PCRE2's
/// extended replacement syntax. The subject is returned borrowed when the
/// substitution reports zero replacements; otherwise the substituted text is
/// returned as an owned buffer.
///
/// # Panics
///
/// Panics if an output code unit cannot be converted into `W::SubjectChar`.
#[inline]
fn replace_impl<'s>(
    &self,
    subject: &'s [W::SubjectChar],
    replacement: &[W::SubjectChar],
    replace_all: bool,
    extended: bool,
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
where
    [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
{
    // TODO: PCRE2_SUBSTITUTE_UNSET_EMPTY should probably be configurable from user-side
    let mut options: u32 = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_UNSET_EMPTY;
    if extended {
        options |= PCRE2_SUBSTITUTE_EXTENDED;
    }
    if replace_all {
        options |= PCRE2_SUBSTITUTE_GLOBAL;
    }

    // Strategy: try a fixed-size stack buffer first and only fall back to the
    // heap when PCRE2 says the buffer was too small.
    //
    // How `len` behaves with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
    // - On entry it holds the buffer size, which must leave room for the
    //   terminating NUL.
    // - If the buffer is too small, PCRE2 reports NOMEMORY and stores the
    //   required size in `len`, terminating NUL included.
    // - If the buffer is large enough, PCRE2 stores the number of code units
    //   written in `len`, terminating NUL excluded.
    // E.g. with a 256-unit buffer and a 512-unit result: the first call fails
    // with NOMEMORY and `len == 513`; retrying with 513 units succeeds and
    // leaves `len == 512`.
    let mut stack_buf: [W::PCRE2_CHAR; 256] = [W::PCRE2_CHAR::default(); 256];
    let mut heap_buf: Vec<W::PCRE2_CHAR> = Vec::new();
    let mut out: &mut [W::PCRE2_CHAR] = &mut stack_buf;
    let mut len = out.len();

    let mut rc = unsafe {
        self.code
            .substitute(subject, replacement, 0, options, out, &mut len)
    };

    let buffer_too_small = matches!(&rc, Err(e) if e.code() == PCRE2_ERROR_NOMEMORY);
    if buffer_too_small {
        // If even the allocation fails, surface the original NOMEMORY error.
        if heap_buf.try_reserve_exact(len).is_err() {
            return Err(rc.unwrap_err());
        }
        heap_buf.resize(len, W::PCRE2_CHAR::default());
        out = heap_buf.as_mut_slice();
        len = out.len();
        rc = unsafe {
            self.code
                .substitute(subject, replacement, 0, options, out, &mut len)
        };
    }

    let substitutions = rc?;
    if substitutions == 0 {
        // Nothing was replaced: hand the caller's slice back without copying.
        return Ok(Cow::Borrowed(subject));
    }

    // `len` now holds the length of the result (excluding the NUL terminator).
    let written = &out[..len];

    // All inputs contained valid chars, so we expect all outputs to as well.
    // This is really just a type cast from code units to subject chars.
    let as_subject_char = |unit: W::PCRE2_CHAR| -> W::SubjectChar {
        unit.try_into()
            .unwrap_or_else(|_| panic!("all output expected to be valid chars"))
    };
    let converted: Vec<W::SubjectChar> = written.iter().copied().map(as_subject_char).collect();
    Ok(Cow::Owned(converted))
}
}

/// Advanced or "lower level" search methods.
Expand Down
Loading