Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rustify DOMXPath::quote #13545

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 37 additions & 20 deletions build/php.m4
Original file line number Diff line number Diff line change
Expand Up @@ -215,47 +215,59 @@ AC_DEFUN([PHP_ADD_SOURCES_X],[
dnl Relative to source- or build-directory?
dnl ac_srcdir/ac_bdir include trailing slash
case $1 in
""[)] ac_srcdir="$abs_srcdir/"; unset ac_bdir; ac_inc="-I. -I$abs_srcdir" ;;
/*[)] ac_srcdir=`echo "$1"|cut -c 2-`"/"; ac_bdir=$ac_srcdir; ac_inc="-I$ac_bdir -I$abs_srcdir/$ac_bdir" ;;
*[)] ac_srcdir="$abs_srcdir/$1/"; ac_bdir="$1/"; ac_inc="-I$ac_bdir -I$ac_srcdir" ;;
"") ac_srcdir="$abs_srcdir/"; unset ac_bdir; ac_inc="-I. -I$abs_srcdir" ;;
/*) ac_srcdir=`echo "$1"|cut -c 2-`"/"; ac_bdir=$ac_srcdir; ac_inc="-I$ac_bdir -I$abs_srcdir/$ac_bdir" ;;
*) ac_srcdir="$abs_srcdir/$1/"; ac_bdir="$1/"; ac_inc="-I$ac_bdir -I$ac_srcdir" ;;
esac

dnl how to build .. shared or static?
ifelse($5,yes,_PHP_ASSIGN_BUILD_VARS(shared),_PHP_ASSIGN_BUILD_VARS(php))

dnl Iterate over the sources.
old_IFS=[$]IFS
for ac_src in $2; do

dnl Remove the suffix.
IFS=.
set $ac_src
ac_obj=[$]1
IFS=$old_IFS
old_IFS=[$]IFS
for ac_src in $2; do
IFS=.
set $ac_src
ac_obj=[$]1
IFS=$old_IFS

dnl Append to the array which has been dynamically chosen at m4 time.
$4="[$]$4 [$]ac_bdir[$]ac_obj.lo"

dnl Choose the right compiler/flags/etc. for the source-file.
case $ac_src in
*.c[)] ac_comp="$b_c_pre $ac_inc $b_c_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_c_post" ;;
*.s[)] ac_comp="$b_c_pre $ac_inc $b_c_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_c_post" ;;
*.S[)] ac_comp="$b_c_pre $ac_inc $b_c_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_c_post" ;;
*.cpp|*.cc|*.cxx[)] ac_comp="$b_cxx_pre $ac_inc $b_cxx_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_cxx_post" ;;
esac
$4="[$]$4 [$]ac_bdir[$]ac_obj.lo"

# Choose compiler/flags based on file extension
case $ac_src in
*.rs)
# Rust source file
ac_comp="rustc $ac_srcdir$ac_src --crate-type=staticlib --out-dir $ac_bdir --crate-name=$(basename $ac_src .rs) && mv ${ac_bdir}lib$(basename $ac_src .rs).a ${ac_bdir}$(basename $ac_src .rs).o"
;;

*.c|*.s|*.S)
# C and assembly source files
ac_comp="$b_c_pre $ac_inc $b_c_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_c_post -MMD -MF $ac_bdir$ac_obj.dep -MT $ac_bdir$ac_obj.lo"
;;
*.cpp|*.cc|*.cxx)
# C++ source files
ac_comp="$b_cxx_pre $ac_inc $b_cxx_meta $3 -c $ac_srcdir$ac_src -o $ac_bdir$ac_obj.$b_lo $b_cxx_post -MMD -MF $ac_bdir$ac_obj.dep -MT $ac_bdir$ac_obj.lo"
;;
esac


dnl Generate Makefiles with dependencies
ac_comp="$ac_comp -MMD -MF $ac_bdir$ac_obj.dep -MT $ac_bdir[$]ac_obj.lo"

dnl Create a rule for the object/source combo.
cat >>Makefile.objects<<EOF
-include $ac_bdir[$]ac_obj.dep
$ac_bdir[$]ac_obj.lo: $ac_srcdir[$]ac_src
$ac_comp
EOF


done
])



dnl ----------------------------------------------------------------------------
dnl Compiler characteristics checks.
dnl ----------------------------------------------------------------------------
Expand Down Expand Up @@ -781,6 +793,10 @@ AC_DEFUN([PHP_BUILD_PROGRAM],[
php_cxx_post=
php_lo=lo

php_rustc_pre='rustc'
php_rustc_meta='$(RUSTFLAGS)'
php_rustc_post=

case $with_pic in
yes) pic_setting='-prefer-pic';;
no) pic_setting='-prefer-non-pic';;
Expand All @@ -795,6 +811,7 @@ AC_DEFUN([PHP_BUILD_PROGRAM],[
shared_lo=lo
])


dnl
dnl PHP_SHARED_MODULE(module-name, object-var, build-dir, cxx, zend_ext)
dnl
Expand Down
2 changes: 1 addition & 1 deletion ext/dom/config.m4
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ if test "$PHP_DOM" != "no"; then
documenttype.c entity.c \
nodelist.c text.c comment.c \
entityreference.c \
notation.c xpath.c dom_iterators.c \
notation.c xpath.c xpath_rust.rs dom_iterators.c \
namednodemap.c xpath_callbacks.c \
$LEXBOR_SOURCES],
$ext_shared,,$PHP_LEXBOR_CFLAGS)
Expand Down
42 changes: 4 additions & 38 deletions ext/dom/xpath.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "php.h"
#if defined(HAVE_LIBXML) && defined(HAVE_DOM)
#include "xpath_rust.h"
#include "php_dom.h"

#define PHP_DOM_XPATH_QUERY 0
Expand Down Expand Up @@ -438,47 +439,12 @@ PHP_METHOD(DOMXPath, registerPhpFunctionNS)
/* {{{ */
PHP_METHOD(DOMXPath, quote) {
const char *input;
size_t input_len;
uintptr_t input_len;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &input, &input_len) == FAILURE) {
RETURN_THROWS();
}
if (memchr(input, '\'', input_len) == NULL) {
zend_string *const output = zend_string_safe_alloc(1, input_len, 2, false);
output->val[0] = '\'';
memcpy(output->val + 1, input, input_len);
output->val[input_len + 1] = '\'';
output->val[input_len + 2] = '\0';
RETURN_STR(output);
} else if (memchr(input, '"', input_len) == NULL) {
zend_string *const output = zend_string_safe_alloc(1, input_len, 2, false);
output->val[0] = '"';
memcpy(output->val + 1, input, input_len);
output->val[input_len + 1] = '"';
output->val[input_len + 2] = '\0';
RETURN_STR(output);
} else {
smart_str output = {0};
// need to use the concat() trick published by Robert Rossney at https://stackoverflow.com/a/1352556/1067003
smart_str_appendl(&output, "concat(", 7);
const char *ptr = input;
const char *const end = input + input_len;
while (ptr < end) {
const char *const single_quote_ptr = memchr(ptr, '\'', end - ptr);
const char *const double_quote_ptr = memchr(ptr, '"', end - ptr);
const size_t distance_to_single_quote = single_quote_ptr ? single_quote_ptr - ptr : end - ptr;
const size_t distance_to_double_quote = double_quote_ptr ? double_quote_ptr - ptr : end - ptr;
const size_t bytes_until_quote = MAX(distance_to_single_quote, distance_to_double_quote);
const char quote_method = (distance_to_single_quote > distance_to_double_quote) ? '\'' : '"';
smart_str_appendc(&output, quote_method);
smart_str_appendl(&output, ptr, bytes_until_quote);
smart_str_appendc(&output, quote_method);
ptr += bytes_until_quote;
smart_str_appendc(&output, ',');
}
ZEND_ASSERT(ptr == end);
output.s->val[output.s->len - 1] = ')';
RETURN_STR(smart_str_extract(&output));
}
const char *ouput = domxpath_quote_literal(input, &input_len);
RETURN_STRINGL(ouput, input_len);
Comment on lines +446 to +447
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: ouput. This construct will also require a full allocation + copy into a new zend_string().

}
/* }}} */

Expand Down
4 changes: 4 additions & 0 deletions ext/dom/xpath_rust.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#ifndef DOM_XPATH_RUST_H
#define DOM_XPATH_RUST_H
extern char* domxpath_quote_literal(const char *const input, uintptr_t *const len);
#endif
56 changes: 56 additions & 0 deletions ext/dom/xpath_rust.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use std::ffi::{CString, c_char};


#[no_mangle]
pub extern "C" fn domxpath_quote_literal(input: *const c_char, len: *mut usize) -> *mut c_char {
let slice = unsafe { std::slice::from_raw_parts(input as *const u8, *len as usize) };
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should preferably contain a "safety requirement" comment, i.e. that it is expected that the caller does not hold any mutable references to the input. I know it's a bit trivial in this case, but when using unsafe I'd rather see it documented, and the caller must know about it too to avoid soundness issues.


let single_quote_absent = !slice.contains(&b'\'');
let double_quote_absent = !slice.contains(&b'"');

let result = if single_quote_absent {
let mut res = Vec::with_capacity(slice.len() + 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without additional infrastructure this bypasses PHP's memory manager and thus PHP's memory limit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I wonder how difficult it is to give rust access to php's memory manager 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The easiest way I see is to implement GlobalAlloc to use emalloc/efree.

I can submit a separate PR for this, and possibly an RFC, if anyone's interested.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using GlobalAlloc makes sense, request-based allocations are the most common case anyway.
If you go for an RFC then we'd need more than just the allocation interface, but we also need proper access to the Zend data structures. So this is more work than we might expect at first, but my personal opinion is that it's a good idea to allow memory safe languages inside PHP.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@danog please do. I won't have time/energy to contribute for some time.

res.push(b'\'');
res.extend_from_slice(slice);
res.push(b'\'');
res
} else if double_quote_absent {
let mut res = Vec::with_capacity(slice.len() + 2);
res.push(b'"');
res.extend_from_slice(slice);
res.push(b'"');
res
} else {
let mut res = Vec::from("concat(".as_bytes());
let mut temp_slice = slice;

while !temp_slice.is_empty() {
let bytes_until_single_quote = temp_slice.iter().position(|&x| x == b'\'').unwrap_or(temp_slice.len());
let bytes_until_double_quote = temp_slice.iter().position(|&x| x == b'"').unwrap_or(temp_slice.len());

let (quote_method, bytes_until_quote) = if bytes_until_single_quote > bytes_until_double_quote {
(b'\'', bytes_until_single_quote)
} else {
(b'"', bytes_until_double_quote)
};

res.push(quote_method);
res.extend_from_slice(&temp_slice[..bytes_until_quote]);
res.push(quote_method);
res.push(b',');
temp_slice = &temp_slice[bytes_until_quote..];
}
let res_len = res.len();
res[res_len - 1] = b')';
res
};

// Update length
unsafe {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should at least be able to get rid of this unsafety here.
I suggest returning a ffi-compatible struct that contains both the string as raw parts, and the length.

*len = result.len() as usize;
}

// Convert Vec<u8> to *mut c_char
let c_str = CString::new(result).expect("CString::new failed");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this would leak memory, as you're not freeing the (persistently-allocated) memory of the string at the call site. Using RETURN_STRINGL also makes an extra allocation + copy which is unfortunate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

quick question, would it be better to handle this more "smoothly" ? ie using match pattern.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this can actually fail because the input may contain \0 bytes. So we must not panic by using expect. And this should probably use a Zend API to avoid the reallocation too, will be much cleaner in the end with the right access and abstractions to Zend APIs.

c_str.into_raw()
}
Loading