Skip to content

Commit

Permalink
feat: provide an XPath function for fast CSS class lookup
Browse files Browse the repository at this point in the history
available as `nokogiri-builtin:css-class`

Part of #2135
  • Loading branch information
flavorjones committed Dec 18, 2020
1 parent 2145f9d commit 7566d4b
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 15 deletions.
3 changes: 1 addition & 2 deletions ext/java/nokogiri/XmlXpathContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ public IRubyObject register_variable(IRubyObject name, IRubyObject value) {
}

private IRubyObject node_set(ThreadContext context, String expr, IRubyObject handler) {
final NokogiriXPathFunctionResolver fnResolver =
handler.isNil() ? null : NokogiriXPathFunctionResolver.create(handler);
final NokogiriXPathFunctionResolver fnResolver = NokogiriXPathFunctionResolver.create(handler);
try {
return tryGetNodeSet(context, expr, fnResolver);
}
Expand Down
11 changes: 9 additions & 2 deletions ext/java/nokogiri/internals/NokogiriNamespaceContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,15 @@
*/
public final class NokogiriNamespaceContext implements NamespaceContext {

public static final String NOKOGIRI_PREFIX = "nokogiri";
/*
* these constants have matching declarations in
* ext/nokogiri/xml_xpath_context.c
*/
public static final String NOKOGIRI_PREFIX = "nokogiri";
public static final String NOKOGIRI_URI = "http://www.nokogiri.org/default_ns/ruby/extensions_functions";
public static final String NOKOGIRI_TEMPORARY_ROOT_TAG = "nokogiri-temporary-root-tag";

public static final String NOKOGIRI_BUILTIN_PREFIX = "nokogiri-builtin";
public static final String NOKOGIRI_BUILTIN_URI = "https://www.nokogiri.org/default_ns/ruby/builtins";

private final Map<String,String> register;

Expand All @@ -63,6 +69,7 @@ public static NokogiriNamespaceContext create() {
// Builds the prefix -> namespace URI table and seeds it with the bindings
// that are always present: the two Nokogiri namespaces plus "xml" and "xhtml".
private NokogiriNamespaceContext() {
// HashMap(initialCapacity = 6, loadFactor = 1)
register = new HashMap<String, String>(6, 1);
register.put(NOKOGIRI_PREFIX, NOKOGIRI_URI);
register.put(NOKOGIRI_BUILTIN_PREFIX, NOKOGIRI_BUILTIN_URI);
register.put("xml", "http://www.w3.org/XML/1998/namespace");
register.put("xhtml", "http://www.w3.org/1999/xhtml");
}
Expand Down
68 changes: 62 additions & 6 deletions ext/java/nokogiri/internals/NokogiriXPathFunction.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

import javax.xml.xpath.XPathFunction;
import javax.xml.xpath.XPathFunctionException;
import javax.xml.namespace.QName;

import org.jruby.Ruby;
import org.jruby.RubyArray;
Expand Down Expand Up @@ -64,14 +65,14 @@
public class NokogiriXPathFunction implements XPathFunction {

private final IRubyObject handler;
private final String name;
private final QName name;
private final int arity;

public static NokogiriXPathFunction create(IRubyObject handler, String name, int arity) {
public static NokogiriXPathFunction create(IRubyObject handler, QName name, int arity) {
return new NokogiriXPathFunction(handler, name, arity);
}

private NokogiriXPathFunction(IRubyObject handler, String name, int arity) {
private NokogiriXPathFunction(IRubyObject handler, QName name, int arity) {
this.handler = handler;
this.name = name;
this.arity = arity;
Expand All @@ -82,11 +83,20 @@ public Object evaluate(List args) throws XPathFunctionException {
throw new XPathFunctionException("arity does not match");
}

final Ruby runtime = this.handler.getRuntime();
ThreadContext context = runtime.getCurrentContext();
if (name.getNamespaceURI().equals(NokogiriNamespaceContext.NOKOGIRI_BUILTIN_URI)) {
if (name.getLocalPart().equals("css-class")) {
return builtinCssClass(args);
}
}

IRubyObject result = Helpers.invoke(context, this.handler, this.name, fromObjectToRubyArgs(runtime, args));
if (this.handler.isNil()) {
throw new XPathFunctionException("no custom function handler declared for '" + name + "'");
}

final Ruby runtime = this.handler.getRuntime();
ThreadContext context = runtime.getCurrentContext();
IRubyObject result = Helpers.invoke(context, this.handler, this.name.getLocalPart(),
fromObjectToRubyArgs(runtime, args));
return fromRubyToObject(runtime, result);
}

Expand Down Expand Up @@ -121,4 +131,50 @@ private static Object fromRubyToObject(final Ruby runtime, IRubyObject obj) {
}
/*if (o instanceof XmlNode)*/ return ((XmlNode) obj).getNode();
}

/**
 * Implements the built-in XPath function {@code nokogiri-builtin:css-class}:
 * reports whether the second argument occurs in the first argument as a
 * complete, whitespace-delimited word.
 *
 * <p>Candidate matches are only attempted at the very start of the haystack
 * and at the start of each subsequent word, and a match only counts when it
 * is followed by whitespace or by the end of the haystack.
 *
 * @param args exactly two values; each is converted with {@code toString()}.
 *             The first is the haystack (e.g. a {@code class} attribute
 *             value), the second is the word to look for.
 * @return {@code true} when the needle is empty or is found as a whole word,
 *         {@code false} otherwise
 * @throws XPathFunctionException when {@code args} does not hold exactly two values
 */
private static boolean builtinCssClass(List args) throws XPathFunctionException {
  if (args.size() != 2) {
    // name the function by the prefix it is actually registered under
    throw new XPathFunctionException("builtin function nokogiri-builtin:css-class takes two arguments");
  }

  String hay = args.get(0).toString();
  String needle = args.get(1).toString();

  if (needle.length() == 0) {
    return true;
  }

  final int needleLength = needle.length();
  // last index in hay at which needle could still fit completely
  final int jLim = hay.length() - needleLength;

  int j = 0;
  while (j <= jLim) {
    if (hay.regionMatches(j, needle, 0, needleLength)) {
      int end = j + needleLength;
      /* only match if the needle is followed by whitespace or end of string */
      if (end == hay.length() || isWhitespace(hay.charAt(end))) {
        return true;
      }
    }

    /* advance j to the next whitespace character */
    while (j <= jLim && !isWhitespace(hay.charAt(j))) {
      j++;
    }

    /* advance j to the start of the next word (or past jLim) */
    while (j <= jLim && isWhitespace(hay.charAt(j))) {
      j++;
    }
  }

  return false;
}

/**
 * Tells whether {@code subject} is one of the four blank characters
 * recognised by libxml2's xmlIsBlank_ch(): tab, line feed, carriage return
 * or space.
 */
private static boolean isWhitespace(char subject) {
  switch (subject) {
    case 0x09: // horizontal tab
    case 0x0A: // line feed
    case 0x0D: // carriage return
    case 0x20: // space
      return true;
    default:
      return false;
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ public final class NokogiriXPathFunctionResolver implements XPathFunctionResolve

public static NokogiriXPathFunctionResolver create(IRubyObject handler) {
NokogiriXPathFunctionResolver freshResolver = new NokogiriXPathFunctionResolver();
freshResolver.setHandler(handler);
if (!handler.isNil()) {
freshResolver.setHandler(handler);
}
return freshResolver;
}

Expand All @@ -65,6 +67,6 @@ public void setHandler(IRubyObject handler) {
}

public XPathFunction resolveFunction(QName name, int arity) {
return NokogiriXPathFunction.create(handler, name.getLocalPart(), arity);
return NokogiriXPathFunction.create(handler, name, arity);
}
}
84 changes: 81 additions & 3 deletions ext/nokogiri/xml_xpath_context.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,86 @@
#include <xml_xpath_context.h>

/*
 * these constants have matching declarations in
 * ext/java/nokogiri/internals/NokogiriNamespaceContext.java
 */
/* namespace prefix under which the builtin XPath functions are addressed */
static const xmlChar *NOKOGIRI_BUILTIN_PREFIX = (const xmlChar *)"nokogiri-builtin";
/* namespace URI the prefix above is bound to */
static const xmlChar *NOKOGIRI_BUILTIN_URI = (const xmlChar *)"https://www.nokogiri.org/default_ns/ruby/builtins";

/*
 * Releases an XPath context with xmlXPathFreeContext(). The call is
 * bracketed by the NOKOGIRI_DEBUG_START/NOKOGIRI_DEBUG_END macros, which are
 * defined outside this file section.
 */
static void deallocate(xmlXPathContextPtr ctx)
{
NOKOGIRI_DEBUG_START(ctx);
xmlXPathFreeContext(ctx);
NOKOGIRI_DEBUG_END(ctx);
}

/*
 * Look for `val` as a complete, blank-delimited word inside `str`
 * (typically the value of an HTML element's `class` attribute).
 *
 * Returns a pointer to the start of the matching word inside `str`, `str`
 * itself when `val` is the empty string, or NULL when either argument is
 * NULL or no whole-word match exists.
 */
const xmlChar* builtin_css_class(const xmlChar* str, const xmlChar *val)
{
  const xmlChar *cursor;
  int needle_len;

  if ((str == NULL) || (val == NULL)) {
    return NULL;
  }

  needle_len = xmlStrlen(val);
  if (needle_len == 0) {
    return str;
  }

  cursor = str;
  while (*cursor != 0) {
    /* cheap first-byte test before the full prefix comparison */
    if ((*cursor == *val) && (xmlStrncmp(cursor, val, needle_len) == 0)) {
      const xmlChar terminator = cursor[needle_len];

      /* a hit only counts when the word ends right after the needle */
      if ((terminator == 0) || IS_BLANK_CH(terminator)) {
        return cursor;
      }
    }

    /* skip the remainder of the current word */
    for (; (*cursor != 0) && !IS_BLANK_CH(*cursor); cursor++) {
    }

    /* skip the blanks that separate it from the next word */
    for (; (*cursor != 0) && IS_BLANK_CH(*cursor); cursor++) {
    }
  }

  return NULL;
}

/*
 * xmlXPathFunction adapter around builtin_css_class().
 *
 * Requires exactly two arguments. Each is coerced to a string and popped
 * from the evaluation stack (second argument first), then a boolean is
 * pushed: true when the second argument is found as a whole word in the
 * first, false otherwise. Both popped objects are freed before returning,
 * including on the XPATH_INVALID_TYPE error paths.
 */
static void xpath_builtin_css_class(xmlXPathParserContextPtr ctxt, int nargs)
{
  xmlXPathObjectPtr haystack;
  xmlXPathObjectPtr wanted;
  int found;

  CHECK_ARITY(2);

  /* top of the stack holds the last argument, i.e. the word to look for */
  CAST_TO_STRING;
  wanted = valuePop(ctxt);
  if ((wanted == NULL) || (wanted->type != XPATH_STRING)) {
    xmlXPathFreeObject(wanted);
    XP_ERROR(XPATH_INVALID_TYPE);
  }

  /* next comes the first argument, the string to search in */
  CAST_TO_STRING;
  haystack = valuePop(ctxt);
  if ((haystack == NULL) || (haystack->type != XPATH_STRING)) {
    xmlXPathFreeObject(haystack);
    xmlXPathFreeObject(wanted);
    XP_ERROR(XPATH_INVALID_TYPE);
  }

  found = (builtin_css_class(haystack->stringval, wanted->stringval) != NULL) ? 1 : 0;
  valuePush(ctxt, xmlXPathNewBoolean(found));

  xmlXPathFreeObject(haystack);
  xmlXPathFreeObject(wanted);
}

/*
* call-seq:
* register_ns(prefix, uri)
Expand Down Expand Up @@ -261,14 +335,18 @@ static VALUE new(VALUE klass, VALUE nodeobj)
xmlXPathContextPtr ctx;
VALUE self;

xmlXPathInit();

Data_Get_Struct(nodeobj, xmlNode, node);

xmlXPathInit();

ctx = xmlXPathNewContext(node->doc);
ctx->node = node;

xmlXPathRegisterNs(ctx, NOKOGIRI_BUILTIN_PREFIX, NOKOGIRI_BUILTIN_URI);
xmlXPathRegisterFuncNS(ctx, (const xmlChar *)"css-class", NOKOGIRI_BUILTIN_URI,
xpath_builtin_css_class);

self = Data_Wrap_Struct(klass, 0, deallocate, ctx);
/*rb_iv_set(self, "@xpath_handler", Qnil); */
return self;
}

Expand Down
68 changes: 68 additions & 0 deletions test/xml/test_xpath.rb
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,74 @@ def test_xpath_syntax_error
assert_equal false, e.message.include?('0:0')
end
end

# Exercises the nokogiri-builtin:css-class XPath function by evaluating it
# with literal string arguments against a minimal parsed HTML document.
describe "nokogiri-builtin:css-class xpath function" do
before do
@doc = Nokogiri::HTML::Document.parse("<html></html>")
end

it "accepts exactly two arguments" do
assert_raise(Nokogiri::XML::XPath::SyntaxError) do
@doc.xpath("nokogiri-builtin:css-class()")
end
assert_raise(Nokogiri::XML::XPath::SyntaxError) do
@doc.xpath("nokogiri-builtin:css-class('one')")
end
assert_raise(Nokogiri::XML::XPath::SyntaxError) do
@doc.xpath("nokogiri-builtin:css-class('one', 'two', 'three')")
end

# the two-argument form is only expected not to raise; its result is not checked
@doc.xpath("nokogiri-builtin:css-class('one', 'two')")
end

it "returns true if second arg is zero-length" do
assert(@doc.xpath("nokogiri-builtin:css-class('anything', '')"))
end

# first argument is a single word: only the identical word may match
it "matches equal string" do
refute(@doc.xpath("nokogiri-builtin:css-class('asdf', 'asd')"))
refute(@doc.xpath("nokogiri-builtin:css-class('asdf', 'sdf')"))
assert(@doc.xpath("nokogiri-builtin:css-class('asdf', 'asdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('asdf', 'xasdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('asdf', 'asdfx')"))
end

# target word is the first of two words
it "matches start of string" do
refute(@doc.xpath("nokogiri-builtin:css-class('asdf qwer', 'asd')"))
assert(@doc.xpath("nokogiri-builtin:css-class('asdf qwer', 'asdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('asdf qwer', 'asdfg')"))
end

# target word is the last of two words
it "matches end of string" do
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf', 'sdf')"))
assert(@doc.xpath("nokogiri-builtin:css-class('qwer asdf', 'asdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf', 'xasdf')"))
end

# target word has other words on both sides
it "matches middle of string" do
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf zxcv', 'xasdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf zxcv', 'asd')"))
assert(@doc.xpath("nokogiri-builtin:css-class('qwer asdf zxcv', 'asdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf zxcv', 'sdf')"))
refute(@doc.xpath("nokogiri-builtin:css-class('qwer asdf zxcv', 'asdfx')"))
end

# see xmlIsBlank_ch()
# one generated example per separator (space, tab, LF, CR); only the whole
# word 'qwer' may match, never a prefix, suffix or inner substring of it
[" ", "\t", "\n", "\r"].each do |ws|
it "only matches complete whitespace-delimited words (#{sprintf("0x%02X", ws.bytes.first)})" do
assert(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'qwer')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'q')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'qw')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'qwe')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'w')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'we')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'wer')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'e')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'er')"))
refute(@doc.xpath("nokogiri-builtin:css-class('a#{ws}qwer#{ws}b', 'r')"))
end
end
end
end
end
end

0 comments on commit 7566d4b

Please sign in to comment.