Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for XPath scalar functions - xpath,xpath_string,xpath_boolean,xpath_double #9219

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
import io.trino.operator.scalar.VersionFunction;
import io.trino.operator.scalar.WilsonInterval;
import io.trino.operator.scalar.WordStemFunction;
import io.trino.operator.scalar.XPathFunctions;
import io.trino.operator.scalar.time.LocalTimeFunction;
import io.trino.operator.scalar.time.TimeFunctions;
import io.trino.operator.scalar.time.TimeOperators;
Expand Down Expand Up @@ -481,6 +482,7 @@ public FunctionRegistry(
.scalar(SplitToMultimapFunction.class)
.scalars(VarbinaryFunctions.class)
.scalars(UrlFunctions.class)
.scalars(XPathFunctions.class)
.scalars(MathFunctions.class)
.scalar(MathFunctions.Abs.class)
.scalar(MathFunctions.Sign.class)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.operator.scalar;

import io.airlift.slice.Slice;
import io.trino.spi.block.Block;
import io.trino.spi.block.BlockBuilder;
import io.trino.spi.function.Description;
import io.trino.spi.function.LiteralParameters;
import io.trino.spi.function.ScalarFunction;
import io.trino.spi.function.SqlNullable;
import io.trino.spi.function.SqlType;
import io.trino.spi.type.StandardTypes;
import io.trino.util.UDFXPathUtil;
import org.w3c.dom.NodeList;

import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static io.airlift.slice.Slices.utf8Slice;
import static io.trino.spi.type.VarcharType.VARCHAR;

public final class XPathFunctions
{
private XPathFunctions() {}

@SqlNullable
@Description("Returns the text contents of the first xml node that matches the xpath expression")
@ScalarFunction("xpath_string")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value of the annotation is redundant.

@LiteralParameters({"x", "y"})
@SqlType("varchar(x)")
public static Slice xpathString(@SqlType("varchar(x)") Slice xml, @SqlType("varchar(y)") Slice path)
{
UDFXPathUtil xpath = new UDFXPathUtil();
String s = xpath.evalString(xml.toStringUtf8(), path.toStringUtf8());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please don't use abbreviation for variable name.

if (s == null) {
return null;
}
return utf8Slice(s);
}

@SqlNullable
@Description("Returns a string array of values within xml nodes that match the xpath expression")
@ScalarFunction("xpath")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value of the annotation is redundant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will fix it.

@LiteralParameters({"x", "y"})
@SqlType("array(varchar)")
public static Block xpath(@SqlType("varchar(y)") Slice xml, @SqlType("varchar(x)") Slice path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why x and y are flipped? Subsequent functions also.

{
List<String> initList = eval(xml.toStringUtf8(), path.toStringUtf8());

BlockBuilder builder = VARCHAR.createBlockBuilder(null, initList.size());
for (String value : initList) {
if (value == null) {
builder.appendNull();
}
else {
VARCHAR.writeSlice(builder, utf8Slice(value));
}
}
return builder.build();
}

@SqlNullable
@Description("Returns true if the XPath expression evaluates to true, or if a matching node is found")
@ScalarFunction("xpath_boolean")
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.BOOLEAN)
public static Boolean xpath_boolean(@SqlType("varchar(y)") Slice xml, @SqlType("varchar(x)") Slice path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to xpathBoolean

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to change the value in @ScalarFunction annotation, but this method should be renamed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

{
UDFXPathUtil xpath = new UDFXPathUtil();
return xpath.evalBoolean(xml.toStringUtf8(), path.toStringUtf8());
}

@SqlNullable
@Description("Return a double value, or the value 0.0 if no match is found, or NaN for non-numeric match")
@ScalarFunction("xpath_double")
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.DOUBLE)
public static Double xpath_double(@SqlType("varchar(y)") Slice xml, @SqlType("varchar(x)") Slice path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to xpathDouble

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do.

{
UDFXPathUtil xpath = new UDFXPathUtil();
return xpath.evalNumber(xml.toStringUtf8(), path.toStringUtf8());
}

private static List<String> eval(String xml, String path)
{
UDFXPathUtil xpath = new UDFXPathUtil();
NodeList nodeList = xpath.evalNodeList(xml, path);
if (nodeList == null) {
return Collections.<String>emptyList();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return ImmutableList.of().

}

return Stream.iterate(0, i -> i < nodeList.getLength(), i -> i + 1)
.map(nodeList::item)
.filter(field -> field.getNodeValue() != null)
.map(field -> field.getNodeValue())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use method reference.

.collect(Collectors.toList());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use toImmutableList().

}
}
243 changes: 243 additions & 0 deletions core/trino-main/src/main/java/io/trino/util/UDFXPathUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.util;

import io.trino.spi.TrinoException;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import static io.trino.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;

/**
* Utility class for all XPath UDFs. Each UDF instance should keep an instance
* of this class.
*/
public class UDFXPathUtil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use "UDF" to represent functions.

I guess you copied this class from Hive repository. Please add the reference to javadoc.
https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/xml/UDFXPathUtil.java

{
public static final String SAX_FEATURE_PREFIX = "http://xml.org/sax/features/";
public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE = "external-general-entities";
public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE = "external-parameter-entities";
private DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private XPath xpath = XPathFactory.newInstance().newXPath();

private XPathExpression expression;
private String oldPath;

public Object eval(String xml, String path, QName qname)
{
if (xml == null || path == null || qname == null) {
return null;
}

if (xml.length() == 0 || path.length() == 0) {
return null;
}

if (!path.equals(oldPath)) {
try {
expression = xpath.compile(path);
}
catch (XPathExpressionException e) {
expression = null;
}
oldPath = path;
}

if (expression == null) {
return null;
}

if (builder == null) {
try {
initializeDocumentBuilderFactory();
builder = documentBuilderFactory.newDocumentBuilder();
}
catch (ParserConfigurationException e) {
throw new RuntimeException("Error instantiating DocumentBuilder, cannot build xml parser", e);
}
}
ReusableStringReader reader = new ReusableStringReader();
InputSource inputSource = new InputSource(reader);
reader.set(xml);

try {
return expression.evaluate(builder.parse(inputSource), qname);
}
catch (XPathExpressionException ex) {
throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Invalid expression '" + oldPath + "'", ex);
}
catch (SAXParseException ex) {
throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Error parsing xml data '" + xml + "'", ex);
}
catch (Exception ex) {
throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Error loading expression '" + oldPath + "'", ex);
}
}

private void initializeDocumentBuilderFactory() throws ParserConfigurationException
{
documentBuilderFactory.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_GENERAL_ENTITIES_FEATURE, false);
documentBuilderFactory.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_PARAMETER_ENTITIES_FEATURE, false);
}

public Boolean evalBoolean(String xml, String path)
{
return (Boolean) eval(xml, path, XPathConstants.BOOLEAN);
}

public String evalString(String xml, String path)
{
return (String) eval(xml, path, XPathConstants.STRING);
}

public Double evalNumber(String xml, String path)
{
return (Double) eval(xml, path, XPathConstants.NUMBER);
}

public Node evalNode(String xml, String path)
{
return (Node) eval(xml, path, XPathConstants.NODE);
}

public NodeList evalNodeList(String xml, String path)
{
return (NodeList) eval(xml, path, XPathConstants.NODESET);
}

/**
* Reusable, threadsafe version of {@link StringReader}.
*/
public static class ReusableStringReader
extends Reader
{
private String streamData;
private int length = -1;
private int next;
private int mark;

public void set(String stringData)
{
this.streamData = stringData;
this.length = stringData.length();
this.mark = 0;
this.next = 0;
}

/** Check to make sure that the stream has not been closed */
private void ensureOpen() throws IOException
{
if (streamData == null) {
throw new IOException("Stream closed");
}
}

@Override
public int read() throws IOException
{
ensureOpen();
if (next >= length) {
return -1;
}
return streamData.charAt(next++);
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException
{
ensureOpen();
if ((off < 0) || (off > cbuf.length) || (len < 0)
|| ((off + len) > cbuf.length) || ((off + len) < 0)) {
throw new IndexOutOfBoundsException();
}
else if (len == 0) {
return 0;
}
if (next >= length) {
return -1;
}
int n = Math.min(length - next, len);
streamData.getChars(next, next + n, cbuf, off);
next += n;
return n;
}

@Override
public long skip(long ns) throws IOException
{
ensureOpen();
if (next >= length) {
return 0;
}
// Bound skip by beginning and end of the source
long n = Math.min(length - next, ns);
n = Math.max(-next, n);
next += n;
return n;
}

@Override
public boolean ready() throws IOException
{
ensureOpen();
return true;
}

@Override
public boolean markSupported()
{
return true;
}

@Override
public void mark(int readAheadLimit) throws IOException
{
if (readAheadLimit < 0) {
throw new IllegalArgumentException("Read-ahead limit < 0");
}
ensureOpen();
mark = next;
}

@Override
public void reset() throws IOException
{
ensureOpen();
next = mark;
}

@Override
public void close()
{
streamData = null;
}
}
}
Loading