Skip to content

Commit

Permalink
Add UDF split_to_multimap
Browse files Browse the repository at this point in the history
  • Loading branch information
rongrong committed Jun 28, 2018
1 parent 0db1dc3 commit 2f00cd3
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 0 deletions.
7 changes: 7 additions & 0 deletions presto-docs/src/main/sphinx/functions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ String Functions
``entryDelimiter`` splits ``string`` into key-value pairs. ``keyValueDelimiter`` splits
each pair into key and value.

.. function:: split_to_multimap(string, entryDelimiter, keyValueDelimiter) -> map<varchar, array<varchar>>

Splits ``string`` by ``entryDelimiter`` and ``keyValueDelimiter`` and returns a map
containing an array of values for each unique key. ``entryDelimiter`` splits ``string``
into key-value pairs. ``keyValueDelimiter`` splits each pair into key and value. The
values for each key will be in the same order as they appeared in ``string``.

.. function:: strpos(string, substring) -> bigint

Returns the starting position of the first instance of ``substring`` in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
import com.facebook.presto.operator.scalar.SequenceFunction;
import com.facebook.presto.operator.scalar.SessionFunctions;
import com.facebook.presto.operator.scalar.SplitToMapFunction;
import com.facebook.presto.operator.scalar.SplitToMultimapFunction;
import com.facebook.presto.operator.scalar.StringFunctions;
import com.facebook.presto.operator.scalar.TryFunction;
import com.facebook.presto.operator.scalar.TypeOfFunction;
Expand Down Expand Up @@ -454,6 +455,7 @@ public FunctionRegistry(TypeManager typeManager, BlockEncodingSerde blockEncodin
.scalars(StringFunctions.class)
.scalars(WordStemFunction.class)
.scalar(SplitToMapFunction.class)
.scalar(SplitToMultimapFunction.class)
.scalars(VarbinaryFunctions.class)
.scalars(UrlFunctions.class)
.scalars(MathFunctions.class)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.facebook.presto.operator.scalar;

import com.facebook.presto.spi.PageBuilder;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.block.BlockBuilder;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import com.facebook.presto.spi.function.TypeParameter;
import com.facebook.presto.spi.type.StandardTypes;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multimap;
import io.airlift.slice.Slice;

import java.util.Collection;
import java.util.Map;

import static com.facebook.presto.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static com.facebook.presto.util.Failures.checkCondition;

/**
 * Implementation of the {@code split_to_multimap(string, entryDelimiter, keyValueDelimiter)}
 * scalar function.
 *
 * <p>The input is cut into entries at every {@code entryDelimiter}; each entry is then cut into
 * a key and a value at {@code keyValueDelimiter}. The result maps every distinct key to the
 * array of values seen for it, with the values kept in the order they occur in the input.
 */
@Description("creates a multimap by splitting a string into key/value pairs")
@ScalarFunction("split_to_multimap")
public class SplitToMultimapFunction
{
    // Single-column page builder (the column has the map type) that results are appended to.
    private final PageBuilder pageBuilder;

    public SplitToMultimapFunction(@TypeParameter("map(varchar,array(varchar))") Type mapType)
    {
        pageBuilder = new PageBuilder(ImmutableList.of(mapType));
    }

    /**
     * Splits {@code string} into a map from key to the list of values for that key.
     *
     * @param mapType type used to read the finished map back out of the block builder
     * @param string text to split
     * @param entryDelimiter separator between key-value entries; must be non-empty
     * @param keyValueDelimiter separator between key and value inside an entry; must be
     *        non-empty and different from {@code entryDelimiter}
     * @return the map block written at the last position of the output block builder
     * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} when a delimiter is empty,
     *         when both delimiters are equal, or when an entry does not contain
     *         {@code keyValueDelimiter} exactly once
     */
    @SqlType("map(varchar,array(varchar))")
    public Block splitToMultimap(
            @TypeParameter("map(varchar,array(varchar))") Type mapType,
            @SqlType(StandardTypes.VARCHAR) Slice string,
            @SqlType(StandardTypes.VARCHAR) Slice entryDelimiter,
            @SqlType(StandardTypes.VARCHAR) Slice keyValueDelimiter)
    {
        checkCondition(entryDelimiter.length() > 0, INVALID_FUNCTION_ARGUMENT, "entryDelimiter is empty");
        checkCondition(keyValueDelimiter.length() > 0, INVALID_FUNCTION_ARGUMENT, "keyValueDelimiter is empty");
        checkCondition(!entryDelimiter.equals(keyValueDelimiter), INVALID_FUNCTION_ARGUMENT, "entryDelimiter and keyValueDelimiter must not be the same");

        Multimap<Slice, Slice> valuesByKey = parseEntries(string, entryDelimiter, keyValueDelimiter);

        // Start over with an empty page once the builder reports that it is full.
        if (pageBuilder.isFull()) {
            pageBuilder.reset();
        }
        pageBuilder.declarePosition();

        BlockBuilder mapColumn = pageBuilder.getBlockBuilder(0);
        BlockBuilder mapWriter = mapColumn.beginBlockEntry();
        for (Map.Entry<Slice, Collection<Slice>> keyAndValues : valuesByKey.asMap().entrySet()) {
            // Each map entry is the key followed by a nested block entry holding its values.
            VARCHAR.writeSlice(mapWriter, keyAndValues.getKey());
            BlockBuilder arrayWriter = mapWriter.beginBlockEntry();
            keyAndValues.getValue().forEach(element -> VARCHAR.writeSlice(arrayWriter, element));
            mapWriter.closeEntry();
        }
        mapColumn.closeEntry();

        // The map that was just closed sits at the last position of the column.
        int lastPosition = mapColumn.getPositionCount() - 1;
        return (Block) mapType.getObject(mapColumn, lastPosition);
    }

    /**
     * Scans {@code string} entry by entry and collects every key with its values.
     * Both delimiters are expected to have been validated by the caller.
     */
    private static Multimap<Slice, Slice> parseEntries(Slice string, Slice entryDelimiter, Slice keyValueDelimiter)
    {
        Multimap<Slice, Slice> collected = ArrayListMultimap.create();
        int position = 0;
        while (position < string.length()) {
            int delimiterIndex = string.indexOf(entryDelimiter, position);
            // Without a further entry delimiter, the remainder of the input is the final entry.
            boolean finalEntry = delimiterIndex < 0;
            int entryLimit = finalEntry ? string.length() : delimiterIndex;
            Slice entry = string.slice(position, entryLimit - position);

            int separatorIndex = entry.indexOf(keyValueDelimiter);
            if (separatorIndex < 0) {
                throw malformedEntry(entry);
            }

            int valueOffset = separatorIndex + keyValueDelimiter.length();
            Slice entryValue = entry.slice(valueOffset, entry.length() - valueOffset);
            // A second key-value delimiter inside the value makes the entry ambiguous.
            if (entryValue.indexOf(keyValueDelimiter) >= 0) {
                throw malformedEntry(entry);
            }

            collected.put(entry.slice(0, separatorIndex), entryValue);

            if (finalEntry) {
                break;
            }
            // Resume scanning right behind the entry delimiter that ended this entry.
            position = delimiterIndex + entryDelimiter.length();
        }
        return collected;
    }

    /** Builds the error reported for an entry that lacks, or repeats, the key-value delimiter. */
    private static PrestoException malformedEntry(Slice entry)
    {
        return new PrestoException(INVALID_FUNCTION_ARGUMENT, "Key-value delimiter must appear exactly once in each entry. Bad input: " + entry.toStringUtf8());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,58 @@ public void testSplitToMap()
assertCachedInstanceHasBoundedRetainedSize("SPLIT_TO_MAP('a=123,b=.4,c=,=d', ',', '=')");
}

/**
 * Exercises SPLIT_TO_MULTIMAP: basic splitting, value ordering for repeated keys,
 * multi-character and non-ASCII delimiters, trailing entry delimiters, and every
 * argument-validation error the function reports.
 */
@Test
public void testSplitToMultimap()
{
    MapType expectedType = mapType(VARCHAR, new ArrayType(VARCHAR));

    assertFunction("SPLIT_TO_MULTIMAP('', ',', '=')", expectedType, ImmutableMap.of());
    assertFunction(
            "SPLIT_TO_MULTIMAP('a=123,b=.4,c=,=d', ',', '=')",
            expectedType,
            ImmutableMap.of(
                    "a", ImmutableList.of("123"),
                    "b", ImmutableList.of(".4"),
                    "c", ImmutableList.of(""),
                    "", ImmutableList.of("d")));
    assertFunction("SPLIT_TO_MULTIMAP('=', ',', '=')", expectedType, ImmutableMap.of("", ImmutableList.of("")));

    // Test multiple values of the same key preserve the order as they appear in input
    assertFunction("SPLIT_TO_MULTIMAP('a=123,a=.4,a=5.67', ',', '=')", expectedType, ImmutableMap.of("a", ImmutableList.of("123", ".4", "5.67")));

    // A trailing entry delimiter does not produce an extra (empty) entry
    assertFunction("SPLIT_TO_MULTIMAP('a=123,', ',', '=')", expectedType, ImmutableMap.of("a", ImmutableList.of("123")));

    // Test multi-character delimiters
    assertFunction("SPLIT_TO_MULTIMAP('key=>value,key=>value', ',', '=>')", expectedType, ImmutableMap.of("key", ImmutableList.of("value", "value")));
    assertFunction(
            "SPLIT_TO_MULTIMAP('key => value, key => value', ',', '=>')",
            expectedType,
            ImmutableMap.of(
                    "key ", ImmutableList.of(" value"),
                    " key ", ImmutableList.of(" value")));
    assertFunction(
            "SPLIT_TO_MULTIMAP('key => value, key => value', ', ', '=>')",
            expectedType,
            ImmutableMap.of(
                    "key ", ImmutableList.of(" value", " value")));

    // Test non-ASCII
    assertFunction("SPLIT_TO_MULTIMAP('\u4EA0\u4EFF\u4EA1', '\u4E00', '\u4EFF')", expectedType, ImmutableMap.of("\u4EA0", ImmutableList.of("\u4EA1")));
    assertFunction(
            "SPLIT_TO_MULTIMAP('\u4EA0\u4EFF\u4EA1\u4E00\u4EA0\u4EFF\u4EB1', '\u4E00', '\u4EFF')",
            expectedType,
            ImmutableMap.of("\u4EA0", ImmutableList.of("\u4EA1", "\u4EB1")));

    // Neither delimiter may be empty. The entry delimiter is validated first.
    assertInvalidFunction("SPLIT_TO_MULTIMAP('a=123', '', '=')", "entryDelimiter is empty");
    assertInvalidFunction("SPLIT_TO_MULTIMAP('a=123', ',', '')", "keyValueDelimiter is empty");
    assertInvalidFunction("SPLIT_TO_MULTIMAP('a=123', '', '')", "entryDelimiter is empty");

    // Entry delimiter and key-value delimiter must not be the same.
    assertInvalidFunction("SPLIT_TO_MULTIMAP('', '\u4EFF', '\u4EFF')", "entryDelimiter and keyValueDelimiter must not be the same");
    assertInvalidFunction("SPLIT_TO_MULTIMAP('a=123,b=.4,c=', '=', '=')", "entryDelimiter and keyValueDelimiter must not be the same");

    // Key-value delimiter must appear exactly once in each entry.
    assertInvalidFunction("SPLIT_TO_MULTIMAP('key', ',', '=')", "Key-value delimiter must appear exactly once in each entry. Bad input: key");
    assertInvalidFunction("SPLIT_TO_MULTIMAP('key==value', ',', '=')", "Key-value delimiter must appear exactly once in each entry. Bad input: key==value");
    assertInvalidFunction("SPLIT_TO_MULTIMAP('key=va=lue', ',', '=')", "Key-value delimiter must appear exactly once in each entry. Bad input: key=va=lue");

    assertCachedInstanceHasBoundedRetainedSize("SPLIT_TO_MULTIMAP('a=123,b=.4,c=,=d', ',', '=')");
}

@Test
public void testSplitPart()
{
Expand Down

0 comments on commit 2f00cd3

Please sign in to comment.