Skip to content

Commit

Permalink
Merge branch 'rapidsai:branch-22.12' into 11337
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored Nov 4, 2022
2 parents 074de5a + 2a58ff6 commit 7272685
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 1 deletion.
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,10 @@ if(CUDF_BUILD_BENCHMARKS)
include(${rapids-cmake-dir}/cpm/gbench.cmake)
rapids_cpm_gbench()

# Find or install NVBench
# Find or install NVBench. Temporarily force downloading of fmt because current versions of
# nvbench do not support the latest version of fmt, which is automatically pulled into our conda
# environments by mamba.
set(CPM_DOWNLOAD_fmt TRUE)
include(${rapids-cmake-dir}/cpm/nvbench.cmake)
rapids_cpm_nvbench()
add_subdirectory(benchmarks)
Expand Down
50 changes: 50 additions & 0 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -3276,6 +3276,46 @@ public final ColumnVector extractAllRecord(String pattern, int idx) {
return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx));
}

/**
 * Tests every string row of this column against a SQL-style LIKE pattern and
 * returns the outcome as a boolean ColumnVector.
 *
 * Only two wildcards are recognized in the pattern:
 * - `%` matches any run of characters, including an empty run
 * - `_` matches exactly one character
 *
 * Illustration (the pattern and escape values are passed as string scalars):
 * ```
 * cv = ["azaa", "ababaabba", "aaxa"]
 * r = cv.like("%a_aa%", "\\")
 * r is now [true, true, false]
 * r = cv.like("a__a", "\\")
 * r is now [true, false, true]
 * ```
 *
 * The escape character makes it possible to search for a literal `%` or `_`.
 * It is expected to hold zero or one character; when more than one character
 * is supplied only the first one is used.
 *
 * ```
 * cv = ["abc_def", "abc1def", "abc_"]
 * r = cv.like("abc/_d%", "/")
 * r is now [true, false, false]
 * ```
 * Rows that are null in the input are null in the result.
 *
 * Argument validation is done with `assert`, so it only takes effect when
 * JVM assertions are enabled.
 *
 * @param pattern String scalar holding the like pattern applied to each row; must not be null.
 * @param escapeChar String scalar holding the escape prefix (for example "\\"); must not be
 *                   null and has to be supplied explicitly by the caller.
 * @return New ColumnVector of boolean results for each string.
 */
public final ColumnVector like(Scalar pattern, Scalar escapeChar) {
  assert type.equals(DType.STRING) : "column type must be a String";
  assert pattern != null : "pattern scalar must not be null";
  assert pattern.getType().equals(DType.STRING) : "pattern scalar must be a string scalar";
  assert escapeChar != null : "escapeChar scalar must not be null";
  assert escapeChar.getType().equals(DType.STRING) : "escapeChar scalar must be a string scalar";

  // Resolve the native handles in the same order the arguments are passed to the native call.
  final long viewHandle = getNativeView();
  final long patternHandle = pattern.getScalarHandle();
  final long escapeHandle = escapeChar.getScalarHandle();
  return new ColumnVector(like(viewHandle, patternHandle, escapeHandle));
}


/**
* Converts all character sequences starting with '%' into character code-points
Expand Down Expand Up @@ -4034,6 +4074,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
*/
private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException;

/**
 * Native method for checking if strings match the passed in like pattern
 * and escape character. This is the JNI entry point behind the public
 * {@code like(Scalar, Scalar)} method, which passes the column's native view
 * and the scalar handles of its two arguments.
 * @param cudfViewHandle native handle of the cudf::column_view being operated on.
 * @param patternHandle handle of scalar containing the string like pattern.
 * @param escapeCharHandle handle of scalar containing the string escape character.
 * @return native handle of the resulting cudf column containing the boolean results.
 * @throws CudfException declared for failures raised by the native implementation.
 */
private static native long like(long cudfViewHandle, long patternHandle, long escapeCharHandle) throws CudfException;

/**
* Native method for checking if strings in a column contains a specified comparison string.
* @param cudfViewHandle native handle of the cudf::column_view being operated on.
Expand Down
18 changes: 18 additions & 0 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1298,6 +1298,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, j
CATCH_STD(env, 0);
}

// JNI bridge for ai.rapids.cudf.ColumnView.like(long, long, long).
//
// j_view_handle is reinterpreted as a cudf::column_view pointer, while pattern and
// escapeChar are reinterpreted as cudf::string_scalar pointers. The column produced by
// cudf::strings::like is handed back to Java as a jlong handle via release_as_jlong.
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_like(JNIEnv *env, jobject j_object,
                                                            jlong j_view_handle, jlong pattern,
                                                            jlong escapeChar) {
  // Validate all three handles up front.
  // NOTE(review): JNI_NULL_CHECK is defined elsewhere; presumably it raises a Java
  // exception and returns its last argument when the handle is 0 - confirm.
  JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
  JNI_NULL_CHECK(env, pattern, "pattern is null", false);
  JNI_NULL_CHECK(env, escapeChar, "escape character is null", false);

  try {
    cudf::jni::auto_set_device(env);

    // Recover the typed native objects behind the opaque Java handles.
    auto const *input_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
    cudf::strings_column_view const strings_input{*input_view};
    auto const *like_pattern = reinterpret_cast<cudf::string_scalar const *>(pattern);
    auto const *escape_prefix = reinterpret_cast<cudf::string_scalar const *>(escapeChar);

    return release_as_jlong(cudf::strings::like(strings_input, *like_pattern, *escape_prefix));
  }
  CATCH_STD(env, 0);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, jclass,
jlong lhs_view, jlong rhs_view,
jint int_op, jint out_dtype,
Expand Down
57 changes: 57 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4193,6 +4193,63 @@ void testContainsReEmptyInput() {
}
}

@Test
void testLike() {
  // Case 1: backslash escape that never appears in the patterns, so the patterns are
  // interpreted purely through the `%` and `_` wildcards.
  try (ColumnVector input = ColumnVector.fromStrings(
           "a", "aa", "aaa", "aba", "b", "bb", "bba", "", "áéêú", "a1b2c3");
       Scalar exactPattern = Scalar.fromString("a1b2c3");
       Scalar wildcardPattern = Scalar.fromString("__a%");
       Scalar backslash = Scalar.fromString("\\");
       ColumnVector exactResult = input.like(exactPattern, backslash);
       ColumnVector wildcardResult = input.like(wildcardPattern, backslash);
       // Only the last row equals the wildcard-free pattern.
       ColumnVector exactExpected = ColumnVector.fromBoxedBooleans(
           false, false, false, false, false, false, false, false, false, true);
       // Rows whose third character is 'a' match "__a%".
       ColumnVector wildcardExpected = ColumnVector.fromBoxedBooleans(
           false, false, true, true, false, false, true, false, false, false)) {
    assertColumnsAreEqual(exactExpected, exactResult);
    assertColumnsAreEqual(wildcardExpected, wildcardResult);
  }

  // Case 2: the wildcard characters themselves are used as the escape character.
  try (ColumnVector input = ColumnVector.fromStrings(
           "10%-20%", "10-20", "10%%-20%", "a_b", "b_a", "___", "", "aéb", "_%_", "_%a");
       Scalar percentEscaped = Scalar.fromString("10%%%%-20%%");
       Scalar underscoreEscaped = Scalar.fromString("___%%");
       Scalar percentEscape = Scalar.fromString("%");
       Scalar underscoreEscape = Scalar.fromString("_");
       ColumnVector percentResult = input.like(percentEscaped, percentEscape);
       ColumnVector underscoreResult = input.like(underscoreEscaped, underscoreEscape);
       // With '%' as the escape, only the literal "10%%-20%" row is expected to match.
       ColumnVector percentExpected = ColumnVector.fromBoxedBooleans(
           false, false, true, false, false, false, false, false, false, false);
       // With '_' as the escape, only the rows starting with "_%" are expected to match.
       ColumnVector underscoreExpected = ColumnVector.fromBoxedBooleans(
           false, false, false, false, false, false, false, false, true, true)) {
    assertColumnsAreEqual(percentExpected, percentResult);
    assertColumnsAreEqual(underscoreExpected, underscoreResult);
  }

  // Case 3: invalid arguments are expected to surface as AssertionError.

  // Null pattern scalar.
  assertThrows(AssertionError.class, () -> {
    try (ColumnVector input = ColumnVector.fromStrings("a", "B", "cd", null, "");
         Scalar backslash = Scalar.fromString("\\");
         ColumnVector ignored = input.like(null, backslash)) {}
  });

  // Null escape scalar.
  assertThrows(AssertionError.class, () -> {
    try (ColumnVector input = ColumnVector.fromStrings("a", "B", "cd", null, "");
         Scalar emptyPattern = Scalar.fromString("");
         ColumnVector ignored = input.like(emptyPattern, null)) {}
  });

  // Escape scalar that is not a string scalar.
  assertThrows(AssertionError.class, () -> {
    try (ColumnVector input = ColumnVector.fromStrings("a", "B", "cd", null, "");
         Scalar emptyPattern = Scalar.fromString("");
         Scalar notAString = Scalar.fromInt(1);
         ColumnVector ignored = input.like(emptyPattern, notAString)) {}
  });

  // Pattern scalar that is not a string scalar.
  assertThrows(AssertionError.class, () -> {
    try (ColumnVector input = ColumnVector.fromStrings("a", "B", "cd", null, "");
         Scalar notAString = Scalar.fromInt(1);
         Scalar backslash = Scalar.fromString("\\");
         ColumnVector ignored = input.like(notAString, backslash)) {}
  });
}

@Test
void testUrlDecode() {
String[] inputs = new String[] {
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from __future__ import annotations

from functools import cached_property

import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -65,6 +67,17 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
pd_series.index = index
return pd_series

@cached_property
def memory_usage(self):
    """Total bytes attributed to this column.

    The result is the null-mask allocation size for ``self.size`` rows
    (counted only when the column is nullable) plus the ``memory_usage``
    of every child column. The value is cached after the first access.
    """
    mask_bytes = (
        cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
        if self.nullable
        else 0
    )
    return mask_bytes + sum(child.memory_usage for child in self.children)

def element_indexing(self, index: int):
result = super().element_indexing(index)
return {
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import operator
import pickle
import textwrap
from functools import cached_property
from typing import Any, Callable, Dict, List, Tuple, Type, Union

import numpy as np
Expand Down Expand Up @@ -627,6 +628,13 @@ def deserialize(cls, header: dict, frames: list):
fields[k] = pickle.loads(dtype)
return cls(fields)

@cached_property
def itemsize(self):
    """Sum of the ``itemsize`` of every field's dtype.

    Each field of ``self._typ`` is converted with
    ``cudf.utils.dtypes.cudf_dtype_from_pa_type`` and the resulting
    dtypes' item sizes are added together. The value is cached after the
    first access.
    """
    total = 0
    for field in self._typ:
        field_dtype = cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type)
        total += field_dtype.itemsize
    return total


decimal_dtype_template = textwrap.dedent(
"""
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,8 @@ def test_memory_usage():
assert s1.memory_usage() == 44
s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]])
assert s2.memory_usage() == 68
s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]])
assert s3.memory_usage() == 40


@pytest.mark.parametrize(
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,24 @@ def test_nested_struct_extract_host_scalars(data, idx, expected):
series = cudf.Series(data)

assert _nested_na_replace(series[idx]) == _nested_na_replace(expected)


def test_struct_memory_usage():
    """A struct Series must report the same memory usage as the sum over
    the columns obtained by exploding it."""
    records = [{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]
    struct_series = cudf.Series(records)
    exploded = struct_series.struct.explode()

    assert_eq(struct_series.memory_usage(), exploded.memory_usage().sum())


def test_struct_with_null_memory_usage():
    """Memory usage of a struct Series before and after nulling some rows."""
    field_a = cudf.Series([1, 2, -1, -1, 3], dtype="int64")
    field_b = cudf.Series([10, 20, -1, -1, 30], dtype="int64")
    frame = cudf.DataFrame({"a": field_a, "b": field_b})

    struct_series = frame.to_struct()
    # Consistent with 5 rows x 2 int64 fields x 8 bytes each.
    assert struct_series.memory_usage() == 80

    # Null out rows 2 and 3.
    # NOTE(review): the extra 192 bytes are presumably validity-mask
    # allocations introduced by the nulls - confirm.
    struct_series[2:4] = None
    assert struct_series.memory_usage() == 272

0 comments on commit 7272685

Please sign in to comment.