From baa645dec67725036a724be02bb9d69de1d4136f Mon Sep 17 00:00:00 2001 From: Yuan Jiang <47068112+cindyyuanjiang@users.noreply.github.com> Date: Thu, 3 Nov 2022 07:10:58 -0700 Subject: [PATCH 1/3] Add strings `like` jni and native method (#12032) [rapidsai/cudf#11558](https://github.com/rapidsai/cudf/pull/11558) added strings `like` function to cudf, which is a wildcard-based string matching function based on SQL's LIKE statement. We add `like` jni and native method calling the `like` function in #11558 and corresponding Java unit tests. This is part of the solution for issue [NVIDIA/spark-rapids#6430](https://github.com/NVIDIA/spark-rapids/issues/6430). Authors: - Yuan Jiang (https://github.com/cindyyuanjiang) Approvers: - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12032 --- .../main/java/ai/rapids/cudf/ColumnView.java | 50 ++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 18 ++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 57 +++++++++++++++++++ 3 files changed, 125 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8bc764a078e..e639320b028 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3276,6 +3276,46 @@ public final ColumnVector extractAllRecord(String pattern, int idx) { return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx)); } + /** + * Returns a boolean ColumnVector identifying rows which + * match the given like pattern. 
+ * + * The like pattern expects only 2 wildcard special characters + * - `%` any number of any character (including no characters) + * - `_` any single character + * + * ``` + * cv = ["azaa", "ababaabba", "aaxa"] + * r = cv.like("%a_aa%", "\\") + * r is now [true, true, false] + * r = cv.like("a__a", "\\") + * r is now [true, false, true] + * ``` + * + * The escape character is specified to include either `%` or `_` in the search, + * which is expected to be either 0 or 1 character. + * If more than one character is specified, only the first character is used. + * + * ``` + * cv = ["abc_def", "abc1def", "abc_"] + * r = cv.like("abc/_d%", "/") + * r is now [true, false, false] + * ``` + * Any null string entries return corresponding null output column entries. + * + * @param pattern String scalar holding the like pattern to match against each string; must not be null. + * @param escapeChar String scalar holding the escape prefix character, e.g. "\\"; must not be null (there is no default). + * @return New ColumnVector of boolean results for each string. + */ + public final ColumnVector like(Scalar pattern, Scalar escapeChar) { + assert type.equals(DType.STRING) : "column type must be a String"; + assert pattern != null : "pattern scalar must not be null"; + assert pattern.getType().equals(DType.STRING) : "pattern scalar must be a string scalar"; + assert escapeChar != null : "escapeChar scalar must not be null"; + assert escapeChar.getType().equals(DType.STRING) : "escapeChar scalar must be a string scalar"; + return new ColumnVector(like(getNativeView(), pattern.getScalarHandle(), escapeChar.getScalarHandle())); + } + /** * Converts all character sequences starting with '%' into character code-points @@ -4034,6 +4074,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat */ private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException; + /** + * Native method for checking if strings match the passed in like pattern + * and escape character. 
+ * @param cudfViewHandle native handle of the cudf::column_view being operated on. + * @param patternHandle handle of scalar containing the string like pattern. + * @param escapeCharHandle handle of scalar containing the string escape character. + * @return native handle of the resulting cudf column containing the boolean results. + */ + private static native long like(long cudfViewHandle, long patternHandle, long escapeCharHandle) throws CudfException; + /** * Native method for checking if strings in a column contains a specified comparison string. * @param cudfViewHandle native handle of the cudf::column_view being operated on. diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index e2a96de93ef..f52d3201a10 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1298,6 +1298,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_like(JNIEnv *env, jobject j_object, + jlong j_view_handle, jlong pattern, + jlong escapeChar) { + JNI_NULL_CHECK(env, j_view_handle, "column is null", false); + JNI_NULL_CHECK(env, pattern, "pattern is null", false); + JNI_NULL_CHECK(env, escapeChar, "escape character is null", false); + + try { + cudf::jni::auto_set_device(env); + auto const column_view = reinterpret_cast<cudf::column_view *>(j_view_handle); + auto const strings_column = cudf::strings_column_view{*column_view}; + auto const pattern_scalar = reinterpret_cast<cudf::string_scalar *>(pattern); + auto const escape_scalar = reinterpret_cast<cudf::string_scalar *>(escapeChar); + return release_as_jlong(cudf::strings::like(strings_column, *pattern_scalar, *escape_scalar)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, jclass, jlong lhs_view, jlong rhs_view, jint int_op, jint out_dtype, diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f5c32b0da20..e1ed5e12fc2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4193,6 +4193,63 @@ void testContainsReEmptyInput() { } } + @Test + void testLike() { + // Default escape character + try (ColumnVector testStrings = ColumnVector.fromStrings( + "a", "aa", "aaa", "aba", "b", "bb", "bba", "", "áéêú", "a1b2c3"); + Scalar patternString1 = Scalar.fromString("a1b2c3"); + Scalar patternString2 = Scalar.fromString("__a%"); + Scalar defaultEscape = Scalar.fromString("\\"); + ColumnVector res1 = testStrings.like(patternString1, defaultEscape); + ColumnVector res2 = testStrings.like(patternString2, defaultEscape); + ColumnVector expected1 = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, false, false, false, false, true); + ColumnVector expected2 = ColumnVector.fromBoxedBooleans( + false, false, true, true, false, false, true, false, false, false)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + } + // Non-default escape character + try (ColumnVector testStrings = ColumnVector.fromStrings( + "10%-20%", "10-20", "10%%-20%", "a_b", "b_a", "___", "", "aéb", "_%_", "_%a"); + Scalar patternString1 = Scalar.fromString("10%%%%-20%%"); + Scalar patternString2 = Scalar.fromString("___%%"); + Scalar escapeChar1 = Scalar.fromString("%"); + Scalar escapeChar2 = Scalar.fromString("_"); + ColumnVector res1 = testStrings.like(patternString1, escapeChar1); + ColumnVector res2 = testStrings.like(patternString2, escapeChar2); + ColumnVector expected1 = ColumnVector.fromBoxedBooleans( + false, false, true, false, false, false, false, false, false, false); + ColumnVector expected2 = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, false, false, false, true, true)) { + assertColumnsAreEqual(expected1, res1); + assertColumnsAreEqual(expected2, res2); + } + 
assertThrows(AssertionError.class, () -> { + try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, ""); + Scalar defaultEscape = Scalar.fromString("\\"); + ColumnVector res = testStrings.like(null, defaultEscape)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, ""); + Scalar patternString = Scalar.fromString(""); + ColumnVector res = testStrings.like(patternString, null)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, ""); + Scalar patternString = Scalar.fromString(""); + Scalar intScalar = Scalar.fromInt(1); + ColumnVector res = testStrings.like(patternString, intScalar)) {} + }); + assertThrows(AssertionError.class, () -> { + try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, ""); + Scalar intScalar = Scalar.fromInt(1); + Scalar defaultEscape = Scalar.fromString("\\"); + ColumnVector res = testStrings.like(intScalar, defaultEscape)) {} + }); + } + @Test void testUrlDecode() { String[] inputs = new String[] { From b156c25d300a96120b0a63d7fb28fce9a0771b35 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 3 Nov 2022 09:58:38 -0500 Subject: [PATCH 2/3] Add `memory_usage` & `itemsize` implementation for `Struct` column & dtype (#12033) Fixes: #11893 - [x] This PR implements `StructColumn.memory_usage` and `StructDtype.itemsize` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12033 --- python/cudf/cudf/core/column/struct.py | 13 +++++++++++++ python/cudf/cudf/core/dtypes.py | 8 ++++++++ python/cudf/cudf/tests/test_list.py | 2 ++ python/cudf/cudf/tests/test_struct.py | 21 +++++++++++++++++++++ 4 files changed, 44 insertions(+) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 
67ff3e48dbd..69d70cf427f 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,6 +1,8 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations +from functools import cached_property + import pandas as pd import pyarrow as pa @@ -65,6 +67,17 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": pd_series.index = index return pd_series + @cached_property + def memory_usage(self): + n = 0 + if self.nullable: + n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) + + for child in self.children: + n += child.memory_usage + + return n + def element_indexing(self, index: int): result = super().element_indexing(index) return { diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 25b1b3895de..39c7b8e6b57 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -4,6 +4,7 @@ import operator import pickle import textwrap +from functools import cached_property from typing import Any, Callable, Dict, List, Tuple, Type, Union import numpy as np @@ -627,6 +628,13 @@ def deserialize(cls, header: dict, frames: list): fields[k] = pickle.loads(dtype) return cls(fields) + @cached_property + def itemsize(self): + return sum( + cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type).itemsize + for field in self._typ + ) + decimal_dtype_template = textwrap.dedent( """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 8ea11382419..4c2a14fc45c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -864,6 +864,8 @@ def test_memory_usage(): assert s1.memory_usage() == 44 s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]]) assert s2.memory_usage() == 68 + s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]]) + assert s3.memory_usage() == 40 @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_struct.py 
b/python/cudf/cudf/tests/test_struct.py index 4c70d20c488..eaee1efcbc8 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -371,3 +371,24 @@ def test_nested_struct_extract_host_scalars(data, idx, expected): series = cudf.Series(data) assert _nested_na_replace(series[idx]) == _nested_na_replace(expected) + + +def test_struct_memory_usage(): + s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]) + df = s.struct.explode() + + assert_eq(s.memory_usage(), df.memory_usage().sum()) + + +def test_struct_with_null_memory_usage(): + df = cudf.DataFrame( + { + "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"), + "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"), + } + ) + s = df.to_struct() + assert s.memory_usage() == 80 + + s[2:4] = None + assert s.memory_usage() == 272 From 2a58ff64bc2869d0a3527b95a8de334eb5bc800e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 4 Nov 2022 06:01:13 -0700 Subject: [PATCH 3/3] Force using old fmt in nvbench. (#12067) This is a port of #12064 to 22.12 to unblock CI because forward mergers are currently disabled. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/12067 --- cpp/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d63c7e75616..03cf4c7d2b7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -753,7 +753,10 @@ if(CUDF_BUILD_BENCHMARKS) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench() - # Find or install NVBench + # Find or install NVBench. Temporarily force downloading of fmt because current versions of + # nvbench do not support the latest version of fmt, which is automatically pulled into our + # conda environments by mamba. 
+ set(CPM_DOWNLOAD_fmt TRUE) include(${rapids-cmake-dir}/cpm/nvbench.cmake) rapids_cpm_nvbench() add_subdirectory(benchmarks)