Merge branch 'rapidsai:branch-22.12' into 11337

rapidsai · Nov 4, 2022 · 7272685 · 7272685
2 parents 074de5a + 2a58ff6
commit 7272685
Show file tree

Hide file tree

Showing 8 changed files with 173 additions and 1 deletion.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -753,7 +753,10 @@ if(CUDF_BUILD_BENCHMARKS)
   include(${rapids-cmake-dir}/cpm/gbench.cmake)
   rapids_cpm_gbench()
 
-  # Find or install NVBench
+  # Find or install NVBench Temporarily force downloading of fmt because current versions of nvbench
+  # do not support the latest version of fmt, which is automatically pulled into our conda
+  # environments by mamba.
+  set(CPM_DOWNLOAD_fmt TRUE)
   include(${rapids-cmake-dir}/cpm/nvbench.cmake)
   rapids_cpm_nvbench()
   add_subdirectory(benchmarks)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -3276,6 +3276,46 @@ public final ColumnVector extractAllRecord(String pattern, int idx) {
     return new ColumnVector(extractAllRecord(this.getNativeView(), pattern, idx));
   }
 
+  /**
+   * Returns a boolean ColumnVector identifying rows which
+   * match the given like pattern.
+   *
+   * The like pattern expects only 2 wildcard special characters
+   * - `%` any number of any character (including no characters)
+   * - `_` any single character
+   *
+   * ```
+   * cv = ["azaa", "ababaabba", "aaxa"]
+   * r = cv.like("%a_aa%", "\\")
+   * r is now [true, true, false]
+   * r = cv.like("a__a", "\\")
+   * r is now [true, false, true]
+   * ```
+   *
+   * The escape character is specified to include either `%` or `_` in the search,
+   * which is expected to be either 0 or 1 character.
+   * If more than one character is specified, only the first character is used.
+   *
+   * ```
+   * cv = ["abc_def", "abc1def", "abc_"]
+   * r = cv.like("abc/_d%", "/")
+   * r is now [true, false, false]
+   * ```
+   * Any null string entries return corresponding null output column entries.
+   *
+   * @param pattern Like pattern to match to each string.
+   * @param escapeChar Character specifies the escape prefix; default is "\\".
+   * @return New ColumnVector of boolean results for each string.
+   */
+  public final ColumnVector like(Scalar pattern, Scalar escapeChar) {
+    assert type.equals(DType.STRING) : "column type must be a String";
+    assert pattern != null : "pattern scalar must not be null";
+    assert pattern.getType().equals(DType.STRING) : "pattern scalar must be a string scalar";
+    assert escapeChar != null : "escapeChar scalar must not be null";
+    assert escapeChar.getType().equals(DType.STRING) : "escapeChar scalar must be a string scalar";
+    return new ColumnVector(like(getNativeView(), pattern.getScalarHandle(), escapeChar.getScalarHandle()));
+  }
+
 
   /**
    * Converts all character sequences starting with '%' into character code-points
@@ -4034,6 +4074,16 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
    */
   private static native long containsRe(long cudfViewHandle, String pattern) throws CudfException;
 
+  /**
+   * Native method for checking if strings match the passed in like pattern
+   * and escape character.
+   * @param cudfViewHandle native handle of the cudf::column_view being operated on.
+   * @param patternHandle handle of scalar containing the string like pattern.
+   * @param escapeCharHandle handle of scalar containing the string escape character.
+   * @return native handle of the resulting cudf column containing the boolean results.
+   */
+  private static native long like(long cudfViewHandle, long patternHandle, long escapeCharHandle) throws CudfException;
+
   /**
    * Native method for checking if strings in a column contains a specified comparison string.
    * @param cudfViewHandle native handle of the cudf::column_view being operated on.

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1298,6 +1298,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, j
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_like(JNIEnv *env, jobject j_object,
+                                                            jlong j_view_handle, jlong pattern,
+                                                            jlong escapeChar) {
+  JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
+  JNI_NULL_CHECK(env, pattern, "pattern is null", false);
+  JNI_NULL_CHECK(env, escapeChar, "escape character is null", false);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const strings_column = cudf::strings_column_view{*column_view};
+    auto const pattern_scalar = reinterpret_cast<cudf::string_scalar const *>(pattern);
+    auto const escape_scalar = reinterpret_cast<cudf::string_scalar const *>(escapeChar);
+    return release_as_jlong(cudf::strings::like(strings_column, *pattern_scalar, *escape_scalar));
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, jclass,
                                                                   jlong lhs_view, jlong rhs_view,
                                                                   jint int_op, jint out_dtype,

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4193,6 +4193,63 @@ void testContainsReEmptyInput() {
     }
   }
 
+  @Test
+  void testLike() {
+    // Default escape character
+    try (ColumnVector testStrings = ColumnVector.fromStrings(
+           "a", "aa", "aaa", "aba", "b", "bb", "bba", "", "áéêú", "a1b2c3");
+         Scalar patternString1 = Scalar.fromString("a1b2c3");
+         Scalar patternString2 = Scalar.fromString("__a%");
+         Scalar defaultEscape = Scalar.fromString("\\");
+         ColumnVector res1 = testStrings.like(patternString1, defaultEscape);
+         ColumnVector res2 = testStrings.like(patternString2, defaultEscape);
+         ColumnVector expected1 = ColumnVector.fromBoxedBooleans(
+           false, false, false, false, false, false, false, false, false, true);
+         ColumnVector expected2 = ColumnVector.fromBoxedBooleans(
+           false, false, true, true, false, false, true, false, false, false)) {
+      assertColumnsAreEqual(expected1, res1);
+      assertColumnsAreEqual(expected2, res2);
+    }
+    // Non-default escape character
+    try (ColumnVector testStrings = ColumnVector.fromStrings(
+           "10%-20%", "10-20", "10%%-20%", "a_b", "b_a", "___", "", "aéb", "_%_", "_%a");
+         Scalar patternString1 = Scalar.fromString("10%%%%-20%%");
+         Scalar patternString2 = Scalar.fromString("___%%");
+         Scalar escapeChar1 = Scalar.fromString("%");
+         Scalar escapeChar2 = Scalar.fromString("_");
+         ColumnVector res1 = testStrings.like(patternString1, escapeChar1);
+         ColumnVector res2 = testStrings.like(patternString2, escapeChar2);
+         ColumnVector expected1 = ColumnVector.fromBoxedBooleans(
+           false, false, true, false, false, false, false, false, false, false);
+         ColumnVector expected2 = ColumnVector.fromBoxedBooleans(
+           false, false, false, false, false, false, false, false, true, true)) {
+      assertColumnsAreEqual(expected1, res1);
+      assertColumnsAreEqual(expected2, res2);
+    }
+    assertThrows(AssertionError.class, () -> {
+      try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, "");
+           Scalar defaultEscape = Scalar.fromString("\\");
+           ColumnVector res = testStrings.like(null, defaultEscape)) {}
+    });
+    assertThrows(AssertionError.class, () -> {
+      try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, "");
+           Scalar patternString = Scalar.fromString("");
+           ColumnVector res = testStrings.like(patternString, null)) {}
+    });
+    assertThrows(AssertionError.class, () -> {
+      try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, "");
+           Scalar patternString = Scalar.fromString("");
+           Scalar intScalar = Scalar.fromInt(1);
+           ColumnVector res = testStrings.like(patternString, intScalar)) {}
+    });
+    assertThrows(AssertionError.class, () -> {
+      try (ColumnVector testStrings = ColumnVector.fromStrings("a", "B", "cd", null, "");
+           Scalar intScalar = Scalar.fromInt(1);
+           Scalar defaultEscape = Scalar.fromString("\\");
+           ColumnVector res = testStrings.like(intScalar, defaultEscape)) {}
+    });
+  }
+
   @Test
   void testUrlDecode() {
     String[] inputs = new String[] {

diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 from __future__ import annotations
 
+from functools import cached_property
+
 import pandas as pd
 import pyarrow as pa
 
@@ -65,6 +67,17 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
             pd_series.index = index
         return pd_series
 
+    @cached_property
+    def memory_usage(self):
+        n = 0
+        if self.nullable:
+            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
+
+        for child in self.children:
+            n += child.memory_usage
+
+        return n
+
     def element_indexing(self, index: int):
         result = super().element_indexing(index)
         return {

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -4,6 +4,7 @@
 import operator
 import pickle
 import textwrap
+from functools import cached_property
 from typing import Any, Callable, Dict, List, Tuple, Type, Union
 
 import numpy as np
@@ -627,6 +628,13 @@ def deserialize(cls, header: dict, frames: list):
                 fields[k] = pickle.loads(dtype)
         return cls(fields)
 
+    @cached_property
+    def itemsize(self):
+        return sum(
+            cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type).itemsize
+            for field in self._typ
+        )
+
 
 decimal_dtype_template = textwrap.dedent(
     """

diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
@@ -864,6 +864,8 @@ def test_memory_usage():
     assert s1.memory_usage() == 44
     s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]])
     assert s2.memory_usage() == 68
+    s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]])
+    assert s3.memory_usage() == 40
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
@@ -371,3 +371,24 @@ def test_nested_struct_extract_host_scalars(data, idx, expected):
     series = cudf.Series(data)
 
     assert _nested_na_replace(series[idx]) == _nested_na_replace(expected)
+
+
+def test_struct_memory_usage():
+    s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}])
+    df = s.struct.explode()
+
+    assert_eq(s.memory_usage(), df.memory_usage().sum())
+
+
+def test_struct_with_null_memory_usage():
+    df = cudf.DataFrame(
+        {
+            "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"),
+            "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"),
+        }
+    )
+    s = df.to_struct()
+    assert s.memory_usage() == 80
+
+    s[2:4] = None
+    assert s.memory_usage() == 272