From 7e4a985444148d727a1be457e745eff7fecc75fc Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 20:51:11 -0800 Subject: [PATCH] Some improvements to `parse_decimal` function and bindings for `is_fixed_point` (#9658) This PR adds Java bindings for `is_fixed_point` Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9658 --- .../strings/convert/convert_fixed_point.hpp | 8 ++-- .../main/java/ai/rapids/cudf/ColumnView.java | 32 ++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 16 ++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +++++++++---------- 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 7bcb7e72ab2..5fe5c880f9d 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -93,18 +93,16 @@ std::unique_ptr from_fixed_point( * @brief Returns a boolean column identifying strings in which all * characters are valid for conversion to fixed-point. * - * The output row entry is set to `true` if the corresponding string element - * has at least one character in [+-0123456789.]. The optional sign character - * must only be in the first position. The decimal point may only appear once. + * The sign and the exponent are optional. The decimal point may only appear once. * Also, the integer component must fit within the size limits of the * underlying fixed-point storage type. The value of the integer component * is based on the scale of the `decimal_type` provided. 
* * @code{.pseudo} * Example: - * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789', '-0.005] + * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34', '.789', '-0.005'] * b = is_fixed_point(s) - * b is [true, true, false, false, false, true, true, true] + * b is [true, true, false, false, true, true, true, true] * @endcode * * Any null entries result in corresponding null entries in the output column. diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index aa9d3f0d9f3..329c251f72d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -348,6 +348,34 @@ public final ColumnVector isNull() { return new ColumnVector(isNullNative(getNativeView())); } + /** + * Returns a Boolean vector with the same number of rows as this instance, that has + * TRUE for any entry that is a fixed-point, and FALSE if it's not a fixed-point. + * A null will be returned for null entries. + * + * The sign and the exponent are optional. The decimal point may only appear once. + * The integer component must fit within the size limits of the underlying fixed-point + * storage type. The value of the integer component is based on the scale of the target + * decimalType. + * + * Example: + * vec = ["A", "nan", "Inf", "-Inf", "Infinity", "infinity", "2.1474", "112.383", "-2.14748", + * "NULL", "null", null, "1.2", "1.2e-4", "0.00012"] + * vec.isFixedPoint() = [false, false, false, false, false, false, true, true, true, false, false, + * null, true, true, true] + * + * @param decimalType the data type that should be used for bounds checking. Note that only + * Decimal types (fixed-point) are allowed. 
+ * @return Boolean vector + */ + public final ColumnVector isFixedPoint(DType decimalType) { + assert type.equals(DType.STRING); + assert decimalType.isDecimalType(); + return new ColumnVector(isFixedPoint(getNativeView(), + decimalType.getTypeId().getNativeId(), decimalType.getScale())); + } + + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned @@ -375,6 +403,7 @@ public final ColumnVector isInteger() { */ public final ColumnVector isInteger(DType intType) { assert type.equals(DType.STRING); + assert intType.isBackedByInt() || intType.isBackedByLong(); return new ColumnVector(isIntegerWithType(getNativeView(), intType.getTypeId().getNativeId(), intType.getScale())); } @@ -3220,6 +3249,9 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + + private static native long isFixedPoint(long viewHandle, int nativeTypeId, int scale); + /** * Native method to concatenate a list column of strings (each row is a list of strings), * concatenates the strings within each row and returns a single strings column result. 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d2a2030e24c..bce330ea4a3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2023,6 +2023,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo CATCH_STD(env, 0) } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(JNIEnv *env, jobject, + jlong handle, jint j_dtype, + jint scale) { + + JNI_NULL_CHECK(env, handle, "native view handle is null", 0) + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *view = reinterpret_cast(handle); + cudf::data_type fp_dtype = cudf::jni::make_data_type(j_dtype, scale); + std::unique_ptr result = cudf::strings::is_fixed_point(*view, fp_dtype); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject, jlong handle, jint j_dtype, jint scale) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0d007aa0ed7..4d52862f7b0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -18,12 +18,7 @@ package ai.rapids.cudf; -import ai.rapids.cudf.HostColumnVector.BasicType; -import ai.rapids.cudf.HostColumnVector.DataType; -import ai.rapids.cudf.HostColumnVector.ListType; -import ai.rapids.cudf.HostColumnVector.StructData; -import ai.rapids.cudf.HostColumnVector.StructType; - +import ai.rapids.cudf.HostColumnVector.*; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -38,20 +33,9 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static ai.rapids.cudf.QuantileMethod.HIGHER; -import static ai.rapids.cudf.QuantileMethod.LINEAR; -import static ai.rapids.cudf.QuantileMethod.LOWER; -import static 
ai.rapids.cudf.QuantileMethod.MIDPOINT; -import static ai.rapids.cudf.QuantileMethod.NEAREST; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertStructColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertTablesAreEqual; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static ai.rapids.cudf.QuantileMethod.*; +import static ai.rapids.cudf.TableTest.*; +import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; public class ColumnVectorTest extends CudfTestBase { @@ -4834,6 +4818,20 @@ void testIsInteger() { } } + @Test + void testIsFixedPoint() { + String[] decimalStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", + "2.1474", "112.383", "-2.14748", "NULL", "null", null, "1.2", "1.2e-4", "0.00012"}; + + DType dt = DType.create(DType.DTypeEnum.DECIMAL32, -3); + try (ColumnVector decStringCV = ColumnVector.fromStrings(decimalStrings); + ColumnVector isFixedPoint = decStringCV.isFixedPoint(dt); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, false, false + , false, true, true, true, false, false, null, true, true, true)) { + assertColumnsAreEqual(expected, isFixedPoint); + } + } + @Test void testIsFloat() { String[] floatStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "-0.0", "0.0",