From 9027fdf0533c533069ef0367ce3a3f8d43b9e197 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Tue, 7 Feb 2023 05:44:05 +0000 Subject: [PATCH 1/6] Allow setting the seed argument for hash partition Signed-off-by: Liangcai Li --- java/src/main/java/ai/rapids/cudf/Table.java | 17 +++++++++++++++++ java/src/main/native/src/TableJni.cpp | 8 ++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3eed7e45eed..45697da829c 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -194,6 +194,7 @@ private static native long[] hashPartition(long inputTable, int[] columnsToHash, int hashTypeId, int numberOfPartitions, + int seed, int[] outputOffsets) throws CudfException; private static native long[] roundRobinPartition(long inputTable, @@ -4253,12 +4254,28 @@ public PartitionedTable hashPartition(int numberOfPartitions) { * {@link Table} class */ public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { + final int DEFAULT_HASH_SEED = 0; + return hashPartition(type, numberOfPartitions, DEFAULT_HASH_SEED); + } + + /** + * Hash partition a table into the specified number of partitions. + * @param type the type of hash to use. Depending on the type of hash different restrictions + * on the hash column(s) may exist. Not all hash functions are guaranteed to work + * besides IDENTITY and MURMUR3. 
+ * @param numberOfPartitions - number of partitions to use + * @param seed - the seed of hash algorithm + * @return {@link PartitionedTable} - Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable hashPartition(HashType type, int numberOfPartitions, int seed) { int[] partitionOffsets = new int[numberOfPartitions]; return new PartitionedTable(new Table(Table.hashPartition( operation.table.nativeHandle, operation.indices, type.nativeId, partitionOffsets.length, + seed, partitionOffsets)), partitionOffsets); } } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 3d730ff61a1..7bce30e2f6e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2655,16 +2655,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition( JNIEnv *env, jclass, jlong input_table, jintArray columns_to_hash, jint hash_function, - jint number_of_partitions, jintArray output_offsets) { + jint number_of_partitions, jint seed, jintArray output_offsets) { JNI_NULL_CHECK(env, input_table, "input table is null", NULL); JNI_NULL_CHECK(env, columns_to_hash, "columns_to_hash is null", NULL); JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL); JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL); + // For simplicity of converting a signed int to a unsigned int + JNI_ARG_CHECK(env, seed >= 0, "seed is negative", NULL); try { cudf::jni::auto_set_device(env); auto const hash_func = static_cast<cudf::hash_id>(hash_function); + auto const hash_seed = static_cast<uint32_t>(seed); auto const n_input_table = reinterpret_cast<cudf::table_view const *>(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); JNI_ARG_CHECK(env, n_columns_to_hash.size() > 0, "columns_to_hash is zero", NULL); @@ -2673,7 +2676,8 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_hashPartition( n_columns_to_hash.end()); auto [partitioned_table, partition_offsets] = - cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func); + cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func, + hash_seed); cudf::jni::native_jintArray n_output_offsets(env, output_offsets); std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); From 2d7fbc3d1a496faa2000eff22cab672bec107168 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Tue, 7 Feb 2023 05:58:28 +0000 Subject: [PATCH 2/6] A format fix Signed-off-by: Liangcai Li --- java/src/main/native/src/TableJni.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 7bce30e2f6e..1ad43e6face 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2675,9 +2675,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition( std::vector columns_to_hash_vec(n_columns_to_hash.begin(), n_columns_to_hash.end()); - auto [partitioned_table, partition_offsets] = - cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func, - hash_seed); + auto [partitioned_table, partition_offsets] = cudf::hash_partition( + *n_input_table, columns_to_hash_vec, number_of_partitions, hash_func, hash_seed); cudf::jni::native_jintArray n_output_offsets(env, output_offsets); std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); From 38e702fd51799f6dbd4ed6b5d0c8bf3d9eb779af Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Wed, 8 Feb 2023 09:43:12 +0800 Subject: [PATCH 3/6] Update java/src/main/java/ai/rapids/cudf/Table.java Co-authored-by: Nghia Truong --- java/src/main/java/ai/rapids/cudf/Table.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 45697da829c..40ca9bf74f1 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -4265,7 +4265,7 @@ public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { * besides IDENTITY and MURMUR3. * @param numberOfPartitions - number of partitions to use * @param seed - the seed of hash algorithm - * @return {@link PartitionedTable} - Table that exposes a limited functionality of the + * @return Table that exposes a limited functionality of the * {@link Table} class */ public PartitionedTable hashPartition(HashType type, int numberOfPartitions, int seed) { From dd2bbeb299c63068ef44f4d1e0311d0da96fce95 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Wed, 8 Feb 2023 09:43:21 +0800 Subject: [PATCH 4/6] Update java/src/main/java/ai/rapids/cudf/Table.java Co-authored-by: Nghia Truong --- java/src/main/java/ai/rapids/cudf/Table.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 40ca9bf74f1..0416c484cc6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -4263,8 +4263,8 @@ public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { * @param type the type of hash to use. Depending on the type of hash different restrictions * on the hash column(s) may exist. Not all hash functions are guaranteed to work * besides IDENTITY and MURMUR3. 
- * @param numberOfPartitions - number of partitions to use - * @param seed - the seed of hash algorithm + * @param numberOfPartitions number of partitions to use + * @param seed the seed value for hashing * @return Table that exposes a limited functionality of the * {@link Table} class */ From 2bce857bafe9700016fa5db5fb30d23e651b83d3 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Wed, 8 Feb 2023 01:45:43 +0000 Subject: [PATCH 5/6] Address comments Signed-off-by: Liangcai Li --- java/src/main/native/src/TableJni.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1ad43e6face..0b3ccb59a39 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2661,8 +2661,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition( JNI_NULL_CHECK(env, columns_to_hash, "columns_to_hash is null", NULL); JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL); JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL); - // For simplicity of converting a signed int to a unsigned int - JNI_ARG_CHECK(env, seed >= 0, "seed is negative", NULL); try { cudf::jni::auto_set_device(env); From 929cc586bb111192d238ed1a7d6e8c0a6c7f33d3 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Wed, 8 Feb 2023 10:16:36 +0800 Subject: [PATCH 6/6] Update java/src/main/java/ai/rapids/cudf/Table.java Co-authored-by: Nghia Truong --- java/src/main/java/ai/rapids/cudf/Table.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 0416c484cc6..3ccab70ccda 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -4265,8 +4265,7 @@ public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { * besides IDENTITY and MURMUR3. 
* @param numberOfPartitions number of partitions to use * @param seed the seed value for hashing - * @return Table that exposes a limited functionality of the - * {@link Table} class + * @return Table that exposes a limited functionality of the {@link Table} class */ public PartitionedTable hashPartition(HashType type, int numberOfPartitions, int seed) { int[] partitionOffsets = new int[numberOfPartitions];