From c32725d53ab1f83a2337df3b6c548bf38eeec700 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Feb 2024 15:40:34 -0500
Subject: [PATCH 01/31] Remove offsets_begin() call from
 nvtext::generate_ngrams (#15077)

Removes the `strings_column_view::offsets_begin()` call from
`nvtext::generate_ngrams()`. A future PR will deprecate the
`offsets_begin()` function, which hardcodes the int32 offset type.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15077
---
 cpp/src/text/generate_ngrams.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 433237bbf81..fafb2f18b80 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -103,11 +103,8 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s

   // first create a new offsets vector removing nulls and empty strings from the input column
   std::unique_ptr<cudf::column> non_empty_offsets_column = [&] {
-    cudf::column_view offsets_view(cudf::data_type{cudf::type_id::INT32},
-                                   strings_count + 1,
-                                   strings.offsets_begin(),
-                                   nullptr,
-                                   0);
+    cudf::column_view offsets_view(
+      strings.offsets().type(), strings_count + 1, strings.offsets().head(), nullptr, 0);
     auto table_offsets = cudf::detail::copy_if(
       cudf::table_view({offsets_view}),
       [d_strings, strings_count] __device__(cudf::size_type idx) {

From 1719cda0b18bf3f15426f827fc49e23f0ec3bd40 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Feb 2024 15:41:11 -0500
Subject: [PATCH 02/31] Remove calls to strings_column_view::offsets_begin()
 (#15112)

Removes calls to `cudf::strings_column_view::offsets_begin()` since the result
cannot have a hardcoded integer type. The goal is to deprecate this member
function in this release. Follow-on changes may be required to further enable
large strings support for these functions.
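As a reference for reviewers, here is a minimal sketch of the replacement pattern used
throughout this PR, mirroring the `offsetalator_factory` calls in the diffs below. The
header path and the `scv` variable are illustrative assumptions:

```cpp
#include <cudf/detail/offsets_iterator_factory.cuh>  // assumed detail header

// scv is an assumed cudf::strings_column_view
// Before (to be deprecated): hardcodes int32 offsets
//   auto const offsets = scv.offsets_begin();
// After: type-erased input iterator over the offsets child column
auto const offsets =
  cudf::detail::offsetalator_factory::make_input_iterator(scv.offsets(), scv.offset());
// offsets[i] yields the i-th offset regardless of the underlying offset type
```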
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15112 --- cpp/examples/strings/custom_prealloc.cu | 2 +- cpp/src/transform/row_conversion.cu | 33 +++++++++++++------------ cpp/tests/io/json_type_cast_test.cu | 33 +++++++++++-------------- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 93194899fe1..27b553731f8 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -98,7 +98,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, nvtxRangePushA("redact_strings"); auto const scv = cudf::strings_column_view(names); - auto const offsets = scv.offsets_begin(); + auto const offsets = scv.offsets().begin(); // create working memory to hold the output of each string auto working_memory = rmm::device_uvector(scv.chars_size(stream), stream); diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 361a3610afa..32faa097d0e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,7 @@ struct batch_data { * @return pair of device vector of size_types of the row sizes of the table and a device vector of * offsets into the string column */ -std::pair, rmm::device_uvector> +std::pair, rmm::device_uvector> build_string_row_offsets(table_view const& tbl, size_type fixed_width_and_validity_size, rmm::cuda_stream_view stream) @@ -222,20 +223,20 @@ build_string_row_offsets(table_view const& tbl, thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); auto d_offsets_iterators = [&]() { - std::vector offsets_iterators; - auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } + std::vector offsets_iterators; + auto itr = thrust::make_transform_iterator( + tbl.begin(), [](auto const& col) -> cudf::detail::input_offsetalator { + return cudf::detail::offsetalator_factory::make_input_iterator( + strings_column_view(col).offsets(), col.offset()); }); - std::copy_if(offsets_iter, - offsets_iter + tbl.num_columns(), - std::back_inserter(offsets_iterators), - [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + auto stencil = thrust::make_transform_iterator( + tbl.begin(), [](auto const& col) -> bool { return !is_fixed_width(col.type()); }); + thrust::copy_if(thrust::host, + itr, + itr + tbl.num_columns(), + stencil, + std::back_inserter(offsets_iterators), + thrust::identity{}); return make_device_uvector_sync( offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); @@ -858,7 +859,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, int8_t const** variable_input_data, size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, + cudf::detail::input_offsetalator* variable_col_offsets, size_type fixed_width_row_size, RowOffsetFunctor row_offsets, size_type const batch_row_offset, @@ -1844,7 
+1845,7 @@ std::vector> convert_to_rows( batch_data& batch_info, offsetFunctor offset_functor, column_info_s const& column_info, - std::optional> variable_width_offsets, + std::optional> variable_width_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 8a541022ab0..fe430010f4b 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,8 @@ #include +#include + #include #include #include @@ -43,25 +46,15 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct offsets_to_length { - __device__ cudf::size_type operator()(thrust::tuple const& p) - { - return thrust::get<1>(p) - thrust::get<0>(p); - } -}; /// Returns length of each string in the column auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) { - auto offsets_begin = column.offsets_begin(); - auto offsets_pair = - thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); rmm::device_uvector svs_length(column.size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - offsets_pair, - offsets_pair + column.size(), - svs_length.begin(), - offsets_to_length{}); + auto itr = + cudf::detail::offsetalator_factory::make_input_iterator(column.offsets(), column.offset()); + thrust::adjacent_difference( + rmm::exec_policy(stream), itr + 1, itr + column.size() + 1, svs_length.begin()); return svs_length; } } // namespace @@ -96,7 +89,8 @@ TEST_F(JSONTypeCastTest, String) auto str_col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -129,7 +123,8 @@ TEST_F(JSONTypeCastTest, Int) auto col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -169,7 +164,8 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -238,7 +234,8 @@ TEST_F(JSONTypeCastTest, ErrorNulls) auto str_col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), From ab2eb58be36e1140157e61aa65838670d97820b7 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 28 Feb 2024 08:49:44 -0600 Subject: [PATCH 03/31] Add java option to keep quotes for JSON reads (#15146) Plumbs through the option to enable returning quotes with strings when reading JSON. 
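For context, the new Java option maps onto an existing libcudf reader option. A rough
sketch of the underlying call, based on the builder chain in the `TableJni.cpp` changes
below (`buffer` and `length` are placeholder names for an in-memory JSON source):

```cpp
#include <cudf/io/json.hpp>

// buffer/length are assumed to describe a host buffer holding JSON lines
auto const opts = cudf::io::json_reader_options::builder(
                    cudf::io::source_info{buffer, length})
                    .lines(true)
                    .normalize_single_quotes(true)
                    .keep_quotes(true)  // the option plumbed through here
                    .build();
auto const result = cudf::io::read_json(opts);  // cudf::io::table_with_metadata
```

With `keep_quotes(true)`, string values are returned with their surrounding quotes
intact, which is what the new `withKeepQuotes(true)` Java builder flag requests.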
Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Bradley Dice (https://github.com/bdice) --- .../main/java/ai/rapids/cudf/JSONOptions.java | 17 ++++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 22 ++++++++++++++----- java/src/main/native/src/TableJni.cpp | 19 +++++++++++----- .../test/java/ai/rapids/cudf/TableTest.java | 19 ++++++++++++++++ 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 35165c18c7a..62496e32f7a 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -32,6 +32,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean recoverWithNull; private final boolean normalizeSingleQuotes; private final boolean mixedTypesAsStrings; + private final boolean keepStringQuotes; private JSONOptions(Builder builder) { super(builder); @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) { recoverWithNull = builder.recoverWithNull; normalizeSingleQuotes = builder.normalizeSingleQuotes; mixedTypesAsStrings = builder.mixedTypesAsStrings; + keepStringQuotes = builder.keepQuotes; } public boolean isDayFirst() { @@ -63,6 +65,10 @@ public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } + public boolean keepStringQuotes() { + return keepStringQuotes; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -80,6 +86,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .keep_quotes(keep_quotes) .mixed_types_as_string(mixed_types_as_string); auto result = @@ -1459,7 +1461,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, + jboolean keep_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1481,6 +1484,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .keep_quotes(keep_quotes) .mixed_types_as_string(mixed_types_as_string); auto result = @@ -1569,7 +1573,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) { + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1601,7 +1606,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( .lines(static_cast(lines)) .recovery_mode(recovery_mode) 
.normalize_single_quotes(static_cast(normalize_single_quotes)) - .mixed_types_as_string(mixed_types_as_string); + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1640,7 +1646,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string) { + jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1687,7 +1693,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .mixed_types_as_string(mixed_types_as_string); + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index e270c4a5183..efdb6f4bb1b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -349,6 +349,25 @@ void testReadSingleQuotesJSONFile() throws IOException { } } + @Test + void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(true) + .withKeepQuotes(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("\"TEST\"\"", "\"TESTER'\"") // Note that escapes are also processed + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + From 990ef0f87708c8e3e338b8f0148b0d6d7b6f18c9 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 28 Feb 2024 08:51:00 -0600 Subject: [PATCH 04/31] JNI bindings for distinct_hash_join (#15019) Adds Java bindings to the distinct hash join functionality added in #14990. 
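For context, a sketch of the libcudf API these bindings wrap, following the shapes used
in the `TableJni.cpp` changes below. The non-nested case is shown; the template argument
and defaulted parameters are assumptions against this snapshot of the API:

```cpp
#include <cudf/join.hpp>

// left/right are assumed cudf::table_view join keys; nulleq is a cudf::null_equality
auto const has_nulls = (cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right))
                         ? cudf::nullable_join::YES
                         : cudf::nullable_join::NO;
// Build on the right table (whose keys must be distinct), probe with the left
cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
// inner_join() returns {right map, left map}; the binding swaps them so callers
// see {left map, right map} like every other join
auto maps = hash.inner_join();
```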
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Jim Brennan (https://github.com/jbrennan333) - Nghia Truong (https://github.com/ttnghia) --- java/src/main/java/ai/rapids/cudf/Table.java | 105 +++++++++++++++-- java/src/main/native/src/TableJni.cpp | 28 ++++- .../test/java/ai/rapids/cudf/TableTest.java | 111 +++++++++++++++++- 3 files changed, 231 insertions(+), 13 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 1356c93c64d..c562e08b4c8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -626,6 +626,9 @@ private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, lon private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long[] innerDistinctJoinGatherMaps(long leftKeys, long rightKeys, + boolean compareNullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin) throws CudfException; private static native long[] innerHashJoinGatherMaps(long table, long hashJoin) throws CudfException; @@ -2920,7 +2923,9 @@ private static GatherMap[] buildJoinGatherMaps(long[] gatherMapData) { * the table argument represents the key columns from the right table. Two {@link GatherMap} * instances will be returned that can be used to gather the left and right tables, * respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightKeys join key columns from the right table * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps @@ -2956,7 +2961,9 @@ public long leftJoinRowCount(HashJoin rightHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. * Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightHash hash table built from join key columns from the right table * @return left and right table gather maps */ @@ -2975,11 +2982,15 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. * Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing an output row count that was previously computed from * {@link #leftJoinRowCount(HashJoin)}. + * * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. + * * @param rightHash hash table built from join key columns from the right table * @param outputRowCount number of output rows in the join result * @return left and right table gather maps @@ -3013,7 +3024,9 @@ public long conditionalLeftJoinRowCount(Table rightTable, CompiledExpression con * the columns from the left table, and the table argument represents the columns from the * right table. 
Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightTable the right side table of the join in the join * @param condition conditional expression to evaluate during the join * @return left and right table gather maps @@ -3032,11 +3045,15 @@ public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, * the columns from the left table, and the table argument represents the columns from the * right table. Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing an output row count that was previously computed from * {@link #conditionalLeftJoinRowCount(Table, CompiledExpression)}. + * * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. + * * @param rightTable the right side table of the join in the join * @param condition conditional expression to evaluate during the join * @param outputRowCount number of output rows in the join result @@ -3085,7 +3102,9 @@ public static MixedJoinSize mixedLeftJoinSize(Table leftKeys, Table rightKeys, * assumed to be a logical AND of the equality condition and inequality condition. * Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3112,10 +3131,13 @@ public static GatherMap[] mixedLeftJoinGatherMaps(Table leftKeys, Table rightKey * assumed to be a logical AND of the equality condition and inequality condition. * Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the left join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing the size result from * {@link #mixedLeftJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3145,14 +3167,16 @@ public static GatherMap[] mixedLeftJoinGatherMaps(Table leftKeys, Table rightKey * the table argument represents the key columns from the right table. Two {@link GatherMap} * instances will be returned that can be used to gather the left and right tables, * respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. 
+ * * @param rightKeys join key columns from the right table * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) { if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightKeys.getNumberOfColumns()); } long[] gatherMapData = @@ -3160,6 +3184,30 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the gather maps that can be used to manifest the result of an inner equi-join between + * two tables where the right table is guaranteed to not contain any duplicated join keys. It is + * assumed this table instance holds the key columns from the left table, and the table argument + * represents the key columns from the right table. Two {@link GatherMap} instances will be + * returned that can be used to gather the left and right tables, respectively, to produce the + * result of the inner join. + * + * It is the responsibility of the caller to close the resulting gather map instances. + * + * @param rightKeys join key columns from the right table + * @param compareNullsEqual true if null key values should match otherwise false + * @return left and right table gather maps + */ + public GatherMap[] innerDistinctJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) { + if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) { + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightKeys.getNumberOfColumns()); + } + long[] gatherMapData = + innerDistinctJoinGatherMaps(getNativeView(), rightKeys.getNativeView(), compareNullsEqual); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the number of rows resulting from an inner equi-join between two tables. * @param otherHash hash table built from join key columns from the other table @@ -3167,7 +3215,7 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua */ public long innerJoinRowCount(HashJoin otherHash) { if (getNumberOfColumns() != otherHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "otherKeys: " + otherHash.getNumberOfColumns()); } return innerJoinRowCount(getNativeView(), otherHash.getNativeView()); @@ -3179,13 +3227,15 @@ public long innerJoinRowCount(HashJoin otherHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. * Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. 
+ * * @param rightHash hash table built from join key columns from the right table * @return left and right table gather maps */ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } long[] gatherMapData = innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); @@ -3198,18 +3248,22 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. * Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing an output row count that was previously computed from * {@link #innerJoinRowCount(HashJoin)}. + * * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. + * * @param rightHash hash table built from join key columns from the right table * @param outputRowCount number of output rows in the join result * @return left and right table gather maps */ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) { if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } long[] gatherMapData = innerHashJoinGatherMapsWithCount(getNativeView(), @@ -3237,7 +3291,9 @@ public long conditionalInnerJoinRowCount(Table rightTable, * the columns from the left table, and the table argument represents the columns from the * right table. Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightTable the right side table of the join * @param condition conditional expression to evaluate during the join * @return left and right table gather maps @@ -3256,11 +3312,15 @@ public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, * the columns from the left table, and the table argument represents the columns from the * right table. Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing an output row count that was previously computed from * {@link #conditionalInnerJoinRowCount(Table, CompiledExpression)}. + * * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. 
+ * * @param rightTable the right side table of the join in the join * @param condition conditional expression to evaluate during the join * @param outputRowCount number of output rows in the join result @@ -3309,7 +3369,9 @@ public static MixedJoinSize mixedInnerJoinSize(Table leftKeys, Table rightKeys, * assumed to be a logical AND of the equality condition and inequality condition. * Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3336,10 +3398,13 @@ public static GatherMap[] mixedInnerJoinGatherMaps(Table leftKeys, Table rightKe * assumed to be a logical AND of the equality condition and inequality condition. * Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the inner join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing the size result from * {@link #mixedInnerJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3369,14 +3434,16 @@ public static GatherMap[] mixedInnerJoinGatherMaps(Table leftKeys, Table rightKe * the table argument represents the key columns from the right table. Two {@link GatherMap} * instances will be returned that can be used to gather the left and right tables, * respectively, to produce the result of the full join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightKeys join key columns from the right table * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) { if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightKeys.getNumberOfColumns()); } long[] gatherMapData = @@ -3396,7 +3463,7 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual */ public long fullJoinRowCount(HashJoin rightHash) { if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } return fullJoinRowCount(getNativeView(), rightHash.getNativeView()); @@ -3408,13 +3475,15 @@ public long fullJoinRowCount(HashJoin rightHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. 
* Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the full join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightHash hash table built from join key columns from the right table * @return left and right table gather maps */ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } long[] gatherMapData = fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); @@ -3427,7 +3496,9 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { * the {@link HashJoin} argument has been constructed from the key columns from the right table. * Two {@link GatherMap} instances will be returned that can be used to gather the left and right * tables, respectively, to produce the result of the full join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing an output row count that was previously computed from * {@link #fullJoinRowCount(HashJoin)}. * WARNING: Passing a row count that is smaller than the actual row count will result @@ -3438,7 +3509,7 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { */ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } long[] gatherMapData = fullHashJoinGatherMapsWithCount(getNativeView(), @@ -3452,7 +3523,9 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { * the columns from the left table, and the table argument represents the columns from the * right table. Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the full join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param rightTable the right side table of the join * @param condition conditional expression to evaluate during the join * @return left and right table gather maps @@ -3471,7 +3544,9 @@ public GatherMap[] conditionalFullJoinGatherMaps(Table rightTable, * assumed to be a logical AND of the equality condition and inequality condition. * Two {@link GatherMap} instances will be returned that can be used to gather * the left and right tables, respectively, to produce the result of the full join. + * * It is the responsibility of the caller to close the resulting gather map instances. 
+ * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3512,7 +3587,7 @@ private static GatherMap buildSemiJoinGatherMap(long[] gatherMapData) { */ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqual) { if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightKeys.getNumberOfColumns()); } long[] gatherMapData = @@ -3612,7 +3687,9 @@ public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKey * assumed to be a logical AND of the equality condition and inequality condition. * A {@link GatherMap} instance will be returned that can be used to gather * the left table to produce the result of the left semi join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3639,10 +3716,13 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe * assumed to be a logical AND of the equality condition and inequality condition. * A {@link GatherMap} instance will be returned that can be used to gather * the left table to produce the result of the left semi join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing the size result from * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3679,7 +3759,7 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe */ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqual) { if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + throw new IllegalArgumentException("Column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightKeys.getNumberOfColumns()); } long[] gatherMapData = @@ -3779,7 +3859,9 @@ public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKey * assumed to be a logical AND of the equality condition and inequality condition. * A {@link GatherMap} instance will be returned that can be used to gather * the left table to produce the result of the left anti join. + * * It is the responsibility of the caller to close the resulting gather map instances. 
+ * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition @@ -3806,10 +3888,13 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe * assumed to be a logical AND of the equality condition and inequality condition. * A {@link GatherMap} instance will be returned that can be used to gather * the left table to produce the result of the left anti join. + * * It is the responsibility of the caller to close the resulting gather map instances. + * * This interface allows passing the size result from * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 8585761788e..84f1174fd3f 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -702,9 +702,9 @@ jlongArray gather_maps_to_java(JNIEnv *env, jlongArray gather_map_to_java(JNIEnv *env, std::unique_ptr> map) { // release the underlying device buffer to Java - auto gather_map_buffer = std::make_unique(map->release()); cudf::jni::native_jlongArray result(env, 3); - result[0] = static_cast(gather_map_buffer->size()); + result[0] = static_cast(map->size() * sizeof(cudf::size_type)); + auto gather_map_buffer = std::make_unique(map->release()); result[1] = ptr_as_jlong(gather_map_buffer->data()); result[2] = release_as_jlong(gather_map_buffer); return result.get_jArray(); @@ -2557,6 +2557,30 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { + return cudf::jni::join_gather_maps( + env, j_left_keys, j_right_keys, compare_nulls_equal, + [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) { + auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) ? + cudf::nullable_join::YES : + cudf::nullable_join::NO; + std::pair>, + std::unique_ptr>> + maps; + if (cudf::detail::has_nested_columns(right)) { + cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); + maps = hash.inner_join(); + } else { + cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); + maps = hash.inner_join(); + } + // Unique join returns {right map, left map} but all the other joins + // return {left map, right map}. Swap here to make it consistent. 
+ return std::make_pair(std::move(maps.second), std::move(maps.first)); + }); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index efdb6f4bb1b..6f0b2b51f4c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -33,7 +33,6 @@ import com.google.common.base.Charsets; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import org.apache.avro.SchemaBuilder; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileReader; @@ -2104,6 +2103,116 @@ void testInnerJoinGatherMapsNulls() { } } + private void checkInnerDistinctJoin(Table leftKeys, Table rightKeys, Table expected, + boolean compareNullsEqual) { + GatherMap[] maps = leftKeys.innerDistinctJoinGatherMaps(rightKeys, compareNullsEqual); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + + @Test + void testInnerDistinctJoinGatherMaps() { + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8, 6).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9, 10) // left + .column(2, 0, 1, 3, 0) // right + .build()) { + checkInnerDistinctJoin(leftKeys, rightKeys, expected, false); + } + } + + @Test + void testInnerDistinctJoinGatherMapsWithNested() { + StructType structType = new StructType(false, + new BasicType(false, DType.STRING), + new BasicType(false, DType.INT32)); + StructData[] leftData = new StructData[]{ + new StructData("abc", 1), + new StructData("xyz", 1), + new StructData("abc", 2), + new StructData("xyz", 2), + new StructData("abc", 1), + new StructData("abc", 3), + new StructData("xyz", 3) + }; + StructData[] rightData = new StructData[]{ + new StructData("abc", 1), + new StructData("xyz", 4), + new StructData("xyz", 2), + new StructData("abc", -1), + }; + try (Table leftKeys = new Table.TestBuilder().column(structType, leftData).build(); + Table rightKeys = new Table.TestBuilder().column(structType, rightData).build(); + Table expected = new Table.TestBuilder() + .column(0, 3, 4) + .column(0, 2, 0) + .build()) { + checkInnerDistinctJoin(leftKeys, rightKeys, expected, false); + } + } + + @Test + void testInnerDistinctJoinGatherMapsNullsEqual() { + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .column(1, 0, 0, 2) // right + .build()) { + checkInnerDistinctJoin(leftKeys, rightKeys, expected, true); + } + } + + @Test + void testInnerDistinctJoinGatherMapsWithNestedNullsEqual() { + StructType structType = new StructType(true, + new BasicType(true, DType.STRING), + new BasicType(true, DType.INT32)); + StructData[] leftData = new StructData[]{ + new StructData("abc", 1), + null, + new StructData("xyz", 1), + new StructData("abc", 2), + new StructData("xyz", null), + null, + new StructData("abc", 1), + new StructData("abc", 3), + new StructData("xyz", 3), + new StructData(null, null), + new StructData(null, 1) + }; + StructData[] rightData = new StructData[]{ + null, + new 
StructData("abc", 1), + new StructData("xyz", 4), + new StructData("xyz", 2), + new StructData(null, null), + new StructData(null, 2), + new StructData(null, 1), + new StructData("xyz", null), + new StructData("abc", null), + new StructData("abc", -1) + }; + try (Table leftKeys = new Table.TestBuilder().column(structType, leftData).build(); + Table rightKeys = new Table.TestBuilder().column(structType, rightData).build(); + Table expected = new Table.TestBuilder() + .column(0, 1, 4, 5, 6, 9, 10) + .column(1, 0, 7, 0, 1, 4, 6) + .build()) { + checkInnerDistinctJoin(leftKeys, rightKeys, expected, true); + } + } + @Test void testInnerHashJoinGatherMaps() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); From 8526e6d5b21361465d1c72ecbea64d3d2d9bf849 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 28 Feb 2024 09:55:50 -0600 Subject: [PATCH 05/31] Drop python-snappy from dependencies. (#15161) Previously `python-snappy` was a test dependency. It does not appear that we rely on this directly, as there are no instances of `import snappy`. Recently, pandas also dropped this dependency: https://github.com/pandas-dev/pandas/pull/54633 More generally, we can refactor the dependency list to use `pandas[all]` now that we require pandas 2. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-122_arch-x86_64.yaml | 1 - dependencies.yaml | 46 +------------------ python/cudf/pyproject.toml | 44 +----------------- 4 files changed, 3 insertions(+), 89 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dc78bf68dda..79b786fe012 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -79,7 +79,6 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 - rapids-dask-dependency==24.4.* diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 627cfa7667c..66a4ee57238 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -77,7 +77,6 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 - rapids-dask-dependency==24.4.* diff --git a/dependencies.yaml b/dependencies.yaml index 4011bd764e1..4281e907862 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -612,7 +612,6 @@ dependencies: - hypothesis - pytest-benchmark - pytest-cases>=3.8.2 - - python-snappy>=0.6.0 - scipy - output_types: conda packages: @@ -712,49 +711,8 @@ dependencies: packages: # dependencies to run pandas tests # https://github.com/pandas-dev/pandas/blob/main/environment.yml - # TODO: When pandas 2.0 is the minimum version, can just specify pandas[all] - - beautifulsoup4 - - blosc - - brotlipy - - boto3 - - botocore>=1.24.21 - - bottleneck - - fastparquet - - flask - - fsspec - - html5lib - - hypothesis - - gcsfs - - ipython - - jinja2 - - lxml - - matplotlib - - moto - - numba - - numexpr - - openpyxl - - odfpy - - py - - 
psycopg2-binary - - pyarrow - - pymysql - - pyreadstat - - pytest-asyncio - - pytest-reportlog - - python-snappy - - pytest-timeout - - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tables - - pandas-gbq - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard + # pandas[all] includes all of the required dependencies + - pandas[all] test_python_cudf_pandas: common: - output_types: pyproject diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ef3b439bdf4..590786f2414 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -62,55 +62,13 @@ test = [ "pytest-cov", "pytest-xdist", "pytest<8", - "python-snappy>=0.6.0", "scipy", "tokenizers==0.13.1", "transformers==4.24.0", "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pandas-tests = [ - "beautifulsoup4", - "blosc", - "boto3", - "botocore>=1.24.21", - "bottleneck", - "brotlipy", - "fastparquet", - "flask", - "fsspec", - "gcsfs", - "html5lib", - "hypothesis", - "ipython", - "jinja2", - "lxml", - "matplotlib", - "moto", - "numba", - "numexpr", - "odfpy", - "openpyxl", - "pandas-gbq", - "psycopg2-binary", - "py", - "pyarrow", - "pymysql", - "pyreadstat", - "pytest-asyncio", - "pytest-reportlog", - "pytest-timeout", - "python-snappy", - "pyxlsb", - "s3fs", - "scipy", - "sqlalchemy", - "tables", - "tabulate", - "xarray", - "xlrd", - "xlsxwriter", - "xlwt", - "zstandard", + "pandas[all]", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. cudf-pandas-tests = [ "ipython", From 896b5bced6597e81f3a9e96e5b6bcc72cb364e68 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:20:22 -0500 Subject: [PATCH 06/31] Compile-time ipow computation with array lookup (#15110) Compile-time ipow() computation with array lookup. Results in up to 8% speed improvement for decimal64 -> double conversions. Improvement is negligible for other conversions but is not worse. New benchmark test will be in a separate PR. Fix fixed_point -> string conversion test. Also fix rounding comments. Closes #9346 Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/15110 --- cpp/include/cudf/fixed_point/fixed_point.hpp | 63 ++++++++++++++------ cpp/include/cudf/round.hpp | 7 ++- cpp/tests/strings/fixed_point_tests.cpp | 5 +- 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index a8a681f181e..542e2b3c5c8 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -82,12 +83,43 @@ constexpr inline auto is_supported_construction_value_type() // Helper functions for `fixed_point` type namespace detail { + /** - * @brief A function for integer exponentiation by squaring + * @brief Recursively computes integer exponentiation * - * https://simple.wikipedia.org/wiki/Exponentiation_by_squaring
- * Note: this is the iterative equivalent of the recursive definition (faster)
- * Quick-bench: http://quick-bench.com/Wg7o7HYQC9FW5M0CO0wQAjSwP_Y + * @note This is intended to be run at compile time + * + * @tparam Rep Representation type for return type + * @tparam Base The base to be exponentiated + * @param exp The exponent to be used for exponentiation + * @return Result of `Base` to the power of `exponent` of type `Rep` + */ +template +CUDF_HOST_DEVICE inline constexpr Rep get_power(int32_t exp) +{ + // Compute power recursively + return (exp > 0) ? Rep(Base) * get_power(exp - 1) : 1; +} + +/** + * @brief Implementation of integer exponentiation by array lookup + * + * @tparam Rep Representation type for return type + * @tparam Base The base to be exponentiated + * @tparam Exponents The exponents for the array entries + * @param exponent The exponent to be used for exponentiation + * @return Result of `Base` to the power of `exponent` of type `Rep` + */ +template +CUDF_HOST_DEVICE inline Rep ipow_impl(int32_t exponent, cuda::std::index_sequence) +{ + // Compute powers at compile time, storing into array + static constexpr Rep powers[] = {get_power(Exponents)...}; + return powers[exponent]; +} + +/** + * @brief A function for integer exponentiation by array lookup * * @tparam Rep Representation type for return type * @tparam Base The base to be exponentiated @@ -102,19 +134,16 @@ template = 0 && "integer exponentiation with negative exponent is not possible."); - if (exponent == 0) { return static_cast(1); } - - auto extra = static_cast(1); - auto square = static_cast(Base); - while (exponent > 1) { - if (exponent & 1 /* odd */) { - extra *= square; - exponent -= 1; - } - exponent /= 2; - square *= square; + if constexpr (Base == numeric::Radix::BASE_2) { + return static_cast(1) << exponent; + } else { // BASE_10 + // Build index sequence for building power array at compile time + static constexpr auto max_exp = cuda::std::numeric_limits::digits10; + static constexpr auto exponents = cuda::std::make_index_sequence{}; + + // Get compile-time result + return ipow_impl(Base)>(exponent, exponents); } - return square * extra; } /** @brief Function that performs a `right shift` scale "times" on the `val` diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index 030d3d42773..ee088628b94 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,8 +32,9 @@ namespace cudf { /** * @brief Different rounding methods for `cudf::round` * - * Info on HALF_UP rounding: https://en.wikipedia.org/wiki/Rounding#Round_half_up - * Info on HALF_EVEN rounding: https://en.wikipedia.org/wiki/Rounding#Round_half_to_even + * Info on HALF_EVEN rounding: https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even + * Info on HALF_UP rounding: https://en.wikipedia.org/wiki/Rounding#Rounding_half_away_from_zero + * Note: HALF_UP means up in MAGNITUDE: Away from zero! Because of how Java and python define it */ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN }; diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 0a1c004d0a0..9205207cc53 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -324,7 +324,8 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) { auto const max = cuda::std::numeric_limits<__int128_t>::max(); - auto const x = numeric::decimal128{max, numeric::scale_type{-10}}; + // Must use scaled_integer, else shift (multiply) is undefined behavior (integer overflow) + auto const x = numeric::decimal128(numeric::scaled_integer{max, numeric::scale_type{-10}}); EXPECT_EQ(static_cast(x), "17014118346046923173168730371.5884105727"); auto const y = numeric::decimal128{max, numeric::scale_type{10}}; From 3adfddcfa2cdac4acb16a50916442763a1d8a78b Mon Sep 17 00:00:00 2001 From: Jim Brennan Date: Wed, 28 Feb 2024 15:24:30 -0600 Subject: [PATCH 07/31] Make HostColumnVector.DataType accessor methods public (#15157) * Make HostColumnVector.DataType accessor methods public Signed-off-by: Jim Brennan * add accessors for StructData * update copyrights --------- Signed-off-by: Jim Brennan --- .../java/ai/rapids/cudf/HostColumnVector.java | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 3e4baf962bc..e64c428ecbb 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1179,12 +1179,12 @@ public final ColumnBuilder appendNull() { private ColumnBuilder append(StructData structData) { assert type.isNestedType(); if (type.equals(DType.STRUCT)) { - if (structData == null || structData.dataRecord == null) { + if (structData == null || structData.isNull()) { return appendNull(); } else { for (int i = 0; i < structData.getNumFields(); i++) { ColumnBuilder childBuilder = childBuilders.get(i); - appendChildOrNull(childBuilder, structData.dataRecord.get(i)); + appendChildOrNull(childBuilder, structData.getField(i)); } endStruct(); } @@ -2077,10 +2077,10 @@ public String toString() { } public static abstract class DataType { - abstract DType getType(); - abstract boolean isNullable(); - abstract DataType getChild(int index); - abstract int getNumChildren(); + public abstract DType getType(); + public abstract boolean isNullable(); + public abstract DataType getChild(int index); + public abstract int getNumChildren(); } public static class ListType extends HostColumnVector.DataType { @@ -2093,17 +2093,17 @@ public ListType(boolean isNullable, DataType child) { } @Override - DType getType() { + public DType getType() { return DType.LIST; } @Override - boolean isNullable() { + public boolean isNullable() { return isNullable; } @Override - HostColumnVector.DataType getChild(int index) { + public HostColumnVector.DataType getChild(int index) { if (index > 0) { return null; } @@ -2111,7 +2111,7 @@ HostColumnVector.DataType getChild(int index) { } @Override - int getNumChildren() { + public int getNumChildren() { return 1; } } @@ -2134,6 +2134,14 @@ public int getNumFields() { return 0; } } + + public boolean isNull() { + return (this.dataRecord == null); + } + + public Object getField(int index) { + return this.dataRecord.get(index); + } } public static class 
StructType extends HostColumnVector.DataType { @@ -2150,22 +2158,22 @@ public StructType(boolean isNullable, DataType... children) { } @Override - DType getType() { + public DType getType() { return DType.STRUCT; } @Override - boolean isNullable() { + public boolean isNullable() { return isNullable; } @Override - HostColumnVector.DataType getChild(int index) { + public HostColumnVector.DataType getChild(int index) { return children.get(index); } @Override - int getNumChildren() { + public int getNumChildren() { return children.size(); } } @@ -2180,22 +2188,22 @@ public BasicType(boolean isNullable, DType type) { } @Override - DType getType() { + public DType getType() { return type; } @Override - boolean isNullable() { + public boolean isNullable() { return isNullable; } @Override - HostColumnVector.DataType getChild(int index) { + public HostColumnVector.DataType getChild(int index) { return null; } @Override - int getNumChildren() { + public int getNumChildren() { return 0; } } From 8507b3dfe44794cd549222598320d9cf25c6e34c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Feb 2024 18:48:50 -0600 Subject: [PATCH 08/31] [ci] update matrix filters for dask-cudf builds (#15174) --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1c68b3504e0..e60c47fae2b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -92,7 +92,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d7f47f628d6..9e11993048f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -128,7 +128,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) build_type: pull-request script: "ci/build_wheel_dask_cudf.sh" wheel-tests-dask-cudf: @@ -136,7 +136,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) build_type: pull-request script: ci/test_wheel_dask_cudf.sh devcontainer: @@ -152,7 +152,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh # pandas-tests: diff 
--git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index da733f51779..e66b2e1f872 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -99,7 +99,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2"))) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} From b670af6b55f03e3d273d5c94ab0988378c1fa907 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 29 Feb 2024 00:38:51 -0600 Subject: [PATCH 09/31] Avoid dict normalization in ``__dask_tokenize__`` (#15187) There are currently [CI failures](https://github.com/rapidsai/cudf/actions/runs/8089269486/job/22105880070?pr=15181#step:7:1050) that seem to be caused by non-deterministic `dict` normalization in `Frame.__dask_tokenize__`. This PR avoids normalizing that dictionary. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15187 --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 79005193b4e..809bdb4e6d1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1958,7 +1958,7 @@ def __dask_tokenize__(self): return [ type(self), - normalize_token(self._dtypes), + str(self._dtypes), normalize_token(self.to_pandas()), ] From f7e486043c30810625fe2d13f5b20d60f90b8d2e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Feb 2024 00:19:15 -0800 Subject: [PATCH 10/31] Enable creation of columns from scalar (#15181) This PR enables creation of pylibcudf columns from scalar values. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15181 --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 2a7215099d5..62a83efa3e2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -1,14 +1,18 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer from cudf._lib.cpp.column.column cimport column, column_contents +from cudf._lib.cpp.column.column_factories cimport make_column_from_scalar +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .types cimport DataType, type_id from .utils cimport int_to_bitmask_ptr, int_to_void_ptr @@ -196,6 +200,28 @@ cdef class Column: children, ) + @staticmethod + def from_scalar(Scalar slr, size_type size): + """Create a Column from a Scalar. + + Parameters + ---------- + slr : Scalar + The scalar to create a column from. + size : size_type + The number of elements in the column. + + Returns + ------- + Column + A Column containing the scalar repeated `size` times. 
+ """ + cdef const scalar* c_scalar = slr.get() + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + return Column.from_libcudf(move(c_result)) + cpdef DataType type(self): """The type of data in the column.""" return self._data_type From 1a3b7890e1f110e93082308546eccbeae8a4784a Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 29 Feb 2024 05:53:40 -0800 Subject: [PATCH 11/31] Dynamically set version in RAPIDS doc builds (#15101) Following up on issue ( https://github.com/rapidsai/build-planning/issues/15 ), drop RAPIDS version hard-coding in doc builds. Authors: - https://github.com/jakirkham Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15101 --- ci/build_docs.sh | 3 ++- ci/release/update-version.sh | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 2b55a9db8af..529eaeae696 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,6 +3,8 @@ set -euo pipefail +export RAPIDS_VERSION_NUMBER="$(rapids-generate-version)" + rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh @@ -27,7 +29,6 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="24.04" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 1186b02f244..811e7825363 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -86,7 +86,6 @@ for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done -sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" From 15f11e10ac76baaac2fd702aab9bdf30dde07d6b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 29 Feb 2024 10:10:07 -0500 Subject: [PATCH 12/31] Remove unneeded script parameters in test_cpp_memcheck.sh (#15158) Fixes error introduced in #14992 in `test_cpp_memcheck.sh` Extra line of parameters removed from the call to `run_cudf_memcheck_ctests.sh` Authors: - David Wendt (https://github.com/davidwendt) - https://github.com/jakirkham Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15158 --- ci/test_cpp_memcheck.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index 0233c2b55f8..fda11c64155 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -8,9 +8,7 @@ source ./ci/test_cpp_common.sh rapids-logger "Memcheck gtests with rmm_mode=cuda" -./ci/run_cudf_memcheck_ctests.sh \ - --gtest_output=xml:"${RAPIDS_TESTS_DIR}${test_name}.xml" \ - && EXITCODE=$? || EXITCODE=$?; +./ci/run_cudf_memcheck_ctests.sh && EXITCODE=$? 
|| EXITCODE=$?; rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} From 50630b2011b37f39d1e9255456153550cf40d470 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 29 Feb 2024 15:13:20 +0000 Subject: [PATCH 13/31] Implement stable version of `cudf::sort` (#15066) Adds an implementation of `cudf::stable_sort`. While here, cleans up a few small issues around stream-passing and memory resource usage in the detail APIs of some of the sort functions. - Closes #15065 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15066 --- cpp/include/cudf/detail/sorting.hpp | 13 ++- cpp/include/cudf/sorting.hpp | 35 +++---- cpp/src/sort/common_sort_impl.cuh | 101 +++++++++++++++++++++ cpp/src/sort/segmented_sort_impl.cuh | 11 +-- cpp/src/sort/sort.cu | 40 ++------ cpp/src/sort/sort_column.cu | 15 +-- cpp/src/sort/sort_column_impl.cuh | 14 +-- cpp/src/sort/sort_impl.cuh | 11 ++- cpp/src/sort/stable_sort.cu | 34 ++++++- cpp/src/sort/stable_sort_column.cu | 15 +-- cpp/tests/sort/stable_sort_tests.cpp | 131 +++++++++++++++------------ 11 files changed, 274 insertions(+), 146 deletions(-) create mode 100644 cpp/src/sort/common_sort_impl.cuh diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 8f92b66d5fa..97cc054da57 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -150,5 +150,16 @@ std::unique_ptr sort(table_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::stable_sort + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
stable_sort(table_view const& values, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index e4e803b2d3c..42bcb5da8e3 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,6 +115,18 @@ std::unique_ptr
sort( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a stable lexicographic sort of the rows of a table + * + * @copydoc cudf::sort + */ +std::unique_ptr
stable_sort( + table_view const& input, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a key-value sort. * @@ -148,26 +160,7 @@ std::unique_ptr
sort_by_key( /** * @brief Performs a key-value stable sort. * - * Creates a new table that reorders the rows of `values` according to the - * lexicographic ordering of the rows of `keys`. - * - * The order of equivalent elements is guaranteed to be preserved. - * - * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. - * - * @param values The table to reorder - * @param keys The table that determines the ordering - * @param column_order The desired order for each column in `keys`. Size must be - * equal to `keys.num_columns()` or empty. If empty, all columns are sorted in - * ascending order. - * @param null_precedence The desired order of a null element compared to other - * elements for each column in `keys`. Size must be equal to - * `keys.num_columns()` or empty. If empty, all columns will be sorted with - * `null_order::BEFORE`. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table's device memory - * @return The reordering of `values` determined by the lexicographic order of - * the rows of `keys`. + * @copydoc cudf::sort_by_key */ std::unique_ptr
stable_sort_by_key( table_view const& values, diff --git a/cpp/src/sort/common_sort_impl.cuh b/cpp/src/sort/common_sort_impl.cuh new file mode 100644 index 00000000000..745e2717304 --- /dev/null +++ b/cpp/src/sort/common_sort_impl.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief The enum specifying which sorting method to use (stable or unstable). + */ +enum class sort_method : bool { STABLE, UNSTABLE }; + +/** + * @brief Functor performs a fast-path, in-place sort on eligible columns + * + * @tparam method Whether to use a stable or unstable sort. + */ +template +struct inplace_column_sort_fn { + /** + * @brief Check if fast-path, in-place sort is available for the given column + * + * @param column to check + * @return true if fast-path sort is available, false otherwise. + */ + static bool is_usable(column_view const& column) + { + return !column.has_nulls() && cudf::is_fixed_width(column.type()) && + !cudf::is_floating_point(column.type()); + } + /** + * @brief Check if fast-path, in-place sort is available for the given table + * + * @param table to check + * @return true if fast-path sort is available, false otherwise. + */ + static bool is_usable(table_view const& table) + { + return table.num_columns() == 1 && is_usable(table.column(0)); + } + + /** + * @brief Fast-path sort a column in place + * + * Precondition, is_usable(column) returned true + * + * @tparam T column data type. + * @param col Column to sort, modified in place. + * @param order Ascending or descending sort order. + * @param stream CUDA stream used for device memory operations and kernel launches + * + */ + template ()>* = nullptr> + void operator()(mutable_column_view& col, order order, rmm::cuda_stream_view stream) const + { + auto const do_sort = [&](auto const cmp) { + if constexpr (method == sort_method::STABLE) { + thrust::stable_sort(rmm::exec_policy(stream), col.begin(), col.end(), cmp); + } else { + thrust::sort(rmm::exec_policy(stream), col.begin(), col.end(), cmp); + } + }; + if (order == order::ASCENDING) { + do_sort(thrust::less()); + } else { + do_sort(thrust::greater()); + } + } + + template ()>* = nullptr> + void operator()(mutable_column_view&, order, rmm::cuda_stream_view) const + { + CUDF_FAIL("Column type must be relationally comparable and fixed-width"); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 5d11bf055f1..796e178fecd 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,6 +14,10 @@ * limitations under the License. */ +#pragma once + +#include "common_sort_impl.cuh" + #include #include #include @@ -29,11 +33,6 @@ namespace cudf { namespace detail { -/** - * @brief The enum specifying which sorting method to use (stable or unstable). - */ -enum class sort_method { STABLE, UNSTABLE }; - /** * @brief Functor performs faster segmented sort on eligible columns */ diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 46edae798d4..adffc06ab93 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "common_sort_impl.cuh" #include "sort_impl.cuh" #include @@ -37,7 +38,7 @@ std::unique_ptr sorted_order(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, stream, mr); + return sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort_by_key(table_view const& values, @@ -61,47 +62,24 @@ std::unique_ptr
sort_by_key(table_view const& values, mr); } -struct inplace_column_sort_fn { - template ()>* = nullptr> - void operator()(mutable_column_view& col, bool ascending, rmm::cuda_stream_view stream) const - { - CUDF_EXPECTS(!col.has_nulls(), "Nulls not supported for in-place sort"); - if (ascending) { - thrust::sort(rmm::exec_policy(stream), col.begin(), col.end(), thrust::less()); - } else { - thrust::sort(rmm::exec_policy(stream), col.begin(), col.end(), thrust::greater()); - } - } - - template ()>* = nullptr> - void operator()(mutable_column_view&, bool, rmm::cuda_stream_view) const - { - CUDF_FAIL("Column type must be relationally comparable and fixed-width"); - } -}; - std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); // fast-path sort conditions: single, non-floating-point, fixed-width column with no nulls - if (input.num_columns() == 1 && !input.column(0).has_nulls() && - cudf::is_fixed_width(input.column(0).type()) && - !cudf::is_floating_point(input.column(0).type())) { - auto output = std::make_unique(input.column(0), stream, mr); - auto view = output->mutable_view(); - bool ascending = (column_order.empty() ? true : column_order.front() == order::ASCENDING); + if (inplace_column_sort_fn::is_usable(input)) { + auto output = std::make_unique(input.column(0), stream, mr); + auto view = output->mutable_view(); + auto order = (column_order.empty() ? order::ASCENDING : column_order.front()); cudf::type_dispatcher( - output->type(), inplace_column_sort_fn{}, view, ascending, stream); + output->type(), inplace_column_sort_fn{}, view, order, stream); std::vector> columns; columns.emplace_back(std::move(output)); return std::make_unique
(std::move(columns)); } - return detail::sort_by_key( - input, input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_by_key(input, input, column_order, null_precedence, stream, mr); } } // namespace detail diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu index 9df04251e93..7db44476988 100644 --- a/cpp/src/sort/sort_column.cu +++ b/cpp/src/sort/sort_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "common_sort_impl.cuh" #include "sort_column_impl.cuh" #include @@ -30,11 +31,11 @@ namespace detail { * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*) */ template <> -std::unique_ptr sorted_order(column_view const& input, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr sorted_order(column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto sorted_indices = cudf::make_numeric_column( data_type(type_to_id()), input.size(), mask_state::UNALLOCATED, stream, mr); @@ -42,7 +43,7 @@ std::unique_ptr sorted_order(column_view const& input, thrust::sequence( rmm::exec_policy(stream), indices_view.begin(), indices_view.end(), 0); cudf::type_dispatcher(input.type(), - column_sorted_order_fn{}, + column_sorted_order_fn{}, input, indices_view, column_order == order::ASCENDING, diff --git a/cpp/src/sort/sort_column_impl.cuh b/cpp/src/sort/sort_column_impl.cuh index 5abc6bdfadf..7af24f22b67 100644 --- a/cpp/src/sort/sort_column_impl.cuh +++ b/cpp/src/sort/sort_column_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include "common_sort_impl.cuh" + #include #include #include @@ -36,7 +38,7 @@ namespace detail { * This API offers fast sorting for primitive types. It cannot handle nested types and will not * consider `NaN` as equivalent to other `NaN`. * - * @tparam stable Whether to use stable sort + * @tparam method Whether to use stable sort * @param input Column to sort. The column data is not modified. * @param column_order Ascending or descending sort order * @param null_precedence How null rows are to be ordered @@ -45,7 +47,7 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory * @return Sorted indices for the input column. */ -template +template std::unique_ptr sorted_order(column_view const& input, order column_order, null_order null_precedence, @@ -78,7 +80,7 @@ struct simple_comparator { null_order null_precedence{}; }; -template +template struct column_sorted_order_fn { /** * @brief Compile time check for allowing faster sort. @@ -121,7 +123,7 @@ struct column_sorted_order_fn { auto const do_sort = [&](auto const comp) { // Compiling `thrust::*sort*` APIs is expensive. // Thus, we should optimize that by using constexpr condition to only compile what we need. 
- if constexpr (stable) { + if constexpr (method == sort_method::STABLE) { thrust::stable_sort_by_key(rmm::exec_policy(stream), d_col.begin(), d_col.end(), @@ -165,7 +167,7 @@ struct column_sorted_order_fn { auto comp = simple_comparator{*keys, input.has_nulls(), ascending, null_precedence}; // Compiling `thrust::*sort*` APIs is expensive. // Thus, we should optimize that by using constexpr condition to only compile what we need. - if constexpr (stable) { + if constexpr (method == sort_method::STABLE) { thrust::stable_sort( rmm::exec_policy(stream), indices.begin(), indices.end(), comp); } else { diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index 5fae8db1a70..e0331d65053 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "common_sort_impl.cuh" #include "sort_column_impl.cuh" #include @@ -30,7 +31,7 @@ namespace detail { * @tparam stable Whether to use stable sort * @param stream CUDA stream used for device memory operations and kernel launches */ -template +template std::unique_ptr sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, @@ -39,7 +40,7 @@ std::unique_ptr sorted_order(table_view input, { if (input.num_rows() == 0 or input.num_columns() == 0) { return cudf::make_numeric_column( - data_type(type_to_id()), 0, mask_state::UNALLOCATED, stream); + data_type(type_to_id()), 0, mask_state::UNALLOCATED, stream, mr); } if (not column_order.empty()) { @@ -57,7 +58,7 @@ std::unique_ptr sorted_order(table_view input, auto const single_col = input.column(0); auto const col_order = column_order.empty() ? order::ASCENDING : column_order.front(); auto const null_prec = null_precedence.empty() ? null_order::BEFORE : null_precedence.front(); - return sorted_order(single_col, col_order, null_prec, stream, mr); + return sorted_order(single_col, col_order, null_prec, stream, mr); } std::unique_ptr sorted_indices = cudf::make_numeric_column( @@ -71,7 +72,7 @@ std::unique_ptr sorted_order(table_view input, auto const do_sort = [&](auto const comparator) { // Compiling `thrust::*sort*` APIs is expensive. // Thus, we should optimize that by using constexpr condition to only compile what we need. - if constexpr (stable) { + if constexpr (method == sort_method::STABLE) { thrust::stable_sort(rmm::exec_policy(stream), mutable_indices_view.begin(), mutable_indices_view.end(), diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index cf602dcf1a9..0bfe2cfef16 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "common_sort_impl.cuh" #include "sort_impl.cuh" #include @@ -34,7 +35,26 @@ std::unique_ptr stable_sorted_order(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, stream, mr); + return sorted_order(input, column_order, null_precedence, stream, mr); +} + +std::unique_ptr
stable_sort(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (inplace_column_sort_fn::is_usable(input)) { + auto output = std::make_unique(input.column(0), stream, mr); + auto view = output->mutable_view(); + auto order = (column_order.empty() ? order::ASCENDING : column_order.front()); + cudf::type_dispatcher( + output->type(), inplace_column_sort_fn{}, view, order, stream); + std::vector> columns; + columns.emplace_back(std::move(output)); + return std::make_unique
(std::move(columns)); + } + return detail::stable_sort_by_key(input, input, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_sort_by_key(table_view const& values, @@ -69,6 +89,16 @@ std::unique_ptr stable_sorted_order(table_view const& input, return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr); } +std::unique_ptr
stable_sort(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_sort(input, column_order, null_precedence, stream, mr); +} + std::unique_ptr
stable_sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu index be519ead951..25a6c92034a 100644 --- a/cpp/src/sort/stable_sort_column.cu +++ b/cpp/src/sort/stable_sort_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "common_sort_impl.cuh" #include "sort_column_impl.cuh" #include @@ -30,11 +31,11 @@ namespace detail { * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*) */ template <> -std::unique_ptr sorted_order(column_view const& input, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr sorted_order(column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto sorted_indices = cudf::make_numeric_column( data_type(type_to_id()), input.size(), mask_state::UNALLOCATED, stream, mr); @@ -42,7 +43,7 @@ std::unique_ptr sorted_order(column_view const& input, thrust::sequence( rmm::exec_policy(stream), indices_view.begin(), indices_view.end(), 0); cudf::type_dispatcher(input.type(), - column_sorted_order_fn{}, + column_sorted_order_fn{}, input, indices_view, column_order == order::ASCENDING, diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp index 71520ef007b..341f8317004 100644 --- a/cpp/tests/sort/stable_sort_tests.cpp +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,12 +34,14 @@ void run_stable_sort_test(cudf::table_view input, cudf::column_view expected_sorted_indices, std::vector column_order = {}, - std::vector null_precedence = {}) + std::vector null_precedence = {}, + bool by_key = true) { - auto got_sort_by_key_table = cudf::sort_by_key(input, input, column_order, null_precedence); - auto expected_sort_by_key_table = cudf::gather(input, expected_sorted_indices); + auto got = by_key ? cudf::stable_sort_by_key(input, input, column_order, null_precedence) + : cudf::stable_sort(input, column_order, null_precedence); + auto expected = cudf::gather(input, expected_sorted_indices); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort_by_key_table->view(), got_sort_by_key_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), got->view()); } using TestTypes = cudf::test::Concat col3{{10, 40, 70, 10, 2, 10}, {1, 1, 0, 1, 1, 1}}; cudf::table_view input{{col1, col2, col3}}; - cudf::test::fixed_width_column_wrapper expected{{1, 0, 3, 5, 4, 2}}; std::vector column_order{ cudf::order::ASCENDING, cudf::order::ASCENDING, cudf::order::DESCENDING}; std::vector null_precedence{ cudf::null_order::AFTER, cudf::null_order::AFTER, cudf::null_order::AFTER}; + auto expected = std::is_same_v + // All the bools are true, and therefore don't affect sort order, + // so this is just the sort order of the nullable string column + ? 
cudf::test::fixed_width_column_wrapper{{0, 3, 5, 1, 4, 2}} + : cudf::test::fixed_width_column_wrapper{{1, 0, 3, 5, 4, 2}}; auto got = cudf::stable_sorted_order(input, column_order, null_precedence); - if (not std::is_same_v) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); - - run_stable_sort_test(input, expected, column_order, null_precedence); - } else { - // for bools only validate that the null element landed at the back, since - // the rest of the values are equivalent and yields random sorted order. - auto to_host = [](cudf::column_view const& col) { - thrust::host_vector h_data(col.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault)); - return h_data; - }; - thrust::host_vector h_exp = to_host(expected); - thrust::host_vector h_got = to_host(got->view()); - EXPECT_EQ(h_exp[h_exp.size() - 1], h_got[h_got.size() - 1]); - - cudf::test::fixed_width_column_wrapper expected_for_bool{{0, 3, 5, 1, 4, 2}}; - run_stable_sort_test(input, expected_for_bool, column_order, null_precedence); - } + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + run_stable_sort_test(input, expected, column_order, null_precedence, false); + run_stable_sort_test(input, expected, column_order, null_precedence, true); +} + +TYPED_TEST(StableSort, SingleColumnNoNull) +{ + // This test exercises the "fast-path" single column sort. + using T = TypeParam; + // 0 1 2 3 4 5 6 7 8 9 + cudf::test::fixed_width_column_wrapper col{{7, 1, -2, 5, 1, 0, 1, -2, 0, 5}}; + cudf::table_view input{{col}}; + std::vector column_order{cudf::order::ASCENDING}; + auto expected = + std::is_same_v + ? cudf::test::fixed_width_column_wrapper{{8, 5, 0, 1, 2, 3, 4, 6, 7, 9}} + : std::is_unsigned_v + ? cudf::test::fixed_width_column_wrapper{{5, 8, 1, 4, 6, 3, 9, 0, 2, 7}} + : cudf::test::fixed_width_column_wrapper{{2, 7, 5, 8, 1, 4, 6, 3, 9, 0}}; + run_stable_sort_test(input, expected, column_order, {}, false); + run_stable_sort_test(input, expected, column_order, {}, true); +} + +TYPED_TEST(StableSort, SingleColumnWithNull) +{ + using T = TypeParam; + // 0 1 2 3 4 5 6 7 8 9 + cudf::test::fixed_width_column_wrapper col{{7, 1, -2, 5, 1, 0, 1, -2, 0, 5}, + {1, 1, 0, 0, 1, 0, 1, 0, 1, 0}}; + cudf::table_view input{{col}}; + std::vector column_order{cudf::order::ASCENDING}; + std::vector null_precedence{cudf::null_order::BEFORE}; + auto expected = + std::is_same_v + ? cudf::test::fixed_width_column_wrapper{{5, 2, 3, 7, 9, 8, 0, 1, 4, 6}} + : std::is_unsigned_v + ? cudf::test::fixed_width_column_wrapper{{5, 3, 9, 2, 7, 8, 1, 4, 6, 0}} + : cudf::test::fixed_width_column_wrapper{{2, 7, 5, 3, 9, 8, 1, 4, 6, 0}}; + run_stable_sort_test(input, expected, column_order, {}, false); + run_stable_sort_test(input, expected, column_order, {}, true); } TYPED_TEST(StableSort, WithNullMin) @@ -117,32 +144,19 @@ TYPED_TEST(StableSort, WithNullMin) cudf::test::fixed_width_column_wrapper col3{{10, 40, 70, 10, 2}, {1, 1, 0, 1, 1}}; cudf::table_view input{{col1, col2, col3}}; - cudf::test::fixed_width_column_wrapper expected{{2, 1, 0, 3, 4}}; std::vector column_order{ cudf::order::ASCENDING, cudf::order::ASCENDING, cudf::order::DESCENDING}; + auto expected = std::is_same_v + // All the bools are true, and therefore don't affect sort order, + // so this is just the sort order of the string column + ? 
cudf::test::fixed_width_column_wrapper{{2, 0, 3, 1, 4}} + : cudf::test::fixed_width_column_wrapper{{2, 1, 0, 3, 4}}; + auto got = cudf::stable_sorted_order(input, column_order); - auto got = cudf::stable_sorted_order(input, column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); - if (!std::is_same_v) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); - - run_stable_sort_test(input, expected, column_order); - } else { - // for bools only validate that the null element landed at the front, since - // the rest of the values are equivalent and yields random sorted order. - auto to_host = [](cudf::column_view const& col) { - thrust::host_vector h_data(col.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault)); - return h_data; - }; - thrust::host_vector h_exp = to_host(expected); - thrust::host_vector h_got = to_host(got->view()); - EXPECT_EQ(h_exp.front(), h_got.front()); - - cudf::test::fixed_width_column_wrapper expected_for_bool{{2, 0, 3, 1, 4}}; - run_stable_sort_test(input, expected_for_bool, column_order); - } + run_stable_sort_test(input, expected, column_order, {}, false); + run_stable_sort_test(input, expected, column_order, {}, true); } TYPED_TEST(StableSort, WithAllValid) @@ -154,22 +168,19 @@ TYPED_TEST(StableSort, WithAllValid) cudf::test::fixed_width_column_wrapper col3{{10, 40, 70, 10, 2}}; cudf::table_view input{{col1, col2, col3}}; - cudf::test::fixed_width_column_wrapper expected{{2, 1, 0, 3, 4}}; std::vector column_order{ cudf::order::ASCENDING, cudf::order::ASCENDING, cudf::order::DESCENDING}; + auto expected = std::is_same_v + // All the bools are true, and therefore don't affect sort order, + // so this is just the sort order of the string column + ? cudf::test::fixed_width_column_wrapper{{2, 0, 3, 1, 4}} + : cudf::test::fixed_width_column_wrapper{{2, 1, 0, 3, 4}}; + auto got = cudf::stable_sorted_order(input, column_order); - auto got = cudf::stable_sorted_order(input, column_order); - - // Skip validating bools order. Valid true bools are all - // equivalent, and yield random order after thrust::sort - if (!std::is_same_v) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); - run_stable_sort_test(input, expected, column_order); - } else { - cudf::test::fixed_width_column_wrapper expected_for_bool{{2, 0, 3, 1, 4}}; - run_stable_sort_test(input, expected_for_bool, column_order); - } + run_stable_sort_test(input, expected, column_order, {}, false); + run_stable_sort_test(input, expected, column_order, {}, true); } TYPED_TEST(StableSort, MisMatchInColumnOrderSize) From efc4edfa9dcb30d63379ad23bef23ca330d5bcdf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Feb 2024 09:04:06 -0800 Subject: [PATCH 14/31] Fix memcheck error in distinct inner join (#15164) Closes #15156 Fixes the invalid global read introduced by #14990 and simplifies the logic. 
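For context, the probe kernel buffers its matches in shared memory: each thread flags whether it found a match, a block-wide exclusive scan assigns each matching thread a slot in the buffer, and the buffer is flushed to an atomically reserved range of global memory before it can overflow. A minimal standalone sketch of just that pattern (a hypothetical, simplified kernel launched with a single block; the hash-table probe is stubbed out as a boolean input, so this is not the actual probe kernel):
```
#include <cub/block/block_scan.cuh>

template <int block_size, int buffer_capacity>
__global__ void buffered_write_sketch(bool const* has_match, int n, int* out, int* out_count)
{
  static_assert(buffer_capacity >= block_size, "a flush must always make room");
  using block_scan = cub::BlockScan<int, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;
  __shared__ int buffer[buffer_capacity];
  __shared__ int base;  // start of the output range reserved for the current flush

  int const t     = static_cast<int>(threadIdx.x);
  int buffer_size = 0;
  for (int idx = t; idx - t < n; idx += block_size) {  // whole block iterates together
    bool const match = (idx < n) && has_match[idx];

    // block-wide exclusive scan: `offset` is this thread's slot in the buffer,
    // `block_count` is how many matches the whole block found this iteration
    int offset, block_count;
    block_scan(scan_storage).ExclusiveSum(static_cast<int>(match), offset, block_count);

    if (buffer_size + block_count > buffer_capacity) {  // flush before overflowing
      if (t == 0) { base = atomicAdd(out_count, buffer_size); }
      __syncthreads();
      for (int i = t; i < buffer_size; i += block_size) { out[base + i] = buffer[i]; }
      buffer_size = 0;
      __syncthreads();
    }
    if (match) { buffer[buffer_size + offset] = idx; }
    buffer_size += block_count;
    __syncthreads();
  }
  // flush whatever is left over
  if (t == 0) { base = atomicAdd(out_count, buffer_size); }
  __syncthreads();
  for (int i = t; i < buffer_size; i += block_size) { out[base + i] = buffer[i]; }
}
// e.g. buffered_write_sketch<128, 256><<<1, 128>>>(d_mask, n, d_out, d_count);
```
Note the loop condition is uniform across the block, so every thread reaches the collective `ExclusiveSum` even on the ragged final iteration; this mirrors the `idx - block.thread_rank() < n` condition in the real kernel below.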
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15164 --- cpp/src/join/distinct_hash_join.cu | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index 7c834d1a96b..981a7bf0dea 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -205,18 +205,14 @@ CUDF_KERNEL void distinct_join_probe_kernel(Iter iter, cudf::size_type buffer_size = 0; while (idx - block.thread_rank() < n) { // the whole thread block falls into the same iteration - cudf::size_type thread_count{0}; - cudf::size_type build_idx{0}; - if (idx < n) { - auto const found = hash_table.find(*(iter + idx)); - thread_count = found != hash_table.end(); - build_idx = static_cast(found->second); - } + auto const found = idx < n ? hash_table.find(*(iter + idx)) : hash_table.end(); + auto const has_match = found != hash_table.end(); // Use a whole-block scan to calculate the output location cudf::size_type offset; cudf::size_type block_count; - block_scan(block_scan_temp_storage).ExclusiveSum(thread_count, offset, block_count); + block_scan(block_scan_temp_storage) + .ExclusiveSum(static_cast(has_match), offset, block_count); if (buffer_size + block_count > buffer_capacity) { flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices); @@ -224,8 +220,9 @@ CUDF_KERNEL void distinct_join_probe_kernel(Iter iter, buffer_size = 0; } - if (thread_count == 1) { - buffer[buffer_size + offset] = cuco::pair{build_idx, static_cast(idx)}; + if (has_match) { + buffer[buffer_size + offset] = cuco::pair{static_cast(found->second), + static_cast(idx)}; } buffer_size += block_count; block.sync(); From b7d9335dc716e731c4fa820e77409b2bb0734eb8 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 29 Feb 2024 20:27:52 +0100 Subject: [PATCH 15/31] Document how cuDF is pronounced (#14753) Document in `README.md` and Sphinx landing pages how cuDF is pronounced. People may pronounce cuDF in ways that aren't how it was conceived, such as "see-you-dee-ef". The correct pronunciation is not documented anywhere, so people who have never heard it from someone knowledgeable have no way to know for sure; it should therefore be clearly documented. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14753 --- README.md | 5 +++-- docs/cudf/source/index.rst | 12 ++++++------ docs/dask_cudf/source/index.rst | 5 +++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a64e39452ec..599e194bc1a 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,9 @@ ## 📢 cuDF can now be used as a no-code-change accelerator for pandas! To learn more, see [here](https://rapids.ai/cudf-pandas/)! -cuDF is a GPU DataFrame library for loading joining, aggregating, -filtering, and otherwise manipulating data. cuDF leverages +cuDF (pronounced "KOO-dee-eff") is a GPU DataFrame library +for loading, joining, aggregating, filtering, and otherwise +manipulating data.
cuDF leverages [libcudf](https://docs.rapids.ai/api/libcudf/stable/), a blazing-fast C++/CUDA dataframe library and the [Apache Arrow](https://arrow.apache.org/) columnar format to provide a diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 3765b560a7f..3b8dfa5fe01 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -5,12 +5,12 @@ Welcome to the cuDF documentation! :width: 300px :align: center -**cuDF** is a Python GPU DataFrame library (built on the `Apache Arrow -`_ columnar memory format) for loading, joining, -aggregating, filtering, and otherwise manipulating data. cuDF also provides a -pandas-like API that will be familiar to data engineers & data scientists, so -they can use it to easily accelerate their workflows without going into -the details of CUDA programming. +**cuDF** (pronounced "KOO-dee-eff") is a Python GPU DataFrame library (built +on the `Apache Arrow `_ columnar memory format) +for loading, joining, aggregating, filtering, and otherwise manipulating data. +cuDF also provides a pandas-like API that will be familiar to data engineers +& data scientists, so they can use it to easily accelerate their workflows +without going into the details of CUDA programming. ``cudf.pandas`` is built on cuDF and accelerates pandas code on the GPU. It supports 100% of the pandas API, using the GPU for diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 0442ab0929a..9a216690384 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -6,8 +6,9 @@ Welcome to dask-cudf's documentation! ===================================== -Dask-cuDF is an extension library for the `Dask `__ -parallel computing framework that provides a `cuDF +**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +library for the `Dask `__ parallel computing +framework that provides a `cuDF `__-backed distributed dataframe with the same API as `Dask dataframes `__. From 08e3c96e482ead102cd06b99a0bbdfef2735e0bd Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 29 Feb 2024 12:01:53 -0800 Subject: [PATCH 16/31] Eliminate duplicate allocation of nested string columns (#15061) Issue https://github.com/rapidsai/cudf/issues/14965 Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/15061 --- cpp/src/io/parquet/reader_impl.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 26d810a3337..93fc6bd6bb5 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -60,7 +60,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_sizes(_input_columns.size(), 0L); + std::vector col_string_sizes(_input_columns.size(), 0L); if (has_strings) { ComputePageStringSizes(subpass.pages, pass.chunks, @@ -71,10 +71,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) kernel_mask, _stream); - col_sizes = calculate_page_string_offsets(); + col_string_sizes = calculate_page_string_offsets(); // check for overflow - if (std::any_of(col_sizes.cbegin(), col_sizes.cend(), [](size_t sz) { + if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](std::size_t sz) { return sz > std::numeric_limits::max(); })) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); @@ -157,8 +157,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); // only do string buffer for leaf - if (out_buf.string_size() == 0 && col_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream); + if (idx == max_depth - 1 and out_buf.string_size() == 0 and + col_string_sizes[pass.chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -272,21 +273,21 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto const& child = (*cols)[input_col.nesting[l_idx + 1]]; // the final offset for a list at level N is the size of it's child - int const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + size_type const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), &offset, - sizeof(offset), + sizeof(size_type), cudaMemcpyDefault, _stream.value())); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column - size_type const sz = static_cast(col_sizes[idx]); - cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value()); + auto const sz = static_cast(col_string_sizes[idx]); + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, + &sz, + sizeof(size_type), + cudaMemcpyDefault, + _stream.value())); } } } From a9e41e73505876b171ca620c52a8638dae9896fd Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 29 Feb 2024 14:07:29 -0600 Subject: [PATCH 17/31] Performance optimizations for parquet sub-rowgroup reader. (#15020) This PR implements a basket of optimizations for the parquet reader to bring non-chunked reads close to par following the merge of the sub-rowgroup reader. The primary culprit for the performance hit was that in the case where we perform no splits, we were making a full copy of all of the pages into the subpass struct (including a pinned memory allocation). This is unnecessary because we can just represent the pages in the subpass as a span that wraps the existing pages in the pass. In addition, several `hostdevice_vector`s used for work that could be done entirely device-side were converted to `rmm::device_uvector`. 
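To illustrate the no-copy subpass idea described above, here is a rough sketch (hypothetical simplified types and an assumed `filter_pages` helper, not the reader's actual structures; it also assumes `cudf::host_span`'s implicit container constructors):
```
#include <cudf/utilities/span.hpp>

#include <vector>

// hypothetical stand-ins for the reader's internal page bookkeeping
struct page_info { /* ... */ };
std::vector<page_info> filter_pages(std::vector<page_info> const& pages);  // assumed helper

struct subpass_data {
  std::vector<page_info> owned_pages;      // populated only when the pass is split
  cudf::host_span<page_info const> pages;  // the view all downstream code iterates
};

void init_subpass(subpass_data& subpass, std::vector<page_info>& pass_pages, bool has_splits)
{
  if (has_splits) {
    // splitting: materialize (and own) just the pages this subpass needs
    subpass.owned_pages = filter_pages(pass_pages);
    subpass.pages       = subpass.owned_pages;
  } else {
    // no splits: a zero-copy view over the pass's pages; no duplicate (pinned) allocation
    subpass.pages = pass_pages;
  }
}
```
Downstream code only ever sees the span, so the split and no-split cases look identical past this point.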
Finally, I converted a number of functions that were taking hostdevice_vectors to use spans instead and added some missing operators to the `hostdevice_vector` class itself. This PR doesn't recover all the time (there is some new work that we have to do in all cases) but it takes out most of the sting. A sample of some of the benchmarks that were most notably affected:
```
                          Original Time   Sub-rowgroup-implementation   This PR
parquet_read_decode
  Int, device buffer 0    29260860778     26373181343                   28121328587
  Int, device buffer 1    30692134492     27474241282                   29495189226
parquet_read_chunks
  Int, device buffer      33895028252     29986276949                   32293548191
  Float, device buffer    57055985251     49640274260                   55795392897
```
Authors: - https://github.com/nvdbaranec Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15020 --- cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/page_data.cu | 8 +- cpp/src/io/parquet/page_delta_decode.cu | 12 +- cpp/src/io/parquet/page_hdr.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 24 +-- cpp/src/io/parquet/parquet_gpu.hpp | 32 ++- cpp/src/io/parquet/reader_impl_chunking.cu | 170 +++++++++++----- cpp/src/io/parquet/reader_impl_chunking.hpp | 18 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 202 +++++++++++-------- cpp/src/io/utilities/hostdevice_span.hpp | 40 +++- 10 files changed, 333 insertions(+), 179 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index fea4777af43..862dedf6200 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -385,8 +385,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) /** * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ -void ComputePageSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void ComputePageSizes(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t min_row, size_t num_rows, bool compute_num_rows, diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 2a9f2d56755..79154851cc7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -609,11 +609,11 @@ struct mask_tform { } // anonymous namespace -uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span pages, rmm::cuda_stream_view stream) { // determine which kernels to invoke - auto mask_iter = thrust::make_transform_iterator(pages.d_begin(), mask_tform{}); + auto mask_iter = thrust::make_transform_iterator(pages.device_begin(), mask_tform{}); return thrust::reduce( rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); } @@ -621,8 +621,8 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector /** * @copydoc cudf::io::parquet::detail::DecodePageData */ -void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void __host__ DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index ebad1434c7f..c68b6a32c8b 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -745,8
+745,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) /** * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ -void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaBinary(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -770,8 +770,8 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, /** * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray */ -void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -795,8 +795,8 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, /** * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray */ -void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index a15ccf328de..0dae0724823 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -396,7 +396,7 @@ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, } num_values = bs->ck.num_values; page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr; - max_num_pages = page_info ? bs->ck.max_num_pages : 0; + max_num_pages = page_info ? (bs->ck.num_data_pages + bs->ck.num_dict_pages) : 0; values_found = 0; __syncwarp(); while (values_found < num_values && bs->cur < bs->end) { diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 5cd8205b4ba..101bd34f09f 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1112,8 +1112,8 @@ struct page_tform_functor { /** * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ -void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void ComputePageStringSizes(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, @@ -1157,7 +1157,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, // check for needed temp space for DELTA_BYTE_ARRAY auto const need_sizes = thrust::any_of( - rmm::exec_policy(stream), pages.d_begin(), pages.d_end(), [] __device__(auto& page) { + rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) { return page.temp_string_size != 0; }); @@ -1165,8 +1165,8 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, // sum up all of the temp_string_sizes auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), - pages.d_begin(), - pages.d_end(), + pages.device_begin(), + pages.device_end(), page_sizes, 0L, thrust::plus{}); @@ -1175,8 +1175,8 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, // page's chunk of the temp buffer rmm::device_uvector page_string_offsets(pages.size(), stream); 
thrust::transform_exclusive_scan(rmm::exec_policy_nosync(stream), - pages.d_begin(), - pages.d_end(), + pages.device_begin(), + pages.device_end(), page_string_offsets.begin(), page_sizes, 0L, @@ -1187,10 +1187,10 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, // now use the offsets array to set each page's temp_string_buf pointers thrust::transform(rmm::exec_policy_nosync(stream), - pages.d_begin(), - pages.d_end(), + pages.device_begin(), + pages.device_end(), page_string_offsets.begin(), - pages.d_begin(), + pages.device_begin(), page_tform_functor{temp_string_buf.data()}); } } @@ -1198,8 +1198,8 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, /** * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ -void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void __host__ DecodeStringPageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 64e1c199779..86d6ec42c04 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -388,7 +388,6 @@ struct ColumnChunkDesc { level_bits{def_level_bits_, rep_level_bits_}, num_data_pages(0), num_dict_pages(0), - max_num_pages(0), dict_page(nullptr), str_dict_index(nullptr), valid_map_base{nullptr}, @@ -417,7 +416,6 @@ struct ColumnChunkDesc { level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages - int32_t max_num_pages{}; // size of page_info array PageInfo const* dict_page{}; string_index_pair* str_dict_index{}; // index for string dictionary bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column @@ -644,7 +642,7 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, * @param[in] stream CUDA stream to use * @return Bitwise OR of all page `kernel_mask` values */ -uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_span pages, rmm::cuda_stream_view stream); /** @@ -671,8 +669,8 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector * @param level_type_size Size in bytes of the type for level decoding * @param stream CUDA stream to use */ -void ComputePageSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void ComputePageSizes(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t min_row, size_t num_rows, bool compute_num_rows, @@ -697,8 +695,8 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * @param[in] kernel_mask Mask of kernels to run * @param[in] stream CUDA stream to use */ -void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void ComputePageStringSizes(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, @@ -720,8 +718,8 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageData(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& 
chunks, +void DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -742,8 +740,8 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeStringPageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -764,8 +762,8 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaBinary(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -786,8 +784,8 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, @@ -808,8 +806,8 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, size_t num_rows, size_t min_row, int level_type_size, diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index a7af20f5d7c..b05318d3a91 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -21,6 +21,7 @@ #include "reader_impl_chunking.hpp" #include +#include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -549,8 +551,64 @@ struct get_page_span { } }; +/** + * @brief Return the span of page indices for a given column index + + */ +struct get_page_span_by_column { + cudf::device_span page_offsets; + + __device__ page_span operator()(size_t i) const + { + return {static_cast(page_offsets[i]), static_cast(page_offsets[i + 1])}; + } +}; + +/** + * @brief Return the size of a span + * + */ struct get_span_size { - __device__ size_t operator()(page_span const& s) const { return s.end - s.start; } + CUDF_HOST_DEVICE size_t operator()(page_span const& s) const { return s.end - s.start; } +}; + +/** + * @brief Return the size of a span in an array of spans, handling out-of-bounds indices. + * + */ +struct get_span_size_by_index { + cudf::device_span page_indices; + + __device__ size_t operator()(size_t i) const + { + return i >= page_indices.size() ? 0 : page_indices[i].end - page_indices[i].start; + } +}; + +/** + * @brief Copy page from appropriate source location (as defined by page_offsets) to the destination + * location, and store the index mapping. 
+ */ +struct copy_subpass_page { + cudf::device_span src_pages; + cudf::device_span dst_pages; + cudf::device_span page_src_index; + cudf::device_span page_offsets; + cudf::device_span page_indices; + + __device__ void operator()(size_t i) const + { + auto const index = + thrust::lower_bound(thrust::seq, page_offsets.begin(), page_offsets.end(), i) - + page_offsets.begin(); + auto const col_index = page_offsets[index] == i ? index : index - 1; + // index within the pages for the column + auto const col_page_index = i - page_offsets[col_index]; + auto const src_page_index = page_indices[col_index].start + col_page_index; + + dst_pages[i] = src_pages[src_page_index]; + page_src_index[i] = src_page_index; + } }; /** @@ -575,7 +633,7 @@ struct get_span_size { * expected memory usage (including scratch space) * */ -std::tuple, size_t, size_t> compute_next_subpass( +std::tuple, size_t, size_t> compute_next_subpass( device_span c_info, device_span pages, device_span page_offsets, @@ -618,9 +676,8 @@ std::tuple, size_t, size_t> compute_next_subpass( size_t const total_pages = thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); - return {cudf::detail::make_std_vector_sync(page_bounds, stream), - total_pages, - h_aggregated_info[end_index].size_bytes - cumulative_size}; + return { + std::move(page_bounds), total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } std::vector compute_page_splits_by_row(device_span c_info, @@ -674,11 +731,13 @@ std::vector compute_page_splits_by_row(device_span const& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_span chunks, + cudf::detail::hostdevice_span pages, bool dict_pages, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t p = 0; p < pages.size(); p++) { if (chunks[pages[p].chunk_idx].codec == codec && @@ -715,8 +774,8 @@ std::vector compute_page_splits_by_row(device_span pages, std::optional expected_row_count, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + // sum row counts for all non-dictionary, non-list columns. other columns will be indicated as 0 rmm::device_uvector row_counts(pages.size(), stream); // worst case: num keys == num pages @@ -1221,7 +1282,9 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds) // if we are doing subpass reading, generate more accurate num_row estimates for list columns. // this helps us to generate more accurate subpass splits. - if (_input_pass_read_limit != 0) { generate_list_column_row_count_estimates(); } + if (pass.has_compressed_data && _input_pass_read_limit != 0) { + generate_list_column_row_count_estimates(); + } #if defined(PARQUET_CHUNK_LOGGING) printf("Pass: row_groups(%'lu), chunks(%'lu), pages(%'lu)\n", @@ -1266,21 +1329,21 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) ? min_subpass_size : _input_pass_read_limit - pass.base_mem_size; + // page_indices is an array of spans where each element N is the + // indices into the pass.pages array that represents the subset of pages + // for column N to use for the subpass. auto [page_indices, total_pages, total_expected_size] = - [&]() -> std::tuple, size_t, size_t> { - // special case: if we contain no compressed data, or if we have no input limit, we can always - // just do 1 subpass since what we already have loaded is all the temporary memory we will ever - // use. 
+ [&]() -> std::tuple, size_t, size_t> { if (!pass.has_compressed_data || _input_pass_read_limit == 0) { - std::vector page_indices; - page_indices.reserve(num_columns); + rmm::device_uvector page_indices( + num_columns, _stream, rmm::mr::get_current_device_resource()); auto iter = thrust::make_counting_iterator(0); - std::transform( - iter, iter + num_columns, std::back_inserter(page_indices), [&](size_t i) -> page_span { - return {static_cast(pass.page_offsets[i]), - static_cast(pass.page_offsets[i + 1])}; - }); - return {page_indices, pass.pages.size(), 0}; + thrust::transform(rmm::exec_policy_nosync(_stream), + iter, + iter + num_columns, + page_indices.begin(), + get_page_span_by_column{pass.page_offsets}); + return {std::move(page_indices), pass.pages.size(), size_t{0}}; } // otherwise we have to look forward and choose a batch of pages @@ -1319,37 +1382,50 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) _stream); }(); - // fill out the subpass struct - subpass.pages = cudf::detail::hostdevice_vector(0, total_pages, _stream); - subpass.page_src_index = - cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); - // copy the appropriate subset of pages from each column - size_t page_count = 0; - for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { - auto const num_column_pages = page_indices[c_idx].end - page_indices[c_idx].start; - subpass.column_page_count.push_back(num_column_pages); - std::copy(pass.pages.begin() + page_indices[c_idx].start, - pass.pages.begin() + page_indices[c_idx].end, - std::back_inserter(subpass.pages)); - - // mapping back to original pages in the pass - thrust::sequence(thrust::host, - subpass.page_src_index.begin() + page_count, - subpass.page_src_index.begin() + page_count + num_column_pages, - page_indices[c_idx].start); - page_count += num_column_pages; + // check to see if we are processing the entire pass (enabling us to skip a bunch of work) + subpass.single_subpass = total_pages == pass.pages.size(); + + // in the single pass case, no page copying is necessary - just use what's in the pass itself + if (subpass.single_subpass) { + subpass.pages = pass.pages; + } + // copy the appropriate subset of pages from each column and store the mapping back to the source + // (pass) pages + else { + subpass.page_buf = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + subpass.page_src_index = rmm::device_uvector(total_pages, _stream); + auto iter = thrust::make_counting_iterator(0); + rmm::device_uvector dst_offsets(num_columns + 1, _stream); + thrust::transform_exclusive_scan(rmm::exec_policy_nosync(_stream), + iter, + iter + num_columns + 1, + dst_offsets.begin(), + get_span_size_by_index{page_indices}, + 0, + thrust::plus{}); + thrust::for_each( + rmm::exec_policy_nosync(_stream), + iter, + iter + total_pages, + copy_subpass_page{ + pass.pages, subpass.page_buf, subpass.page_src_index, dst_offsets, page_indices}); + subpass.pages = subpass.page_buf; } - // print_hostdevice_vector(subpass.page_src_index); + + std::vector h_spans = cudf::detail::make_std_vector_async(page_indices, _stream); + subpass.pages.device_to_host_async(_stream); + + _stream.synchronize(); + + subpass.column_page_count = std::vector(num_columns); + std::transform( + h_spans.begin(), h_spans.end(), subpass.column_page_count.begin(), get_span_size{}); // decompress the data for the pages in this subpass. 
if (pass.has_compressed_data) { subpass.decomp_page_data = decompress_page_data(pass.chunks, subpass.pages, false, _stream); } - subpass.pages.host_to_device_async(_stream); - subpass.page_src_index.host_to_device_async(_stream); - _stream.synchronize(); - // buffers needed by the decode kernels { // nesting information (sizes, etc) stored -per page- @@ -1541,7 +1617,7 @@ void reader::impl::compute_output_chunks_for_subpass() // generate row_indices and cumulative output sizes for all pages rmm::device_uvector c_info(subpass.pages.size(), _stream); auto page_input = - thrust::make_transform_iterator(subpass.pages.d_begin(), get_page_output_size{}); + thrust::make_transform_iterator(subpass.pages.device_begin(), get_page_output_size{}); auto page_keys = make_page_key_iterator(subpass.pages); thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys, diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index a9cf0e94ec8..b959c793011 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -69,9 +69,17 @@ struct subpass_intermediate_data { rmm::device_buffer decomp_page_data; rmm::device_buffer level_decode_data{}; - cudf::detail::hostdevice_vector pages{}; + cudf::detail::hostdevice_span pages{}; + + // optimization. if the single_subpass flag is set, it means we will only be doing + // one subpass for the entire pass. this allows us to skip various pieces of work + // during processing. notably, page_buf will not be allocated to hold a compacted + // copy of the pages specific to the subpass. + bool single_subpass{false}; + cudf::detail::hostdevice_vector page_buf{}; + // for each page in the subpass, the index of our source page in the pass - cudf::detail::hostdevice_vector page_src_index{}; + rmm::device_uvector page_src_index{0, cudf::get_default_stream()}; // for each column in the file (indexed by _input_columns.size()) // the number of associated pages for this subpass std::vector column_page_count; @@ -111,10 +119,10 @@ struct pass_intermediate_data { // 1 1 1 1 1 2 2 2 // // page_offsets would be 0, 5, 8 - cudf::detail::hostdevice_vector page_offsets{}; + rmm::device_uvector page_offsets{0, cudf::get_default_stream()}; - rmm::device_buffer decomp_dict_data{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; + rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; int level_type_size{0}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 48ff32038b3..c524547c4d7 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,6 +18,7 @@ #include "reader_impl.hpp" #include +#include #include #include @@ -37,6 +38,7 @@ #include #include #include +#include #include #include @@ -350,6 +352,7 @@ std::string encoding_to_string(Encoding encoding) } return result; } + /** * @brief Create a readable string for the user that will list out all unsupported encodings found. 
* @@ -368,6 +371,73 @@ std::string encoding_to_string(Encoding encoding) return encoding_bitmask_to_str(unsupported); } +/** + * @brief Sort pages in chunk/schema order + * + * @param unsorted_pages The unsorted pages + * @param chunks The chunks associated with the pages + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns The sorted vector of pages + */ +cudf::detail::hostdevice_vector sort_pages(device_span unsorted_pages, + device_span chunks, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // sort the pages in chunk/schema order. we use chunk.src_col_index instead of + // chunk.src_col_schema because the user may have reordered them (reading columns, "a" and "b" but + // returning them as "b" and "a") + // + // ordering of pages is by input column schema, repeated across row groups. so + // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like + // + // 1, 1, 2, 2, 3, 3 + // + // However, if we had more than one row group, the pattern would be + // + // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 + // ^ row group 0 | + // ^ row group 1 + // + // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually + // want is + // + // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // We also need to preserve key-relative page ordering, so we need to use a stable sort. + rmm::device_uvector page_keys{unsorted_pages.size(), stream}; + thrust::transform( + rmm::exec_policy_nosync(stream), + unsorted_pages.begin(), + unsorted_pages.end(), + page_keys.begin(), + cuda::proclaim_return_type([chunks = chunks.begin()] __device__(PageInfo const& page) { + return chunks[page.chunk_idx].src_col_index; + })); + // we are doing this by sorting indices first and then transforming the output because nvcc + // started generating kernels using too much shared memory when trying to sort the pages + // directly. + rmm::device_uvector sort_indices(unsorted_pages.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), sort_indices.begin(), sort_indices.end(), 0); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + page_keys.begin(), + page_keys.end(), + sort_indices.begin(), + thrust::less()); + auto pass_pages = + cudf::detail::hostdevice_vector(unsorted_pages.size(), unsorted_pages.size(), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + sort_indices.begin(), + sort_indices.end(), + pass_pages.d_begin(), + cuda::proclaim_return_type([unsorted_pages = unsorted_pages.begin()] __device__( + int32_t i) { return unsorted_pages[i]; })); + stream.synchronize(); + return pass_pages; +} + /** * @brief Decode the page information for a given pass. * @@ -377,21 +447,35 @@ void decode_page_headers(pass_intermediate_data& pass, device_span unsorted_pages, rmm::cuda_stream_view stream) { - cudf::detail::hostdevice_vector chunk_page_info(pass.chunks.size(), stream); - - // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), - // please update preprocess_nested_columns to reflect this. 
- for (size_t c = 0, page_count = 0; c < pass.chunks.size(); c++) { - pass.chunks[c].max_num_pages = pass.chunks[c].num_data_pages + pass.chunks[c].num_dict_pages; - chunk_page_info[c].pages = &unsorted_pages[page_count]; - page_count += pass.chunks[c].max_num_pages; - } + CUDF_FUNC_RANGE(); + + auto iter = thrust::make_counting_iterator(0); + rmm::device_uvector chunk_page_counts(pass.chunks.size() + 1, stream); + thrust::transform_exclusive_scan( + rmm::exec_policy_nosync(stream), + iter, + iter + pass.chunks.size() + 1, + chunk_page_counts.begin(), + cuda::proclaim_return_type( + [chunks = pass.chunks.d_begin(), num_chunks = pass.chunks.size()] __device__(size_t i) { + return static_cast( + i >= num_chunks ? 0 : chunks[i].num_data_pages + chunks[i].num_dict_pages); + }), + 0, + thrust::plus{}); + rmm::device_uvector d_chunk_page_info(pass.chunks.size(), stream); + thrust::for_each(rmm::exec_policy_nosync(stream), + iter, + iter + pass.chunks.size(), + [cpi = d_chunk_page_info.begin(), + chunk_page_counts = chunk_page_counts.begin(), + unsorted_pages = unsorted_pages.begin()] __device__(size_t i) { + cpi[i].pages = &unsorted_pages[chunk_page_counts[i]]; + }); kernel_error error_code(stream); - pass.chunks.host_to_device_async(stream); - chunk_page_info.host_to_device_async(stream); - DecodePageHeaders(pass.chunks.device_ptr(), - chunk_page_info.device_ptr(), + DecodePageHeaders(pass.chunks.d_begin(), + d_chunk_page_info.begin(), pass.chunks.size(), error_code.data(), stream); @@ -421,56 +505,8 @@ void decode_page_headers(pass_intermediate_data& pass, thrust::maximum()); pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - // sort the pages in chunk/schema order. we use chunk.src_col_index instead of - // chunk.src_col_schema because the user may have reordered them (reading columns, "a" and "b" but - // returning them as "b" and "a") - // - // ordering of pages is by input column schema, repeated across row groups. so - // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like - // - // 1, 1, 2, 2, 3, 3 - // - // However, if we had more than one row group, the pattern would be - // - // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 - // ^ row group 0 | - // ^ row group 1 - // - // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually - // want is - // - // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 - // - // We also need to preserve key-relative page ordering, so we need to use a stable sort. - { - rmm::device_uvector page_keys{unsorted_pages.size(), stream}; - thrust::transform(rmm::exec_policy_nosync(stream), - unsorted_pages.begin(), - unsorted_pages.end(), - page_keys.begin(), - [chunks = pass.chunks.d_begin()] __device__(PageInfo const& page) { - return chunks[page.chunk_idx].src_col_index; - }); - // we are doing this by sorting indices first and then transforming the output because nvcc - // started generating kernels using too much shared memory when trying to sort the pages - // directly. 
- rmm::device_uvector sort_indices(unsorted_pages.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), sort_indices.begin(), sort_indices.end(), 0); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - page_keys.begin(), - page_keys.end(), - sort_indices.begin(), - thrust::less()); - pass.pages = cudf::detail::hostdevice_vector( - unsorted_pages.size(), unsorted_pages.size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - sort_indices.begin(), - sort_indices.end(), - pass.pages.d_begin(), - [unsorted_pages = unsorted_pages.begin()] __device__(int32_t i) { - return unsorted_pages[i]; - }); - } + // sort the pages in chunk/schema order. + pass.pages = sort_pages(unsorted_pages, pass.chunks, stream); // compute offsets to each group of input pages. // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 @@ -486,11 +522,11 @@ void decode_page_headers(pass_intermediate_data& pass, page_counts.begin()) .second; auto const num_page_counts = page_counts_end - page_counts.begin(); - pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, stream); + pass.page_offsets = rmm::device_uvector(num_page_counts + 1, stream); thrust::exclusive_scan(rmm::exec_policy_nosync(stream), page_counts.begin(), page_counts.begin() + num_page_counts + 1, - pass.page_offsets.d_begin()); + pass.page_offsets.begin()); // setup dict_page for each chunk if necessary thrust::for_each(rmm::exec_policy_nosync(stream), @@ -502,7 +538,6 @@ void decode_page_headers(pass_intermediate_data& pass, } }); - pass.page_offsets.device_to_host_async(stream); pass.pages.device_to_host_async(stream); pass.chunks.device_to_host_async(stream); stream.synchronize(); @@ -589,6 +624,8 @@ struct set_final_row_count { void reader::impl::build_string_dict_indices() { + CUDF_FUNC_RANGE(); + auto& pass = *_pass_itm_data; // compute number of indices per chunk and a summed total @@ -1229,12 +1266,16 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t _stream); } - // copy our now-correct row counts back to the base pages stored in the pass. auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy_nosync(_stream), - iter, - iter + subpass.pages.size(), - update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); + + // copy our now-correct row counts back to the base pages stored in the pass. + // only need to do this if we are not processing the whole pass in one subpass + if (!subpass.single_subpass) { + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + subpass.pages.size(), + update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); + } // computes: // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. 
The start_row @@ -1250,14 +1291,17 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_row_output_iter{pass.pages.device_ptr()}); // copy chunk row into the subpass pages - thrust::for_each(rmm::exec_policy_nosync(_stream), - iter, - iter + subpass.pages.size(), - update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); + // only need to do this if we are not processing the whole pass in one subpass + if (!subpass.single_subpass) { + thrust::for_each(rmm::exec_policy_nosync(_stream), + iter, + iter + subpass.pages.size(), + update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); + } // retrieve pages back pass.pages.device_to_host_async(_stream); - subpass.pages.device_to_host_async(_stream); + if (!subpass.single_subpass) { subpass.pages.device_to_host_async(_stream); } _stream.synchronize(); // at this point we have an accurate row count so we can compute how many rows we will actually be @@ -1382,7 +1426,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses thrust::make_counting_iterator(num_keys), size_input.begin(), get_page_nesting_size{ - d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.d_begin()}); + d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); auto const reduction_keys = cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()}); cudf::detail::hostdevice_vector sizes{_input_columns.size() * max_depth, _stream}; @@ -1402,7 +1446,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses reduction_keys + num_keys, size_input.cbegin(), start_offset_output_iterator{ - subpass.pages.d_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); + subpass.pages.device_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1442,7 +1486,7 @@ std::vector reader::impl::calculate_page_string_offsets() rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); // use page_index to fetch page string sizes in the proper order - auto val_iter = thrust::make_transform_iterator(subpass.pages.d_begin(), + auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), page_to_string_size{pass.chunks.d_begin()}); // do scan by key to calculate string offsets for each page diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index 539e8e84e59..ec5e0410bc0 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
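The converting constructors added in the next hunk use the standard array-convertibility constraint: a container is accepted only if its element type matches `T` up to added const. A minimal standalone sketch of the same pattern, with a hypothetical `fake_vector` standing in for `hostdevice_vector` (illustration only, not the cudf class):

```
#include <cstddef>
#include <type_traits>
#include <utility>

template <typename T>
struct hd_span_sketch {
  // Accept any container exposing host_ptr()/device_ptr()/size() whose element
  // type E satisfies E(*)[] -> T(*)[]: the same type, possibly adding const.
  template <typename C,
            std::enable_if_t<std::is_convertible_v<
              std::remove_pointer_t<decltype(std::declval<C&>().host_ptr())> (*)[],
              T (*)[]>>* = nullptr>
  constexpr hd_span_sketch(C& in)
    : host_{in.host_ptr()}, device_{in.device_ptr()}, size_{in.size()}
  {
  }

  T* host_;
  T* device_;
  std::size_t size_;
};

// Stand-in for a hostdevice_vector<int>; device_ptr() is faked with host memory.
struct fake_vector {
  int* host_ptr() { return data_; }
  int* device_ptr() { return data_; }
  std::size_t size() const { return 4; }
  int data_[4]{};
};

int main()
{
  fake_vector v;
  hd_span_sketch<int> s{v};         // ok: int (*)[] -> int (*)[]
  hd_span_sketch<int const> cs{v};  // ok: int (*)[] -> int const (*)[]
  // hd_span_sketch<long> bad{v};   // rejected by the enable_if constraint
  return (s.size_ == cs.size_) ? 0 : 1;
}
```

The array-convertibility test is what lets one container bind to both a mutable span and a span of const elements while still rejecting unrelated element types.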
@@ -33,7 +33,30 @@ class hostdevice_span {
   hostdevice_span(hostdevice_span&&) = default;  ///< Move constructor
   hostdevice_span(T* cpu_data, T* gpu_data, size_t size)
-    : _size(size), _host_data(cpu_data), _device_data(gpu_data)
+    : _size(size), _device_data(gpu_data), _host_data(cpu_data)
   {
   }
+
+  /// Constructor from container
+  /// @param in The container to construct the span from
+  template ().host_ptr())> (*)[],
+               T (*)[]>>* = nullptr>
+  constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size())
+  {
+  }
+
+  /// Constructor from const container
+  /// @param in The container to construct the span from
+  template ().host_ptr())> (*)[],
+               T (*)[]>>* = nullptr>
+  constexpr hostdevice_span(C const& in)
+    : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size())
   {
   }
@@ -50,10 +73,15 @@ class hostdevice_span {
    * @tparam T The device span type.
    * @return A typed device span of the hostdevice view's data.
    */
-  [[nodiscard]] operator cudf::device_span() const
-  {
-    return cudf::device_span(_device_data, size());
-  }
+  [[nodiscard]] operator cudf::device_span() { return {_device_data, size()}; }
+
+  /**
+   * @brief Converts a hostdevice view into a device span of const data.
+   *
+   * @tparam T The device span type.
+   * @return A const typed device span of the hostdevice view's data.
+   */
+  [[nodiscard]] operator cudf::device_span() const { return {_device_data, size()}; }

   /**
    * @brief Returns the underlying device data.

From 200fc0b35216c01235103e491d5217b932670ebc Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Thu, 29 Feb 2024 13:25:35 -0800
Subject: [PATCH 18/31] Use cuco::static_set in the hash-based groupby (#14813)

Depends on https://github.com/rapidsai/cudf/pull/14849

Contributes to #12261

This PR migrates hash groupby to use the new `cuco::static_set` data structure. It doesn't change any existing libcudf behavior, but it uncovers the fact that the cudf Python `value_counts` doesn't guarantee output order; the PR therefore becomes a breaking change.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14813
---
 cpp/benchmarks/groupby/group_max.cpp          |   7 +-
 cpp/benchmarks/groupby/group_struct_keys.cpp  |   9 +-
 cpp/include/cudf/detail/cuco_helpers.hpp      |   5 +
 cpp/src/groupby/hash/groupby.cu               | 123 ++++++++----------
 cpp/src/groupby/hash/groupby_kernels.cuh      |  47 +++----
 cpp/src/groupby/hash/multi_pass_kernels.cuh   |  13 +-
 .../source/user_guide/pandas-comparison.md    |   2 +-
 python/cudf/cudf/core/dataframe.py            |   4 +-
 python/cudf/cudf/core/groupby/groupby.py      |  28 ++--
 python/cudf/cudf/tests/test_groupby.py        |  16 ++-
 10 files changed, 125 insertions(+), 129 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
index e65c37f001d..b7b330f02e5 100644
--- a/cpp/benchmarks/groupby/group_max.cpp
+++ b/cpp/benchmarks/groupby/group_max.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@ */ #include +#include #include @@ -50,9 +51,13 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_max, diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 44a12c1c30e..cadd9c2d137 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include @@ -80,11 +81,15 @@ void bench_groupby_struct_keys(nvbench::state& state) requests[0].aggregations.push_back(cudf::make_min_aggregation()); // Set up nvbench default stream - auto stream = cudf::get_default_stream(); + auto const mem_stats_logger = cudf::memory_stats_logger(); + auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH(bench_groupby_struct_keys) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index 506f6475637..dca5a39bece 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -16,11 +16,16 @@ #pragma once +#include + #include #include namespace cudf::detail { +/// Sentinel value for `cudf::size_type` +static cudf::size_type constexpr CUDF_SIZE_TYPE_SENTINEL = -1; + /// Default load factor for cuco data structures static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5; diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 7b85dd02c10..acc1b087510 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -22,23 +22,19 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include -#include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -49,12 +45,9 @@ #include -#include -#include -#include +#include #include #include -#include #include #include @@ -66,15 +59,12 @@ namespace detail { namespace hash { namespace { -// TODO: replace it with `cuco::static_map` -// https://github.com/rapidsai/cudf/issues/10401 -template -using map_type = concurrent_unordered_map< - cudf::size_type, - cudf::size_type, +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1`for flat data to improve performance +using probing_scheme_type = cuco::linear_probing< + 1, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher, - ComparatorType>; + 
cudf::nullate::DYNAMIC>>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -190,14 +180,14 @@ class groupby_simple_aggregations_collector final } }; -template +template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; data_type result_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - map_type const& map; + SetType set; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -209,7 +199,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -217,7 +207,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), - map(map), + set(set), row_bitmask(row_bitmask), stream(stream), mr(mr) @@ -340,8 +330,8 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor>{ - map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + ::cudf::detail::var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); } @@ -398,13 +388,13 @@ flatten_single_pass_aggs(host_span requests) * * @see groupby_null_templated() */ -template +template void sparse_to_dense_results(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream, @@ -423,7 +413,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -467,11 +457,11 @@ auto create_sparse_results_table(table_view const& flattened_values, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template +template void compute_single_pass_aggs(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, - map_type& map, + SetType set, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream) @@ -494,16 +484,16 @@ void compute_single_pass_aggs(table_view const& keys, ? 
cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first : rmm::device_buffer{}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn>{ - map, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{set, + *d_values, + *d_sparse_table, + d_aggs.data(), + static_cast(row_bitmask.data()), + skip_key_rows_with_nulls}); // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -517,23 +507,15 @@ void compute_single_pass_aggs(table_view const& keys, * @brief Computes and returns a device vector containing all populated keys in * `map`. */ -template -rmm::device_uvector extract_populated_keys(map_type const& map, +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, size_type num_keys, rmm::cuda_stream_view stream) { rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - auto const get_key = cuda::proclaim_return_type::key_type>( - [] __device__(auto const& element) { return element.first; }); // first = key - auto const key_used = [unused = map.get_unused_key()] __device__(auto key) { - return key != unused; - }; - auto const key_itr = thrust::make_transform_iterator(map.data(), get_key); - auto const end_it = cudf::detail::copy_if_safe( - key_itr, key_itr + map.capacity(), populated_keys.begin(), key_used, stream); - - populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream); + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); return populated_keys; } @@ -580,30 +562,33 @@ std::unique_ptr
groupby(table_view const& keys, auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - size_type constexpr unused_key{std::numeric_limits::max()}; - size_type constexpr unused_value{std::numeric_limits::max()}; - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash map + // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - using allocator_type = typename map_type::allocator_type; - - auto const map = map_type::create(compute_hash_table_size(num_keys), - stream, - unused_key, - unused_value, - d_row_hash, - d_key_equal, - allocator_type()); - // Compute all single pass aggs first - compute_single_pass_aggs( - keys, requests, &sparse_results, *map, keys_have_nulls, include_null_keys, stream); + auto const set = cuco::static_set{num_keys, + 0.5, // desired load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_key_equal, + probing_scheme_type{d_row_hash}, + cuco::thread_scope_device, + cuco::storage<1>{}, + cudf::detail::cuco_allocator{stream}, + stream.value()}; - // Extract the populated indices from the hash map and create a gather map. + // Compute all single pass aggs first + compute_single_pass_aggs(keys, + requests, + &sparse_results, + set.ref(cuco::insert_and_find), + keys_have_nulls, + include_null_keys, + stream); + + // Extract the populated indices from the hash set and create a gather map. // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(*map, keys.num_rows(), stream); + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -611,7 +596,7 @@ std::unique_ptr
groupby(table_view const& keys,
                           &sparse_results,
                           cache,
                           gather_map,
-                          *map,
+                          set.ref(cuco::find),
                           keys_have_nulls,
                           include_null_keys,
                           stream,
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 4dfb191480b..9abfe22950a 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -30,42 +30,34 @@ namespace groupby {
 namespace detail {
 namespace hash {
 /**
- * @brief Compute single-pass aggregations and store results into a sparse
- * `output_values` table, and populate `map` with indices of unique keys
+ * @brief Computes single-pass aggregations and stores results into a sparse `output_values` table,
+ * and populates `set` with indices of unique keys
 *
- * The hash map is built by inserting every row `i` from the `keys` and
- * `values` tables as a single (key,value) pair. When the pair is inserted, if
- * the key was not already present in the map, then the corresponding value is
- * simply copied to the output. If the key was already present in the map,
- * then the inserted `values` row is aggregated with the existing row. This
- * aggregation is done for every element `j` in the row by applying aggregation
- * operation `j` between the new and existing element.
+ * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If
+ * the index was not present in the set, insert the index and then copy it to the output. If the
+ * key was already present in the set, then the inserted index is aggregated with the existing row.
+ * This aggregation is done for every element `j` in the row by applying aggregation operation `j`
+ * between the new and existing element.
 *
 * Instead of storing the entire rows from `input_keys` and `input_values` in
- * the hashmap, we instead store the row indices. For example, when inserting
- * row at index `i` from `input_keys` into the hash map, the value `i` is what
- * gets stored for the hash map's "key". It is assumed the `map` was constructed
+ * the hashset, we instead store the row indices. For example, when inserting
+ * row at index `i` from `input_keys` into the hash set, the value `i` is what
+ * gets stored for the hash set's "key". It is assumed the `set` was constructed
 * with a custom comparator that uses these row indices to check for equality
 * between key rows. For example, comparing two keys `k0` and `k1` will compare
 * the two rows `input_keys[k0] ?= input_keys[k1]`
 *
- * Likewise, we store the row indices for the hash maps "values". These indices
- * index into the `output_values` table. For a given key `k` (which is an index
- * into `input_keys`), the corresponding value `v` indexes into `output_values`
- * and stores the result of aggregating rows from `input_values` from rows of
- * `input_keys` equivalent to the row at `k`.
- *
 * The exact size of the result is not known a priori, but can be upper bounded
 * by the number of rows in `input_keys` & `input_values`. Therefore, it is
 * assumed `output_values` has sufficient storage for an equivalent number of
 * rows. In this way, after all rows are aggregated, `output_values` will likely
 * be "sparse", meaning that not all rows contain the result of an aggregation.
* - * @tparam Map The type of the hash map + * @tparam SetType The type of the hash set device ref */ -template +template struct compute_single_pass_aggs_fn { - Map map; + SetType set; table_device_view input_values; mutable_table_device_view output_values; aggregation::Kind const* __restrict__ aggs; @@ -75,9 +67,9 @@ struct compute_single_pass_aggs_fn { /** * @brief Construct a new compute_single_pass_aggs_fn functor object * - * @param map Hash map object to insert key,value pairs into. + * @param set_ref Hash set object to insert key,value pairs into. * @param input_values The table whose rows will be aggregated in the values - * of the hash map + * of the hash set * @param output_values Table that stores the results of aggregating rows of * `input_values`. * @param aggs The set of aggregation operations to perform across the @@ -88,13 +80,13 @@ struct compute_single_pass_aggs_fn { * null values should be skipped. It `true`, it is assumed `row_bitmask` is a * bitmask where bit `i` indicates the presence of a null value in row `i`. */ - compute_single_pass_aggs_fn(Map map, + compute_single_pass_aggs_fn(SetType set, table_device_view input_values, mutable_table_device_view output_values, aggregation::Kind const* aggs, bitmask_type const* row_bitmask, bool skip_rows_with_nulls) - : map(map), + : set(set), input_values(input_values), output_values(output_values), aggs(aggs), @@ -106,10 +98,9 @@ struct compute_single_pass_aggs_fn { __device__ void operator()(size_type i) { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { - auto result = map.insert(thrust::make_pair(i, i)); + auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row( - output_values, result.first->second, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 4bc73631732..7043eafdc10 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
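The hunk below applies the same access-pattern change as `groupby_kernels.cuh` above: the pair-based `map.find(i)->second` becomes `*set.find(i)`, and `map.insert(pair).first->second` becomes `*set.insert_and_find(i).first`, because the set stores the row index itself rather than a (key, value) pair. A host-side analogue, using `std::unordered_set` purely as a stand-in for the device-side `cuco::static_set` refs (illustration only, not cudf code):

```
#include <cassert>
#include <unordered_set>

int main()
{
  std::unordered_set<int> set;

  // insert-and-find: returns an iterator to the representative element,
  // whether or not the value was newly inserted (cf. compute_single_pass_aggs_fn).
  auto result      = set.insert(42);
  int target_index = *result.first;  // previously: result.first->second with the pair-based map
  assert(target_index == 42);

  // find: dereferencing the iterator yields the stored row index directly
  // (cf. var_hash_functor's `auto const target_index = *set.find(source_index)`).
  assert(*set.find(42) == 42);
  return 0;
}
```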
@@ -31,23 +31,23 @@ namespace cudf { namespace detail { -template +template struct var_hash_functor { - Map const map; + SetType set; bitmask_type const* __restrict__ row_bitmask; mutable_column_device_view target; column_device_view source; column_device_view sum; column_device_view count; size_type ddof; - var_hash_functor(Map const map, + var_hash_functor(SetType set, bitmask_type const* row_bitmask, mutable_column_device_view target, column_device_view source, column_device_view sum, column_device_view count, size_type ddof) - : map(map), + : set(set), row_bitmask(row_bitmask), target(target), source(source), @@ -96,8 +96,7 @@ struct var_hash_functor { __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto result = map.find(source_index); - auto target_index = result->second; + auto const target_index = *set.find(source_index); auto col = source; auto source_type = source.type(); diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 03ce58ea9e3..549d91b771a 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -87,7 +87,7 @@ using `.from_arrow()` or `.from_pandas()`. ## Result ordering -By default, `join` (or `merge`) and `groupby` operations in cuDF +By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF do *not* guarantee output ordering. Compare the results obtained from Pandas and cuDF below: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9b4a79c6841..a0e1a041342 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7688,10 +7688,10 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 - >>> df.value_counts() + >>> df.value_counts().sort_index() num_legs num_wings - 4 0 2 2 2 1 + 4 0 2 6 0 1 Name: count, dtype: int64 """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9612349a607..e4370be304a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -109,11 +109,11 @@ def _is_row_of(chunk, obj): Parrot 30.0 Parrot 20.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(level=0).mean() +>>> ser.groupby(level=0, sort=True).mean() Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(ser > 100).mean() +>>> ser.groupby(ser > 100, sort=True).mean() Max Speed False 25.0 True 370.0 @@ -133,7 +133,7 @@ def _is_row_of(chunk, obj): 1 Falcon 370.0 2 Parrot 24.0 3 Parrot 26.0 ->>> df.groupby(['Animal']).mean() +>>> df.groupby(['Animal'], sort=True).mean() Max Speed Animal Falcon 375.0 @@ -151,22 +151,22 @@ def _is_row_of(chunk, obj): Wild 350.0 Parrot Captive 30.0 Wild 20.0 ->>> df.groupby(level=0).mean() +>>> df.groupby(level=0, sort=True).mean() Max Speed Animal Falcon 370.0 Parrot 25.0 ->>> df.groupby(level="Type").mean() +>>> df.groupby(level="Type", sort=True).mean() Max Speed Type -Wild 185.0 Captive 210.0 +Wild 185.0 >>> df = cudf.DataFrame({{'A': 'a a b'.split(), ... 'B': [1,2,3], ... 'C': [4,6,5]}}) ->>> g1 = df.groupby('A', group_keys=False) ->>> g2 = df.groupby('A', group_keys=True) +>>> g1 = df.groupby('A', group_keys=False, sort=True) +>>> g2 = df.groupby('A', group_keys=True, sort=True) Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only differ in their ``group_keys`` argument. 
Calling `apply` in various ways,
@@ -539,11 +539,11 @@ def agg(self, func):
         ...     'b': [1, 2, 3],
         ...     'c': [2, 2, 1]
         ... })
-        >>> a.groupby('a').agg('sum')
+        >>> a.groupby('a', sort=True).agg('sum')
            b  c
         a
-        2  3  1
         1  3  4
+        2  3  1

         Specifying a list of aggregations to perform on each column.

@@ -553,12 +553,12 @@ def agg(self, func):
         ...     'b': [1, 2, 3],
         ...     'c': [2, 2, 1]
         ... })
-        >>> a.groupby('a').agg(['sum', 'min'])
+        >>> a.groupby('a', sort=True).agg(['sum', 'min'])
             b       c
           sum min sum min
         a
-        2   3   3   1   1
         1   3   1   4   2
+        2   3   3   1   1

         Using a dict to specify aggregations to perform per column.

@@ -568,12 +568,12 @@ def agg(self, func):
         ...     'b': [1, 2, 3],
         ...     'c': [2, 2, 1]
         ... })
-        >>> a.groupby('a').agg({'a': 'max', 'b': ['min', 'mean']})
+        >>> a.groupby('a', sort=True).agg({'a': 'max', 'b': ['min', 'mean']})
             a   b
           max min mean
         a
-        2   2   3  3.0
         1   1   1  1.5
+        2   2   3  3.0

         Using lambdas/callables to specify aggregations taking parameters.

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 63e0cf98b27..f856bbedca2 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -55,12 +55,12 @@ def assert_groupby_results_equal(
     if isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
         expect = expect.sort_values(by=by).reset_index(drop=True)
     else:
-        expect = expect.sort_values().reset_index(drop=True)
+        expect = expect.sort_values(by=by).reset_index(drop=True)
     if isinstance(got, cudf.DataFrame):
         got = got.sort_values(by=by).reset_index(drop=True)
     else:
-        got = got.sort_values().reset_index(drop=True)
+        got = got.sort_values(by=by).reset_index(drop=True)

     assert_eq(expect, got, **kwargs)

@@ -179,7 +179,7 @@ def test_groupby_agg_min_max_dictlist(nelem):
 def test_groupby_as_index_single_agg(pdf, gdf, as_index):
     gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"})
     pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"})
-    assert_groupby_results_equal(pdf, gdf)
+    assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y")

@pytest.mark.parametrize("engine", ["cudf", "jit"])
@@ -190,7 +190,7 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine):
     )
     kwargs = {"func": lambda df: df["x"].mean(), "include_groups": False}
     pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs)
-    assert_groupby_results_equal(pdf, gdf)
+    assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y")

@pytest.mark.parametrize("as_index", [True, False])
@@ -3714,7 +3714,13 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index):
     # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
     assert_groupby_results_equal(
-        actual, expected, check_names=False, check_index_type=False
+        actual,
+        expected,
+        check_names=False,
+        check_index_type=False,
+        as_index=as_index,
+        by=["gender", "education"],
+        sort=sort,
     )

From c1e26a63d33563190f452047e548f24fb47a63bf Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 29 Feb 2024 17:15:17 -0500
Subject: [PATCH 19/31] Fix cudf::test::to_host to handle both offset types for strings columns (#15073)

The `cudf::test::to_host` function is updated to handle int32 and int64 offset types for strings columns when copying data to host memory. This function is used with `cudf::test::print()` as well.

Also moved the function from the header `column_utilities.hpp` to the `column_utilities.cu` file. And moved the specialization of `to_host` for fixed-point types from the header to `.cu` as well.
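A simplified host-only sketch of the shape of the fix (hypothetical function name; the real code dispatches on the offsets column type via `cudf::type_dispatcher`):

```
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Build host strings from a chars buffer plus an offsets array whose
// element type may be 32-bit or 64-bit.
template <typename OffsetT>
std::vector<std::string> offsets_to_host(char const* chars, OffsetT const* offsets, std::size_t n)
{
  std::vector<std::string> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::string(chars + offsets[i], static_cast<std::size_t>(offsets[i + 1] - offsets[i]));
  }
  return out;
}

int main()
{
  char const chars[]   = "abcd";
  int32_t const o32[]  = {0, 2, 4};
  int64_t const o64[]  = {0, 2, 4};
  bool const use_int64 = true;  // in the real code this comes from offsets.type()
  auto host = use_int64 ? offsets_to_host(chars, o64, 2) : offsets_to_host(chars, o32, 2);
  assert(host[0] == "ab" && host[1] == "cd");
  return 0;
}
```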
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15073 --- cpp/include/cudf_test/column_utilities.hpp | 43 +------------ cpp/tests/utilities/column_utilities.cu | 75 ++++++++++++++++++++++ 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 49d5098f823..cbfd7a5e45c 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -194,23 +194,7 @@ std::pair, std::vector> to_host(column_view * `column_view`'s data, and second is the column's bitmask. */ template ()>* = nullptr> -std::pair, std::vector> to_host(column_view c) -{ - using namespace numeric; - using Rep = typename T::rep; - - auto host_rep_types = thrust::host_vector(c.size()); - - CUDF_CUDA_TRY( - cudaMemcpy(host_rep_types.data(), c.begin(), c.size() * sizeof(Rep), cudaMemcpyDefault)); - - auto to_fp = [&](Rep val) { return T{scaled_integer{val, scale_type{c.type().scale()}}}; }; - auto begin = thrust::make_transform_iterator(std::cbegin(host_rep_types), to_fp); - auto const host_fixed_points = thrust::host_vector(begin, begin + c.size()); - - return {host_fixed_points, bitmask_to_host(c)}; -} -//! @endcond +std::pair, std::vector> to_host(column_view c); /** * @brief Copies the data and bitmask of a `column_view` of strings @@ -223,29 +207,8 @@ std::pair, std::vector> to_host(column_view * and second is the column's bitmask. */ template <> -inline std::pair, std::vector> to_host(column_view c) -{ - thrust::host_vector host_data(c.size()); - auto stream = cudf::get_default_stream(); - if (c.size() > c.null_count()) { - auto const scv = strings_column_view(c); - auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - - // build std::string vector from chars and offsets - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - host_data.begin(), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - } - return {std::move(host_data), bitmask_to_host(c)}; -} +std::pair, std::vector> to_host(column_view c); +//! 
@endcond } // namespace cudf::test diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 018c6aeec2c..a556a8702bd 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -940,5 +940,80 @@ bool validate_host_masks(std::vector const& expected_mask, }); } +template ()>*> +std::pair, std::vector> to_host(column_view c) +{ + using namespace numeric; + using Rep = typename T::rep; + + auto host_rep_types = thrust::host_vector(c.size()); + + CUDF_CUDA_TRY( + cudaMemcpy(host_rep_types.data(), c.begin(), c.size() * sizeof(Rep), cudaMemcpyDefault)); + + auto to_fp = [&](Rep val) { return T{scaled_integer{val, scale_type{c.type().scale()}}}; }; + auto begin = thrust::make_transform_iterator(std::cbegin(host_rep_types), to_fp); + auto const host_fixed_points = thrust::host_vector(begin, begin + c.size()); + + return {host_fixed_points, bitmask_to_host(c)}; +} + +template std::pair, std::vector> to_host( + column_view c); +template std::pair, std::vector> to_host( + column_view c); +template std::pair, std::vector> to_host( + column_view c); + +namespace { +struct strings_to_host_fn { + template || + std::is_same_v>* = nullptr> + void operator()(thrust::host_vector& host_data, + char const* chars, + cudf::column_view const& offsets, + rmm::cuda_stream_view stream) + { + auto const h_offsets = cudf::detail::make_std_vector_sync( + cudf::device_span(offsets.data(), offsets.size()), stream); + // build std::string vector from chars and offsets + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + host_data.begin(), + [&](auto start, auto end) { return std::string(chars + start, end - start); }); + } + + template && + !std::is_same_v>* = nullptr> + void operator()(thrust::host_vector&, + char const*, + cudf::column_view const&, + rmm::cuda_stream_view) + { + CUDF_FAIL("invalid offsets type"); + } +}; +} // namespace + +template <> +std::pair, std::vector> to_host(column_view c) +{ + thrust::host_vector host_data(c.size()); + auto stream = cudf::get_default_stream(); + if (c.size() > c.null_count()) { + auto const scv = strings_column_view(c); + auto const h_chars = cudf::detail::make_std_vector_sync( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto offsets = + cudf::slice(scv.offsets(), {scv.offset(), scv.offset() + scv.size() + 1}).front(); + cudf::type_dispatcher( + offsets.type(), strings_to_host_fn{}, host_data, h_chars.data(), offsets, stream); + } + return {std::move(host_data), bitmask_to_host(c)}; +} + } // namespace test } // namespace cudf From a4f1118f23cc7cfdb7e3d03abf7726740ff52af7 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Thu, 29 Feb 2024 14:21:11 -0800 Subject: [PATCH 20/31] Resolve path parsing issues in `get_json_object` (#15082) This PR addresses a parsing issue related to JSONPath by implementing distinct parsing rules for values inside and outside brackets. For instance, in `{ "A.B": 2, "'A": { "B'": 3 } }`, `$.'A.B'` differs from `$['A.B']`. (See [Assertible JSON Path Documentation](https://assertible.com/docs/guide/json-path)) The fix ensures accurate parsing of JSONPath values containing quotes. 
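The core of the fix can be stated in a few lines. A simplified sketch, assuming a tokenizer that hands the parser the next path component (`bracket_state` mirrors the enum added in this patch; `quote_starts_name` is a hypothetical stand-in for the real parsing logic):

```cpp
// Inside [ ] a leading single quote delimits a quoted name; outside brackets
// (after '.') it is treated as an ordinary character, so '.' still splits the path.
#include <string_view>

enum class bracket_state : bool {
  INSIDE,   // parsing inside [ ]
  OUTSIDE   // parsing outside [ ]
};

bool quote_starts_name(std::string_view token, bracket_state state)
{
  return !token.empty() && token.front() == '\'' && state == bracket_state::INSIDE;
}
```

Under this rule `$['A.B']` keeps `A.B` as a single quoted key, while `$.'A.B'` splits at the dot into the literal keys `'A` and `B'`, matching the Spark behavior shown in the table below.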
For example in `{ "A.B": 2, "'A": { "B'": 3 } }` | JSONPath | Before Fix | Spark | After Fix | |---------------|-------------------------------------------------------|----------------------|---------------------| | $.'A.B' | 2 | 3 | 3 | | $.'A | CUDF_FAIL("Encountered invalid JSONPath input string")| {"B'": 3} | {"B'": 3} | Resolves [12483](https://github.com/rapidsai/cudf/issues/12483). Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15082 --- cpp/src/json/json_path.cu | 24 +++++++++--- cpp/tests/json/json_tests.cpp | 38 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 10 ++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++ 4 files changed, 81 insertions(+), 7 deletions(-) diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 25f136e2336..ff42d9c8620 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -521,6 +521,14 @@ struct path_operator { int index{-1}; // index for subscript operator }; +/** + * @brief Enum to specify whether parsing values enclosed within brackets, like `['book']`. + */ +enum class bracket_state : bool { + INSIDE, ///< Parsing inside brackets + OUTSIDE ///< Parsing outside brackets +}; + /** * @brief Parsing class that holds the current state of the JSONPath string to be parsed * and provides functions for navigating through it. This is only called on the host @@ -541,7 +549,7 @@ class path_state : private parser { case '.': { path_operator op; string_view term{".[", 2}; - if (parse_path_name(op.name, term)) { + if (parse_path_name(op.name, term, bracket_state::OUTSIDE)) { // this is another potential use case for __SPARK_BEHAVIORS / configurability // Spark currently only handles the wildcard operator inside [*], it does // not handle .* @@ -564,7 +572,7 @@ class path_state : private parser { path_operator op; string_view term{"]", 1}; bool const is_string = *pos == '\''; - if (parse_path_name(op.name, term)) { + if (parse_path_name(op.name, term, bracket_state::INSIDE)) { pos++; if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { op.type = path_operator_type::CHILD_WILDCARD; @@ -600,7 +608,8 @@ class path_state : private parser { private: cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; - bool parse_path_name(string_view& name, string_view const& terminators) + // b_state is set to INSIDE while parsing values enclosed within [ ], otherwise OUTSIDE + bool parse_path_name(string_view& name, string_view const& terminators, bracket_state b_state) { switch (*pos) { case '*': @@ -609,8 +618,11 @@ class path_state : private parser { break; case '\'': - if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } - break; + if (b_state == bracket_state::INSIDE) { + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + } + // if not inside the [ ] -> go to default default: { size_t const chars_left = input_len - (pos - input); @@ -656,7 +668,7 @@ std::pair>, int> build_comma do { op = p_state.get_next_operator(); if (op.type == path_operator_type::ERROR) { - CUDF_FAIL("Encountered invalid JSONPath input string"); + CUDF_FAIL("Encountered invalid JSONPath input 
string", std::invalid_argument); } if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 0894472dcc3..6c9050becc1 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) }; EXPECT_THROW(query(), std::invalid_argument); } + + { + auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"}; + auto const json_path = std::string{"${a}"}; + auto const query = [&]() { + auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), std::invalid_argument); + } } // queries that are legal, but reference invalid parts of the input @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]"); } +TEST_F(JsonPathTests, QueriesContainingQuotes) +{ + std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })"; + + auto do_test = [&input_string](auto const& json_path_string, + auto const& expected_string, + bool const& expect_null = false) { + auto const input = cudf::test::strings_column_wrapper{input_string}; + auto const json_path = std::string{json_path_string}; + cudf::get_json_object_options options; + options.set_allow_single_quotes(true); + auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); + auto const expected = + cudf::test::strings_column_wrapper{std::initializer_list{expected_string}, + std::initializer_list{!expect_null}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + }; + + // Set 1 + do_test(R"($.AB)", "1"); + do_test(R"($['A.B'])", "2"); + do_test(R"($.'A.B')", "3"); + do_test(R"($.A.B)", "4"); + + // Set 2 + do_test(R"($.'A)", R"({"B'": 3})"); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 1c4eb8a83ab..dd3859a4160 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2452,7 +2452,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject( options.set_allow_single_quotes(allow_single_quotes); options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings); options.set_missing_fields_as_nulls(missing_fields_as_nulls); - return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options)); + auto result_col_ptr = [&]() { + try { + return cudf::get_json_object(n_strings_col_view, *n_scalar_path, options); + } catch (std::invalid_argument const &err) { + auto const null_scalar = cudf::string_scalar(std::string(""), false); + return cudf::make_column_from_scalar(null_scalar, n_strings_col_view.size()); + } catch (...) 
{ throw; } + }(); + return release_as_jlong(result_col_ptr); } CATCH_STD(env, 0) } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 75573046af2..bac4d1e4b3e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6405,6 +6405,22 @@ void testGetJSONObjectWithSingleQuotes() { } } +@Test +void testGetJSONObjectWithInvalidQueries() { + String jsonString = "{" + + "\'a\': \'A\"\'" + + "}"; + + GetJsonObjectOptions options = GetJsonObjectOptions.builder().allowSingleQuotes(true).build(); + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + Scalar nullString = Scalar.fromString(null); + ColumnVector expectedAuthors = ColumnVector.fromScalar(nullString, 2); + Scalar path = Scalar.fromString("."); + ColumnVector gotAuthors = json.getJSONObject(path, options)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } +} + @Test void testMakeStructEmpty() { final int numRows = 10; From e96ff74fc020c06ee47a76e47f3fff2555531d32 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 29 Feb 2024 17:52:24 -0600 Subject: [PATCH 21/31] Add support for Python 3.11, require NumPy 1.23+ (#15111) Contributes to https://github.com/rapidsai/build-planning/issues/3 This PR adds support for Python 3.11. It also bumps the `NumPy` requirement to `numpy>=1.23`; see https://github.com/rapidsai/build-planning/issues/3#issuecomment-1967952280. ## Notes for Reviewers This is part of ongoing work to add Python 3.11 support across RAPIDS. The Python 3.11 CI workflows introduced in https://github.com/rapidsai/shared-workflows/pull/176 are *optional*... they are not yet required to run successfully for PRs to be merged. This PR can be merged once all jobs are running successfully (including the non-required jobs for Python 3.11). The CI logs should be checked to verify that the jobs are building and testing with Python 3.11. See https://github.com/rapidsai/shared-workflows/pull/176 for more details.
*(created with [rapids-reviser](https://github.com/rapidsai/rapids-reviser))* Authors: - James Lamb (https://github.com/jameslamb) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - https://github.com/jakirkham - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Jake Awe (https://github.com/AyodeAwe) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15111 --- README.md | 2 +- .../all_cuda-118_arch-x86_64.yaml | 11 ++++----- .../all_cuda-122_arch-x86_64.yaml | 11 ++++----- conda/recipes/cudf/meta.yaml | 3 ++- dependencies.yaml | 24 ++++++++++++------- .../cudf/tests/test_cuda_array_interface.py | 13 +++++----- python/cudf/cudf/tests/test_string.py | 2 +- .../cudf/tests/text/test_subword_tokenizer.py | 3 ++- python/cudf/pyproject.toml | 9 +++---- python/cudf_kafka/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 1 + python/dask_cudf/pyproject.toml | 3 ++- 12 files changed, 47 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 599e194bc1a..8f9e57ff3ad 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.04 python=3.10 cuda-version=11.8 + cudf=24.04 python=3.11 cuda-version=12.2 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 79b786fe012..c12e88f1c0f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - conda-forge - nvidia dependencies: @@ -59,7 +58,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21 +- numpy>=1.23 - numpydoc - nvcc_linux-64=11.8 - nvcomp==3.0.6 @@ -79,8 +78,8 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python>=3.9,<3.11 -- pytorch<1.12.0 +- python>=3.9,<3.12 +- pytorch>=2.1.0 - rapids-dask-dependency==24.4.* - rich - rmm==24.4.* @@ -96,8 +95,8 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.13.1 -- transformers==4.24.0 +- tokenizers==0.15.2 +- transformers==4.38.1 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 66a4ee57238..e773812967d 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - conda-forge - nvidia dependencies: @@ -58,7 +57,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21 +- numpy>=1.23 - numpydoc - nvcomp==3.0.6 - nvtx>=0.2.1 @@ -77,8 +76,8 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=1.9.0,<1.10.0a0 -- python>=3.9,<3.11 -- pytorch<1.12.0 +- python>=3.9,<3.12 +- pytorch>=2.1.0 - rapids-dask-dependency==24.4.* - rich - rmm==24.4.* @@ -94,8 +93,8 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.13.1 -- transformers==4.24.0 +- 
tokenizers==0.15.2 +- transformers==4.38.1 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 80920dc7b5f..6a85fadaa48 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -65,6 +65,7 @@ requirements: - scikit-build-core >=0.7.0 - setuptools - dlpack >=0.5,<0.6.0a0 + - numpy 1.23 - pyarrow ==14.0.2.* - libcudf ={{ version }} - rmm ={{ minor_version }} @@ -83,7 +84,7 @@ requirements: - pandas >=2.0,<2.2.2dev0 - cupy >=12.0.0 - numba >=0.57 - - numpy >=1.21 + - {{ pin_compatible('numpy', max_pin='x') }} - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index 4281e907862..a83a03b571b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -188,7 +188,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev - - pytorch - conda-forge - nvidia dependencies: @@ -258,13 +257,17 @@ dependencies: - *cmake_ver - cython>=3.0.3 - *ninja - - &numpy numpy>=1.21 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.2.* - output_types: conda packages: - scikit-build-core>=0.7.0 + - output_types: pyproject + packages: + # Hard pin the patch version used during the build. + # Sync with conda build constraint & wheel run constraint. + - numpy==1.23.* - output_types: [requirements, pyproject] packages: - scikit-build-core[pyproject]>=0.7.0 @@ -488,15 +491,19 @@ dependencies: py: "3.10" packages: - python=3.10 + - matrix: + py: "3.11" + packages: + - python=3.11 - matrix: packages: - - python>=3.9,<3.11 + - python>=3.9,<3.12 run_common: common: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - *numpy + - numpy>=1.23 - pandas>=2.0,<2.2.2dev0 run_cudf: common: @@ -624,8 +631,8 @@ dependencies: - output_types: pyproject packages: - msgpack - - &tokenizers tokenizers==0.13.1 - - &transformers transformers==4.24.0 + - &tokenizers tokenizers==0.15.2 + - &transformers transformers==4.38.1 - tzdata specific: - output_types: conda @@ -633,9 +640,8 @@ dependencies: - matrix: arch: x86_64 packages: - # Currently, CUDA builds of pytorch do not exist for aarch64. We require - # version <1.12.0 because newer versions use nvidia::cuda-toolkit. - - pytorch<1.12.0 + # Currently, CUDA + aarch64 builds of pytorch do not exist on conda-forge. + - pytorch>=2.1.0 # We only install these on x86_64 to avoid pulling pytorch as a # dependency of transformers. - *tokenizers diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index a9d11922943..1f20152172b 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import types from contextlib import ExitStack as does_not_raise @@ -193,10 +193,11 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - index = cudf.Index([], dtype="float64") - tensor = torch.tensor(index) - got = cudf.Index(tensor) - assert_eq(got, index) + # TODO: This test fails with PyTorch 2. Is it still expected to be valid? 
+ # index = cudf.Index([], dtype="float64") + # tensor = torch.tensor(index) + # got = cudf.Index(tensor) + # assert_eq(got, index) index = cudf.core.index.RangeIndex(start=0, stop=100) tensor = torch.tensor(index) @@ -212,7 +213,7 @@ def test_cuda_array_interface_pytorch(): str_series = cudf.Series(["a", "g"]) - with pytest.raises(NotImplementedError): + with pytest.raises(AttributeError): str_series.__cuda_array_interface__ cat_series = str_series.astype("category") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a9ba80a395d..de771a56e77 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -891,7 +891,7 @@ def test_string_repeat(data, repeats): ) @pytest.mark.parametrize("repl", ["qwerty", "", " "]) @pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (1, 1)]) +@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) def test_string_replace( ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex ): diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index ac17daa8601..b21edc0477f 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import os import cupy @@ -27,6 +27,7 @@ def assert_equal_tokenization_outputs(hf_output, cudf_output): ) +@pytest.mark.skip(reason="segfaults") @pytest.mark.parametrize("seq_len", [32, 64]) @pytest.mark.parametrize("stride", [0, 15, 30]) @pytest.mark.parametrize("add_special_tokens", [True, False]) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 590786f2414..5afd82220a4 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21", + "numpy==1.23.*", "protoc-wheel", "pyarrow==14.0.2.*", "rmm==24.4.*", @@ -30,7 +30,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", - "numpy>=1.21", + "numpy>=1.23", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.2dev0", @@ -49,6 +49,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ] [project.optional-dependencies] @@ -63,8 +64,8 @@ test = [ "pytest-xdist", "pytest<8", "scipy", - "tokenizers==0.13.1", - "transformers==4.24.0", + "tokenizers==0.15.2", + "transformers==4.38.1", "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pandas-tests = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 216d83940ce..7369b99aaf4 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21", + "numpy==1.23.*", "pyarrow==14.0.2.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 12b0356c9c1..ccaa2543cc3 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -32,6 +32,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ] [project.optional-dependencies] diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 5d4ea429d5f..4ecfc4f3f85 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "cudf==24.4.*", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.21", + "numpy>=1.23", "pandas>=2.0,<2.2.2dev0", "rapids-dask-dependency==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -33,6 +33,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ] [project.entry-points."dask.dataframe.backends"] From 56a3b8f6516f830d836b50cc0d93ae67c4db9613 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 29 Feb 2024 18:02:53 -0800 Subject: [PATCH 22/31] Fix chunked reads of Parquet delta encoded pages (#14921) The chunked Parquet reader currently does not properly estimate the sizes of string pages that are delta encoded. This PR modifies `gpuDecodeTotalPageStringSize()` to take into account the new encodings. Authors: - Ed Seidl (https://github.com/etseidl) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/14921 --- cpp/src/io/parquet/decode_preprocess.cu | 136 ++++++++++++++++++-- cpp/src/io/parquet/page_decode.cuh | 1 + cpp/src/io/parquet/page_string_decode.cu | 1 + cpp/tests/io/parquet_chunked_reader_test.cu | 109 ++++++++++++++-- 4 files changed, 223 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 862dedf6200..19c398c5965 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "delta_binary.cuh" #include "io/utilities/column_buffer.hpp" #include "page_decode.cuh" @@ -40,26 +41,139 @@ constexpr int rolling_buf_size = LEVEL_DECODE_BUF_SIZE; using unused_state_buf = page_state_buffers_s<0, 0, 0>; /** + * @brief Calculate string bytes for DELTA_LENGTH_BYTE_ARRAY encoded pages + * + * Result is valid only on thread 0. + * + * @param s The local page info + * @param t Thread index + */ +__device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t) +{ + if (t == 0) { + // find the beginning of char data + delta_binary_decoder string_lengths; + auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end); + // distance is size of string data + return static_cast(std::distance(string_start, s->data_end)); + } + return 0; +} + +/** + * @brief Calculate string bytes for DELTA_BYTE_ARRAY encoded pages + * + * This expects all threads in the thread block (preprocess_block_size). 
+ * + * @param s The local page info + * @param t Thread index + */ +__device__ size_type gpuDeltaPageStringSize(page_state_s* s, int t) +{ + using cudf::detail::warp_size; + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; + + __shared__ __align__(16) delta_binary_decoder prefixes; + __shared__ __align__(16) delta_binary_decoder suffixes; + + int const lane_id = t % warp_size; + int const warp_id = t / warp_size; + + if (t == 0) { + auto const* suffix_start = prefixes.find_end_of_block(s->data_start, s->data_end); + suffixes.init_binary_block(suffix_start, s->data_end); + } + __syncthreads(); + + // two warps will traverse the prefixes and suffixes and sum them up + auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr; + + size_t total_bytes = 0; + if (db != nullptr) { + // initialize with first value (which is stored in last_value) + if (lane_id == 0) { total_bytes = db->last_value; } + + uleb128_t lane_sum = 0; + while (db->current_value_idx < db->num_encoded_values(true)) { + // calculate values for current mini-block + db->calc_mini_block_values(lane_id); + + // get per lane sum for mini-block + for (uint32_t i = 0; i < db->values_per_mb; i += warp_size) { + uint32_t const idx = db->current_value_idx + i + lane_id; + if (idx < db->value_count) { + lane_sum += db->value[rolling_index(idx)]; + } + } + + if (lane_id == 0) { db->setup_next_mini_block(true); } + __syncwarp(); + } + + // get sum for warp. + // note: warp_sum will only be valid on lane 0. + auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + + if (lane_id == 0) { total_bytes += warp_sum; } + } + __syncthreads(); + + // now sum up total_bytes from the two warps. result is only valid on thread 0. + auto const final_bytes = + cudf::detail::single_lane_block_sum_reduce(total_bytes); + + return static_cast(final_bytes); +} + +/** + * @brief Calculate the number of string bytes in the page. * * This function expects the dictionary position to be at 0 and will traverse - * the entire thing. + * the entire thing (for plain and dictionary encoding). * - * Operates on a single warp only. Expects t < 32 + * This expects all threads in the thread block (preprocess_block_size). Result is only + * valid on thread 0. * * @param s The local page info * @param t Thread index */ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) { + using cudf::detail::warp_size; size_type target_pos = s->num_input_values; size_type str_len = 0; - if (s->dict_base) { - auto const [new_target_pos, len] = - gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); - target_pos = new_target_pos; - str_len = len; - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + switch (s->page.encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + if (t < warp_size && s->dict_base) { + auto const [new_target_pos, len] = + gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); + target_pos = new_target_pos; + str_len = len; + } + break; + + case Encoding::PLAIN: + // For V2 headers, we know how many values are present, so can skip an expensive scan. + if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) { + auto const num_values = s->page.num_input_values - s->page.num_nulls; + str_len = s->dict_size - sizeof(int) * num_values; + } + // For V1, the choice is an overestimate (s->dict_size), or an exact number that's + // expensive to compute. 
For now we're going with the latter. + else { + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + } + break; + + case Encoding::DELTA_LENGTH_BYTE_ARRAY: str_len = gpuDeltaLengthPageStringSize(s, t); break; + + case Encoding::DELTA_BYTE_ARRAY: str_len = gpuDeltaPageStringSize(s, t); break; + + default: + // not a valid string encoding, so just return 0 + break; } if (!t) { s->dict_pos = target_pos; } return str_len; @@ -348,9 +462,9 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) } // retrieve total string size. - // TODO: make this block-based instead of just 1 warp if (compute_string_sizes) { - if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } + auto const str_bytes = gpuDecodeTotalPageStringSize(s, t); + if (t == 0) { s->page.str_bytes = str_bytes; } } // update output results: diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 4353e079496..cf3e1911496 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1291,6 +1291,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_bits = 0; s->dict_base = nullptr; s->dict_size = 0; + s->dict_val = 0; // NOTE: if additional encodings are supported in the future, modifications must // be made to is_supported_encoding() in reader_impl_preprocess.cu switch (s->page.encoding) { diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 101bd34f09f..b63f96fda46 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -549,6 +549,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d // get sum for warp. // note: warp_sum will only be valid on lane 0. auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + __syncwarp(); auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); if (lane_id == 0) { diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index ea6d65a8c14..2c992677a65 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -62,6 +62,7 @@ using int32s_lists_col = cudf::test::lists_column_wrapper; auto write_file(std::vector>& input_columns, std::string const& filename, bool nullable, + bool delta_encoding, std::size_t max_page_size_bytes = cudf::io::default_max_page_size_bytes, std::size_t max_page_size_rows = cudf::io::default_max_page_size_rows) { @@ -86,14 +87,22 @@ auto write_file(std::vector>& input_columns, } auto input_table = std::make_unique(std::move(input_columns)); - auto filepath = - temp_env->get_temp_filepath(nullable ? filename + "_nullable.parquet" : filename + ".parquet"); + auto file_name = filename; + if (nullable) { file_name = file_name + "_nullable"; } + if (delta_encoding) { file_name = file_name + "_delta"; } + auto const filepath = temp_env->get_temp_filepath(file_name + ".parquet"); + + auto const dict_policy = + delta_encoding ? 
cudf::io::dictionary_policy::NEVER : cudf::io::dictionary_policy::ALWAYS; + auto const v2_headers = delta_encoding; auto const write_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *input_table) .max_page_size_bytes(max_page_size_bytes) .max_page_size_rows(max_page_size_rows) .max_page_fragment_size(cudf::io::default_max_page_fragment_size) + .dictionary_policy(dict_policy) + .write_v2_headers(v2_headers) .build(); cudf::io::write_parquet(write_opts); @@ -140,7 +149,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNoData) input_columns.emplace_back(int32s_col{}.release()); input_columns.emplace_back(int64s_col{}.release()); - auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty", false); + auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty", false, false); auto const [result, num_chunks] = chunked_read(filepath, 1'000); EXPECT_EQ(num_chunks, 1); EXPECT_EQ(result->num_rows(), 0); @@ -152,24 +161,38 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadSimpleData) { auto constexpr num_rows = 40'000; - auto const generate_input = [num_rows](bool nullable) { + auto const generate_input = [num_rows](bool nullable, bool use_delta) { std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release()); - return write_file(input_columns, "chunked_read_simple", nullable); + return write_file(input_columns, "chunked_read_simple", nullable, false); }; { - auto const [expected, filepath] = generate_input(false); + auto const [expected, filepath] = generate_input(false, false); + auto const [result, num_chunks] = chunked_read(filepath, 240'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [expected, filepath] = generate_input(false, true); + auto const [result, num_chunks] = chunked_read(filepath, 240'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); + } + + { + auto const [expected, filepath] = generate_input(true, false); auto const [result, num_chunks] = chunked_read(filepath, 240'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } { - auto const [expected, filepath] = generate_input(true); + auto const [expected, filepath] = generate_input(true, true); auto const [result, num_chunks] = chunked_read(filepath, 240'000); EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); @@ -186,7 +209,8 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadBoundaryCases) std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release()); - return write_file(input_columns, "chunked_read_simple_boundary", false /*nullable*/); + return write_file( + input_columns, "chunked_read_simple_boundary", false /*nullable*/, false /*delta_encoding*/); }(); // Test with zero limit: everything will be read in one chunk @@ -264,7 +288,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) { auto constexpr num_rows = 60'000; - auto const generate_input = [num_rows](bool nullable) { + auto const generate_input = [num_rows](bool nullable, bool use_delta) { std::vector> input_columns; auto const value_iter = thrust::make_counting_iterator(0); @@ -296,13 +320,16 @@ TEST_F(ParquetChunkedReaderTest, 
TestChunkedReadWithString) return write_file(input_columns, "chunked_read_with_strings", nullable, + use_delta, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); }; - auto const [expected_no_null, filepath_no_null] = generate_input(false); - auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true); + auto const [expected_no_null, filepath_no_null] = generate_input(false, false); + auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true, false); + auto const [expected_no_null_delta, filepath_no_null_delta] = generate_input(false, true); + auto const [expected_with_nulls_delta, filepath_with_nulls_delta] = generate_input(true, true); // Test with zero limit: everything will be read in one chunk { @@ -315,6 +342,16 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + { + auto const [result, num_chunks] = chunked_read(filepath_no_null_delta, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null_delta, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls_delta, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls_delta, *result); + } // Test with a very small limit: 1 byte { @@ -327,6 +364,16 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 3); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + { + auto const [result, num_chunks] = chunked_read(filepath_no_null_delta, 1); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null_delta, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls_delta, 1); + EXPECT_EQ(num_chunks, 3); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls_delta, *result); + } // Test with a very large limit { @@ -339,6 +386,16 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + { + auto const [result, num_chunks] = chunked_read(filepath_no_null_delta, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null_delta, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls_delta, 2L << 40); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls_delta, *result); + } // Other tests: @@ -352,6 +409,16 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 2); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + { + auto const [result, num_chunks] = chunked_read(filepath_no_null_delta, 500'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null_delta, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls_delta, 500'000); + EXPECT_EQ(num_chunks, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls_delta, *result); + } { auto const [result, num_chunks] = chunked_read(filepath_no_null, 1'000'000); @@ -363,13 +430,23 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithString) EXPECT_EQ(num_chunks, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result); } + { + auto const [result, num_chunks] = chunked_read(filepath_no_null_delta, 1'000'000); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null_delta, *result); + } + { + auto const [result, num_chunks] = chunked_read(filepath_with_nulls_delta, 1'000'000); + 
EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls_delta, *result); + } } TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithStringPrecise) { auto constexpr num_rows = 60'000; - auto const generate_input = [num_rows](bool nullable) { + auto const generate_input = [num_rows](bool nullable, bool use_delta) { std::vector> input_columns; // strings Page total bytes cumulative @@ -388,12 +465,13 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithStringPrecise) return write_file(input_columns, "chunked_read_with_strings_precise", nullable, + use_delta, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); }; - auto const [expected_no_null, filepath_no_null] = generate_input(false); + auto const [expected_no_null, filepath_no_null] = generate_input(false, false); // a chunk limit of 1 byte less than 2 pages should force it to produce 3 chunks: // each 1 page in size @@ -434,6 +512,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithStructs) return write_file(input_columns, "chunked_read_with_structs", nullable, + false /*delta_encoding*/, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); @@ -515,6 +594,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithListsNoNulls) return write_file(input_columns, "chunked_read_with_lists_no_null", false /*nullable*/, + false /*delta_encoding*/, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); @@ -597,6 +677,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithListsHavingNulls) return write_file(input_columns, "chunked_read_with_lists_nulls", true /*nullable*/, + false /*delta_encoding*/, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); @@ -685,6 +766,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithStructsOfLists) return write_file(input_columns, "chunked_read_with_structs_of_lists", nullable, + false /*delta_encoding*/, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); @@ -825,6 +907,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadWithListsOfStructs) return write_file(input_columns, "chunked_read_with_lists_of_structs", nullable, + false /*delta_encoding*/, 512 * 1024, // 512KB per page 20000 // 20k rows per page ); From 3b228e2c6d3ec39fcba553c63d53a56760dc1ca6 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:22:32 -0700 Subject: [PATCH 23/31] Implement `segmented_row_bit_count` for computing row sizes by segments of rows (#15169) This implements `cudf::segmented_row_bit_count`, a version of `cudf::row_bit_count` that adds a `segment_length` parameter to the interface. With the new parameter, `segmented_row_bit_count` computes aggregate sizes for each "segment" of rows instead of the size of each individual row. Currently, only fixed-length segments are supported.
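A minimal usage sketch based on the signature added below (illustrative only; the default memory resource is used):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <memory>

// Accumulate bit sizes over every pair of rows; with an odd row count the
// final segment simply covers the last row. A segment_length outside
// [1, num_rows] throws std::invalid_argument.
std::unique_ptr<cudf::column> pairwise_row_sizes(cudf::table_view const& input)
{
  return cudf::segmented_row_bit_count(input, /*segment_length=*/2);
}
```

The returned INT32 column has one entry per segment, i.e. `ceil(num_rows / segment_length)` rows.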
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15169 --- cpp/include/cudf/detail/transform.hpp | 12 +- cpp/include/cudf/transform.hpp | 25 +- cpp/src/transform/row_bit_count.cu | 95 ++++-- cpp/tests/CMakeLists.txt | 1 + cpp/tests/transform/row_bit_count_test.cu | 300 +++++++++--------- .../transform/segmented_row_bit_count_test.cu | 251 +++++++++++++++ 6 files changed, 503 insertions(+), 181 deletions(-) create mode 100644 cpp/tests/transform/segmented_row_bit_count_test.cu diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 215ad50aed6..965fea84860 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,5 +100,15 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::segmented_row_bit_count + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 412fe17ef26..49ec3d7c0d5 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -224,5 +224,28 @@ std::unique_ptr row_bit_count( table_view const& t, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for + * each segment of rows. + * + * This is similar to counting bit size per row for the input table in `cudf::row_bit_count`, + * except that row sizes are accumulated by segments. + * + * Currently, only fixed-length segments are supported. In case the input table has number of rows + * not divisible by `segment_length`, its last segment is considered as shorter than the others. + * + * @throw std::invalid_argument if the input `segment_length` is non-positive or larger than the + * number of rows in the input table. 
+ * + * @param t The table view to perform the computation on + * @param segment_length The number of rows in each segment for which the total size is computed + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return A 32-bit integer column containing the bit counts for each segment of rows + */ +std::unique_ptr segmented_row_bit_count( + table_view const& t, + size_type segment_length, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index eda8ec7a463..78bd558501b 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,8 +32,10 @@ #include #include -#include +#include +#include #include +#include namespace cudf { namespace detail { @@ -398,26 +401,32 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param cols An span of column_device_views representing a column hierarchy * @param info An span of column_info structs corresponding the elements in `cols` * @param output Output span of size (# rows) where per-row bit sizes are stored + * @param segment_length The number of rows in each segment for which the total size is computed * @param max_branch_depth Maximum depth of the span stack needed per-thread */ -CUDF_KERNEL void compute_row_sizes(device_span cols, - device_span info, - device_span output, - size_type max_branch_depth) +CUDF_KERNEL void compute_segment_sizes(device_span cols, + device_span info, + device_span output, + size_type segment_length, + size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; int const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto const num_rows = output.size(); - if (tid >= num_rows) { return; } + auto const num_segments = static_cast(output.size()); + if (tid >= num_segments) { return; } // my_branch_stack points to the last span prior to branching. a branch occurs only // when we are inside of a list contained within a struct column. row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth); size_type branch_depth{0}; - // current row span - always starts at 1 row. - row_span cur_span{tid, tid + 1}; + // current row span - always starts at spanning over `segment_length` rows. + auto const num_rows = cols[0].size(); + auto const get_default_row_span = [=] { + return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; + }; + auto cur_span = get_default_row_span(); // output size size_type& size = output[tid]; @@ -444,7 +453,7 @@ CUDF_KERNEL void compute_row_sizes(device_span cols, if (info[idx].depth == 0) { branch_depth = 0; last_branch_depth = 0; - cur_span = row_span{tid, tid + 1}; + cur_span = get_default_row_span(); } // add the contributing size of this row @@ -465,17 +474,18 @@ CUDF_KERNEL void compute_row_sizes(device_span cols, } // anonymous namespace -/** - * @copydoc cudf::detail::row_bit_count - * - */ -std::unique_ptr row_bit_count(table_view const& t, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // no rows + // If there is no rows, segment_length will not be checked. 
if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); } + CUDF_EXPECTS(segment_length >= 1 && segment_length <= t.num_rows(), + "Invalid segment length.", + std::invalid_argument); + // flatten the hierarchy and determine some information about it. std::vector cols; std::vector info; @@ -484,17 +494,28 @@ std::unique_ptr row_bit_count(table_view const& t, CUDF_EXPECTS(info.size() == cols.size(), "Size/info mismatch"); // create output buffer and view - auto output = cudf::make_fixed_width_column( - data_type{type_id::INT32}, t.num_rows(), mask_state::UNALLOCATED, stream, mr); + auto const num_segments = cudf::util::div_rounding_up_safe(t.num_rows(), segment_length); + auto output = cudf::make_fixed_width_column( + data_type{type_id::INT32}, num_segments, mask_state::UNALLOCATED, stream, mr); mutable_column_view mcv = output->mutable_view(); // simple case. if we have no complex types (lists, strings, etc), the per-row size is already // trivially computed if (h_info.complex_type_count <= 0) { - thrust::fill(rmm::exec_policy(stream), - mcv.begin(), - mcv.end(), - h_info.simple_per_row_size); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + mcv.begin(), + mcv.end(), + cuda::proclaim_return_type( + [segment_length, + num_rows = t.num_rows(), + per_row_size = h_info.simple_per_row_size] __device__(size_type const segment_idx) { + // Since the number of rows may not divisible by segment_length, + // the last segment may be shorter than the others. + auto const current_length = + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + return per_row_size * current_length; + })); return output; } @@ -523,22 +544,34 @@ std::unique_ptr row_bit_count(table_view const& t, // should we be aborting if we reach some extremely small block size, or just if we hit 0? 
CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count"); - cudf::detail::grid_1d grid{t.num_rows(), block_size, 1}; - compute_row_sizes<<>>( + cudf::detail::grid_1d grid{num_segments, block_size, 1}; + compute_segment_sizes<<>>( {std::get<1>(d_cols), cols.size()}, {d_info.data(), info.size()}, - {mcv.data(), static_cast(t.num_rows())}, + {mcv.data(), static_cast(mcv.size())}, + segment_length, h_info.max_branch_depth); return output; } +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_row_bit_count(t, 1, stream, mr); +} + } // namespace detail -/** - * @copydoc cudf::row_bit_count - * - */ +std::unique_ptr segmented_row_bit_count(table_view const& t, + size_type segment_length, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr); +} + std::unique_ptr row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e377b07eee..93443b04bd5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -259,6 +259,7 @@ ConfigureTest( transform/mask_to_bools_test.cpp transform/bools_to_mask_test.cpp transform/row_bit_count_test.cu + transform/segmented_row_bit_count_test.cu transform/one_hot_encode_tests.cpp ) diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 236407e62f3..01a042130d6 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,6 +35,148 @@ #include +namespace row_bit_count_test { + +template +std::pair, std::unique_ptr> build_list_column() +{ + using LCW = cudf::test::lists_column_wrapper; + constexpr cudf::size_type type_size = sizeof(cudf::device_storage_type_t) * CHAR_BIT; + + // { + // {{1, 2}, {3, 4, 5}}, + // {{}}, + // {LCW{10}}, + // {{6, 7, 8}, {9}}, + // {{-1, -2}, {-3, -4}}, + // {{-5, -6, -7}, {-8, -9}} + // } + cudf::test::fixed_width_column_wrapper values{ + 1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9}; + cudf::test::fixed_width_column_wrapper inner_offsets{ + 0, 2, 5, 6, 9, 10, 12, 14, 17, 19}; + auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {}); + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; + auto list = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {}); + + // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf + cudf::test::fixed_width_column_wrapper expected{ + ((4 + 8) * CHAR_BIT) + (type_size * 5), + ((4 + 0) * CHAR_BIT) + (type_size * 0), + ((4 + 4) * CHAR_BIT) + (type_size * 1), + ((4 + 8) * CHAR_BIT) + (type_size * 4), + ((4 + 8) * CHAR_BIT) + (type_size * 4), + ((4 + 8) * CHAR_BIT) + (type_size * 5)}; + + return {std::move(list), expected.release()}; +} + +std::pair, std::unique_ptr> build_struct_column() +{ + std::vector struct_validity{0, 1, 1, 1, 1, 0}; + std::vector strings{"abc", "def", "", "z", "bananas", "daïs"}; + + cudf::test::fixed_width_column_wrapper col0{0, 1, 2, 3, 4, 5}; + cudf::test::fixed_width_column_wrapper col1{{8, 9, 10, 11, 12, 13}, {1, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper col2(strings.begin(), strings.end()); + + // creating a struct column will cause all child columns to be promoted to have validity + cudf::test::structs_column_wrapper struct_col({col0, col1, col2}, struct_validity); + + // expect (1 offset (4 bytes) + (length of string if row is valid) + 1 validity bit) + + // (1 float + 1 validity bit) + + // (1 int16_t + 1 validity bit) + + // (1 validity bit) + cudf::test::fixed_width_column_wrapper expected_sizes{84, 108, 84, 92, 140, 84}; + + return {struct_col.release(), expected_sizes.release()}; +} + +std::unique_ptr build_nested_column1(std::vector const& struct_validity) +{ + // tests the "branching" case -> list ...>>> + + // List, float, int16> + + // Inner list column + cudf::test::lists_column_wrapper list{{1, 2, 3, 4, 5}, + {6, 7, 8}, + {33, 34, 35, 36, 37, 38, 39}, + {-1, -2}, + {-10, -11, -1, -20}, + {40, 41, 42}, + {100, 200, 300}, + {-100, -200, -300}}; + + // floats + std::vector ages{5, 10, 15, 20, 4, 75, 16, -16}; + std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 1}; + auto ages_column = + cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); + + // int16 values + std::vector vals{-1, -2, -3, 1, 2, 3, 8, 9}; + auto i16_column = cudf::test::fixed_width_column_wrapper(vals.begin(), vals.end()); + + // Assemble struct column + auto struct_column = + cudf::test::structs_column_wrapper({list, ages_column, i16_column}, struct_validity); + + // wrap in a list + std::vector outer_offsets{0, 1, 1, 3, 6, 7, 8}; + cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), + outer_offsets.end()); + auto const size = static_cast(outer_offsets_col).size() - 1; + + // Each struct (list child) has size: + // (1 offset (4 bytes) + (list size if row is valid) + 1 validity bit) + + // (1 float + 1 validity bit) + + // (1 
int16_t + 1 validity bit) + + // (1 validity bit) + // Each top level list has size: + // 1 offset (4 bytes) + (list size if row is valid). + + return cudf::make_lists_column(static_cast(size), + outer_offsets_col.release(), + struct_column.release(), + 0, + rmm::device_buffer{}); +} + +std::unique_ptr build_nested_column2(std::vector const& struct_validity) +{ + // List>, Struct>> + + // Inner list column + // clang-format off + cudf::test::lists_column_wrapper list{ + {{1, 2, 3, 4, 5}, {2, 3}}, + {{6, 7, 8}, {8, 9}}, + {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}}; + // clang-format on + + // Inner struct + std::vector vals{-1, -2, -3}; + auto i16_column = cudf::test::fixed_width_column_wrapper(vals.begin(), vals.end()); + auto inner_struct = cudf::test::structs_column_wrapper({i16_column}); + + // outer struct + auto outer_struct = cudf::test::structs_column_wrapper({list, inner_struct}, struct_validity); + + // wrap in a list + std::vector outer_offsets{0, 1, 1, 3}; + cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), + outer_offsets.end()); + auto const size = static_cast(outer_offsets_col).size() - 1; + return cudf::make_lists_column(static_cast(size), + outer_offsets_col.release(), + outer_struct.release(), + 0, + rmm::device_buffer{}); +} + +} // namespace row_bit_count_test + template struct RowBitCountTyped : public cudf::test::BaseFixture {}; @@ -82,45 +224,11 @@ TYPED_TEST(RowBitCountTyped, SimpleTypesWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } -template -std::pair, std::unique_ptr> build_list_column() -{ - using LCW = cudf::test::lists_column_wrapper; - constexpr cudf::size_type type_size = sizeof(cudf::device_storage_type_t) * CHAR_BIT; - - // { - // {{1, 2}, {3, 4, 5}}, - // {{}}, - // {LCW{10}}, - // {{6, 7, 8}, {9}}, - // {{-1, -2}, {-3, -4}}, - // {{-5, -6, -7}, {-8, -9}} - // } - cudf::test::fixed_width_column_wrapper values{ - 1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9}; - cudf::test::fixed_width_column_wrapper inner_offsets{ - 0, 2, 5, 6, 9, 10, 12, 14, 17, 19}; - auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {}); - cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; - auto list = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {}); - - // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf - cudf::test::fixed_width_column_wrapper expected{ - ((4 + 8) * CHAR_BIT) + (type_size * 5), - ((4 + 0) * CHAR_BIT) + (type_size * 0), - ((4 + 4) * CHAR_BIT) + (type_size * 1), - ((4 + 8) * CHAR_BIT) + (type_size * 4), - ((4 + 8) * CHAR_BIT) + (type_size * 4), - ((4 + 8) * CHAR_BIT) + (type_size * 5)}; - - return {std::move(list), expected.release()}; -} - TYPED_TEST(RowBitCountTyped, Lists) { using T = TypeParam; - auto [col, expected_sizes] = build_list_column(); + auto [col, expected_sizes] = row_bit_count_test::build_list_column(); cudf::table_view t({*col}); auto result = cudf::row_bit_count(t); @@ -272,27 +380,6 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view()); } -std::pair, std::unique_ptr> build_struct_column() -{ - std::vector struct_validity{0, 1, 1, 1, 1, 0}; - std::vector strings{"abc", "def", "", "z", "bananas", "daïs"}; - - cudf::test::fixed_width_column_wrapper col0{0, 1, 2, 3, 4, 5}; - cudf::test::fixed_width_column_wrapper col1{{8, 9, 10, 11, 12, 13}, 
{1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper col2(strings.begin(), strings.end()); - - // creating a struct column will cause all child columns to be promoted to have validity - cudf::test::structs_column_wrapper struct_col({col0, col1, col2}, struct_validity); - - // expect (1 offset (4 bytes) + (length of string if row is valid) + 1 validity bit) + - // (1 float + 1 validity bit) + - // (1 int16_t + 1 validity bit) + - // (1 validity bit) - cudf::test::fixed_width_column_wrapper expected_sizes{84, 108, 84, 92, 140, 84}; - - return {struct_col.release(), expected_sizes.release()}; -} - TEST_F(RowBitCount, StructsNoNulls) { std::vector strings{"abc", "daïs", "", "z", "bananas", "warp"}; @@ -319,7 +406,7 @@ TEST_F(RowBitCount, StructsNoNulls) TEST_F(RowBitCount, StructsNulls) { - auto [struct_col, expected_sizes] = build_struct_column(); + auto [struct_col, expected_sizes] = row_bit_count_test::build_struct_column(); cudf::table_view t({*struct_col}); auto result = cudf::row_bit_count(t); @@ -346,101 +433,18 @@ TEST_F(RowBitCount, StructsNested) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); } -std::unique_ptr build_nested_column1(std::vector const& struct_validity) -{ - // tests the "branching" case -> list ...>>> - - // List, float, int16> - - // Inner list column - cudf::test::lists_column_wrapper list{{1, 2, 3, 4, 5}, - {6, 7, 8}, - {33, 34, 35, 36, 37, 38, 39}, - {-1, -2}, - {-10, -11, -1, -20}, - {40, 41, 42}, - {100, 200, 300}, - {-100, -200, -300}}; - - // floats - std::vector ages{5, 10, 15, 20, 4, 75, 16, -16}; - std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 1}; - auto ages_column = - cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); - - // int16 values - std::vector vals{-1, -2, -3, 1, 2, 3, 8, 9}; - auto i16_column = cudf::test::fixed_width_column_wrapper(vals.begin(), vals.end()); - - // Assemble struct column - auto struct_column = - cudf::test::structs_column_wrapper({list, ages_column, i16_column}, struct_validity); - - // wrap in a list - std::vector outer_offsets{0, 1, 1, 3, 6, 7, 8}; - cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), - outer_offsets.end()); - auto const size = static_cast(outer_offsets_col).size() - 1; - - // Each struct (list child) has size: - // (1 offset (4 bytes) + (list size if row is valid) + 1 validity bit) + - // (1 float + 1 validity bit) + - // (1 int16_t + 1 validity bit) + - // (1 validity bit) - // Each top level list has size: - // 1 offset (4 bytes) + (list size if row is valid). 
- - return cudf::make_lists_column(static_cast(size), - outer_offsets_col.release(), - struct_column.release(), - 0, - rmm::device_buffer{}); -} - -std::unique_ptr build_nested_column2(std::vector const& struct_validity) -{ - // List>, Struct>> - - // Inner list column - // clang-format off - cudf::test::lists_column_wrapper list{ - {{1, 2, 3, 4, 5}, {2, 3}}, - {{6, 7, 8}, {8, 9}}, - {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}}; - // clang-format on - - // Inner struct - std::vector vals{-1, -2, -3}; - auto i16_column = cudf::test::fixed_width_column_wrapper(vals.begin(), vals.end()); - auto inner_struct = cudf::test::structs_column_wrapper({i16_column}); - - // outer struct - auto outer_struct = cudf::test::structs_column_wrapper({list, inner_struct}, struct_validity); - - // wrap in a list - std::vector outer_offsets{0, 1, 1, 3}; - cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), - outer_offsets.end()); - auto const size = static_cast(outer_offsets_col).size() - 1; - return make_lists_column(static_cast(size), - outer_offsets_col.release(), - outer_struct.release(), - 0, - rmm::device_buffer{}); -} - TEST_F(RowBitCount, NestedTypes) { // List, float, List, int16> { - auto const col_no_nulls = build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); + auto const col_no_nulls = row_bit_count_test::build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); auto const expected_sizes_no_nulls = cudf::test::fixed_width_column_wrapper{276, 32, 520, 572, 212, 212} .release(); cudf::table_view no_nulls_t({*col_no_nulls}); auto no_nulls_result = cudf::row_bit_count(no_nulls_t); - auto const col_nulls = build_nested_column1({0, 0, 1, 1, 1, 1, 1, 1}); + auto const col_nulls = row_bit_count_test::build_nested_column1({0, 0, 1, 1, 1, 1, 1, 1}); auto const expected_sizes_with_nulls = cudf::test::fixed_width_column_wrapper{116, 32, 424, 572, 212, 212} .release(); @@ -469,11 +473,11 @@ TEST_F(RowBitCount, NestedTypes) // List>, Struct>> { - auto col_no_nulls = build_nested_column2({1, 1, 1}); + auto col_no_nulls = row_bit_count_test::build_nested_column2({1, 1, 1}); cudf::table_view no_nulls_t({*col_no_nulls}); auto no_nulls_result = cudf::row_bit_count(no_nulls_t); - auto col_nulls = build_nested_column2({1, 0, 1}); + auto col_nulls = row_bit_count_test::build_nested_column2({1, 0, 1}); cudf::table_view nulls_t({*col_nulls}); auto nulls_result = cudf::row_bit_count(nulls_t); @@ -597,15 +601,15 @@ struct sum_functor { TEST_F(RowBitCount, Table) { // complex nested column - auto col0 = build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); + auto col0 = row_bit_count_test::build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); auto col0_sizes = cudf::test::fixed_width_column_wrapper{276, 32, 520, 572, 212, 212}.release(); // struct column - auto [col1, col1_sizes] = build_struct_column(); + auto [col1, col1_sizes] = row_bit_count_test::build_struct_column(); // list column - auto [col2, col2_sizes] = build_list_column(); + auto [col2, col2_sizes] = row_bit_count_test::build_list_column(); cudf::table_view t({*col0, *col1, *col2}); auto result = cudf::row_bit_count(t); diff --git a/cpp/tests/transform/segmented_row_bit_count_test.cu b/cpp/tests/transform/segmented_row_bit_count_test.cu new file mode 100644 index 00000000000..652b9053582 --- /dev/null +++ b/cpp/tests/transform/segmented_row_bit_count_test.cu @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
 *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// Reuse functions defined in `row_bit_count_test.cu`.
+namespace row_bit_count_test {
+template <typename T>
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>> build_list_column();
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>> build_struct_column();
+std::unique_ptr<cudf::column> build_nested_column1(std::vector<bool> const& struct_validity);
+std::unique_ptr<cudf::column> build_nested_column2(std::vector<bool> const& struct_validity);
+}  // namespace row_bit_count_test
+
+namespace {
+
+// Compute row bit count, then sum up sizes for each segment of rows.
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+compute_segmented_row_bit_count(cudf::table_view const& input, cudf::size_type segment_length)
+{
+  // The expected values are computed with the assumption that
+  // the outputs of `cudf::row_bit_count` are correct.
+  // This should be fine as they are verified by their own unit tests in `row_bit_count_test.cu`.
+  auto const row_sizes    = cudf::row_bit_count(input);
+  auto const num_segments = cudf::util::div_rounding_up_safe(row_sizes->size(), segment_length);
+  auto expected =
+    cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_segments);
+
+  thrust::transform(
+    rmm::exec_policy(cudf::get_default_stream()),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(num_segments),
+    expected->mutable_view().begin<cudf::size_type>(),
+    cuda::proclaim_return_type<cudf::size_type>(
+      [segment_length,
+       num_segments,
+       num_rows = row_sizes->size(),
+       d_sizes  = row_sizes->view().begin<cudf::size_type>()] __device__(auto const segment_idx) {
+        // Since the number of rows may not be divisible by segment_length,
+        // the last segment may be shorter than the others.
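+        // For example, with num_rows == 7 and segment_length == 3, the segments
+        // cover rows [0, 3), [3, 6), and [6, 7): the last one sums a single row.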
+ auto const size_begin = d_sizes + segment_idx * segment_length; + auto const size_end = std::min(size_begin + segment_length, d_sizes + num_rows); + return thrust::reduce(thrust::seq, size_begin, size_end); + })); + + auto actual = cudf::segmented_row_bit_count(input, segment_length); + return {std::move(expected), std::move(actual)}; +} + +} // namespace + +struct SegmentedRowBitCount : public cudf::test::BaseFixture {}; + +TEST_F(SegmentedRowBitCount, Lists) +{ + auto const col = std::get<0>(row_bit_count_test::build_list_column()); + auto const input = cudf::table_view({*col}); + + auto constexpr segment_length = 3; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); +} + +TEST_F(SegmentedRowBitCount, StringsWithNulls) +{ + // clang-format off + std::vector const strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" }; + std::vector const valids { 1, 0, 0, 1, 0, 1, 1, 1 }; + // clang-format on + cudf::test::strings_column_wrapper const col(strings.begin(), strings.end(), valids.begin()); + auto const input = cudf::table_view({col}); + + auto constexpr segment_length = 2; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); +} + +TEST_F(SegmentedRowBitCount, StructsWithNulls) +{ + auto const col = std::get<0>(row_bit_count_test::build_struct_column()); + auto const input = cudf::table_view({*col}); + + auto constexpr segment_length = 2; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); +} + +TEST_F(SegmentedRowBitCount, NestedTypes) +{ + auto constexpr segment_length = 2; + + { + // List, float, List, int16> + auto const col = row_bit_count_test::build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); + auto const input = cudf::table_view({*col}); + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + { + // List, float, List, int16> + auto const col = row_bit_count_test::build_nested_column1({0, 0, 1, 1, 1, 1, 1, 1}); + auto const input = cudf::table_view({*col}); + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + + { + // List>, Struct>> + auto const col = row_bit_count_test::build_nested_column2({1, 1, 1}); + auto const input = cudf::table_view({*col}); + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + { + // List>, Struct>> + auto const col = row_bit_count_test::build_nested_column2({1, 0, 1}); + auto const input = cudf::table_view({*col}); + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } +} + +TEST_F(SegmentedRowBitCount, NestedTypesTable) +{ + auto const col0 = row_bit_count_test::build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); + auto const col1 = std::get<0>(row_bit_count_test::build_struct_column()); + auto const col2 = std::get<0>(row_bit_count_test::build_list_column()); + auto const input = cudf::table_view({*col0, *col1, *col2}); + + { + auto const segment_length = 2; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + 
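+  // Also exercise segment lengths that do not evenly divide the 6 input rows.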
+ { + auto const segment_length = 4; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + + { + auto const segment_length = 5; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } +} + +TEST_F(SegmentedRowBitCount, EmptyInput) +{ + { + auto const input = cudf::table_view{}; + { + auto const result = cudf::segmented_row_bit_count(input, 0); + EXPECT_TRUE(result != nullptr && result->size() == 0); + } + { + auto const result = cudf::segmented_row_bit_count(input, 1000); + EXPECT_TRUE(result != nullptr && result->size() == 0); + } + } + + { + auto const strings = cudf::make_empty_column(cudf::type_id::STRING); + auto const ints = cudf::make_empty_column(cudf::type_id::INT32); + auto const input = cudf::table_view{{*strings, *ints}}; + { + auto const result = cudf::segmented_row_bit_count(input, 0); + EXPECT_TRUE(result != nullptr && result->size() == 0); + } + { + auto const result = cudf::segmented_row_bit_count(input, 1000); + EXPECT_TRUE(result != nullptr && result->size() == 0); + } + } +} + +TEST_F(SegmentedRowBitCount, InvalidSegment) +{ + auto const col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, 16); + auto const input = cudf::table_view({*col}); + + EXPECT_NO_THROW(cudf::segmented_row_bit_count(input, 1)); + EXPECT_NO_THROW(cudf::segmented_row_bit_count(input, input.num_rows())); + EXPECT_THROW(cudf::segmented_row_bit_count(input, -1), std::invalid_argument); + EXPECT_THROW(cudf::segmented_row_bit_count(input, 0), std::invalid_argument); + EXPECT_THROW(cudf::segmented_row_bit_count(input, input.num_rows() + 1), std::invalid_argument); + EXPECT_THROW(cudf::segmented_row_bit_count(input, 1000), std::invalid_argument); +} + +TEST_F(SegmentedRowBitCount, EdgeCases) +{ + auto const col0 = row_bit_count_test::build_nested_column1({1, 1, 1, 1, 1, 1, 1, 1}); + auto const col1 = std::get<0>(row_bit_count_test::build_struct_column()); + auto const col2 = std::get<0>(row_bit_count_test::build_list_column()); + auto const input = cudf::table_view({*col0, *col1, *col2}); + + { + auto const segment_length = 1; + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + + { + EXPECT_EQ(input.num_rows(), 6); + auto const segment_length = 4; // input.num_rows()==6, not divisible by segment_length . + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } + + { + auto const segment_length = input.num_rows(); + auto const [expected, actual] = compute_segmented_row_bit_count(input, segment_length); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *actual); + } +} From d3e49f644be2475bffe0ee779c4d171be938b3af Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 1 Mar 2024 16:42:34 -0500 Subject: [PATCH 24/31] Fix includes for row_operators.cuh (#15194) Simple change removes the `cudf/sorting.hpp` include from `row_operators.cuh`. Found this while waiting for recompiles to finish. Changes to `sorting.hpp` seemed to cause more recompiling than expected. 
Also took the opportunity to change the `include ` to `include `

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15194
---
 cpp/include/cudf/table/row_operators.cuh | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 4806f96c934..0e57d24f4b3 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,19 +20,16 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-#include
 #include
-#include
-
 namespace cudf {

 /**
@@ -470,7 +467,9 @@ class element_hasher {
   template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
   {
-    if (has_nulls && col.is_null(row_index)) { return std::numeric_limits<hash_value_type>::max(); }
+    if (has_nulls && col.is_null(row_index)) {
+      return cuda::std::numeric_limits<hash_value_type>::max();
+    }
     return hash_function<T>{}(col.element<T>(row_index));
   }

@@ -554,7 +553,7 @@ class element_hasher_with_seed {
  private:
   uint32_t _seed{DEFAULT_HASH_SEED};
-  hash_value_type _null_hash{std::numeric_limits<hash_value_type>::max()};
+  hash_value_type _null_hash{cuda::std::numeric_limits<hash_value_type>::max()};
   Nullate _has_nulls;
 };

From f911ce8c784e55c4dbfc997fdf67236eb4842e35 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 1 Mar 2024 16:42:52 -0500
Subject: [PATCH 25/31] Change make_strings_children to return uvector (#15171)

Changes the `cudf::strings::detail::make_strings_children` utility to return a
`rmm::device_uvector` instead of a chars column. This further helps enable
large strings support by not storing chars in a column. This is an internal
utility and so is non-breaking for any public APIs.
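As a minimal sketch of the call-site pattern this change produces (a hypothetical
caller; `fn`, `strings_count`, `null_count`, and `null_mask` are placeholders, not
code from this patch):

  auto [offsets_column, chars] =
    cudf::strings::detail::make_strings_children(fn, strings_count, stream, mr);
  // `chars` is now an rmm::device_uvector<char>; release() hands its device
  // buffer straight to the strings column factory, with no intermediate
  // chars column to unwrap.
  return cudf::make_strings_column(strings_count,
                                   std::move(offsets_column),
                                   chars.release(),
                                   null_count,
                                   std::move(null_mask));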
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15171 --- cpp/benchmarks/json/json.cu | 3 +- .../cudf/strings/detail/strings_children.cuh | 11 +++--- cpp/src/io/csv/writer_impl.cu | 4 +-- cpp/src/io/json/write_json.cu | 4 +-- cpp/src/lists/interleave_columns.cu | 4 +-- cpp/src/replace/clamp.cu | 4 +-- cpp/src/strings/capitalize.cu | 4 +-- cpp/src/strings/case.cu | 2 +- cpp/src/strings/char_types/char_types.cu | 4 +-- cpp/src/strings/combine/concatenate.cu | 18 ++++------ cpp/src/strings/combine/join.cu | 35 ++++++++++--------- cpp/src/strings/combine/join_list_elements.cu | 18 ++++------ cpp/src/strings/convert/convert_booleans.cu | 2 +- cpp/src/strings/convert/convert_datetime.cu | 18 +++++----- cpp/src/strings/convert/convert_durations.cu | 2 +- .../strings/convert/convert_fixed_point.cu | 2 +- cpp/src/strings/convert/convert_floats.cu | 2 +- cpp/src/strings/convert/convert_hex.cu | 4 +-- cpp/src/strings/convert/convert_integers.cu | 2 +- cpp/src/strings/convert/convert_ipv4.cu | 6 ++-- cpp/src/strings/convert/convert_lists.cu | 9 ++--- cpp/src/strings/convert/convert_urls.cu | 4 +-- cpp/src/strings/filling/fill.cu | 9 ++--- cpp/src/strings/filter_chars.cu | 4 +-- cpp/src/strings/padding.cu | 8 ++--- cpp/src/strings/regex/utilities.cuh | 5 ++- cpp/src/strings/repeat_strings.cu | 13 +++---- cpp/src/strings/replace/backref_re.cu | 6 ++-- cpp/src/strings/replace/multi.cu | 4 +-- cpp/src/strings/replace/multi_re.cu | 4 +-- cpp/src/strings/replace/replace.cu | 4 +-- cpp/src/strings/replace/replace_re.cu | 4 +-- cpp/src/strings/replace/replace_slice.cu | 4 +-- cpp/src/strings/slice.cu | 2 +- cpp/src/strings/translate.cu | 4 +-- cpp/src/text/detokenize.cu | 9 ++--- cpp/src/text/generate_ngrams.cu | 18 ++++------ cpp/src/text/normalize.cu | 8 ++--- cpp/src/text/replace.cu | 8 ++--- 39 files changed, 123 insertions(+), 153 deletions(-) diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index 020c8e413b3..a54d7d48dc4 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -179,8 +179,7 @@ auto build_json_string_column(int desired_bytes, int num_rows) desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; auto [offsets, chars] = cudf::strings::detail::make_strings_children( jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - return cudf::make_strings_column( - num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {}); + return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {}); } void BM_case(benchmark::State& state, std::string query_arg) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 8e2b6055a5c..49c4be88ca5 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -34,7 +34,7 @@ namespace strings { namespace detail { /** - * @brief Creates child offsets and chars columns by applying the template function that + * @brief Creates child offsets and chars data by applying the template function that * can be used for computing the output size of each string as well as create the output * * @throws std::overflow_error if the output strings column exceeds the column size limit @@ -49,7 +49,7 @@ namespace 
detail { * @param strings_count Number of strings. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned columns' device memory. - * @return offsets child column and chars child column for a strings column + * @return Offsets child column and chars data for a strings column */ template auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, @@ -84,18 +84,17 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, std::overflow_error); // Now build the chars column - std::unique_ptr chars_column = - create_chars_child_column(static_cast(bytes), stream, mr); + rmm::device_uvector chars(bytes, stream, mr); // Execute the function fn again to fill the chars column. // Note that if the output chars column has zero size, the function fn should not be called to // avoid accidentally overwriting the offsets. if (bytes > 0) { - size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); + size_and_exec_fn.d_chars = chars.data(); for_each_fn(size_and_exec_fn); } - return std::pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars)); } /** diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index cedcd97e44e..c143d258448 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -180,12 +180,12 @@ struct column_to_strings_fn { auto d_column = column_device_view::create(column_v, stream_); escape_strings_fn fn{*d_column, delimiter.value(stream_)}; - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); return make_strings_column(column_v.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream_, mr_)); } diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 8c5b309244d..8c3aceeefd4 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -169,12 +169,12 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); return make_strings_column(column_v.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream, mr)); } diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index fe5e1e677ca..478b6c9a209 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -228,7 +228,7 @@ struct interleave_list_entries_impl(data_has_null_mask ? 
num_output_entries : 0, stream); comp_fn.d_validities = validities.data(); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( comp_fn, num_output_lists, num_output_entries, stream, mr); auto [null_mask, null_count] = @@ -236,7 +236,7 @@ struct interleave_list_entries_implrelease().data.release()[0]), + chars.release(), null_count, std::move(null_mask)); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 43358a3b165..3cd1fdd20a2 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -100,12 +100,12 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp auto fn = clamp_strings_fn{ d_input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr}; - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(fn, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr))); } diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 3b99093a89f..3889bd31b4d 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -229,12 +229,12 @@ std::unique_ptr capitalizer(CapitalFn cfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index b3bf0e2a787..8d8930013cf 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -217,7 +217,7 @@ std::unique_ptr convert_case(strings_column_view const& input, cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 9c2a2701227..b8c0dfd27e6 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -200,13 +200,13 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index a48e84eac0c..14f530971f5 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -142,7 +142,7 @@ std::unique_ptr 
concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -156,11 +156,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, stream, mr); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } namespace { @@ -237,7 +234,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -252,11 +249,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, stream, mr); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 0e0d6e437a7..c6290ceb6c2 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -142,28 +142,34 @@ std::unique_ptr join_strings(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto chars_column = [&] { + auto chars = [&] { // build the strings column and commandeer the chars column if ((input.size() == input.null_count()) || ((input.chars_size(stream) / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) { - return std::get<1>( - make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); + return std::get<1>(make_strings_children( + join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)) + .release(); } // dynamically feeds index pairs to build the output auto indices = cudf::detail::make_counting_transform_iterator( 0, join_gather_fn{*d_strings, d_separator, d_narep}); - auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); - return std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); + auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); + auto chars_data = joined_col->release().data; + return std::move(*chars_data); }(); // build the offsets: single string output has offsets [0,chars-size] - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, chars_column->size()}), stream, mr); - auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); + auto 
offsets_column = [&] { + if (chars.size() < static_cast(get_offset64_threshold())) { + auto offsets32 = cudf::detail::make_device_uvector_async( + std::vector({0, static_cast(chars.size())}), stream, mr); + return std::make_unique(std::move(offsets32), rmm::device_buffer{}, 0); + } + auto offsets64 = cudf::detail::make_device_uvector_async( + std::vector({0L, static_cast(chars.size())}), stream, mr); + return std::make_unique(std::move(offsets64), rmm::device_buffer{}, 0); + }(); // build the null mask: only one output row so it is either all-valid or all-null auto const null_count = @@ -173,11 +179,8 @@ std::unique_ptr join_strings(strings_column_view const& input, : rmm::device_buffer{0, stream, mr}; // perhaps this return a string_scalar instead of a single-row column - return make_strings_column(1, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + 1, std::move(offsets_column), std::move(chars), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 619f5feba15..170e621e05c 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -207,7 +207,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if(thrust::counting_iterator(0), thrust::counting_iterator(num_rows), @@ -215,11 +215,8 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + num_rows, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } namespace { @@ -285,7 +282,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if(thrust::counting_iterator(0), thrust::counting_iterator(num_rows), @@ -293,11 +290,8 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + num_rows, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 4fe0be7883f..d1de345a709 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -147,7 +147,7 @@ std::unique_ptr from_booleans(column_view const& booleans, return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), booleans.null_count(), std::move(null_mask)); } diff --git 
a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index b7a662b0b76..f54eb082959 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1099,7 +1099,7 @@ struct datetime_formatter_fn { }; // -using strings_children = std::pair, std::unique_ptr>; +using strings_children = std::pair, rmm::device_uvector>; struct dispatch_from_timestamps_fn { template ()>* = nullptr> strings_children operator()(column_device_view const& d_timestamps, @@ -1148,17 +1148,17 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto const d_timestamps = column_device_view::create(timestamps, stream); // dispatcher is called to handle the different timestamp types - auto [offsets_column, chars_column] = cudf::type_dispatcher(timestamps.type(), - dispatch_from_timestamps_fn(), - *d_timestamps, - *d_names, - d_format_items, - stream, - mr); + auto [offsets_column, chars] = cudf::type_dispatcher(timestamps.type(), + dispatch_from_timestamps_fn(), + *d_timestamps, + *d_names, + d_format_items, + stream, + mr); return make_strings_column(timestamps.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), timestamps.null_count(), cudf::detail::copy_bitmask(timestamps, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 9a58926539c..8076c5c484b 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -422,7 +422,7 @@ struct dispatch_from_durations_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index c59952834d6..fb8ebf55ef1 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -248,7 +248,7 @@ struct dispatch_from_fixed_point_fn { return make_strings_column(input.size(), std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index c56e723de8e..df019ca236a 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -408,7 +408,7 @@ struct dispatch_from_floats_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), floats.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 68cff214507..332bc9837c1 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -180,12 +180,12 @@ struct dispatch_integers_to_hex_fn { { auto const d_column = column_device_view::create(input, stream); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( integer_to_hex_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input, 
stream, mr)); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 364cb534d2f..eb2e9c28134 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -367,7 +367,7 @@ struct dispatch_from_integers_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), integers.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index e07be26a23c..ce7f98067ef 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -165,13 +165,13 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); - auto d_column = column_device_view::create(integers, stream); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto d_column = column_device_view::create(integers, stream); + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr); return make_strings_column(integers.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), integers.null_count(), cudf::detail::copy_bitmask(integers, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index 1f22aea284b..d6c24b6981b 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -216,17 +216,14 @@ std::unique_ptr format_list_column(lists_column_view const& input, auto const d_separators = column_device_view::create(separators.parent(), stream); auto const d_na_rep = na_rep.value(stream); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, input.size(), stream, mr); - return make_strings_column(input.size(), - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - 0, - rmm::device_buffer{}); + return make_strings_column( + input.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index b96c799cf4d..f5aeeb8d130 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -131,12 +131,12 @@ std::unique_ptr url_encode(strings_column_view const& input, auto d_column = column_device_view::create(input.parent(), stream); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( url_encoder_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index d2e3b6f6af3..685c3eec744 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -94,13 +94,10 @@ std::unique_ptr 
fill(strings_column_view const& input, auto const d_str = is_valid ? d_value.value(stream) : string_view{}; auto fn = fill_fn{d_strings, begin, end, d_str}; - auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 7a26fc45dcb..aaaa751c3f9 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -139,12 +139,12 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index ec77aea6338..85d47af87f6 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -112,7 +112,7 @@ std::unique_ptr pad(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto [offsets_column, chars_column] = [&] { + auto [offsets_column, chars] = [&] { if (side == side_type::LEFT) { auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; return make_strings_children(fn, input.size(), stream, mr); @@ -126,7 +126,7 @@ std::unique_ptr pad(strings_column_view const& input, return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -151,12 +151,12 @@ std::unique_ptr zfill(strings_column_view const& input, if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_strings = column_device_view::create(input.parent(), stream); - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index d5dd80aba53..ae8211ac916 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -140,10 +140,9 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, std::overflow_error); // Now build the chars column - std::unique_ptr chars = - create_chars_child_column(static_cast(char_bytes), stream, mr); + rmm::device_uvector chars(char_bytes, stream, mr); if (char_bytes > 0) { - size_and_exec_fn.d_chars = chars->mutable_view().template data(); + size_and_exec_fn.d_chars = chars.data(); for_each_kernel<<>>( size_and_exec_fn, 
d_prog, strings_count); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index b4a770f72bd..690a72c098f 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -160,11 +160,11 @@ std::unique_ptr repeat_strings(strings_column_view const& input, auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()}; - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -240,7 +240,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, input.has_nulls(), repeat_times.has_nulls()}; - auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); // We generate new bitmask by AND of the two input columns' bitmasks. // Note that if either of the input columns are nullable, the output column will also be nullable @@ -248,11 +248,8 @@ std::unique_ptr repeat_strings(strings_column_view const& input, auto [null_mask, null_count] = cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index bb99dc0644c..8e20db18f43 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -125,8 +125,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - using BackRefIterator = decltype(backrefs.begin()); - auto [offsets_column, chars_column] = make_strings_children( + using BackRefIterator = decltype(backrefs.begin()); + auto [offsets_column, chars] = make_strings_children( backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, *d_prog, input.size(), @@ -135,7 +135,7 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index ab35393651f..ffa922d5944 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -453,12 +453,12 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, 
input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index ba122d11e0b..743e5894112 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -185,7 +185,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - auto [offsets_column, chars_column] = make_strings_children( + auto [offsets_column, chars] = make_strings_children( replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, input.size(), stream, @@ -193,7 +193,7 @@ std::unique_ptr replace_re(strings_column_view const& input, return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 2d255e57686..c37c64e348c 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -530,12 +530,12 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 500bc0c5bb5..bded196946f 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -116,12 +116,12 @@ std::unique_ptr replace_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto [offsets_column, chars_column] = make_strings_children( + auto [offsets_column, chars] = make_strings_children( replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu index 4321f78d2d5..041801336e6 100644 --- a/cpp/src/strings/replace/replace_slice.cu +++ b/cpp/src/strings/replace/replace_slice.cu @@ -91,12 +91,12 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, 
strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 1e55986fdb8..98f3c9cae0d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -209,7 +209,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return make_strings_column(strings.size(), std::move(offsets), - std::move(chars->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 039a8ac8a62..a8603f47226 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -110,12 +110,12 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto [offsets_column, chars_column] = make_strings_children( + auto [offsets_column, chars] = make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index a317739e4ca..b9964352c74 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -156,18 +156,15 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::string_view const d_separator(separator.data(), separator.size()); - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column(output_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - 0, - rmm::device_buffer{}); + return cudf::make_strings_column( + output_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index fafb2f18b80..3290b58101d 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -135,15 +135,12 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column(ngrams_count, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - 0, - rmm::device_buffer{}); + return cudf::make_strings_column( + ngrams_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail @@ -235,14 +232,11 @@ std::unique_ptr 
generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()}; - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column(total_ngrams, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - 0, - rmm::device_buffer{}); + return cudf::make_strings_column( + total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } namespace { diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 3d98ae59dc0..c06a24382ed 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -182,12 +182,12 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the normalize_space_fn - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -224,12 +224,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 1fa0606424c..5aed701c037 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -228,13 +228,13 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), std::move(null_mask)); } @@ -261,13 +261,13 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto [offsets_column, chars_column] = + auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return 
cudf::make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + chars.release(), strings.null_count(), std::move(null_mask)); } From e60aad110efcd94003ad78d0f46ac94e531bd1c0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Mar 2024 18:22:33 -0800 Subject: [PATCH 26/31] Implement search using pylibcudf (#15166) Contributes to #15162 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15166 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/search.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/search.pxd | 21 ++++ python/cudf/cudf/_lib/pylibcudf/search.pyx | 116 ++++++++++++++++++ python/cudf/cudf/_lib/search.pyx | 91 +++++--------- 8 files changed, 178 insertions(+), 62 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 73f63ae1343..2e5b3916c65 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. reduce rolling scalar + search stream_compaction sorting replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst new file mode 100644 index 00000000000..aa57bcd9d92 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst @@ -0,0 +1,6 @@ +====== +search +====== + +.. automodule:: cudf._lib.pylibcudf.search + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 68e6765cc49..fd749a5edc1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources replace.pyx rolling.pyx scalar.pyx + search.pyx stream_compaction.pyx sorting.pyx table.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5ef10fb2ffc..96aa42cc257 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -14,6 +14,7 @@ from . 
cimport ( reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -45,6 +46,7 @@ __all__ = [ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 4689c49fdb1..19cc782dd92 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -13,6 +13,7 @@ reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -43,6 +44,7 @@ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/cudf/cudf/_lib/pylibcudf/search.pxd new file mode 100644 index 00000000000..0faf18b108f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pxd @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column contains(Column haystack, Column needles) diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx new file mode 100644 index 00000000000..a186167af13 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.cpp cimport search as cpp_search +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport null_order, order + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find smallest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find largest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. 
+ + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column contains(Column haystack, Column needles): + """Check whether needles are present in haystack. + + Parameters + ---------- + haystack : Column + The search space. + needles : Column + The values for which to search. + + Returns + ------- + Column + Boolean indicator for each needle. + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_search.contains( + haystack.view(), + needles.view(), + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index fef3a08c6d7..1ee73949fd3 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,18 +1,10 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -cimport cudf._lib.cpp.search as cpp_search -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_columns + +from cudf._lib import pylibcudf @acquire_spill_lock() @@ -31,50 +23,31 @@ def search_sorted( If 'left', the index of the first suitable location is given. 
If 'right', return the last such index """ - cdef unique_ptr[column] c_result - cdef vector[libcudf_types.order] c_column_order - cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order c_order - cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_columns(source) - cdef table_view c_values_data = table_view_from_columns(values) - # Note: We are ignoring index columns here - c_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) - c_null_order = ( - libcudf_types.null_order.AFTER - if na_position=="last" - else libcudf_types.null_order.BEFORE + column_order = [ + pylibcudf.types.Order.ASCENDING + if ascending + else pylibcudf.types.Order.DESCENDING + ] * len(source) + null_precedence = [ + pylibcudf.types.NullOrder.AFTER + if na_position == "last" + else pylibcudf.types.NullOrder.BEFORE + ] * len(source) + + func = getattr( + pylibcudf.search, + "lower_bound" if side == "left" else "upper_bound", ) - c_column_order = vector[libcudf_types.order](len(source), c_order) - c_null_precedence = vector[libcudf_types.null_order]( - len(source), c_null_order + return Column.from_pylibcudf( + func( + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), + column_order, + null_precedence, + ) ) - if side == 'left': - with nogil: - c_result = move( - cpp_search.lower_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - elif side == 'right': - with nogil: - c_result = move( - cpp_search.upper_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def contains(Column haystack, Column needles): @@ -87,15 +60,9 @@ def contains(Column haystack, Column needles): needles : A column of values to search for """ - cdef unique_ptr[column] c_result - cdef column_view c_haystack = haystack.view() - cdef column_view c_needles = needles.view() - - with nogil: - c_result = move( - cpp_search.contains( - c_haystack, - c_needles, - ) + return Column.from_pylibcudf( + pylibcudf.search.contains( + haystack.to_pylibcudf(mode="read"), + needles.to_pylibcudf(mode="read"), ) - return Column.from_unique_ptr(move(c_result)) + ) From 8dbe7cb12a752c44ce3027b96fc37ab0b0db923d Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 4 Mar 2024 08:43:02 -0600 Subject: [PATCH 27/31] Disable testChunkedPackTwoPasses for now (#15210) Signed-off-by: Alessandro Bellina Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- java/src/test/java/ai/rapids/cudf/TableTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 6f0b2b51f4c..bee8d1cbb88 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3739,7 +3739,7 @@ void testChunkedPackBasic() { } } } - +/* @Test void testChunkedPackTwoPasses() { // this test packes ~2MB worth of long into a 1MB bounce buffer @@ -3768,6 +3768,7 @@ void testChunkedPackTwoPasses() { } } } +*/ @Test void testContiguousSplitWithStrings() { From 903dcac6a5341c200c4981c7b9d188897164e89c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 4 Mar 2024 08:43:13 -0600 Subject: [PATCH 28/31] Fix accessing .columns issue (#15212) --- python/cudf/cudf/_lib/utils.pyx | 4 +- 
python/cudf/cudf/core/indexed_frame.py | 7 ++- python/cudf/cudf/tests/test_dataframe.py | 55 ++++++++++++------------ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 896cc55b425..b6637e9df08 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -149,7 +149,9 @@ cpdef generate_pandas_metadata(table, index): col for col in table._columns ], - df=table, + # It is OKAY to do `.head(0).to_pandas()` because + # this method will extract `.columns` metadata only + df=table.head(0).to_pandas(), column_names=col_names, index_levels=index_levels, index_descriptors=index_descriptors, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8e43000d0a8..3c6e1e17142 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2872,6 +2872,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: self._column_names, None if has_range_index or not keep_index else self._index.names, ) + result._data.label_dtype = self._data.label_dtype + result._data.rangeindex = self._data.rangeindex if keep_index and has_range_index: result.index = self.index[start:stop] @@ -3053,7 +3055,7 @@ def duplicated(self, subset=None, keep="first"): @_cudf_nvtx_annotate def _empty_like(self, keep_index=True) -> Self: - return self._from_columns_like_self( + result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ *(self._index._data.columns if keep_index else ()), @@ -3063,6 +3065,9 @@ def _empty_like(self, keep_index=True) -> Self: self._column_names, self._index.names if keep_index else None, ) + result._data.label_dtype = self._data.label_dtype + result._data.rangeindex = self._data.rangeindex + return result def _split(self, splits, keep_index=True): if self._num_rows == 0: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2084db89909..50b14d532e4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3012,43 +3012,31 @@ def test_series_rename(): @pytest.mark.parametrize("data_type", dtypes) @pytest.mark.parametrize("nelem", [0, 100]) def test_head_tail(nelem, data_type): - def check_index_equality(left, right): - assert left.index.equals(right.index) - - def check_values_equality(left, right): - if len(left) == 0 and len(right) == 0: - return None - - np.testing.assert_array_equal(left.to_pandas(), right.to_pandas()) - - def check_frame_series_equality(left, right): - check_index_equality(left, right) - check_values_equality(left, right) - - gdf = cudf.DataFrame( + pdf = pd.DataFrame( { "a": np.random.randint(0, 1000, nelem).astype(data_type), "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) + gdf = cudf.from_pandas(pdf) - check_frame_series_equality(gdf.head(), gdf[:5]) - check_frame_series_equality(gdf.head(3), gdf[:3]) - check_frame_series_equality(gdf.head(-2), gdf[:-2]) - check_frame_series_equality(gdf.head(0), gdf[0:0]) + assert_eq(gdf.head(), pdf.head()) + assert_eq(gdf.head(3), pdf.head(3)) + assert_eq(gdf.head(-2), pdf.head(-2)) + assert_eq(gdf.head(0), pdf.head(0)) - check_frame_series_equality(gdf["a"].head(), gdf["a"][:5]) - check_frame_series_equality(gdf["a"].head(3), gdf["a"][:3]) - check_frame_series_equality(gdf["a"].head(-2), gdf["a"][:-2]) + assert_eq(gdf["a"].head(), pdf["a"].head()) + assert_eq(gdf["a"].head(3), pdf["a"].head(3)) + assert_eq(gdf["a"].head(-2), 
pdf["a"].head(-2)) - check_frame_series_equality(gdf.tail(), gdf[-5:]) - check_frame_series_equality(gdf.tail(3), gdf[-3:]) - check_frame_series_equality(gdf.tail(-2), gdf[2:]) - check_frame_series_equality(gdf.tail(0), gdf[0:0]) + assert_eq(gdf.tail(), pdf.tail()) + assert_eq(gdf.tail(3), pdf.tail(3)) + assert_eq(gdf.tail(-2), pdf.tail(-2)) + assert_eq(gdf.tail(0), pdf.tail(0)) - check_frame_series_equality(gdf["a"].tail(), gdf["a"][-5:]) - check_frame_series_equality(gdf["a"].tail(3), gdf["a"][-3:]) - check_frame_series_equality(gdf["a"].tail(-2), gdf["a"][2:]) + assert_eq(gdf["a"].tail(), pdf["a"].tail()) + assert_eq(gdf["a"].tail(3), pdf["a"].tail(3)) + assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2)) def test_tail_for_string(): @@ -4328,6 +4316,17 @@ def test_one_row_head(): assert_eq(head_pdf, head_gdf) +@pytest.mark.parametrize("index", [None, [123], ["a", "b"]]) +def test_no_cols_head(index): + pdf = pd.DataFrame(index=index) + gdf = cudf.from_pandas(pdf) + + head_gdf = gdf.head() + head_pdf = pdf.head() + + assert_eq(head_pdf, head_gdf) + + @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize( "np_dtype,pd_dtype", From dbdcc31fe1cbe902d495428da3c68dc59d289dc5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Mar 2024 18:22:49 +0000 Subject: [PATCH 29/31] Expose new stable_sort and finish stream_compaction in pylibcudf (#15175) Completes coverage of `sorting.hpp` and `stream_compaction.hpp` Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15175 --- python/cudf/cudf/_lib/cpp/sorting.pxd | 7 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 43 +++- python/cudf/cudf/_lib/pylibcudf/sorting.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/sorting.pyx | 39 +++- .../cudf/_lib/pylibcudf/stream_compaction.pxd | 34 +++- .../cudf/_lib/pylibcudf/stream_compaction.pyx | 185 ++++++++++++++++-- python/cudf/cudf/_lib/stream_compaction.pyx | 1 + 7 files changed, 275 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 68f01003fe6..86dc0f0de95 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + + cdef unique_ptr[table] stable_sort( + table_view source_table, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index e8539ecb9c3..55854a9444f 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: vector[size_type] keys, size_type keep_threshold) except + + cdef unique_ptr[table] drop_nans(table_view source_table, + vector[size_type] keys, + size_type keep_threshold) except + + cdef unique_ptr[table] apply_boolean_mask( table_view source_table, column_view boolean_mask ) except + - cdef size_type distinct_count( - column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] unique( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + ) except + - cdef unique_ptr[table] stable_distinct( + cdef unique_ptr[table] distinct( table_view input, vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equals, ) except + cdef unique_ptr[column] distinct_indices( @@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: null_equality nulls_equal, nan_equality nans_equal, ) except + + + cdef unique_ptr[table] stable_distinct( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + ) except + + + cdef size_type unique_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type unique_count( + table_view source_table, + null_policy null_handling) except + + + cdef size_type distinct_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type distinct_count( + table_view source_table, + null_policy null_handling) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd index fb22da0b0fd..3ed241622c0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd @@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key( ) cpdef Table sort(Table source_table, list column_order, list null_precedence) + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence) diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx index 4e73760720a..1668a3efc7c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx @@ -50,7 +50,8 @@ cpdef Column stable_sorted_order( list column_order, list null_precedence, ): - """Computes the row indices required to sort the table, maintaining input order. + """Computes the row indices required to sort the table, + preserving order of equal elements. 
Parameters ---------- @@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, within segments, maintaining input order. + """Sorts the table by key preserving order of equal elements, + within segments. Parameters ---------- @@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, maintaining input order. + """Sorts the table by key preserving order of equal elements. Parameters ---------- @@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): ) ) return Table.from_libcudf(move(c_result)) + + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): + """Sorts the table preserving order of equal elements. + + Parameters + ---------- + source_table : Table + The table to sort. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Table + The sorted table. + """ + cdef unique_ptr[table] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, + ) + ) + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd index 78adb20021c..29acc21fc05 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd @@ -15,19 +15,21 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) -cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ) cpdef Column distinct_indices( @@ -36,3 +38,23 @@ cpdef Column distinct_indices( null_equality nulls_equal, nan_equality nans_equal, ) + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +) + +cpdef size_type unique_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) + +cpdef size_type distinct_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx index 0357866980a..af7a85d31bf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx @@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): return Table.from_libcudf(move(c_result)) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): + """Filters out rows from the input table based on the presence of NaNs. 
+ + Parameters + ---------- + source_table : Table + The input table to filter. + keys : List[size_type] + The list of column indexes to consider for NaN filtering. + keep_threshold : size_type + The minimum number of non-NaNs required to keep a row. + + Returns + ------- + Table + A new table with rows removed based on NaNs. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.drop_nans( + source_table.view(), c_keys, keep_threshold + ) + ) + return Table.from_libcudf(move(c_result)) + + cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. @@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): return Table.from_libcudf(move(c_result)) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ): - """Returns the number of unique elements in the input column. + """Filter duplicate consecutive rows from the input table. Parameters ---------- - source_table : Column - The input column to count the unique elements of. - null_handling : null_policy - Flag to include or exclude nulls from the count. - nan_handling : nan_policy - Flag to include or exclude NaNs from the count. + input : Table + The input table to filter + keys : list[int] + The list of column indexes to consider for filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. Returns ------- - size_type - The number of unique elements in the input column. + Table + New Table with unique rows from each sequence of equivalent rows + as specified by keep. In the same order as the input table. + + Notes + ----- + If the input columns to be filtered on are sorted, then + unique can produce the same result as stable_distinct, but faster. """ - return cpp_stream_compaction.distinct_count( - source_table.view(), null_handling, nan_handling - ) + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal + ) + ) + return Table.from_libcudf(move(c_result)) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ): - """Get the distinct rows from the input table, preserving input order. + """Get the distinct rows from the input table. Parameters ---------- @@ -120,18 +164,21 @@ cpdef Table stable_distinct( The option to specify which rows to keep in the case of duplicates. nulls_equal : null_equality The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. Returns ------- Table - A new table with distinct rows from the input table. + A new table with distinct rows from the input table. The + output will not necessarily be in the same order as the input. 
""" cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal + cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) ) return Table.from_libcudf(move(c_result)) @@ -169,3 +216,99 @@ cpdef Column distinct_indices( ) ) return Column.from_libcudf(move(c_result)) + + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +): + """Get the distinct rows from the input table, preserving input order. + + Parameters + ---------- + input : Table + The input table to filter. + keys : list + The list of column indexes to consider for distinct filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. + + Returns + ------- + Table + A new table with distinct rows from the input table, preserving + the input table order. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal + ) + ) + return Table.from_libcudf(move(c_result)) + + +cpdef size_type unique_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of unique consecutive elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of unique consecutive elements in the input column. + + Notes + ----- + If the input column is sorted, then unique_count can produce the + same result as distinct_count, but faster. + """ + return cpp_stream_compaction.unique_count( + source.view(), null_handling, nan_handling + ) + + +cpdef size_type distinct_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of distinct elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of distinct elements in the input column. 
+ """ + return cpp_stream_compaction.distinct_count( + source.view(), null_handling, nan_handling + ) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 04883eac559..834f91f48d9 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -109,6 +109,7 @@ def drop_duplicates(list columns, keep_option, pylibcudf.types.NullEquality.EQUAL if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL, + pylibcudf.types.NanEquality.ALL_EQUAL, ) ) From da113015aade79d78628d00578dff22a4dd5cf35 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 4 Mar 2024 13:17:33 -0600 Subject: [PATCH 30/31] Switch `pytest-xdist` algo to `worksteal` (#15207) This PR switches the `pytest-xdist` distribution algorithm to the much more efficient `worksteal`, which assigns any idle pytest worker to pick up the remaining tests. I see a 25% time savings when this switch is made locally: ``` `loadscope`: == 101421 passed, 2115 skipped, 867 xfailed in 1179.48s (0:19:39) == `worksteal`: == 101423 passed, 2115 skipped, 867 xfailed in 891.79s (0:14:51) == ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15207 --- ci/test_python_cudf.sh | 6 +++--- ci/test_python_other.sh | 4 ++-- ci/test_wheel_cudf.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ace71bb0b75..bacb54b3896 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -18,7 +18,7 @@ rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-coverage.xml" \ @@ -32,7 +32,7 @@ rapids-logger "pytest cudf" rapids-logger "pytest for cudf benchmarks" ./ci/run_cudf_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-coverage.xml" \ @@ -41,7 +41,7 @@ rapids-logger "pytest for cudf benchmarks using pandas" ./ci/run_cudf_pandas_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-pandas-coverage.xml" \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index bc15747b26a..9cdceb295db 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -23,7 +23,7 @@ rapids-logger "pytest dask_cudf" ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=dask_cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ @@ -33,7 +33,7 @@ rapids-logger "pytest custreamz" ./ci/run_custreamz_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=custreamz \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index b7e8f862ed5..af5779f478a 100755 --- a/ci/test_wheel_cudf.sh +++ 
b/ci/test_wheel_cudf.sh @@ -37,7 +37,7 @@ else --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . popd fi From 0ff5a2c59cb62d6b3c473885ebbe883d1aae8c4f Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 4 Mar 2024 15:20:32 -0500 Subject: [PATCH 31/31] Replace local copyright check with pre-commit-hooks verify-copyright (#14917) The local `copyright.py` script is bug-prone. Replace it with a more robust centralized script from `pre-commit-hooks`. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) - Karthikeyan (https://github.com/karthikeyann) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14917 --- .pre-commit-config.yaml | 13 +- ci/checks/copyright.py | 277 ---------------------------------------- 2 files changed, 7 insertions(+), 283 deletions(-) delete mode 100644 ci/checks/copyright.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d302543368e..9235c80bdc9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -126,12 +126,6 @@ repos: - cmakelang==0.6.13 verbose: true require_serial: true - - id: copyright-check - name: copyright-check - entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year - language: python - pass_filenames: false - additional_dependencies: [gitpython] - id: doxygen-check name: doxygen-check entry: ./ci/checks/doxygen.sh @@ -161,6 +155,13 @@ repos: hooks: - id: ruff files: python/.*$ + - repo: https://github.com/rapidsai/pre-commit-hooks + rev: v0.0.1 + hooks: + - id: verify-copyright + exclude: | + (?x) + cpp/include/cudf_test/cxxopts[.]hpp$ default_language_version: diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py deleted file mode 100644 index dd89b092496..00000000000 --- a/ci/checks/copyright.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import datetime -import os -import re -import sys - -import git - -FilesToCheck = [ - re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"), - re.compile(r"CMakeLists[.]txt$"), - re.compile(r"CMakeLists_standalone[.]txt$"), - re.compile(r"setup[.]cfg$"), - re.compile(r"meta[.]yaml$"), -] -ExemptFiles = [ - re.compile(r"cpp/include/cudf_test/cxxopts.hpp"), -] - -# this will break starting at year 10000, which is probably OK :) -CheckSimple = re.compile( - r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" -) -CheckDouble = re.compile( - r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? 
*NVIDIA C(?:ORPORATION|orporation)" # noqa: E501 -) - - -def checkThisFile(f): - if isinstance(f, git.Diff): - if f.deleted_file or f.b_blob.size == 0: - return False - f = f.b_path - elif not os.path.exists(f) or os.stat(f).st_size == 0: - # This check covers things like symlinks which point to files that DNE - return False - for exempt in ExemptFiles: - if exempt.search(f): - return False - for checker in FilesToCheck: - if checker.search(f): - return True - return False - - -def modifiedFiles(): - """Get a set of all modified files, as Diff objects. - - The files returned have been modified in git since the merge base of HEAD - and the upstream of the target branch. We return the Diff objects so that - we can read only the staged changes. - """ - repo = git.Repo() - # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible - target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH")) - if target_branch is None: - # Fall back to the closest branch if not on CI - target_branch = repo.git.describe( - all=True, tags=True, match="branch-*", abbrev=0 - ).lstrip("heads/") - - upstream_target_branch = None - if target_branch in repo.heads: - # Use the tracking branch of the local reference if it exists. This - # returns None if no tracking branch is set. - upstream_target_branch = repo.heads[target_branch].tracking_branch() - if upstream_target_branch is None: - # Fall back to the remote with the newest target_branch. This code - # path is used on CI because the only local branch reference is - # current-pr-branch, and thus target_branch is not in repo.heads. - # This also happens if no tracking branch is defined for the local - # target_branch. We use the remote with the latest commit if - # multiple remotes are defined. - candidate_branches = [ - remote.refs[target_branch] for remote in repo.remotes - if target_branch in remote.refs - ] - if len(candidate_branches) > 0: - upstream_target_branch = sorted( - candidate_branches, - key=lambda branch: branch.commit.committed_datetime, - )[-1] - else: - # If no remotes are defined, try to use the local version of the - # target_branch. If this fails, the repo configuration must be very - # strange and we can fix this script on a case-by-case basis. - upstream_target_branch = repo.heads[target_branch] - merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] - diff = merge_base.diff() - changed_files = {f for f in diff if f.b_path is not None} - return changed_files - - -def getCopyrightYears(line): - res = CheckSimple.search(line) - if res: - return int(res.group(1)), int(res.group(1)) - res = CheckDouble.search(line) - if res: - return int(res.group(1)), int(res.group(2)) - return None, None - - -def replaceCurrentYear(line, start, end): - # first turn a simple regex into double (if applicable). 
then update years - res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line) - res = CheckDouble.sub( - rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION", - res, - ) - return res - - -def checkCopyright(f, update_current_year): - """Checks for copyright headers and their years.""" - errs = [] - thisYear = datetime.datetime.now().year - lineNum = 0 - crFound = False - yearMatched = False - - if isinstance(f, git.Diff): - path = f.b_path - lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True) - else: - path = f - with open(f, encoding="utf-8") as fp: - lines = fp.readlines() - - for line in lines: - lineNum += 1 - start, end = getCopyrightYears(line) - if start is None: - continue - crFound = True - if start > end: - e = [ - path, - lineNum, - "First year after second year in the copyright " - "header (manual fix required)", - None, - ] - errs.append(e) - elif thisYear < start or thisYear > end: - e = [ - path, - lineNum, - "Current year not included in the copyright header", - None, - ] - if thisYear < start: - e[-1] = replaceCurrentYear(line, thisYear, end) - if thisYear > end: - e[-1] = replaceCurrentYear(line, start, thisYear) - errs.append(e) - else: - yearMatched = True - # copyright header itself not found - if not crFound: - e = [ - path, - 0, - "Copyright header missing or formatted incorrectly " - "(manual fix required)", - None, - ] - errs.append(e) - # even if the year matches a copyright header, make the check pass - if yearMatched: - errs = [] - - if update_current_year: - errs_update = [x for x in errs if x[-1] is not None] - if len(errs_update) > 0: - lines_changed = ", ".join(str(x[1]) for x in errs_update) - print(f"File: {path}. Changing line(s) {lines_changed}") - for _, lineNum, __, replacement in errs_update: - lines[lineNum - 1] = replacement - with open(path, "w", encoding="utf-8") as out_file: - out_file.writelines(lines) - - return errs - - -def getAllFilesUnderDir(root, pathFilter=None): - retList = [] - for dirpath, dirnames, filenames in os.walk(root): - for fn in filenames: - filePath = os.path.join(dirpath, fn) - if pathFilter(filePath): - retList.append(filePath) - return retList - - -def checkCopyright_main(): - """ - Checks for copyright headers in all the modified files. 
In case of local - repo, this script will just look for uncommitted files and in case of CI - it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch" - """ - retVal = 0 - - argparser = argparse.ArgumentParser( - "Checks for a consistent copyright header in git's modified files" - ) - argparser.add_argument( - "--update-current-year", - dest="update_current_year", - action="store_true", - required=False, - help="If set, " - "update the current year if a header is already " - "present and well formatted.", - ) - argparser.add_argument( - "--git-modified-only", - dest="git_modified_only", - action="store_true", - required=False, - help="If set, " - "only files seen as modified by git will be " - "processed.", - ) - - args, dirs = argparser.parse_known_args() - - if args.git_modified_only: - files = [f for f in modifiedFiles() if checkThisFile(f)] - else: - files = [] - for d in [os.path.abspath(d) for d in dirs]: - if not os.path.isdir(d): - raise ValueError(f"{d} is not a directory.") - files += getAllFilesUnderDir(d, pathFilter=checkThisFile) - - errors = [] - for f in files: - errors += checkCopyright(f, args.update_current_year) - - if len(errors) > 0: - if any(e[-1] is None for e in errors): - print("Copyright headers incomplete in some of the files!") - for e in errors: - print(" %s:%d Issue: %s" % (e[0], e[1], e[2])) - print("") - n_fixable = sum(1 for e in errors if e[-1] is not None) - path_parts = os.path.abspath(__file__).split(os.sep) - file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :]) - if n_fixable > 0 and not args.update_current_year: - print( - f"You can run `python {file_from_repo} --git-modified-only " - "--update-current-year` and stage the results in git to " - f"fix {n_fixable} of these errors.\n" - ) - retVal = 1 - - return retVal - - -if __name__ == "__main__": - sys.exit(checkCopyright_main())
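
For readers following the pylibcudf migration in this series (PATCH 26 and PATCH 29 above), the short sketch below shows how the new `pylibcudf.search` bindings can be exercised directly from Python, in the same way the rewritten `search_sorted` wrapper drives them. This is an illustrative sketch only, not part of the patches: the `Series._column` attribute used to unwrap a cudf column and the `values_host` accessor used to read the result back are assumed cudf-internal helpers.

```
# Minimal sketch of driving pylibcudf.search.lower_bound, mirroring the
# updated python/cudf/cudf/_lib/search.pyx wrapper in PATCH 26.
# Assumptions (not shown in the patches): `Series._column` unwraps the
# internal Column, and `values_host` copies the result to a NumPy array.
import cudf
from cudf._lib import pylibcudf
from cudf._lib.column import Column

haystack = cudf.Series([10, 20, 30, 40])  # sorted search space
needles = cudf.Series([15, 30])           # values to locate

result = pylibcudf.search.lower_bound(
    pylibcudf.Table([haystack._column.to_pylibcudf(mode="read")]),
    pylibcudf.Table([needles._column.to_pylibcudf(mode="read")]),
    [pylibcudf.types.Order.ASCENDING],  # one entry per key column
    [pylibcudf.types.NullOrder.AFTER],  # nulls sort last
)

# Rewrap the libcudf result as a cudf Column, as the wrapper does.
print(Column.from_pylibcudf(result).values_host)  # -> [1 2]
```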