diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7d785ad65..6773e68a14 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v14.0.6 + rev: v16.0.1 hooks: - id: clang-format files: \.(cu|cuh|h|hpp|cpp|inl)$ diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp index a231775d01..d94f9d26a0 100644 --- a/src/main/cpp/benchmarks/cast_string_to_float.cpp +++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp @@ -27,16 +27,18 @@ void string_to_float(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; - auto const float_tbl = create_random_table({cudf::type_id::FLOAT32}, row_count{n_rows}); - auto const float_col = float_tbl->get_column(0); + auto const float_tbl = create_random_table({cudf::type_id::FLOAT32}, row_count{n_rows}); + auto const float_col = float_tbl->get_column(0); auto const string_col = cudf::strings::from_floats(float_col.view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { - auto rows = spark_rapids_jni::string_to_float(cudf::data_type{cudf::type_id::FLOAT32}, string_col->view(), false, cudf::get_default_stream()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto rows = spark_rapids_jni::string_to_float(cudf::data_type{cudf::type_id::FLOAT32}, + string_col->view(), + false, + cudf::get_default_stream()); }); } NVBENCH_BENCH(string_to_float) - .set_name("Strings to Float Cast") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 100 * 1024 * 1024}); + .set_name("Strings to Float Cast") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 100 * 1024 * 1024}); diff --git a/src/main/cpp/benchmarks/common/generate_input.hpp b/src/main/cpp/benchmarks/common/generate_input.hpp index a5be50d3f9..207ad00200 100644 --- a/src/main/cpp/benchmarks/common/generate_input.hpp +++ b/src/main/cpp/benchmarks/common/generate_input.hpp @@ -183,8 +183,7 @@ struct distribution_params -struct distribution_params()>> { -}; +struct distribution_params()>> {}; /** * @brief Returns a vector of types, corresponding to the input type or a type group. 
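[Editorial note: the benchmark files touched above all follow the same nvbench registration pattern; the sketch below is illustrative only — `example_bench`, its body, and the axis value are placeholders and not part of this change.]

#include <nvbench/nvbench.cuh>

// Minimal sketch of the pattern: the benchmark reads its axis values from
// `state`, runs the timed region inside state.exec(), and is registered with
// NVBENCH_BENCH plus the axes it sweeps over.
static void example_bench(nvbench::state& state)
{
  auto const n_rows = state.get_int64("num_rows");  // one value per axis entry
  state.exec(nvbench::exec_tag::sync,  // sync: the body does its own synchronization
             [&](nvbench::launch& launch) {
               // ... work under measurement, sized by n_rows ...
             });
}
NVBENCH_BENCH(example_bench)
  .set_name("Example Benchmark")
  .add_int64_axis("num_rows", {1 * 1024 * 1024});
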
diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index 46ce39a7aa..c625342867 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -28,15 +28,15 @@ void fixed_width(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; auto const direction = state.get_string("direction"); - auto const table = create_random_table(cycle_dtypes({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, + auto const table = create_random_table(cycle_dtypes({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, 212), row_count{n_rows}); @@ -50,16 +50,15 @@ void fixed_width(nvbench::state& state) auto rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { - if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); - } else { - for (auto const &r : rows) { - cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (direction == "to row") { + auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + } else { + for (auto const& r : rows) { + cudf::lists_column_view const l(r->view()); + auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); } + } }); state.add_buffer_size(n_rows, "trc", "Total Rows"); @@ -69,7 +68,7 @@ void fixed_width(nvbench::state& state) static void variable_or_fixed_width(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; - auto const direction = state.get_string("direction"); + auto const direction = state.get_string("direction"); auto const include_strings = state.get_string("strings"); if (n_rows > 1 * 1024 * 1024 && include_strings == "include strings") { @@ -120,17 +119,16 @@ static void variable_or_fixed_width(nvbench::state& state) auto rows = spark_rapids_jni::convert_to_rows(table->view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto new_rows = spark_rapids_jni::convert_to_rows(table->view()); - if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows(table->view()); - } else { - for (auto const &r : rows) { - cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows(l, schema); - } + if (direction == "to row") { + auto _rows = spark_rapids_jni::convert_to_rows(table->view()); + } else { + for (auto const& r : rows) { + cudf::lists_column_view const l(r->view()); + auto out = spark_rapids_jni::convert_from_rows(l, schema); } + } }); state.add_buffer_size(n_rows, "trc", "Total Rows"); @@ -138,12 +136,12 @@ static void variable_or_fixed_width(nvbench::state& state) } NVBENCH_BENCH(fixed_width) - .set_name("Fixed Width Only") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) - .add_string_axis("direction", {"to row", "from row"}); + .set_name("Fixed Width 
Only") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) + .add_string_axis("direction", {"to row", "from row"}); NVBENCH_BENCH(variable_or_fixed_width) - .set_name("Fixed or Variable Width") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) - .add_string_axis("direction", {"to row", "from row"}) - .add_string_axis("strings", {"include strings", "no strings"}); + .set_name("Fixed or Variable Width") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) + .add_string_axis("direction", {"to row", "from row"}) + .add_string_axis("strings", {"include strings", "no strings"}); diff --git a/src/main/cpp/src/DecimalUtilsJni.cpp b/src/main/cpp/src/DecimalUtilsJni.cpp index 25045aa94e..f732276817 100644 --- a/src/main/cpp/src/DecimalUtilsJni.cpp +++ b/src/main/cpp/src/DecimalUtilsJni.cpp @@ -14,98 +14,95 @@ * limitations under the License. */ -#include "decimal_utils.hpp" #include "cudf_jni_apis.hpp" +#include "decimal_utils.hpp" extern "C" { -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_product_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_product_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_product_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::multiply_decimal128(*view_a, *view_b, - scale)); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_product_scale); + return cudf::jni::convert_table_for_return( + env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_divide128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_quotient_scale, - jboolean j_is_int_div) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_divide128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_quotient_scale, jboolean j_is_int_div) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_quotient_scale); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_quotient_scale); auto is_int_division = static_cast(j_is_int_div); if (is_int_division) { - return cudf::jni::convert_table_for_return(env, cudf::jni::integer_divide_decimal128(*view_a, *view_b, scale)); + return cudf::jni::convert_table_for_return( + env, cudf::jni::integer_divide_decimal128(*view_a, *view_b, scale)); } else { - return cudf::jni::convert_table_for_return(env, cudf::jni::divide_decimal128(*view_a, *view_b, scale)); + return cudf::jni::convert_table_for_return( + env, cudf::jni::divide_decimal128(*view_a, *view_b, scale)); } } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_remainder128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_remainder_scale) { +JNIEXPORT jlongArray JNICALL 
Java_com_nvidia_spark_rapids_jni_DecimalUtils_remainder128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_remainder_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_remainder_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::remainder_decimal128(*view_a, *view_b, scale)); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_remainder_scale); + return cudf::jni::convert_table_for_return( + env, cudf::jni::remainder_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_add128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_target_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_add128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_target_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto const view_a= reinterpret_cast(j_view_a); - auto const view_b= reinterpret_cast(j_view_b); - auto const scale = static_cast(j_target_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::add_decimal128(*view_a, *view_b, - scale)); + auto const view_a = reinterpret_cast(j_view_a); + auto const view_b = reinterpret_cast(j_view_b); + auto const scale = static_cast(j_target_scale); + return cudf::jni::convert_table_for_return(env, + cudf::jni::add_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_subtract128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_target_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_subtract128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_target_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto const view_a = reinterpret_cast(j_view_a); - auto const view_b = reinterpret_cast(j_view_b); - auto const scale = static_cast(j_target_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::sub_decimal128(*view_a, *view_b, - scale)); + auto const view_a = reinterpret_cast(j_view_a); + auto const view_b = reinterpret_cast(j_view_b); + auto const scale = static_cast(j_target_scale); + return cudf::jni::convert_table_for_return(env, + cudf::jni::sub_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -} // extern "C" +} // extern "C" diff --git a/src/main/cpp/src/MapUtilsJni.cpp b/src/main/cpp/src/MapUtilsJni.cpp index fbbcdd889f..dc02d04370 100644 --- a/src/main/cpp/src/MapUtilsJni.cpp +++ b/src/main/cpp/src/MapUtilsJni.cpp @@ -22,12 +22,13 @@ extern "C" { JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_extractRawMapFromJsonString( - JNIEnv *env, jclass, jlong input_handle) { + JNIEnv* env, jclass, jlong input_handle) +{ JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); + auto const input = reinterpret_cast(input_handle); return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json(*input).release()); 
} CATCH_STD(env, 0); diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 5d51857dcb..c6d90be0cc 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include +#include #include +#include #include #include -#include // TCompactProtocol requires some #defines to work right. // This came from the parquet code itself... -#define SIGNED_RIGHT_SHIFT_IS 1 +#define SIGNED_RIGHT_SHIFT_IS 1 #define ARITHMETIC_RIGHT_SHIFT 1 #include #include @@ -42,17 +42,16 @@ namespace jni { * and may not produce the exact same result as the JVM does. This is probably good enough * for now. */ -std::string unicode_to_lower(std::string const& input) { +std::string unicode_to_lower(std::string const& input) +{ std::mbstate_t to_wc_state = std::mbstate_t(); - const char * mbstr = input.data(); + const char* mbstr = input.data(); // get the size of the wide character result std::size_t wide_size = std::mbsrtowcs(nullptr, &mbstr, 0, &to_wc_state); - if (wide_size < 0) { - throw std::invalid_argument("invalid character sequence"); - } + if (wide_size < 0) { throw std::invalid_argument("invalid character sequence"); } std::vector wide(wide_size + 1); - // Set a null so we can get a proper output size from wcstombs. This is because + // Set a null so we can get a proper output size from wcstombs. This is because // we pass in a max length of 0, so it will only stop when it see the null character. wide.back() = 0; if (std::mbsrtowcs(wide.data(), &mbstr, wide_size, &to_wc_state) != wide_size) { @@ -63,11 +62,9 @@ std::string unicode_to_lower(std::string const& input) { } // Get the multi-byte result size std::mbstate_t from_wc_state = std::mbstate_t(); - const wchar_t * wcstr = wide.data(); - std::size_t mb_size = std::wcsrtombs(nullptr, &wcstr, 0, &from_wc_state); - if (mb_size < 0) { - throw std::invalid_argument("unsupported wide character sequence"); - } + const wchar_t* wcstr = wide.data(); + std::size_t mb_size = std::wcsrtombs(nullptr, &wcstr, 0, &from_wc_state); + if (mb_size < 0) { throw std::invalid_argument("unsupported wide character sequence"); } // We are allocating a fixed size string so we can put the data directly into it // instead of going through a NUL terminated char* first. The NUL fill char is // just because we need to pass in a fill char. The value does not matter @@ -102,351 +99,451 @@ struct column_pruning_maps { * lets us match the Spark schema to the schema in the Parquet file. Different * versions of parquet had different layouts for various nested types. */ -enum class Tag { - VALUE = 0, - STRUCT, - LIST, - MAP -}; +enum class Tag { VALUE = 0, STRUCT, LIST, MAP }; /** * This class will handle processing column pruning for a schema. It is written as a class because - * of JNI we are sending the names of the columns as a depth first list, like parquet does internally. + * of JNI we are sending the names of the columns as a depth first list, like parquet does + * internally. */ class column_pruner { -public: - /** - * Create pruning filter from a depth first flattened tree of names and num_children. - * The root entry is not included in names or in num_children, but parent_num_children - * should hold how many entries there are in it. 
- */ - column_pruner(std::vector const & names, - std::vector const & num_children, - std::vector const & tags, - int const parent_num_children): children(), tag(Tag::STRUCT) { - add_depth_first(names, num_children, tags, parent_num_children); - } + public: + /** + * Create pruning filter from a depth first flattened tree of names and num_children. + * The root entry is not included in names or in num_children, but parent_num_children + * should hold how many entries there are in it. + */ + column_pruner(std::vector const& names, + std::vector const& num_children, + std::vector const& tags, + int const parent_num_children) + : children(), tag(Tag::STRUCT) + { + add_depth_first(names, num_children, tags, parent_num_children); + } - column_pruner(Tag const in_tag): children(), tag(in_tag) { - } + column_pruner(Tag const in_tag) : children(), tag(in_tag) {} - column_pruner(): children(), tag(Tag::STRUCT) { - } + column_pruner() : children(), tag(Tag::STRUCT) {} - /** - * Given a schema from a parquet file create a set of pruning maps to prune columns from the rest of the footer - */ - column_pruning_maps filter_schema(std::vector const & schema, bool const ignore_case) const { - CUDF_FUNC_RANGE(); + /** + * Given a schema from a parquet file create a set of pruning maps to prune columns from the rest + * of the footer + */ + column_pruning_maps filter_schema(std::vector const& schema, + bool const ignore_case) const + { + CUDF_FUNC_RANGE(); - // These are the outputs of the computation. - std::vector chunk_map; - std::vector schema_map; - std::vector schema_num_children; - std::size_t current_input_schema_index = 0; - std::size_t next_input_chunk_index = 0; + // These are the outputs of the computation. + std::vector chunk_map; + std::vector schema_map; + std::vector schema_num_children; + std::size_t current_input_schema_index = 0; + std::size_t next_input_chunk_index = 0; + + filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + + return column_pruning_maps{ + std::move(schema_map), std::move(schema_num_children), std::move(chunk_map)}; + } - filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); + private: + std::string get_name(parquet::format::SchemaElement& elem, + const bool normalize_case = false) const + { + return normalize_case ? unicode_to_lower(elem.name) : elem.name; + } - return column_pruning_maps{std::move(schema_map), - std::move(schema_num_children), - std::move(chunk_map)}; - } + int get_num_children(parquet::format::SchemaElement& elem) const + { + return elem.__isset.num_children ? elem.num_children : 0; + } -private: - std::string get_name(parquet::format::SchemaElement & elem, const bool normalize_case = false) const { - return normalize_case ? unicode_to_lower(elem.name) : elem.name; - } + void skip(std::vector const& schema, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index) const + { + // We want to skip everything referenced by the current_input_schema_index and its children. + // But we do have to update the chunk indexes as we go. 
+ int num_to_skip = 1; + while (num_to_skip > 0 && current_input_schema_index < schema.size()) { + auto schema_item = schema[current_input_schema_index]; + bool is_leaf = schema_item.__isset.type; + if (is_leaf) { ++next_input_chunk_index; } + + if (schema_item.__isset.num_children) { + num_to_skip = num_to_skip + schema_item.num_children; + } - int get_num_children(parquet::format::SchemaElement & elem) const { - return elem.__isset.num_children ? elem.num_children : 0; + --num_to_skip; + ++current_input_schema_index; } + } - void skip(std::vector const & schema, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index) const { - // We want to skip everything referenced by the current_input_schema_index and its children. - // But we do have to update the chunk indexes as we go. - int num_to_skip = 1; - while (num_to_skip > 0 && current_input_schema_index < schema.size()) { - auto schema_item = schema[current_input_schema_index]; - bool is_leaf = schema_item.__isset.type; - if (is_leaf) { - ++next_input_chunk_index; - } - - if (schema_item.__isset.num_children) { - num_to_skip = num_to_skip + schema_item.num_children; - } - - --num_to_skip; - ++current_input_schema_index; + /** + * filter_schema, but specific to Tag::STRUCT. + */ + void filter_schema_struct(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // First verify that we found a struct, like we expected to find. + auto struct_schema_item = schema.at(current_input_schema_index); + bool is_leaf = struct_schema_item.__isset.type; + if (is_leaf) { throw std::runtime_error("Found a leaf node, but expected to find a struct"); } + + int num_children = get_num_children(struct_schema_item); + // Now that everything looks good add ourselves into the maps, and move to the next entry to + // look at. + schema_map.push_back(current_input_schema_index); + // We will update the num_children each time we find one... + int our_num_children_index = schema_num_children.size(); + schema_num_children.push_back(0); + ++current_input_schema_index; + + // For a STRUCT we want to look for all of the children that match the name and let each of them + // handle updating things themselves. + for (int child_id = 0; child_id < num_children && current_input_schema_index < schema.size(); + child_id++) { + auto schema_item = schema[current_input_schema_index]; + std::string name = get_name(schema_item, ignore_case); + auto found = children.find(name); + + if (found != children.end()) { + // found a match so update the number of children that passed the filter and ask it to + // filter itself. + ++schema_num_children[our_num_children_index]; + found->second.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } else { + // No match was found so skip the child. + skip(schema, current_input_schema_index, next_input_chunk_index); } } + } - /** - * filter_schema, but specific to Tag::STRUCT. - */ - void filter_schema_struct(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // First verify that we found a struct, like we expected to find. 
- auto struct_schema_item = schema.at(current_input_schema_index); - bool is_leaf = struct_schema_item.__isset.type; - if (is_leaf) { - throw std::runtime_error("Found a leaf node, but expected to find a struct"); - } - - int num_children = get_num_children(struct_schema_item); - // Now that everything looks good add ourselves into the maps, and move to the next entry to look at. - schema_map.push_back(current_input_schema_index); - // We will update the num_children each time we find one... - int our_num_children_index = schema_num_children.size(); - schema_num_children.push_back(0); - ++current_input_schema_index; - - // For a STRUCT we want to look for all of the children that match the name and let each of them handle updating things - // themselves. - for (int child_id = 0; child_id < num_children && current_input_schema_index < schema.size(); child_id++) { - auto schema_item = schema[current_input_schema_index]; - std::string name = get_name(schema_item, ignore_case); - auto found = children.find(name); - - if (found != children.end()) { - // found a match so update the number of children that passed the filter and ask it to filter itself. - ++schema_num_children[our_num_children_index]; - found->second.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } else { - // No match was found so skip the child. - skip(schema, current_input_schema_index, next_input_chunk_index); - } - } + /** + * filter_schema, but specific to Tag::VALUE. + */ + void filter_schema_value(std::vector const& schema, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + auto schema_item = schema.at(current_input_schema_index); + bool is_leaf = schema_item.__isset.type; + if (!is_leaf) { throw std::runtime_error("found a non-leaf entry when reading a leaf value"); } + if (get_num_children(schema_item) != 0) { + throw std::runtime_error("found an entry with children when reading a leaf value"); } + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(0); + ++current_input_schema_index; + chunk_map.push_back(next_input_chunk_index); + ++next_input_chunk_index; + } - /** - * filter_schema, but specific to Tag::VALUE. - */ - void filter_schema_value(std::vector const & schema, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - auto schema_item = schema.at(current_input_schema_index); - bool is_leaf = schema_item.__isset.type; - if (!is_leaf) { - throw std::runtime_error("found a non-leaf entry when reading a leaf value"); - } - if (get_num_children(schema_item) != 0) { - throw std::runtime_error("found an entry with children when reading a leaf value"); - } - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(0); - ++current_input_schema_index; - chunk_map.push_back(next_input_chunk_index); - ++next_input_chunk_index; + /** + * filter_schema, but specific to Tag::LIST. + */ + void filter_schema_list(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // By convention with the java code the child is always called "element"... 
+ auto found = children.at("element"); + // A list starts out as a group element(not leaf) with a ConvertedType that is a LIST + // Under it will be a repeated element + auto list_schema_item = schema.at(current_input_schema_index); + std::string list_name = list_schema_item.name; + bool is_group = !list_schema_item.__isset.type; + + // Rules for how to parse lists from the parquet format docs + // 1. If the repeated field is not a group, then its type is the element type and elements are + // required. + // 2. If the repeated field is a group with multiple fields, then its type is the element type + // and elements are required. + // 3. If the repeated field is a group with one field and is named either array or uses the + // LIST-annotated group's name + // with _tuple appended then the repeated type is the element type and elements are required. + // 4. Otherwise, the repeated field's type is the element type with the repeated field's + // repetition. + + if (!is_group) { + if (!list_schema_item.__isset.repetition_type || + list_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("expected list item to be repeating"); + } + return filter_schema_value(schema, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); } - - /** - * filter_schema, but specific to Tag::LIST. - */ - void filter_schema_list(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // By convention with the java code the child is always called "element"... - auto found = children.at("element"); - // A list starts out as a group element(not leaf) with a ConvertedType that is a LIST - // Under it will be a repeated element - auto list_schema_item = schema.at(current_input_schema_index); - std::string list_name = list_schema_item.name; - bool is_group = !list_schema_item.__isset.type; - - // Rules for how to parse lists from the parquet format docs - // 1. If the repeated field is not a group, then its type is the element type and elements are required. - // 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. - // 3. If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name - // with _tuple appended then the repeated type is the element type and elements are required. - // 4. Otherwise, the repeated field's type is the element type with the repeated field's repetition. 
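// [Editorial note, illustrative only — taken from the Parquet format spec rather than
//  this change. The "standard" three-level LIST layout that rule 4 above covers is:
//
//    optional group my_list (LIST) {        <- outer group, ConvertedType LIST
//      repeated group list {                <- single repeated middle group
//        optional binary element (UTF8);    <- the element ("element" by convention)
//      }
//    }
//
//  while the legacy two-level forms handled by rules 1-3 drop the middle group, e.g.:
//
//    optional group my_list (LIST) {
//      repeated int32 element;              <- repeated field is itself the element type
//    } ]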
- - if (!is_group) { - if (!list_schema_item.__isset.repetition_type || - list_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("expected list item to be repeating"); - } - return filter_schema_value(schema, current_input_schema_index, next_input_chunk_index, - chunk_map, schema_map, schema_num_children); - } - if (!list_schema_item.__isset.converted_type || list_schema_item.converted_type != parquet::format::ConvertedType::LIST) { - throw std::runtime_error("expected a list type, but it was not found."); - } - if (get_num_children(list_schema_item) != 1) { - throw std::runtime_error("the structure of the outer list group is not standard"); - } - - // Now that the top level group looks good add it into the maps, and then start to look at the children - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; - - auto repeated_field_schema_item = schema.at(current_input_schema_index); - if (!repeated_field_schema_item.__isset.repetition_type || repeated_field_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("the structure of the list's child is not standard (non repeating)"); - } - - bool repeated_field_is_group = !repeated_field_schema_item.__isset.type; - int repeated_field_num_children = get_num_children(repeated_field_schema_item); - std::string repeated_field_name = repeated_field_schema_item.name; - if (repeated_field_is_group && repeated_field_num_children == 1 && - repeated_field_name != "array" && repeated_field_name != (list_name + "_tuple")) { - // This is the "standard" format where there are two groups and then a child under the the second group that holds the data. - // so add in the middle repeated group to the map - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; - - // And let the child filter itself. - found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } else { - // This is for an older format that is some times used where it is just two levels - found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } + if (!list_schema_item.__isset.converted_type || + list_schema_item.converted_type != parquet::format::ConvertedType::LIST) { + throw std::runtime_error("expected a list type, but it was not found."); + } + if (get_num_children(list_schema_item) != 1) { + throw std::runtime_error("the structure of the outer list group is not standard"); } - /** - * filter_schema, but specific to Tag::MAP. - */ - void filter_schema_map(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // By convention with the java code the children are always called "key" and "value"... - auto key_found = children.at("key"); - auto value_found = children.at("value"); - auto map_schema_item = schema.at(current_input_schema_index); - - // Maps are two levels. An outer group that has a ConvertedType of MAP or MAP_KEY_VALUE - // and then an inner group that has two fields a key (that is required) and a value, that is optional. 
- - bool is_map_group = !map_schema_item.__isset.type; - if (!is_map_group) { - throw std::runtime_error("expected a map item, but found a single value"); - } - if (!map_schema_item.__isset.converted_type || - (map_schema_item.converted_type != parquet::format::ConvertedType::MAP && - map_schema_item.converted_type != parquet::format::ConvertedType::MAP_KEY_VALUE)) { - throw std::runtime_error("expected a map type, but it was not found."); - } - if (get_num_children(map_schema_item) != 1) { - throw std::runtime_error("the structure of the outer map group is not standard"); - } + // Now that the top level group looks good add it into the maps, and then start to look at the + // children + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + auto repeated_field_schema_item = schema.at(current_input_schema_index); + if (!repeated_field_schema_item.__isset.repetition_type || + repeated_field_schema_item.repetition_type != + parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("the structure of the list's child is not standard (non repeating)"); + } - // The outer group looks good so lets add it in. - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; + bool repeated_field_is_group = !repeated_field_schema_item.__isset.type; + int repeated_field_num_children = get_num_children(repeated_field_schema_item); + std::string repeated_field_name = repeated_field_schema_item.name; + if (repeated_field_is_group && repeated_field_num_children == 1 && + repeated_field_name != "array" && repeated_field_name != (list_name + "_tuple")) { + // This is the "standard" format where there are two groups and then a child under the the + // second group that holds the data. so add in the middle repeated group to the map + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + // And let the child filter itself. + found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } else { + // This is for an older format that is some times used where it is just two levels + found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } + } - // Now lets look at the repeated child. - auto repeated_field_schema_item = schema.at(current_input_schema_index); - if (!repeated_field_schema_item.__isset.repetition_type || repeated_field_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("found non repeating map child"); - } + /** + * filter_schema, but specific to Tag::MAP. + */ + void filter_schema_map(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // By convention with the java code the children are always called "key" and "value"... + auto key_found = children.at("key"); + auto value_found = children.at("value"); + auto map_schema_item = schema.at(current_input_schema_index); + + // Maps are two levels. An outer group that has a ConvertedType of MAP or MAP_KEY_VALUE + // and then an inner group that has two fields a key (that is required) and a value, that is + // optional. 
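// [Editorial note, illustrative only — taken from the Parquet format spec rather than
//  this change. The MAP shape checked below is an outer group annotated MAP (or the
//  legacy MAP_KEY_VALUE) containing one repeated group with a required key and an
//  optional value, e.g.:
//
//    optional group my_map (MAP) {
//      repeated group key_value {
//        required binary key (UTF8);
//        optional int32 value;
//      }
//    } ]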
+ + bool is_map_group = !map_schema_item.__isset.type; + if (!is_map_group) { + throw std::runtime_error("expected a map item, but found a single value"); + } + if (!map_schema_item.__isset.converted_type || + (map_schema_item.converted_type != parquet::format::ConvertedType::MAP && + map_schema_item.converted_type != parquet::format::ConvertedType::MAP_KEY_VALUE)) { + throw std::runtime_error("expected a map type, but it was not found."); + } + if (get_num_children(map_schema_item) != 1) { + throw std::runtime_error("the structure of the outer map group is not standard"); + } - int repeated_field_num_children = get_num_children(repeated_field_schema_item); + // The outer group looks good so lets add it in. + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + // Now lets look at the repeated child. + auto repeated_field_schema_item = schema.at(current_input_schema_index); + if (!repeated_field_schema_item.__isset.repetition_type || + repeated_field_schema_item.repetition_type != + parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("found non repeating map child"); + } - if (repeated_field_num_children != 1 && repeated_field_num_children != 2) { - throw std::runtime_error("found map with wrong number of children"); - } + int repeated_field_num_children = get_num_children(repeated_field_schema_item); - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(repeated_field_num_children); - ++current_input_schema_index; + if (repeated_field_num_children != 1 && repeated_field_num_children != 2) { + throw std::runtime_error("found map with wrong number of children"); + } - // Process the key... - key_found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - if (repeated_field_num_children == 2) { - // Process the value... - value_found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(repeated_field_num_children); + ++current_input_schema_index; + + // Process the key... + key_found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + if (repeated_field_num_children == 2) { + // Process the value... + value_found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); } + } - /** - * Recursive method to parse and update the maps to filter out columns in the schema and chunks. - * Each column_pruner is responsible to parse out from schema what it holds and skip anything - * that does not match. chunk_map, schema_map, and schema_num_children are the final outputs. - * current_input_schema_index and next_input_chunk_index are also outputs but are state that is - * passed to each child and returned when it comsumes comething. 
- */ - void filter_schema(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - switch(tag) { - case Tag::STRUCT: - filter_schema_struct(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::VALUE: - filter_schema_value(schema, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::LIST: - filter_schema_list(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::MAP: - filter_schema_map(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - default: - throw std::runtime_error(std::string("INTERNAL ERROR UNEXPECTED TAG FOUND ") + std::to_string(static_cast(tag))); - } + /** + * Recursive method to parse and update the maps to filter out columns in the schema and chunks. + * Each column_pruner is responsible to parse out from schema what it holds and skip anything + * that does not match. chunk_map, schema_map, and schema_num_children are the final outputs. + * current_input_schema_index and next_input_chunk_index are also outputs but are state that is + * passed to each child and returned when it comsumes comething. + */ + void filter_schema(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + switch (tag) { + case Tag::STRUCT: + filter_schema_struct(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::VALUE: + filter_schema_value(schema, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::LIST: + filter_schema_list(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::MAP: + filter_schema_map(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + default: + throw std::runtime_error(std::string("INTERNAL ERROR UNEXPECTED TAG FOUND ") + + std::to_string(static_cast(tag))); } + } - /** - * Do a depth first traversal to build up column_pruner into a tree that matches the schema we want to filter using. 
- */ - void add_depth_first(std::vector const& names, - std::vector const& num_children, - std::vector const& tags, - int parent_num_children) { - CUDF_FUNC_RANGE(); - if (parent_num_children == 0) { - // There is no point in doing more the tree is empty, and it lets us avoid some corner cases - // in the code below - return; - } - auto num = names.size(); - std::vector tree_stack; - std::vector num_children_stack; - tree_stack.push_back(this); - num_children_stack.push_back(parent_num_children); - for(uint64_t i = 0; i < num; ++i) { - auto name = names[i]; - auto num_c = num_children[i]; - auto t = tags[i]; - tree_stack.back()->children.try_emplace(name, t); - if (num_c > 0) { - tree_stack.push_back(&tree_stack.back()->children[name]); - num_children_stack.push_back(num_c); - } else { - // go back up the stack/tree removing children until we hit one with more children - bool done = false; - while (!done) { - int parent_children_left = num_children_stack.back() - 1; - if (parent_children_left > 0) { - num_children_stack.back() = parent_children_left; - done = true; - } else { - tree_stack.pop_back(); - num_children_stack.pop_back(); - } - - if (tree_stack.size() <= 0) { - done = true; - } + /** + * Do a depth first traversal to build up column_pruner into a tree that matches the schema we + * want to filter using. + */ + void add_depth_first(std::vector const& names, + std::vector const& num_children, + std::vector const& tags, + int parent_num_children) + { + CUDF_FUNC_RANGE(); + if (parent_num_children == 0) { + // There is no point in doing more the tree is empty, and it lets us avoid some corner cases + // in the code below + return; + } + auto num = names.size(); + std::vector tree_stack; + std::vector num_children_stack; + tree_stack.push_back(this); + num_children_stack.push_back(parent_num_children); + for (uint64_t i = 0; i < num; ++i) { + auto name = names[i]; + auto num_c = num_children[i]; + auto t = tags[i]; + tree_stack.back()->children.try_emplace(name, t); + if (num_c > 0) { + tree_stack.push_back(&tree_stack.back()->children[name]); + num_children_stack.push_back(num_c); + } else { + // go back up the stack/tree removing children until we hit one with more children + bool done = false; + while (!done) { + int parent_children_left = num_children_stack.back() - 1; + if (parent_children_left > 0) { + num_children_stack.back() = parent_children_left; + done = true; + } else { + tree_stack.pop_back(); + num_children_stack.pop_back(); } + + if (tree_stack.size() <= 0) { done = true; } } } - if (tree_stack.size() != 0 || num_children_stack.size() != 0) { - throw std::invalid_argument("DIDN'T CONSUME EVERYTHING..."); - } } + if (tree_stack.size() != 0 || num_children_stack.size() != 0) { + throw std::invalid_argument("DIDN'T CONSUME EVERYTHING..."); + } + } - std::map children; - Tag tag; + std::map children; + Tag tag; }; -static bool invalid_file_offset(long start_index, long pre_start_index, long pre_compressed_size) { +static bool invalid_file_offset(long start_index, long pre_start_index, long pre_compressed_size) +{ bool invalid = false; // checking the first rowGroup if (pre_start_index == 0 && start_index != 4) { @@ -454,7 +551,7 @@ static bool invalid_file_offset(long start_index, long pre_start_index, long pre return invalid; } - //calculate start index for other blocks + // calculate start index for other blocks int64_t min_start_index = pre_start_index + pre_compressed_size; if (start_index < min_start_index) { // a bad offset detected, try first column's offset @@ 
-465,8 +562,9 @@ static bool invalid_file_offset(long start_index, long pre_start_index, long pre return invalid; } -static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) { - auto md = column_chunk.meta_data; +static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) +{ + auto md = column_chunk.meta_data; int64_t offset = md.data_page_offset; if (md.__isset.dictionary_page_offset && offset > md.dictionary_page_offset) { offset = md.dictionary_page_offset; @@ -474,73 +572,75 @@ static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) { return offset; } -static std::vector filter_groups(parquet::format::FileMetaData const& meta, - int64_t part_offset, int64_t part_length) { - CUDF_FUNC_RANGE(); - // This is based off of the java parquet_mr code to find the groups in a range... - auto num_row_groups = meta.row_groups.size(); - int64_t pre_start_index = 0; - int64_t pre_compressed_size = 0; - bool first_column_with_metadata = true; - if (num_row_groups > 0) { - first_column_with_metadata = meta.row_groups[0].columns[0].__isset.meta_data; - } +static std::vector filter_groups( + parquet::format::FileMetaData const& meta, int64_t part_offset, int64_t part_length) +{ + CUDF_FUNC_RANGE(); + // This is based off of the java parquet_mr code to find the groups in a range... + auto num_row_groups = meta.row_groups.size(); + int64_t pre_start_index = 0; + int64_t pre_compressed_size = 0; + bool first_column_with_metadata = true; + if (num_row_groups > 0) { + first_column_with_metadata = meta.row_groups[0].columns[0].__isset.meta_data; + } - std::vector filtered_groups; - for (uint64_t rg_i = 0; rg_i < num_row_groups; ++rg_i) { - parquet::format::RowGroup const& row_group = meta.row_groups[rg_i]; - int64_t total_size = 0; - int64_t start_index; - auto column_chunk = row_group.columns[0]; - if (first_column_with_metadata) { - start_index = get_offset(column_chunk); + std::vector filtered_groups; + for (uint64_t rg_i = 0; rg_i < num_row_groups; ++rg_i) { + parquet::format::RowGroup const& row_group = meta.row_groups[rg_i]; + int64_t total_size = 0; + int64_t start_index; + auto column_chunk = row_group.columns[0]; + if (first_column_with_metadata) { + start_index = get_offset(column_chunk); + } else { + // the file_offset of first block always holds the truth, while other blocks don't : + // see PARQUET-2078 for details + start_index = row_group.file_offset; + if (invalid_file_offset(start_index, pre_start_index, pre_compressed_size)) { + // first row group's offset is always 4 + if (pre_start_index == 0) { + start_index = 4; } else { - //the file_offset of first block always holds the truth, while other blocks don't : - //see PARQUET-2078 for details - start_index = row_group.file_offset; - if (invalid_file_offset(start_index, pre_start_index, pre_compressed_size)) { - //first row group's offset is always 4 - if (pre_start_index == 0) { - start_index = 4; - } else { - // use minStartIndex(imprecise in case of padding, but good enough for filtering) - start_index = pre_start_index + pre_compressed_size; - } - } - pre_start_index = start_index; - pre_compressed_size = row_group.total_compressed_size; - } - if (row_group.__isset.total_compressed_size) { - total_size = row_group.total_compressed_size; - } else { - auto num_columns = row_group.columns.size(); - for (uint64_t cc_i = 0; cc_i < num_columns; ++cc_i) { - parquet::format::ColumnChunk const& col = row_group.columns[cc_i]; - total_size += col.meta_data.total_compressed_size; + // use 
minStartIndex(imprecise in case of padding, but good enough for filtering) + start_index = pre_start_index + pre_compressed_size; } } - - int64_t mid_point = start_index + total_size / 2; - if (mid_point >= part_offset && mid_point < (part_offset + part_length)) { - filtered_groups.push_back(row_group); + pre_start_index = start_index; + pre_compressed_size = row_group.total_compressed_size; + } + if (row_group.__isset.total_compressed_size) { + total_size = row_group.total_compressed_size; + } else { + auto num_columns = row_group.columns.size(); + for (uint64_t cc_i = 0; cc_i < num_columns; ++cc_i) { + parquet::format::ColumnChunk const& col = row_group.columns[cc_i]; + total_size += col.meta_data.total_compressed_size; } } - return filtered_groups; + + int64_t mid_point = start_index + total_size / 2; + if (mid_point >= part_offset && mid_point < (part_offset + part_length)) { + filtered_groups.push_back(row_group); + } + } + return filtered_groups; } -void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format::FileMetaData * meta) { +void deserialize_parquet_footer(uint8_t* buffer, uint32_t len, parquet::format::FileMetaData* meta) +{ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; CUDF_FUNC_RANGE(); - // A lot of this came from the parquet source code... - // Deserialize msg bytes into c++ thrift msg using memory transport. - #if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 +// A lot of this came from the parquet source code... +// Deserialize msg bytes into c++ thrift msg using memory transport. +#if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 auto conf = std::make_shared(); conf->setMaxMessageSize(std::numeric_limits::max()); auto tmem_transport = std::make_shared(buffer, len, ThriftBuffer::OBSERVE, conf); - #else +#else auto tmem_transport = std::make_shared(buffer, len); - #endif +#endif apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; // Protect against CPU and memory bombs @@ -549,7 +649,7 @@ void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format: // This limits total memory to the same order of magnitude as stringSize. 
tproto_factory.setContainerSizeLimit(1000 * 1000); std::shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); + tproto_factory.getProtocol(tmem_transport); try { meta->read(tproto.get()); } catch (std::exception& e) { @@ -559,7 +659,8 @@ void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format: } } -void filter_columns(std::vector & groups, std::vector & chunk_filter) { +void filter_columns(std::vector& groups, std::vector& chunk_filter) +{ CUDF_FUNC_RANGE(); for (auto group_it = groups.begin(); group_it != groups.end(); ++group_it) { std::vector new_chunks; @@ -570,24 +671,27 @@ void filter_columns(std::vector & groups, std::vector } } -} -} +} // namespace jni +} // namespace rapids extern "C" { -JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv * env, jclass, - jlong buffer, - jlong buffer_length, - jlong part_offset, - jlong part_length, - jobjectArray filter_col_names, - jintArray num_children, - jintArray tags, - jint parent_num_children, - jboolean ignore_case) { +JNIEXPORT long JNICALL +Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jlong part_offset, + jlong part_length, + jobjectArray filter_col_names, + jintArray num_children, + jintArray tags, + jint parent_num_children, + jboolean ignore_case) +{ CUDF_FUNC_RANGE(); try { - auto meta = std::make_unique(); + auto meta = std::make_unique(); uint32_t len = static_cast(buffer_length); // We don't support encrypted parquet... rapids::jni::deserialize_parquet_footer(reinterpret_cast(buffer), len, meta.get()); @@ -603,18 +707,18 @@ JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFil } rapids::jni::column_pruner pruner(n_filter_col_names.as_cpp_vector(), - std::vector(n_num_children.begin(), n_num_children.end()), - tags, - parent_num_children); + std::vector(n_num_children.begin(), n_num_children.end()), + tags, + parent_num_children); auto filter = pruner.filter_schema(meta->schema, ignore_case); // start by filtering the schema and the chunks std::size_t new_schema_size = filter.schema_map.size(); std::vector new_schema(new_schema_size); for (std::size_t i = 0; i < new_schema_size; ++i) { - int orig_index = filter.schema_map[i]; - int new_num_children = filter.schema_num_children[i]; - new_schema[i] = meta->schema[orig_index]; + int orig_index = filter.schema_map[i]; + int new_num_children = filter.schema_num_children[i]; + new_schema[i] = meta->schema[orig_index]; new_schema[i].num_children = new_num_children; } meta->schema = std::move(new_schema); @@ -636,21 +740,25 @@ JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFil CATCH_STD(env, 0); } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_close(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_close(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); delete ptr; } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRows(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRows(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); - long ret = 0; - for(auto it = ptr->row_groups.begin(); it != 
ptr->row_groups.end(); ++it) { + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); + long ret = 0; + for (auto it = ptr->row_groups.begin(); it != ptr->row_groups.end(); ++it) { ret = ret + it->num_rows; } return ret; @@ -658,55 +766,56 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRow CATCH_STD(env, -1); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumColumns(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumColumns(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); - int ret = 0; + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); + int ret = 0; if (ptr->schema.size() > 0) { - if (ptr->schema[0].__isset.num_children) { - ret = ptr->schema[0].num_children; - } + if (ptr->schema[0].__isset.num_children) { ret = ptr->schema[0].num_children; } } return ret; } CATCH_STD(env, -1); } -JNIEXPORT jobject JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_serializeThriftFile(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jobject JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_serializeThriftFile( + JNIEnv* env, jclass, jlong handle) +{ CUDF_FUNC_RANGE(); try { - parquet::format::FileMetaData * meta = reinterpret_cast(handle); + parquet::format::FileMetaData* meta = reinterpret_cast(handle); std::shared_ptr transportOut( - new apache::thrift::transport::TMemoryBuffer()); - apache::thrift::protocol::TCompactProtocolFactoryT factory; + new apache::thrift::transport::TMemoryBuffer()); + apache::thrift::protocol::TCompactProtocolFactoryT + factory; auto protocolOut = factory.getProtocol(transportOut); meta->write(protocolOut.get()); - uint8_t * buf_ptr; + uint8_t* buf_ptr; uint32_t buf_size; transportOut->getBuffer(&buf_ptr, &buf_size); // 12 extra is for the MAGIC thrift_footer length MAGIC - jobject ret = cudf::jni::allocate_host_buffer(env, buf_size + 12, false); + jobject ret = cudf::jni::allocate_host_buffer(env, buf_size + 12, false); uint8_t* ret_addr = reinterpret_cast(cudf::jni::get_host_buffer_address(env, ret)); - ret_addr[0] = 'P'; - ret_addr[1] = 'A'; - ret_addr[2] = 'R'; - ret_addr[3] = '1'; + ret_addr[0] = 'P'; + ret_addr[1] = 'A'; + ret_addr[2] = 'R'; + ret_addr[3] = '1'; std::memcpy(ret_addr + 4, buf_ptr, buf_size); - uint8_t * after = ret_addr + buf_size + 4; - after[0] = static_cast(0xFF & buf_size); - after[1] = static_cast(0xFF & (buf_size >> 8)); - after[2] = static_cast(0xFF & (buf_size >> 16)); - after[3] = static_cast(0xFF & (buf_size >> 24)); - after[4] = 'P'; - after[5] = 'A'; - after[6] = 'R'; - after[7] = '1'; + uint8_t* after = ret_addr + buf_size + 4; + after[0] = static_cast(0xFF & buf_size); + after[1] = static_cast(0xFF & (buf_size >> 8)); + after[2] = static_cast(0xFF & (buf_size >> 16)); + after[3] = static_cast(0xFF & (buf_size >> 24)); + after[4] = 'P'; + after[5] = 'A'; + after[6] = 'R'; + after[7] = '1'; return ret; } CATCH_STD(env, nullptr); } - } diff --git a/src/main/cpp/src/RowConversionJni.cpp b/src/main/cpp/src/RowConversionJni.cpp index 2d1da7e453..1fdb8a86b5 100644 --- a/src/main/cpp/src/RowConversionJni.cpp +++ b/src/main/cpp/src/RowConversionJni.cpp @@ -21,37 +21,42 @@ extern "C" { JNIEXPORT jlongArray JNICALL -Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized(JNIEnv* 
env, + jclass, + jlong input_table) +{ JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view const *n_input_table = reinterpret_cast(input_table); + cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); + spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return cudf::jni::release_as_jlong(col); }); + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { + return cudf::jni::release_as_jlong(col); + }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows( - JNIEnv *env, jclass, jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows(JNIEnv* env, jclass, jlong input_table) +{ JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view const *n_input_table = reinterpret_cast(input_table); + cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows(*n_input_table); + spark_rapids_jni::convert_to_rows(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return cudf::jni::release_as_jlong(col); }); + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { + return cudf::jni::release_as_jlong(col); + }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); @@ -59,46 +64,54 @@ JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_conv JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRowsFixedWidthOptimized( - JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { + JNIEnv* env, jclass, jlong input_column, jintArray types, jintArray scale) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); try { cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL); } std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + std::transform(n_types.begin(), + n_types.end(), + n_scale.begin(), + std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); + spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRows( - JNIEnv *env, jclass, jlong input_column, 
jintArray types, jintArray scale) { + JNIEnv* env, jclass, jlong input_column, jintArray types, jintArray scale) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); try { cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL); } std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + std::transform(n_types.begin(), + n_types.end(), + n_scale.begin(), + std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows(list_input, types_vec); + spark_rapids_jni::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 13a7a50a95..bcbe8080ca 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -32,8 +32,8 @@ namespace { -constexpr char const *RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; -constexpr char const *SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; +constexpr char const* RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; +constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; // In the task states BUFN means Block Until Further Notice. // Meaning the thread should be blocked until another task finishes. @@ -43,33 +43,34 @@ constexpr char const *SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/S // in the future to know when a retry section has passed, which would // probably be a preferable time to restart all BUFN threads. enum thread_state { - UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to - // this state should actually be accomplished by deleting the thread from the state - TASK_RUNNING = 0, // task thread running normally - TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle - TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN - TASK_ALLOC = 3, // task thread in the middle of doing an allocation - TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened - TASK_BLOCKED = 5, // task thread that is temporarily blocked - TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking - TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should - // block the next time alloc or block_until_ready is called - TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed - TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry - TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception - // to start the blocked thread running again. 
- SHUFFLE_RUNNING = 11, // shuffle thread that is running normally - SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc - SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. - SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked - SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM - SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception + UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to + // this state should actually be accomplished by deleting the thread from the state + TASK_RUNNING = 0, // task thread running normally + TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle + TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN + TASK_ALLOC = 3, // task thread in the middle of doing an allocation + TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened + TASK_BLOCKED = 5, // task thread that is temporarily blocked + TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking + TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should + // block the next time alloc or block_until_ready is called + TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed + TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry + TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception + // to start the blocked thread running again. + SHUFFLE_RUNNING = 11, // shuffle thread that is running normally + SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc + SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. + SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked + SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM + SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception }; /** * Convert a state to a string representation for logging. */ -const char *as_str(thread_state state) { +const char* as_str(thread_state state) +{ switch (state) { case TASK_RUNNING: return "TASK_RUNNING"; case TASK_WAIT_ON_SHUFFLE: return "TASK_WAIT_ON_SHUFFLE"; @@ -92,20 +93,23 @@ const char *as_str(thread_state state) { } } -static std::shared_ptr make_logger(std::ostream &stream) { +static std::shared_ptr make_logger(std::ostream& stream) +{ return std::make_shared("SPARK_RMM", std::make_shared(stream)); } -static std::shared_ptr make_logger() { +static std::shared_ptr make_logger() +{ return std::make_shared("SPARK_RMM", std::make_shared()); } -static auto make_logger(std::string const &filename) { +static auto make_logger(std::string const& filename) +{ return std::make_shared( - "SPARK_RMM", - std::make_shared(filename, true /*truncate file*/)); + "SPARK_RMM", + std::make_shared(filename, true /*truncate file*/)); } /** @@ -121,15 +125,16 @@ static auto make_logger(std::string const &filename) { * will be MAX_LONG - (task_id + 1). 
*/ class thread_priority { -public: + public: thread_priority(long tsk_id, long t_id) : task_id(tsk_id), thread_id(t_id) {} long get_thread_id() const { return thread_id; } long get_task_id() const { return task_id; } - bool operator<(const thread_priority &other) const { - long task_priority = this->task_priority(); + bool operator<(const thread_priority& other) const + { + long task_priority = this->task_priority(); long other_task_priority = other.task_priority(); if (task_priority < other_task_priority) { return true; @@ -139,8 +144,9 @@ class thread_priority { return false; } - bool operator>(const thread_priority &other) const { - long task_priority = this->task_priority(); + bool operator>(const thread_priority& other) const + { + long task_priority = this->task_priority(); long other_task_priority = other.task_priority(); if (task_priority > other_task_priority) { return true; @@ -150,12 +156,13 @@ class thread_priority { return false; } - void operator=(const thread_priority &other) { - task_id = other.task_id; + void operator=(const thread_priority& other) + { + task_id = other.task_id; thread_id = other.thread_id; } -private: + private: long task_id; long thread_id; @@ -169,72 +176,77 @@ class thread_priority { * this should be accessed with a lock held. */ class full_thread_state { -public: + public: full_thread_state(thread_state state, long thread_id) : state(state), thread_id(thread_id) {} full_thread_state(thread_state state, long thread_id, long task_id) - : state(state), thread_id(thread_id), task_id(task_id) {} + : state(state), thread_id(thread_id), task_id(task_id) + { + } thread_state state; long thread_id; - long task_id = -1; - int retry_oom_injected = 0; + long task_id = -1; + int retry_oom_injected = 0; int split_and_retry_oom_injected = 0; - int cudf_exception_injected = 0; + int cudf_exception_injected = 0; // watchdog limit on maximum number of retries to avoid unexpected live lock situations int num_times_retried = 0; // metric for being able to report how many times each type of exception was thrown, // and some timings - int num_times_retry_throw = 0; + int num_times_retry_throw = 0; int num_times_split_retry_throw = 0; - long time_blocked_nanos = 0; + long time_blocked_nanos = 0; // The amount of time that this thread has lost due to retries (not inclduing blocked time) long time_lost_nanos = 0; - // The amount of time that this thread has spent in the current retry block (not inclucing block time) + // The amount of time that this thread has spent in the current retry block (not inclucing block + // time) long time_retry_running_nanos = 0; // When did the retry time for this thread start, or when did the block time end. std::chrono::time_point retry_start_or_block_end; // Is this thread currently in a marked retry block. This is only used for metrics. bool is_in_retry = false; - std::chrono::time_point block_start; std::unique_ptr wake_condition = - std::make_unique(); + std::make_unique(); /** * Transition to a new state. Ideally this is what is called when doing a state transition instead * of setting the state directly. 
*/ - void transition_to(thread_state new_state) { + void transition_to(thread_state new_state) + { if (new_state == thread_state::UNKNOWN) { throw std::runtime_error( - "Going to UNKNOWN state should delete the thread state, not call transition_to"); + "Going to UNKNOWN state should delete the thread state, not call transition_to"); } state = new_state; } - void before_block() { + void before_block() + { block_start = std::chrono::steady_clock::now(); // Don't record running time lost while we are blocked... record_and_reset_pending_retry_time(); } - void after_block() { - auto end = std::chrono::steady_clock::now(); + void after_block() + { + auto end = std::chrono::steady_clock::now(); auto diff = end - block_start; time_blocked_nanos += std::chrono::duration_cast(diff).count(); - if (is_in_retry) { - retry_start_or_block_end = end; - } + if (is_in_retry) { retry_start_or_block_end = end; } } - long get_and_reset_failed_retry_time() { - long ret = time_lost_nanos; + long get_and_reset_failed_retry_time() + { + long ret = time_lost_nanos; time_lost_nanos = 0; return ret; } - void record_failed_retry_time() { + void record_failed_retry_time() + { if (is_in_retry) { record_and_reset_pending_retry_time(); time_lost_nanos += time_retry_running_nanos; @@ -242,20 +254,21 @@ class full_thread_state { } } - void record_and_reset_pending_retry_time() { + void record_and_reset_pending_retry_time() + { if (is_in_retry) { - auto end = std::chrono::steady_clock::now(); + auto end = std::chrono::steady_clock::now(); auto diff = end - retry_start_or_block_end; - time_retry_running_nanos += std::chrono::duration_cast(diff).count(); + time_retry_running_nanos += + std::chrono::duration_cast(diff).count(); retry_start_or_block_end = end; } } - void reset_retry_state(bool is_in_retry) { + void reset_retry_state(bool is_in_retry) + { time_retry_running_nanos = 0; - if (is_in_retry) { - retry_start_or_block_end = std::chrono::steady_clock::now(); - } + if (is_in_retry) { retry_start_or_block_end = std::chrono::steady_clock::now(); } this->is_in_retry = is_in_retry; } @@ -273,20 +286,20 @@ class full_thread_state { * memory error. */ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { -public: - spark_resource_adaptor(JNIEnv *env, rmm::mr::device_memory_resource *mr, - std::shared_ptr &logger) - : resource{mr}, logger{logger} { - if (env->GetJavaVM(&jvm) < 0) { - throw std::runtime_error("GetJavaVM failed"); - } + public: + spark_resource_adaptor(JNIEnv* env, + rmm::mr::device_memory_resource* mr, + std::shared_ptr& logger) + : resource{mr}, logger{logger} + { + if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } logger->flush_on(spdlog::level::info); logger->set_pattern("%v"); logger->info("time,op,current thread,op thread,op task,from state,to state,notes"); logger->set_pattern("%H:%M:%S.%f,%v"); } - rmm::mr::device_memory_resource *get_wrapped_resource() { return resource; } + rmm::mr::device_memory_resource* get_wrapped_resource() { return resource; } bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } @@ -302,13 +315,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * was an error and the entire executor is shutting down. So there should be no * reuse. 
*/ - void associate_thread_with_task(long thread_id, long task_id) { + void associate_thread_with_task(long thread_id, long task_id) + { std::unique_lock lock(state_mutex); - if (shutting_down) { - throw std::runtime_error("spark_resource_adaptor is shutting down"); - } - auto was_threads_inserted = threads.emplace( - thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } + auto was_threads_inserted = + threads.emplace(thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); if (was_threads_inserted.second == false) { if (was_threads_inserted.first->second.task_id != task_id) { throw std::invalid_argument("a thread can only be associated with a single task."); @@ -325,7 +337,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // task_to_threads already has a task_id for this, so insert the thread_id was_inserted.first->second.insert(thread_id); } - } catch (const std::exception &) { + } catch (const std::exception&) { if (was_threads_inserted.second == true) { // roll back the thread insertion threads.erase(thread_id); @@ -337,25 +349,24 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - void start_retry_block(long thread_id) { + void start_retry_block(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); - if (thread != threads.end()) { - thread->second.reset_retry_state(true); - } + if (thread != threads.end()) { thread->second.reset_retry_state(true); } } - void end_retry_block(long thread_id) { + void end_retry_block(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); - if (thread != threads.end()) { - thread->second.reset_retry_state(false); - } + if (thread != threads.end()) { thread->second.reset_retry_state(false); } } - long get_and_reset_lost_time(long task_id) { + long get_and_reset_lost_time(long task_id) + { std::unique_lock lock(state_mutex); - long ret = 0; + long ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -375,14 +386,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * this is an error is if the thread is already marked as shutting down and has * not completed that transition yet. */ - void associate_thread_with_shuffle(long thread_id) { + void associate_thread_with_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); - if (shutting_down) { - throw std::runtime_error("spark_resource_adaptor is shutting down"); - } + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } auto was_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); + threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); if (was_inserted.second == true) { log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::SHUFFLE_RUNNING); } else if (was_inserted.first->second.task_id != -1) { @@ -399,11 +409,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * up and throw an exception. At that point the thread's state will be completely * removed. 
*/ - void remove_thread_association(long thread_id) { + void remove_thread_association(long thread_id) + { std::unique_lock lock(state_mutex); - if (remove_thread_association(thread_id, lock)) { - wake_up_threads_after_task_finishes(lock); - } + if (remove_thread_association(thread_id, lock)) { wake_up_threads_after_task_finishes(lock); } } /** @@ -412,19 +421,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * threads are currently blocked/waiting then the state will not be totally * removed until the thread is woken. */ - void task_done(long task_id) { + void task_done(long task_id) + { std::unique_lock lock(state_mutex); auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { // we want to make a copy so there is no conflict here... std::set threads_to_remove = task_at->second; - bool run_checks = false; + bool run_checks = false; for (auto thread_id : threads_to_remove) { run_checks = remove_thread_association(thread_id, lock) || run_checks; } - if (run_checks) { - wake_up_threads_after_task_finishes(lock); - } + if (run_checks) { wake_up_threads_after_task_finishes(lock); } } task_to_threads.erase(task_id); } @@ -434,7 +442,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * to shut down everything in an orderly way and wait for all of the * threads to be done. */ - void all_done() { + void all_done() + { { std::unique_lock lock(state_mutex); // 1. Mark all threads that need to be removed as such @@ -470,7 +479,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more RetryOOM exceptions when an * alloc is called. This is intended only for testing. */ - void force_retry_oom(long thread_id, int num_ooms) { + void force_retry_oom(long thread_id, int num_ooms) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -484,7 +494,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more SplitAndRetryOOM exceptions * when an alloc is called. This is intended only for testing. */ - void force_split_and_retry_oom(long thread_id, int num_ooms) { + void force_split_and_retry_oom(long thread_id, int num_ooms) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -498,7 +509,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * force a specific thread to throw one or more CudfExceptions when an * alloc is called. This is intended only for testing. */ - void force_cudf_exception(long thread_id, int num_times) { + void force_cudf_exception(long thread_id, int num_times) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -511,9 +523,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a retry was thrown and reset the value to 0. 
*/ - int get_and_reset_num_retry(long task_id) { + int get_and_reset_num_retry(long task_id) + { std::unique_lock lock(state_mutex); - int ret = 0; + int ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -530,9 +543,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a split and retry was thrown and reset the value to 0. */ - int get_and_reset_num_split_retry(long task_id) { + int get_and_reset_num_split_retry(long task_id) + { std::unique_lock lock(state_mutex); - int ret = 0; + int ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -549,9 +563,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the time in ns that the task was blocked for. */ - long get_and_reset_block_time(long task_id) { + long get_and_reset_block_time(long task_id) + { std::unique_lock lock(state_mutex); - long ret = 0; + long ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -569,7 +584,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Update the internal state so that this thread is known that it is going to enter a * shuffle stage and could indirectly block on a shuffle thread (UCX). */ - void thread_could_block_on_shuffle(long thread_id) { + void thread_could_block_on_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -601,7 +617,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Indicate that the thread no longer will block indirectly on a shuffle thread. */ - void thread_done_with_shuffle(long thread_id) { + void thread_done_with_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -635,7 +652,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * before an alloc is called. If this is not called alloc will also call into the * same code and block if needed until the task is ready to keep going. */ - void block_thread_until_ready() { + void block_thread_until_ready() + { auto thread_id = static_cast(pthread_self()); std::unique_lock lock(state_mutex); block_thread_until_ready(thread_id, lock); @@ -645,7 +663,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is really here just for testing. It provides a way to look at the * current state of a thread. 
*/ - int get_thread_state_as_int(long thread_id) { + int get_thread_state_as_int(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -655,9 +674,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } -private: - rmm::mr::device_memory_resource *const resource; - std::shared_ptr logger; ///< spdlog logger object + private: + rmm::mr::device_memory_resource* const resource; + std::shared_ptr logger; ///< spdlog logger object // The state mutex must be held when modifying the state of threads or tasks // it must never be held when calling into the child resource or after returning @@ -667,26 +686,38 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { std::map threads; std::map> task_to_threads; bool shutting_down = false; - JavaVM *jvm; + JavaVM* jvm; /** * log a status change that does not involve a state transition. */ - void log_status(const char *op, long thread_id, long task_id, thread_state state, - const char *notes = nullptr) { + void log_status( + const char* op, long thread_id, long task_id, thread_state state, const char* notes = nullptr) + { auto this_id = static_cast(pthread_self()); - logger->info("{},{},{},{},{},,{}", op, this_id, thread_id, task_id, as_str(state), + logger->info("{},{},{},{},{},,{}", + op, + this_id, + thread_id, + task_id, + as_str(state), (notes == nullptr ? "" : notes)); } /** * log that a state transition happened. */ - void log_transition(long thread_id, long task_id, thread_state from, thread_state to, - const char *notes = nullptr) { + void log_transition( + long thread_id, long task_id, thread_state from, thread_state to, const char* notes = nullptr) + { auto this_id = static_cast(pthread_self()); - logger->info("TRANSITION,{},{},{},{},{},{}", this_id, thread_id, task_id, as_str(from), - as_str(to), (notes == nullptr ? "" : notes)); + logger->info("TRANSITION,{},{},{},{},{},{}", + this_id, + thread_id, + task_id, + as_str(from), + as_str(to), + (notes == nullptr ? "" : notes)); } /** @@ -694,7 +725,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * of setting the state directly. This will log the transition and do a little bit of * verification. */ - void transition(full_thread_state &state, thread_state new_state, const char *message = nullptr) { + void transition(full_thread_state& state, thread_state new_state, const char* message = nullptr) + { thread_state original = state.state; state.transition_to(new_state); log_transition(state.thread_id, state.task_id, original, new_state, message); @@ -703,8 +735,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * throw a java exception using the cached jvm/env. */ - void throw_java_exception(const char *ex_class_name, const char *msg) { - JNIEnv *env = cudf::jni::get_jni_env(jvm); + void throw_java_exception(const char* ex_class_name, const char* msg) + { + JNIEnv* env = cudf::jni::get_jni_env(jvm); cudf::jni::throw_java_exception(env, ex_class_name, msg); } @@ -712,7 +745,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is a watchdog to prevent us from live locking. It should be called before we throw an * RetryOOM or a SplitAndRetryOOM to know if we actually should throw something else. 
*/ - void check_before_oom(full_thread_state &state, const std::unique_lock &lock) { + void check_before_oom(full_thread_state& state, const std::unique_lock& lock) + { // The limit is an arbitrary number, large enough that we should not hit it in "normal" // operation, but also small enough that we can detect a livelock fairly quickly. // In testing it looks like it is a few ms if in a tight loop, not including spill @@ -724,23 +758,28 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { state.num_times_retried++; } - void throw_retry_oom(const char *msg, full_thread_state &state, - const std::unique_lock &lock) { + void throw_retry_oom(const char* msg, + full_thread_state& state, + const std::unique_lock& lock) + { state.num_times_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); throw_java_exception(RETRY_OOM_CLASS, "GPU OutOfMemory"); } - void throw_split_and_retry_oom(const char *msg, full_thread_state &state, - const std::unique_lock &lock) { + void throw_split_and_retry_oom(const char* msg, + full_thread_state& state, + const std::unique_lock& lock) + { state.num_times_split_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); } - bool is_blocked(thread_state state) { + bool is_blocked(thread_state state) + { switch (state) { case TASK_BLOCKED: // fall through @@ -754,8 +793,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Internal implementation that will block a thread until it is ready to continue. */ - void block_thread_until_ready(long thread_id, std::unique_lock &lock) { - bool done = false; + void block_thread_until_ready(long thread_id, std::unique_lock& lock) + { + bool done = false; bool first_time = true; // Because this is called from alloc as well as from the public facing block_thread_until_ready // there are states that should only show up in relation to alloc failing. These include @@ -812,14 +852,14 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_SPLIT_THROW: transition(thread->second, thread_state::TASK_RUNNING); thread->second.record_failed_retry_time(); - throw_split_and_retry_oom("rollback, split input, and retry operation", thread->second, - lock); + throw_split_and_retry_oom( + "rollback, split input, and retry operation", thread->second, lock); break; case TASK_REMOVE_THROW: // fall through case SHUFFLE_REMOVE_THROW: - log_transition(thread_id, thread->second.task_id, thread->second.state, - thread_state::UNKNOWN); + log_transition( + thread_id, thread->second.task_id, thread->second.state, thread_state::UNKNOWN); // don't need to record failed time metric the thread is already gone... threads.erase(thread); task_has_woken_condition.notify_all(); @@ -845,7 +885,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * and if there are no blocked threads, then we wake up all BUFN threads. * Hopefully the frees have already woken up all the blocked threads anyways. 
*/ - void wake_up_threads_after_task_finishes(const std::unique_lock &lock) { + void wake_up_threads_after_task_finishes(const std::unique_lock& lock) + { bool are_any_tasks_just_blocked = false; for (auto thread = threads.begin(); thread != threads.end(); thread++) { switch (thread->second.state) { @@ -885,16 +926,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * returns true if the thread that ended was a normally running task thread. * This should be used to decide if wake_up_threads_after_task_finishes is called or not. */ - bool remove_thread_association(long thread_id, const std::unique_lock &lock) { - bool ret = false; + bool remove_thread_association(long thread_id, const std::unique_lock& lock) + { + bool ret = false; auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { auto task_id = threads_at->second.task_id; if (task_id >= 0) { auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - task_at->second.erase(thread_id); - } + if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } } switch (threads_at->second.state) { @@ -912,8 +952,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { ret = true; // fall through; default: - log_transition(thread_id, threads_at->second.task_id, threads_at->second.state, - thread_state::UNKNOWN); + log_transition( + thread_id, threads_at->second.task_id, threads_at->second.state, thread_state::UNKNOWN); threads.erase(threads_at); } } @@ -929,7 +969,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * entered the state machine. The only known case is GPU memory required for setup in * cuDF for a spill operation. */ - bool pre_alloc(long thread_id) { + bool pre_alloc(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); @@ -956,8 +997,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.cudf_exception_injected > 0) { thread->second.cudf_exception_injected--; - log_status("INJECTED_CUDF_EXCEPTION", thread_id, thread->second.task_id, - thread->second.state); + log_status( + "INJECTED_CUDF_EXCEPTION", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); throw_java_exception(cudf::jni::CUDF_ERROR_CLASS, "injected CudfException"); } @@ -965,8 +1006,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.split_and_retry_oom_injected > 0) { thread->second.split_and_retry_oom_injected--; thread->second.num_times_split_retry_throw++; - log_status("INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, - thread->second.state); + log_status( + "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); } @@ -1001,7 +1042,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * `likely_spill` if this allocation should be treated differently, because * we detected recursion while handling a prior allocation in this thread. 
*/ - void post_alloc_success(long thread_id, bool likely_spill) { + void post_alloc_success(long thread_id, bool likely_spill) + { std::unique_lock lock(state_mutex); // pre allocate checks auto thread = threads.find(thread_id); @@ -1027,7 +1069,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is typically called when a free happens, or an alloc succeeds. * @param is_from_free true if a free happen. */ - void wake_next_highest_priority_blocked(const std::unique_lock &lock, bool is_from_free) { + void wake_next_highest_priority_blocked(const std::unique_lock& lock, + bool is_from_free) + { // 1. Find the highest priority blocked thread, including shuffle. thread_priority to_wake(-1, -1); bool is_to_wake_set = false; @@ -1036,7 +1080,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (state == thread_state::TASK_BLOCKED || state == thread_state::SHUFFLE_BLOCKED) { thread_priority current = thread->second.priority(); if (!is_to_wake_set || to_wake < current) { - to_wake = current; + to_wake = current; is_to_wake_set = true; } } @@ -1066,7 +1110,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } else if (is_from_free) { // 3. Otherwise look to see if we are in a BUFN deadlock state. // - // Memory was freed and if all of the tasks are in a BUFN state, + // Memory was freed and if all of the tasks are in a BUFN state, // then we want to wake up the highest priority one so it can make progress // instead of trying to split its input. But we only do this if it // is a different thread that is freeing memory from the one we want to wake up. @@ -1079,24 +1123,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread_priority to_wake(-1, -1); bool is_to_wake_set = false; for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - tasks_with_threads.insert(thread->second.task_id); - } + if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } switch (thread->second.state) { case TASK_BUFN_THROW: - // fall through + // fall through case TASK_BUFN_WAIT: - // fall through + // fall through case TASK_BUFN: { - tasks_with_threads_bufn.insert(thread->second.task_id); - thread_priority current = thread->second.priority(); - if (!is_to_wake_set || to_wake < current) { - to_wake = current; - is_to_wake_set = true; - } + tasks_with_threads_bufn.insert(thread->second.task_id); + thread_priority current = thread->second.priority(); + if (!is_to_wake_set || to_wake < current) { + to_wake = current; + is_to_wake_set = true; } - break; + } break; default: break; } } @@ -1108,7 +1149,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // Don't wake up yourself on a free. 
It is not adding more memory for this thread // to use on a retry and we might need a split instead to break a deadlock auto this_id = static_cast(pthread_self()); - auto thread = threads.find(thread_id_to_wake); + auto thread = threads.find(thread_id_to_wake); if (thread != threads.end() && thread->first != this_id) { switch (thread->second.state) { case TASK_BUFN: @@ -1126,8 +1167,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { break; default: { std::stringstream ss; - ss << "internal error expected to only wake up blocked threads " << thread_id_to_wake - << " " << as_str(thread->second.state); + ss << "internal error expected to only wake up blocked threads " + << thread_id_to_wake << " " << as_str(thread->second.state); throw std::runtime_error(ss.str()); } } @@ -1142,7 +1183,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * called when a task or shuffle thread becomes blocked so that we can * check to see if one of them needs to become BUFN or do a split and rollback. */ - void check_and_update_for_bufn(const std::unique_lock &lock) { + void check_and_update_for_bufn(const std::unique_lock& lock) + { // We want to know if all active tasks have at least one thread that // is effectively blocked or not. We could change the definitions here, // but for now this sounds like a good starting point. @@ -1162,9 +1204,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - tasks_with_threads.insert(thread->second.task_id); - } + if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } switch (thread->second.state) { case TASK_WAIT_ON_SHUFFLE: @@ -1184,7 +1224,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } bool need_to_break_deadlock = - tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); + tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); if (need_to_break_deadlock) { // Find the task thread with the lowest priority that is not already BUFN thread_priority to_bufn(-1, -1); @@ -1194,7 +1234,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_BLOCKED: { thread_priority current = thread->second.priority(); if (!is_to_bufn_set || current < to_bufn) { - to_bufn = current; + to_bufn = current; is_to_bufn_set = true; } } break; @@ -1203,7 +1243,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } if (is_to_bufn_set) { long thread_id_to_bufn = to_bufn.get_thread_id(); - auto thread = threads.find(thread_id_to_bufn); + auto thread = threads.find(thread_id_to_bufn); if (thread != threads.end()) { transition(thread->second, thread_state::TASK_BUFN_THROW); thread->second.wake_condition->notify_all(); @@ -1221,21 +1261,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_BUFN: { thread_priority current = thread->second.priority(); if (!is_to_wake_set || to_wake < current) { - to_wake = current; + to_wake = current; is_to_wake_set = true; } } break; case TASK_WAIT_ON_SHUFFLE: - if (!is_any_shuffle_thread_blocked) { - all_bufn_or_shuffle = false; - } + if (!is_any_shuffle_thread_blocked) { all_bufn_or_shuffle = false; } break; default: all_bufn_or_shuffle = false; break; } } } if (all_bufn_or_shuffle) { - long thread_id = to_wake.get_thread_id(); + long thread_id = 
to_wake.get_thread_id(); auto found_thread = threads.find(thread_id); if (found_thread != threads.end()) { transition(found_thread->second, thread_state::TASK_SPLIT_THROW); @@ -1264,7 +1302,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * typically happen after this has run, and we loop around to retry the alloc * if the state says we should. */ - bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) { + bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); // only retry if this was due to an out of memory exception. @@ -1304,19 +1343,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { return ret; } - void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { + void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override + { auto tid = static_cast(pthread_self()); while (true) { bool likely_spill = pre_alloc(tid); try { - void *ret = resource->allocate(num_bytes, stream); + void* ret = resource->allocate(num_bytes, stream); post_alloc_success(tid, likely_spill); return ret; - } catch (const std::bad_alloc &e) { - if (!post_alloc_failed(tid, true, likely_spill)) { - throw; - } - } catch (const std::exception &e) { + } catch (const std::bad_alloc& e) { + if (!post_alloc_failed(tid, true, likely_spill)) { throw; } + } catch (const std::exception& e) { post_alloc_failed(tid, false, likely_spill); throw; } @@ -1325,13 +1363,14 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw std::bad_alloc(); } - void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override { + void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override + { resource->deallocate(p, size, stream); // deallocate success if (size > 0) { std::unique_lock lock(state_mutex); - auto tid = static_cast(pthread_self()); + auto tid = static_cast(pthread_self()); auto thread = threads.find(tid); if (thread != threads.end()) { log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); @@ -1363,17 +1402,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override + { return resource->get_mem_info(stream); } }; -} // namespace +} // namespace extern "C" { JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv *env, jclass) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv* env, jclass) +{ try { cudf::jni::auto_set_device(env); return static_cast(pthread_self()); @@ -1382,11 +1423,12 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv } JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_createNewAdaptor( - JNIEnv *env, jclass, jlong child, jstring log_loc) { + JNIEnv* env, jclass, jlong child, jstring log_loc) +{ JNI_NULL_CHECK(env, child, "child is null", 0); try { cudf::jni::auto_set_device(env); - auto wrapped = reinterpret_cast(child); + auto wrapped = reinterpret_cast(child); cudf::jni::native_jstring nlogloc(env, log_loc); std::shared_ptr logger; if (nlogloc.is_null()) { @@ -1408,11 +1450,12 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr CATCH_STD(env, 0) } -JNIEXPORT void 
JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor( - JNIEnv *env, jclass, jlong ptr) { +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor(JNIEnv* env, jclass, jlong ptr) +{ try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->all_done(); delete mr; } @@ -1420,144 +1463,159 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_rel } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask(JNIEnv *env, jclass, - jlong ptr, - jlong thread_id, - jlong task_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->associate_thread_with_task(thread_id, task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv *env, - jclass, jlong ptr, - jlong thread_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv* env, + jclass, + jlong ptr, + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->associate_thread_with_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv *env, jclass, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv* env, + jclass, jlong ptr, - jlong thread_id) { + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->remove_thread_association(thread_id); } CATCH_STD(env, ) } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_taskDone( - JNIEnv *env, jclass, jlong ptr, jlong task_id) { +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_taskDone(JNIEnv* env, + jclass, + jlong ptr, + jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->task_done(task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv *env, jclass, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv* env, + jclass, jlong ptr, - jlong thread_id) { + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->thread_could_block_on_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadDoneWithShuffle( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->thread_done_with_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL 
Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceRetryOOM( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_ooms) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_retry_oom(thread_id, num_ooms); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceSplitAndRetryOOM( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_ooms) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_split_and_retry_oom(thread_id, num_ooms); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceCudfException( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_times) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_times) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_cudf_exception(thread_id, num_times); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_blockThreadUntilReady( - JNIEnv *env, jclass, jlong ptr) { + JNIEnv* env, jclass, jlong ptr) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->block_thread_until_ready(); } CATCH_STD(env, ) } JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getStateOf( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_thread_state_as_int(thread_id); } CATCH_STD(env, 0) } JNIEXPORT jint JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInternal(JNIEnv *env, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInternal(JNIEnv* env, jclass, jlong ptr, - jlong task_id) { + jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_num_retry(task_id); } CATCH_STD(env, 0) @@ -1565,61 +1623,64 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInter JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetSplitRetryThrowInternal( - JNIEnv *env, jclass, jlong ptr, jlong task_id) { + JNIEnv* env, jclass, jlong ptr, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_num_split_retry(task_id); } CATCH_STD(env, 0) } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetBlockTimeInternal(JNIEnv *env, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetBlockTimeInternal(JNIEnv* env, jclass, jlong ptr, - jlong task_id) { + jlong task_id) +{ 
JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_block_time(task_id); } CATCH_STD(env, 0) } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLostToRetry(JNIEnv *env, - jclass, - jlong ptr, - jlong task_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLostToRetry( + JNIEnv* env, jclass, jlong ptr, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_lost_time(task_id); } CATCH_STD(env, 0) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startRetryBlock( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->start_retry_block(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_endRetryBlock( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->end_retry_block(thread_id); } CATCH_STD(env, ) diff --git a/src/main/cpp/src/ZOrderJni.cpp b/src/main/cpp/src/ZOrderJni.cpp index 20d5ba92a4..37925f86d6 100644 --- a/src/main/cpp/src/ZOrderJni.cpp +++ b/src/main/cpp/src/ZOrderJni.cpp @@ -21,8 +21,9 @@ extern "C" { -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits(JNIEnv *env, jclass, jlongArray input_columns) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits( + JNIEnv* env, jclass, jlongArray input_columns) +{ JNI_NULL_CHECK(env, input_columns, "input is null", 0); try { @@ -35,8 +36,9 @@ Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits(JNIEnv *env, jclass, jlon CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex(JNIEnv *env, jclass, jint num_bits, jlongArray input_columns) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex( + JNIEnv* env, jclass, jint num_bits, jlongArray input_columns) +{ JNI_NULL_CHECK(env, input_columns, "input is null", 0); try { @@ -48,7 +50,4 @@ Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex(JNIEnv *env, jclass, jint n } CATCH_STD(env, 0); } - - - } diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu index 88e8a6fdb8..a256e8e917 100644 --- a/src/main/cpp/src/cast_decimal_to_string.cu +++ b/src/main/cpp/src/cast_decimal_to_string.cu @@ -87,9 +87,9 @@ struct decimal_to_non_ansi_string_fn { } else { // positive scale or adjusted exponent < -6 means scientific notation auto const extra_digits = abs_value_digits > 1 ? 
3 : 2; - return static_cast(value < 0) + // sign if negative - abs_value_digits + // number of digits - extra_digits + // decimal point if exists, E, +/- + return static_cast(value < 0) + // sign if negative + abs_value_digits + // number of digits + extra_digits + // decimal point if exists, E, +/- strings::detail::count_digits( numeric::detail::abs(adjusted_exponent)); // exponent portion } @@ -127,7 +127,7 @@ struct decimal_to_non_ansi_string_fn { d_buffer += strings::detail::integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part if (scale != 0) { - *d_buffer++ = '.'; // add decimal point + *d_buffer++ = '.'; // add decimal point thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros d_buffer += num_zeros; diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu index 2cfcc62630..6f9de63d10 100644 --- a/src/main/cpp/src/cast_string.cu +++ b/src/main/cpp/src/cast_string.cu @@ -653,8 +653,8 @@ struct string_to_integer_impl { rmm::mr::device_memory_resource* mr) { if (string_col.size() == 0) { - return std::make_unique(data_type{type_to_id()}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + return std::make_unique( + data_type{type_to_id()}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } rmm::device_uvector data(string_col.size(), stream, mr); @@ -676,8 +676,11 @@ struct string_to_integer_impl { auto null_count = cudf::detail::null_count(null_mask.data(), 0, string_col.size(), stream); - auto col = std::make_unique( - data_type{type_to_id()}, string_col.size(), data.release(), null_mask.release(), null_count); + auto col = std::make_unique(data_type{type_to_id()}, + string_col.size(), + data.release(), + null_mask.release(), + null_count); if (ansi_mode) { validate_ansi_column(col->view(), string_col, stream); } @@ -743,9 +746,8 @@ struct string_to_decimal_impl { auto null_count = cudf::detail::null_count(null_mask.data(), 0, string_col.size(), stream); - auto col = - std::make_unique(dtype, string_col.size(), data.release(), - null_mask.release(), null_count); + auto col = std::make_unique( + dtype, string_col.size(), data.release(), null_mask.release(), null_count); if (ansi_mode) { validate_ansi_column(col->view(), string_col, stream); } @@ -829,8 +831,15 @@ std::unique_ptr string_to_decimal(int32_t precision, return std::make_unique(dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } - return type_dispatcher( - dtype, detail::string_to_decimal_impl{}, dtype, precision, string_col, ansi_mode, strip, stream, mr); + return type_dispatcher(dtype, + detail::string_to_decimal_impl{}, + dtype, + precision, + string_col, + ansi_mode, + strip, + stream, + mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index 3c23c98e0a..8eab9eef16 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -362,9 +362,9 @@ class string_to_float { __ballot_sync(0xffffffff, _warp_lane < num_chars && !is_digit(_c)); auto const first_non_digit = __ffs(non_digit_mask); - // first non-digit after location 1 means there is something valid here, note ffs is 0 with no set bits, - // so 1 is the 0th character is not a digit. - // first non-digit of 0 means all digits, and that means we have seen a valid digit as well. + // first non-digit after location 1 means there is something valid here, note ffs is 0 with no + // set bits, so 1 is the 0th character is not a digit. 
first non-digit of 0 means all digits, + // and that means we have seen a valid digit as well. seen_valid_digit |= (num_chars > 0 && first_non_digit != 1); num_chars = min(num_chars, first_non_digit > 0 ? first_non_digit - 1 : num_chars); diff --git a/src/main/cpp/src/decimal_utils.cu b/src/main/cpp/src/decimal_utils.cu index 625bd4e711..392fb495b4 100644 --- a/src/main/cpp/src/decimal_utils.cu +++ b/src/main/cpp/src/decimal_utils.cu @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -32,15 +32,17 @@ struct chunked256 { inline chunked256() = default; // sign-extend a 128-bit value into a chunked 256-bit value - inline __device__ explicit chunked256(__int128_t x) { - chunks[0] = static_cast(x); + inline __device__ explicit chunked256(__int128_t x) + { + chunks[0] = static_cast(x); __int128_t x_shifted = x >> 64; - chunks[1] = static_cast(x_shifted); - chunks[2] = static_cast(x_shifted >> 64); - chunks[3] = chunks[2]; + chunks[1] = static_cast(x_shifted); + chunks[2] = static_cast(x_shifted >> 64); + chunks[3] = chunks[2]; } - inline __device__ explicit chunked256(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + inline __device__ explicit chunked256(uint64_t a, uint64_t b, uint64_t c, uint64_t d) + { chunks[0] = d; chunks[1] = c; chunks[2] = b; @@ -48,14 +50,13 @@ struct chunked256 { } inline __device__ uint64_t operator[](int i) const { return chunks[i]; } - inline __device__ uint64_t &operator[](int i) { return chunks[i]; } + inline __device__ uint64_t& operator[](int i) { return chunks[i]; } inline __device__ int64_t sign() const { return static_cast(chunks[3]) >> 63; } - inline __device__ void add(int a) { - add(chunked256(static_cast<__int128_t>(a))); - } + inline __device__ void add(int a) { add(chunked256(static_cast<__int128_t>(a))); } - inline __device__ void add(chunked256 const &a) { + inline __device__ void add(chunked256 const& a) + { __uint128_t carry_and_sum = 0; for (int i = 0; i < 4; ++i) { carry_and_sum += static_cast<__uint128_t>(chunks[i]) + a.chunks[i]; @@ -64,14 +65,16 @@ struct chunked256 { } } - inline __device__ void negate() { + inline __device__ void negate() + { for (int i = 0; i < 4; i++) { chunks[i] = ~chunks[i]; } add(1); } - inline __device__ bool lt_unsigned(chunked256 const &other) const { + inline __device__ bool lt_unsigned(chunked256 const& other) const + { for (int i = 3; i >= 0; i--) { if (chunks[i] < other.chunks[i]) { return true; @@ -82,11 +85,10 @@ struct chunked256 { return false; } - inline __device__ bool gte_unsigned(chunked256 const &other) const { - return !lt_unsigned(other); - } + inline __device__ bool gte_unsigned(chunked256 const& other) const { return !lt_unsigned(other); } - inline __device__ int leading_zeros() const { + inline __device__ int leading_zeros() const + { if (sign() < 0) { chunked256 tmp = *this; tmp.negate(); @@ -104,15 +106,14 @@ struct chunked256 { } } - inline __device__ __int128_t as_128_bits() const { + inline __device__ __int128_t as_128_bits() const + { return (static_cast<__int128_t>(chunks[1]) << 64) | chunks[0]; } - inline __device__ uint64_t as_64_bits() const { - return chunks[0]; - } + inline __device__ uint64_t as_64_bits() const { return chunks[0]; } -private: + private: uint64_t chunks[4]; }; @@ -122,49 +123,52 @@ struct divmod256 { }; // Perform a 256-bit multiply in 64-bit chunks -__device__ chunked256 multiply(chunked256 const &a, chunked256 const &b) { +__device__ chunked256 multiply(chunked256 const& a, chunked256 const& b) +{ chunked256 r; __uint128_t mul; 
uint64_t carry = 0; for (int a_idx = 0; a_idx < 4; ++a_idx) { - mul = static_cast<__uint128_t>(a[a_idx]) * b[0] + carry; + mul = static_cast<__uint128_t>(a[a_idx]) * b[0] + carry; r[a_idx] = static_cast(mul); - carry = static_cast(mul >> 64); + carry = static_cast(mul >> 64); } for (int b_idx = 1; b_idx < 4; ++b_idx) { carry = 0; for (int a_idx = 0; a_idx < 4 - b_idx; ++a_idx) { int r_idx = a_idx + b_idx; - mul = static_cast<__uint128_t>(a[a_idx]) * b[b_idx] + r[r_idx] + carry; - r[r_idx] = static_cast(mul); - carry = static_cast(mul >> 64); + mul = static_cast<__uint128_t>(a[a_idx]) * b[b_idx] + r[r_idx] + carry; + r[r_idx] = static_cast(mul); + carry = static_cast(mul >> 64); } } return r; } -__device__ divmod256 divide_unsigned(chunked256 const &n, __int128_t const &d) { +__device__ divmod256 divide_unsigned(chunked256 const& n, __int128_t const& d) +{ // TODO: FIXME this is long division, and so it is likely very slow... chunked256 q(0); __uint128_t r = 0; for (int i = 255; i >= 0; i--) { int block = i / 64; - int bit = i % 64; - int read = (int)((n[block] >> bit) & 0x01); - r = r << 1; - r = r | read; + int bit = i % 64; + int read = (int)((n[block] >> bit) & 0x01); + r = r << 1; + r = r | read; if (r >= d) { - r = r - d; + r = r - d; int64_t bit_set = 1L << bit; - q[block] = q[block] | bit_set; + q[block] = q[block] | bit_set; } } return divmod256{q, static_cast<__int128_t>(r)}; } -__device__ divmod256 divide(chunked256 const &n, __int128_t const &d) { +__device__ divmod256 divide(chunked256 const& n, __int128_t const& d) +{ // We assume that d is not 0. This is because we do the zero check, // if needed before calling divide so we can set an overflow properly. bool const is_n_neg = n.sign() < 0; @@ -173,26 +177,23 @@ __device__ divmod256 divide(chunked256 const &n, __int128_t const &d) { // beause we are dealing with decimal numbers that should not go to // the maximum value that can be held by d or n chunked256 abs_n = n; - if (is_n_neg) { - abs_n.negate(); - } + if (is_n_neg) { abs_n.negate(); } __int128_t abs_d = is_d_neg ? -d : d; divmod256 result = divide_unsigned(abs_n, abs_d); - if (is_d_neg != is_n_neg) { - result.quotient.negate(); - } + if (is_d_neg != is_n_neg) { result.quotient.negate(); } - if (is_n_neg) { - result.remainder = -result.remainder; - } + if (is_n_neg) { result.remainder = -result.remainder; } return result; } -__device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const &r, - chunked256 const & n, __int128_t const &d) { +__device__ chunked256 round_from_remainder(chunked256 const& q, + __int128_t const& r, + chunked256 const& n, + __int128_t const& d) +{ // We are going to round if the abs value of the remainder is >= half of the divisor // but if we divide the divisor in half, we can lose data so instead we are going to // multiply the remainder by 2 @@ -204,19 +205,20 @@ __device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const // is in a range that would have us round because the divisor has to fit within // an __int128_t. - bool const need_inc = ((double_remainder >> 1) != r) || // if we lost info or - (double_remainder < 0 ? -double_remainder : double_remainder) >= // abs remainder is >= - (d < 0 ? -d : d); // abs divisor + bool const need_inc = + ((double_remainder >> 1) != r) || // if we lost info or + (double_remainder < 0 ? -double_remainder : double_remainder) >= // abs remainder is >= + (d < 0 ? 
-d : d); // abs divisor // To know which way to round, more specifically when the quotient is 0 // we need to know what the sign of the quotient would have been. In this // case that happens if only one of the inputs was negative (xor) - bool const is_n_neg = n.sign() < 0; - bool const is_d_neg = d < 0; + bool const is_n_neg = n.sign() < 0; + bool const is_d_neg = d < 0; bool const round_down = is_n_neg != is_d_neg; int const round_inc = (need_inc ? (round_down ? -1 : 1) : 0); - chunked256 ret = q; + chunked256 ret = q; ret.add(round_inc); return ret; } @@ -224,7 +226,8 @@ __device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const /** * Divide n by d and do half up rounding based off of the remainder returned. */ -__device__ chunked256 divide_and_round(chunked256 const &n, __int128_t const &d) { +__device__ chunked256 divide_and_round(chunked256 const& n, __int128_t const& d) +{ divmod256 div_result = divide(n, d); return round_from_remainder(div_result.quotient, div_result.remainder, n, d); @@ -234,13 +237,15 @@ __device__ chunked256 divide_and_round(chunked256 const &n, __int128_t const &d) * Divide n by d and return the quotient. This is essentially what `DOWN` rounding does * in Java */ -__device__ chunked256 integer_divide(chunked256 const &n, __int128_t const &d) { +__device__ chunked256 integer_divide(chunked256 const& n, __int128_t const& d) +{ divmod256 div_result = divide(n, d); - //drop the remainder and only return the quotient + // drop the remainder and only return the quotient return div_result.quotient; } -inline __device__ chunked256 pow_ten(int exp) { +inline __device__ chunked256 pow_ten(int exp) +{ // Note that the body of this was generated using the following scala script /* import java.math.BigInteger @@ -265,287 +270,287 @@ inline __device__ chunked256 pow_ten(int exp) { printAsInt128s(ret) System.out.println(");") } - */ - switch(exp) { + */ + switch (exp) { case 0: - //1 + // 1 return chunked256(0x0, 0x0, 0x0, 0x1); case 1: - //10 + // 10 return chunked256(0x0, 0x0, 0x0, 0xa); case 2: - //100 + // 100 return chunked256(0x0, 0x0, 0x0, 0x64); case 3: - //1000 + // 1000 return chunked256(0x0, 0x0, 0x0, 0x3e8); case 4: - //10000 + // 10000 return chunked256(0x0, 0x0, 0x0, 0x2710); case 5: - //100000 + // 100000 return chunked256(0x0, 0x0, 0x0, 0x186a0); case 6: - //1000000 + // 1000000 return chunked256(0x0, 0x0, 0x0, 0xf4240); case 7: - //10000000 + // 10000000 return chunked256(0x0, 0x0, 0x0, 0x989680); case 8: - //100000000 + // 100000000 return chunked256(0x0, 0x0, 0x0, 0x5f5e100); case 9: - //1000000000 + // 1000000000 return chunked256(0x0, 0x0, 0x0, 0x3b9aca00); case 10: - //10000000000 + // 10000000000 return chunked256(0x0, 0x0, 0x0, 0x2540be400); case 11: - //100000000000 + // 100000000000 return chunked256(0x0, 0x0, 0x0, 0x174876e800); case 12: - //1000000000000 + // 1000000000000 return chunked256(0x0, 0x0, 0x0, 0xe8d4a51000); case 13: - //10000000000000 + // 10000000000000 return chunked256(0x0, 0x0, 0x0, 0x9184e72a000); case 14: - //100000000000000 + // 100000000000000 return chunked256(0x0, 0x0, 0x0, 0x5af3107a4000); case 15: - //1000000000000000 + // 1000000000000000 return chunked256(0x0, 0x0, 0x0, 0x38d7ea4c68000); case 16: - //10000000000000000 + // 10000000000000000 return chunked256(0x0, 0x0, 0x0, 0x2386f26fc10000); case 17: - //100000000000000000 + // 100000000000000000 return chunked256(0x0, 0x0, 0x0, 0x16345785d8a0000); case 18: - //1000000000000000000 + // 1000000000000000000 return chunked256(0x0, 0x0, 0x0, 
0xde0b6b3a7640000); case 19: - //10000000000000000000 + // 10000000000000000000 return chunked256(0x0, 0x0, 0x0, 0x8ac7230489e80000); case 20: - //100000000000000000000 + // 100000000000000000000 return chunked256(0x0, 0x0, 0x5, 0x6bc75e2d63100000); case 21: - //1000000000000000000000 + // 1000000000000000000000 return chunked256(0x0, 0x0, 0x36, 0x35c9adc5dea00000); case 22: - //10000000000000000000000 + // 10000000000000000000000 return chunked256(0x0, 0x0, 0x21e, 0x19e0c9bab2400000); case 23: - //100000000000000000000000 + // 100000000000000000000000 return chunked256(0x0, 0x0, 0x152d, 0x2c7e14af6800000); case 24: - //1000000000000000000000000 + // 1000000000000000000000000 return chunked256(0x0, 0x0, 0xd3c2, 0x1bcecceda1000000); case 25: - //10000000000000000000000000 + // 10000000000000000000000000 return chunked256(0x0, 0x0, 0x84595, 0x161401484a000000); case 26: - //100000000000000000000000000 + // 100000000000000000000000000 return chunked256(0x0, 0x0, 0x52b7d2, 0xdcc80cd2e4000000); case 27: - //1000000000000000000000000000 + // 1000000000000000000000000000 return chunked256(0x0, 0x0, 0x33b2e3c, 0x9fd0803ce8000000); case 28: - //10000000000000000000000000000 + // 10000000000000000000000000000 return chunked256(0x0, 0x0, 0x204fce5e, 0x3e25026110000000); case 29: - //100000000000000000000000000000 + // 100000000000000000000000000000 return chunked256(0x0, 0x0, 0x1431e0fae, 0x6d7217caa0000000); case 30: - //1000000000000000000000000000000 + // 1000000000000000000000000000000 return chunked256(0x0, 0x0, 0xc9f2c9cd0, 0x4674edea40000000); case 31: - //10000000000000000000000000000000 + // 10000000000000000000000000000000 return chunked256(0x0, 0x0, 0x7e37be2022, 0xc0914b2680000000); case 32: - //100000000000000000000000000000000 + // 100000000000000000000000000000000 return chunked256(0x0, 0x0, 0x4ee2d6d415b, 0x85acef8100000000); case 33: - //1000000000000000000000000000000000 + // 1000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x314dc6448d93, 0x38c15b0a00000000); case 34: - //10000000000000000000000000000000000 + // 10000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x1ed09bead87c0, 0x378d8e6400000000); case 35: - //100000000000000000000000000000000000 + // 100000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x13426172c74d82, 0x2b878fe800000000); case 36: - //1000000000000000000000000000000000000 + // 1000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0xc097ce7bc90715, 0xb34b9f1000000000); case 37: - //10000000000000000000000000000000000000 + // 10000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x785ee10d5da46d9, 0xf436a000000000); case 38: - //100000000000000000000000000000000000000 + // 100000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x4b3b4ca85a86c47a, 0x98a224000000000); case 39: - //1000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000 return chunked256(0x0, 0x2, 0xf050fe938943acc4, 0x5f65568000000000); case 40: - //10000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000 return chunked256(0x0, 0x1d, 0x6329f1c35ca4bfab, 0xb9f5610000000000); case 41: - //100000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000 return chunked256(0x0, 0x125, 0xdfa371a19e6f7cb5, 0x4395ca0000000000); case 42: - //1000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000 return chunked256(0x0, 0xb7a, 0xbc627050305adf14, 0xa3d9e40000000000); case 43: - 
//10000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000 return chunked256(0x0, 0x72cb, 0x5bd86321e38cb6ce, 0x6682e80000000000); case 44: - //100000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000 return chunked256(0x0, 0x47bf1, 0x9673df52e37f2410, 0x11d100000000000); case 45: - //1000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000 return chunked256(0x0, 0x2cd76f, 0xe086b93ce2f768a0, 0xb22a00000000000); case 46: - //10000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000 return chunked256(0x0, 0x1c06a5e, 0xc5433c60ddaa1640, 0x6f5a400000000000); case 47: - //100000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000 return chunked256(0x0, 0x118427b3, 0xb4a05bc8a8a4de84, 0x5986800000000000); case 48: - //1000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000 return chunked256(0x0, 0xaf298d05, 0xe4395d69670b12b, 0x7f41000000000000); case 49: - //10000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x6d79f8232, 0x8ea3da61e066ebb2, 0xf88a000000000000); case 50: - //100000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x446c3b15f9, 0x926687d2c40534fd, 0xb564000000000000); case 51: - //1000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x2ac3a4edbbf, 0xb8014e3ba83411e9, 0x15e8000000000000); case 52: - //10000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x1aba4714957d, 0x300d0e549208b31a, 0xdb10000000000000); case 53: - //100000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x10b46c6cdd6e3, 0xe0828f4db456ff0c, 0x8ea0000000000000); case 54: - //1000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0xa70c3c40a64e6, 0xc51999090b65f67d, 0x9240000000000000); case 55: - //10000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x6867a5a867f103, 0xb2fffa5a71fba0e7, 0xb680000000000000); case 56: - //100000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x4140c78940f6a24, 0xfdffc78873d4490d, 0x2100000000000000); case 57: - //1000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x28c87cb5c89a2571, 0xebfdcb54864ada83, 0x4a00000000000000); case 58: - //10000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000 return chunked256(0x1, 0x97d4df19d6057673, 0x37e9f14d3eec8920, 0xe400000000000000); case 59: - //100000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000 return chunked256(0xf, 0xee50b7025c36a080, 0x2f236d04753d5b48, 0xe800000000000000); case 60: - //1000000000000000000000000000000000000000000000000000000000000 + // 
1000000000000000000000000000000000000000000000000000000000000 return chunked256(0x9f, 0x4f2726179a224501, 0xd762422c946590d9, 0x1000000000000000); case 61: - //10000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000 return chunked256(0x639, 0x17877cec0556b212, 0x69d695bdcbf7a87a, 0xa000000000000000); case 62: - //100000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3e3a, 0xeb4ae1383562f4b8, 0x2261d969f7ac94ca, 0x4000000000000000); case 63: - //1000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x26e4d, 0x30eccc3215dd8f31, 0x57d27e23acbdcfe6, 0x8000000000000000); case 64: - //10000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x184f03, 0xe93ff9f4daa797ed, 0x6e38ed64bf6a1f01, 0x0); case 65: - //100000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000 return chunked256(0xf31627, 0x1c7fc3908a8bef46, 0x4e3945ef7a25360a, 0x0); case 66: - //1000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x97edd87, 0x1cfda3a5697758bf, 0xe3cbb5ac5741c64, 0x0); case 67: - //10000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x5ef4a747, 0x21e864761ea97776, 0x8e5f518bb6891be8, 0x0); case 68: - //100000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3b58e88c7, 0x5313ec9d329eaaa1, 0x8fb92f75215b1710, 0x0); case 69: - //1000000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x25179157c9, 0x3ec73e23fa32aa4f, 0x9d3bda934d8ee6a0, 0x0); case 70: - //10000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x172ebad6ddc, 0x73c86d67c5faa71c, 0x245689c107950240, 0x0); case 71: - //100000000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0xe7d34c64a9c, 0x85d4460dbbca8719, 0x6b61618a4bd21680, 0x0); case 72: - //1000000000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x90e40fbeea1d, 0x3a4abc8955e946fe, 0x31cdcf66f634e100, 0x0); case 73: - //10000000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x5a8e89d752524, 0x46eb5d5d5b1cc5ed, 0xf20a1a059e10ca00, 0x0); case 74: - //100000000000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3899162693736a, 0xc531a5a58f1fbb4b, 0x746504382ca7e400, 0x0); case 75: - //1000000000000000000000000000000000000000000000000000000000000000000000000000 + // 
1000000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x235fadd81c2822b, 0xb3f07877973d50f2, 0x8bf22a31be8ee800, 0x0); case 76: - //10000000000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x161bcca7119915b5, 0x764b4abe8652979, 0x7775a5f171951000, 0x0); default: // This is not a supported value... assert(0); - } + } } // check that the divide is going to do the right thing -void check_scale_divisor(int source_scale, int target_scale) { +void check_scale_divisor(int source_scale, int target_scale) +{ int exponent = target_scale - source_scale; CUDF_EXPECTS(exponent <= cuda::std::numeric_limits<__int128_t>::digits10, "divisor too big"); } -inline __device__ int precision10(chunked256 value) { - if (value.sign() < 0) { - // we want to do this on positive numbers - value.negate(); - } - // TODO this is a horrible way to do this. We should at least - // be able to approximate the log10 using the leading zeros similar to - // http://graphics.stanford.edu/~seander/bithacks.html and then start - // the search around the guess. - for (int i = 0; i <= 76; i++) { - chunked256 tmp = pow_ten(i); - if (tmp.gte_unsigned(value)) { - return i; - } - } - return -1; +inline __device__ int precision10(chunked256 value) +{ + if (value.sign() < 0) { + // we want to do this on positive numbers + value.negate(); + } + // TODO this is a horrible way to do this. We should at least + // be able to approximate the log10 using the leading zeros similar to + // http://graphics.stanford.edu/~seander/bithacks.html and then start + // the search around the guess. + for (int i = 0; i <= 76; i++) { + chunked256 tmp = pow_ten(i); + if (tmp.gte_unsigned(value)) { return i; } + } + return -1; } -__device__ bool is_greater_than_decimal_38(chunked256 a) { +__device__ bool is_greater_than_decimal_38(chunked256 a) +{ auto const max_number_for_precision = pow_ten(38); - if (a.sign() != 0) { - a.negate(); - } + if (a.sign() != 0) { a.negate(); } return a.gte_unsigned(max_number_for_precision); } -__device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int new_scale) { +__device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int new_scale) +{ if (old_scale != new_scale) { if (new_scale < old_scale) { - int const raise = old_scale - new_scale; + int const raise = old_scale - new_scale; int const multiplier = pow_ten(raise).as_128_bits(); - data = multiply(data, chunked256(multiplier)); + data = multiply(data, chunked256(multiplier)); } else { - int const drop = new_scale - old_scale; + int const drop = new_scale - old_scale; int const divisor = pow_ten(drop).as_128_bits(); - data = divide_and_round(data, divisor); + data = divide_and_round(data, divisor); } } return data; @@ -553,102 +558,118 @@ __device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int ne // Functor to add two DECIMAL128 columns with rounding and overflow detection. 
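Before the add/sub functor that follows, here is a minimal host-side sketch of the half-up rounding rule that `round_from_remainder` and `divide_and_round` implement above: double the remainder instead of halving the divisor, compare its magnitude against the divisor, and bump the truncated quotient toward the sign of the true result. It uses plain 64-bit integers rather than chunked256, skips the overflow check on the doubled remainder, and the helper name `round_half_up` is illustrative only.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Round a truncated quotient half-up, given the remainder, the sign of the
// dividend and the divisor. The device code doubles the remainder (instead of
// halving the divisor) so no information is lost; safe here for small values.
int64_t round_half_up(int64_t quotient, int64_t remainder, int64_t dividend_sign, int64_t divisor)
{
  int64_t const doubled = remainder * 2;  // assumed not to overflow in this sketch
  bool const need_inc   = std::llabs(doubled) >= std::llabs(divisor);
  bool const round_down = (dividend_sign < 0) != (divisor < 0);  // sign of the true result
  return quotient + (need_inc ? (round_down ? -1 : 1) : 0);
}

int main()
{
  assert(round_half_up(7 / 2, 7 % 2, +1, 2) == 4);    // 3.5  -> 4
  assert(round_half_up(-7 / 2, -7 % 2, -1, 2) == -4); // -3.5 -> -4 (away from zero)
  assert(round_half_up(7 / 3, 7 % 3, +1, 3) == 2);    // 2.33 -> 2
  return 0;
}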
struct dec128_add_sub { - dec128_add_sub(bool *overflows, cudf::mutable_column_view const &result_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - result_data(result_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - result_scale(result_view.type().scale()) {} - - __device__ void add(chunked256 &a, chunked256 &b) const { - do_add_sub(a, b, false); + dec128_add_sub(bool* overflows, + cudf::mutable_column_view const& result_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + result_data(result_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + result_scale(result_view.type().scale()) + { } - __device__ void sub(chunked256 &a, chunked256 &b) const { - do_add_sub(a, b, true); - } + __device__ void add(chunked256& a, chunked256& b) const { do_add_sub(a, b, false); } -private: + __device__ void sub(chunked256& a, chunked256& b) const { do_add_sub(a, b, true); } - __device__ void do_add_sub(chunked256 &a, chunked256 &b, bool sub) const { - int intermediate_scale = min(a_scale, b_scale); - if (a_scale != intermediate_scale) { - a = set_scale_and_round(a, a_scale, intermediate_scale); - } - if (b_scale != intermediate_scale) { - b = set_scale_and_round(b, b_scale, intermediate_scale); - } - if (sub) { - // Get 2's complement - b.negate(); - } - a.add(b); + private: + __device__ void do_add_sub(chunked256& a, chunked256& b, bool sub) const + { + int intermediate_scale = min(a_scale, b_scale); + if (a_scale != intermediate_scale) { a = set_scale_and_round(a, a_scale, intermediate_scale); } + if (b_scale != intermediate_scale) { b = set_scale_and_round(b, b_scale, intermediate_scale); } + if (sub) { + // Get 2's complement + b.negate(); + } + a.add(b); - if (result_scale != intermediate_scale) { - a = set_scale_and_round(a, intermediate_scale, result_scale); - } + if (result_scale != intermediate_scale) { + a = set_scale_and_round(a, intermediate_scale, result_scale); + } } -protected: - + protected: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const result_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const result_data; int const a_scale; int const b_scale; int const result_scale; }; // Functor to add two DECIMAL128 columns with rounding and overflow detection. 
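The `dec128_add` functor below builds on `do_add_sub`, which first brings both operands to the finer of the two input scales, adds them, and then rescales to the result scale with rounding. A simplified host-side sketch of that flow, assuming `int64_t` values with a cudf-style scale exponent (value == data * 10^scale) and no overflow handling; the helper names are hypothetical.

#include <cassert>
#include <cstdint>

static int64_t pow10_i64(int e) { int64_t r = 1; while (e-- > 0) r *= 10; return r; }

// Move `data` from old_scale to new_scale: lowering the scale multiplies,
// raising it divides with half-up rounding (no overflow checks in this sketch).
static int64_t rescale(int64_t data, int old_scale, int new_scale)
{
  if (new_scale < old_scale) return data * pow10_i64(old_scale - new_scale);
  int64_t const div = pow10_i64(new_scale - old_scale);
  int64_t const q = data / div, r = data % div;
  return q + ((r < 0 ? -r : r) * 2 >= div ? (data < 0 ? -1 : 1) : 0);
}

static int64_t add_decimal(int64_t a, int a_scale, int64_t b, int b_scale, int result_scale)
{
  int const common  = a_scale < b_scale ? a_scale : b_scale;  // finer of the two scales
  int64_t const sum = rescale(a, a_scale, common) + rescale(b, b_scale, common);
  return rescale(sum, common, result_scale);
}

int main()
{
  // 1.25 (125 at scale -2) + 3.5 (35 at scale -1) = 4.75 -> 475 at scale -2
  assert(add_decimal(125, -2, 35, -1, -2) == 475);
  // Same sum rounded to scale -1: 4.75 -> 4.8 -> 48
  assert(add_decimal(125, -2, 35, -1, -1) == 48);
  return 0;
}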
-struct dec128_add: public dec128_add_sub { - dec128_add(bool *overflows, cudf::mutable_column_view const &sum_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : dec128_add_sub(overflows, sum_view, a_col, b_col) {} +struct dec128_add : public dec128_add_sub { + dec128_add(bool* overflows, + cudf::mutable_column_view const& sum_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : dec128_add_sub(overflows, sum_view, a_col, b_col) + { + } - __device__ void operator()(cudf::size_type const i) const { + __device__ void operator()(cudf::size_type const i) const + { chunked256 a(a_data[i]); chunked256 b(b_data[i]); - chunked256 &sum = a; + chunked256& sum = a; add(a, b); - overflows[i] = is_greater_than_decimal_38(sum); + overflows[i] = is_greater_than_decimal_38(sum); result_data[i] = sum.as_128_bits(); } }; // Functor to sub two DECIMAL128 columns with rounding and overflow detection. -struct dec128_sub: public dec128_add_sub { - dec128_sub(bool *overflows, cudf::mutable_column_view const &sub_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : dec128_add_sub(overflows, sub_view, a_col, b_col) {} +struct dec128_sub : public dec128_add_sub { + dec128_sub(bool* overflows, + cudf::mutable_column_view const& sub_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : dec128_add_sub(overflows, sub_view, a_col, b_col) + { + } - __device__ void operator()(cudf::size_type const i) const { + __device__ void operator()(cudf::size_type const i) const + { chunked256 a(a_data[i]); chunked256 b(b_data[i]); - chunked256 &res = a; + chunked256& res = a; sub(a, b); - overflows[i] = is_greater_than_decimal_38(res); + overflows[i] = is_greater_than_decimal_38(res); result_data[i] = res.as_128_bits(); } }; // Functor to multiply two DECIMAL128 columns with rounding and overflow detection. struct dec128_multiplier { - dec128_multiplier(bool *overflows, cudf::mutable_column_view const &product_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - product_data(product_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - prod_scale(product_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + dec128_multiplier(bool* overflows, + cudf::mutable_column_view const& product_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + product_data(product_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + prod_scale(product_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 const a(a_data[i]); chunked256 const b(b_data[i]); @@ -659,13 +680,13 @@ struct dec128_multiplier { // But to match Spark we need to first round the result to a precision of 38 // and this is specific to the value in the result of the multiply. // Then we need to round the result to the final scale that we care about. 
- int dec_precision = precision10(product); + int dec_precision = precision10(product); int first_div_precision = dec_precision - 38; int mult_scale = a_scale + b_scale; if (first_div_precision > 0) { auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); - product = divide_and_round(product, first_div_scale_divisor); + product = divide_and_round(product, first_div_scale_divisor); // a_scale and b_scale are negative. first_div_precision is not mult_scale = a_scale + b_scale + first_div_precision; @@ -680,37 +701,33 @@ struct dec128_multiplier { overflows[i] = true; return; } else { - auto const scale_mult = pow_ten( -exponent).as_128_bits(); - product = multiply(product, chunked256(scale_mult)); + auto const scale_mult = pow_ten(-exponent).as_128_bits(); + product = multiply(product, chunked256(scale_mult)); } } else { auto const scale_divisor = pow_ten(exponent).as_128_bits(); // scale and round to target scale - if (scale_divisor != 1) { - product = divide_and_round(product, scale_divisor); - } + if (scale_divisor != 1) { product = divide_and_round(product, scale_divisor); } } - overflows[i] = is_greater_than_decimal_38(product); + overflows[i] = is_greater_than_decimal_38(product); product_data[i] = product.as_128_bits(); } -private: - + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const product_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const product_data; int const a_scale; int const b_scale; int const prod_scale; }; - /** * Functor to divide two DECIMAL128 columns with rounding and overflow detection. * This functor should be used for a 128-bit regular division or a 64-bit integer division only @@ -720,21 +737,29 @@ private: template struct dec128_divider { static_assert((sizeof(T) == sizeof(uint64_t) && is_int_div) || - (sizeof(T) == sizeof(__int128_t) && !is_int_div)); - dec128_divider(bool *overflows, cudf::mutable_column_view const "ient_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - quotient_data(quotient_view.data()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - quot_scale(quotient_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + (sizeof(T) == sizeof(__int128_t) && !is_int_div)); + dec128_divider(bool* overflows, + cudf::mutable_column_view const& quotient_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + quotient_data(quotient_view.data()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + quot_scale(quotient_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 n(a_data[i]); __int128_t const d(b_data[i]); // Divide by zero, not sure if we care or not, but... if (d == 0) { - overflows[i] = true; + overflows[i] = true; quotient_data[i] = 0; return; } @@ -755,10 +780,10 @@ struct dec128_divider { // The second divide gets the result into the scale that we care about and does the rounding. 
chunked256 result; if constexpr (is_int_div) { - result = integer_divide(first_div_result.quotient, scale_divisor); + result = integer_divide(first_div_result.quotient, scale_divisor); quotient_data[i] = result.as_64_bits(); } else { - result = divide_and_round(first_div_result.quotient, scale_divisor); + result = divide_and_round(first_div_result.quotient, scale_divisor); quotient_data[i] = result.as_128_bits(); } overflows[i] = is_greater_than_decimal_38(result); @@ -774,10 +799,10 @@ struct dec128_divider { auto const first_div_result = divide(n, d); chunked256 const first_div_r(first_div_result.remainder); - //now we have to multiply each of these by how much is left + // now we have to multiply each of these by how much is left int const remaining_exp = (-n_shift_exp) - 38; - auto const scale_mult = pow_ten(remaining_exp); - auto result = multiply(first_div_result.quotient, scale_mult); + auto const scale_mult = pow_ten(remaining_exp); + auto result = multiply(first_div_result.quotient, scale_mult); auto const scaled_div_r = multiply(first_div_r, scale_mult); // Now do a second divide on what is left @@ -785,7 +810,7 @@ struct dec128_divider { result.add(second_div_result.quotient); if constexpr (is_int_div) { - overflows[i] = is_greater_than_decimal_38(result); + overflows[i] = is_greater_than_decimal_38(result); quotient_data[i] = result.as_64_bits(); } else { // and finally round @@ -795,60 +820,65 @@ struct dec128_divider { overflows[i] = is_greater_than_decimal_38(result); } else { // Regular multiply followed by a divide - if (n_shift_exp < 0) { - n = multiply(n, pow_ten(-n_shift_exp)); - } + if (n_shift_exp < 0) { n = multiply(n, pow_ten(-n_shift_exp)); } chunked256 result; if constexpr (is_int_div) { - result = integer_divide(n, d); + result = integer_divide(n, d); quotient_data[i] = result.as_64_bits(); } else { - result = divide_and_round(n, d); + result = divide_and_round(n, d); quotient_data[i] = result.as_128_bits(); } overflows[i] = is_greater_than_decimal_38(result); } } -private: - + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - T * const quotient_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + T* const quotient_data; int const a_scale; int const b_scale; int const quot_scale; }; struct dec128_remainder { - dec128_remainder(bool *overflows, cudf::mutable_column_view const &remainder_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - remainder_data(remainder_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - rem_scale(remainder_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + dec128_remainder(bool* overflows, + cudf::mutable_column_view const& remainder_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + remainder_data(remainder_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + rem_scale(remainder_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 n(a_data[i]); __int128_t const d(b_data[i]); // Divide by zero, not sure if we care or not, but... 
if (d == 0) { - overflows[i] = true; + overflows[i] = true; remainder_data[i] = 0; return; } // This implementation of remainder uses the JAVA definition of remainder - // that Spark relies on. It's *not* the most efficient way of calculating + // that Spark relies on. It's *not* the most efficient way of calculating // remainder, but we use this to be consistent with CPU Spark. // The algorithm is: - // a % b = a - (a // b) * b - // Basically we substract the integral_divide result times the divisor from + // a % b = a - (a // b) * b + // Basically we substract the integral_divide result times the divisor from // the dividend bool const is_n_neg = n.sign() < 0; @@ -862,33 +892,31 @@ struct dec128_remainder { // Then, we have to shift the dividend to compute integer divide // We use the formula from dec128_divider // Start with: quot_scale - (a_scale - b_scale) - // Then substitute 0 for quot_scale (integer divide), and rem_scale for b_scale + // Then substitute 0 for quot_scale (integer divide), and rem_scale for b_scale // (since we updated the divisor scale) // 0 - (a_scale - rem_scale) // rem_scale - a_scale - int n_shift_exp = rem_scale - a_scale; + int n_shift_exp = rem_scale - a_scale; __int128_t abs_d = is_d_neg ? -d : d; // Unlike in divide, where we can scale the dividend to get the right result - // remainder relies on the scale on the divisor, so we might have to shift the + // remainder relies on the scale on the divisor, so we might have to shift the // divisor itself. if (d_shift_exp > 0) { - // We need to shift the the scale of the divisor to rem_scale, but - // we actual need to round because of how precision is to be handled, + // We need to shift the the scale of the divisor to rem_scale, but + // we actual need to round because of how precision is to be handled, // since the new scale is smaller than the old scale auto const scale_divisor = pow_ten(d_shift_exp).as_128_bits(); - abs_d = divide_and_round(chunked256(abs_d), scale_divisor).as_128_bits(); + abs_d = divide_and_round(chunked256(abs_d), scale_divisor).as_128_bits(); } else { // otherwise we are multiplying the bottom by a power of 10, which divides the numerator - // by the same power of ten, so we accomodate that in our original n-shift like + // by the same power of ten, so we accomodate that in our original n-shift like // divide did before n_shift_exp -= d_shift_exp; } // For remainder, we should do the computation using positive numbers only, and then // switch the sign based on [n] *only*. chunked256 abs_n = n; - if (is_n_neg) { - abs_n.negate(); - } + if (is_n_neg) { abs_n.negate(); } chunked256 int_div_result; if (n_shift_exp > 0) { divmod256 const first_div_result = divide(abs_n, abs_d); @@ -899,178 +927,241 @@ struct dec128_remainder { // The second divide gets the result into the scale that we care about and does the rounding. 
int_div_result = integer_divide(first_div_result.quotient, scale_divisor); } else { - if (n_shift_exp < 0) { - abs_n = multiply(abs_n, pow_ten(-n_shift_exp)); - } + if (n_shift_exp < 0) { abs_n = multiply(abs_n, pow_ten(-n_shift_exp)); } int_div_result = integer_divide(abs_n, abs_d); } // Multiply the integer divide result by abs(divisor) chunked256 less_n = multiply(int_div_result, chunked256(abs_d)); if (d_shift_exp < 0) { - // scale less_n up to equal it to same scale since we were technically scaling up + // scale less_n up to equal it to same scale since we were technically scaling up // the divisor earlier (even though we only shifted n) less_n = multiply(less_n, pow_ten(-d_shift_exp)); } - // Subtract our integer divide result from n by adding the negated + // Subtract our integer divide result from n by adding the negated less_n.negate(); abs_n.add(less_n); // This should almost never overflow, but we check anyways overflows[i] = is_greater_than_decimal_38(abs_n); - result = abs_n.as_128_bits(); + result = abs_n.as_128_bits(); // Change the sign of the result based on n - if (is_n_neg) { - result = -result; - } + if (is_n_neg) { result = -result; } remainder_data[i] = result; } -private: + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const remainder_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const remainder_data; int const a_scale; int const b_scale; int const rem_scale; }; -} // anonymous namespace +} // anonymous namespace namespace cudf::jni { -std::unique_ptr -multiply_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t product_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr multiply_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t product_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, product_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, product_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto product_view = columns[1]->mutable_view(); + auto product_view = columns[1]->mutable_view(); check_scale_divisor(a.type().scale() + b.type().scale(), product_scale); - 
thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_multiplier(overflows_view.begin(), product_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr divide_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, quotient_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, quotient_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto quotient_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - dec128_divider<__int128_t, false>(overflows_view.begin(), quotient_view, a, b)); + auto quotient_view = columns[1]->mutable_view(); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + dec128_divider<__int128_t, false>(overflows_view.begin(), quotient_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -integer_divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr integer_divide_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - 
rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT64}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT64}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto quotient_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - dec128_divider(overflows_view.begin(), quotient_view, a, b)); + auto quotient_view = columns[1]->mutable_view(); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + dec128_divider(overflows_view.begin(), quotient_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -remainder_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t remainder_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr remainder_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t remainder_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, remainder_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, remainder_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); auto remainder_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_remainder(overflows_view.begin(), remainder_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -add_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t target_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr add_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t target_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a 
DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto sum_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + auto sum_view = columns[1]->mutable_view(); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_add(overflows_view.begin(), sum_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -sub_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t target_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr sub_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t target_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto sub_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + auto sub_view = columns[1]->mutable_view(); + thrust::for_each(rmm::exec_policy(stream), + 
thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_sub(overflows_view.begin(), sub_view, a, b)); return std::make_unique(std::move(columns)); } -} // namespace cudf::jni +} // namespace cudf::jni diff --git a/src/main/cpp/src/decimal_utils.hpp b/src/main/cpp/src/decimal_utils.hpp index 1c7c30ed01..1011a0a574 100644 --- a/src/main/cpp/src/decimal_utils.hpp +++ b/src/main/cpp/src/decimal_utils.hpp @@ -22,27 +22,39 @@ namespace cudf::jni { -std::unique_ptr -multiply_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t product_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -integer_divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -remainder_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t remainder_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -add_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -sub_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -} // namespace cudf::jni +std::unique_ptr multiply_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t product_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr divide_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr integer_divide_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr remainder_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t remainder_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr add_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr sub_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +} // namespace cudf::jni diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index d0367206ae..c5384e41b0 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -64,30 +64,32 @@ namespace { // 1. Append one comma character (',') to the end of each input string, except the last one. // 2. Concatenate all input strings into one string. // 3. Add a pair of bracket characters ('[' and ']') to the beginning and the end of the output. 
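A host-side sketch of the three unification steps listed above, assuming `std::string` rows instead of a strings column and skipping the device-side join + memcpy optimization; `unify_json_rows` is an illustrative name, not part of the library.

#include <cassert>
#include <optional>
#include <string>
#include <vector>

std::string unify_json_rows(std::vector<std::optional<std::string>> const& rows)
{
  std::string out = "[";
  for (std::size_t i = 0; i < rows.size(); ++i) {
    out += rows[i].value_or("{}");        // null rows become an empty object
    if (i + 1 < rows.size()) out += ',';  // comma between rows, not after the last
  }
  out += ']';
  return out;
}

int main()
{
  std::vector<std::optional<std::string>> rows{
    std::string(R"({"a":1})"), std::nullopt, std::string(R"({"b":2})")};
  assert(unify_json_rows(rows) == R"([{"a":1},{},{"b":2}])");
  return 0;
}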
-rmm::device_uvector unify_json_strings(cudf::column_view const &input, - rmm::cuda_stream_view stream) { +rmm::device_uvector unify_json_strings(cudf::column_view const& input, + rmm::cuda_stream_view stream) +{ if (input.is_empty()) { - return cudf::detail::make_device_uvector_async(std::vector{'[', ']'}, stream, - rmm::mr::get_current_device_resource()); + return cudf::detail::make_device_uvector_async( + std::vector{'[', ']'}, stream, rmm::mr::get_current_device_resource()); } - auto const d_strings = cudf::column_device_view::create(input, stream); + auto const d_strings = cudf::column_device_view::create(input, stream); auto const chars_size = input.child(cudf::strings_column_view::chars_column_index).size(); auto const output_size = - 2l + // two extra bracket characters '[' and ']' - static_cast(chars_size) + - static_cast(input.size() - 1) + // append `,` character between input rows - static_cast(input.null_count()) * 2l; // replace null with "{}" + 2l + // two extra bracket characters '[' and ']' + static_cast(chars_size) + + static_cast(input.size() - 1) + // append `,` character between input rows + static_cast(input.null_count()) * 2l; // replace null with "{}" CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), "The input json column is too large and causes overflow."); auto const joined_input = cudf::strings::detail::join_strings( - cudf::strings_column_view{input}, - cudf::string_scalar(","), // append `,` character between the input rows - cudf::string_scalar("{}"), // replacement for null rows - stream, rmm::mr::get_current_device_resource()); + cudf::strings_column_view{input}, + cudf::string_scalar(","), // append `,` character between the input rows + cudf::string_scalar("{}"), // replacement for null rows + stream, + rmm::mr::get_current_device_resource()); auto const joined_input_child = - joined_input->child(cudf::strings_column_view::chars_column_index); + joined_input->child(cudf::strings_column_view::chars_column_index); auto const joined_input_size_bytes = joined_input_child.size(); CUDF_EXPECTS(joined_input_size_bytes + 2 == output_size, "Incorrect output size computation."); @@ -95,10 +97,13 @@ rmm::device_uvector unify_json_strings(cudf::column_view const &input, // For efficiency, let's use memcpy instead of `cudf::strings::detail::concatenate`. auto output = rmm::device_uvector(joined_input_size_bytes + 2, stream); CUDF_CUDA_TRY(cudaMemsetAsync(output.data(), static_cast('['), 1, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, joined_input_child.view().data(), - joined_input_size_bytes, cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemsetAsync(output.data() + joined_input_size_bytes + 1, static_cast(']'), - 1, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, + joined_input_child.view().data(), + joined_input_size_bytes, + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemsetAsync( + output.data() + joined_input_size_bytes + 1, static_cast(']'), 1, stream.value())); #ifdef DEBUG_FROM_JSON print_debug(output, "Processed json string", "", stream); @@ -107,29 +112,33 @@ rmm::device_uvector unify_json_strings(cudf::column_view const &input, } // Check and throw exception if there is any parsing error. 
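// unify_json_strings above joins every row of the strings column with ',' separators,
// substitutes "{}" for null rows, and wraps the whole thing in '[' and ']' so the column can
// be tokenized as one JSON document. A host-side analogue of that logic, assuming the rows
// arrive as std::optional<std::string> (illustrative types, not the CUDA path):
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

std::string unify_json_rows(std::vector<std::optional<std::string>> const& rows)
{
  std::string out = "[";
  for (std::size_t i = 0; i < rows.size(); ++i) {
    out += rows[i].value_or("{}");        // null rows are replaced by an empty object
    if (i + 1 < rows.size()) out += ',';  // separator between rows, none after the last
  }
  out += ']';
  return out;
}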
-void throw_if_error(rmm::device_uvector const &input_json, - rmm::device_uvector const &tokens, - rmm::device_uvector const &token_indices, - rmm::cuda_stream_view stream) { +void throw_if_error(rmm::device_uvector const& input_json, + rmm::device_uvector const& tokens, + rmm::device_uvector const& token_indices, + rmm::cuda_stream_view stream) +{ auto const error_count = - thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); if (error_count > 0) { auto const error_location = - thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); SymbolOffsetT error_index; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &error_index, token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), cudaMemcpyDeviceToHost, stream.value())); + CUDF_CUDA_TRY( + cudaMemcpyAsync(&error_index, + token_indices.data() + thrust::distance(tokens.begin(), error_location), + sizeof(SymbolOffsetT), + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); - constexpr auto extension = 100; + constexpr auto extension = 100; auto const begin_print_idx = std::max(error_index - extension, SymbolOffsetT{0}); auto const end_print_idx = - std::min(error_index + extension, static_cast(input_json.size())); - auto const print_size = end_print_idx - begin_print_idx; + std::min(error_index + extension, static_cast(input_json.size())); + auto const print_size = end_print_idx - begin_print_idx; auto const h_input_json = cudf::detail::make_host_vector_sync( - cudf::device_span{input_json.data() + begin_print_idx, print_size}, stream); + cudf::device_span{input_json.data() + begin_print_idx, print_size}, stream); std::cerr << "Substring of the input json with " + std::to_string(extension) << " characters before+after the error location:\n"; std::cerr << std::string(h_input_json.data(), h_input_json.size()) << std::endl; @@ -141,7 +150,8 @@ void throw_if_error(rmm::device_uvector const &input_json, // Check if a token is a json node. struct is_node { - __host__ __device__ bool operator()(PdaTokenT const token) const { + __host__ __device__ bool operator()(PdaTokenT const token) const + { switch (token) { case token_t::StructBegin: case token_t::ListBegin: @@ -159,8 +169,9 @@ struct is_node { // Each row in the input column should have levels starting from 1. // This is copied from cudf's `json_tree.cu`. rmm::device_uvector compute_node_levels(int64_t num_nodes, - rmm::device_uvector const &tokens, - rmm::cuda_stream_view stream) { + rmm::device_uvector const& tokens, + rmm::cuda_stream_view stream) +{ auto token_levels = rmm::device_uvector(tokens.size(), stream); // Whether the token pops from the parent node stack. 
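// throw_if_error above counts ErrorBegin tokens, copies the index of the first one back to
// the host, and prints up to 100 characters of context on each side of the failure before
// throwing. The window-clamping logic, sketched on the host (print_error_context and its
// parameters are illustrative names, not part of the library):
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

void print_error_context(std::string const& json,
                         std::size_t error_index,
                         std::size_t extension = 100)
{
  auto const begin = error_index > extension ? error_index - extension : 0;
  auto const end   = std::min(error_index + extension, json.size());
  std::cerr << "Substring of the input json with " << extension
            << " characters before+after the error location:\n"
            << json.substr(begin, end - begin) << std::endl;
}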
@@ -184,16 +195,19 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, }; auto const push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan(rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), - token_levels.begin()); - - auto node_levels = rmm::device_uvector(num_nodes, stream); - auto const copy_end = - cudf::detail::copy_if_safe(token_levels.begin(), token_levels.end(), tokens.begin(), - node_levels.begin(), is_node{}, stream); + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), token_levels.begin()); + + auto node_levels = rmm::device_uvector(num_nodes, stream); + auto const copy_end = cudf::detail::copy_if_safe(token_levels.begin(), + token_levels.end(), + tokens.begin(), + node_levels.begin(), + is_node{}, + stream); CUDF_EXPECTS(thrust::distance(node_levels.begin(), copy_end) == num_nodes, "Node level count mismatch"); @@ -204,14 +218,17 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, } // Compute the map from nodes to their indices in the list of all tokens. -rmm::device_uvector -compute_node_to_token_index_map(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::cuda_stream_view stream) { - auto node_token_ids = rmm::device_uvector(num_nodes, stream); +rmm::device_uvector compute_node_to_token_index_map( + int64_t num_nodes, rmm::device_uvector const& tokens, rmm::cuda_stream_view stream) +{ + auto node_token_ids = rmm::device_uvector(num_nodes, stream); auto const node_id_it = thrust::counting_iterator(0); - auto const copy_end = - cudf::detail::copy_if_safe(node_id_it, node_id_it + tokens.size(), tokens.begin(), - node_token_ids.begin(), is_node{}, stream); + auto const copy_end = cudf::detail::copy_if_safe(node_id_it, + node_id_it + tokens.size(), + tokens.begin(), + node_token_ids.begin(), + is_node{}, + stream); CUDF_EXPECTS(thrust::distance(node_token_ids.begin(), copy_end) == num_nodes, "Invalid computation for node-to-token-index map"); @@ -223,8 +240,9 @@ compute_node_to_token_index_map(int64_t num_nodes, rmm::device_uvector -std::pair, rmm::device_uvector> -stable_sorted_key_order(rmm::device_uvector const &keys, rmm::cuda_stream_view stream) { +std::pair, rmm::device_uvector> stable_sorted_key_order( + rmm::device_uvector const& keys, rmm::cuda_stream_view stream) +{ // Buffers used for storing intermediate results during sorting. 
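// compute_node_levels above derives the nesting level of every token with an exclusive scan
// of (+1 for tokens that push a scope, -1 for tokens that pop one) and then keeps just the
// levels belonging to node tokens via copy_if. A host analogue with the token set reduced to
// struct begin/end for illustration:
#include <cstddef>
#include <numeric>
#include <vector>

enum class tok { struct_begin, struct_end, value };

std::vector<int> token_levels(std::vector<tok> const& tokens)
{
  std::vector<int> deltas(tokens.size());
  for (std::size_t i = 0; i < tokens.size(); ++i) {
    int const push = tokens[i] == tok::struct_begin ? 1 : 0;  // opens a nested scope
    int const pop  = tokens[i] == tok::struct_end ? 1 : 0;    // closes one
    deltas[i]      = push - pop;
  }
  std::vector<int> levels(tokens.size());
  // exclusive scan: level of token i = unmatched "begin" tokens strictly before it
  std::exclusive_scan(deltas.begin(), deltas.end(), levels.begin(), 0);
  return levels;
}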
rmm::device_uvector keys_buffer1(keys.size(), stream); rmm::device_uvector keys_buffer2(keys.size(), stream); @@ -237,43 +255,52 @@ stable_sorted_key_order(rmm::device_uvector const &keys, rmm::cuda_stre thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_buffer, order_buffer, - keys.size()); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), temp_storage_bytes, keys_buffer, - order_buffer, keys.size(), 0, sizeof(KeyType) * 8, + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + keys.size(), + 0, + sizeof(KeyType) * 8, stream.value()); - return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) : - std::move(keys_buffer2), - order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) : - std::move(order_buffer2)}; + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; } // This is copied from cudf's `json_tree.cu`. -void propagate_parent_to_siblings(rmm::device_uvector const &node_levels, - rmm::device_uvector &parent_node_ids, - rmm::cuda_stream_view stream) { +void propagate_parent_to_siblings(rmm::device_uvector const& node_levels, + rmm::device_uvector& parent_node_ids, + rmm::cuda_stream_view stream) +{ auto const [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); // Instead of gather, using permutation_iterator, which is ~17% faster. thrust::inclusive_scan_by_key( - rmm::exec_policy(stream), sorted_node_levels.begin(), sorted_node_levels.end(), - thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::equal_to{}, thrust::maximum{}); + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); } // This is copied from cudf's `json_tree.cu`. 
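// stable_sorted_key_order above runs the classic two-call cub::DeviceRadixSort::SortPairs
// pattern (size the temp storage, then sort) over a DoubleBuffer of node levels paired with
// an index sequence, returning whichever buffer side ends up current. The net effect is a
// stable argsort; a host equivalent:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

std::pair<std::vector<int>, std::vector<std::size_t>> stable_key_order(
  std::vector<int> const& keys)
{
  std::vector<std::size_t> order(keys.size());
  std::iota(order.begin(), order.end(), 0);  // 0, 1, 2, ... like thrust::sequence
  std::stable_sort(order.begin(), order.end(),
                   [&](std::size_t a, std::size_t b) { return keys[a] < keys[b]; });
  std::vector<int> sorted(keys.size());
  for (std::size_t i = 0; i < order.size(); ++i) sorted[i] = keys[order[i]];
  return {std::move(sorted), std::move(order)};
}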
-rmm::device_uvector -compute_parent_node_ids(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::device_uvector const &node_token_ids, - rmm::cuda_stream_view stream) { +rmm::device_uvector compute_parent_node_ids( + int64_t num_nodes, + rmm::device_uvector const& tokens, + rmm::device_uvector const& node_token_ids, + rmm::cuda_stream_view stream) +{ auto const first_childs_parent_token_id = [tokens = - tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) { - return -1; - } + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { return i - 1; } else if (tokens[i - 1] == token_t::FieldNameEnd) { @@ -287,16 +314,19 @@ compute_parent_node_ids(int64_t num_nodes, rmm::device_uvector const }; auto parent_node_ids = rmm::device_uvector(num_nodes, stream); - thrust::transform(rmm::exec_policy(stream), node_token_ids.begin(), node_token_ids.end(), - parent_node_ids.begin(), - [node_ids_gpu = node_token_ids.begin(), num_nodes, - first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { - auto const pid = first_childs_parent_token_id(tid); - return pid < 0 ? cudf::io::json::parent_node_sentinel : - thrust::lower_bound(thrust::seq, node_ids_gpu, - node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? cudf::io::json::parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); // Propagate parent node to siblings from first sibling - inplace. auto const node_levels = compute_node_levels(num_nodes, tokens, stream); @@ -312,27 +342,30 @@ constexpr int8_t key_sentinel{1}; constexpr int8_t value_sentinel{2}; // Check for each node if it is a key or a value field. 
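// compute_parent_node_ids above turns each node's parent *token* index into a parent *node*
// index by binary-searching the sorted node_token_ids array (thrust::lower_bound inside the
// transform). The same lookup on the host, with -1 standing in for the parent sentinel:
#include <algorithm>
#include <vector>

int parent_node_id(std::vector<int> const& node_token_ids, int parent_token_id)
{
  if (parent_token_id < 0) return -1;  // node has no parent (it is a root-level node)
  auto const it =
    std::lower_bound(node_token_ids.begin(), node_token_ids.end(), parent_token_id);
  return static_cast<int>(it - node_token_ids.begin());
}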
-rmm::device_uvector -check_key_or_value_nodes(rmm::device_uvector const &parent_node_ids, - rmm::cuda_stream_view stream) { - auto key_or_value = rmm::device_uvector(parent_node_ids.size(), stream); +rmm::device_uvector check_key_or_value_nodes( + rmm::device_uvector const& parent_node_ids, rmm::cuda_stream_view stream) +{ + auto key_or_value = rmm::device_uvector(parent_node_ids.size(), stream); auto const transform_it = thrust::counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), - key_or_value.begin(), - [key_sentinel = key_sentinel, value_sentinel = value_sentinel, - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { - if (parent_ids[node_id] > 0) { - auto const grand_parent = parent_ids[parent_ids[node_id]]; - if (grand_parent == 0) { - return key_sentinel; - } else if (parent_ids[grand_parent] == 0) { - return value_sentinel; - } + rmm::exec_policy(stream), + transform_it, + transform_it + parent_node_ids.size(), + key_or_value.begin(), + [key_sentinel = key_sentinel, + value_sentinel = value_sentinel, + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { + if (parent_ids[node_id] > 0) { + auto const grand_parent = parent_ids[parent_ids[node_id]]; + if (grand_parent == 0) { + return key_sentinel; + } else if (parent_ids[grand_parent] == 0) { + return value_sentinel; } + } - return 0; - }); + return 0; + }); #ifdef DEBUG_FROM_JSON print_debug(key_or_value, "Nodes are key/value (1==key, 2==value)", ", ", stream); @@ -351,7 +384,8 @@ struct node_ranges_fn { // Whether the extracted string values from json map will have the quote character. static const bool include_quote_char{false}; - __device__ thrust::pair operator()(cudf::size_type node_id) const { + __device__ thrust::pair operator()(cudf::size_type node_id) const + { [[maybe_unused]] auto const is_begin_of_section = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StructBegin: @@ -387,7 +421,7 @@ struct node_ranges_fn { }; auto const get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT const token, SymbolOffsetT const token_index) { + PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT quote_char_size = 1; switch (token) { // Strip off quote char included for StringBegin @@ -405,18 +439,18 @@ struct node_ranges_fn { } auto const token_idx = node_token_ids[node_id]; - auto const token = tokens[token_idx]; + auto const token = tokens[token_idx]; cudf_assert(is_begin_of_section(token) && "Invalid node category."); // The section from the original JSON input that this token demarcates. auto const range_begin = get_token_index(token, token_indices[token_idx]); - auto range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. + auto range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. 
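// check_key_or_value_nodes above classifies each node by walking up the parent chain of the
// unified document (node 0 is the wrapping list): a node whose grandparent is node 0 is a
// key, a node whose great-grandparent is node 0 is a value, everything else is neither. A
// host mirror of the device lambda, assuming the parent chain is well formed:
#include <cstdint>
#include <vector>

std::int8_t classify_node(std::vector<int> const& parent_ids, int node_id)
{
  constexpr std::int8_t key_sentinel{1};
  constexpr std::int8_t value_sentinel{2};
  if (parent_ids[node_id] > 0) {
    auto const grand_parent = parent_ids[parent_ids[node_id]];
    if (grand_parent == 0) return key_sentinel;
    if (parent_ids[grand_parent] == 0) return value_sentinel;
  }
  return 0;  // neither a key nor a value (e.g. the per-row object nodes themselves)
}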
if ((token_idx + 1) < tokens.size() && end_of_partner(token) == tokens[token_idx + 1]) { // Update the range_end for this pair of tokens range_end = get_token_index(tokens[token_idx + 1], token_indices[token_idx + 1]); } else { - auto nested_range_value = nested_node_to_value(token); // iterate until this is zero - auto end_idx = token_idx + 1; + auto nested_range_value = nested_node_to_value(token); // iterate until this is zero + auto end_idx = token_idx + 1; while (end_idx < tokens.size()) { nested_range_value += nested_node_to_value(tokens[end_idx]); if (nested_range_value == 0) { @@ -434,18 +468,24 @@ struct node_ranges_fn { // Compute index range for each node. // These ranges identify positions to extract nodes from the unified json string. -rmm::device_uvector> -compute_node_ranges(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::device_uvector const &token_indices, - rmm::device_uvector const &node_token_ids, - rmm::device_uvector const &parent_node_ids, - rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream) { +rmm::device_uvector> compute_node_ranges( + int64_t num_nodes, + rmm::device_uvector const& tokens, + rmm::device_uvector const& token_indices, + rmm::device_uvector const& node_token_ids, + rmm::device_uvector const& parent_node_ids, + rmm::device_uvector const& key_or_value, + rmm::cuda_stream_view stream) +{ auto node_ranges = - rmm::device_uvector>(num_nodes, stream); + rmm::device_uvector>(num_nodes, stream); auto const transform_it = thrust::counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), transform_it, transform_it + num_nodes, node_ranges.begin(), - node_ranges_fn{tokens, token_indices, node_token_ids, parent_node_ids, key_or_value}); + rmm::exec_policy(stream), + transform_it, + transform_it + num_nodes, + node_ranges.begin(), + node_ranges_fn{tokens, token_indices, node_token_ids, parent_node_ids, key_or_value}); #ifdef DEBUG_FROM_JSON print_pair_debug(node_ranges, "Node ranges", stream); @@ -460,12 +500,13 @@ struct substring_fn { cudf::device_span const d_string; cudf::device_span const> const d_ranges; - cudf::offset_type *d_offsets{}; - char *d_chars{}; + cudf::offset_type* d_offsets{}; + char* d_chars{}; - __device__ void operator()(cudf::size_type const idx) { + __device__ void operator()(cudf::size_type const idx) + { auto const range = d_ranges[idx]; - auto const size = range.second - range.first; + auto const size = range.second - range.first; if (d_chars) { memcpy(d_chars + d_offsets[idx], d_string.data() + range.first, size); } else { @@ -476,11 +517,14 @@ struct substring_fn { // Extract key-value string pairs from the input json string. 
std::unique_ptr extract_keys_or_values( - bool extract_key, int64_t num_nodes, - rmm::device_uvector> const &node_ranges, - rmm::device_uvector const &key_or_value, - rmm::device_uvector const &unified_json_buff, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + bool extract_key, + int64_t num_nodes, + rmm::device_uvector> const& node_ranges, + rmm::device_uvector const& key_or_value, + rmm::device_uvector const& unified_json_buff, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { return key_or_value[node_id] == key_sentinel; }; @@ -490,35 +534,47 @@ std::unique_ptr extract_keys_or_values( }; auto extract_ranges = - rmm::device_uvector>(num_nodes, stream, mr); - auto const stencil_it = thrust::make_counting_iterator(0); - auto const range_end = - extract_key ? cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, - extract_ranges.begin(), is_key, stream) : - cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, - extract_ranges.begin(), is_value, stream); + rmm::device_uvector>(num_nodes, stream, mr); + auto const stencil_it = thrust::make_counting_iterator(0); + auto const range_end = extract_key ? cudf::detail::copy_if_safe(node_ranges.begin(), + node_ranges.end(), + stencil_it, + extract_ranges.begin(), + is_key, + stream) + : cudf::detail::copy_if_safe(node_ranges.begin(), + node_ranges.end(), + stencil_it, + extract_ranges.begin(), + is_value, + stream); auto const num_extract = thrust::distance(extract_ranges.begin(), range_end); auto children = cudf::strings::detail::make_strings_children( - substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr); - return cudf::make_strings_column(num_extract, std::move(children.first), - std::move(children.second), 0, rmm::device_buffer{}); + substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr); + return cudf::make_strings_column( + num_extract, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); } // Compute the offsets for the final lists of Struct. -rmm::device_uvector -compute_list_offsets(cudf::size_type n_lists, - rmm::device_uvector const &parent_node_ids, - rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +rmm::device_uvector compute_list_offsets( + cudf::size_type n_lists, + rmm::device_uvector const& parent_node_ids, + rmm::device_uvector const& key_or_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // Count the number of children nodes for the json object nodes. // These object nodes are given as one row of the input json strings column. auto node_child_counts = rmm::device_uvector(parent_node_ids.size(), stream); // For the nodes having parent_id == 0 (they are json object given by one input row), set their // child counts to zero. Otherwise, set child counts to `-1` (a sentinel number). - thrust::transform(rmm::exec_policy(stream), parent_node_ids.begin(), parent_node_ids.end(), - node_child_counts.begin(), [] __device__(auto const parent_id) -> NodeIndexT { + thrust::transform(rmm::exec_policy(stream), + parent_node_ids.begin(), + parent_node_ids.end(), + node_child_counts.begin(), + [] __device__(auto const parent_id) -> NodeIndexT { return parent_id == 0 ? 
0 : std::numeric_limits::lowest(); }); @@ -528,9 +584,12 @@ compute_list_offsets(cudf::size_type n_lists, // Count the number of keys for each json object using `atomicAdd`. auto const transform_it = thrust::counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), - [is_key, child_counts = node_child_counts.begin(), - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) { + thrust::for_each(rmm::exec_policy(stream), + transform_it, + transform_it + parent_node_ids.size(), + [is_key, + child_counts = node_child_counts.begin(), + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) { if (is_key(node_id)) { auto const parent_id = parent_ids[node_id]; atomicAdd(&child_counts[parent_id], 1); @@ -540,29 +599,33 @@ compute_list_offsets(cudf::size_type n_lists, print_debug(node_child_counts, "Nodes' child keys counts", ", ", stream); #endif - auto list_offsets = rmm::device_uvector(n_lists + 1, stream, mr); + auto list_offsets = rmm::device_uvector(n_lists + 1, stream, mr); auto const copy_end = cudf::detail::copy_if_safe( - node_child_counts.begin(), node_child_counts.end(), list_offsets.begin(), - [] __device__(auto const count) { return count >= 0; }, stream); + node_child_counts.begin(), + node_child_counts.end(), + list_offsets.begin(), + [] __device__(auto const count) { return count >= 0; }, + stream); CUDF_EXPECTS(thrust::distance(list_offsets.begin(), copy_end) == static_cast(n_lists), "Invalid list size computation."); #ifdef DEBUG_FROM_JSON print_debug(list_offsets, "Output list sizes (except the last one)", ", ", stream); #endif - thrust::exclusive_scan(rmm::exec_policy(stream), list_offsets.begin(), list_offsets.end(), - list_offsets.begin()); + thrust::exclusive_scan( + rmm::exec_policy(stream), list_offsets.begin(), list_offsets.end(), list_offsets.begin()); #ifdef DEBUG_FROM_JSON print_debug(list_offsets, "Output list offsets", ", ", stream); #endif return list_offsets; } -} // namespace +} // namespace -std::unique_ptr from_json(cudf::column_view const &input, +std::unique_ptr from_json(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); // Firstly, concatenate all the input json strings into one giant input json string. @@ -574,8 +637,10 @@ std::unique_ptr from_json(cudf::column_view const &input, static_assert(sizeof(SymbolT) == sizeof(char), "Invalid internal data for nested json tokenizer."); auto const [tokens, token_indices] = cudf::io::json::detail::get_token_stream( - cudf::device_span{unified_json_buff.data(), unified_json_buff.size()}, - cudf::io::json_reader_options{}, stream, rmm::mr::get_current_device_resource()); + cudf::device_span{unified_json_buff.data(), unified_json_buff.size()}, + cudf::io::json_reader_options{}, + stream, + rmm::mr::get_current_device_resource()); #ifdef DEBUG_FROM_JSON print_debug(tokens, "Tokens", ", ", stream); @@ -586,7 +651,7 @@ std::unique_ptr from_json(cudf::column_view const &input, throw_if_error(unified_json_buff, tokens, token_indices, stream); auto const num_nodes = - thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node{}); + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node{}); // Compute the map from nodes to their indices in the list of all tokens. 
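// Two host-side sketches of the output assembly steps above. extract_keys_or_values filters
// the node ranges down to keys (or values) and builds a strings column with the usual
// two-pass pattern: record sizes, scan them into offsets, then copy characters.
// compute_list_offsets counts the keys under each row's object node (atomicAdd per key on
// the device) and turns the counts into list offsets with an exclusive scan. Names here are
// illustrative, not the library's:
#include <cstddef>
#include <cstring>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

// Gather substrings of `unified` described by [first, second) ranges into offsets + chars.
std::pair<std::vector<int>, std::string> gather_substrings(
  std::string const& unified, std::vector<std::pair<int, int>> const& ranges)
{
  std::vector<int> offsets(ranges.size() + 1, 0);
  for (std::size_t i = 0; i < ranges.size(); ++i)  // pass 1: per-string sizes
    offsets[i + 1] = ranges[i].second - ranges[i].first;
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());  // sizes -> offsets
  std::string chars(static_cast<std::size_t>(offsets.back()), '\0');
  for (std::size_t i = 0; i < ranges.size(); ++i)  // pass 2: copy the characters
    std::memcpy(chars.data() + offsets[i],
                unified.data() + ranges[i].first,
                static_cast<std::size_t>(ranges[i].second - ranges[i].first));
  return {std::move(offsets), std::move(chars)};
}

// Turn per-row key counts into the offsets child of the output lists column.
std::vector<int> counts_to_offsets(std::vector<int> const& keys_per_row)
{
  std::vector<int> offsets(keys_per_row.size() + 1, 0);
  // offsets[i] = pairs in all rows before row i; offsets.back() = total number of pairs
  std::partial_sum(keys_per_row.begin(), keys_per_row.end(), offsets.begin() + 1);
  return offsets;
}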
auto const node_token_ids = compute_node_to_token_index_map(num_nodes, tokens, stream); @@ -599,23 +664,23 @@ std::unique_ptr from_json(cudf::column_view const &input, // Compute index range for each node. // These ranges identify positions to extract nodes from the unified json string. - auto const node_ranges = compute_node_ranges(num_nodes, tokens, token_indices, node_token_ids, - parent_node_ids, key_or_value_node, stream); + auto const node_ranges = compute_node_ranges( + num_nodes, tokens, token_indices, node_token_ids, parent_node_ids, key_or_value_node, stream); // // From below are variables for returning output. // - auto extracted_keys = extract_keys_or_values(true /*key*/, num_nodes, node_ranges, - key_or_value_node, unified_json_buff, stream, mr); - auto extracted_values = extract_keys_or_values(false /*value*/, num_nodes, node_ranges, - key_or_value_node, unified_json_buff, stream, mr); + auto extracted_keys = extract_keys_or_values( + true /*key*/, num_nodes, node_ranges, key_or_value_node, unified_json_buff, stream, mr); + auto extracted_values = extract_keys_or_values( + false /*value*/, num_nodes, node_ranges, key_or_value_node, unified_json_buff, stream, mr); CUDF_EXPECTS(extracted_keys->size() == extracted_values->size(), "Invalid key-value pair extraction."); // Compute the offsets of the final output lists column. auto list_offsets = - compute_list_offsets(input.size(), parent_node_ids, key_or_value_node, stream, mr); + compute_list_offsets(input.size(), parent_node_ids, key_or_value_node, stream, mr); #ifdef DEBUG_FROM_JSON print_output_spark_map(list_offsets, extracted_keys, extracted_values, stream); @@ -625,15 +690,18 @@ std::unique_ptr from_json(cudf::column_view const &input, std::vector> out_keys_vals; out_keys_vals.emplace_back(std::move(extracted_keys)); out_keys_vals.emplace_back(std::move(extracted_values)); - auto structs_col = cudf::make_structs_column(num_pairs, std::move(out_keys_vals), 0, - rmm::device_buffer{}, stream, mr); - - auto offsets = std::make_unique(std::move(list_offsets), - rmm::device_buffer{}, 0); - - return cudf::make_lists_column( - input.size(), std::move(offsets), std::move(structs_col), - input.null_count(), cudf::detail::copy_bitmask(input, stream, mr), stream, mr); + auto structs_col = cudf::make_structs_column( + num_pairs, std::move(out_keys_vals), 0, rmm::device_buffer{}, stream, mr); + + auto offsets = std::make_unique(std::move(list_offsets), rmm::device_buffer{}, 0); + + return cudf::make_lists_column(input.size(), + std::move(offsets), + std::move(structs_col), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr), + stream, + mr); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/map_utils.hpp index ddf66b07de..445bc89c7b 100644 --- a/src/main/cpp/src/map_utils.hpp +++ b/src/main/cpp/src/map_utils.hpp @@ -24,8 +24,9 @@ namespace spark_rapids_jni { -std::unique_ptr -from_json(cudf::column_view const &input, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr from_json( + cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils_debug.cuh b/src/main/cpp/src/map_utils_debug.cuh index 652fa84672..39446b2971 
100644 --- a/src/main/cpp/src/map_utils_debug.cuh +++ b/src/main/cpp/src/map_utils_debug.cuh @@ -16,7 +16,7 @@ #pragma once -//#define DEBUG_FROM_JSON +// #define DEBUG_FROM_JSON #ifdef DEBUG_FROM_JSON @@ -36,7 +36,8 @@ namespace spark_rapids_jni { using namespace cudf::io::json; // Convert the token value into string name, for debugging purpose. -std::string token_to_string(PdaTokenT const token_type) { +std::string token_to_string(PdaTokenT const token_type) +{ switch (token_type) { case token_t::StructBegin: return "StructBegin"; case token_t::StructEnd: return "StructEnd"; @@ -57,27 +58,30 @@ std::string token_to_string(PdaTokenT const token_type) { // Print the content of the input device vector. template -void print_debug(rmm::device_uvector const &input, std::string const &name, - std::string const &separator, rmm::cuda_stream_view stream) { +void print_debug(rmm::device_uvector const& input, + std::string const& name, + std::string const& separator, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { ss << static_cast(h_input[i]); - if (separator.size() > 0 && i + 1 < h_input.size()) { - ss << separator; - } + if (separator.size() > 0 && i + 1 < h_input.size()) { ss << separator; } } std::cerr << ss.str() << std::endl; } // Print the content of the input map given by a device vector. template -void print_map_debug(rmm::device_uvector const &input, std::string const &name, - rmm::cuda_stream_view stream) { +void print_map_debug(rmm::device_uvector const& input, + std::string const& name, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { @@ -88,10 +92,12 @@ void print_map_debug(rmm::device_uvector const &input, std::string const &nam // Print the content of the input pairs given by a device vector. template -void print_pair_debug(rmm::device_uvector const &input, std::string const &name, - rmm::cuda_stream_view stream) { +void print_pair_debug(rmm::device_uvector const& input, + std::string const& name, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { @@ -102,36 +108,37 @@ void print_pair_debug(rmm::device_uvector const &input, std::string const &na } // Print the final output map data (Spark's MapType, i.e., List>). 
-void print_output_spark_map(rmm::device_uvector const &list_offsets, - std::unique_ptr const &extracted_keys, - std::unique_ptr const &extracted_values, - rmm::cuda_stream_view stream) { - auto const keys_child = extracted_keys->child(cudf::strings_column_view::chars_column_index); +void print_output_spark_map(rmm::device_uvector const& list_offsets, + std::unique_ptr const& extracted_keys, + std::unique_ptr const& extracted_values, + rmm::cuda_stream_view stream) +{ + auto const keys_child = extracted_keys->child(cudf::strings_column_view::chars_column_index); auto const keys_offsets = extracted_keys->child(cudf::strings_column_view::offsets_column_index); auto const values_child = extracted_values->child(cudf::strings_column_view::chars_column_index); auto const values_offsets = - extracted_values->child(cudf::strings_column_view::offsets_column_index); + extracted_values->child(cudf::strings_column_view::offsets_column_index); auto const h_extracted_keys_child = cudf::detail::make_host_vector_sync( - cudf::device_span{keys_child.view().data(), - static_cast(keys_child.size())}, - stream); + cudf::device_span{keys_child.view().data(), + static_cast(keys_child.size())}, + stream); auto const h_extracted_keys_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{keys_offsets.view().data(), - static_cast(keys_offsets.size())}, - stream); + cudf::device_span{keys_offsets.view().data(), + static_cast(keys_offsets.size())}, + stream); auto const h_extracted_values_child = cudf::detail::make_host_vector_sync( - cudf::device_span{values_child.view().data(), - static_cast(values_child.size())}, - stream); + cudf::device_span{values_child.view().data(), + static_cast(values_child.size())}, + stream); auto const h_extracted_values_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{values_offsets.view().data(), - static_cast(values_offsets.size())}, - stream); + cudf::device_span{values_offsets.view().data(), + static_cast(values_offsets.size())}, + stream); auto const h_list_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{list_offsets.data(), list_offsets.size()}, stream); + cudf::device_span{list_offsets.data(), list_offsets.size()}, stream); CUDF_EXPECTS(h_list_offsets.back() == extracted_keys->size(), "Invalid list offsets computation."); @@ -144,16 +151,16 @@ void print_output_spark_map(rmm::device_uvector const &list_o ++string_idx) { { auto const string_begin = h_extracted_keys_offsets[string_idx]; - auto const string_end = h_extracted_keys_offsets[string_idx + 1]; - auto const size = string_end - string_begin; - auto const ptr = &h_extracted_keys_child[string_begin]; + auto const string_end = h_extracted_keys_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_keys_child[string_begin]; ss << "\t\"" << std::string(ptr, size) << "\" : "; } { auto const string_begin = h_extracted_values_offsets[string_idx]; - auto const string_end = h_extracted_values_offsets[string_idx + 1]; - auto const size = string_end - string_begin; - auto const ptr = &h_extracted_values_child[string_begin]; + auto const string_end = h_extracted_values_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_values_child[string_begin]; ss << "\"" << std::string(ptr, size) << "\"\n"; } } @@ -161,6 +168,6 @@ void print_output_spark_map(rmm::device_uvector const &list_o std::cerr << ss.str() << std::endl; } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni -#endif 
// DEBUG_FROM_JSON +#endif // DEBUG_FROM_JSON diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu index aa772908dc..1960ce392b 100644 --- a/src/main/cpp/src/row_conversion.cu +++ b/src/main/cpp/src/row_conversion.cu @@ -50,7 +50,7 @@ #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) +#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include #include @@ -68,14 +68,14 @@ constexpr auto JCUDF_ROW_ALIGNMENT = 8; constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; +constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; +constexpr auto MIN_STRING_BLOCKS = 32; +constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; constexpr auto NUM_WARPS_IN_BLOCK = 32; -} // anonymous namespace +} // anonymous namespace // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init @@ -87,8 +87,9 @@ using rmm::device_uvector; #ifdef ASYNC_MEMCPY_SUPPORTED using cuda::aligned_size_t; #else -template using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED +template +using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. +#endif // ASYNC_MEMCPY_SUPPORTED namespace spark_rapids_jni { namespace detail { @@ -156,8 +157,9 @@ struct tile_info { int end_row; int batch_number; - __device__ inline size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes) const + { // this calculation is invalid if there are holes in the data such as a variable-width column. // It is wrong in a safe way in that it will say this row size is larger than it should be, so // we are not losing data we are just not as efficient as we could be with shared memory. 
This @@ -179,9 +181,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** @@ -189,11 +191,11 @@ struct row_batch { * */ struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + device_uvector batch_row_offsets; // offsets to each row in incoming data + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -206,8 +208,10 @@ struct batch_data { * offsets into the string column */ std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) { +build_string_row_offsets(table_view const& tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) +{ auto const num_rows = tbl.num_rows(); rmm::device_uvector d_row_sizes(num_rows, stream); thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); @@ -215,37 +219,44 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi auto d_offsets_iterators = [&]() { std::vector offsets_iterators; auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + std::copy_if(offsets_iter, + offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), - [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream, - rmm::mr::get_current_device_resource()); + [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_async( + offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + [d_offsets_iterators = d_offsets_iterators.data(), + num_columns, + num_rows, d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { auto const row = 
element_idx % num_rows; auto const col = element_idx / num_rows; auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment - thrust::transform(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), - d_row_sizes.begin(), [fixed_width_and_validity_size] __device__(auto row_size) { + thrust::transform(rmm::exec_policy(stream), + d_row_sizes.begin(), + d_row_sizes.end(), + d_row_sizes.begin(), + [fixed_width_and_validity_size] __device__(auto row_size) { return util::round_up_unsafe(fixed_width_and_validity_size + row_size, JCUDF_ROW_ALIGNMENT); }); @@ -259,9 +270,10 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi */ struct string_row_offset_functor { string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; + : d_row_offsets(d_row_offsets){}; - __device__ inline size_type operator()(int row_number, int) const { + __device__ inline size_type operator()(int row_number, int) const + { return d_row_offsets[row_number]; } @@ -274,9 +286,10 @@ struct string_row_offset_functor { */ struct fixed_width_row_offset_functor { fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; + : _fixed_width_only_row_size(fixed_width_only_row_size){}; - __device__ inline size_type operator()(int row_number, int tile_row_start) const { + __device__ inline size_type operator()(int row_number, int tile_row_start) const + { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -298,11 +311,15 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void -copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *input_offset_in_row, - const size_type *num_bytes, int8_t **output_data, - bitmask_type **output_nm, const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -317,30 +334,30 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
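// build_string_row_offsets above sums the string lengths of every variable-width column for
// each row (atomicAdd over the flattened column-by-row index space), adds the fixed-width
// plus validity portion, and rounds each row up to the 8-byte JCUDF_ROW_ALIGNMENT. The same
// computation on the host, taking one offsets array per string column (illustrative inputs):
#include <vector>

std::vector<int> jcudf_row_sizes(std::vector<std::vector<int>> const& string_col_offsets,
                                 int fixed_width_and_validity_size,
                                 int num_rows,
                                 int alignment = 8)
{
  std::vector<int> row_sizes(num_rows, 0);
  for (auto const& offsets : string_col_offsets)  // each offsets array has num_rows + 1 entries
    for (int row = 0; row < num_rows; ++row)
      row_sizes[row] += offsets[row + 1] - offsets[row];  // bytes this column adds to the row
  for (auto& size : row_sizes)  // add the fixed-width part and align the row to 8 bytes
    size = ((size + fixed_width_and_validity_size + alignment - 1) / alignment) * alignment;
  return row_sizes;
}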
- size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t const *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t const* long_input = reinterpret_cast(input_data); - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; size_type const shared_output_end = shared_length / sizeof(int64_t); @@ -363,31 +380,31 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // because we may need them to copy data in for the next row group. 
uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; + auto const col_index_start = threadIdx.y; auto const col_index_stride = blockDim.y; for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + auto const col_size = num_bytes[col_index]; + int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -400,25 +417,29 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n } } - bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy // wait for the row_group to be totally copied before starting on the next row group __syncthreads(); } } -__global__ void copy_to_rows_fixed_width_optimized( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, - const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -435,18 +456,18 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
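// In copy_from_rows_fixed_width_optimized above, each warp lane tests one row's validity bit
// from the packed JCUDF row and __ballot_sync collects the 32 predicates into a single
// bitmask word, which the lane handling row_index % 32 == 0 writes to the output null mask.
// A scalar stand-in for that packing step (pack_validity_word is an illustrative name):
#include <cstddef>
#include <cstdint>
#include <vector>

std::uint32_t pack_validity_word(std::vector<bool> const& valid, std::size_t first_row)
{
  std::uint32_t word = 0;
  for (std::uint32_t lane = 0; lane < 32 && first_row + lane < valid.size(); ++lane)
    if (valid[first_row + lane]) word |= (1u << lane);  // bit i <=> row first_row + i is valid
  return word;
}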
- size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { @@ -457,31 +478,31 @@ __global__ void copy_to_rows_fixed_width_optimized( // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; + size_type col_index_start = threadIdx.y; size_type col_index_stride = blockDim.y; for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + size_type col_size = num_bytes[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -495,11 +516,11 @@ __global__ void copy_to_rows_fixed_width_optimized( } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -511,24 +532,22 
@@ __global__ void copy_to_rows_fixed_width_optimized( // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end column loop - } // end row copy + } // end column loop + } // end row copy // wait for the row_group to be totally copied into shared memory __syncthreads(); // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } + size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + size_type shared_length = row_size * num_rows_in_group; size_type shared_input_end = shared_length / sizeof(int64_t); @@ -547,7 +566,7 @@ __global__ void copy_to_rows_fixed_width_optimized( #define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) #else #define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED /** * @brief copy data from cudf columns into JCUDF format, which is row-based @@ -566,12 +585,17 @@ __global__ void copy_to_rows_fixed_width_optimized( * */ template -__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, +__global__ void copy_to_rows(const size_type num_rows, + const size_type num_columns, const size_type shmem_used_per_tile, - device_span tile_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - RowOffsetFunctor row_offsets, size_type const *batch_row_boundaries, - int8_t **output_data) { + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -581,21 +605,19 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // any calculation to do here, but it is important to note. 
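// The validity write in copy_to_rows_fixed_width_optimized above uses atomicOr, which needs
// a 4-byte-aligned address, so the kernel rewrites (validity byte pointer, bit within byte)
// into (aligned 32-bit word pointer, bit within word). The address arithmetic as a host-side
// sketch, assuming little-endian byte order as on the GPU:
#include <cstdint>

struct aligned_bit {
  std::uint32_t* word;  // 4-byte-aligned word containing the target bit
  int bit;              // position of the flag within that word
};

inline aligned_bit to_aligned_word_bit(std::uint8_t* valid_byte, int byte_bit_offset)
{
  auto const fixup_bytes = reinterpret_cast<std::uintptr_t>(valid_byte) % 4;  // 0..3
  auto* word = reinterpret_cast<std::uint32_t*>(valid_byte - fixup_bytes);    // align down
  return {word, byte_bit_offset + static_cast<int>(fixup_bytes) * 8};
}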
auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; #ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x]; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[tile.start_col]; // to do the copy we need to do n column copies followed by m element copies OR we have to do m @@ -610,12 +632,11 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // works on a row for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { - - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; + auto const absolute_col = relative_col + tile.start_col; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; + auto const col_ptr = input_data[absolute_col]; if (col_ptr == nullptr) { // variable-width data column @@ -624,7 +645,6 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { // out of bounds continue; @@ -632,23 +652,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const absolute_row = relative_row + tile.start_row; auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; + auto const input_src = col_ptr + col_size * absolute_row; // copy the element from global memory switch (col_size) { case 2: { - const int16_t *short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; + const int16_t* short_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; + const int32_t* int_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; + const int64_t* long_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; break; } case 1: shared_data[shared_offset] = *input_src; break; @@ -689,7 +709,7 @@ __global__ void copy_to_rows(const 
size_type num_rows, const size_type num_colum tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -708,58 +728,60 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * */ template -__global__ void -copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type validity_offset, device_span tile_infos, - const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; // each thread of warp reads a single int32 of validity - so we read 128 bytes then ballot_sync // the bits and write the result to shmem after we fill shared mem memcpy it out in a blob. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto tile = tile_infos[blockIdx.x]; + auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const rows_per_read = cudf::detail::size_in_bits(); + auto const rows_per_read = cudf::detail::size_in_bits(); auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read); auto const validity_data_row_length = util::round_up_unsafe( - util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. 
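The warp-level bit transpose described above can be exercised in isolation. The toy kernel below is a hedged sketch, not part of this patch: each lane owns one column's 32 validity bits and __ballot_sync regroups them into row-major words; the name transpose_validity, the odd-columns-valid input, and the single-warp launch are all invented.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Lane c holds the bitmask word for column c (32 rows of validity). For each row bit,
// __ballot_sync collects that bit from all 32 lanes, producing the row-major word that
// would be written into the JCUDF row's validity section.
__global__ void transpose_validity(uint32_t const* col_words, uint32_t* row_words)
{
  uint32_t const my_column_bits = col_words[threadIdx.x];
  for (int row = 0; row < 32; ++row) {
    uint32_t const row_bits = __ballot_sync(0xFFFFFFFFu, my_column_bits & (1u << row));
    if (threadIdx.x == 0) { row_words[row] = row_bits; }  // lead lane writes the result
  }
}

int main()
{
  uint32_t h_cols[32];
  for (int c = 0; c < 32; ++c) { h_cols[c] = (c % 2) ? 0xFFFFFFFFu : 0u; }  // odd columns valid
  uint32_t *d_cols, *d_rows;
  cudaMalloc(&d_cols, sizeof(h_cols));
  cudaMalloc(&d_rows, sizeof(h_cols));
  cudaMemcpy(d_cols, h_cols, sizeof(h_cols), cudaMemcpyHostToDevice);
  transpose_validity<<<1, 32>>>(d_cols, d_rows);
  uint32_t h_rows[32];
  cudaMemcpy(h_rows, d_rows, sizeof(h_rows), cudaMemcpyDeviceToHost);
  std::printf("row 0 validity word: 0x%08x\n", h_rows[0]);  // expect 0xAAAAAAAA
  return 0;
}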
for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); - auto const relative_row = section_y * rows_per_read; - auto const absolute_col = relative_col + tile.start_col; - auto const absolute_row = relative_row + tile.start_row; - auto const participating = absolute_col < num_columns && absolute_row < num_rows; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); + auto const relative_row = section_y * rows_per_read; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; + auto const participating = absolute_col < num_columns && absolute_row < num_rows; auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating); if (participating) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][word_index(absolute_row)] : - std::numeric_limits::max(); + auto my_data = input_nm[absolute_col] != nullptr + ? input_nm[absolute_col][word_index(absolute_row)] + : std::numeric_limits::max(); // every thread that is participating in the warp has 4 bytes, but it's column-based data and // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes @@ -769,19 +791,19 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); + validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; + *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; } } } } auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); auto const row_batch_start = tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; // make sure entire tile has finished copy @@ -807,7 +829,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -826,42 +848,46 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, - int8_t const **variable_input_data, - size_type const *variable_col_output_offsets, - size_type const **variable_col_offsets, - size_type fixed_width_row_size, RowOffsetFunctor row_offsets, - size_type const batch_row_offset, int8_t *output_data) { +__global__ void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) +{ // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. The base thread will first go through column data and fill out // offset/length information for the column. Then all threads of the warp will participate in the // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(my_block); + auto const warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif auto const start_row = - blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; + blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; auto const end_row = - std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); + std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); for (int row = start_row; row < end_row; row += warp.meta_group_size()) { - auto offset = fixed_width_row_size; // initial offset to variable-width data + auto offset = fixed_width_row_size; // initial offset to variable-width data auto const base_row_offset = row_offsets(row, 0); for (int col = 0; col < num_variable_columns; ++col) { auto const string_start_offset = variable_col_offsets[col][row]; - auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; + auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; if (warp.thread_rank() == 0) { // write the offset/length to column - uint32_t *output_dest = reinterpret_cast( - &output_data[base_row_offset + variable_col_output_offsets[col]]); + uint32_t* output_dest = reinterpret_cast( + &output_data[base_row_offset + variable_col_output_offsets[col]]); output_dest[0] = offset; output_dest[1] = string_length; } auto string_output_dest = &output_data[base_row_offset + offset]; - auto string_output_src = &variable_input_data[col][string_start_offset]; + auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); @@ -891,11 +917,17 @@ __global__ void copy_strings_to_rows(size_type const num_rows, size_type const n * */ template -__global__ void 
copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type *col_sizes, const size_type *col_offsets, - device_span tile_infos, const int8_t *input_data) { +__global__ void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -908,31 +940,30 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // memory for each of the tiles that we work on auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED { - auto const fetch_tile = tile_infos[blockIdx.x]; + auto const fetch_tile = tile_infos[blockIdx.x]; auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; + fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; absolute_row += warp.meta_group_size()) { + absolute_row <= fetch_tile.end_row; + absolute_row += warp.meta_group_size()) { warp.sync(); auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; + auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data #ifdef ASYNC_MEMCPY_SUPPORTED @@ -946,9 +977,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); + auto const tile = tile_infos[blockIdx.x]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); #ifdef ASYNC_MEMCPY_SUPPORTED @@ -956,7 +987,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. 
The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". Because of this and the different sizes @@ -965,8 +996,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // than rows, we do a global index instead of a double for loop with col/row. for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; relative_row += warp.size()) { - - auto const absolute_row = relative_row + tile.start_row; + auto const absolute_row = relative_row + tile.start_row; auto const shared_memory_row_offset = tile_row_size * relative_row; for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; @@ -974,11 +1004,11 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_col = relative_col + tile.start_col; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; - int8_t *shmem_src = &shared[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + int8_t* shmem_src = &shared[shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; MEMCPY(dst, shmem_src, column_size, tile_barrier); } @@ -990,7 +1020,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1009,12 +1039,16 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * */ template -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, bitmask_type **output_nm, - const size_type validity_offset, device_span tile_infos, - const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) +{ extern __shared__ int8_t shared[]; using cudf::detail::warp_size; @@ -1034,44 +1068,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // __ballot_sync, representing 32 rows of that column. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. 
__shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; + auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; + auto const cols_per_read = CHAR_BIT; - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; + auto const rows_per_read = static_cast(threads_per_warp); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * cols_per_read; auto const relative_row = section_y * rows_per_read + warp.thread_rank(); auto const absolute_col = relative_col + tile_start_col; auto const absolute_row = relative_row + tile_start_row; auto const row_batch_start = - tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); @@ -1088,8 +1120,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // lead thread in each warp writes data if (warp.thread_rank() == 0) { auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; + validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; + *reinterpret_cast(&shared[validity_write_offset]) = validity_data; } } } @@ -1104,13 +1136,13 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); + auto dst = output_nm[absolute_col] + word_index(tile_start_row); auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); + reinterpret_cast(&shared[validity_data_col_length * relative_col]); #ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, aligned_size_t<4>(validity_data_col_length), - shared_tile_barrier); + cuda::memcpy_async( + warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); #else for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { dst[b] = src[b]; @@ -1123,7 +1155,7 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1140,38 +1172,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **string_row_offsets, - int32_t **string_lengths, size_type **string_column_offsets, - char **string_col_data, int8_t const *row_data, +__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, size_type const num_rows, - size_type const num_string_columns) { + size_type const num_string_columns) +{ // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. // Traversing in row-major order to coalesce the offsets and size reads. 
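The column-major tile numbering described above reduces to % and / arithmetic on the linear tile index. A host-only sketch with made-up sizes (64 is a stand-in for NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS, which is not reproduced here):

#include <algorithm>
#include <cstdio>

int main()
{
  int const rows_per_block     = 64;    // stand-in for NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS
  int const num_rows           = 1000;
  int const num_string_columns = 3;
  int const tiles_per_col = (num_rows + rows_per_block - 1) / rows_per_block;
  int const num_tiles     = tiles_per_col * num_string_columns;
  // A tile never wraps past the bottom of the table: all of its rows come from one column.
  for (int tile = 0; tile < num_tiles; ++tile) {
    int const starting_row = (tile % tiles_per_col) * rows_per_block;
    int const col          = tile / tiles_per_col;
    int const ending_row   = std::min(starting_row + rows_per_block, num_rows);
    if (tile < 4) {
      std::printf("tile %d -> column %d, rows [%d, %d)\n", tile, col, starting_row, ending_row);
    }
  }
  return 0;
}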
auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); + auto warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; + auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); + auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); + auto const num_tiles = tiles_per_col * num_string_columns; + auto const tile_stride = warp.meta_group_size() * gridDim.x; // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing // the same parameters to async_memcpy and all threads in the warp participating in the copy. for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; + auto const col = my_tile / tiles_per_col; + auto const str_len = string_lengths[col]; + auto const str_row_off = string_row_offsets[col]; + auto const str_col_off = string_column_offsets[col]; + auto str_col_data = string_col_data[col]; for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; + auto dst = &str_col_data[str_col_off[row]]; #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); @@ -1194,8 +1230,12 @@ __global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **s * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, - const size_type size_per_row, dim3 &blocks, dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, + const size_type num_rows, + const size_type size_per_row, + dim3& blocks, + dim3& threads) +{ // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -1205,7 +1245,7 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. 
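Condensed, the sizing heuristic in that comment is: cap the y dimension (columns) at 32 threads with roughly four columns per thread, then give the remaining threads of a 1024-thread block to the x dimension (rows). A minimal host-side sketch with an invented column count; the real function additionally accounts for shared-memory limits and clamps the block count:

#include <algorithm>
#include <cstdio>

int main()
{
  int const num_columns  = 50;                                   // invented example input
  int const y_block_size = std::min((num_columns + 3) / 4, 32);  // <= 32 threads, ~4 columns each
  int const x_block_size = 1024 / y_block_size;                  // rows covered by the block in x
  std::printf("thread block: %d (rows) x %d (columns)\n", x_block_size, y_block_size);
  return 0;
}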
- int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1228,9 +1268,9 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // to try and future proof this a bit. int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; threads.x = block_size; threads.y = y_block_size; threads.z = 1; @@ -1244,12 +1284,19 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type size_per_row, rmm::device_uvector &column_start, - rmm::device_uvector &column_size, rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, const scalar &zero, - const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const scalar& zero, + const scalar& scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), @@ -1257,28 +1304,43 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); - std::unique_ptr data = - make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), - mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), + static_cast(total_allocation), + mask_state::UNALLOCATED, + stream, + mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + start_row, + num_rows, + num_columns, + size_per_row, + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), + data->mutable_view().data()); + + return make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { - 
return std::all_of(schema.begin(), schema.end(), - [](const data_type &t) { return is_fixed_width(t); }); +static inline bool are_all_fixed_width(std::vector const& schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); }); } /** @@ -1289,9 +1351,10 @@ static inline bool are_all_fixed_width(std::vector const &schema) { * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) +{ // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; @@ -1299,7 +1362,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; @@ -1309,7 +1372,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); + util::div_rounding_up_safe(schema.size(), CHAR_BIT); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1325,8 +1388,8 @@ struct column_info_s { std::vector column_sizes; std::vector variable_width_column_starts; - column_info_s &operator=(column_info_s const &other) = delete; - column_info_s &operator=(column_info_s &&other) = delete; + column_info_s& operator=(column_info_s const& other) = delete; + column_info_s& operator=(column_info_s&& other) = delete; }; /** @@ -1340,7 +1403,8 @@ struct column_info_s { * @return size of the fixed_width data portion of a row. */ template -column_info_s compute_column_information(iterator begin, iterator end) { +column_info_s compute_column_information(iterator begin, iterator end) +{ size_type size_per_row = 0; std::vector column_starts; std::vector column_sizes; @@ -1358,10 +1422,8 @@ column_info_s compute_column_information(iterator begin, iterator end) { // align size for this type - They are the same for fixed width types and 4 bytes for variable // width length/offset combos size_type const alignment_needed = compound_type ? 
__alignof(uint32_t) : col_size; - size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); - if (compound_type) { - variable_width_column_starts.push_back(size_per_row); - } + size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); + if (compound_type) { variable_width_column_starts.push_back(size_per_row); } column_starts.push_back(size_per_row); column_sizes.push_back(col_size); size_per_row += col_size; @@ -1373,9 +1435,11 @@ column_info_s compute_column_information(iterator begin, iterator end) { // validity is byte-aligned in the JCUDF format size_per_row += - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); - return {size_per_row, std::move(column_starts), std::move(column_sizes), + return {size_per_row, + std::move(column_starts), + std::move(column_sizes), std::move(variable_width_column_starts)}; } @@ -1388,34 +1452,35 @@ column_info_s compute_column_information(iterator begin, iterator end) { * @param row_batches batched row information for multiple output locations * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_tile, - std::vector const &row_batches) { +std::vector build_validity_tile_infos(size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_tile, + std::vector const& row_batches) +{ auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); - auto const column_stride = util::round_up_unsafe( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, build a single tile for table width and ship it off - return num_columns; - } else { - return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); - } - }(), - JCUDF_ROW_ALIGNMENT); + auto const column_stride = util::round_up_unsafe( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, build a single tile for table width and ship it off + return num_columns; + } else { + return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); + } + }(), + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride note that an element in the table takes just 1 // bit, but a row with a single element still takes 8 bytes! 
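A worked host-side example of that sizing, under an assumed 48 KiB per-tile shared-memory budget; div_up/round_up/round_down stand in for the cudf::util helpers and the table shape is invented:

#include <algorithm>
#include <cmath>
#include <cstdio>

int div_up(int x, int y) { return (x + y - 1) / y; }
int round_up(int x, int y) { return div_up(x, y) * y; }
int round_down(int x, int y) { return (x / y) * y; }

int main()
{
  int const shmem_limit_per_tile = 48 * 1024;  // assumed budget
  int const num_columns          = 5000;
  int const num_rows             = 1000000;
  // aim for a square tile in bytes, then align the column stride
  int const desired       = static_cast<int>(std::sqrt(shmem_limit_per_tile));
  int const column_stride = round_up(
    desired > num_columns ? num_columns : round_down(desired, 8 /*CHAR_BIT*/), 8 /*row alignment*/);
  // one bit per column, padded out to the 8-byte JCUDF row alignment
  int const bytes_per_row = round_up(div_up(column_stride, 8), 8);
  int const row_stride = std::min(num_rows, round_down(shmem_limit_per_tile / bytes_per_row, 64));
  std::printf("column_stride=%d bytes_per_row=%d row_stride=%d\n",
              column_stride, bytes_per_row, row_stride);
  return 0;
}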
- auto const bytes_per_row = util::round_up_safe( - util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const row_stride = - std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; - int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; - int row = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; + int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { current_tile_row_batch++; @@ -1423,8 +1488,11 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row } int const tile_height = std::min(row_stride, rows_left_in_batch); validity_tile_infos.emplace_back( - detail::tile_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - row + tile_height - 1, current_tile_row_batch}); + detail::tile_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + row + tile_height - 1, + current_tile_row_batch}); row += tile_height; rows_left_in_batch -= tile_height; } @@ -1439,11 +1507,15 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row * * @tparam RowSize iterator that returns the size of a specific row */ -template struct row_size_functor { +template +struct row_size_functor { row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) - : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) + { + } - __device__ inline uint64_t operator()(int i) const { + __device__ inline uint64_t operator()(int i) const + { return i >= _row_end ? 0 : _row_sizes[i + _last_row_end]; } @@ -1465,11 +1537,15 @@ template struct row_size_functor { * device_uvector of row offsets */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, + RowSize row_sizes, + bool all_fixed_width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1480,8 +1556,8 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, - cumulative_row_sizes.begin()); + thrust::inclusive_scan( + rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than // 2 gigs. Imagine a table with 900 meg rows. 
The batches should occur every 2 rows, but if a @@ -1495,21 +1571,21 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w while (last_row_end < num_rows) { auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - }); + cumulative_row_sizes.begin(), + [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { + return i - cumulative_row_sizes[last_row_end]; + }); auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; + auto search_end = offset_row_sizes + num_rows; // find the next MAX_BATCH_SIZE boundary auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); + thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); size_type const batch_size = lb - search_start; - size_type const row_end = lb == search_end ? - batch_size + last_row_end : - last_row_end + util::round_down_safe(batch_size, 32); + size_type const row_end = lb == search_end + ? batch_size + last_row_end + : last_row_end + util::round_down_safe(batch_size, 32); // build offset list for each row in this batch auto const num_rows_in_batch = row_end - last_row_end; @@ -1519,10 +1595,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w device_uvector output_batch_row_offsets(num_entries, stream, mr); auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); + 0, row_size_functor(row_end, row_sizes, last_row_end)); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, - row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter_bounded, + row_size_iter_bounded + num_entries, + output_batch_row_offsets.begin()); auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); @@ -1530,8 +1608,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. 
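The batching step can be mimicked on the host with std:: algorithms in place of thrust. The 4 KiB max_batch_size and uniform 24-byte rows below are invented stand-ins for the real ~2 GiB MAX_BATCH_SIZE and per-row sizes, chosen so at least 32 rows always fit in a batch:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<uint64_t> row_sizes(1000, 24);  // toy table: every encoded row is 24 bytes
  std::vector<uint64_t> cumulative(row_sizes.size());
  std::inclusive_scan(row_sizes.begin(), row_sizes.end(), cumulative.begin());

  uint64_t const max_batch_size = 4096;  // stand-in for the ~2 GiB limit
  std::size_t last_row_end      = 0;
  while (last_row_end < row_sizes.size()) {
    // bytes already consumed by earlier batches
    uint64_t const already_used = last_row_end == 0 ? 0 : cumulative[last_row_end - 1];
    // find the first row whose cumulative size crosses the next batch boundary
    auto const lb = std::lower_bound(cumulative.begin() + last_row_end, cumulative.end(),
                                     already_used + max_batch_size);
    std::size_t batch_rows = lb - (cumulative.begin() + last_row_end);
    // interior batches are rounded down to a multiple of 32 rows
    if (lb != cumulative.end()) { batch_rows = (batch_rows / 32) * 32; }
    std::printf("batch: rows [%zu, %zu)\n", last_row_end, last_row_end + batch_rows);
    last_row_end += batch_rows;
  }
  return 0;
}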
if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + cudaMemcpy(batch_row_offsets.data() + last_row_end, + output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), + cudaMemcpyDeviceToDevice); } batch_row_boundaries.push_back(row_end); @@ -1540,10 +1620,11 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, - rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), std::move(row_batches)}; + return { + std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), + std::move(batch_row_boundaries), + std::move(row_batches)}; } /** @@ -1554,19 +1635,24 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w * @param stream stream to use * @return number of tiles necessary */ -int compute_tile_counts(device_span const &batch_row_boundaries, - int desired_tile_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const& batch_row_boundaries, + int desired_tile_height, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = - batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - }); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + [desired_tile_height, + batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } @@ -1582,61 +1668,73 @@ int compute_tile_counts(device_span const &batch_row_boundaries * @param stream stream to use * @return number of tiles created */ -size_type -build_tiles(device_span tiles, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_tile_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +size_type build_tiles( + device_span tiles, + device_uvector const& batch_row_boundaries, // comes from build_batches + int column_start, + int column_end, + int desired_tile_height, + int total_number_of_rows, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = - batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - }); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + 
num_batches, + num_tiles.begin(), + [desired_tile_height, + batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + }); size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); device_uvector tile_starts(num_batches + 1, stream); auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - }); - thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles + 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? num_tiles[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + tile_iter, + tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), - [=, tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = - thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = - std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = - std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{column_start, tile_row_start, column_end, tile_row_end, - static_cast(batch_index)}; - }); + rmm::exec_policy(stream), + iter, + iter + total_tiles, + tiles.begin(), + [ =, + tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_boundaries[batch_index]; + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches + ? 
std::numeric_limits::max() + : static_cast(batch_row_boundaries[batch_index + 1]) - 1); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); + + // stuff the tile + return tile_info{ + column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; + }); return total_tiles; } @@ -1654,13 +1752,16 @@ build_tiles(device_span tiles, * @param f callback function called when building a tile */ template -void determine_tiles(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_tile, TileCallback f) { +void determine_tiles(std::vector const& column_sizes, + std::vector const& column_starts, + size_type const first_row_batch_size, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_tile, + TileCallback f) +{ // tile infos are organized with the tile going "down" the columns this provides the most // coalescing of memory access - int current_tile_width = 0; + int current_tile_width = 0; int current_tile_start_col = 0; // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would @@ -1669,10 +1770,10 @@ void determine_tiles(std::vector const &column_sizes, // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const square_bias = 32; // bias towards columns for performance reasons + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); + std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); int row_size = 0; @@ -1682,22 +1783,22 @@ void determine_tiles(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto const row_size_with_this_col = row_size_aligned + col_size; auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { // too large, close this tile, generate vertical tiles and restart f(current_tile_start_col, col == 0 ? 
col : col - 1, tile_height); row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match alignment - // of output row + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory tile boundary to match + // alignment of output row current_tile_start_col = col; - current_tile_width = 0; + current_tile_width = 0; } else { row_size = row_size_with_this_col; current_tile_width++; @@ -1725,155 +1826,196 @@ void determine_tiles(std::vector const &column_sizes, */ template std::vector> convert_to_rows( - table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, - column_info_s const &column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + table_view const& tbl, + batch_data& batch_info, + offsetFunctor offset_functor, + column_info_s const& column_info, + std::optional> variable_width_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = !variable_width_offsets.has_value(); - auto select_columns = [](auto const &tbl, auto column_predicate) { + auto select_columns = [](auto const& tbl, auto column_predicate) { std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), - [&](auto c) { return column_predicate(c); }); + std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { + return column_predicate(c); + }); return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { + auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return is_compound(c.type()) ? 
nullptr : c.template data(); }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); // validity code handles variable and fixed-width data, so give it everything auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); + thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(batch_info.row_batches.size()); output_buffers.reserve(batch_info.row_batches.size()); - std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), - std::back_inserter(output_buffers), [&](auto const &batch) { - return rmm::device_buffer(batch.num_bytes, stream, mr); - }); - std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), - [](auto &buf) { return static_cast(buf.data()); }); + std::transform( + batch_info.row_batches.begin(), + batch_info.row_batches.end(), + std::back_inserter(output_buffers), + [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::transform( + output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { + return static_cast(buf.data()); + }); auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, - &stream](int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); + column_info.column_sizes, + column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); + info_count += i; + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, - &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + 
column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, + &gpu_tile_infos, + num_rows, + &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); // build validity tiles for ALL columns, variable and fixed width. auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); + tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); // blast through the entire table and convert it - detail::copy_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); + detail::copy_to_rows<<>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + gpu_tile_infos, + dev_input_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + offset_functor, + batch_info.d_batch_row_boundaries.data(), + reinterpret_cast(dev_output_data.data())); // note that validity gets the entire table and not the fixed-width portion detail::copy_validity_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, offset_functor, - batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), validity_offset, - dev_validity_tile_infos, dev_input_nm.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + offset_functor, + batch_info.d_batch_row_boundaries.data(), + dev_output_data.data(), + validity_offset, + dev_validity_tile_infos, + dev_input_nm.data()); if (!fixed_width_only) { // build table view for variable-width data only auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); + select_columns(tbl, [](auto col) { return is_compound(col.type()); }); CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const &c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); + thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { + strings_column_view const scv{c}; + return is_compound(c.type()) ? 
scv.chars().template data() : nullptr; + }); + std::vector variable_width_input_data( + variable_data_begin, variable_data_begin + variable_width_table.num_columns()); auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks(std::min( - MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>( - batch_num_rows, variable_width_table.num_columns(), dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), variable_width_offsets->data(), - column_info.size_per_row, offset_functor, batch_row_offset, - reinterpret_cast(output_data[i])); + auto const batch_num_rows = batch_info.row_batches[i].row_count; + + dim3 const string_blocks( + std::min(MAX_STRING_BLOCKS, + util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); + + detail::copy_strings_to_rows<<>>(batch_num_rows, + variable_width_table.num_columns(), + dev_variable_input_data.data(), + dev_variable_col_output_offsets.data(), + variable_width_offsets->data(), + column_info.size_per_row, + offset_functor, + batch_row_offset, + reinterpret_cast(output_data[i])); } } @@ -1882,28 +2024,36 @@ std::vector> convert_to_rows( std::vector> ret; ret.reserve(batch_info.row_batches.size()); auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), [&](auto batch) { + std::transform(counting_iter, + counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), + [&](auto batch) { auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, 0); + auto offsets = + std::make_unique(data_type{type_id::INT32}, + (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release(), + rmm::device_buffer{}, + 0); auto data = std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, std::move(output_buffers[batch]), rmm::device_buffer{}, 0); - return make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), - 0, rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + return make_lists_column(batch_info.row_batches[batch].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); }); return ret; } -} // namespace detail +} // namespace detail /** * @brief convert a cudf table to JCUDF row format @@ -1913,14 +2063,15 @@ std::vector> convert_to_rows( * @param mr memory resource used for returned data * @return vector of list columns containing byte columns of the JCUDF row data */ -std::vector> convert_to_rows(table_view const &tbl, +std::vector> 
convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); }); // Break up the work into tiles, which are a starting and ending row/col #. This tile size is // calculated based on the shared memory size available we want a single tile to fill up the @@ -1936,94 +2087,107 @@ std::vector> convert_to_rows(table_view const &tbl, // before building the tiles so the tiles can be properly cut around them. auto schema_column_iter = - thrust::make_transform_iterator(tbl.begin(), [](auto const &i) { return i.type(); }); + thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); }); auto column_info = - detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); + detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); auto const size_per_row = column_info.size_per_row; if (fixed_width_only) { // total encoded row size. This includes fixed-width data and validity only. It does not include // variable-width data since it isn't copied with the fixed-width and validity kernel. auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::nullopt, stream, mr); + return detail::convert_to_rows( + tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); } else { auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto &row_sizes = std::get<0>(offset_data); + auto& row_sizes = std::get<0>(offset_data); auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), stream, + return detail::convert_to_rows(tbl, + batch_info, + offset_functor, + std::move(column_info), + std::make_optional(std::move(std::get<1>(offset_data))), + stream, mr); } } -std::vector> -convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows_fixed_width_optimized( + table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> data_type { return 
i.type(); }); + std::transform( + tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting // validity at a specific row offset. This might change in the future. auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); std::vector> ret; for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); } return ret; @@ -2036,14 +2200,14 @@ namespace { /// @brief Calculates and sets null counts for specified columns void fixup_null_counts(std::vector>& output_columns, - rmm::cuda_stream_view stream) { - for (auto &col : output_columns) { - col->set_null_count( - cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); + rmm::cuda_stream_view stream) +{ + for (auto& col : output_columns) { + col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); } } -} +} // namespace /** * @brief convert from JCUDF row format to cudf columns @@ -2054,12 +2218,13 @@ void fixup_null_counts(std::vector>& output_columns, * @param mr memory resource for returned data * @return cudf table of the data */ -std::unique_ptr convert_from_rows(lists_column_view const &input, - std::vector const &schema, +std::unique_ptr
convert_from_rows(lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2079,19 +2244,19 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; @@ -2101,41 +2266,44 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; std::vector> string_row_offset_columns; std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; + std::vector output_data; + std::vector output_nm; + std::vector string_row_offsets; + std::vector string_lengths; for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, size_type num_rows, bool include_nm, + auto make_col = [&output_data, &output_nm](data_type type, + size_type num_rows, + bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - auto column = make_fixed_width_column( - type, num_rows, include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, - mr); + rmm::mr::device_memory_resource* mr) { + auto column = + make_fixed_width_column(type, + num_rows, + include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, + stream, + mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); - if (include_nm) { - output_nm.emplace_back(mut.null_mask()); - } + if (include_nm) { output_nm.emplace_back(mut.null_mask()); } return column; }; if (i.id() == type_id::STRING) { auto const int32type = data_type(type_id::INT32); auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); string_row_offsets.push_back(offset_col->mutable_view().data()); string_row_offset_columns.emplace_back(std::move(offset_col)); auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); string_lengths.push_back(length_col->mutable_view().data()); string_length_columns.emplace_back(std::move(length_col)); // placeholder @@ -2146,138 +2314,191 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; device_uvector gpu_batch_row_boundaries(num_batches, stream); - thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(), + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_batches), + gpu_batch_row_boundaries.begin(), [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; }); int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const tile_height) { - info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - }); + detail::determine_tiles(column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts( + gpu_batch_row_boundaries, tile_height, stream); + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( + int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); dim3 const blocks(gpu_tile_infos.size()); // validity needs to be calculated based on the 
actual number of final table columns auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); + detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); if (dev_string_row_offsets.size() == 0) { detail::fixed_width_row_offset_functor offset_functor(size_per_row); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); } else { detail::string_row_offset_functor offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); std::vector> string_col_offsets; std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto &col_string_lengths : string_lengths) { + std::vector string_col_offset_ptrs; + std::vector string_data_col_ptrs; + for (auto& col_string_lengths : string_lengths) { device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = [num_rows, col_string_lengths] __device__(auto const &i) { + auto tmp = [num_rows, col_string_lengths] __device__(auto const& i) { return i < num_rows ? 
col_string_lengths[i] : 0; }; auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), bounded_iter, bounded_iter + num_rows + 1, + thrust::exclusive_scan(rmm::exec_policy(stream), + bounded_iter, + bounded_iter + num_rows + 1, output_string_offsets.begin()); // allocate destination string column - rmm::device_uvector string_data(output_string_offsets.element(num_rows, stream), stream, - mr); + rmm::device_uvector string_data( + output_string_offsets.element(num_rows, stream), stream, mr); string_col_offset_ptrs.push_back(output_string_offsets.data()); string_data_col_ptrs.push_back(string_data.data()); string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, - rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, - rmm::mr::get_current_device_resource()); + auto dev_string_col_offsets = make_device_uvector_async( + string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async( + string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); + std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), + MAX_STRING_BLOCKS)); - detail::copy_strings_from_rows<<>>( - offset_functor, dev_string_row_offsets.data(), dev_string_lengths.data(), - dev_string_col_offsets.data(), dev_string_data_cols.data(), child.data(), num_rows, - static_cast(string_col_offsets.size())); + offset_functor, + dev_string_row_offsets.data(), + dev_string_lengths.data(), + dev_string_col_offsets.data(), + dev_string_data_cols.data(), + child.data(), + num_rows, + static_cast(string_col_offsets.size())); // merge strings back into output_columns int string_idx = 0; for (int i = 0; i < static_cast(schema.size()); ++i) { if (schema[i].id() == type_id::STRING) { // stuff real string column - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = - make_strings_column(num_rows, std::move(string_col_offsets[string_idx]), - std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), 0); + auto string_data = string_row_offset_columns[string_idx].release()->release(); + output_columns[i] = make_strings_column(num_rows, + std::move(string_col_offsets[string_idx]), + std::move(string_data_cols[string_idx]), + std::move(*string_data.null_mask.release()), + 0); // Null count set to 0, temporarily. Will be fixed up before return. string_idx++; } @@ -2292,11 +2513,13 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, return std::make_unique
(std::move(output_columns)); } -std::unique_ptr
convert_from_rows_fixed_width_optimized( - lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +std::unique_ptr
convert_from_rows_fixed_width_optimized(lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2307,7 +2530,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_start; std::vector column_size; - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now this is probably @@ -2315,17 +2538,17 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (int i = 0; i < static_cast(num_columns); i++) { auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -2333,16 +2556,22 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); + num_rows, + num_columns, + size_per_row, + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), + child.data()); // Set null counts, because output_columns are modified via mutable-view, // in the kernel above. @@ -2355,4 +2584,4 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized(
  }
}

-} // namespace spark_rapids_jni
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp
index 635960ad14..84ee729d55 100644
--- a/src/main/cpp/src/row_conversion.hpp
+++ b/src/main/cpp/src/row_conversion.hpp
@@ -48,4 +48,4 @@ std::unique_ptr<cudf::table> convert_from_rows(
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

-} // namespace spark_rapids_jni
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu
index 028a30d66c..c0f21b9b3a 100644
--- a/src/main/cpp/src/zorder.cu
+++ b/src/main/cpp/src/zorder.cu
@@ -36,27 +36,32 @@ namespace {
template <typename data_type>
struct uint_backed_array {
  uint_backed_array() = delete;
-  __device__ explicit uint_backed_array(int32_t num_bits_per_entry): data(0),
-    num_bits_per_entry(num_bits_per_entry), mask(static_cast<uint32_t>((1L << num_bits_per_entry) - 1)) {}
+  __device__ explicit uint_backed_array(int32_t num_bits_per_entry)
+    : data(0),
+      num_bits_per_entry(num_bits_per_entry),
+      mask(static_cast<uint32_t>((1L << num_bits_per_entry) - 1))
+  {
+  }

-  __device__ uint32_t operator[](int32_t i) const {
+  __device__ uint32_t operator[](int32_t i) const
+  {
    int32_t offset = num_bits_per_entry * i;
    return (data >> offset) & mask;
  }

-  __device__ void set(int32_t i, uint32_t value) {
-    int32_t offset = i * num_bits_per_entry;
+  __device__ void set(int32_t i, uint32_t value)
+  {
+    int32_t offset = i * num_bits_per_entry;
    data_type masked_data = data & ~(static_cast<data_type>(mask) << offset);
-    data = masked_data | (static_cast<data_type>(value & mask) << offset);
+    data = masked_data | (static_cast<data_type>(value & mask) << offset);
  }

-private:
+ private:
  data_type data;
  int32_t const num_bits_per_entry;
  uint32_t const mask;
};

-
// Most of the hilbert index code is based off of the work done by David Moten at
// https://github.com/davidmoten/hilbert-curve, which has the following Note in
// the code too
// With thanks also to Paul Chernoch who published a C# algorithm for Skilling's
// work on StackOverflow and
// GitHub.
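As background for the uint_backed_array struct above: it packs one value per num_bits_per_entry-bit slot of a single backing integer, which is how the hilbert_index kernel further down keeps an entire multi-dimensional point in one value. A minimal host-side sketch of the same get/set bit manipulation, using hypothetical names that are not part of this patch:

#include <cassert>
#include <cstdint>

// Host-side analogue of the device-only uint_backed_array in zorder.cu:
// each entry occupies `bits` bits of a single 64-bit backing word.
struct packed_u64 {  // hypothetical name, illustration only
  uint64_t data = 0;
  int32_t bits;    // bits per entry
  uint32_t mask;   // low `bits` bits set

  explicit packed_u64(int32_t bits_per_entry)
    : bits(bits_per_entry), mask(static_cast<uint32_t>((1L << bits_per_entry) - 1))
  {
  }

  uint32_t get(int32_t i) const { return (data >> (bits * i)) & mask; }

  void set(int32_t i, uint32_t value)
  {
    int32_t const offset   = bits * i;
    uint64_t const cleared = data & ~(static_cast<uint64_t>(mask) << offset);  // zero the slot
    data = cleared | (static_cast<uint64_t>(value & mask) << offset);          // write the new bits
  }
};

int main()
{
  packed_u64 p(5);  // e.g. three 5-bit coordinates in one word
  p.set(0, 7);
  p.set(1, 31);
  p.set(2, 19);
  assert(p.get(0) == 7 && p.get(1) == 31 && p.get(2) == 19);
  return 0;
}

The device version in the patch is the same idea; it just templates the backing word type and marks the members __device__.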
-__device__ uint64_t to_hilbert_index(uint_backed_array const & transposed_index, - int32_t const num_bits_per_entry, int32_t const num_dimensions) { - uint64_t b = 0; +__device__ uint64_t to_hilbert_index(uint_backed_array const& transposed_index, + int32_t const num_bits_per_entry, + int32_t const num_dimensions) +{ + uint64_t b = 0; int32_t const length = num_bits_per_entry * num_dimensions; - int32_t b_index = length - 1; - uint64_t mask = 1L << (num_bits_per_entry - 1); + int32_t b_index = length - 1; + uint64_t mask = 1L << (num_bits_per_entry - 1); for (int32_t i = 0; i < num_bits_per_entry; i++) { for (int32_t j = 0; j < num_dimensions; j++) { - if ((transposed_index[j] & mask) != 0) { - b |= 1L << b_index; - } + if ((transposed_index[j] & mask) != 0) { b |= 1L << b_index; } b_index--; } mask >>= 1; @@ -84,11 +89,14 @@ __device__ uint64_t to_hilbert_index(uint_backed_array const & transpo return b; } -__device__ uint_backed_array hilbert_transposed_index(uint_backed_array const & point, - int32_t const num_bits_per_entry, int32_t const num_dimensions) { +__device__ uint_backed_array hilbert_transposed_index( + uint_backed_array const& point, + int32_t const num_bits_per_entry, + int32_t const num_dimensions) +{ uint32_t const M = 1L << (num_bits_per_entry - 1); - int32_t const n = num_dimensions; - auto x = point; + int32_t const n = num_dimensions; + auto x = point; uint32_t p, q, t; uint32_t i; @@ -97,14 +105,14 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra p = q - 1; for (i = 0; i < n; i++) { if ((x[i] & q) != 0) { - x.set(0, x[0] ^ p); // invert + x.set(0, x[0] ^ p); // invert } else { t = (x[0] ^ x[i]) & p; x.set(0, x[0] ^ t); x.set(i, x[i] ^ t); } } - } // exchange + } // exchange // Gray encode for (i = 1; i < n; i++) { @@ -112,9 +120,7 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra } t = 0; for (q = M; q > 1; q >>= 1) { - if ((x[n - 1] & q) != 0) { - t ^= q - 1; - } + if ((x[n - 1] & q) != 0) { t ^= q - 1; } } for (i = 0; i < n; i++) { @@ -124,21 +130,19 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra return x; } - -} // namespace +} // namespace namespace spark_rapids_jni { -std::unique_ptr interleave_bits( - cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - +std::unique_ptr interleave_bits(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto num_columns = tbl.num_columns(); CUDF_EXPECTS(num_columns > 0, "The input table must have at least one column."); CUDF_EXPECTS(is_fixed_width(tbl.begin()->type()), "Only fixed width columns can be used"); - auto const type_id = tbl.begin()->type().id(); + auto const type_id = tbl.begin()->type().id(); auto const data_type_size = cudf::size_of(tbl.begin()->type()); CUDF_EXPECTS( std::all_of(tbl.begin(), @@ -152,14 +156,14 @@ std::unique_ptr interleave_bits( const cudf::size_type max_bytes_allowed = std::numeric_limits::max(); int64_t total_output_size = static_cast(num_rows) * data_type_size * num_columns; - CUDF_EXPECTS (total_output_size <= max_bytes_allowed, "Input is too large to process"); + CUDF_EXPECTS(total_output_size <= max_bytes_allowed, "Input is too large to process"); cudf::size_type output_size = static_cast(total_output_size); auto input_dv = cudf::table_device_view::create(tbl, stream); auto output_data_col = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::UINT8}, output_size, cudf::mask_state::UNALLOCATED, stream, mr); 
+ cudf::data_type{cudf::type_id::UINT8}, output_size, cudf::mask_state::UNALLOCATED, stream, mr); auto output_dv_ptr = cudf::mutable_column_device_view::create(*output_data_col, stream); @@ -167,76 +171,80 @@ std::unique_ptr interleave_bits( rmm::exec_policy(stream), thrust::make_counting_iterator(0), output_size, - [col = *output_dv_ptr, - num_columns, - data_type_size, - input = *input_dv] __device__ (cudf::size_type ret_idx) { - // The most significant byte needs to come from the most significant column, so we switch the order of the output - // bytes to match that - cudf::size_type const flipped_start_byte_index = (ret_idx / num_columns) * num_columns; - cudf::size_type const flipped_ret_idx = flipped_start_byte_index + (num_columns - 1 - (ret_idx - flipped_start_byte_index)); - - uint8_t ret_byte = 0; - for (cudf::size_type output_bit_offset = 7; output_bit_offset >= 0; output_bit_offset--) { - // The index (in bits) of the output bit we are computing right now - int64_t const output_bit_index = flipped_ret_idx * 8L + output_bit_offset; - - // The most significant bit should come from the most significant column, but 0 is - // our most significant column, so switch the order of the columns. - cudf::size_type const column_idx = num_columns - 1 - (output_bit_index % num_columns); - auto column = input.column(column_idx); - - // Also we need to convert the endian byte order when we read the bytes. - int64_t const bit_index_within_column = output_bit_index / num_columns; - cudf::size_type const little_endian_read_byte_index = bit_index_within_column / 8; - cudf::size_type const read_bit_offset = bit_index_within_column % 8; - cudf::size_type const input_row_number = little_endian_read_byte_index / data_type_size; - cudf::size_type const start_row_byte_index = input_row_number * data_type_size; - cudf::size_type const read_byte_index = start_row_byte_index + (data_type_size - 1 - (little_endian_read_byte_index - start_row_byte_index)); - - uint32_t const byte_data = column.is_valid(input_row_number) ? column.data()[read_byte_index] : 0; - uint32_t const tmp = ((byte_data >> read_bit_offset) & 1) << output_bit_offset; - ret_byte = static_cast(ret_byte | tmp); - } - col.data()[ret_idx] = ret_byte; - }); - - auto offset_begin = thrust::make_constant_iterator(data_type_size * num_columns); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offset_begin, offset_begin + num_rows, stream, mr)); + [col = *output_dv_ptr, num_columns, data_type_size, input = *input_dv] __device__( + cudf::size_type ret_idx) { + // The most significant byte needs to come from the most significant column, so we switch the + // order of the output bytes to match that + cudf::size_type const flipped_start_byte_index = (ret_idx / num_columns) * num_columns; + cudf::size_type const flipped_ret_idx = + flipped_start_byte_index + (num_columns - 1 - (ret_idx - flipped_start_byte_index)); + + uint8_t ret_byte = 0; + for (cudf::size_type output_bit_offset = 7; output_bit_offset >= 0; output_bit_offset--) { + // The index (in bits) of the output bit we are computing right now + int64_t const output_bit_index = flipped_ret_idx * 8L + output_bit_offset; + + // The most significant bit should come from the most significant column, but 0 is + // our most significant column, so switch the order of the columns. 
+ cudf::size_type const column_idx = num_columns - 1 - (output_bit_index % num_columns); + auto column = input.column(column_idx); + + // Also we need to convert the endian byte order when we read the bytes. + int64_t const bit_index_within_column = output_bit_index / num_columns; + cudf::size_type const little_endian_read_byte_index = bit_index_within_column / 8; + cudf::size_type const read_bit_offset = bit_index_within_column % 8; + cudf::size_type const input_row_number = little_endian_read_byte_index / data_type_size; + cudf::size_type const start_row_byte_index = input_row_number * data_type_size; + cudf::size_type const read_byte_index = + start_row_byte_index + + (data_type_size - 1 - (little_endian_read_byte_index - start_row_byte_index)); + + uint32_t const byte_data = + column.is_valid(input_row_number) ? column.data()[read_byte_index] : 0; + uint32_t const tmp = ((byte_data >> read_bit_offset) & 1) << output_bit_offset; + ret_byte = static_cast(ret_byte | tmp); + } + col.data()[ret_idx] = ret_byte; + }); + + auto offset_begin = thrust::make_constant_iterator(data_type_size * num_columns); + auto offsets_column = std::get<0>( + cudf::detail::make_offsets_child_column(offset_begin, offset_begin + num_rows, stream, mr)); return cudf::make_lists_column(num_rows, - std::move(offsets_column), - std::move(output_data_col), - 0, - rmm::device_buffer(), - stream, - mr); + std::move(offsets_column), + std::move(output_data_col), + 0, + rmm::device_buffer(), + stream, + mr); } -std::unique_ptr hilbert_index( - int32_t const num_bits_per_entry, - cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - - auto const num_rows = tbl.num_rows(); +std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, + cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = tbl.num_rows(); auto const num_columns = tbl.num_columns(); - CUDF_EXPECTS(num_bits_per_entry > 0 && num_bits_per_entry <= 32, "the number of bits must be >0 and <= 32."); - CUDF_EXPECTS(num_bits_per_entry * num_columns <= 64, "we only support up to 64 bits of output right now."); + CUDF_EXPECTS(num_bits_per_entry > 0 && num_bits_per_entry <= 32, + "the number of bits must be >0 and <= 32."); + CUDF_EXPECTS(num_bits_per_entry * num_columns <= 64, + "we only support up to 64 bits of output right now."); CUDF_EXPECTS(num_columns > 0, "at least one column is required."); - CUDF_EXPECTS( - std::all_of(tbl.begin(), - tbl.end(), - [](cudf::column_view const& col) { return col.type().id() == cudf::type_id::INT32; }), - "All columns of the input table must be INT32."); + CUDF_EXPECTS(std::all_of(tbl.begin(), + tbl.end(), + [](cudf::column_view const& col) { + return col.type().id() == cudf::type_id::INT32; + }), + "All columns of the input table must be INT32."); auto const input_dv = cudf::table_device_view::create(tbl, stream); auto output_data_col = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); + cudf::data_type{cudf::type_id::INT64}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); auto const output_dv_ptr = cudf::mutable_column_device_view::create(*output_data_col, stream); @@ -245,22 +253,20 @@ std::unique_ptr hilbert_index( thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, output_dv_ptr->begin(), - [num_bits_per_entry, - num_columns, - input = *input_dv] __device__ (cudf::size_type row_index) { - 
uint_backed_array row(num_bits_per_entry); - for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { - auto const column = input.column(column_index); - uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; - row.set(column_index, data); - } - - auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); - return static_cast( - to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); - }); + [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { + uint_backed_array row(num_bits_per_entry); + for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { + auto const column = input.column(column_index); + uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; + row.set(column_index, data); + } + + auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); + return static_cast( + to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); + }); return output_data_col; } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/zorder.hpp b/src/main/cpp/src/zorder.hpp index 975ff79df8..9268f5b71e 100644 --- a/src/main/cpp/src/zorder.hpp +++ b/src/main/cpp/src/zorder.hpp @@ -35,4 +35,4 @@ std::unique_ptr hilbert_index( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index f59b77ca05..1a93354339 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -30,8 +30,7 @@ using namespace cudf; template -struct DecimalToStringTests : public test::BaseFixture { -}; +struct DecimalToStringTests : public test::BaseFixture {}; TYPED_TEST_SUITE(DecimalToStringTests, cudf::test::FixedPointTypes); diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index 598d570611..c736d5971f 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -30,14 +30,12 @@ using namespace cudf; template -struct StringToIntegerTests : public test::BaseFixture { -}; +struct StringToIntegerTests : public test::BaseFixture {}; struct StringToDecimalTests : public test::BaseFixture {}; template -struct StringToFloatTests : public test::BaseFixture { -}; +struct StringToFloatTests : public test::BaseFixture {}; TYPED_TEST_SUITE(StringToIntegerTests, cudf::test::IntegralTypesNotBool); TYPED_TEST_SUITE(StringToFloatTests, cudf::test::FloatingPointTypes); @@ -240,8 +238,8 @@ TYPED_TEST(StringToIntegerTests, Overflow) TYPED_TEST(StringToIntegerTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); auto result = spark_rapids_jni::string_to_integer(data_type{type_to_id()}, strings_column_view{empty->view()}, @@ -542,8 +540,8 @@ TEST_F(StringToDecimalTests, Edges) TEST_F(StringToDecimalTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, 
rmm::device_buffer{}, 0); auto const result = spark_rapids_jni::string_to_decimal( 8, 2, strings_column_view{empty->view()}, false, true, cudf::get_default_stream()); @@ -698,8 +696,8 @@ TYPED_TEST(StringToFloatTests, TrickyValues) TYPED_TEST(StringToFloatTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); auto const result = spark_rapids_jni::string_to_float(data_type{type_to_id()}, strings_column_view{empty->view()}, diff --git a/src/main/cpp/tests/row_conversion.cpp b/src/main/cpp/tests/row_conversion.cpp index e140918f09..7e104c3871 100644 --- a/src/main/cpp/tests/row_conversion.cpp +++ b/src/main/cpp/tests/row_conversion.cpp @@ -35,10 +35,8 @@ #include -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; +struct ColumnToRowTests : public cudf::test::BaseFixture {}; +struct RowToColumnTests : public cudf::test::BaseFixture {}; TEST_F(ColumnToRowTests, Single) { @@ -51,7 +49,8 @@ TEST_F(ColumnToRowTests, Single) EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); @@ -212,7 +211,8 @@ TEST_F(ColumnToRowTests, Simple) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -234,7 +234,8 @@ TEST_F(ColumnToRowTests, Tall) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -260,7 +261,8 @@ TEST_F(ColumnToRowTests, Wide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -288,7 +290,8 @@ TEST_F(ColumnToRowTests, SingleByteWide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -319,7 
+322,8 @@ TEST_F(ColumnToRowTests, Non2Power) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -355,7 +359,8 @@ TEST_F(ColumnToRowTests, Big) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -390,7 +395,8 @@ TEST_F(ColumnToRowTests, Bigger) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -426,7 +432,8 @@ TEST_F(ColumnToRowTests, Biggest) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -446,7 +453,8 @@ TEST_F(RowToColumnTests, Single) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -462,7 +470,8 @@ TEST_F(RowToColumnTests, Simple) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -484,7 +493,8 @@ TEST_F(RowToColumnTests, Tall) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + 
spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -511,7 +521,8 @@ TEST_F(RowToColumnTests, Wide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -537,7 +548,8 @@ TEST_F(RowToColumnTests, SingleByteWide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -578,7 +590,8 @@ TEST_F(RowToColumnTests, AllTypes) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -700,7 +713,8 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -728,7 +742,8 @@ TEST_F(RowToColumnTests, Non2Power) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -757,7 +772,8 @@ TEST_F(RowToColumnTests, Big) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -786,7 +802,8 @@ TEST_F(RowToColumnTests, Bigger) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -816,7 +833,8 @@ 
TEST_F(RowToColumnTests, Biggest)
  for (uint i = 0; i < old_rows.size(); ++i) {
    auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized(
      cudf::lists_column_view(*old_rows[i]), schema);
-    auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema);
+    auto new_tbl =
+      spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema);

    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl);
  }
@@ -870,7 +888,8 @@ TEST_F(RowToColumnTests, DoubleString)
  auto new_rows = spark_rapids_jni::convert_to_rows(in);

  for (uint i = 0; i < new_rows.size(); ++i) {
-    auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema);
+    auto new_cols =
+      spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema);
    EXPECT_EQ(new_rows[0]->size(), 5);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols);
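The row-conversion tests touched above all follow the same round-trip pattern that this reformat preserves: build a table, convert it to JCUDF row batches, convert each batch back with a per-column schema, and compare. A condensed sketch of that usage follows; the include paths and the single INT32 column are assumptions for illustration, not taken from this patch.

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/table_utilities.hpp>

#include <row_conversion.hpp>  // include path is an assumption for this sketch

#include <vector>

// Round-trip a small fixed-width table through JCUDF row format, mirroring the
// pattern the ColumnToRowTests/RowToColumnTests cases exercise.
void round_trip_example()
{
  // hypothetical input: a single INT32 column
  cudf::test::fixed_width_column_wrapper<int32_t> a({1, 2, 3, 4, 5});
  cudf::table_view in(std::vector<cudf::column_view>{a});

  // convert_from_rows needs one data_type per output column
  std::vector<cudf::data_type> schema{cudf::data_type{cudf::type_id::INT32}};

  // convert_to_rows may split the input into several batches, each a list column of bytes
  auto row_batches = spark_rapids_jni::convert_to_rows(in);

  for (auto const& rows : row_batches) {
    auto out = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*rows), schema);
    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *out);
  }
}

The fixed_width_optimized entry points used by the older tests follow the same shape, just restricted to fixed-width columns.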