From 2f248222a6cbf132d3609814faed912a902c9106 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 7 Dec 2023 11:58:10 -0800 Subject: [PATCH 1/4] disable dict sorting by default --- cpp/include/cudf/io/orc.hpp | 4 ++-- cpp/tests/io/orc_test.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index c2762b05aa6..f7599a5ce5a 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -451,7 +451,7 @@ class orc_writer_options { // Optional compression statistics std::shared_ptr _compression_stats; // Specify whether string dictionaries should be alphabetically sorted - bool _enable_dictionary_sort = true; + bool _enable_dictionary_sort = false; friend orc_writer_options_builder; @@ -895,7 +895,7 @@ class chunked_orc_writer_options { // Optional compression statistics std::shared_ptr _compression_stats; // Specify whether string dictionaries should be alphabetically sorted - bool _enable_dictionary_sort = true; + bool _enable_dictionary_sort = false; friend chunked_orc_writer_options_builder; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index dca3886db14..cd8ced6013f 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1948,7 +1948,8 @@ TEST_F(OrcWriterTest, UnorderedDictionary) std::vector out_buffer_sorted; cudf::io::orc_writer_options out_opts_sorted = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected) + .enable_dictionary_sort(true); cudf::io::write_orc(out_opts_sorted); cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( From ba83f5472a36d6bde3730d2a279eaf65113d4fe3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 7 Dec 2023 13:31:12 -0800 Subject: [PATCH 2/4] Revert "disable dict sorting by default" This reverts commit 2f248222a6cbf132d3609814faed912a902c9106. --- cpp/include/cudf/io/orc.hpp | 4 ++-- cpp/tests/io/orc_test.cpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index f7599a5ce5a..c2762b05aa6 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -451,7 +451,7 @@ class orc_writer_options { // Optional compression statistics std::shared_ptr _compression_stats; // Specify whether string dictionaries should be alphabetically sorted - bool _enable_dictionary_sort = false; + bool _enable_dictionary_sort = true; friend orc_writer_options_builder; @@ -895,7 +895,7 @@ class chunked_orc_writer_options { // Optional compression statistics std::shared_ptr _compression_stats; // Specify whether string dictionaries should be alphabetically sorted - bool _enable_dictionary_sort = false; + bool _enable_dictionary_sort = true; friend chunked_orc_writer_options_builder; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cd8ced6013f..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1948,8 +1948,7 @@ TEST_F(OrcWriterTest, UnorderedDictionary) std::vector out_buffer_sorted; cudf::io::orc_writer_options out_opts_sorted = - cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected) - .enable_dictionary_sort(true); + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); cudf::io::write_orc(out_opts_sorted); cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( From 31e62bf86b019ac7ea4b6d8443624bfd6dc909e1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 7 Dec 2023 13:32:20 -0800 Subject: [PATCH 3/4] sync --- cpp/src/io/orc/writer_impl.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ac5993e764e..7b53b7bb186 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2137,7 +2137,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } } } - stripe_dicts.host_to_device_async(stream); + // Synchronous to ensure the copy is complete before we clear `map_slots` + stripe_dicts.host_to_device_sync(stream); gpu::collect_map_entries(stripe_dicts, stream); gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); From f11258c528cfc8d2a1876bfede1284ed9ccfaf39 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 7 Dec 2023 13:57:01 -0800 Subject: [PATCH 4/4] Update cpp/src/io/orc/writer_impl.cu Co-authored-by: Bradley Dice --- cpp/src/io/orc/writer_impl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 7b53b7bb186..29661285ed8 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2137,7 +2137,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } } } - // Synchronous to ensure the copy is complete before we clear `map_slots` + // Synchronize to ensure the copy is complete before we clear `map_slots` stripe_dicts.host_to_device_sync(stream); gpu::collect_map_entries(stripe_dicts, stream);