Skip to content

Commit

Permalink
Merge branch 'branch-24.02' into fix-stream1
Browse files Browse the repository at this point in the history
  • Loading branch information
karthikeyann authored Dec 7, 2023
2 parents 07ae25d + a253826 commit c36c2cf
Show file tree
Hide file tree
Showing 25 changed files with 889 additions and 342 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ dependencies:
- numpy>=1.21,<1.25
- numpydoc
- nvcc_linux-64=11.8
- nvcomp==3.0.4
- nvcomp==3.0.5
- nvtx>=0.2.1
- packaging
- pandas>=1.3,<1.6.0dev0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-120_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ dependencies:
- numba>=0.57,<0.58
- numpy>=1.21,<1.25
- numpydoc
- nvcomp==3.0.4
- nvcomp==3.0.5
- nvtx>=0.2.1
- packaging
- pandas>=1.3,<1.6.0dev0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ spdlog_version:
- ">=1.12.0,<1.13"

nvcomp_version:
- "=3.0.4"
- "=3.0.5"

zlib_version:
- ">=1.2.13"
Expand Down
8 changes: 5 additions & 3 deletions cpp/benchmarks/text/subword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/file_utilities.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/subword_tokenize.hpp>

#include <filesystem>
Expand All @@ -29,8 +31,8 @@

static std::string create_hash_vocab_file()
{
std::string dir_template{std::filesystem::temp_directory_path().string()};
if (char const* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
static temp_directory const subword_tmpdir{"cudf_gbench"};
auto dir_template = subword_tmpdir.path();
std::string hash_file = dir_template + "/hash_vocab.txt";
// create a fake hashed vocab text file for this test
// this only works with words in the strings in the benchmark code below
Expand All @@ -57,7 +59,7 @@ static void BM_subword_tokenizer(benchmark::State& state)
auto const nrows = static_cast<cudf::size_type>(state.range(0));
std::vector<char const*> h_strings(nrows, "This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = create_hash_vocab_file();
static std::string hash_file = create_hash_vocab_file();
std::vector<uint32_t> offsets{14};
uint32_t max_sequence_length = 64;
uint32_t stride = 48;
Expand Down
15 changes: 10 additions & 5 deletions cpp/src/copying/contiguous_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,11 @@ struct dst_offset_output_iterator {

dst_offset_output_iterator operator+ __host__ __device__(int i) { return {c + i}; }

void operator++ __host__ __device__() { c++; }
dst_offset_output_iterator& operator++ __host__ __device__()
{
c++;
return *this;
}

reference operator[] __device__(int i) { return dereference(c + i); }
reference operator* __device__() { return dereference(c); }
Expand All @@ -873,13 +877,14 @@ struct dst_valid_count_output_iterator {
using reference = size_type&;
using iterator_category = thrust::output_device_iterator_tag;

dst_valid_count_output_iterator operator+ __host__ __device__(int i)
dst_valid_count_output_iterator operator+ __host__ __device__(int i) { return {c + i}; }

dst_valid_count_output_iterator& operator++ __host__ __device__()
{
return dst_valid_count_output_iterator{c + i};
c++;
return *this;
}

void operator++ __host__ __device__() { c++; }

reference operator[] __device__(int i) { return dereference(c + i); }
reference operator* __device__() { return dereference(c); }

Expand Down
14 changes: 8 additions & 6 deletions cpp/src/io/fst/lookup_tables.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class SingleSymbolSmemLUT {
SymbolGroupIdT no_match_id = symbol_strings.size();

// The symbol with the largest value that is mapped to a symbol group id
SymbolGroupIdT max_base_match_val = 0;
SymbolGroupIdT max_lookup_index = 0;

// Initialize all entries: by default we return the no-match-id
std::fill(&init_data.sym_to_sgid[0], &init_data.sym_to_sgid[NUM_ENTRIES_PER_LUT], no_match_id);
Expand All @@ -115,17 +115,19 @@ class SingleSymbolSmemLUT {
for (auto const& sg_symbols : symbol_strings) {
// Iterate over all symbols that belong to the current symbol group
for (auto const& sg_symbol : sg_symbols) {
max_base_match_val = std::max(max_base_match_val, static_cast<SymbolGroupIdT>(sg_symbol));
max_lookup_index = std::max(max_lookup_index, static_cast<SymbolGroupIdT>(sg_symbol));
init_data.sym_to_sgid[static_cast<int32_t>(sg_symbol)] = sg_id;
}
sg_id++;
}

// Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id
init_data.sym_to_sgid[max_base_match_val + 1] = no_match_id;
// Initialize the out-of-bounds lookup: sym_to_sgid[max_lookup_index+1] -> no_match_id
auto const oob_match_index = max_lookup_index + 1;
init_data.sym_to_sgid[oob_match_index] = no_match_id;

// Alias memory / return memory requirements
init_data.num_valid_entries = max_base_match_val + 1;
// The number of valid entries in the table (including the entry for the out-of-bounds symbol
// group id)
init_data.num_valid_entries = oob_match_index + 1;
init_data.pre_map_op = pre_map_op;

return init_data;
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -799,9 +799,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// This is to match the existing JSON reader's behaviour:
// - Non-string columns will always be returned as nullable
// - String columns will be returned as nullable, iff there's at least one null entry
if (target_type.id() == type_id::STRING and col->null_count() == 0) {
col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
}
if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); }

// For string columns return ["offsets", "char"] schema
if (target_type.id() == type_id::STRING) {
Expand Down Expand Up @@ -830,7 +828,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// The null_mask is set after creation of struct column is to skip the superimpose_nulls and
// null validation applied in make_structs_column factory, which is not needed for json
auto ret_col = make_structs_column(num_rows, std::move(child_columns), 0, {}, stream, mr);
ret_col->set_null_mask(std::move(result_bitmask), null_count);
if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); }
return {std::move(ret_col), column_names};
}
case json_col_t::ListColumn: {
Expand Down Expand Up @@ -877,7 +875,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and
// null validation applied in make_lists_column factory, which is not needed for json
// parent column cannot be null when its children is non-empty in JSON
ret_col->set_null_mask(std::move(result_bitmask), null_count);
if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); }
return {std::move(ret_col), std::move(column_names)};
}
default: CUDF_FAIL("Unsupported column type"); break;
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/json/legacy/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,9 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
} else {
out_columns.emplace_back(std::move(out_column));
}
if (out_columns.back()->null_count() == 0) {
out_columns.back()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
}
}

std::vector<column_name_info> column_infos;
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2068,11 +2068,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to

auto make_validity =
[stream, mr](json_column const& json_col) -> std::pair<rmm::device_buffer, size_type> {
auto const null_count = json_col.current_offset - json_col.valid_count;
if (null_count == 0) { return {rmm::device_buffer{}, null_count}; }
return {rmm::device_buffer{json_col.validity.data(),
bitmask_allocation_size_bytes(json_col.current_offset),
stream,
mr},
json_col.current_offset - json_col.valid_count};
null_count};
};

auto get_child_schema = [schema](auto child_name) -> std::optional<schema_element> {
Expand Down Expand Up @@ -2138,9 +2140,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
// This is to match the existing JSON reader's behaviour:
// - Non-string columns will always be returned as nullable
// - String columns will be returned as nullable, iff there's at least one null entry
if (target_type.id() == type_id::STRING and col->null_count() == 0) {
col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
}
if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); }

// For string columns return ["offsets", "char"] schema
if (target_type.id() == type_id::STRING) {
Expand Down
31 changes: 26 additions & 5 deletions cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ class parquet_field_union_struct : public parquet_field {
inline bool operator()(CompactProtocolReader* cpr, int field_type)
{
T v;
bool const res = parquet_field_struct<T>(field(), v).operator()(cpr, field_type);
bool const res = parquet_field_struct<T>{field(), v}(cpr, field_type);
if (!res) {
val = v;
enum_val = static_cast<E>(field());
Expand Down Expand Up @@ -424,7 +424,7 @@ class parquet_field_optional : public parquet_field {
inline bool operator()(CompactProtocolReader* cpr, int field_type)
{
T v;
bool const res = FieldFunctor(field(), v).operator()(cpr, field_type);
bool const res = FieldFunctor{field(), v}(cpr, field_type);
if (!res) { val = v; }
return res;
}
Expand Down Expand Up @@ -631,6 +631,8 @@ bool CompactProtocolReader::read(ColumnChunk* c)

bool CompactProtocolReader::read(ColumnChunkMetaData* c)
{
using optional_size_statistics =
parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
parquet_field_enum_list(2, c->encodings),
parquet_field_string_list(3, c->path_in_schema),
Expand All @@ -641,7 +643,8 @@ bool CompactProtocolReader::read(ColumnChunkMetaData* c)
parquet_field_int64(9, c->data_page_offset),
parquet_field_int64(10, c->index_page_offset),
parquet_field_int64(11, c->dictionary_page_offset),
parquet_field_struct(12, c->statistics));
parquet_field_struct(12, c->statistics),
optional_size_statistics(16, c->size_statistics));
return function_builder(this, op);
}

Expand Down Expand Up @@ -700,17 +703,35 @@ bool CompactProtocolReader::read(PageLocation* p)

bool CompactProtocolReader::read(OffsetIndex* o)
{
auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations));
using optional_list_i64 = parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>;

auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations),
optional_list_i64(2, o->unencoded_byte_array_data_bytes));
return function_builder(this, op);
}

// Reads a SizeStatistics struct from the compact-protocol stream.
// Every field is optional in the Thrift schema, so each one is wrapped in
// parquet_field_optional and simply left unset when absent from the input.
bool CompactProtocolReader::read(SizeStatistics* s)
{
  auto ops = std::make_tuple(
    parquet_field_optional<int64_t, parquet_field_int64>(1, s->unencoded_byte_array_data_bytes),
    parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>(
      2, s->repetition_level_histogram),
    parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>(
      3, s->definition_level_histogram));
  return function_builder(this, ops);
}

bool CompactProtocolReader::read(ColumnIndex* c)
{
using optional_list_i64 = parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>;

auto op = std::make_tuple(parquet_field_bool_list(1, c->null_pages),
parquet_field_binary_list(2, c->min_values),
parquet_field_binary_list(3, c->max_values),
parquet_field_enum<BoundaryOrder>(4, c->boundary_order),
parquet_field_int64_list(5, c->null_counts));
parquet_field_int64_list(5, c->null_counts),
optional_list_i64(6, c->repetition_level_histogram),
optional_list_i64(7, c->definition_level_histogram));
return function_builder(this, op);
}

Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/compact_protocol_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ class CompactProtocolReader {
bool read(KeyValue* k);
bool read(PageLocation* p);
bool read(OffsetIndex* o);
bool read(SizeStatistics* s);
bool read(ColumnIndex* c);
bool read(Statistics* s);
bool read(ColumnOrder* c);
Expand Down
38 changes: 35 additions & 3 deletions cpp/src/io/parquet/compact_protocol_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); }
if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); }
c.field_struct(12, s.statistics);
if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); }
return c.value();
}

Expand Down Expand Up @@ -210,6 +211,24 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s)
{
CompactProtocolFieldWriter c(*this);
c.field_struct_list(1, s.page_locations);
if (s.unencoded_byte_array_data_bytes.has_value()) {
c.field_int_list(2, s.unencoded_byte_array_data_bytes.value());
}
return c.value();
}

// Serializes a SizeStatistics struct, emitting only the fields that carry a
// value; unset optionals produce no output at all.
size_t CompactProtocolWriter::write(SizeStatistics const& s)
{
  CompactProtocolFieldWriter writer(*this);
  // Emit an optional int64-list field only when it holds a value.
  auto const put_opt_list = [&writer](int field_id, auto const& opt_list) {
    if (opt_list.has_value()) { writer.field_int_list(field_id, opt_list.value()); }
  };
  if (s.unencoded_byte_array_data_bytes.has_value()) {
    writer.field_int(1, s.unencoded_byte_array_data_bytes.value());
  }
  put_opt_list(2, s.repetition_level_histogram);
  put_opt_list(3, s.definition_level_histogram);
  return writer.value();
}

Expand Down Expand Up @@ -286,13 +305,26 @@ inline void CompactProtocolFieldWriter::field_int(int field, int64_t val)
current_field_value = field;
}

// Writes a Thrift compact-protocol list<i64> field.
// Specialization for int64_t elements: the list header carries ST_FLD_I64 and
// each value is written through put_int.
//
// Fix: the original compared/min'd val.size() (size_t) against the literal
// 0xfUL (unsigned long). std::min requires both arguments to have the same
// type, so on LLP64 targets (where size_t is unsigned long long) this fails
// to compile. Use a size_t constant instead.
template <>
inline void CompactProtocolFieldWriter::field_int_list<int64_t>(int field,
                                                                std::vector<int64_t> const& val)
{
  put_field_header(field, current_field_value, ST_FLD_LIST);
  // Short-form header: sizes below 0xf are packed into the upper nibble;
  // larger lists set the nibble to 0xf and append the real size via put_uint.
  constexpr size_t short_form_max = 0xf;
  put_byte(static_cast<uint8_t>((std::min(val.size(), short_form_max) << 4) | ST_FLD_I64));
  if (val.size() >= short_form_max) { put_uint(val.size()); }
  for (auto const v : val) {
    put_int(v);
  }
  current_field_value = field;
}

template <typename Enum>
inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector<Enum> const& val)
{
put_field_header(field, current_field_value, ST_FLD_LIST);
put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_I32));
if (val.size() >= 0xf) put_uint(val.size());
for (auto& v : val) {
put_byte(static_cast<uint8_t>((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I32));
if (val.size() >= 0xfUL) { put_uint(val.size()); }
for (auto const& v : val) {
put_int(static_cast<int32_t>(v));
}
current_field_value = field;
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/io/parquet/compact_protocol_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class CompactProtocolWriter {
size_t write(Statistics const&);
size_t write(PageLocation const&);
size_t write(OffsetIndex const&);
size_t write(SizeStatistics const&);
size_t write(ColumnOrder const&);

protected:
Expand Down Expand Up @@ -113,4 +114,8 @@ class CompactProtocolFieldWriter {
inline void set_current_field(int const& field);
};

template <>
inline void CompactProtocolFieldWriter::field_int_list<int64_t>(int field,
std::vector<int64_t> const& val);

} // namespace cudf::io::parquet::detail
Loading

0 comments on commit c36c2cf

Please sign in to comment.