Pack/unpack functionality to convert tables to and from a serialized …

…format. (#7096) Addresses #3793 Depends on #6864 (This affects contiguous_split.cu. For the purposes of this PR, the only changes that are relevant are those that involve the generation of metadata) - `pack()` performs a `contiguous_split()` on the incoming table to arrange the memory into a unified device buffer, and generates a host-side metadata buffer. These are returned in the `packed_columns` struct. - `unpack()` takes the data stored in the `packed_columns` struct and returns a deserialized `table_view` that points into it. The intent of this functionality is as follows (pseudocode) ``` // serialize-side table_view t; packed_columns p = pack(t); send_over_network(p.gpu_data); send_over_network(p.metadata); // deserialize-side packed_columns p = receive_from_network(); table_view t = unpack(p); ``` This PR also renames `contiguous_split_result` to `packed_table` (which is just a bundled `table_view` and `packed_columns`) Authors: - @nvdbaranec Approvers: - Jake Hemstad (@jrhemstad) - Paul Taylor (@trxcllnt) - Mike Wilson (@hyperbolic2346) URL: #7096
rapidsai · Feb 4, 2021 · fd2d0e2 · fd2d0e2
1 parent 2e71b36
commit fd2d0e2
Show file tree

Hide file tree

Showing 8 changed files with 1,152 additions and 86 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,40 @@
+# cuDF 0.18.0 (Date TBD)
+
+## New Features
+- PR #6856 Add groupby idxmin, idxmax aggregation
+- PR #6847 Add a cmake find module for cuFile in JNI code
+- PR #6902 Implement `DataFrame.quantile` for `datetime` and `timedelta` data types
+- PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1)
+- PR #6929 Add `Index.set_names` api
+- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
+- PR #6885 Share `factorize` implementation with Index and cudf module
+- PR #6775 Implement cudf.DateOffset for months
+- PR #7096 Pack/unpack for serialization/deserialization of table_views
+- PR #7039 Support contains() on lists of primitives
+
+## Improvements
+
+- PR #6938 Pass numeric scalars of the same dtype through numeric binops
+- PR #6275 Update to official libcu++ on Github
+- PR #6838 Fix `columns` & `index` handling in dataframe constructor
+- PR #6750 Remove **kwargs from string/categorical methods
+- PR #6585 Add dictionary support to libcudf groupby functions
+- PR #6909 Support reading byte array backed decimal columns from parquet files
+- PR #6939 Use simplified `rmm::exec_policy`
+- PR #6512 Refactor rolling.cu to reduce compile time
+- PR #6982 Disable some pragma unroll statements in thrust `sort.h`
+- PR #7051 Verify decimal cast in java package
+- PR #7120 Verify window operations on decimal in java package
+
+## Bug Fixes
+
+- PR #6884 Correct the sampling range when sampling with replacement
+- PR #6903 Add null count test for apply_boolean_mask
+- PR #6922 Fix N/A detection for empty fields in CSV reader
+- PR #6912 Fix rmm_mode=managed parameter for gtests
+- PR #6943 Fix join with nulls not equal performance
+- PR #6945 Fix groupby agg/apply behaviour when no key columns are provided 
+- PR #6942 Fix cudf::merge gtest for dictionary columns
 # 0.18.0
 
 Please see https://github.com/rapidsai/cudf/releases/tag/branch-0.18-latest for the latest changes to this development branch.

diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -476,23 +476,50 @@ std::vector<column_view> split(column_view const& input, std::vector<size_type>
  */
 std::vector<table_view> split(table_view const& input, std::vector<size_type> const& splits);
 
+/**
+ * @brief Column data in a serialized format
+ *
+ * @ingroup copy_split
+ *
+ * Contains data from an array of columns in two contiguous buffers: one on host, which contains
+ * table metadata and one on device which contains the table data.
+ */
+struct packed_columns {
+  /**
+   * @brief Host-side metadata buffer used for reconstructing columns via unpack.
+   *
+   * @ingroup copy_split
+   */
+  struct metadata {
+    metadata(std::vector<uint8_t>&& v) : data_(std::move(v)) {}
+    uint8_t const* data() const { return data_.data(); }
+    size_t size() const { return data_.size(); }
+
+   private:
+    std::vector<uint8_t> data_;
+  };
+
+  std::unique_ptr<metadata> metadata_;
+  std::unique_ptr<rmm::device_buffer> gpu_data;
+};
+
 /**
  * @brief The result(s) of a `contiguous_split`
  *
  * @ingroup copy_split
  *
  * Each table_view resulting from a split operation performed by contiguous_split,
- * will be returned wrapped in a `contiguous_split_result`.  The table_view and internal
+ * will be returned wrapped in a `packed_table`.  The table_view and internal
  * column_views in this struct are not owned by a top level cudf::table or cudf::column.
- * The backing memory is instead owned by the `all_data` field and in one
+ * The backing memory and metadata are instead owned by the `data` field; the device memory is one
  * contiguous block.
  *
  * The user is responsible for assuring that the `table` or any derived table_views do
- * not outlive the memory owned by `all_data`
+ * not outlive the memory owned by `data`
  */
-struct contiguous_split_result {
+struct packed_table {
   cudf::table_view table;
-  std::unique_ptr<rmm::device_buffer> all_data;
+  packed_columns data;
 };
 
 /**
@@ -502,7 +529,7 @@ struct contiguous_split_result {
  * @ingroup copy_split
  *
  * The memory for the output views is allocated in a single contiguous `rmm::device_buffer` returned
- * in the `contiguous_split_result`. There is no top-level owning table.
+ * in the `packed_table`. There is no top-level owning table.
  *
  * The returned views of `input` are constructed from a vector of indices, that indicate
  * where each split should occur. The `i`th returned `table_view` is sliced as
@@ -514,7 +541,7 @@ struct contiguous_split_result {
  *
  * @note It is the caller's responsibility to ensure that the returned views
  * do not outlive the viewed device memory contained in the `data` field of the
- * returned contiguous_split_result.
+ * returned packed_table.
  *
  * @code{.pseudo}
  * Example:
@@ -536,11 +563,78 @@ struct contiguous_split_result {
  * @return The set of requested views of `input` indicated by the `splits` and the viewed memory
  * buffer.
  */
-std::vector<contiguous_split_result> contiguous_split(
+std::vector<packed_table> contiguous_split(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Deep-copy a `table_view` into a serialized contiguous memory format
+ *
+ * The metadata from the `table_view` is copied into a host vector of bytes and the data from the
+ * `table_view` is copied into a `device_buffer`. Pass the output of this function into
+ * `cudf::unpack` to deserialize.
+ *
+ * @param input View of the table to pack
+ * @param[in] mr Optional, The resource to use for all returned device allocations
+ * @return packed_columns A struct containing the serialized metadata and data in contiguous host
+ *         and device memory respectively
+ */
+packed_columns pack(cudf::table_view const& input,
+                    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Produce the metadata used for packing a table stored in a contiguous buffer.
+ *
+ * The metadata from the `table_view` is copied into a host vector of bytes which can be used to
+ * construct a `packed_columns` or `packed_table` structure.  The caller is responsible for
+ * guaranteeing that all of the columns in the table point into `contiguous_buffer`.
+ *
+ * @param table View of the table to pack
+ * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced
+ * by the columns in `table`
+ * @param buffer_size The size of `contiguous_buffer`.
+ * @return Host-side metadata (a wrapped vector of bytes) used to `unpack` a packed_columns struct.
+ */
+packed_columns::metadata pack_metadata(table_view const& table,
+                                       uint8_t const* contiguous_buffer,
+                                       size_t buffer_size);
+
+/**
+ * @brief Deserialize the result of `cudf::pack`
+ *
+ * Converts the result of a serialized table into a `table_view` that points to the data stored in
+ * the contiguous device buffer contained in `input`.
+ *
+ * It is the caller's responsibility to ensure that the `table_view` in the output does not outlive
+ * the data in the input.
+ *
+ * No new device memory is allocated in this function.
+ *
+ * @param input The packed columns to unpack
+ * @return The unpacked `table_view`
+ */
+table_view unpack(packed_columns const& input);
+
+/**
+ * @brief Deserialize the result of `cudf::pack`
+ *
+ * Converts the result of a serialized table into a `table_view` that points to the data stored in
+ * the contiguous device buffer contained in `gpu_data` using the metadata contained in the host
+ * buffer `metadata`.
+ *
+ * It is the caller's responsibility to ensure that the `table_view` in the output does not outlive
+ * the data in the input.
+ *
+ * No new device memory is allocated in this function.
+ *
+ * @param metadata The host-side metadata buffer resulting from the initial pack() call
+ * @param gpu_data The device-side contiguous buffer storing the data that will be referenced by
+ * the resulting `table_view`
+ * @return The unpacked `table_view`
+ */
+table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data);
+
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
  *          @p rhs based on the value of the corresponding element in @p boolean_mask

diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -92,13 +92,22 @@ std::unique_ptr<column> shift(
  * @copydoc cudf::contiguous_split
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-std::vector<contiguous_split_result> contiguous_split(
+ **/
+std::vector<packed_table> contiguous_split(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::pack
+ *
+ * @param stream Optional CUDA stream on which to execute kernels
+ **/
+packed_columns pack(cudf::table_view const& input,
+                    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+                    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy,
  * rmm::mr::device_memory_resource*)