Merge pull request #5687 from davidwendt/split-record-to-list

[REVIEW] Change strings::split_record to return a lists column
rapidsai · Jul 21, 2020 · 3d287b9 · 3d287b9
2 parents ed84164 + ca7212d
commit 3d287b9
Show file tree

Hide file tree

Showing 6 changed files with 529 additions and 826 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -123,6 +123,7 @@
 - PR #5662 Make Java ColumnVector(long nativePointer) constructor public
 - PR #5679 Use `pickle5` to test older Python versions
 - PR #5684 Use `pickle5` in `Serializable` (when available)
+- PR #5687 Change strings::split_record to return a lists column
 - PR #5708 Add support for `dummy_na` in `get_dummies`
 - PR #5709 Update java build to help cu-spacial with java bindings
 - PR #5713 Remove old NVTX utilities

diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,82 +82,149 @@ std::unique_ptr<table> rsplit(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 
 /**
- * @brief The result(s) of a `contiguous_(r)split_record`
- *
- * Each column_view resulting from a split operation performed by
- * contiguous_split_record will be returned wrapped in a
- * `contiguous_split_record_result`. The column data addresses stored in the
- * column_view objects are not owned by top level cudf::column objects. The
- * backing memory is instead owned by the `all_data` field and in one contiguous
- * block.
- *
- * The user is responsible for assuring that the `column_views` or any derived
- * objects do not outlive the memory owned by `all_data`
- */
-struct contiguous_split_record_result {
-  std::vector<column_view> column_views;
-  std::unique_ptr<rmm::device_buffer> all_data;
-};
-
-/**
- * @brief Splits each element of the input column to a column of tokens storing
- * the resulting columns in a single contiguous block of memory.
- *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
- *
- * Splitting a null string element will result in an empty output column.
- *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @brief Splits individual strings elements into a list of strings.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `delimiter` is searched within each string from beginning to end, and
+ * splitting stops after `maxsplit` splits or when the end of the string is reached.
+ *
+ * If a delimiter is not whitespace and occurs adjacent to another delimiter,
+ * an empty string is produced for that split occurrence. Likewise, a non-whitespace
+ * delimiter produces an empty string if it appears at the beginning or the end
+ * of a string.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
+ * s1 = split_record(s, "_")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = split_record(s, "_", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc_def_g"],
+ *       ["a", "_bc"],
+ *       ["", "ab_cd"],
+ *       ["ab", "cd_"] ]
+ * @endcode
+ *
+ * A whitespace delimiter produces no empty strings.
+ * @code{.pseudo}
+ * s = ["a bc def", "a  bc", " ab cd", "ab cd "]
+ * s1 = split_record(s, "")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def"],
+ *       ["a", "bc"],
+ *       ["ab", "cd"],
+ *       ["ab", "cd"] ]
+ * s2 = split_record(s, "", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc def"],
+ *       ["a", "bc"],
+ *       ["ab", "cd"],
+ *       ["ab", "cd "] ]
+ * @endcode
+ *
+ * A null string element will result in a null list item for that row.
+ *
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be splitted.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- *        string.
+ * @param delimiter The string to identify split points in each string.
  *        Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- *         objects
- *         (each column_view element of the vector holds splits from a string
- *         element of the input column).
+ * @return Lists column of strings.
+ *         Each list in the lists column holds the splits from a single row
+ *         element of the input column.
  */
-contiguous_split_record_result contiguous_split_record(
+std::unique_ptr<column> split_record(
   strings_column_view const& strings,
   string_scalar const& delimiter      = string_scalar(""),
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 
 /**
- * @brief Splits each element of the input column from the end to a column of
- * tokens storing the resulting columns in a single contiguous block of memory.
- *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
- *
- * Splitting a null string element will result in an empty output column.
- *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @brief Splits individual strings elements into a list of strings starting
+ * from the end of each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `delimiter` is searched from end to beginning within each string,
+ * and splitting stops after `maxsplit` splits or when the beginning of the
+ * string is reached.
+ *
+ * If a delimiter is not whitespace and occurs adjacent to another delimiter,
+ * an empty string is produced for that split occurrence. Likewise, a non-whitespace
+ * delimiter produces an empty string if it appears at the beginning or the end
+ * of a string.
+ *
+ * Note that `rsplit_record` and `split_record` produce equivalent results for
+ * the default `maxsplit` value.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
+ * s1 = rsplit_record(s, "_")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = rsplit_record(s, "_", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a_bc_def", "g"],
+ *       ["a_", "bc"],
+ *       ["_ab", "cd"],
+ *       ["ab_cd", ""] ]
+ * @endcode
+ *
+ * A whitespace delimiter produces no empty strings.
+ * @code{.pseudo}
+ * s = ["a bc def", "a  bc", " ab cd", "ab cd "]
+ * s1 = rsplit_record(s, "")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def"],
+ *       ["a", "bc"],
+ *       ["ab", "cd"],
+ *       ["ab", "cd"] ]
+ * s2 = rsplit_record(s, "", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a bc", "def"],
+ *       ["a", "bc"],
+ *       [" ab", "cd"],
+ *       ["ab", "cd"] ]
+ * @endcode
+ *
+ * A null string element will result in a null list item for that row.
+ *
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be splitted.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- *        string.
+ * @param delimiter The string to identify split points in each string.
  *        Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- *         objects
- *         (each column_view element of the vector holds splits from a string
- *         element of the input column).
+ * @return Lists column of strings.
+ *         Each list in the lists column holds the splits from a single row
+ *         element of the input column.
  */
-contiguous_split_record_result contiguous_rsplit_record(
+std::unique_ptr<column> rsplit_record(
   strings_column_view const& strings,
   string_scalar const& delimiter      = string_scalar(""),
   size_type maxsplit                  = -1,

diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
@@ -24,6 +24,7 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
+#include <strings/split/split_utils.cuh>
 
 #include <thrust/binary_search.h>  // upper_bound()
 #include <thrust/copy.h>           // copy_if()
@@ -34,8 +35,8 @@
 namespace cudf {
 namespace strings {
 namespace detail {
+
 using string_index_pair = thrust::pair<const char*, size_type>;
-using position_pair     = thrust::pair<size_type, size_type>;
 
 namespace {
 
@@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer {
   size_type max_tokens;  // maximum number of tokens
 };
 
-/**
- * @brief Instantiated for each string to manage navigating tokens from
- * the beginning or the end of that string.
- */
-struct whitespace_string_tokenizer {
-  /**
-   * @brief Identifies the position range of the next token in the given
-   * string at the specified iterator position.
-   *
-   * Tokens are delimited by one or more whitespace characters.
-   *
-   * @return true if a token has been found
-   */
-  __device__ bool next_token()
-  {
-    if (itr != d_str.begin()) {  // skip these 2 lines the first time through
-      start_position = end_position + 1;
-      ++itr;
-    }
-    if (start_position >= d_str.length()) return false;
-    // continue search for the next token
-    end_position = d_str.length();
-    for (; itr < d_str.end(); ++itr) {
-      if (spaces == (*itr <= ' ')) {
-        if (spaces)
-          start_position = itr.position() + 1;
-        else
-          end_position = itr.position() + 1;
-        continue;
-      }
-      spaces = !spaces;
-      if (spaces) {
-        end_position = itr.position();
-        break;
-      }
-    }
-    return start_position < end_position;
-  }
-
-  /**
-   * @brief Identifies the position range of the previous token in the given
-   * string at the specified iterator position.
-   *
-   * Tokens are delimited by one or more whitespace characters.
-   *
-   * @return true if a token has been found
-   */
-  __device__ bool prev_token()
-  {
-    end_position = start_position - 1;
-    --itr;
-    if (end_position <= 0) return false;
-    // continue search for the next token
-    start_position = 0;
-    for (; itr >= d_str.begin(); --itr) {
-      if (spaces == (*itr <= ' ')) {
-        if (spaces)
-          end_position = itr.position();
-        else
-          start_position = itr.position();
-        continue;
-      }
-      spaces = !spaces;
-      if (spaces) {
-        start_position = itr.position() + 1;
-        break;
-      }
-    }
-    return start_position < end_position;
-  }
-
-  __device__ position_pair token_byte_positions()
-  {
-    return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)};
-  }
-
-  __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false)
-    : d_str{d_str},
-      spaces(true),
-      start_position{reverse ? d_str.length() + 1 : 0},
-      end_position{d_str.length()},
-      itr{reverse ? d_str.end() : d_str.begin()}
-  {
-  }
-
- private:
-  string_view const d_str;
-  bool spaces;  // true if current position is whitespace
-  cudf::string_view::const_iterator itr;
-  size_type start_position;
-  size_type end_position;
-};
-
 /**
  * @brief The tokenizer functions for split() with whitespace.
  *
@@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer {
     size_type token_idx   = 0;
     position_pair token{0, 0};
     while (tokenizer.next_token() && (token_idx < token_count)) {
-      token = tokenizer.token_byte_positions();
+      token = tokenizer.get_token();
       d_tokens[d_strings.size() * (token_idx++)] =
         string_index_pair{d_str.data() + token.first, (token.second - token.first)};
     }
@@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer {
     size_type token_idx   = 0;
     position_pair token{0, 0};
     while (tokenizer.prev_token() && (token_idx < token_count)) {
-      token = tokenizer.token_byte_positions();
+      token = tokenizer.get_token();
       d_tokens[d_strings.size() * (token_count - 1 - token_idx)] =
         string_index_pair{d_str.data() + token.first, (token.second - token.first)};
       ++token_idx;