Add codespell as a linter #12097

Merged: 17 commits, Nov 22, 2022
Changes from 11 commits
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml

@@ -131,6 +131,16 @@ repos:
        language: system
        pass_filenames: false
        verbose: false
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.1.0
+    hooks:
+      - id: codespell
+        exclude: |
+          (?x)^(
+            .*test.*|
+            ^CHANGELOG.md$|
+            ^.*versioneer.py$
+          )

default_language_version:
  python: python3
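Pre-commit matches `exclude` as a Python regex against each candidate file's repo-relative path, so the effect of the pattern above can be sanity-checked on the host. A minimal sketch (illustrative only; the sample paths are hypothetical, not part of the PR):

```python
import re

# The hook's exclude pattern: skip anything with "test" in the path,
# CHANGELOG.md, and versioneer.py.
EXCLUDE = re.compile(
    r"""(?x)^(
    .*test.*|
    ^CHANGELOG.md$|
    ^.*versioneer.py$
    )"""
)

for path in [
    "cpp/tests/io/csv_test.cpp",    # skipped: contains "test"
    "CHANGELOG.md",                 # skipped: explicit exclusion
    "python/cudf/versioneer.py",    # skipped: explicit exclusion
    "cpp/src/io/orc/timezone.cuh",  # checked by codespell
]:
    print(path, "->", "skipped" if EXCLUDE.match(path) else "checked")
```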
40 changes: 20 additions & 20 deletions CHANGELOG.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions CONTRIBUTING.md

@@ -377,6 +377,11 @@ Now code linters and formatters will be run each time you commit changes.

You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`.

+cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling
+mistakes, and this check is run as part of the pre-commit hook. To apply the suggested spelling
+fixes, you can run `codespell -i 3 -w .` from the command line in the cuDF root directory.
+This will bring up an interactive prompt to select which spelling fixes to apply.

## Developer Guidelines

The [C++ Developer Guide](cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code.
2 changes: 1 addition & 1 deletion cpp/benchmarks/common/generate_input.hpp

@@ -430,7 +430,7 @@ class data_profile {
* initialization. The `profile` object in the example above is initialized from
* `data_profile_builder` using an implicit conversion operator.
*
- * The builder API also includes a few additional convinience setters:
+ * The builder API also includes a few additional convenience setters:
* Overload of `distribution` that only takes the distribution type (not the range).
* `no_validity`, which is a simpler equivalent of `null_probability(std::nullopt)`.
*/
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/common/random_distribution_factory.cuh

@@ -33,7 +33,7 @@
#include <type_traits>

/**
- * @brief Real Type that has atleast number of bits of integral type in its mantissa.
+ * @brief Real Type that has at least number of bits of integral type in its mantissa.
* number of bits of integrals < 23 bits of mantissa in float
* to allow full range of integer bits to be generated.
* @tparam T integral type
2 changes: 1 addition & 1 deletion cpp/examples/strings/custom_optimized.cu

@@ -144,7 +144,7 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());

// last element is the total output size
- // (device-to-host copy of 1 integer -- includes synching the stream)
+ // (device-to-host copy of 1 integer -- includes syncing the stream)
cudf::size_type output_size = offsets.back_element(stream);

// create chars vector
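The pattern in this hunk -- exclusive-scan the per-row sizes so the result doubles as both the output offsets and, in its last element, the total output size -- is easy to sketch on the host. An illustrative Python analogue (not the thrust call itself):

```python
# Exclusive scan over per-row output sizes: entry i is where row i starts,
# and the appended final entry is the total output size.
def sizes_to_offsets(sizes):
    offsets, total = [], 0
    for s in sizes:
        offsets.append(total)
        total += s
    offsets.append(total)  # "last element is the total output size"
    return offsets

print(sizes_to_offsets([3, 0, 5, 2]))  # [0, 3, 3, 8, 10]
```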
2 changes: 1 addition & 1 deletion cpp/include/cudf/fixed_point/temporary.hpp

@@ -53,7 +53,7 @@ auto to_string(T value) -> std::string
} else {
return std::to_string(value);
}
- return std::string{}; // won't ever hit here, need to supress warning though
+ return std::string{}; // won't ever hit here, need to suppress warning though
}

template <typename T>
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/detail/data_casting.cuh

@@ -158,7 +158,7 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_
* @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an
* iterator to one past the last character from the input that was processed, out_it_end is an
* iterator to one past the last character that was written, set_null is true if a null literal
- * was read or a parsing error occured, and is_invalid is true if a parsing error was
+ * was read or a parsing error occurred, and is_invalid is true if a parsing error was
* encountered
*/
template <typename in_iterator_t, typename out_iterator_t>
@@ -334,7 +334,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
auto const num_chars_copied_out =
thrust::distance(out_it, str_process_info.output_processed_end);

- // If, during parsing, an error occured or we parsed the null literal ->
+ // If, during parsing, an error occurred or we parsed the null literal ->
// set to null
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
sizes[row] = 0;
2 changes: 1 addition & 1 deletion cpp/include/cudf/io/types.hpp

@@ -178,7 +178,7 @@ struct source_info {
source_info() = default;

/**
- * @brief Construct a new source info object for mutiple files
+ * @brief Construct a new source info object for multiple files
*
* @param file_paths Input files paths
*/
2 changes: 1 addition & 1 deletion cpp/include/cudf/transpose.hpp

@@ -33,7 +33,7 @@ namespace cudf {
* Stores output in a contiguous column, exposing the transposed table as
* a `table_view`.
*
- * @throw cudf::logic_error if column types are non-homogenous
+ * @throw cudf::logic_error if column types are non-homogeneous
* @throw cudf::logic_error if column types are non-fixed-width
*
* @param[in] input A table (M cols x N rows) to be transposed
2 changes: 1 addition & 1 deletion cpp/scripts/sort_ninja_log.py

@@ -100,7 +100,7 @@ def output_xml(entries, sorted_list, args):
print(xmlstr)


-# utility converts a millisecond value to a colum width in pixels
+# utility converts a millisecond value to a column width in pixels
def time_to_width(value, end):
# map a value from (0,end) to (0,1000)
r = (float(value) / float(end)) * 1000.0
2 changes: 1 addition & 1 deletion cpp/src/io/fst/dispatch_dfa.cuh

@@ -302,7 +302,7 @@ struct DispatchFSM : DeviceFSMPolicy {
}

//------------------------------------------------------------------------------
- // POLICY INVOKATION
+ // POLICY INVOCATION
//------------------------------------------------------------------------------
template <typename ActivePolicyT>
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
2 changes: 1 addition & 1 deletion cpp/src/io/fst/in_reg_array.cuh

@@ -26,7 +26,7 @@ namespace cudf::io::fst::detail {

/**
* @brief A bit-packed array of items that can be backed by registers yet allows to be dynamically
- * addressed at runtime. The data struture is explained in greater detail in the paper <a
+ * addressed at runtime. The data structure is explained in greater detail in the paper <a
* href="http://www.vldb.org/pvldb/vol13/p616-stehle.pdf">ParPaRaw: Massively Parallel Parsing of
* Delimiter-Separated Raw Data</a>.
*
2 changes: 1 addition & 1 deletion cpp/src/io/fst/lookup_tables.cuh

@@ -101,7 +101,7 @@ class SingleSymbolSmemLUT {
// Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id
sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id;

- // Alias memory / return memory requiremenets
+ // Alias memory / return memory requirements
sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 1;

sgid_init.host_to_device(stream);
6 changes: 3 additions & 3 deletions cpp/src/io/json/json_tree.cu

@@ -292,7 +292,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
// Node parent ids:
// previous push node_id transform, stable sort by level, segmented scan with Max, reorder.
rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
- // This block of code is generalized logical stack algorithm. TODO: make this a seperate function.
+ // This block of code is generalized logical stack algorithm. TODO: make this a separate function.
{
rmm::device_uvector<NodeIndexT> node_token_ids(num_nodes, stream);
thrust::copy_if(rmm::exec_policy(stream),
@@ -480,7 +480,7 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
// a. Create a hash map with hash of {node_level, node_type} of its node and the entire parent
// until root.
// b. While creating hashmap, transform node id to unique node ids that are inserted into the
-// hash map. This mimicks set operation with hash map. This unique node ids are set ids.
+// hash map. This mimics set operation with hash map. This unique node ids are set ids.
// c. Return this converted set ids, which are the hash map keys/values, and unique set ids.
std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_node_path(
device_span<TreeDepthT const> node_levels,
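The a/b/c comment in this hunk describes computing a hash over each node's (level, type) path back to the root and inserting it into a hash map so that equal paths collapse to one representative id. A rough host-side sketch of that idea (hypothetical Python, ignoring the field-name hashing and all GPU details):

```python
# Assign each node a "set id": nodes whose (level, type) path from the root
# matches share one id -- a dict standing in for a set, as in step (b).
def path_key(node, parents, levels, types):
    key = []
    while node != -1:  # walk ancestors until the root
        key.append((levels[node], types[node]))
        node = parents[node]
    return tuple(key)

def assign_set_ids(parents, levels, types):
    first_seen = {}  # path key -> first node id seen with that path
    return [
        first_seen.setdefault(path_key(n, parents, levels, types), n)
        for n in range(len(parents))
    ]

# Tiny tree: a root with two children of the same type -> same set id.
print(assign_set_ids(parents=[-1, 0, 0], levels=[0, 1, 1], types=["obj", "str", "str"]))
# [0, 1, 1]
```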
@@ -728,7 +728,7 @@ rmm::device_uvector<size_type> compute_row_offsets(rmm::device_uvector<NodeIndex
// c. sort and use binary search to generate column ids.
// d. Translate parent node ids to parent column ids.
// 2. Generate row_offset.
-// a. filter only list childs
+// a. filter only list children
// a. stable_sort by parent_col_id.
// b. scan_by_key {parent_col_id} (done only on nodes whose parent is a list)
// c. propagate to non-list leaves from parent list node by recursion
4 changes: 2 additions & 2 deletions cpp/src/io/orc/timezone.cuh

@@ -49,9 +49,9 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years;
/**
* @brief Returns the GMT offset for a given date and given timezone table.
*
- * @param ttimes Transition times; trailing `cycle_entry_cnt` entires are used for all times
+ * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times
* beyond the one covered by the TZif file
- * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entires are used
+ * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used
* for all times beyond the one covered by the TZif file
* @param count Number of elements in @p ttimes and @p offsets
* @param ts ORC timestamp
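For context, the lookup this doc comment describes is the usual timezone-table pattern: find the last transition time at or before the timestamp and return its offset. A hedged host-side sketch (illustrative Python; the real kernel also handles the trailing cycle entries for dates past the TZif range):

```python
from bisect import bisect_right

def get_gmt_offset(ttimes, offsets, ts):
    # Index of the last transition time <= ts; clamp to 0 for timestamps
    # before the first recorded transition.
    idx = max(bisect_right(ttimes, ts) - 1, 0)
    return offsets[idx]

# Two transitions: offset 0 before t=100, +3600 from t=100 onward.
print(get_gmt_offset([0, 100], [0, 3600], 50))   # 0
print(get_gmt_offset([0, 100], [0, 3600], 150))  # 3600
```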
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_data.cu

@@ -1801,7 +1801,7 @@ void PreprocessColumnData(hostdevice_vector<PageInfo>& pages,
// PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account.
// PageInfo::skipped_values, which tells us where to start decoding in the input .
// It is only necessary to do this second pass if uses_custom_row_bounds is set (if the user has
- // specified artifical bounds).
+ // specified artificial bounds).
if (uses_custom_row_bounds) {
gpuComputePageSizes<<<dim_grid, dim_block, 0, stream.value()>>>(
pages.device_ptr(), chunks, min_row, num_rows, true);
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/page_enc.cu

@@ -1491,9 +1491,9 @@ __device__ bool increment_utf8_at(unsigned char* ptr)
// elem is one of (no 5 or 6 byte chars allowed):
// 0b0vvvvvvv a 1 byte character
// 0b10vvvvvv a continuation byte
-// 0b110vvvvv start of a 2 byte characther
-// 0b1110vvvv start of a 3 byte characther
-// 0b11110vvv start of a 4 byte characther
+// 0b110vvvvv start of a 2 byte character
+// 0b1110vvvv start of a 3 byte character
+// 0b11110vvv start of a 4 byte character

// TODO(ets): starting at 4 byte and working down. Should probably start low and work higher.
uint8_t mask = 0xF8;
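The byte classes listed in the comment above can be checked with lead-byte masks; the `mask = 0xF8` line starts at the 4-byte pattern and the TODO suggests scanning the other way. A small sketch of that classification (illustrative Python, not the CUDA code):

```python
# Classify a UTF-8 byte by its high bits (no 5- or 6-byte forms allowed).
def utf8_byte_class(b):
    if b & 0x80 == 0x00:
        return "1-byte character"           # 0b0vvvvvvv
    if b & 0xC0 == 0x80:
        return "continuation byte"          # 0b10vvvvvv
    if b & 0xE0 == 0xC0:
        return "start of 2-byte character"  # 0b110vvvvv
    if b & 0xF0 == 0xE0:
        return "start of 3-byte character"  # 0b1110vvvv
    if b & 0xF8 == 0xF0:
        return "start of 4-byte character"  # 0b11110vvv
    return "invalid"

print(utf8_byte_class(0x41))  # 1-byte character ('A')
print(utf8_byte_class(0xE2))  # start of 3-byte character
```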
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/reader_impl.hpp

@@ -128,7 +128,7 @@ class reader::impl {
std::vector<std::unique_ptr<column>>& out_columns);

/**
- * @brief Allocate data bufers for the output columns.
+ * @brief Allocate data buffers for the output columns.
*
* @param skip_rows Crop all rows below skip_rows
* @param num_rows Maximum number of rows to read
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/reader_impl_helpers.cpp

@@ -80,7 +80,7 @@ type_id to_type_id(SchemaElement const& schema,
int32_t decimal_scale = schema.decimal_scale;

// Logical type used for actual data interpretation; the legacy converted type
- // is superceded by 'logical' type whenever available.
+ // is superseded by 'logical' type whenever available.
auto const inferred_converted_type = logical_type_to_converted_type(logical_type);
if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type;
if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0)
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu

@@ -1772,7 +1772,7 @@ void writer::impl::write(table_view const& table, std::vector<partition_info> co
cudaMemcpyDeviceToHost,
stream.value()));

- // calculate offsets while the column index is transfering
+ // calculate offsets while the column index is transferring
int64_t curr_pg_offset = column_chunk_meta.data_page_offset;

OffsetIndex offset_idx;
2 changes: 1 addition & 1 deletion cpp/src/io/statistics/byte_array_view.cuh

@@ -21,7 +21,7 @@
namespace cudf::io::statistics {

/**
- * @brief Wrapper for a row of a list<int8> or list<uint8> column. This is analagous to
+ * @brief Wrapper for a row of a list<int8> or list<uint8> column. This is analogous to
* `string_view` in type. It was created due to the need for comparison operators for cub reduce on
* statistics. Otherwise, it is a device_span in all but name.
*
6 changes: 3 additions & 3 deletions cpp/src/io/utilities/type_inference.cuh

@@ -114,7 +114,7 @@ __device__ __inline__ bool is_like_float(std::size_t len,
*
* @param[in] options View of inference options
* @param[in] data JSON string input
- * @param[in] column_strings_begin The begining of an offset-length tuple sequence
+ * @param[in] column_strings_begin The beginning of an offset-length tuple sequence
* @param[in] size Size of the string input
* @param[out] column_info Histogram of column type counters
*/
@@ -234,7 +234,7 @@ __global__ void infer_column_type_kernel(OptionsView options,
*
* @param options View of inference options
* @param data JSON string input
- * @param column_strings_begin The begining of an offset-length tuple sequence
+ * @param column_strings_begin The beginning of an offset-length tuple sequence
* @param size Size of the string input
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A histogram containing column-specific type counters
@@ -272,7 +272,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
*
* @param options View of inference options
* @param data JSON string input
- * @param column_strings_begin The begining of an offset-length tuple sequence
+ * @param column_strings_begin The beginning of an offset-length tuple sequence
* @param size Size of the string input
* @param stream CUDA stream used for device memory operations and kernel launches
* @return The inferred data type
2 changes: 1 addition & 1 deletion cpp/src/partitioning/partitioning.cu

@@ -266,7 +266,7 @@ __global__ void copy_block_partitions(InputIter input_iter,
using BlockScan = cub::BlockScan<size_type, OPTIMIZED_BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_storage;

- // use ELEMENTS_PER_THREAD=2 to support upto 1024 partitions
+ // use ELEMENTS_PER_THREAD=2 to support up to 1024 partitions
size_type temp_histo[ELEMENTS_PER_THREAD];

for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) {
2 changes: 1 addition & 1 deletion cpp/src/quantiles/quantile.cu

@@ -168,7 +168,7 @@ std::unique_ptr<column> quantile(column_view const& input,

} else {
CUDF_EXPECTS(indices.type() == data_type{type_to_id<size_type>()},
"`indicies` type must be `INT32`.");
"`indices` type must be `INT32`.");
if (exact) {
return quantile<true>(
input, indices.begin<size_type>(), indices.size(), q, interp, exact, stream, mr);
2 changes: 1 addition & 1 deletion cpp/src/strings/copying/concatenate.cu

@@ -137,7 +137,7 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const*
auto const* input_data = input_view.child(offsets_child).data<int32_t>();
output_data[output_index] =
input_data[offset_index + input_view.offset()] // handle parent offset
-  - input_data[input_view.offset()] // subract first offset if non-zero
+  - input_data[input_view.offset()] // subtract first offset if non-zero
+ partition_offsets[partition_index]; // add offset of source column

if (Nullable) {
2 changes: 1 addition & 1 deletion cpp/src/strings/regex/regcomp.cpp

@@ -39,7 +39,7 @@ namespace {
enum OperatorType : int32_t {
START = 0200, // Start, used for marker on stack
LBRA_NC = 0203, // non-capturing group
- CAT = 0205, // Concatentation, implicit operator
+ CAT = 0205, // Concatenation, implicit operator
STAR = 0206, // Closure, *
STAR_LAZY = 0207,
PLUS = 0210, // a+ == aa*
4 changes: 2 additions & 2 deletions cpp/tests/groupby/tdigest_tests.cu

@@ -173,7 +173,7 @@ TYPED_TEST(TDigestAllTypes, LargeGroups)
values.push_back(v[0]);
}

- // generate a seperate tdigest for each group
+ // generate a separate tdigest for each group
std::vector<std::unique_ptr<column>> parts;
std::transform(
iter, iter + values.size(), std::back_inserter(parts), [&keys, &values, delta](int i) {
@@ -283,7 +283,7 @@ TEST_F(TDigestMergeTest, Grouped)

int const delta = 1000;

- // generate seperate digests
+ // generate separate digests
std::vector<std::unique_ptr<column>> parts;
auto iter = thrust::make_counting_iterator(0);
std::transform(
2 changes: 1 addition & 1 deletion cpp/tests/io/text/data_chunk_source_test.cpp

@@ -109,7 +109,7 @@ TEST_F(DataChunkSourceTest, DataSourceHost)
TEST_F(DataChunkSourceTest, DataSourceFile)
{
std::string content = "file datasource";
- // make it big enought to have is_device_read_preferred return true
+ // make it big enough to have is_device_read_preferred return true
content.reserve(content.size() << 20);
for (int i = 0; i < 20; i++) {
content += content;
4 changes: 2 additions & 2 deletions cpp/tests/io/text/multibyte_split_test.cpp

@@ -240,7 +240,7 @@ TEST_F(MultibyteSplitTest, HandpickedInput)
"another::|"
"simple::|"
"text::|"
"seperated::|"
"separated::|"
"by::|"
"emojis::|"
"which::|"
@@ -258,7 +258,7 @@
auto expected = strings_column_wrapper{
"aaa::|", "bbb::|", "ccc::|", "ddd::|", "eee::|", "fff::|",
"ggg::|", "hhh::|", "___::|", "here::|", "is::|", "another::|",
"simple::|", "text::|", "seperated::|", "by::|", "emojis::|", "which::|",
"simple::|", "text::|", "separated::|", "by::|", "emojis::|", "which::|",
"are::|", "multiple::|", "bytes::|", "and::|", "used::|", "as::|",
"delimiters.::|", "::|", "::|", "::|"};
