fix merge conflict

rapidsai · Jan 17, 2020 · 74dcfe4 · 74dcfe4
2 parents 572a5c9 + fc5ec20
commit 74dcfe4
Show file tree

Hide file tree

Showing 23 changed files with 634 additions and 77 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,11 +14,13 @@
 - PR #3667 Define and implement round-robin partition API.
 - PR #3690 Add bools_to_mask
 - PR #3683 Added support for multiple delimiters in `nvtext.token_count()`
+- PR #3792 Adding is_nan and is_notnan
 
 ## Improvements
 
 - PR #3292 Port NVStrings regex contains function
 - PR #3409 Port NVStrings regex replace function
+- PR #3417 Port NVStrings regex findall function
 - PR #3351 Add warning when filepath resolves to multiple files in cudf readers
 - PR #3370 Port NVStrings strip functions
 - PR #3453 Port NVStrings IPv4 convert functions to cudf strings column
@@ -45,6 +47,7 @@
 - PR #3640 Enable memory_usage in dask_cudf (also adds pd.Index from_pandas)
 - PR #3654 Update Jitify submodule ref to include gcc-8 fix
 - PR #3639 Define and implement `nans_to_nulls`
+- PR #3561 Rework contains implementation in search
 - PR #3616 Add aggregation infrastructure for argmax/argmin.
 - PR #3699 Stringify libcudacxx headers for binary op JIT
 - PR #3697 Improve column insert performance for wide frames
@@ -53,6 +56,7 @@
 - PR #3657 Define and implement compiled binops for string column comparisons
 - PR #3520 Change read_parquet defaults and add warnings
 - PR #3780 Java APIs for selecting a GPU
+- PR #3805 Avoid CuPy 7.1.0 for now
 
 ## Bug Fixes
 
@@ -91,6 +95,8 @@
 - PR #3783 Bind cuDF operators to Dask Dataframe
 - PR #3775 Fix segfault when reading compressed CSV files larger than 4GB
 - PR #3803 Keep name when unpickling Index objects
+- PR #3804 Fix cuda crash in AVRO reader
+- PR #3766 Remove references to cudf::type_id::CATEGORY from IO code
 
 
 # cuDF 0.11.0 (11 Dec 2019)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -60,7 +60,7 @@ source activate gdf
 conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
               "dask>=2.1.0" "distributed>=2.1.0" "numpy>=1.16" "double-conversion" \
               "rapidjson" "flatbuffers" "boost-cpp" "fsspec>=0.3.3" "dlpack" \
-              "feather-format" "cupy>=6.6.0,<8.0.0a0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \
+              "feather-format" "cupy>=6.6.0,<8.0.0a0,!=7.1.0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \
               "fastavro>=0.22.0" "pandas>=0.25,<0.26" "hypothesis" "s3fs" "gcsfs" \
               "boto3" "moto" "httpretty" "streamz"
 

diff --git a/conda/environments/cudf_dev_cuda10.0.yml b/conda/environments/cudf_dev_cuda10.0.yml
@@ -6,7 +6,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - cupy>=6.6.0,<8.0.0a0
+  - cupy>=6.6.0,<8.0.0a0,!=7.1.0
   - rmm=0.12.*
   - cmake>=3.12
   - cmake_setuptools>=0.1.3

diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml
@@ -6,7 +6,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - cupy>=6.6.0,<8.0.0a0
+  - cupy>=6.6.0,<8.0.0a0,!=7.1.0
   - rmm=0.12.*
   - cmake>=3.12
   - cmake_setuptools>=0.1.3

diff --git a/conda/environments/cudf_dev_cuda9.2.yml b/conda/environments/cudf_dev_cuda9.2.yml
@@ -6,7 +6,7 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - cupy>=6.6.0,<8.0.0a0
+  - cupy>=6.6.0,<8.0.0a0,!=7.1.0
   - rmm=0.12.*
   - cmake>=3.12
   - cmake_setuptools>=0.1.3

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -31,7 +31,7 @@ requirements:
   run:
     - python
     - pandas>=0.24.2,<0.25
-    - cupy >=6.6.0,<8.0.0a0
+    - cupy >=6.6.0,<8.0.0a0,!=7.1.0
     - numba >=0.46.0
     - pyarrow 0.15.0.*
     - fastavro >=0.22.0

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -473,6 +473,7 @@ add_library(cudf
             src/merge/legacy/merge.cu
             src/unary/cast_ops.cu
             src/unary/null_ops.cu
+            src/unary/nan_ops.cu
             src/unary/legacy/math_ops.cu
             src/unary/legacy/cast_ops.cu
             src/unary/legacy/null_ops.cu
@@ -576,6 +577,7 @@ add_library(cudf
             src/strings/copying/copying.cu
             src/strings/extract.cu
             src/strings/find.cu
+            src/strings/findall.cu
             src/strings/find_multiple.cu
             src/strings/filling/fill.cu
             src/strings/padding.cu

diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+
+namespace cudf
+{
+namespace strings
+{
+
+/**
+ * @brief Returns a table of strings columns for each matching occurrence of the
+ * regex pattern within each string.
+ *
+ * The number of output columns is determined by the string with the most
+ * matches.
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern Regex pattern to match within each string.
+ * @param mr Resource for allocating device memory.
+ * @return New table of strings columns.
+ */
+std::unique_ptr<experimental::table> findall_re( strings_column_view const& strings,
+                                                 std::string const& pattern,
+                                                 rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
+
+} // namespace strings
+} // namespace cudf
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -93,5 +93,39 @@ std::unique_ptr<cudf::column> is_valid(cudf::column_view const& input,
 std::unique_ptr<column> cast(column_view const& input, data_type out_type,
                              rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 
+/**
+ * @brief Creates a column of `BOOL8` elements indicating the presence of `NaN` values
+ * in a column of floating point values.
+ * The output element at row `i` is `true` if the element in `input` at row `i` is `NAN`, else `false`.
+ *
+ * @throws cudf::logic_error if `input` is a non-floating point type
+ *
+ * @param input A column of floating-point elements
+ * @param mr Optional, The resource to use for allocating the device memory in the returned column.
+ *
+ * @returns unique_ptr<column> A non-nullable column of `BOOL8` elements with `true`
+ * representing `NAN` values
+ */
+std::unique_ptr<column> is_nan(cudf::column_view const& input,
+                               rmm::mr::device_memory_resource* mr =
+                                   rmm::mr::get_default_resource());
+
+/**
+ * @brief Creates a column of `BOOL8` elements indicating the absence of `NaN` values
+ * in a column of floating point values.
+ * The output element at row `i` is `false` if the element in `input` at row `i` is `NAN`, else `true`.
+ *
+ * @throws cudf::logic_error if `input` is a non-floating point type
+ *
+ * @param input A column of floating-point elements
+ * @param mr Optional, The resource to use for allocating the device memory in the returned column.
+ *
+ * @returns unique_ptr<column> A non-nullable column of `BOOL8` elements with `false`
+ * representing `NAN` values
+ */
+std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
+                                 rmm::mr::device_memory_resource* mr =
+                                   rmm::mr::get_default_resource());
+
 } // namespace experimental
 } // namespace cudf
diff --git a/cpp/src/io/avro/legacy/avro_reader_impl.cu b/cpp/src/io/avro/legacy/avro_reader_impl.cu
@@ -452,7 +452,7 @@ void reader::Impl::decode_data(
       static_cast<block_desc_s *>(block_list.data()), schema_desc.device_ptr(),
       reinterpret_cast<gpu::nvstrdesc_s *>(global_dictionary.device_ptr()),
       static_cast<const uint8_t *>(block_data.data()),
-      static_cast<uint32_t>(block_list.size()),
+      static_cast<uint32_t>(md_->block_list.size()),
       static_cast<uint32_t>(schema_desc.size()),
       static_cast<uint32_t>(total_dictionary_entries), md_->num_rows,
       md_->skip_rows, min_row_data_size, 0));

diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
@@ -329,7 +329,7 @@ void reader::impl::decode_data(
       static_cast<block_desc_s *>(block_list.data()), schema_desc.device_ptr(),
       reinterpret_cast<gpu::nvstrdesc_s *>(global_dictionary.device_ptr()),
       static_cast<const uint8_t *>(block_data.data()),
-      static_cast<uint32_t>(block_list.size()),
+      static_cast<uint32_t>(_metadata->block_list.size()),
       static_cast<uint32_t>(schema_desc.size()),
       static_cast<uint32_t>(total_dictionary_entries), _metadata->num_rows,
       _metadata->skip_rows, min_row_data_size, stream));

diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
@@ -679,8 +679,7 @@ __global__ void convertCsvToGdf(const char *raw_csv, const ParseOptions opts,
 
       // Modify start & end to ignore whitespace and quotechars
       long tempPos = pos - 1;
-      if (!is_na && dtype[actual_col].id() != cudf::type_id::CATEGORY &&
-          dtype[actual_col].id() != cudf::type_id::STRING) {
+      if (!is_na && dtype[actual_col].id() != cudf::type_id::STRING) {
         trim_field_start_end(raw_csv, &start, &tempPos, opts.quotechar);
       }
 

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
@@ -128,7 +128,7 @@ data_type convertStringToDtype(const std::string &dtype) {
     return data_type(cudf::type_id::TIMESTAMP_MICROSECONDS);
   if (dtype == "timestamp[ns]")
     return data_type(cudf::type_id::TIMESTAMP_NANOSECONDS);
-  if (dtype == "category") return data_type(cudf::type_id::CATEGORY);
+  if (dtype == "category") return data_type(cudf::type_id::INT32);
   if (dtype == "date32") return data_type(cudf::type_id::TIMESTAMP_DAYS);
   if (dtype == "bool" || dtype == "boolean")
     return data_type(cudf::type_id::BOOL8);

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
@@ -91,8 +91,6 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) {
     case cudf::type_id::TIMESTAMP_MILLISECONDS:
     case cudf::type_id::TIMESTAMP_NANOSECONDS:
       return TypeKind::TIMESTAMP;
-    case cudf::type_id::CATEGORY:
-      return TypeKind::INT;
     case cudf::type_id::STRING:
       return TypeKind::STRING;
     default:

diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
@@ -97,7 +97,7 @@ constexpr type_id to_type_id(parquet::Type physical,
     case parquet::BYTE_ARRAY:
     case parquet::FIXED_LEN_BYTE_ARRAY:
       // Can be mapped to GDF_CATEGORY (32-bit hash) or GDF_STRING (nvstring)
-      return strings_to_categorical ? type_id::CATEGORY : type_id::STRING;
+      return strings_to_categorical ? type_id::INT32 : type_id::STRING;
     case parquet::INT96:
       return (timestamp_type_id != type_id::EMPTY)
                  ? timestamp_type_id
@@ -146,7 +146,7 @@ std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
     type_width = 1;  // I32 -> I8
   } else if (column_type_id == type_id::INT16) {
     type_width = 2;  // I32 -> I16
-  } else if (column_type_id == type_id::CATEGORY) {
+  } else if (column_type_id == type_id::INT32) {
     type_width = 4;  // str -> hash32
   } else if (is_timestamp(data_type{column_type_id})) {
     clock_rate = to_clockrate(timestamp_type_id);

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
@@ -128,7 +128,6 @@ class parquet_column_view {
         _stats_dtype = statistics_dtype::dtype_int16;
         break;
       case cudf::type_id::INT32:
-      case cudf::type_id::CATEGORY:
         _physical_type = Type::INT32;
         _stats_dtype = statistics_dtype::dtype_int32;
         break;

diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
@@ -21,7 +21,9 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
+
 #include <hash/unordered_multiset.cuh>
+#include <cudf/detail/iterator.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <strings/utilities.hpp>
@@ -122,72 +124,28 @@ std::unique_ptr<column> search_ordered(table_view const& t,
   return result;
 }
 
-template <typename Element, bool nullable = true>
-struct compare_with_value{
-  compare_with_value(column_device_view c, Element val, bool val_is_valid, bool nulls_are_equal)
-
-    : col{c}, value{val}, val_is_valid{val_is_valid}, nulls_are_equal{nulls_are_equal} {}
-
-  __device__ bool operator()(size_type i) noexcept {
-    if (nullable) {
-      bool const col_is_null{col.nullable() and col.is_null(i)};
-      if (col_is_null and not val_is_valid)
-        return nulls_are_equal;
-      else if (col_is_null == val_is_valid)
-        return false;
-    }
-
-    return equality_compare<Element>(col.element<Element>(i), value);
-  }
-
-  column_device_view        col;
-  Element                   value;
-  bool val_is_valid;
-  bool nulls_are_equal;
-};
-
-template <typename Element>
-void populate_element(scalar const& value, Element &e) {
-  using ScalarType = cudf::experimental::scalar_type_t<Element>;
-  auto s1 = static_cast<const ScalarType *>(&value);
-
-  e = s1->value();
-}
-
-template <>
-void populate_element<string_view>(scalar const& value, string_view &e) {
-  using ScalarType = cudf::experimental::scalar_type_t<string_view>;
-  auto s1 = static_cast<const ScalarType *>(&value);
-
-  e = string_view{s1->data(), s1->size()};
-}
-
 struct contains_scalar_dispatch {
   template <typename Element>
   bool operator()(column_view const& col, scalar const& value,
-                  cudaStream_t stream,
-                  rmm::mr::device_memory_resource *mr) {
+                  cudaStream_t stream, rmm::mr::device_memory_resource *mr) {
 
+    using ScalarType = cudf::experimental::scalar_type_t<Element>;
     auto d_col = column_device_view::create(col, stream);
-    auto data_it = thrust::make_counting_iterator<size_type>(0);
-
-    bool    element_is_valid{value.is_valid()};
-    Element element;
-
-    populate_element(value, element);
+    auto s = static_cast<const ScalarType *>(&value);
 
     if (col.has_nulls()) {
-      auto eq_op = compare_with_value<Element, true>(*d_col, element, element_is_valid, true);
+      auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream),
+                                     d_col->pair_begin<Element, true>(),
+                                     d_col->pair_end<Element, true>(),
+                                     thrust::make_pair(s->value(), true));
 
-      return thrust::any_of(rmm::exec_policy(stream)->on(stream),
-                            data_it, data_it + col.size(),
-                            eq_op);
+      return found_iter != d_col->pair_end<Element, true>();
     } else {
-      auto eq_op = compare_with_value<Element, false>(*d_col, element, element_is_valid, true);
+      auto found_iter =  thrust::find(rmm::exec_policy(stream)->on(stream),
+                                      d_col->begin<Element>(),
+                                      d_col->end<Element>(), s->value());
 
-      return thrust::any_of(rmm::exec_policy(stream)->on(stream),
-                            data_it, data_it + col.size(),
-                            eq_op);
+      return found_iter != d_col->end<Element>();
     }
   }
 };