shwina · shwina · Nov 5, 2019 · Oct 4, 2019 · Oct 4, 2019 · Oct 7, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,22 +2,27 @@
 
 ## New Features
 
-- PR #3011 Added libcudf++ transition guide
 - PR #2930 JSON Reader: Support ARROW_RANDOM_FILE input
 - PR #2956 Add `cudf::stack` and `cudf::tile`
+- PR #2980 Added nvtext is_vowel/is_consonant functions
 - PR #2987 Add `inplace` arg to `DataFrame.reset_index` and `Series`
+- PR #3011 Added libcudf++ transition guide
 - PR #3129 Add strings column factory from `std::vector`s
 - PR #3054 Add parquet reader support for decimal data types
 - PR #3022 adds DataFrame.astype for cuDF dataframes
 - PR #2962 Add isnull(), notnull() and related functions
 - PR #3025 Move search files to legacy
+- PR #3068 Add `scalar` class
 - PR #3094 Adding `any` and `all` support from libcudf
 - PR #3130 Define and implement new `column_wrapper`
 - PR #3143 Define and implement new copying APIs `slice` and `split`
 - PR #3161 Move merge files to legacy
 - PR #3079 Added support to write ORC files given a local path
 - PR #3192 Add dtype param to cast `DataFrame` on init
 - PR #3223 Java expose underlying buffers
+- PR #3278 Add `to_host` utility to copy `column_view` to host
+- PR #3087 Add new cudf::experimental bool8 wrapper
+- PR #3219 Construct column from column_view
 
 ## Improvements
 
@@ -71,6 +76,13 @@
 - PR #3245 Move binaryop files to legacy
 - PR #3241 Move stream_compaction files to legacy
 - PR #3166 Move reductions to legacy
+- PR #3261 Small cleanup: remove `== true`
+- PR #3268 Adding null ordering per column feature when sorting
+- PR #3239 Adding floating point specialization to comparators for NaNs
+- PR #3270 Move predicates files to legacy
+- PR #3282 Add `num_bitmask_words`
+- PR #3287 Move rolling windows files to legacy
+
 
 ## Bug Fixes
 
@@ -91,8 +103,16 @@
 - PR #3218 Fixes `row_lexicographic_comparator` issue with handling two tables
 - PR #3228 Default initialize RMM when Java native dependencies are loaded
 - PR #3236 Fix Numba 0.46+/CuPy 6.3 interface compatibility
+- PR #3276 Update JNI includes for legacy moves
 - PR #3256 Fix orc writer crash with multiple string columns
 - PR #3211 Fix breaking change caused by rapidsai/rmm#167
+- PR #3265 Fix dangling pointer in `is_sorted`
+- PR #3267 ORC writer: fix incorrect ByteRLE encoding of long literal runs
+- PR #3277 Fix invalid reference to deleted temporary in `is_sorted`
+- PR #3274 ORC writer: fix integer RLEv2 mode2 unsigned base value encoding
+- PR #3279 Fix shutdown hang issues with pinned memory pool init executor
+- PR #3280 Invalid children check in mutable_column_device_view
+
 
 # cuDF 0.10.0 (16 Oct 2019)
 

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -71,10 +71,10 @@ test:
     - test -f $PREFIX/include/cudf/ipc.hpp
     - test -f $PREFIX/include/cudf/legacy/merge.hpp
     - test -f $PREFIX/include/cudf/legacy/join.hpp
-    - test -f $PREFIX/include/cudf/predicates.hpp
+    - test -f $PREFIX/include/cudf/legacy/predicates.hpp
     - test -f $PREFIX/include/cudf/legacy/reduction.hpp
     - test -f $PREFIX/include/cudf/legacy/replace.hpp
-    - test -f $PREFIX/include/cudf/rolling.hpp
+    - test -f $PREFIX/include/cudf/legacy/rolling.hpp
     - test -f $PREFIX/include/cudf/legacy/search.hpp
     - test -f $PREFIX/include/cudf/legacy/stream_compaction.hpp
     - test -f $PREFIX/include/cudf/legacy/transform.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -366,17 +366,17 @@ add_library(cudf
             src/strings/nvcategory_util.cpp
             src/join/legacy/joining.cu
             src/orderby/legacy/orderby.cu
-            src/predicates/is_sorted.cu
+            src/predicates/legacy/is_sorted.cu
             src/sort/legacy/digitize.cu
             src/groupby/hash/legacy/groupby.cu
             src/groupby/sort/legacy/sort_helper.cu
             src/groupby/sort/legacy/groupby.cu
             src/groupby/legacy/groupby_without_aggregation.cu
             src/groupby/common/legacy/aggregation_requests.cpp
-            src/rolling/rolling.cu
-            src/rolling/jit/code/kernel.cpp
-            src/rolling/jit/code/operation.cpp
-            src/rolling/jit/util/type.cpp
+            src/rolling/legacy/rolling.cu
+            src/rolling/legacy/jit/code/kernel.cpp
+            src/rolling/legacy/jit/code/operation.cpp
+            src/rolling/legacy/jit/util/type.cpp
             src/binaryop/legacy/binaryop.cpp
             src/binaryop/legacy/compiled/binary_ops.cu
             src/binaryop/legacy/jit/code/kernel.cpp
@@ -481,11 +481,14 @@ add_library(cudf
             src/bitmask/null_mask.cu
             src/sort/sort.cu
             src/strings/strings_column_factories.cu
+            src/strings/strings_scalar_factories.cpp
             src/strings/strings_column_view.cu
             src/strings/utilities.cu
             src/strings/copying/copying.cu
             src/strings/sorting/sorting.cu
-            src/column/legacy/interop.cpp)
+            src/column/legacy/interop.cpp
+            src/scalar/scalar.cpp
+            src/scalar/scalar_factories.cpp)
 
 # Rename installation to proper names for later finding
 set_target_properties(libNVStrings PROPERTIES OUTPUT_NAME "NVStrings")

diff --git a/cpp/custrings/tests/test_text.cu b/cpp/custrings/tests/test_text.cu
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include <vector>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 
 #include "nvstrings/NVStrings.h"
 #include "nvstrings/NVText.h"
@@ -186,6 +188,43 @@ TEST_F(TestText, PorterStemmerMeasure)
     NVStrings::destroy(strs);
 }
 
+TEST_F(TestText, VowelsAndConsonants) // exercises NVText::is_letter with one fixed position and with per-string positions
+{
+    std::vector<const char*> hstrs{ "abandon", nullptr, "abbey", "cleans",
+                                    "trouble", "", "yearly" }; // includes a null entry and an empty string
+    NVStrings* strs = NVStrings::create_from_array(hstrs.data(),hstrs.size());
+
+    thrust::device_vector<bool> results(hstrs.size(),0); // device vector of result flags, one per string
+    {
+        NVText::is_letter(*strs, nullptr, nullptr, NVText::vowel, 5, results.data().get()); // position 5 in every string; nullptr vowels/y_char
+        bool expected[] = { true, false, false, false, false, false, true }; // null, empty and too-short ("abbey" has 5 characters) entries are false
+        for( unsigned int idx=0; idx < hstrs.size(); ++idx )
+            EXPECT_EQ(results[idx],expected[idx]);
+    }
+    {
+        NVText::is_letter(*strs, nullptr, nullptr, NVText::consonant, 5, results.data().get()); // same position, now asking for consonants
+        bool expected[] = { false, false, false, true, true, false, false }; // null, empty and too-short entries are false here as well
+        for( unsigned int idx=0; idx < hstrs.size(); ++idx )
+            EXPECT_EQ(results[idx],expected[idx]);
+    }
+    thrust::device_vector<int> indices(hstrs.size()); // one character position per string
+    thrust::sequence( thrust::device, indices.begin(), indices.end() ); // positions 0,1,2,...
+    indices[hstrs.size()-1] = -1; // throw in a negative index too
+    {
+        NVText::is_letter(*strs, nullptr, nullptr, NVText::vowel, indices.data().get(), results.data().get()); // per-string positions overload
+        bool expected[] = { true, false, false, true, false, false, true }; // last entry: position -1 in "yearly"
+        for( unsigned int idx=0; idx < hstrs.size(); ++idx )
+            EXPECT_EQ(results[idx],expected[idx]);
+    }
+    {
+        NVText::is_letter(*strs, nullptr, nullptr, NVText::consonant, indices.data().get(), results.data().get()); // per-string positions, consonant check
+        bool expected[] = { false, false, true, false, true, false, false };
+        for( unsigned int idx=0; idx < hstrs.size(); ++idx )
+            EXPECT_EQ(results[idx],expected[idx]);
+    }
+
+    NVStrings::destroy(strs);
+}
 
 TEST_F(TestText, ScatterCount)
 {

diff --git a/cpp/custrings/text/stemmer.cu b/cpp/custrings/text/stemmer.cu
@@ -19,6 +19,12 @@
 #include <thrust/for_each.h>
 #include <rmm/rmm.h>
 #include <rmm/thrust_rmm_allocator.h>
+
+// NOTE: These are cudf headers. Please be cautious.
+// Using anything from these headers besides macros or typedefs
+// will not work because this module is built before libcudf
+// and therefore will not be able to link to any functions there.
+// This module will be reworked appropriately in the future.
 #include <cudf/utilities/error.hpp>
 
 #include "nvstrings/NVStrings.h"
@@ -27,14 +33,15 @@
 #include "../custring_view.cuh"
 #include "../util.h"
 
-struct porter_stemmer_measure_fn
+struct stemmer_base_fn
 {
-    custring_view_array d_strings;
     custring_view* d_vowels;
     Char y_char;
-    unsigned int* d_results;
 
-    __device__ bool is_consonant( custring_view* dstr, int index )
+    stemmer_base_fn( custring_view* d_vowels, Char y_char )
+    : d_vowels(d_vowels), y_char(y_char) {}
+
+    __device__ bool is_consonant( custring_view* dstr, int index ) const
     {
         Char ch = dstr->at(index);
         if( d_vowels->find(ch) >= 0 )
@@ -44,6 +51,16 @@ struct porter_stemmer_measure_fn
         ch = dstr->at(index-1);       // only if previous char
         return d_vowels->find(ch)>=0; // is not a consonant
     }
+};
+
+struct porter_stemmer_measure_fn : public stemmer_base_fn
+{
+    custring_view_array d_strings;
+    unsigned int* d_results;
+
+    porter_stemmer_measure_fn( custring_view* d_vowels, Char y_char,
+                               custring_view_array d_strings, unsigned int* d_results )
+    : stemmer_base_fn(d_vowels,y_char), d_strings(d_strings), d_results(d_results) {}
 
     __device__ void operator()(unsigned int idx)
     {
@@ -92,7 +109,7 @@ unsigned int NVText::porter_stemmer_measure(NVStrings& strs, const char* vowels,
 
     // do the measure
     thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator<unsigned int>(0), count,
-        porter_stemmer_measure_fn{d_strings,d_vowels,char_y,d_results});
+        porter_stemmer_measure_fn{d_vowels,char_y,d_strings,d_results});
 
     // done
     if( !bdevmem )
@@ -102,4 +119,74 @@ unsigned int NVText::porter_stemmer_measure(NVStrings& strs, const char* vowels,
     }
     RMM_FREE(d_vowels,0);
     return 0;
-}
+}
+
+// Shared implementation for both NVText::is_letter overloads: writes one bool per string saying whether the character at the chosen position matches ltype (vowel or consonant). Always returns 0.
+unsigned int is_letter(NVStrings& strs, const char* vowels, const char* y_char,
+                       NVText::letter_type ltype, int index, int* d_indices, bool* results, bool bdevmem ) // index applies to every string unless d_indices is non-null
+{
+    unsigned int count = strs.size();
+    if( count==0 )
+        return 0; // nothing to do
+    auto execpol = rmm::exec_policy(0);
+    // setup results vector
+    bool* d_results = results; // written directly when bdevmem is true
+    if( !bdevmem )
+        d_results = device_alloc<bool>(count,0); // temporary buffer; copied to results and freed at the end
+    if( vowels==nullptr )
+        vowels = "aeiou"; // default vowel set
+    custring_view* d_vowels = custring_from_host(vowels); // released with RMM_FREE before returning
+    if( y_char==nullptr )
+        y_char = "y"; // default y character
+    Char char_y;
+    custring_view::char_to_Char(y_char,char_y);
+
+    // get the string pointers
+    rmm::device_vector<custring_view*> strings(count,nullptr);
+    custring_view** d_strings = strings.data().get();
+    strs.create_custring_index(d_strings);
+
+    // classify one character per string; the lambda result for idx is stored in d_results[idx]
+    stemmer_base_fn pfn{d_vowels,char_y}; // captured by value in the lambda below
+    thrust::transform(execpol->on(0),
+        thrust::make_counting_iterator<unsigned int>(0),
+        thrust::make_counting_iterator<unsigned int>(count),
+        d_results,
+        [d_strings, pfn, ltype, index, d_indices] __device__ (unsigned int idx) {
+            custring_view* d_str = d_strings[idx];
+            if( !d_str )
+                return false; // null string entry
+            int position = index;
+            if( d_indices )
+                position = d_indices[idx]; // per-string position overrides index
+            int length = static_cast<int>(d_str->length());
+            if( (position >= length) || (position < -length) )
+                return false; // out of range; always taken when length is 0, so the % below never divides by zero
+            position = (position + length) % length; // handles positive or negative index
+            return pfn.is_consonant(d_str,position) ? ltype==NVText::consonant : ltype==NVText::vowel; // true only when the character's class equals ltype
+        });
+
+    // done
+    if( !bdevmem )
+    {
+        CUDA_TRY( cudaMemcpyAsync(results,d_results,count*sizeof(bool),cudaMemcpyDeviceToHost)) // hand the flags back through the caller's results pointer
+        RMM_FREE(d_results,0);
+    }
+    RMM_FREE(d_vowels,0);
+    return 0;
+}
+
+
+// Checks whether the character at the same position in every string is a vowel or a consonant (selected by ltype); forwards to ::is_letter with no per-string index array.
+unsigned int NVText::is_letter(NVStrings& strs, const char* vowels, const char* y_char,
+                               NVText::letter_type ltype, int position, bool* results, bool bdevmem )
+{
+    return ::is_letter(strs,vowels,y_char,ltype,position,nullptr,results,bdevmem);
+}
+
+// Same check, but the character position for each string is read from d_indices; forwards to ::is_letter with index 0, which is only used when d_indices is null.
+unsigned int NVText::is_letter(NVStrings& strs, const char* vowels, const char* y_char,
+                               NVText::letter_type ltype, int* d_indices, bool* results, bool bdevmem )
+{
+    return ::is_letter(strs,vowels,y_char,ltype,0,d_indices,results,bdevmem);
+}
diff --git a/cpp/docs/TRANSITIONGUIDE.md b/cpp/docs/TRANSITIONGUIDE.md
@@ -56,7 +56,30 @@ rmm::device_buffer custom_buff(100, &mr); // Allocates 100 bytes from the custom
 
 ## `cudf::scalar`
 
-// TODO
+A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types currently supported by cudf. Each type of value is represented by a separate scalar class, all of which derive from `cudf::scalar`. For example, a `numeric_scalar` holds a single numerical value and a `string_scalar` holds a single string. The data for the stored value resides in device memory.
+
+|Value type|Scalar class|Notes|
+|-|-|-|
+|numeric|`numeric_scalar<T>` where `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`||
+|timestamp|`timestamp_scalar<T>` where `T` can be `timestamp_D`, `timestamp_s`...||
+|string|`string_scalar`|This class object is immutable|
+
+### Construction
+`scalar`s can be created using either their respective constructors or using factory functions like `make_numeric_scalar()`, `make_timestamp_scalar()` or `make_string_scalar()`. 
+
+### Casting
+All the factory methods return a `unique_ptr<scalar>` which needs to be statically downcast to its respective scalar class type before accessing its value. Their validity (nullness) can be accessed without casting.
+Generally, the value would need to be accessed from a function that is aware of the value type, e.g. a functor that is dispatched from `type_dispatcher`. To cast to the requisite scalar class type given the value type, use the mapping utility `scalar_type_t` provided in `type_dispatcher.hpp`:
+```c++
+//unique_ptr<scalar> s = make_numeric_scalar(...);
+
+using ScalarType = cudf::experimental::scalar_type_t<T>;
+// ScalarType is now numeric_scalar<T>
+auto s1 = static_cast<ScalarType *>(s.get());
+```
+
+### Passing to device
+Each scalar type has a corresponding non-owning device view class which allows access to the value and its validity from the device. This can be obtained using the function `get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects.
 
 ## `cudf::column`
 
@@ -240,6 +263,8 @@ The preferred style for how inputs are passed in and outputs are returned is the
 		- `column_view const&`
 	- Tables:
 		- `table_view const&`
+    - Scalars:
+        - `scalar const&`
     - Everything else:
        - Trivial or inexpensively copied types
           - Pass by value
@@ -258,6 +283,8 @@ The preferred style for how inputs are passed in and outputs are returned is the
 		- `std::unique_ptr<column>`
 	- Tables:
 		- `std::unique_ptr<table>`
+    - Scalars:
+        - `std::unique_ptr<scalar>`
 
 
 ### Multiple Return Values

diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
@@ -73,8 +73,8 @@ __global__ void gather_bitmask_kernel(table_device_view source_table,
         size_type destination_row = destination_row_base + threadIdx.x;
 
         const bool thread_active = destination_row < destination_col.size();
-        size_type source_row =
-          thread_active ? gather_map[destination_row] : 0;
+        size_type source_row = thread_active ?
+          static_cast<size_type>(gather_map[destination_row]) : 0;
 
         bool source_bit_is_valid = source_col.has_nulls()
           ? source_col.is_valid_nocheck(source_row)

diff --git a/cpp/include/cudf/predicates.hpp → cpp/include/cudf/legacy/predicates.hpp b/cpp/include/cudf/predicates.hpp → cpp/include/cudf/legacy/predicates.hpp
diff --git a/cpp/include/cudf/rolling.hpp → cpp/include/cudf/legacy/rolling.hpp b/cpp/include/cudf/rolling.hpp → cpp/include/cudf/legacy/rolling.hpp
@@ -17,8 +17,6 @@
 #ifndef ROLLING_HPP
 #define ROLLING_HPP
 
-#include "cudf.h"
-
 namespace cudf {
 /* --------------------------------------------------------------------------*
  * @brief  Computes the rolling window function of the values in a column.