Skip to content

Commit

Permalink
fix merge conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Jan 17, 2020
2 parents 572a5c9 + fc5ec20 commit 74dcfe4
Show file tree
Hide file tree
Showing 23 changed files with 634 additions and 77 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
- PR #3667 Define and implement round-robin partition API.
- PR #3690 Add bools_to_mask
- PR #3683 Added support for multiple delimiters in `nvtext.token_count()`
- PR #3792 Adding is_nan and is_not_nan

## Improvements

- PR #3292 Port NVStrings regex contains function
- PR #3409 Port NVStrings regex replace function
- PR #3417 Port NVStrings regex findall function
- PR #3351 Add warning when filepath resolves to multiple files in cudf readers
- PR #3370 Port NVStrings strip functions
- PR #3453 Port NVStrings IPv4 convert functions to cudf strings column
Expand All @@ -45,6 +47,7 @@
- PR #3640 Enable memory_usage in dask_cudf (also adds pd.Index from_pandas)
- PR #3654 Update Jitify submodule ref to include gcc-8 fix
- PR #3639 Define and implement `nans_to_nulls`
- PR #3561 Rework contains implementation in search
- PR #3616 Add aggregation infrastructure for argmax/argmin.
- PR #3699 Stringify libcudacxx headers for binary op JIT
- PR #3697 Improve column insert performance for wide frames
Expand All @@ -53,6 +56,7 @@
- PR #3657 Define and implement compiled binops for string column comparisons
- PR #3520 Change read_parquet defaults and add warnings
- PR #3780 Java APIs for selecting a GPU
- PR #3805 Avoid CuPy 7.1.0 for now

## Bug Fixes

Expand Down Expand Up @@ -91,6 +95,8 @@
- PR #3783 Bind cuDF operators to Dask Dataframe
- PR #3775 Fix segfault when reading compressed CSV files larger than 4GB
- PR #3803 Keep name when unpickling Index objects
- PR #3804 Fix cuda crash in AVRO reader
- PR #3766 Remove references to cudf::type_id::CATEGORY from IO code


# cuDF 0.11.0 (11 Dec 2019)
Expand Down
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ source activate gdf
conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
"dask>=2.1.0" "distributed>=2.1.0" "numpy>=1.16" "double-conversion" \
"rapidjson" "flatbuffers" "boost-cpp" "fsspec>=0.3.3" "dlpack" \
"feather-format" "cupy>=6.6.0,<8.0.0a0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \
"feather-format" "cupy>=6.6.0,<8.0.0a0,!=7.1.0" "arrow-cpp=0.15.0" "pyarrow=0.15.0" \
"fastavro>=0.22.0" "pandas>=0.25,<0.26" "hypothesis" "s3fs" "gcsfs" \
"boto3" "moto" "httpretty" "streamz"

Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ channels:
- conda-forge
- defaults
dependencies:
- cupy>=6.6.0,<8.0.0a0
- cupy>=6.6.0,<8.0.0a0,!=7.1.0
- rmm=0.12.*
- cmake>=3.12
- cmake_setuptools>=0.1.3
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ channels:
- conda-forge
- defaults
dependencies:
- cupy>=6.6.0,<8.0.0a0
- cupy>=6.6.0,<8.0.0a0,!=7.1.0
- rmm=0.12.*
- cmake>=3.12
- cmake_setuptools>=0.1.3
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda9.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ channels:
- conda-forge
- defaults
dependencies:
- cupy>=6.6.0,<8.0.0a0
- cupy>=6.6.0,<8.0.0a0,!=7.1.0
- rmm=0.12.*
- cmake>=3.12
- cmake_setuptools>=0.1.3
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ requirements:
run:
- python
- pandas>=0.24.2,<0.25
- cupy >=6.6.0,<8.0.0a0
- cupy >=6.6.0,<8.0.0a0,!=7.1.0
- numba >=0.46.0
- pyarrow 0.15.0.*
- fastavro >=0.22.0
Expand Down
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ add_library(cudf
src/merge/legacy/merge.cu
src/unary/cast_ops.cu
src/unary/null_ops.cu
src/unary/nan_ops.cu
src/unary/legacy/math_ops.cu
src/unary/legacy/cast_ops.cu
src/unary/legacy/null_ops.cu
Expand Down Expand Up @@ -576,6 +577,7 @@ add_library(cudf
src/strings/copying/copying.cu
src/strings/extract.cu
src/strings/find.cu
src/strings/findall.cu
src/strings/find_multiple.cu
src/strings/filling/fill.cu
src/strings/padding.cu
Expand Down
45 changes: 45 additions & 0 deletions cpp/include/cudf/strings/findall.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>

namespace cudf
{
namespace strings
{

/**
* @brief Returns a table of strings columns for each matching occurrence of the
* regex pattern within each string.
*
* The number of output columns is determined by the string with the most
* matches.
*
* Any null string entries return corresponding null output column entries.
*
* NOTE(review): how rows with fewer matches than the number of output columns
* are filled is not stated here -- presumably with null entries; confirm
* against the implementation.
*
* @param strings Strings instance for this operation.
* @param pattern Regex pattern to match within each string.
* @param mr Resource for allocating device memory. Defaults to
*           `rmm::mr::get_default_resource()`.
* @return New table of strings columns.
*/
std::unique_ptr<experimental::table> findall_re( strings_column_view const& strings,
std::string const& pattern,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

} // namespace strings
} // namespace cudf
36 changes: 35 additions & 1 deletion cpp/include/cudf/unary.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2019, NVIDIA CORPORATION.
* Copyright (c) 2018-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -93,5 +93,39 @@ std::unique_ptr<cudf::column> is_valid(cudf::column_view const& input,
std::unique_ptr<column> cast(column_view const& input, data_type out_type,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Creates a column of `BOOL8` elements indicating the presence of `NaN` values
* in a column of floating point values.
*
* The output element at row `i` is `true` if the element in `input` at row `i` is `NaN`,
* else `false`.
*
* NOTE(review): the result for null rows of `input` is not stated here; confirm
* against the implementation.
*
* @throws cudf::logic_error if `input` is a non-floating point type
*
* @param input A column of floating-point elements
* @param mr Optional, the resource to use for allocating the device memory in the
*           returned column. Defaults to `rmm::mr::get_default_resource()`.
*
* @returns unique_ptr<column> A non-nullable column of `BOOL8` elements with `true`
* representing `NaN` values
*/
std::unique_ptr<column> is_nan(cudf::column_view const& input,
rmm::mr::device_memory_resource* mr =
rmm::mr::get_default_resource());

/**
* @brief Creates a column of `BOOL8` elements indicating the absence of `NaN` values
* in a column of floating point values.
*
* The output element at row `i` is `false` if the element in `input` at row `i` is `NaN`,
* else `true`.
*
* NOTE(review): the result for null rows of `input` is not stated here; confirm
* against the implementation.
*
* @throws cudf::logic_error if `input` is a non-floating point type
*
* @param input A column of floating-point elements
* @param mr Optional, the resource to use for allocating the device memory in the
*           returned column. Defaults to `rmm::mr::get_default_resource()`.
*
* @returns unique_ptr<column> A non-nullable column of `BOOL8` elements with `false`
* representing `NaN` values
*/
std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
rmm::mr::device_memory_resource* mr =
rmm::mr::get_default_resource());

} // namespace experimental
} // namespace cudf
2 changes: 1 addition & 1 deletion cpp/src/io/avro/legacy/avro_reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ void reader::Impl::decode_data(
static_cast<block_desc_s *>(block_list.data()), schema_desc.device_ptr(),
reinterpret_cast<gpu::nvstrdesc_s *>(global_dictionary.device_ptr()),
static_cast<const uint8_t *>(block_data.data()),
static_cast<uint32_t>(block_list.size()),
static_cast<uint32_t>(md_->block_list.size()),
static_cast<uint32_t>(schema_desc.size()),
static_cast<uint32_t>(total_dictionary_entries), md_->num_rows,
md_->skip_rows, min_row_data_size, 0));
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ void reader::impl::decode_data(
static_cast<block_desc_s *>(block_list.data()), schema_desc.device_ptr(),
reinterpret_cast<gpu::nvstrdesc_s *>(global_dictionary.device_ptr()),
static_cast<const uint8_t *>(block_data.data()),
static_cast<uint32_t>(block_list.size()),
static_cast<uint32_t>(_metadata->block_list.size()),
static_cast<uint32_t>(schema_desc.size()),
static_cast<uint32_t>(total_dictionary_entries), _metadata->num_rows,
_metadata->skip_rows, min_row_data_size, stream));
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -679,8 +679,7 @@ __global__ void convertCsvToGdf(const char *raw_csv, const ParseOptions opts,

// Modify start & end to ignore whitespace and quotechars
long tempPos = pos - 1;
if (!is_na && dtype[actual_col].id() != cudf::type_id::CATEGORY &&
dtype[actual_col].id() != cudf::type_id::STRING) {
if (!is_na && dtype[actual_col].id() != cudf::type_id::STRING) {
trim_field_start_end(raw_csv, &start, &tempPos, opts.quotechar);
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ data_type convertStringToDtype(const std::string &dtype) {
return data_type(cudf::type_id::TIMESTAMP_MICROSECONDS);
if (dtype == "timestamp[ns]")
return data_type(cudf::type_id::TIMESTAMP_NANOSECONDS);
if (dtype == "category") return data_type(cudf::type_id::CATEGORY);
if (dtype == "category") return data_type(cudf::type_id::INT32);
if (dtype == "date32") return data_type(cudf::type_id::TIMESTAMP_DAYS);
if (dtype == "bool" || dtype == "boolean")
return data_type(cudf::type_id::BOOL8);
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,6 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) {
case cudf::type_id::TIMESTAMP_MILLISECONDS:
case cudf::type_id::TIMESTAMP_NANOSECONDS:
return TypeKind::TIMESTAMP;
case cudf::type_id::CATEGORY:
return TypeKind::INT;
case cudf::type_id::STRING:
return TypeKind::STRING;
default:
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ constexpr type_id to_type_id(parquet::Type physical,
case parquet::BYTE_ARRAY:
case parquet::FIXED_LEN_BYTE_ARRAY:
// Can be mapped to INT32 (32-bit hash) or STRING (nvstring)
return strings_to_categorical ? type_id::CATEGORY : type_id::STRING;
return strings_to_categorical ? type_id::INT32 : type_id::STRING;
case parquet::INT96:
return (timestamp_type_id != type_id::EMPTY)
? timestamp_type_id
Expand Down Expand Up @@ -146,7 +146,7 @@ std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
type_width = 1; // I32 -> I8
} else if (column_type_id == type_id::INT16) {
type_width = 2; // I32 -> I16
} else if (column_type_id == type_id::CATEGORY) {
} else if (column_type_id == type_id::INT32) {
type_width = 4; // str -> hash32
} else if (is_timestamp(data_type{column_type_id})) {
clock_rate = to_clockrate(timestamp_type_id);
Expand Down
1 change: 0 additions & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ class parquet_column_view {
_stats_dtype = statistics_dtype::dtype_int16;
break;
case cudf::type_id::INT32:
case cudf::type_id::CATEGORY:
_physical_type = Type::INT32;
_stats_dtype = statistics_dtype::dtype_int32;
break;
Expand Down
70 changes: 14 additions & 56 deletions cpp/src/search/search.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/scalar/scalar_device_view.cuh>

#include <hash/unordered_multiset.cuh>
#include <cudf/detail/iterator.cuh>

#include <rmm/thrust_rmm_allocator.h>
#include <strings/utilities.hpp>
Expand Down Expand Up @@ -122,72 +124,28 @@ std::unique_ptr<column> search_ordered(table_view const& t,
return result;
}

template <typename Element, bool nullable = true>
struct compare_with_value{
compare_with_value(column_device_view c, Element val, bool val_is_valid, bool nulls_are_equal)

: col{c}, value{val}, val_is_valid{val_is_valid}, nulls_are_equal{nulls_are_equal} {}

__device__ bool operator()(size_type i) noexcept {
if (nullable) {
bool const col_is_null{col.nullable() and col.is_null(i)};
if (col_is_null and not val_is_valid)
return nulls_are_equal;
else if (col_is_null == val_is_valid)
return false;
}

return equality_compare<Element>(col.element<Element>(i), value);
}

column_device_view col;
Element value;
bool val_is_valid;
bool nulls_are_equal;
};

template <typename Element>
void populate_element(scalar const& value, Element &e) {
using ScalarType = cudf::experimental::scalar_type_t<Element>;
auto s1 = static_cast<const ScalarType *>(&value);

e = s1->value();
}

template <>
void populate_element<string_view>(scalar const& value, string_view &e) {
using ScalarType = cudf::experimental::scalar_type_t<string_view>;
auto s1 = static_cast<const ScalarType *>(&value);

e = string_view{s1->data(), s1->size()};
}

struct contains_scalar_dispatch {
template <typename Element>
bool operator()(column_view const& col, scalar const& value,
cudaStream_t stream,
rmm::mr::device_memory_resource *mr) {
cudaStream_t stream, rmm::mr::device_memory_resource *mr) {

using ScalarType = cudf::experimental::scalar_type_t<Element>;
auto d_col = column_device_view::create(col, stream);
auto data_it = thrust::make_counting_iterator<size_type>(0);

bool element_is_valid{value.is_valid()};
Element element;

populate_element(value, element);
auto s = static_cast<const ScalarType *>(&value);

if (col.has_nulls()) {
auto eq_op = compare_with_value<Element, true>(*d_col, element, element_is_valid, true);
auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream),
d_col->pair_begin<Element, true>(),
d_col->pair_end<Element, true>(),
thrust::make_pair(s->value(), true));

return thrust::any_of(rmm::exec_policy(stream)->on(stream),
data_it, data_it + col.size(),
eq_op);
return found_iter != d_col->pair_end<Element, true>();
} else {
auto eq_op = compare_with_value<Element, false>(*d_col, element, element_is_valid, true);
auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream),
d_col->begin<Element>(),
d_col->end<Element>(), s->value());

return thrust::any_of(rmm::exec_policy(stream)->on(stream),
data_it, data_it + col.size(),
eq_op);
return found_iter != d_col->end<Element>();
}
}
};
Expand Down
Loading

0 comments on commit 74dcfe4

Please sign in to comment.