diff --git a/CHANGELOG.md b/CHANGELOG.md
index 013356797bc..2791ee068e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,10 @@
 - PR #525 Added GitHub Issue templates for bugs, documentation, new features, and questions
 - PR #455 CSV Reader: Add support for user-specified decimal point and thousands separator
 - PR #439 add `DataFrame.drop` method similar to pandas
+- PR #350 Implemented Series replace function
 - PR #490 Added print_env.sh script to gather relevant environment details when reporting cuDF issues.
 - PR #474 add ZLIB-based GZIP/ZIP support to `read_csv()`
 
-
 ## Improvements
 
 - PR #472 RMM: Created centralized rmm::device_vector alias and rmm::exec_policy
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0b1f6f341d6..6bb88bc394d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -164,6 +164,7 @@ add_library(cudf SHARED
             src/hash/hash_ops.cu
             src/quantiles/quantiles.cu
             src/reductions/reductions.cu
+            src/replace/replace.cu
             src/reductions/scan.cu
             src/unary/unary_ops.cu
             # src/windowed/windowed_ops.cu ... this is broken
diff --git a/cpp/include/cudf/functions.h b/cpp/include/cudf/functions.h
index 014d87124e4..a6a76befc1b 100644
--- a/cpp/include/cudf/functions.h
+++ b/cpp/include/cudf/functions.h
@@ -907,9 +907,23 @@ gdf_error gdf_quantile_aprrox( gdf_column* col_in, //input column with 0
                                void* t_erased_res, //type-erased result of same type as column;
                                gdf_context* ctxt); //context info
 
+/* --------------------------------------------------------------------------*
+ * @brief Replace elements from `col` according to the mapping `old_values` to
+ *        `new_values`, that is, replace all `old_values[i]` present in `col`
+ *        with `new_values[i]`. The replacement is written back into `col`.
+ *
+ * @Param[in,out] col gdf_column with the data to be modified
+ * @Param[in] old_values gdf_column with the old values to be replaced
+ * @Param[in] new_values gdf_column with the new values; entry `i` pairs with `old_values[i]`
+ *
+ * @Returns GDF_SUCCESS upon successful completion
+ * @note The implementation in cpp/src/replace/replace.cu checks (GDF_REQUIRE) for non-null arguments, equal `old_values`/`new_values` sizes, one common dtype and columns without null entries
+ * --------------------------------------------------------------------------*/
+gdf_error gdf_find_and_replace_all(gdf_column*       col,
+                                   const gdf_column* old_values,
+                                   const gdf_column* new_values);
 
-/* --------------------------------------------------------------------------*/
-/**
+/* --------------------------------------------------------------------------*
  * @brief Sorts an array of gdf_column.
  *
  * @Param[in] input_columns Array of gdf_columns
@@ -923,8 +937,8 @@ gdf_error gdf_quantile_aprrox( gdf_column* col_in, //input column with 0
  * indices
  *
  * @Returns GDF_SUCCESS upon successful completion
- */
-/* ----------------------------------------------------------------------------*/
+ *
+ * ----------------------------------------------------------------------------*/
 gdf_error gdf_order_by(gdf_column** input_columns,
                        int8_t* asc_desc,
                        size_t num_inputs,
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
new file mode 100644
index 00000000000..1f31e3f190d
--- /dev/null
+++ b/cpp/src/replace/replace.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ *     Copyright 2018 Cristhian Alberto Gonzales Castillo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/find.h>
+
+#include "cudf.h"
+#include "utilities/error_utils.h"
+#include "utilities/type_dispatcher.hpp"
+
+namespace{ //anonymous
+
+  constexpr int BLOCK_SIZE = 256;  // threads per block used for the kernel launch in this file
+
+  /* --------------------------------------------------------------------------*/
+  /**
+   * @brief Kernel that replaces elements from `d_col_data` given the following
+   *        rule: replace all `old_values[i]` in [`old_values_begin`, `old_values_end`)
+   *        present in `d_col_data` with `d_new_values[i]`.
+   *
+   * @Param[in,out] d_col_data Device array with the data to be modified
+   * @Param[in] nrows # rows in `d_col_data`
+   * @Param[in] old_values_begin Device pointer to the beginning of the sequence
+   *                             of old values to be replaced
+   * @Param[in] old_values_end Device pointer to the end of the sequence
+   *                           of old values to be replaced
+   * @Param[in] d_new_values Device array with the new values
+   *
+   * @Returns Nothing; matching elements of `d_col_data` are overwritten in place
+   */
+  /* ----------------------------------------------------------------------------*/
+  template <class T>
+  __global__
+  void replace_kernel(T* d_col_data,
+                      size_t nrows,
+                      thrust::device_ptr<const T> old_values_begin,
+                      thrust::device_ptr<const T> old_values_end,
+                      const T* d_new_values)
+  {
+    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    while(i < nrows)  // each thread advances by the total number of threads in the grid
+    {
+      auto found_ptr = thrust::find(thrust::seq, old_values_begin, old_values_end, d_col_data[i]);
+
+      if (found_ptr != old_values_end) {  // the row holds one of the old values
+        auto d = thrust::distance(old_values_begin, found_ptr);  // position of the match in the mapping
+        d_col_data[i] = d_new_values[d];
+      }
+
+      i += blockDim.x * gridDim.x;
+    }
+  }
+
+  /* --------------------------------------------------------------------------*/
+  /**
+   * @brief Functor called by the `type_dispatcher` in order to invoke and instantiate
+   *        `replace_kernel` with the appropriate data types.
+   */
+  /* ----------------------------------------------------------------------------*/
+  struct replace_kernel_forwarder {
+    template <typename col_type>
+    void operator()(void*       d_col_data,
+                    size_t      nrows,
+                    const void* d_old_values,
+                    const void* d_new_values,
+                    size_t      nvalues)
+    {
+      thrust::device_ptr<const col_type> old_values_begin = thrust::device_pointer_cast(static_cast<const col_type*>(d_old_values));
+
+      const size_t grid_size = nrows / BLOCK_SIZE + (nrows % BLOCK_SIZE != 0);  // ceil(nrows / BLOCK_SIZE)
+      replace_kernel<<<grid_size, BLOCK_SIZE>>>(static_cast<col_type*>(d_col_data),
+                                                nrows,
+                                                old_values_begin,
+                                                old_values_begin + nvalues,
+                                                static_cast<const col_type*>(d_new_values));
+    }
+  };
+  // Validates the three columns, then dispatches on `col->dtype` so that `replace_kernel` rewrites `col->data` in place.
+  gdf_error find_and_replace_all(gdf_column*       col,
+                                 const gdf_column* old_values,
+                                 const gdf_column* new_values)
+  {
+    GDF_REQUIRE(col != nullptr && old_values != nullptr && new_values != nullptr, GDF_DATASET_EMPTY);
+    GDF_REQUIRE(old_values->size == new_values->size, GDF_COLUMN_SIZE_MISMATCH);
+    GDF_REQUIRE(col->dtype == old_values->dtype && col->dtype == new_values->dtype, GDF_DTYPE_MISMATCH);
+    GDF_REQUIRE(col->valid == nullptr || col->null_count == 0, GDF_VALIDITY_UNSUPPORTED);
+    GDF_REQUIRE(old_values->valid == nullptr || old_values->null_count == 0, GDF_VALIDITY_UNSUPPORTED);
+    GDF_REQUIRE(new_values->valid == nullptr || new_values->null_count == 0, GDF_VALIDITY_UNSUPPORTED);
+    // An empty column or an empty mapping leaves nothing to replace; returning here also avoids launching a grid of zero blocks.
+    if (col->size == 0 || old_values->size == 0) { return GDF_SUCCESS; }
+    cudf::type_dispatcher(col->dtype, replace_kernel_forwarder{},
+                          col->data,
+                          col->size,
+                          old_values->data,
+                          new_values->data,
+                          new_values->size);
+
+    return GDF_SUCCESS;
+  }
+
+} //end anonymous namespace
+
+/* --------------------------------------------------------------------------*/
+/**
+ * @brief Replace elements from `col` according to the mapping `old_values` to
+ *        `new_values`, that is, replace all `old_values[i]` present in `col`
+ *        with `new_values[i]`.
+ *
+ * @Param[in,out] col gdf_column with the data to be modified (overwritten in place)
+ * @Param[in] old_values gdf_column with the old values to be replaced
+ * @Param[in] new_values gdf_column with the new values
+ *
+ * @Returns GDF_SUCCESS upon successful completion; the value is whatever find_and_replace_all() returns
+ */
+/* ----------------------------------------------------------------------------*/
+gdf_error gdf_find_and_replace_all(gdf_column*       col,
+                                   const gdf_column* old_values,
+                                   const gdf_column* new_values)
+{
+  return find_and_replace_all(col, old_values, new_values);  // public entry point forwards to the file-local implementation
+}
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 70660892cb9..f6b0d7c70b7 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -135,6 +135,14 @@ set(QUANTILES_TEST_SRC
 
 ConfigureTest(QUANTILES_TEST "${QUANTILES_TEST_SRC}")
 
+###################################################################################################
+# - replace tests ---------------------------------------------------------------------------------
+
+set(REPLACE_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/replace/replace_tests.cu")
+
+ConfigureTest(REPLACE_TEST "${REPLACE_TEST_SRC}")
+
 ###################################################################################################
 # - unary tests -----------------------------------------------------------------------------------
 
diff --git a/cpp/tests/replace/replace_tests.cu b/cpp/tests/replace/replace_tests.cu
new file mode 100644
index 00000000000..d95b225ebb8
--- /dev/null
+++ b/cpp/tests/replace/replace_tests.cu
@@ -0,0 +1,315 @@
+/*
+ * Copyright 2018 BlazingDB, Inc.
+ *     Copyright 2018 Cristhian Alberto Gonzales Castillo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include "gtest/gtest.h"
+
+#include <initializer_list>
+#include <vector>
+
+#include <cudf.h>
+
+#include "tests/utilities/cudf_test_fixtures.h"
+#include "tests/utilities/cudf_test_utils.cuh"
+
+// Main test fixture: owns the host vectors and the matching gdf columns used by every test
+template <typename T>
+struct ReplaceTest : public GdfTest
+{
+  std::vector<T> replace_column;
+  std::vector<T> old_values_column;
+  std::vector<T> new_values_column;
+
+  gdf_col_pointer gdf_replace_column;
+  gdf_col_pointer gdf_old_values_column;
+  gdf_col_pointer gdf_new_values_column;
+
+  gdf_column* gdf_raw_replace_column;
+  gdf_column* gdf_raw_old_values_column;
+  gdf_column* gdf_raw_new_values_column;
+
+  ReplaceTest()
+  {
+    // Use constant seed so the pseudo-random order is the same each time
+    // Each time the class is constructed a new constant seed is used
+    static size_t number_of_instantiations{0};
+    std::srand(number_of_instantiations++);
+  }
+
+  ~ReplaceTest()
+  {
+  }
+
+  /* --------------------------------------------------------------------------*
+   * @brief Initializes the input columns with the given values.
+   *
+   * @Param replace_column_list The original values
+   * @Param old_values_column_list The values that will be replaced
+   * @Param new_values_column_list The new values
+   * @Param print Optionally print the set of columns for debug
+   * -------------------------------------------------------------------------*/
+  void create_input(const std::initializer_list<T> &replace_column_list,
+                    const std::initializer_list<T> &old_values_column_list,
+                    const std::initializer_list<T> &new_values_column_list,
+                    bool print = false)
+  {
+    replace_column    = replace_column_list;
+    old_values_column = old_values_column_list;
+    new_values_column = new_values_column_list;
+
+    gdf_replace_column    = create_gdf_column(replace_column);
+    gdf_old_values_column = create_gdf_column(old_values_column);
+    gdf_new_values_column = create_gdf_column(new_values_column);
+
+    gdf_raw_replace_column    = gdf_replace_column.get();
+    gdf_raw_old_values_column = gdf_old_values_column.get();
+    gdf_raw_new_values_column = gdf_new_values_column.get();
+
+    if(print)
+    {
+      std::cout << "replace column(s) created. Size: " << replace_column.size() << std::endl;
+      print_vector(replace_column);
+      std::cout << "\n";
+    }
+  }
+
+  /* --------------------------------------------------------------------------*/
+  /**
+   * @brief Computes a reference solution
+   *
+   * @Param print Option to print the solution for debug
+   *
+   * @Returns A vector of 'T' with the old values replaced
+   */
+  /* ----------------------------------------------------------------------------*/
+  std::vector<T> compute_reference_solution(bool print = false)
+  {
+    std::vector<T> reference_result(replace_column);
+    std::vector<bool> isReplaced(reference_result.size(), false);  // an element is replaced at most once
+
+    for(size_t i = 0; i < old_values_column.size(); i++)
+    {
+      size_t k = 0;  // index of the element the predicate is currently looking at
+      auto pred = [&, this] (T element) {
+        bool toBeReplaced = false;
+        if(!isReplaced[k])
+        {
+          toBeReplaced = (element == this->old_values_column[i]);
+          isReplaced[k] = toBeReplaced;
+        }
+
+        ++k;
+        return toBeReplaced;
+      };
+      std::replace_if(reference_result.begin(), reference_result.end(), pred, new_values_column[i]);
+    }
+
+    if(print)
+    {
+      std::cout << "Reference result size: " << reference_result.size() << std::endl;
+      print_vector(reference_result);
+      std::cout << "\n";
+    }
+
+    return reference_result;
+  }
+
+  /* --------------------------------------------------------------------------*/
+  /**
+   * @brief Replaces the values in a column given a map of old values to be replaced
+   *        and new values with the libgdf functions
+   *
+   * @Param print Option to print the result computed by the libgdf function
+   * @Param expected_result Status the libgdf call is expected to return
+   * @Returns A vector of 'T' with the old values replaced (empty when an error status is expected)
+   */
+  /* ----------------------------------------------------------------------------*/
+  std::vector<T> compute_gdf_result(bool print = false, gdf_error expected_result = GDF_SUCCESS)
+  {
+    const gdf_error status = gdf_find_and_replace_all(gdf_raw_replace_column, gdf_raw_old_values_column, gdf_raw_new_values_column);
+
+    // Check the status actually returned by the library call; comparing a
+    // locally initialised constant would let every failure go unnoticed
+    EXPECT_EQ(expected_result, status) << "gdf_find_and_replace_all did not return the expected status";
+
+    // If the expected result was not GDF_SUCCESS, then this test was testing for a
+    // specific error condition, in which case we return immediately and do not do
+    // any further work on the output
+    if(GDF_SUCCESS != expected_result){
+      return std::vector<T>();
+    }
+
+    size_t output_size = gdf_raw_replace_column->size;
+    std::vector<T> host_result(output_size);
+
+    EXPECT_EQ(cudaMemcpy(host_result.data(),
+       gdf_raw_replace_column->data, output_size * sizeof(T), cudaMemcpyDeviceToHost), cudaSuccess);
+
+    if(print){
+      std::cout << "GDF result size: " << host_result.size() << std::endl;
+      print_vector(host_result);
+      std::cout << "\n";
+    }
+
+    return host_result;
+  }
+};
+
+using Types = testing::Types<int8_t,   // NOTE(review): six-entry list reconstructed after extraction loss - confirm the element types
+                             int16_t,
+                             int32_t,
+                             int64_t,
+                             float,
+                             double>;
+
+TYPED_TEST_CASE(ReplaceTest, Types);
+
+// This test is used for debugging purposes and is disabled by default.
+// The input sizes are small and a large amount of debug printing is enabled.
+TYPED_TEST(ReplaceTest, DISABLED_DebugTest)
+{
+  this->create_input({7, 5, 6, 3, 1, 2, 8, 4}, {2, 6, 4, 8}, {0, 4, 2, 6}, true);
+
+  auto reference_result = this->compute_reference_solution(true);
+  auto gdf_result = this->compute_gdf_result(true);
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+
+// Simple test, replacing all even values
+TYPED_TEST(ReplaceTest, ReplaceEvenPosition)
+{
+  this->create_input({1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8}, {0, 2, 4, 6});
+
+  auto reference_result = this->compute_reference_solution();
+  auto gdf_result = this->compute_gdf_result();
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+// Similar test as ReplaceEvenPosition, but with unordered data
+TYPED_TEST(ReplaceTest, Unordered)
+{
+  this->create_input({7, 5, 6, 3, 1, 2, 8, 4}, {2, 6, 4, 8}, {0, 4, 2, 6});
+
+  auto reference_result = this->compute_reference_solution();
+  auto gdf_result = this->compute_gdf_result();
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+// Testing with Empty Replace
+TYPED_TEST(ReplaceTest, EmptyReplace)
+{
+  this->create_input({7, 5, 6, 3, 1, 2, 8, 4}, {}, {});
+
+  auto reference_result = this->compute_reference_solution();
+  auto gdf_result = this->compute_gdf_result();
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+// Testing with Nothing To Replace
+TYPED_TEST(ReplaceTest, NothingToReplace)
+{
+  this->create_input({7, 5, 6, 3, 1, 2, 8, 4}, {10, 11, 12}, {15, 16, 17});
+
+  auto reference_result = this->compute_reference_solution();
+  auto gdf_result = this->compute_gdf_result();
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+// Testing with Empty Data
+TYPED_TEST(ReplaceTest, EmptyData)
+{
+  this->create_input({}, {10, 11, 12}, {15, 16, 17});
+
+  auto reference_result = this->compute_reference_solution();
+  auto gdf_result = this->compute_gdf_result();
+
+  ASSERT_EQ(reference_result.size(), gdf_result.size()) << "Size of gdf result does not match reference result\n";
+  // Compare the GDF and reference solutions
+  for(size_t i = 0; i < reference_result.size(); ++i){
+    EXPECT_EQ(reference_result[i], gdf_result[i]);
+  }
+}
+
+// Test with much larger data sets
+TYPED_TEST(ReplaceTest, LargeScaleReplaceTest)
+{
+  const size_t DATA_SIZE    = 1000000;
+  const size_t REPLACE_SIZE = 10000;
+
+  this->replace_column.resize(DATA_SIZE);
+  for (size_t i = 0; i < DATA_SIZE; i++) {
+    this->replace_column[i] = std::rand() % (2 * REPLACE_SIZE);
+  }
+
+  this->old_values_column.resize(REPLACE_SIZE);
+  this->new_values_column.resize(REPLACE_SIZE);
+  size_t count = 0;
+  for (size_t i = 0; i < 7; i++) {  // fill the mapping in a strided order so that the old values are not sorted
+    for (size_t j = 0; j < REPLACE_SIZE; j += 7) {
+      if (i + j < REPLACE_SIZE) {
+        this->old_values_column[i + j] = count;
+        count++;
+        this->new_values_column[i + j] = count;  // every old value v maps to v + 1
+      }
+    }
+  }
+
+  this->gdf_replace_column    = create_gdf_column(this->replace_column);
+  this->gdf_old_values_column = create_gdf_column(this->old_values_column);
+  this->gdf_new_values_column = create_gdf_column(this->new_values_column);
+
+  this->gdf_raw_replace_column    = this->gdf_replace_column.get();
+  this->gdf_raw_old_values_column = this->gdf_old_values_column.get();
+  this->gdf_raw_new_values_column = this->gdf_new_values_column.get();
+
+  auto gdf_result = this->compute_gdf_result();
+
+  for (size_t i = 0; i < DATA_SIZE; i++) {
+    if ((size_t)(this->replace_column[i]) < REPLACE_SIZE) {  // only rows whose value is covered by the mapping are checked
+      EXPECT_EQ((TypeParam)(this->replace_column[i] + 1), gdf_result[i]);
+    }
+  }
+}
diff --git a/cpp/tests/utilities/cudf_test_utils.cuh b/cpp/tests/utilities/cudf_test_utils.cuh index f303a3f4533..b9981e2fb7f 100644 --- a/cpp/tests/utilities/cudf_test_utils.cuh +++ b/cpp/tests/utilities/cudf_test_utils.cuh @@ -194,38 +194,39 @@ gdf_col_pointer create_gdf_column(std::vector const & host_vector, RMM_ALLOC(&(the_column->data), host_vector.size() * sizeof(col_type), 0); cudaMemcpy(the_column->data, host_vector.data(), host_vector.size() * sizeof(col_type), 
cudaMemcpyHostToDevice); + // Fill the gdf_column members + the_column->size = host_vector.size(); + the_column->dtype = gdf_col_type; + gdf_dtype_extra_info extra_info; + extra_info.time_unit = TIME_UNIT_NONE; + the_column->dtype_info = extra_info; + // If a validity bitmask vector was passed in, allocate device storage // and copy its contents from the host vector if(valid_vector.size() > 0) { RMM_ALLOC((void**)&(the_column->valid), valid_vector.size() * sizeof(gdf_valid_type), 0); cudaMemcpy(the_column->valid, valid_vector.data(), valid_vector.size() * sizeof(gdf_valid_type), cudaMemcpyHostToDevice); + + // Count the number of null bits + // count in all but last element in case it is not full + the_column->null_count = std::accumulate(valid_vector.begin(), valid_vector.end() - 1, 0, + [](gdf_size_type s, gdf_valid_type x) { + return s + std::bitset(x).flip().count(); + }); + // Now count the bits in the last mask + size_t unused_bits = GDF_VALID_BITSIZE - the_column->size % GDF_VALID_BITSIZE; + if (GDF_VALID_BITSIZE == unused_bits) unused_bits = 0; + auto last_mask = std::bitset(*(valid_vector.end()-1)).flip(); + last_mask = (last_mask << unused_bits) >> unused_bits; + the_column->null_count += last_mask.count(); } else { the_column->valid = nullptr; + the_column->null_count = 0; } - // Fill the gdf_column members - the_column->size = host_vector.size(); - the_column->dtype = gdf_col_type; - gdf_dtype_extra_info extra_info; - extra_info.time_unit = TIME_UNIT_NONE; - the_column->dtype_info = extra_info; - - // Count the number of null bits - // count in all but last element in case it is not full - the_column->null_count = std::accumulate(valid_vector.begin(), valid_vector.end() - 1, 0, - [](gdf_size_type s, gdf_valid_type x) { - return s + std::bitset(x).flip().count(); - }); - // Now count the bits in the last mask - int unused_bits = GDF_VALID_BITSIZE - the_column->size % GDF_VALID_BITSIZE; - if (GDF_VALID_BITSIZE == unused_bits) unused_bits = 0; - auto 
last_mask = std::bitset(*(valid_vector.end()-1)).flip(); - last_mask = (last_mask << unused_bits) >> unused_bits; - the_column->null_count += last_mask.count(); - return the_column; }