diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index fece1cb52b0..455ffce7ed8 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -115,8 +115,7 @@ class csv_reader_options {
   // Conversion settings

   // Per-column types; disables type inference on those columns
-  std::variant<std::vector<data_type>, std::vector<std::string>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Additional values to recognize as boolean true values
   std::vector<std::string> _true_values{"True", "TRUE", "true"};
   // Additional values to recognize as boolean false values
@@ -305,10 +304,7 @@ class csv_reader_options {
   /**
    * @brief Returns per-column types.
    */
-  std::variant<std::vector<data_type>,
-               std::vector<std::string>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -608,20 +604,6 @@ class csv_reader_options {
    */
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }

-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *
@@ -1067,22 +1049,6 @@ class csv_reader_options_builder {
     return *this;
   }

-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   * @return this for chaining.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 60f990c87d8..31201e30ac6 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -68,8 +68,7 @@ class json_reader_options {
   source_info _source;

   // Data types of the column; empty to infer dtypes
-  std::variant<std::vector<data_type>, std::vector<std::string>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Specify the compression format of the source or infer from file extension
   compression_type _compression = compression_type::AUTO;

@@ -117,10 +116,7 @@ class json_reader_options {
   /**
    * @brief Returns data types of the columns.
    */
-  std::variant<std::vector<data_type>,
-               std::vector<std::string>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -150,20 +146,6 @@ class json_reader_options {
    */
   bool is_enabled_dayfirst() const { return _dayfirst; }

-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *
@@ -232,22 +214,6 @@ class json_reader_options_builder {
    */
   explicit json_reader_options_builder(source_info const& src) : options(src) {}

-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format
-   * @return this for chaining
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *
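For reference, a minimal usage sketch of the two interfaces that remain after this change. This is not part of the patch; the file paths and column names are placeholders.

    #include <cudf/io/csv.hpp>
    #include <cudf/io/json.hpp>
    #include <cudf/types.hpp>

    #include <map>
    #include <string>
    #include <vector>

    void read_with_explicit_dtypes()
    {
      namespace io = cudf::io;
      using cudf::data_type;
      using cudf::type_id;

      // Positional form: one data_type per column, in column order.
      auto csv_opts = io::csv_reader_options::builder(io::source_info{"data.csv"})
                        .dtypes(std::vector<data_type>{data_type{type_id::INT32},
                                                       data_type{type_id::STRING}})
                        .build();
      auto csv_result = io::read_csv(csv_opts);

      // Named form: column name -> data_type, independent of column order.
      auto json_opts = io::json_reader_options::builder(io::source_info{"data.json"})
                         .dtypes(std::map<std::string, data_type>{
                           {"a", data_type{type_id::INT32}},
                           {"b", data_type{type_id::FLOAT64}}})
                         .build();
      auto json_result = io::read_json(json_opts);
    }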
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 549b0474fe1..7f85589a8aa 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -27,7 +27,6 @@
 #include
 #include
-#include <cudf/detail/utilities/visitor_overload.hpp>
 #include
 #include
 #include
@@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   if (has_to_infer_column_types) {
     column_types = infer_column_types(data, row_offsets, stream);
   } else {
-    column_types = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
-        [&](const std::map<std::string, data_type>& data_types) {
-          return select_data_types(data_types);
-        },
-        [&](const std::vector<std::string>& dtypes) { return parse_column_types(dtypes); }},
-      opts_.get_dtypes());
+    column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
+                              opts_.get_dtypes());
   }

   out_columns.reserve(column_types.size());
@@ -707,81 +700,6 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   return dtypes;
 }

-std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string>& types_as_strings)
-{
-  std::vector<data_type> dtypes;
-
-  const bool is_dict = std::all_of(types_as_strings.begin(),
-                                   types_as_strings.end(),
-                                   [](const auto& s) { return s.find(':') != std::string::npos; });
-
-  if (!is_dict) {
-    if (types_as_strings.size() == 1) {
-      // If it's a single dtype, assign that dtype to all active columns
-      data_type dtype_;
-      column_parse::flags col_flags_;
-      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) {
-        column_flags_[col] |= col_flags_;
-      }
-      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-    } else {
-      // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
-                   "Must specify data types for all columns");
-
-      auto dtype_ = std::back_inserter(dtypes);
-
-      for (int col = 0; col < num_actual_cols_; col++) {
-        if (column_flags_[col] & column_parse::enabled) {
-          column_parse::flags col_flags_;
-          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags_[col] |= col_flags_;
-          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-        }
-      }
-    }
-  } else {
-    // Translate vector of `name : dtype` strings to map
-    // NOTE: Incoming pairs can be out-of-order from column names in dataset
-    std::unordered_map<std::string, std::string> col_type_map;
-    for (const auto& pair : types_as_strings) {
-      const auto pos     = pair.find_last_of(':');
-      const auto name    = pair.substr(0, pos);
-      const auto dtype   = pair.substr(pos + 1, pair.size());
-      col_type_map[name] = dtype;
-    }
-
-    auto dtype_ = std::back_inserter(dtypes);
-
-    for (int col = 0; col < num_actual_cols_; col++) {
-      if (column_flags_[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
-                     "Must specify data types for all active columns");
-        column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
-        column_flags_[col] |= col_flags_;
-        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-      }
-    }
-  }
-
-  if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
 std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
                                                      device_span<uint64_t const> row_offsets,
                                                      host_span<data_type const> column_types,
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 36c2bf4f9e7..4416457be16 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -197,15 +197,6 @@ class reader::impl {
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);

-  /**
-   * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
-   *
-   * @param types_as_strings The vector of strings from which to parse the columns' target data
-   * types
-   * @return List of columns' data types
-   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Converts the row-column data and outputs to column buffers.
    *
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 85608a0984a..f1080342312 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
   }
 }

-std::vector<data_type> reader::impl::parse_data_types(
-  std::vector<std::string> const& types_as_strings)
-{
-  CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
-               "Need to specify the type of each column.\n");
-  std::vector<data_type> dtypes;
-  // Assume that the dtype is in dictionary format only if all elements contain a colon
-  const bool is_dict = std::all_of(
-    std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
-      return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
-    });
-
-  auto split_on_colon = [](std::string_view s) {
-    auto const i = s.find(":");
-    return std::pair{s.substr(0, i), s.substr(i + 1)};
-  };
-
-  if (is_dict) {
-    std::map<std::string, data_type> col_type_map;
-    std::transform(
-      std::cbegin(types_as_strings),
-      std::cend(types_as_strings),
-      std::inserter(col_type_map, col_type_map.end()),
-      [&](auto const& ts) {
-        auto const [col_name, type_str] = split_on_colon(ts);
-        return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
-      });
-
-    // Using the map here allows O(n log n) complexity
-    std::transform(std::cbegin(metadata_.column_names),
-                   std::cend(metadata_.column_names),
-                   std::back_inserter(dtypes),
-                   [&](auto const& column_name) { return col_type_map[column_name]; });
-  } else {
-    std::transform(std::cbegin(types_as_strings),
-                   std::cend(types_as_strings),
-                   std::back_inserter(dtypes),
-                   [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
-  }
-  return dtypes;
-}
-
 void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
                                   rmm::cuda_stream_view stream)
 {
   bool has_to_infer_column_types =
     std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
   if (!has_to_infer_column_types) {
-    dtypes_ = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& dtypes) { return dtypes; },
-        [&](const std::map<std::string, data_type>& dtypes) {
-          std::vector<data_type> sorted_dtypes;
-          std::transform(std::cbegin(metadata_.column_names),
-                         std::cend(metadata_.column_names),
-                         std::back_inserter(sorted_dtypes),
-                         [&](auto const& column_name) {
-                           auto const it = dtypes.find(column_name);
-                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
-                           return it->second;
-                         });
-          return sorted_dtypes;
-        },
-        [&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
-      options_.get_dtypes());
+    dtypes_ = std::visit(cudf::detail::visitor_overload{
+                           [&](const std::vector<data_type>& dtypes) {
+                             CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(),
+                                          "Must specify types for all columns");
+                             return dtypes;
+                           },
+                           [&](const std::map<std::string, data_type>& dtypes) {
+                             std::vector<data_type> sorted_dtypes;
+                             std::transform(std::cbegin(metadata_.column_names),
+                                            std::cend(metadata_.column_names),
+                                            std::back_inserter(sorted_dtypes),
+                                            [&](auto const& column_name) {
+                                              auto const it = dtypes.find(column_name);
+                                              CUDF_EXPECTS(it != dtypes.end(),
+                                                           "Must specify types for all columns");
+                                              return it->second;
+                                            });
+                             return sorted_dtypes;
+                           }},
+                         options_.get_dtypes());
   } else {
     CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
     auto const num_columns = metadata_.column_names.size();
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index 5cf51369cdf..bbda7e9ba74 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -158,8 +158,6 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);

-  std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Set the data type array data member
    *
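The JSON reader above keeps dispatching on the variant through cudf::detail::visitor_overload. For reference, a self-contained sketch of that overloaded-lambda idiom, independent of cudf (C++17):

    #include <cstddef>
    #include <map>
    #include <string>
    #include <variant>
    #include <vector>

    // The classic "overloaded lambdas" trick: inherit from each lambda, pull in
    // every call operator, and let CTAD deduce the template arguments.
    template <typename... Ts>
    struct overload : Ts... {
      using Ts::operator()...;
    };
    template <typename... Ts>
    overload(Ts...) -> overload<Ts...>;

    using dtype_variant = std::variant<std::vector<int>, std::map<std::string, int>>;

    std::size_t entry_count(dtype_variant const& v)
    {
      // std::visit picks the matching operator() for the active alternative.
      return std::visit(overload{[](std::vector<int> const& vec) { return vec.size(); },
                                 [](std::map<std::string, int> const& m) { return m.size(); }},
                        v);
    }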
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 53e0ab14fd3..5b6270a8be1 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str", "int32", "int32", "int32"});
+      .dtypes({dtype<int32_t>(),
+               dtype<cudf::string_view>(),
+               dtype<int32_t>(),
+               dtype<int32_t>(),
+               dtype<int32_t>()});

   auto result = cudf_io::read_csv(in_opts);

   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index a263fa0fce0..e83592a028a 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
                                float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity});
 }

+TEST_F(JsonReaderTest, BadDtypeParams)
+{
+  std::string buffer = "[1,2,3,4]";
+
+  cudf_io::json_reader_options options_vec =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes({dtype<int8_t>()});
+
+  // should throw because there are four columns and only one dtype
+  EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error);
+
+  cudf_io::json_reader_options options_map =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes(std::map<std::string, cudf::data_type>{{"0", dtype<int8_t>()},
+                                                     {"1", dtype<int8_t>()},
+                                                     {"2", dtype<int8_t>()},
+                                                     {"wrong_name", dtype<int8_t>()}});
+  // should throw because one of the columns is not in the dtype map
+  EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/HashJoin.java b/java/src/main/java/ai/rapids/cudf/HashJoin.java
new file mode 100644
index 00000000000..620a7ce6a6c
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/HashJoin.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents a hash table built from the join keys of the right-side table for a
+ * join operation. This hash table can then be reused across a series of left probe tables
+ * to compute gather maps for joins more efficiently when the right-side table is not changing.
+ * It can also be used to query the output row count of a join and then pass that result to the
+ * operation that generates the join gather maps to avoid redundant computation when the output
+ * row count must be checked before manifesting the join gather maps.
+ */
+public class HashJoin implements AutoCloseable {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(HashJoin.class);
+
+  private static class HashJoinCleaner extends MemoryCleaner.Cleaner {
+    private Table buildKeys;
+    private long nativeHandle;
+
+    HashJoinCleaner(Table buildKeys, long nativeHandle) {
+      this.buildKeys = buildKeys;
+      this.nativeHandle = nativeHandle;
+      addRef();
+    }
+
+    @Override
+    protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) {
+      long origAddress = nativeHandle;
+      boolean neededCleanup = nativeHandle != 0;
+      if (neededCleanup) {
+        try {
+          destroy(nativeHandle);
+          buildKeys.close();
+          buildKeys = null;
+        } finally {
+          nativeHandle = 0;
+        }
+        if (logErrorIfNotClean) {
+          log.error("A HASH TABLE WAS LEAKED (ID: " + id + " " + Long.toHexString(origAddress) + ")");
+        }
+      }
+      return neededCleanup;
+    }
+
+    @Override
+    public boolean isClean() {
+      return nativeHandle == 0;
+    }
+  }
+
+  private final HashJoinCleaner cleaner;
+  private final boolean compareNulls;
+  private boolean isClosed = false;
+
+  /**
+   * Construct a hash table for a join from a table representing the join key columns from the
+   * right-side table in the join. The resulting instance must be closed to release the
+   * GPU resources associated with the instance.
+   * @param buildKeys table view containing the join keys for the right-side join table
+   * @param compareNulls true if null key values should match otherwise false
+   */
+  public HashJoin(Table buildKeys, boolean compareNulls) {
+    this.compareNulls = compareNulls;
+    Table buildTable = new Table(buildKeys.getColumns());
+    try {
+      long handle = create(buildTable.getNativeView(), compareNulls);
+      this.cleaner = new HashJoinCleaner(buildTable, handle);
+      MemoryCleaner.register(this, cleaner);
+    } catch (Throwable t) {
+      try {
+        buildTable.close();
+      } catch (Throwable t2) {
+        t.addSuppressed(t2);
+      }
+      throw t;
+    }
+  }
+
+  @Override
+  public synchronized void close() {
+    cleaner.delRef();
+    if (isClosed) {
+      cleaner.logRefCountDebug("double free " + this);
+      throw new IllegalStateException("Close called too many times " + this);
+    }
+    cleaner.clean(false);
+    isClosed = true;
+  }
+
+  long getNativeView() {
+    return cleaner.nativeHandle;
+  }
+
+  /** Get the number of join key columns for the table that was used to generate the hash table. */
+  public long getNumberOfColumns() {
+    return cleaner.buildKeys.getNumberOfColumns();
+  }
+
+  /** Returns true if the hash table was built to match on nulls otherwise false. */
+  public boolean getCompareNulls() {
+    return compareNulls;
+  }
+
+  private static native long create(long tableView, boolean nullEqual);
+  private static native void destroy(long handle);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
index 4bf38543a2d..a936d4830ee 100644
--- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
+++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
@@ -277,6 +277,10 @@ public static void register(CompiledExpression expr, Cleaner cleaner) {
     all.add(new CleanerWeakReference(expr, cleaner, collected, false));
   }

+  static void register(HashJoin hashJoin, Cleaner cleaner) {
+    all.add(new CleanerWeakReference(hashJoin, cleaner, collected, true));
+  }
+
   /**
    * This is not 100% perfect and we can still run into situations where RMM buffers were not
    * collected and this returns false because of thread race conditions. This is just a best effort.
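For reference, a rough C++ sketch of the probe-reuse pattern the new Java HashJoin class exposes, using the cudf::hash_join calls that appear in TableJni.cpp below. This is not part of the patch; the function and parameter names are illustrative.

    #include <cudf/join.hpp>
    #include <cudf/table/table_view.hpp>

    #include <cstddef>
    #include <vector>

    void probe_many(cudf::table_view const& build_keys,
                    std::vector<cudf::table_view> const& probe_tables)
    {
      // Build the hash table from the right-side keys once.
      cudf::hash_join hash(build_keys, cudf::null_equality::UNEQUAL);

      for (auto const& probe : probe_tables) {
        // Query the output row count first, then hand it back so the join
        // does not have to compute it a second time.
        std::size_t const row_count = hash.inner_join_size(probe, cudf::null_equality::UNEQUAL);
        auto maps = hash.inner_join(probe, cudf::null_equality::UNEQUAL, row_count);
        // maps.first / maps.second are device gather maps for the probe (left)
        // and build (right) tables.
      }
    }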
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 1fc9616d607..e725932ed5e 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -505,18 +505,48 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r
   private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys,
                                                   boolean compareNullsEqual) throws CudfException;

+  private static native long leftJoinRowCount(long leftTable, long rightHashJoin,
+                                              boolean nullsEqual) throws CudfException;
+
+  private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin,
+                                                      boolean nullsEqual) throws CudfException;
+
+  private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin,
+                                                               boolean nullsEqual,
+                                                               long outputRowCount) throws CudfException;
+
   private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                          int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;

   private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys,
                                                    boolean compareNullsEqual) throws CudfException;

+  private static native long innerJoinRowCount(long table, long hashJoin,
+                                               boolean nullsEqual) throws CudfException;
+
+  private static native long[] innerHashJoinGatherMaps(long table, long hashJoin,
+                                                       boolean nullsEqual) throws CudfException;
+
+  private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin,
+                                                                boolean nullsEqual,
+                                                                long outputRowCount) throws CudfException;
+
   private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                         int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;

   private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys,
                                                   boolean compareNullsEqual) throws CudfException;

+  private static native long fullJoinRowCount(long leftTable, long rightHashJoin,
+                                              boolean nullsEqual) throws CudfException;
+
+  private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin,
+                                                      boolean nullsEqual) throws CudfException;
+
+  private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin,
+                                                               boolean nullsEqual,
+                                                               long outputRowCount) throws CudfException;
+
   private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                             int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;

@@ -2040,6 +2070,69 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual
     return buildJoinGatherMaps(gatherMapData);
   }

+  /**
+   * Computes the number of rows resulting from a left equi-join between two tables.
+   * It is assumed this table instance holds the key columns from the left table, and the
+   * {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return row count of the join result
+   */
+  public long leftJoinRowCount(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    return leftJoinRowCount(getNativeView(), rightHash.getNativeView(),
+        rightHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a left equi-join between
+   * two tables.
+   * It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the left join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a left equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the left join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #leftJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the number of rows from the result of a left join between two tables when a
    * conditional expression is true. It is assumed this table instance holds the columns from
@@ -2124,6 +2217,67 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua
     return buildJoinGatherMaps(gatherMapData);
   }

+  /**
+   * Computes the number of rows resulting from an inner equi-join between two tables.
+   * @param otherHash hash table built from join key columns from the other table
+   * @return row count of the join result
+   */
+  public long innerJoinRowCount(HashJoin otherHash) {
+    if (getNumberOfColumns() != otherHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " otherKeys: " + otherHash.getNumberOfColumns());
+    }
+    return innerJoinRowCount(getNativeView(), otherHash.getNativeView(),
+        otherHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of an inner equi-join between
+   * two tables.
+   * It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the inner join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of an inner equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the inner join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #innerJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the number of rows from the result of an inner join between two tables when a
    * conditional expression is true. It is assumed this table instance holds the columns from
@@ -2209,6 +2363,72 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual
     return buildJoinGatherMaps(gatherMapData);
   }

+  /**
+   * Computes the number of rows resulting from a full equi-join between two tables.
+   * It is assumed this table instance holds the key columns from the left table, and the
+   * {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Note that unlike {@link #leftJoinRowCount(HashJoin)} and {@link #innerJoinRowCount(HashJoin)},
+   * this will perform some redundant calculations compared to
+   * {@link #fullJoinGatherMaps(HashJoin, long)}.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return row count of the join result
+   */
+  public long fullJoinRowCount(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    return fullJoinRowCount(getNativeView(), rightHash.getNativeView(),
+        rightHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a full equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the full join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a full equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the full join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #fullJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the gather maps that can be used to manifest the result of a full join between
    * two tables when a conditional expression is true.
   * It is assumed this table instance holds
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 35ecae681b8..bc59e3aee64 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -264,6 +264,7 @@ set(SOURCE_FILES
   "src/ColumnViewJni.cpp"
   "src/CompiledExpression.cpp"
   "src/ContiguousTableJni.cpp"
+  "src/HashJoinJni.cpp"
   "src/HostMemoryBufferNativeUtilsJni.cpp"
   "src/NvcompJni.cpp"
   "src/NvtxRangeJni.cpp"
diff --git a/java/src/main/native/src/HashJoinJni.cpp b/java/src/main/native/src/HashJoinJni.cpp
new file mode 100644
index 00000000000..0f78aef64bc
--- /dev/null
+++ b/java/src/main/native/src/HashJoinJni.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/join.hpp>
+
+#include "cudf_jni_apis.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv *env, jclass, jlong j_table,
+                                                            jboolean j_nulls_equal) {
+  JNI_NULL_CHECK(env, j_table, "table handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto tview = reinterpret_cast<cudf::table_view const *>(j_table);
+    auto nulleq = j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto hash_join_ptr = new cudf::hash_join(*tview, nulleq);
+    return reinterpret_cast<jlong>(hash_join_ptr);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv *env, jclass, jlong j_handle) {
+  try {
+    cudf::jni::auto_set_device(env);
+    auto hash_join_ptr = reinterpret_cast<cudf::hash_join *>(j_handle);
+    delete hash_join_ptr;
+  }
+  CATCH_STD(env, );
+}
+
+} // extern "C"
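For reference, a minimal sketch (not part of the patch) of the ownership hand-off used by gather_maps_to_java in TableJni.cpp below: rmm::device_uvector::release() yields an rmm::device_buffer whose address and size can then cross the JNI boundary. The function name here is illustrative.

    #include <rmm/device_buffer.hpp>
    #include <rmm/device_uvector.hpp>

    #include <memory>

    std::unique_ptr<rmm::device_buffer> take_ownership(rmm::device_uvector<int>&& vec)
    {
      // release() hands back the underlying allocation as an rmm::device_buffer,
      // leaving the uvector empty; no copy is performed.
      auto buffer = std::make_unique<rmm::device_buffer>(vec.release());
      // buffer->data() and buffer->size() now describe the original device memory
      // and can be passed to Java as raw addresses.
      return buffer;
    }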
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 595bc1df151..f642a87b445 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -755,13 +755,46 @@ bool valid_window_parameters(native_jintArray const &values,
          values.size() == preceding.size() && values.size() == following.size();
 }

-// Generate gather maps needed to manifest the result of an equi-join between two tables.
+// Convert a cudf gather map pair into the form that Java expects
 // The resulting Java long array contains the following at each index:
 // 0: Size of each gather map in bytes
 // 1: Device address of the gather map for the left table
 // 2: Host address of the rmm::device_buffer instance that owns the left gather map data
 // 3: Device address of the gather map for the right table
 // 4: Host address of the rmm::device_buffer instance that owns the right gather map data
+jlongArray gather_maps_to_java(JNIEnv *env,
+                               std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+                                         std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
+                                   maps) {
+  // release the underlying device buffer to Java
+  auto left_map_buffer = std::make_unique<rmm::device_buffer>(maps.first->release());
+  auto right_map_buffer = std::make_unique<rmm::device_buffer>(maps.second->release());
+  cudf::jni::native_jlongArray result(env, 5);
+  result[0] = static_cast<jlong>(left_map_buffer->size());
+  result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
+  result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
+  result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
+  result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
+  return result.get_jArray();
+}
+
+// Convert a cudf gather map into the form that Java expects
+// The resulting Java long array contains the following at each index:
+// 0: Size of the gather map in bytes
+// 1: Device address of the gather map
+// 2: Host address of the rmm::device_buffer instance that owns the gather map data
+jlongArray gather_map_to_java(JNIEnv *env,
+                              std::unique_ptr<rmm::device_uvector<cudf::size_type>> map) {
+  // release the underlying device buffer to Java
+  auto gather_map_buffer = std::make_unique<rmm::device_buffer>(map->release());
+  cudf::jni::native_jlongArray result(env, 3);
+  result[0] = static_cast<jlong>(gather_map_buffer->size());
+  result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
+  result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
+  return result.get_jArray();
+}
+
+// Generate gather maps needed to manifest the result of an equi-join between two tables.
 template <typename T>
 jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
                             jboolean compare_nulls_equal, T join_func) {
@@ -772,31 +805,29 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
     auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
     auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-        join_maps = join_func(*left_keys, *right_keys, nulleq);
-
-    // release the underlying device buffer to Java
-    auto left_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.first->release());
-    auto right_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.second->release());
-    cudf::jni::native_jlongArray result(env, 5);
-    result[0] = static_cast<jlong>(left_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
-    result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
-    result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
-    return result.get_jArray();
+    return gather_maps_to_java(env, join_func(*left_keys, *right_keys, nulleq));
   }
   CATCH_STD(env, NULL);
 }

+// Generate gather maps needed to manifest the result of an equi-join between a left table and
+// a hash table built from the join's right table.
+template <typename T>
+jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join,
+                                 jboolean compare_nulls_equal, T join_func) {
+  JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq));
+  }
+  CATCH_STD(env, NULL);
+}
+
 // Generate gather maps needed to manifest the result of a conditional join between two tables.
-// The resulting Java long array contains the following at each index:
-// 0: Size of each gather map in bytes
-// 1: Device address of the gather map for the left table
-// 2: Host address of the rmm::device_buffer instance that owns the left gather map data
-// 3: Device address of the gather map for the right table
-// 4: Host address of the rmm::device_buffer instance that owns the right gather map data
 template <typename T>
 jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_table,
                                  jlong j_condition, jboolean compare_nulls_equal, T join_func) {
@@ -809,29 +840,13 @@ jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_
     auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
     auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-        join_maps = join_func(*left_table, *right_table, condition->get_top_expression(), nulleq);
-
-    // release the underlying device buffer to Java
-    auto left_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.first->release());
-    auto right_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.second->release());
-    cudf::jni::native_jlongArray result(env, 5);
-    result[0] = static_cast<jlong>(left_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
-    result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
-    result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
-    return result.get_jArray();
+    return gather_maps_to_java(
+        env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq));
   }
   CATCH_STD(env, NULL);
 }

 // Generate a gather map needed to manifest the result of a semi/anti join between two tables.
-// The resulting Java long array contains the following at each index:
-// 0: Size of the gather map in bytes
-// 1: Device address of the gather map
-// 2: Host address of the rmm::device_buffer instance that owns the gather map data
 template <typename T>
 jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
                                   jboolean compare_nulls_equal, T join_func) {
@@ -842,26 +857,13 @@ jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_
     auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
     auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::unique_ptr<rmm::device_uvector<cudf::size_type>> join_map =
-        join_func(*left_keys, *right_keys, nulleq);
-
-    // release the underlying device buffer to Java
-    auto gather_map_buffer = std::make_unique<rmm::device_buffer>(join_map->release());
-    cudf::jni::native_jlongArray result(env, 3);
-    result[0] = static_cast<jlong>(gather_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
-    return result.get_jArray();
+    return gather_map_to_java(env, join_func(*left_keys, *right_keys, nulleq));
   }
   CATCH_STD(env, NULL);
 }

 // Generate a gather map needed to manifest the result of a conditional semi/anti join
 // between two tables.
-// The resulting Java long array contains the following at each index:
-// 0: Size of the gather map in bytes
-// 1: Device address of the gather map
-// 2: Host address of the rmm::device_buffer instance that owns the gather map data
 template <typename T>
 jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_right_table,
                                        jlong j_condition, jboolean compare_nulls_equal,
                                        T join_func) {
@@ -875,16 +877,8 @@ jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_
     auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
     auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::unique_ptr<rmm::device_uvector<cudf::size_type>> join_map =
-        join_func(*left_table, *right_table, condition->get_top_expression(), nulleq);
-
-    // release the underlying device buffer to Java
-    auto gather_map_buffer = std::make_unique<rmm::device_buffer>(join_map->release());
-    cudf::jni::native_jlongArray result(env, 3);
-    result[0] = static_cast<jlong>(gather_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
-    return result.get_jArray();
+    return gather_map_to_java(
+        env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq));
   }
   CATCH_STD(env, NULL);
 }
@@ -1951,6 +1945,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps(
       });
 }

+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass,
+                                                                   jlong j_left_table,
+                                                                   jlong j_right_hash_join,
+                                                                   jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->left_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.left_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.left_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
@@ -2002,6 +2035,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps(
       });
 }

+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass,
+                                                                    jlong j_left_table,
+                                                                    jlong j_right_hash_join,
+                                                                    jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->inner_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.inner_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.inner_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
@@ -2053,6 +2125,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps(
       });
 }

+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass,
+                                                                   jlong j_left_table,
+                                                                   jlong j_right_hash_join,
+                                                                   jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->full_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.full_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.full_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
diff --git a/java/src/test/java/ai/rapids/cudf/HashJoinTest.java b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java
new file mode 100644
index 00000000000..be6125340ec
--- /dev/null
+++ b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HashJoinTest { + @Test + void testGetNumberOfColumns() { + try (Table t = new Table.TestBuilder().column(1, 2).column(3, 4).column(5, 6).build(); + HashJoin hashJoin = new HashJoin(t, false)) { + assertEquals(3, hashJoin.getNumberOfColumns()); + } + } + + @Test + void testGetCompareNulls() { + try (Table t = new Table.TestBuilder().column(1, 2, 3, 4).column(5, 6, 7, 8).build()) { + try (HashJoin hashJoin = new HashJoin(t, false)) { + assertFalse(hashJoin.getCompareNulls()); + } + try (HashJoin hashJoin = new HashJoin(t, true)) { + assertTrue(hashJoin.getCompareNulls()); + } + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 8e4e3df612b..aeb94e4824a 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -30,10 +30,6 @@ import ai.rapids.cudf.ast.ColumnReference; import ai.rapids.cudf.ast.CompiledExpression; import ai.rapids.cudf.ast.TableReference; -import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.ipc.ArrowFileReader; -import org.apache.arrow.vector.ipc.SeekableReadChannel; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileReader; @@ -1500,6 +1496,102 @@ void testLeftJoinGatherMapsNulls() { } } + @Test + void testLeftHashJoinGatherMaps() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + long rowCount = leftKeys.leftJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsNulls() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, 
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftHashJoinGatherMapsNullsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+             .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+             .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.leftJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalLeftJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;
@@ -1654,6 +1746,98 @@ void testInnerJoinGatherMapsNulls() {
     }
   }

+  @Test
+  void testInnerHashJoinGatherMaps() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .column(2, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsWithCount() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .column(2, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.innerJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsNulls() {
+    try (Table leftKeys = new Table.TestBuilder()
+             .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+             .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 7, 8, 8, 9) // left
+             .column(2, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsNullsWithCount() {
+    try (Table leftKeys = new Table.TestBuilder()
+             .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+             .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 7, 8, 8, 9) // left
+             .build()) {
+      long rowCount = leftKeys.innerJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalInnerJoinGatherMaps() {
     BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
@@ -1806,6 +1990,102 @@ void testFullJoinGatherMapsNulls() {
     }
   }
 
+  @Test
+  void testFullHashJoinGatherMaps() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.fullJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsNulls() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsNullsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.fullJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalFullJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b20c42926dc..d52f63a79f5 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -392,14 +392,6 @@ def _fill(
 
         return self
 
-        fill_code = self._encode(fill_value)
-        fill_scalar = as_device_scalar(fill_code, self.codes.dtype)
-
-        result = self if inplace else self.copy()
-
-        libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar)
-        return result
-
     def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
         return libcudf.copying.shift(self, offset, fill_value)
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f66cb570fbb..721ebf22de7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6305,12 +6305,9 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs):
         if axis != 0:
             raise NotImplementedError("Only axis=0 is currently supported.")
 
-        return self._apply_support_method(
-            "count",
-            axis=axis,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        return Series._from_data(
+            {None: [self._data[col].valid_count for col in self._data.names]},
+            as_index(self._data.names),
         )
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -6343,7 +6340,7 @@ def _reduce(
                 {None: result}, as_index(self._data.names)
             )
         elif axis == 1:
-            return self._apply_support_method_axis_1(op, **kwargs)
+            return self._apply_cupy_method_axis_1(op, **kwargs)
 
     def _scan(
         self, op, axis=None, *args, **kwargs,
     ):
@@ -6353,7 +6350,7 @@
         if axis == 0:
             return super()._scan(op, axis=axis, *args, **kwargs)
         elif axis == 1:
-            return self._apply_support_method_axis_1(f"cum{op}", **kwargs)
+            return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs)
 
     def mode(self, axis=0, numeric_only=False, dropna=True):
         """
@@ -6458,100 +6455,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
     def kurtosis(
         self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
     ):
-        """
-        Return Fisher's unbiased kurtosis of a sample.
-
-        Kurtosis obtained using Fisher’s definition of
-        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
-
-        Parameters
-        ----------
-
-        skipna: bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        Series
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
-        >>> df.kurt()
-        a   -1.2
-        b   -1.2
-        dtype: float64
-        """
-        if axis not in (0, "index", None):
-            raise NotImplementedError("Only axis=0 is currently supported.")
-
-        if numeric_only not in (None, True):
-            msg = "Kurtosis only supports int, float, and bool dtypes."
-            raise NotImplementedError(msg)
-
-        filtered = self.select_dtypes(include=[np.number, np.bool_])
-        return filtered._apply_support_method(
-            "kurtosis",
-            axis=axis,
-            skipna=skipna,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        obj = self.select_dtypes(include=[np.number, np.bool_])
+        return super(DataFrame, obj).kurtosis(
+            axis, skipna, level, numeric_only, **kwargs
         )
 
-    # Alias for kurtosis.
-    kurt = kurtosis
-
     def skew(
         self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
     ):
-        """
-        Return unbiased Fisher-Pearson skew of a sample.
-
-        Parameters
-        ----------
-        skipna: bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        Series
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
-        >>> df.skew()
-        a    0.00000
-        b   -0.37037
-        dtype: float64
-        """
-        if axis not in (0, "index", None):
-            raise NotImplementedError("Only axis=0 is currently supported.")
-
-        if numeric_only not in (None, True):
-            msg = "Skew only supports int, float, and bool dtypes."
-            raise NotImplementedError(msg)
-
-        filtered = self.select_dtypes(include=[np.number, np.bool_])
-        return filtered._apply_support_method(
-            "skew",
-            axis=axis,
-            skipna=skipna,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        obj = self.select_dtypes(include=[np.number, np.bool_])
+        return super(DataFrame, obj).skew(
+            axis, skipna, level, numeric_only, **kwargs
         )
 
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
@@ -6562,23 +6476,11 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).any(axis, skipna, level, **kwargs)
 
-    def _apply_support_method_axis_0(self, method, *args, **kwargs):
-        result = [
-            getattr(self[col], method)(*args, **kwargs)
-            for col in self._data.names
-        ]
+    def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
+        # This method uses cupy to perform scans and reductions along rows of
+        # a DataFrame. Since cuDF is designed around columnar storage and
+        # operations, we convert DataFrames to 2D cupy arrays for these ops.
 
-        if isinstance(result[0], Series):
-            support_result = result
-            result = DataFrame(index=support_result[0].index)
-            for idx, col in enumerate(self._data.names):
-                result[col] = support_result[idx]
-        else:
-            result = Series(result)
-            result = result.set_index(self._data.names)
-        return result
-
-    def _apply_support_method_axis_1(self, method, *args, **kwargs):
         # for dask metadata compatibility
         skipna = kwargs.pop("skipna", None)
         skipna = True if skipna is None else skipna
@@ -6608,13 +6510,13 @@
         min_count = kwargs.pop("min_count", None)
         if min_count not in (None, 0):
             raise NotImplementedError(
-                "Row-wise operations currently do not " "support `min_count`."
+                "Row-wise operations currently do not support `min_count`."
             )
 
         bool_only = kwargs.pop("bool_only", None)
         if bool_only not in (None, True):
             raise NotImplementedError(
-                "Row-wise operations currently do not " "support `bool_only`."
+                "Row-wise operations currently do not support `bool_only`."
             )
 
         # This parameter is only necessary for axis 0 reductions that cuDF
@@ -6674,14 +6576,6 @@
         result_df.columns = prepared.columns
         return result_df
 
-    def _apply_support_method(self, method, axis=0, *args, **kwargs):
-        axis = self._get_axis_from_axis_arg(axis)
-
-        if axis == 0:
-            return self._apply_support_method_axis_0(method, *args, **kwargs)
-        elif axis == 1:
-            return self._apply_support_method_axis_1(method, *args, **kwargs)
-
     def _columns_view(self, columns):
         """
         Return a subset of the DataFrame's columns as a view.
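The renamed `_apply_cupy_method_axis_1` makes the axis=1 strategy explicit: cuDF stores data column-wise, so row-wise scans and reductions are implemented by materializing the frame as a 2D device array and reducing along rows. Below is a minimal sketch of that idea, assuming equal-length numeric columns; `rowwise_reduce` is a hypothetical standalone helper, not cuDF's implementation, and it uses NumPy so it runs anywhere (CuPy mirrors the same calls on the GPU):

```python
import numpy as np  # cupy mirrors these calls on the GPU


def rowwise_reduce(columns, method, skipna=True):
    # Stack same-length columns side by side -> shape (n_rows, n_cols).
    mat = np.column_stack([np.asarray(c, dtype=np.float64) for c in columns])
    # nan-prefixed variants ignore missing values, mirroring skipna=True.
    fn = getattr(np, f"nan{method}" if skipna else method)
    return fn(mat, axis=1)  # reduce along rows


# Row-wise sum of {'a': [1, 2], 'b': [3, 4]} -> [4. 6.]
print(rowwise_reduce([[1, 2], [3, 4]], "sum"))
```

The real method additionally pops `skipna`, `min_count`, and `bool_only` from the kwargs (as the hunks above show) and converts the result back into cuDF columns.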
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 5f1ac4e0c20..9f743cd8c85 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -27,6 +27,7 @@
 )
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.join import merge
+from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
     _is_non_decimal_numeric_dtype,
     _is_scalar_or_zero_d_array,
@@ -4056,6 +4057,131 @@ def var(
             **kwargs,
         )
 
+    def kurtosis(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        """
+        Return Fisher's unbiased kurtosis of a sample.
+
+        Kurtosis obtained using Fisher’s definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+
+        axis: {index (0), columns(1)}
+            Axis for the function to be applied on.
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+
+        Returns
+        -------
+        Series or scalar
+
+        Notes
+        -----
+        Parameters currently not supported are `level` and `numeric_only`
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4])
+        >>> series.kurtosis()
+        -1.1999999999999904
+
+        **DataFrame**
+
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.kurt()
+        a   -1.2
+        b   -1.2
+        dtype: float64
+        """
+        if axis not in (0, "index", None):
+            raise NotImplementedError("Only axis=0 is currently supported.")
+
+        return self._reduce(
+            "kurtosis",
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    # Alias for kurtosis.
+    @copy_docstring(kurtosis)
+    def kurt(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        return self.kurtosis(
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    def skew(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        """
+        Return unbiased Fisher-Pearson skew of a sample.
+
+        Parameters
+        ----------
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+
+        Returns
+        -------
+        Series
+
+        Notes
+        -----
+        Parameters currently not supported are `axis`, `level` and
+        `numeric_only`
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        5    6
+        6    6
+        dtype: int64
+        >>> series.skew()
+        -0.288195490292614
+
+        **DataFrame**
+
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
+        >>> df.skew()
+        a    0.00000
+        b   -0.37037
+        dtype: float64
+        """
+        if axis not in (0, "index", None):
+            raise NotImplementedError("Only axis=0 is currently supported.")
+
+        return self._reduce(
+            "skew",
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
     def all(self, axis=0, skipna=True, level=None, **kwargs):
         """
         Return whether all elements are True in DataFrame.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 380e1838534..ff3b9fc68ef 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4036,103 +4036,6 @@ def round(self, decimals=0, how="half_even"):
             dtype=self.dtype,
         )
 
-    def kurtosis(
-        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
-    ):
-        """
-        Return Fisher's unbiased kurtosis of a sample.
-
-        Kurtosis obtained using Fisher’s definition of
-        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
-
-        Parameters
-        ----------
-
-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        scalar
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> series = cudf.Series([1, 2, 3, 4])
-        >>> series.kurtosis()
-        -1.1999999999999904
-        """
-        if axis not in (None, 0):
-            raise NotImplementedError("axis parameter is not implemented yet")
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not implemented yet")
-
-        if numeric_only not in (None, True):
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
-            )
-
-        return self._column.kurtosis(skipna=skipna)
-
-    # Alias for kurtosis.
-    kurt = kurtosis
-
-    def skew(
-        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
-    ):
-        """
-        Return unbiased Fisher-Pearson skew of a sample.
-
-        Parameters
-        ----------
-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        scalar
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6])
-        >>> series
-        0    1
-        1    2
-        2    3
-        3    4
-        4    5
-        5    6
-        6    6
-        dtype: int64
-        >>> series.skew()
-        -0.288195490292614
-        """
-
-        if axis not in (None, 0):
-            raise NotImplementedError("axis parameter is not implemented yet")
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not implemented yet")
-
-        if numeric_only not in (None, True):
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
-            )
-
-        return self._column.skew(skipna=skipna)
-
     def cov(self, other, min_periods=None):
         """
         Compute covariance with Series, excluding missing values.
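As a sanity check on the docstring examples consolidated into `Frame.kurtosis`, the documented value can be reproduced with the standard unbiased (Fisher) sample-kurtosis formula. This is a plain-Python sketch of the textbook formula, not cuDF's GPU implementation:

```python
def fisher_kurtosis(xs):
    """Unbiased sample kurtosis, Fisher's definition (normal -> 0.0),
    normalized by N-1 -- the convention the docstrings describe."""
    n = len(xs)
    mean = sum(xs) / n
    m2 = sum((x - mean) ** 2 for x in xs)  # sum of squared deviations
    m4 = sum((x - mean) ** 4 for x in xs)  # sum of 4th-power deviations
    s2 = m2 / (n - 1)                      # unbiased sample variance
    g2 = n * (n + 1) / ((n - 1) * (n - 2) * (n - 3)) * m4 / (s2 * s2)
    return g2 - 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))


print(fisher_kurtosis([1, 2, 3, 4]))  # ~ -1.2, matching the docstring example
```

With this change, `Series.kurtosis` and `DataFrame.kurtosis` both dispatch through `Frame._reduce` to the column-level reduction, with the DataFrame variant first restricting itself to numeric and bool columns via `select_dtypes`.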