Skip to content

Commit

Permalink
Merge branch 'branch-22.04' into bug-str-to-fp128-exp-adjust
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Feb 7, 2022
2 parents f3ce7ea + e3611a2 commit 9d4d81a
Show file tree
Hide file tree
Showing 12 changed files with 261 additions and 46 deletions.
23 changes: 19 additions & 4 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ add_custom_command(
function(ConfigureBench CMAKE_BENCH_NAME)
add_executable(${CMAKE_BENCH_NAME} ${ARGN})
set_target_properties(
${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
${CMAKE_BENCH_NAME}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main
Expand All @@ -69,19 +70,33 @@ function(ConfigureBench CMAKE_BENCH_NAME)
APPEND
COMMENT "Adding ${CMAKE_BENCH_NAME}"
)

install(
TARGETS ${CMAKE_BENCH_NAME}
COMPONENT testing
DESTINATION bin/benchmarks/libcudf
EXCLUDE_FROM_ALL
)
endfunction()

# This function takes in a benchmark name and benchmark source for nvbench benchmarks and handles
# setting all of the associated properties and linking to build the benchmark
function(ConfigureNVBench CMAKE_BENCH_NAME)
add_executable(${CMAKE_BENCH_NAME} ${ARGN})
set_target_properties(
${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
${CMAKE_BENCH_NAME}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/benchmarks>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::main
)
install(
TARGETS ${CMAKE_BENCH_NAME}
COMPONENT testing
DESTINATION bin/benchmarks/libcudf
EXCLUDE_FROM_ALL
)
endfunction()

# ##################################################################################################
Expand Down
11 changes: 9 additions & 2 deletions cpp/libcudf_kafka/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,20 @@
function(ConfigureTest test_name)
add_executable(${test_name} ${ARGN})
set_target_properties(
${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
${test_name}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${test_name} PRIVATE GTest::gmock GTest::gmock_main GTest::gtest_main cudf_kafka
)
add_test(NAME ${test_name} COMMAND ${test_name})
install(
TARGETS ${test_name}
COMPONENT testing
DESTINATION bin/gtests/libcudf_kafka
EXCLUDE_FROM_ALL
)
endfunction()

# ##################################################################################################
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/utilities/config_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ enum class usage_policy : uint8_t { OFF, GDS, ALWAYS };
*/
usage_policy get_env_policy()
{
static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS");
static auto const env_val = getenv_or<std::string>("LIBCUDF_CUFILE_POLICY", "GDS");
if (env_val == "OFF") return usage_policy::OFF;
if (env_val == "GDS") return usage_policy::GDS;
if (env_val == "ALWAYS") return usage_policy::ALWAYS;
Expand All @@ -69,7 +69,7 @@ enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS };
*/
usage_policy get_env_policy()
{
static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE");
static auto const env_val = getenv_or<std::string>("LIBCUDF_NVCOMP_POLICY", "STABLE");
if (env_val == "OFF") return usage_policy::OFF;
if (env_val == "STABLE") return usage_policy::STABLE;
if (env_val == "ALWAYS") return usage_policy::ALWAYS;
Expand Down
13 changes: 12 additions & 1 deletion cpp/src/io/utilities/config_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
#pragma once

#include <sstream>
#include <string>

namespace cudf::io::detail {
Expand All @@ -23,7 +24,17 @@ namespace cudf::io::detail {
* @brief Returns the value of the environment variable, or a default value if the variable is not
* present.
*/
std::string getenv_or(std::string const& env_var_name, std::string_view default_val);
template <typename T>
T getenv_or(std::string_view env_var_name, T default_val)
{
auto const env_val = std::getenv(env_var_name.data());
if (env_val == nullptr) { return default_val; }

std::stringstream sstream(env_val);
T converted_val;
sstream >> converted_val;
return converted_val;
}

namespace cufile_integration {

Expand Down
15 changes: 9 additions & 6 deletions cpp/src/io/utilities/file_io_utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void cufile_shim::modify_cufile_json() const
temp_directory tmp_config_dir{"cudf_cufile_config"};

// Modify the config file based on the policy
auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json");
auto const config_file_path = getenv_or<std::string>(json_path_env_var, "/etc/cufile.json");
std::ifstream user_config_file(config_file_path);
// Modified config file is stored in a temporary directory
auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json";
Expand Down Expand Up @@ -170,7 +170,8 @@ cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_h
cufile_input_impl::cufile_input_impl(std::string const& filepath)
: shim{cufile_shim::instance()},
cf_file(shim, filepath, O_RDONLY | O_DIRECT),
pool(16) // The benefit from multithreaded read plateaus around 16 threads
// The benefit from multithreaded read plateaus around 16 threads
pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
{
pool.sleep_duration = 10;
}
Expand All @@ -194,9 +195,11 @@ std::vector<std::future<ResultT>> make_sliced_tasks(
F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
{
std::vector<std::future<ResultT>> slice_tasks;
constexpr size_t max_slice_bytes = 4 * 1024 * 1024;
size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
size_t slice_offset = 0;
constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024;
static auto const max_slice_bytes =
getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes);
size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
size_t slice_offset = 0;
for (size_t t = 0; t < n_slices; ++t) {
DataT* ptr_slice = ptr + slice_offset;

Expand Down Expand Up @@ -250,7 +253,7 @@ size_t cufile_input_impl::read(size_t offset,
cufile_output_impl::cufile_output_impl(std::string const& filepath)
: shim{cufile_shim::instance()},
cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664),
pool(16)
pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
{
}

Expand Down
11 changes: 9 additions & 2 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,18 @@
function(ConfigureTest CMAKE_TEST_NAME)
add_executable(${CMAKE_TEST_NAME} ${ARGN})
set_target_properties(
${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
"$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/gtests>"
${CMAKE_TEST_NAME}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_BINARY_DIR}/gtests>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main)
add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
install(
TARGETS ${CMAKE_TEST_NAME}
COMPONENT testing
DESTINATION bin/gtests/libcudf
EXCLUDE_FROM_ALL
)
endfunction()

# ##################################################################################################
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/basics/io-gds-integration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ Operations that support the use of GPUDirect Storage:
- `to_csv`
- `to_parquet`
- `to_orc`

Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables:

- ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16);
- ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB).
Larger I/O operations are split into multiple calls.
60 changes: 40 additions & 20 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -4012,20 +4012,29 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da
}
if (mainColOffsets != null) {
// The offset buffer has (no. of rows + 1) entries, where each entry is INT32.sizeInBytes
long offsetsLen = OFFSET_SIZE * (mainColRows + 1);
long offsetsLen = OFFSET_SIZE * (((long)mainColRows) + 1);
mainOffsetsDevBuff = DeviceMemoryBuffer.allocate(offsetsLen);
mainOffsetsDevBuff.copyFromHostBuffer(mainColOffsets, 0, offsetsLen);
}
List<DeviceMemoryBuffer> toClose = new ArrayList<>();
long[] childHandles = new long[devChildren.size()];
for (ColumnView.NestedColumnVector ncv : devChildren) {
toClose.addAll(ncv.getBuffersToClose());
}
for (int i = 0; i < devChildren.size(); i++) {
childHandles[i] = devChildren.get(i).getViewHandle();
try {
for (ColumnView.NestedColumnVector ncv : devChildren) {
toClose.addAll(ncv.getBuffersToClose());
}
for (int i = 0; i < devChildren.size(); i++) {
childHandles[i] = devChildren.get(i).createViewHandle();
}
return new ColumnVector(mainColType, mainColRows, nullCount, mainDataDevBuff,
mainValidDevBuff, mainOffsetsDevBuff, toClose, childHandles);
} finally {
for (int i = 0; i < childHandles.length; i++) {
if (childHandles[i] != 0) {
ColumnView.deleteColumnView(childHandles[i]);
childHandles[i] = 0;
}
}
}
return new ColumnVector(mainColType, mainColRows, nullCount, mainDataDevBuff,
mainValidDevBuff, mainOffsetsDevBuff, toClose, childHandles);
}

private static NestedColumnVector createNewNestedColumnVector(
Expand All @@ -4048,21 +4057,32 @@ private static NestedColumnVector createNewNestedColumnVector(
children);
}

long getViewHandle() {
private long createViewHandle() {
long[] childrenColViews = null;
if (children != null) {
childrenColViews = new long[children.size()];
for (int i = 0; i < children.size(); i++) {
childrenColViews[i] = children.get(i).getViewHandle();
try {
if (children != null) {
childrenColViews = new long[children.size()];
for (int i = 0; i < children.size(); i++) {
childrenColViews[i] = children.get(i).createViewHandle();
}
}
long dataAddr = data == null ? 0 : data.address;
long dataLen = data == null ? 0 : data.length;
long offsetAddr = offsets == null ? 0 : offsets.address;
long validAddr = valid == null ? 0 : valid.address;
int nc = nullCount.orElse(ColumnVector.OffHeapState.UNKNOWN_NULL_COUNT).intValue();
return makeCudfColumnView(dataType.typeId.getNativeId(), dataType.getScale(), dataAddr, dataLen,
offsetAddr, validAddr, nc, (int) rows, childrenColViews);
} finally {
if (childrenColViews != null) {
for (int i = 0; i < childrenColViews.length; i++) {
if (childrenColViews[i] != 0) {
deleteColumnView(childrenColViews[i]);
childrenColViews[i] = 0;
}
}
}
}
long dataAddr = data == null ? 0 : data.address;
long dataLen = data == null ? 0 : data.length;
long offsetAddr = offsets == null ? 0 : offsets.address;
long validAddr = valid == null ? 0 : valid.address;
int nc = nullCount.orElse(ColumnVector.OffHeapState.UNKNOWN_NULL_COUNT).intValue();
return makeCudfColumnView(dataType.typeId.getNativeId(), dataType.getScale() , dataAddr, dataLen,
offsetAddr, validAddr, nc, (int)rows, childrenColViews);
}

List<DeviceMemoryBuffer> getBuffersToClose() {
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -190,6 +190,7 @@ cpdef generate_pandas_metadata(table, index):
col_meta["name"] in table._column_names
and table._data[col_meta["name"]].nullable
and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
and col_meta["pandas_type"] != "decimal"
):
col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
col_meta["numpy_type"]
Expand Down
75 changes: 75 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pyarrow as pa
from nvtx import annotate
from pandas._config import get_option
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -2542,6 +2543,80 @@ def insert(self, loc, name, value, nan_as_null=None):

self._data.insert(name, value, loc=loc)

def diff(self, periods=1, axis=0):
"""
First discrete difference of element.
Calculates the difference of a DataFrame element compared with another
element in the DataFrame (default is element in previous row).
Parameters
----------
periods : int, default 1
Periods to shift for calculating difference,
accepts negative values.
axis : {0 or 'index', 1 or 'columns'}, default 0
Take difference over rows (0) or columns (1).
Only row-wise (0) shift is supported.
Returns
-------
DataFrame
First differences of the DataFrame.
Notes
-----
Diff currently only supports numeric dtype columns.
Examples
--------
>>> import cudf
>>> gdf = cudf.DataFrame({'a': [1, 2, 3, 4, 5, 6],
... 'b': [1, 1, 2, 3, 5, 8],
... 'c': [1, 4, 9, 16, 25, 36]})
>>> gdf
a b c
0 1 1 1
1 2 1 4
2 3 2 9
3 4 3 16
4 5 5 25
5 6 8 36
>>> gdf.diff(periods=2)
a b c
0 <NA> <NA> <NA>
1 <NA> <NA> <NA>
2 2 1 8
3 2 2 12
4 2 3 16
5 2 5 20
"""
if not is_integer(periods):
if not (is_float(periods) and periods.is_integer()):
raise ValueError("periods must be an integer")
periods = int(periods)

axis = self._get_axis_from_axis_arg(axis)
if axis != 0:
raise NotImplementedError("Only axis=0 is supported.")

if not all(is_numeric_dtype(i) for i in self.dtypes):
raise NotImplementedError(
"DataFrame.diff only supports numeric dtypes"
)

if abs(periods) > len(self):
df = cudf.DataFrame._from_data(
{
name: column_empty(len(self), dtype=dtype, masked=True)
for name, dtype in zip(self.columns, self.dtypes)
}
)
return df

return self - self.shift(periods=periods)

def drop(
self,
labels=None,
Expand Down
Loading

0 comments on commit 9d4d81a

Please sign in to comment.