Skip to content

Commit

Permalink
Merge branch 'branch-23.12' into streams-io-csv
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi authored Nov 14, 2023
2 parents 751936d + b446a6f commit 413bfb6
Show file tree
Hide file tree
Showing 29 changed files with 354 additions and 161 deletions.
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ fi
# build cudf_kafka Python package
if hasArg cudf_kafka; then
cd ${REPODIR}/python/cudf_kafka
SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \
SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \
SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \
python -m pip install --no-build-isolation --no-deps .
fi
Expand Down
1 change: 1 addition & 0 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g'

# Python CMakeLists updates
sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt
sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt

# cpp libcudf_kafka update
sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-120_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ dependencies:
- cmake>=3.26.4
- cramjam
- cuda-cudart-dev
- cuda-gdb
- cuda-nvcc
- cuda-nvrtc-dev
- cuda-nvtx-dev
Expand Down
13 changes: 0 additions & 13 deletions conda/recipes/cudf_kafka/build.sh
Original file line number Diff line number Diff line change
@@ -1,16 +1,3 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

# This assumes the script is executed from the root of the repo directory
# Need to set CUDA_HOME inside conda environments because the hacked together
# setup.py for cudf-kafka searches that way.
# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
# cudf_kafka to use scikit-build
CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*}
if [[ ${CUDA_MAJOR} == "12" ]]; then
target_name="x86_64-linux"
if [[ ! $(arch) == "x86_64" ]]; then
target_name="sbsa-linux"
fi
export CUDA_HOME="${PREFIX}/targets/${target_name}/"
fi
./build.sh -v cudf_kafka
6 changes: 6 additions & 0 deletions conda/recipes/cudf_kafka/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@ sysroot_version:

cmake_version:
- ">=3.26.4"

cuda_compiler:
- cuda-nvcc

cuda11_compiler:
- nvcc
21 changes: 12 additions & 9 deletions conda/recipes/cudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,31 @@ build:
- SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64]
- SCCACHE_S3_USE_SSL
- SCCACHE_S3_NO_CREDENTIALS
# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
# cudf_kafka to use scikit-build
- RAPIDS_CUDA_VERSION
ignore_run_exports_from:
{% if cuda_major == "11" %}
- {{ compiler('cuda11') }}
{% endif %}

requirements:
build:
- cmake {{ cmake_version }}
- ninja
- {{ compiler('c') }}
- {{ compiler('cxx') }}
- ninja
- sysroot_{{ target_platform }} {{ sysroot_version }}
# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
# cudf_kafka to use scikit-build
{% if cuda_major == "12" %}
- cuda-gdb
{% if cuda_major == "11" %}
- {{ compiler('cuda11') }} ={{ cuda_version }}
{% else %}
- {{ compiler('cuda') }}
{% endif %}
- cuda-version ={{ cuda_version }}
- sysroot_{{ target_platform }} {{ sysroot_version }}
host:
- python
- cython >=3.0.3
- cuda-version ={{ cuda_version }}
- cudf ={{ version }}
- libcudf_kafka ={{ version }}
- scikit-build >=0.13.1
- setuptools
{% if cuda_major == "12" %}
- cuda-cudart-dev
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/text/vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)

auto const vocab_col = [] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_nvbench.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ function(find_and_configure_nvbench)
set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")

rapids_cpm_nvbench()
rapids_cpm_nvbench(BUILD_STATIC)

endfunction()

Expand Down
5 changes: 0 additions & 5 deletions cpp/cmake/thirdparty/patches/nvbench_override.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@
"file" : "${current_json_dir}/nvbench_global_setup.diff",
"issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
"fixed_in" : ""
},
{
"file" : "nvbench/use_existing_fmt.diff",
"issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]",
"fixed_in" : ""
}
]
}
Expand Down
8 changes: 5 additions & 3 deletions cpp/libcudf_kafka/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ include(rapids-export)
include(rapids-find)

project(
CUDA_KAFKA
CUDF_KAFKA
VERSION 23.12.00
LANGUAGES CXX
)
Expand Down Expand Up @@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp)
# ##################################################################################################
# * include paths ---------------------------------------------------------------------------------
target_include_directories(
cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDA_KAFKA_SOURCE_DIR}/include>"
cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDF_KAFKA_SOURCE_DIR}/include>"
"$<INSTALL_INTERFACE:include>"
)

Expand All @@ -85,6 +85,8 @@ set_target_properties(
CXX_STANDARD_REQUIRED ON
)

add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka)

# ##################################################################################################
# * cudf_kafka Install ----------------------------------------------------------------------------
rapids_cmake_install_lib_dir(lib_dir)
Expand All @@ -94,7 +96,7 @@ install(
EXPORT cudf_kafka-exports
)

install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

rapids_export(
INSTALL cudf_kafka
Expand Down
16 changes: 8 additions & 8 deletions cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION)
endif()
endfunction()

set(CUDA_KAFKA_MIN_VERSION_cudf
"${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}"
set(CUDF_KAFKA_MIN_VERSION
"${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}"
)
find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf})
find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION})

if(cudf_REQUIRES_CUDA)
rapids_cuda_init_architectures(CUDA_KAFKA)
rapids_cuda_init_architectures(CUDF_KAFKA)

# Since we are building cudf as part of ourselves we need to enable the CUDA language in the
# top-most scope
enable_language(CUDA)

# Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that
# Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that
# rapids_cuda_init_architectures relies on `project` calling
if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE)
include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}")
if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE)
include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}")
endif()
endif()
2 changes: 1 addition & 1 deletion cpp/libcudf_kafka/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function(ConfigureTest test_name)
add_executable(${test_name} ${ARGN})
set_target_properties(
${test_name}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_KAFKA_BINARY_DIR}/gtests>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
__syncwarp();

for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
// add one if at the edge of a token or at the string's end
count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
// add one if at the edge of a token or if at the string's end
if (*itr) {
count += !(*(itr - 1));
} else {
count += (itr + 1 == d_output_end);
}
}
__syncwarp();

Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/text/tokenize_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)

TEST_F(TextTokenizeTest, VocabularyLongStrings)
{
cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose
cudf::test::strings_column_wrapper vocabulary(
{"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

std::vector<std::string> h_strings(
4,
"the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
"jumped over the mouse house with the dog");
"jumped over the mousé house with the dog ");
cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
auto input_view = cudf::strings_column_view(input);
auto delimiter = cudf::string_scalar(" ");
Expand All @@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)

using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
// clang-format off
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

Expand Down
13 changes: 5 additions & 8 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ files:
- build_all
- build_cpp
- build_wheels
- build_python
- build_python_common
- build_python_cudf
- cudatoolkit
- develop
- docs
Expand Down Expand Up @@ -71,8 +71,8 @@ files:
table: build-system
includes:
- build_all
- build_python
- build_python_common
- build_python_cudf
- build_wheels
py_run_cudf:
output: pyproject
Expand Down Expand Up @@ -138,8 +138,8 @@ files:
extras:
table: build-system
includes:
- build_wheels
- build_python_common
- build_wheels
py_run_cudf_kafka:
output: pyproject
pyproject_dir: python/cudf_kafka
Expand Down Expand Up @@ -259,16 +259,16 @@ dependencies:
- cython>=3.0.3
# TODO: Pin to numpy<1.25 until cudf requires pandas 2
- &numpy numpy>=1.21,<1.25
- scikit-build>=0.13.1
- output_types: [conda, requirements, pyproject]
packages:
# Hard pin the patch version used during the build. This must be kept
# in sync with the version pinned in get_arrow.cmake.
- pyarrow==14.0.1.*
build_python:
build_python_cudf:
common:
- output_types: [conda, requirements, pyproject]
packages:
- scikit-build>=0.13.1
- rmm==23.12.*
- output_types: conda
packages:
Expand Down Expand Up @@ -302,9 +302,6 @@ dependencies:
- cuda-nvrtc-dev
- cuda-nvtx-dev
- libcurand-dev
# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
# cudf_kafka to use scikit-build
- cuda-gdb
- matrix:
cuda: "11.8"
packages:
Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf)
# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C
# /opt/_internal"
find_package(NumPy REQUIRED)
set(targets_using_numpy interop avro csv orc json parquet)
foreach(target IN LISTS targets_using_numpy)
target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
# Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
# target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
endforeach()

set(targets_using_dlpack interop)
foreach(target IN LISTS targets_using_dlpack)
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar


# Cython declarations for the byte-pair-encoding API exposed by the C++ header
# "nvtext/byte_pair_encoding.hpp" in the `nvtext` namespace. The whole extern
# block is marked `nogil`, so the declared functions may be called without
# holding the Python GIL.
cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:

# Opaque declaration of the C++ type `nvtext::bpe_merge_pairs`: no members are
# exposed to Cython (`pass`), so it is only handled through the pointer and
# reference types used in the signatures below.
cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
pass

# Takes a column view (`merge_pairs`) and returns a `unique_ptr` owning a
# `bpe_merge_pairs` object.
# `except +` makes Cython convert a thrown C++ exception into a Python one.
# NOTE(review): the expected contents/format of the `merge_pairs` column are
# not visible here -- confirm against the C++ header.
cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
const column_view &merge_pairs
) except +

# Takes the input column view (`strings`), a `bpe_merge_pairs` reference and a
# `string_scalar` separator, and returns a `unique_ptr` owning a new column.
# `except +` makes Cython convert a thrown C++ exception into a Python one.
# NOTE(review): presumably the returned column holds the encoded strings with
# `separator` inserted between pieces -- confirm against the C++ header.
cdef unique_ptr[column] byte_pair_encoding(
const column_view &strings,
const bpe_merge_pairs &merge_pairs,
const string_scalar &separator
) except +
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
# =============================================================================

set(cython_sources
edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
Loading

0 comments on commit 413bfb6

Please sign in to comment.