Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes devices vector alloc to fix seg fault, removes unused RAFT code in PLC, re-enables full CI testing #3167

Merged
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions ci/test_notebooks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@ rapids-mamba-retry install \
NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
NOTEBOOK_LIST="$(realpath "$(dirname "$0")/gpu/notebook_list.py")"
EXITCODE=0
# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# trap "EXITCODE=1" ERR
trap "EXITCODE=1" ERR


pushd notebooks
Expand All @@ -52,13 +49,6 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do
pushd "$(dirname "${nb}")"
nvidia-smi
${NBTEST} "${nbBasename}"
# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
exitcode=$?
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
EXITCODE=1
fi
echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE"
echo
popd
Expand Down
30 changes: 5 additions & 25 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in pylibcugraph"
fi
Expand All @@ -85,11 +81,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph"
fi
Expand All @@ -105,11 +97,7 @@ pytest \
cugraph/pytest-based/bench_algos.py
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph benchmarks"
fi
Expand All @@ -130,11 +118,7 @@ pytest \
.
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph-pyg"
fi
Expand All @@ -157,11 +141,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph-service"
fi
Expand Down
58 changes: 32 additions & 26 deletions cpp/include/cugraph/utilities/misc_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,33 +45,39 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
thrust::make_counting_iterator(size_t{1}),
[approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());

handle.sync_stream();
if (num_chunks > 1) {
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());

return std::make_tuple(h_vertex_offsets, h_edge_offsets);
handle.sync_stream();

return std::make_tuple(h_vertex_offsets, h_edge_offsets);
} else {
return std::make_tuple(std::vector<vertex_t>{{0, num_vertices}},
std::vector<edge_t>{{0, num_edges}});
}
}

template <typename T>
Expand Down
14 changes: 10 additions & 4 deletions cpp/include/cugraph_c/sampling_algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,11 @@ void cugraph_sample_result_free(cugraph_sample_result_t* result);
* @param [in] handle Handle for accessing resources
* @param [in] srcs Device array view to populate srcs
* @param [in] dsts Device array view to populate dsts
* @param [in] weights Device array view to populate weights
* @param [in] counts Device array view to populate counts
* @param [in] edge_id Device array view to populate edge_id (can be NULL)
* @param [in] edge_type Device array view to populate edge_type (can be NULL)
* @param [in] wgt Device array view to populate wgt (can be NULL)
* @param [in] hop Device array view to populate hop
* @param [in] label Device array view to populate label (can be NULL)
* @param [out] result Pointer to the location to store the
* cugraph_sample_result_t*
* @param [out] error Pointer to an error object storing details of
Expand All @@ -367,8 +370,11 @@ cugraph_error_code_t cugraph_test_sample_result_create(
const cugraph_resource_handle_t* handle,
const cugraph_type_erased_device_array_view_t* srcs,
const cugraph_type_erased_device_array_view_t* dsts,
const cugraph_type_erased_device_array_view_t* weights,
const cugraph_type_erased_device_array_view_t* counts,
const cugraph_type_erased_device_array_view_t* edge_id,
const cugraph_type_erased_device_array_view_t* edge_type,
const cugraph_type_erased_device_array_view_t* wgt,
const cugraph_type_erased_device_array_view_t* hop,
const cugraph_type_erased_device_array_view_t* label,
cugraph_sample_result_t** result,
cugraph_error_t** error);

Expand Down
80 changes: 62 additions & 18 deletions cpp/src/c_api/uniform_neighbor_sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,8 +526,11 @@ extern "C" cugraph_error_code_t cugraph_test_sample_result_create(
const cugraph_resource_handle_t* handle,
const cugraph_type_erased_device_array_view_t* srcs,
const cugraph_type_erased_device_array_view_t* dsts,
const cugraph_type_erased_device_array_view_t* weights,
const cugraph_type_erased_device_array_view_t* counts,
const cugraph_type_erased_device_array_view_t* edge_id,
const cugraph_type_erased_device_array_view_t* edge_type,
const cugraph_type_erased_device_array_view_t* wgt,
const cugraph_type_erased_device_array_view_t* hop,
const cugraph_type_erased_device_array_view_t* label,
cugraph_sample_result_t** result,
cugraph_error_t** error)
{
Expand Down Expand Up @@ -567,36 +570,77 @@ extern "C" cugraph_error_code_t cugraph_test_sample_result_create(
device_array_unique_ptr_t new_device_dsts(new_device_dsts_ptr,
&cugraph_type_erased_device_array_free);

// copy weights to new device array
cugraph_type_erased_device_array_t* new_device_weights_ptr{nullptr};
error_code = cugraph_type_erased_device_array_create_from_view(
handle, weights, &new_device_weights_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;
// copy edge_id to new device array
cugraph_type_erased_device_array_t* new_device_edge_id_ptr{nullptr};

if (edge_id != NULL) {
error_code = cugraph_type_erased_device_array_create_from_view(
handle, edge_id, &new_device_edge_id_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;
}

device_array_unique_ptr_t new_device_weights(new_device_weights_ptr,
device_array_unique_ptr_t new_device_edge_id(new_device_edge_id_ptr,
&cugraph_type_erased_device_array_free);

// copy counts to new device array
cugraph_type_erased_device_array_t* new_device_counts_ptr{nullptr};
error_code = cugraph_type_erased_device_array_create_from_view(
handle, counts, &new_device_counts_ptr, error);
// copy edge_type to new device array
cugraph_type_erased_device_array_t* new_device_edge_type_ptr{nullptr};

if (edge_type != NULL) {
error_code = cugraph_type_erased_device_array_create_from_view(
handle, edge_type, &new_device_edge_type_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;
}

device_array_unique_ptr_t new_device_edge_type(new_device_edge_type_ptr,
&cugraph_type_erased_device_array_free);

// copy wgt to new device array
cugraph_type_erased_device_array_t* new_device_wgt_ptr{nullptr};
if (wgt != NULL) {
error_code =
cugraph_type_erased_device_array_create_from_view(handle, wgt, &new_device_wgt_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;
}

device_array_unique_ptr_t new_device_wgt(new_device_wgt_ptr,
&cugraph_type_erased_device_array_free);

// copy hop to new device array
cugraph_type_erased_device_array_t* new_device_hop_ptr{nullptr};
error_code =
cugraph_type_erased_device_array_create_from_view(handle, hop, &new_device_hop_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;

device_array_unique_ptr_t new_device_counts(new_device_counts_ptr,
&cugraph_type_erased_device_array_free);
device_array_unique_ptr_t new_device_hop(new_device_hop_ptr,
&cugraph_type_erased_device_array_free);

// copy label to new device array
cugraph_type_erased_device_array_t* new_device_label_ptr{nullptr};

if (label != NULL) {
error_code = cugraph_type_erased_device_array_create_from_view(
handle, label, &new_device_label_ptr, error);
if (error_code != CUGRAPH_SUCCESS) return error_code;
}

device_array_unique_ptr_t new_device_label(new_device_label_ptr,
&cugraph_type_erased_device_array_free);

// create new cugraph_sample_result_t
*result = reinterpret_cast<cugraph_sample_result_t*>(new cugraph::c_api::cugraph_sample_result_t{
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(
new_device_srcs.release()),
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(
new_device_dsts.release()),
nullptr,
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(
new_device_weights.release()),
nullptr,
new_device_edge_id.release()),
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(
new_device_counts.release())});
new_device_edge_type.release()),
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(new_device_wgt.release()),
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(
new_device_label.release()),
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_t*>(new_device_hop.release()),
nullptr});

return CUGRAPH_SUCCESS;
}
Expand Down
4 changes: 2 additions & 2 deletions python/pylibcugraph/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -64,7 +64,7 @@ if (NOT cugraph_FOUND)
if(CUGRAPH_BUILD_WHEELS)
# Statically link dependencies if building wheels
set(CUDA_STATIC_RUNTIME ON)
set(USE_RAFT_STATIC ON)
set(USE_RAFT_STATIC OFF)
rlratzel marked this conversation as resolved.
Show resolved Hide resolved
set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
set(ALLOW_CLONE_CUGRAPH_OPS ON)
Expand Down
1 change: 0 additions & 1 deletion python/pylibcugraph/pylibcugraph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# =============================================================================

add_subdirectory(components)
add_subdirectory(raft/common)
add_subdirectory(internal_types)
add_subdirectory(testing)

Expand Down
7 changes: 5 additions & 2 deletions python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,11 @@ cdef extern from "cugraph_c/algorithms.h":
const cugraph_resource_handle_t* handle,
const cugraph_type_erased_device_array_view_t* srcs,
const cugraph_type_erased_device_array_view_t* dsts,
const cugraph_type_erased_device_array_view_t* weights,
const cugraph_type_erased_device_array_view_t* counts,
const cugraph_type_erased_device_array_view_t* edge_id,
const cugraph_type_erased_device_array_view_t* edge_type,
const cugraph_type_erased_device_array_view_t* wgt,
const cugraph_type_erased_device_array_view_t* hop,
const cugraph_type_erased_device_array_view_t* label,
cugraph_sample_result_t** result,
cugraph_error_t** error
)
Expand Down
Empty file.
26 changes: 0 additions & 26 deletions python/pylibcugraph/pylibcugraph/raft/common/CMakeLists.txt

This file was deleted.

6 changes: 0 additions & 6 deletions python/pylibcugraph/pylibcugraph/raft/common/TODO

This file was deleted.

Empty file.
22 changes: 0 additions & 22 deletions python/pylibcugraph/pylibcugraph/raft/common/cuda.pxd

This file was deleted.

Loading