Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes devices vector alloc to fix seg fault, removes unused RAFT code in PLC, re-enables full CI testing #3167

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions ci/test_notebooks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@ rapids-mamba-retry install \
NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
NOTEBOOK_LIST="$(realpath "$(dirname "$0")/gpu/notebook_list.py")"
EXITCODE=0
# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# trap "EXITCODE=1" ERR
trap "EXITCODE=1" ERR


pushd notebooks
Expand All @@ -52,13 +49,6 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do
pushd "$(dirname "${nb}")"
nvidia-smi
${NBTEST} "${nbBasename}"
# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
exitcode=$?
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
EXITCODE=1
fi
echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE"
echo
popd
Expand Down
30 changes: 5 additions & 25 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in pylibcugraph"
fi
Expand All @@ -85,11 +81,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph"
fi
Expand All @@ -105,11 +97,7 @@ pytest \
cugraph/pytest-based/bench_algos.py
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph benchmarks"
fi
Expand All @@ -130,11 +118,7 @@ pytest \
.
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph-pyg"
fi
Expand All @@ -157,11 +141,7 @@ pytest \
tests
exitcode=$?

# FIXME: This is temporary until a crash that occurs at cleanup is fixed. This
# allows PRs that pass tests to pass even if they crash with a Seg Fault or
# other error that results in 139. Remove this ASAP!
# if (( ${exitcode} != 0 )); then
if (( (${exitcode} != 0) && (${exitcode} != 139) )); then
if (( ${exitcode} != 0 )); then
SUITEERROR=${exitcode}
echo "FAILED: 1 or more tests in cugraph-service"
fi
Expand Down
58 changes: 32 additions & 26 deletions cpp/include/cugraph/utilities/misc_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,33 +45,39 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
thrust::make_counting_iterator(size_t{1}),
[approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());

handle.sync_stream();
if (num_chunks > 1) {
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());

return std::make_tuple(h_vertex_offsets, h_edge_offsets);
handle.sync_stream();

return std::make_tuple(h_vertex_offsets, h_edge_offsets);
} else {
return std::make_tuple(std::vector<vertex_t>{{0, num_vertices}},
std::vector<edge_t>{{0, num_edges}});
}
}

template <typename T>
Expand Down
53 changes: 52 additions & 1 deletion python/pylibcugraph/pylibcugraph/tests/test_graph_sg.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -109,3 +109,54 @@ def test_sg_graph(graph_data):
renumber=False,
do_expensive_check=False,
)


def test_SGGraph_create_from_cudf():
    """
    Smoke test: construct an SGGraph from the columns of a cuDF DataFrame.

    The test passes as long as construction completes without raising an
    exception or crashing. It currently does not assert correctness of the
    resulting graph in any way.
    """
    # FIXME: other PLC tests are using cudf so this does not add a new dependency,
    # however, PLC tests should consider having fewer external dependencies, meaning
    # this and other tests would be changed to not use cudf.
    import cudf

    # Importing this cugraph class seems to cause a crash more reliably (2023-01-22)
    # from cugraph.structure.graph_implementation import simpleGraphImpl
    from pylibcugraph import (
        ResourceHandle,
        GraphProperties,
        SGGraph,
    )

    # Progress messages are flushed immediately - presumably so the last
    # completed step is still visible if the process dies abruptly.
    print("get edgelist...", end="", flush=True)
    # Three weighted edges in COO form: 0 -> 1, 1 -> 2, 2 -> 4.
    edgelist = cudf.DataFrame(
        {
            "src": [0, 1, 2],
            "dst": [1, 2, 4],
            "wgt": [0.0, 0.1, 0.2],
        }
    )

    print("edgelist = ", edgelist)
    print("done", flush=True)
    print("create Graph...", end="", flush=True)

    graph_props = GraphProperties(is_multigraph=False, is_symmetric=False)

    # The DataFrame columns are handed to SGGraph directly as the COO
    # source / destination / weight arrays; no edge ids or edge types are given.
    plc_graph = SGGraph(
        resource_handle=ResourceHandle(),
        graph_properties=graph_props,
        src_or_offset_array=edgelist["src"],
        dst_or_index_array=edgelist["dst"],
        weight_array=edgelist["wgt"],
        edge_id_array=None,
        edge_type_array=None,
        store_transposed=False,
        renumber=False,
        do_expensive_check=True,
        input_array_format="COO",
    )
    print("done", flush=True)
    print(f"created SGGraph {plc_graph=}", flush=True)