From ced22e24398629c25cd3ef6eb1bda7ba00997231 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Fri, 1 Nov 2024 13:12:13 -0500
Subject: [PATCH] add notebook tests, build.sh args

---
 .github/workflows/pr.yaml   |  14 ++++
 .github/workflows/test.yaml |  12 ++++
 build.sh                    |  19 ++++-
 ci/test_notebooks.sh        |   6 +-
 ci/test_python.sh           |   7 ++
 ci/utils/nbtest.sh          |   2 +-
 datasets/get_test_data.sh   | 128 ++++++++++++++++++++++++++++++++++++
 7 files changed, 185 insertions(+), 3 deletions(-)
 create mode 100755 datasets/get_test_data.sh

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 3ae4d94..850ec7e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,6 +18,7 @@ jobs:
       - conda-cpp-tests
       - conda-python-build
       - conda-python-tests
+      - conda-notebook-tests
       - wheel-build-pylibwholegraph
       - wheel-tests-pylibwholegraph
       - wheel-build-cugraph-dgl
@@ -51,6 +52,7 @@ jobs:
           - '!CONTRIBUTING.md'
           - '!README.md'
           - '!docs/**'
+          - '!readme_pages/**'
         test_python:
           - '**'
           - '!.devcontainers/**'
@@ -59,6 +61,7 @@ jobs:
           - '!docs/**'
           - '!img/**'
           - '!notebooks/**'
+          - '!readme_pages/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
@@ -83,6 +86,17 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: pull-request
+  conda-notebook-tests:
+    needs: [conda-python-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks
+    with:
+      build_type: pull-request
+      node_type: "gpu-v100-latest-1"
+      arch: "amd64"
+      container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10"
+      run_script: "ci/test_notebooks.sh"
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index cda7a18..8fa06d8 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -22,6 +22,18 @@ jobs:
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
+  conda-notebook-tests:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      node_type: "gpu-v100-latest-1"
+      arch: "amd64"
+      container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.12"
+      run_script: "ci/test_notebooks.sh"
   conda-python-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
diff --git a/build.sh b/build.sh
index edece26..c6f4955 100755
--- a/build.sh
+++ b/build.sh
@@ -29,12 +29,15 @@ VALIDARGS="
    pylibwholegraph
    libwholegraph
    tests
+   benchmarks
    all
    -v
    -g
    -n
    --pydevelop
    --allgpuarch
+   --enable-nvshmem
+   --compile-cmd
    --clean
    -h
    --help
@@ -49,6 +52,7 @@ HELP="$0 [<target> ...] [<flag> ...]
    pylibwholegraph          - build the pylibwholegraph Python package
    libwholegraph            - build the libwholegraph library
    tests                    - build the C++ tests
+   benchmarks               - build benchmarks
    all                      - build everything
  and <flag> is:
    -v                       - verbose build mode
@@ -56,6 +60,8 @@ HELP="$0 [<target> ...] [<flag> ...]
    -n                       - do not install after a successful build (does not affect Python packages)
    --pydevelop              - install the Python packages in editable mode
    --allgpuarch             - build for all supported GPU architectures
+   --enable-nvshmem         - build with nvshmem support (beta).
+   --compile-cmd            - only output compile commands (invoke CMake without build)
    --clean                  - clean an individual target (note: to do a complete rebuild, use the
                               clean target described above)
    -h                       - print this text
@@ -140,11 +146,22 @@ if hasArg --pydevelop; then
     PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e"
 fi
 
+if hasArg --enable-nvshmem; then
+    BUILD_WITH_NVSHMEM=ON
+else
+    BUILD_WITH_NVSHMEM=OFF
+fi
 if hasArg tests; then
     BUILD_TESTS=ON
 else
     BUILD_TESTS=OFF
 fi
 
+if hasArg benchmarks; then
+    BUILD_BENCHMARKS=ON
+else
+    BUILD_BENCHMARKS=OFF
+fi
+
 # If clean or uninstall targets given, run them prior to any other steps
 if hasArg uninstall; then
@@ -250,7 +267,7 @@ if hasArg cugraph-pyg || buildDefault || hasArg all; then
     fi
 fi
 
-# Install the cugraph-dgl extensions for DGL
+# Build and install the cugraph-dgl Python package
 if hasArg cugraph-dgl || buildDefault ||hasArg all; then
     if hasArg --clean; then
         cleanPythonDir ${REPODIR}/python/cugraph-dgl
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index 31ec560..59b60e4 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -5,6 +5,8 @@ set -Eeuo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate notebook testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -27,7 +29,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcugraph pylibcugraph cugraph
+  "libcugraph=${RAPIDS_VERSION}" \
+  "pylibcugraph=${RAPIDS_VERSION}" \
+  "cugraph=${RAPIDS_VERSION}"
 
 NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
 NOTEBOOK_LIST="$(realpath "$(dirname "$0")/notebook_list.py")"
diff --git a/ci/test_python.sh b/ci/test_python.sh
index dd5f539..9a03c24 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -27,6 +27,13 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
 RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"}
 mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}"
 
+# RAPIDS_DATASET_ROOT_DIR is used by test scripts
+export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
+mkdir -p "${RAPIDS_DATASET_ROOT_DIR}"
+pushd "${RAPIDS_DATASET_ROOT_DIR}"
+./get_test_data.sh --benchmark
+popd
+
 EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh
index 91af633..faf7d28 100755
--- a/ci/utils/nbtest.sh
+++ b/ci/utils/nbtest.sh
@@ -60,7 +60,7 @@ for nb in $*; do
     echo --------------------------------------------------------------------------------
     echo STARTING: ${NBNAME}
     echo --------------------------------------------------------------------------------
-    jupyter nbconvert --to script ${NBFILENAME} --output ${NBTMPDIR}/${NBNAME}-test
+    jupyter nbconvert --to python ${NBFILENAME} --output ${NBTMPDIR}/${NBNAME}-test
     echo "${MAGIC_OVERRIDE_CODE}" > ${NBTMPDIR}/tmpfile
     cat ${NBTESTSCRIPT} >> ${NBTMPDIR}/tmpfile
     mv ${NBTMPDIR}/tmpfile ${NBTESTSCRIPT}
diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh
new file mode 100755
index 0000000..6778166
--- /dev/null
+++ b/datasets/get_test_data.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -o pipefail
+
+# Ensure we're in the cugraph/datasets dir
+cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )";
+
+# Update this to add/remove/change a dataset, using the following format:
+#
+#  comment about the dataset
+#  dataset download URL
+#  destination dir to untar to
+#  blank line separator
+#
+# FIXME: some test data needs to be extracted to "benchmarks", which is
+# confusing now that there are dedicated datasets for benchmarks.
+CPP_CI_DATASET_DATA="
+# ~10s download
+https://data.rapids.ai/cugraph/test/cpp_ci_datasets.tgz
+test
+"
+
+BASE_DATASET_DATA="
+# ~22s download
+https://data.rapids.ai/cugraph/test/datasets.tgz
+test
+
+# ~14s download
+https://data.rapids.ai/cugraph/test/ref/pagerank.tgz
+test/ref
+
+# ~1s download
+https://data.rapids.ai/cugraph/test/ref/sssp.tgz
+test/ref
+
+# ~15s download
+https://data.rapids.ai/cugraph/benchmark/hibench/hibench_1_large.tgz
+benchmark
+
+# ~1s download
+https://data.rapids.ai/cugraph/benchmark/hibench/hibench_1_small.tgz
+benchmark
+
+# ~0.6s download
+https://data.rapids.ai/cugraph/test/tsplib/datasets.tar.gz
+tsplib
+"
+
+EXTENDED_DATASET_DATA="
+# ~42s download - tests using this dataset are currently not run in test.sh with --quick
+https://data.rapids.ai/cugraph/benchmark/hibench/hibench_1_huge.tgz
+benchmark
+"
+
+BENCHMARK_DATASET_DATA="
+# ~90s download - these are used for benchmark runs (code in /benchmarks)
+https://data.rapids.ai/cugraph/benchmark/benchmark_csv_data.tgz
+csv
+"
+
+SELF_LOOPS_DATASET_DATA="
+# ~1s download
+https://data.rapids.ai/cugraph/benchmark/benchmark_csv_data_self_loops.tgz
+self_loops
+"
+################################################################################
+# Do not change the script below this line if only adding/updating a dataset
+
+NUMARGS=$#
+ARGS=$*
+function hasArg {
+    (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
+}
+
+if hasArg -h || hasArg --help; then
+    echo "$0 [--subset | --benchmark | --cpp_ci_subset | --self_loops]"
+    exit 0
+fi
+
+# Select the datasets to install
+if hasArg "--benchmark"; then
+    DATASET_DATA="${BENCHMARK_DATASET_DATA}"
+elif hasArg "--subset"; then
+    DATASET_DATA="${BASE_DATASET_DATA}"
+elif hasArg "--cpp_ci_subset"; then
+    DATASET_DATA="${CPP_CI_DATASET_DATA}"
+elif hasArg "--self_loops"; then
+    DATASET_DATA="${SELF_LOOPS_DATASET_DATA}"
+# Do not include benchmark datasets by default - too big
+else
+    DATASET_DATA="${BASE_DATASET_DATA} ${EXTENDED_DATASET_DATA}"
+fi
+
+URLS=($(echo "$DATASET_DATA"|awk '{if (NR%4 == 3) print $0}'))  # extract the URL lines to a bash array
+DESTDIRS=($(echo "$DATASET_DATA"|awk '{if (NR%4 == 0) print $0}'))  # extract the destination dir lines to a bash array
+
+echo Downloading ...
+
+# Download all tarfiles to a tmp dir
+mkdir -p tmp
+cd tmp
+for url in ${URLS[*]}; do
+    time wget -N --progress=dot:giga ${url}
+done
+cd ..
+
+# create the destination dirs
+mkdir -p "${DESTDIRS[@]}"
+
+# Iterate over the arrays and untar the nth tarfile to the nth dest directory.
+# The tarfile name is derived from the download url.
+echo Decompressing ...
+for index in ${!DESTDIRS[*]}; do
+    echo "tmp/$(basename "${URLS[$index]}") -C ${DESTDIRS[$index]}" | tr '\n' '\0'
+done | xargs -0 -t -r -n1 -P$(nproc --all) sh -c 'tar -xzvf $0 --overwrite'
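
Editor's note: for reference, a usage sketch of the entry points this patch adds or extends. The flag and target names come from the diff above; the --benchmark invocation mirrors what ci/test_python.sh now runs in CI, and paths assume the repository root. This is illustrative only, not part of the patch.

    # build libwholegraph plus its C++ tests and the new benchmarks target
    ./build.sh libwholegraph tests benchmarks

    # fetch only the benchmark CSV datasets into ./datasets, as ci/test_python.sh does
    cd datasets && ./get_test_data.sh --benchmark

    # default invocation: base + extended test datasets (no benchmark data)
    ./get_test_data.sh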