From 8829aa9ece9b57fd6bca795f0633b066f50f8afb Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 17 Jul 2024 12:24:47 -0700
Subject: [PATCH] Ensure `get_test_data.sh` doesn't re-download datasets
 (#4536)

* support calling `get_test_data.sh` from any directory
* only download datasets if newer than local copies
* extract datasets in parallel

After this PR, datasets will be stored in `datasets/tmp` and not deleted
after extraction, because `wget -N` (aka `--timestamping`) relies on
comparing the local and remote mtimes.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4536
---
 datasets/get_test_data.sh | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh
index be2126be771..eea789ef3e3 100755
--- a/datasets/get_test_data.sh
+++ b/datasets/get_test_data.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,6 +15,9 @@
 set -e
 set -o pipefail
 
+# Ensure we're in the cugraph/datasets dir
+cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )";
+
 # Update this to add/remove/change a dataset, using the following format:
 #
 # comment about the dataset
@@ -99,28 +102,19 @@ DESTDIRS=($(echo "$DATASET_DATA"|awk '{if (NR%4 == 0) print $0}')) # extract 4t
 echo Downloading ...
 
 # Download all tarfiles to a tmp dir
-rm -rf tmp
-mkdir tmp
+mkdir -p tmp
 cd tmp
 for url in ${URLS[*]}; do
-   time wget --progress=dot:giga ${url}
+   time wget -N --progress=dot:giga ${url}
 done
 cd ..
 
-# Setup the destination dirs, removing any existing ones first!
-for index in ${!DESTDIRS[*]}; do
-    rm -rf ${DESTDIRS[$index]}
-done
-for index in ${!DESTDIRS[*]}; do
-    mkdir -p ${DESTDIRS[$index]}
-done
+# create the destination dirs
+mkdir -p "${DESTDIRS[@]}"
 
 # Iterate over the arrays and untar the nth tarfile to the nth dest directory.
 # The tarfile name is derived from the download url.
 echo Decompressing ...
 for index in ${!DESTDIRS[*]}; do
-    tfname=$(basename ${URLS[$index]})
-    tar xvzf tmp/${tfname} -C ${DESTDIRS[$index]}
-done
-
-rm -rf tmp
+    echo "tmp/$(basename "${URLS[$index]}") -C ${DESTDIRS[$index]}" | tr '\n' '\0'
+done | xargs -0 -t -r -n1 -P$(nproc --all) sh -c 'tar -xzvf $0 --overwrite'
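
A note on the new extraction step: the `done | xargs -0 ...` line is the densest part of the change, so here is a small, self-contained sketch of the same pattern. The tarball names (`demo_a`, `demo_b`), the `https://example.com/...` URLs, and the `out/` destination directories are placeholders invented for illustration, not the real cugraph datasets; GNU tar, xargs, and coreutils are assumed (for `--overwrite`, `-P`, and `nproc`).

```bash
#!/bin/bash
# Toy reproduction of the NUL-delimited xargs fan-out used for extraction.
# All names here (demo_a, demo_b, out/) are made up for illustration.
set -euo pipefail

workdir=$(mktemp -d)
cd "$workdir"

# Stand-ins for the downloaded dataset tarballs.
mkdir -p tmp
for name in demo_a demo_b; do
    mkdir -p "src/$name"
    echo "hello from $name" > "src/$name/payload.txt"
    tar -czf "tmp/$name.tar.gz" -C "src/$name" payload.txt
done

URLS=(https://example.com/demo_a.tar.gz https://example.com/demo_b.tar.gz)
DESTDIRS=(out/demo_a out/demo_b)
mkdir -p "${DESTDIRS[@]}"

# Same shape as the patch: each iteration prints one NUL-terminated
# "tarfile -C destdir" record; xargs -0 -n1 -P runs one tar per record,
# up to one process per core.  The record arrives as $0 inside `sh -c`,
# and leaving $0 unquoted lets the shell split it back into tar's arguments.
for index in "${!DESTDIRS[@]}"; do
    echo "tmp/$(basename "${URLS[$index]}") -C ${DESTDIRS[$index]}" | tr '\n' '\0'
done | xargs -0 -t -r -n1 -P"$(nproc --all)" sh -c 'tar -xzvf $0 --overwrite'

ls out/demo_a out/demo_b   # each should now contain payload.txt
```

The real script installs the same pattern in its last hunk, only with the actual dataset URLs and destination directories. The related `wget -N` change works because wget compares the local file's mtime against the server's `Last-Modified` header and skips files that have not changed, which is why `datasets/tmp` must now persist between runs instead of being removed.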