Improve multi-gpu testing (#1017)
Alberto Alvarez authored Sep 14, 2021
1 parent b59a670 commit 9f141c6
Showing 5 changed files with 111 additions and 89 deletions.
51 changes: 30 additions & 21 deletions ci/test_integration.sh
@@ -1,34 +1,43 @@
#!/bin/bash

# Call this script with name of container as parameter
# [merlin-training, merlin-tensorflow-training,
# merlin-pytorch-training, merlin-inference]
# Call this script with:
# 1. Name of container as first parameter
# [merlin-training, merlin-tensorflow,
# merlin-pytorch, merlin-inference]
#
# 2. Devices to use:
# [0; 0,1; 0,1,..,n-1]
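#
# Example invocation (hypothetical values; adjust the device list to your machine):
#   bash test_integration.sh merlin-training 0,1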

# Get the latest NVTabular version
cd /nvtabular/
git pull origin main

# Run tests for all containers
pytest tests/integration/test_notebooks.py::test_criteo_example
pytest tests/integration/test_notebooks.py::test_rossman_example
pytest tests/integration/test_notebooks.py::test_movielens_example
container=$1
config="--devices $2"  # pytest flag selecting the GPUs to test on

# Run tests for all containers but inference
if [ "$container" != "merlin-inference" ]; then
#pytest $config tests/integration/test_notebooks.py::test_criteo
pytest $config tests/integration/test_notebooks.py::test_rossman
pytest $config tests/integration/test_notebooks.py::test_movielens
fi

# Run tests for specific containers
if [ "$1" == "merlin-training" ]; then
pytest tests/integration/test_nvt_hugectr.py::test_training
elif [ "$1" == "merlin-tensorflow-trainig" ]; then
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference
elif [ "$1" == "merlin-pytorch-training" ]; then
echo "Nothing specific for $1 yet"
elif [ "$1" == "merlin-inference" ]; then
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference_triton
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference_triton_mt
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference_triton
pytest tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference_triton_mt
pytest tests/integration/test_nvt_hugectr.py::test_inference
if [ "$container" == "merlin-training" ]; then
pytest $config tests/integration/test_nvt_hugectr.py::test_training
elif [ "$container" == "merlin-tensorflow" ]; then
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference
elif [ "$container" == "merlin-pytorch" ]; then
echo "Nothing specific for $container yet"
elif [ "$container" == "merlin-inference" ]; then
#pytest $config tests/integration/test_notebooks.py::test_inference
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference_triton
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_rossmann_inference_triton_mt
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference_triton
pytest $config tests/integration/test_nvt_tf_inference.py::test_nvt_tf_movielens_inference_triton_mt
pytest $config tests/integration/test_nvt_hugectr.py::test_inference
else
echo "INVALID Container name"
exit 1
fi
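
The `--devices` flag passed through `$config` is not a built-in pytest option, so the test suite presumably registers it in a conftest.py that this commit does not show. A minimal, hypothetical sketch of such a registration:

import pytest

# conftest.py (hypothetical; not part of this commit)
def pytest_addoption(parser):
    # register the --devices option that test_integration.sh passes to pytest
    parser.addoption(
        "--devices",
        default="0",
        help="comma-separated GPU ids to test on, e.g. '0' or '0,1'",
    )

@pytest.fixture
def devices(request):
    # tests read the selected GPUs through this fixture
    return request.config.getoption("--devices")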

82 changes: 43 additions & 39 deletions examples/getting-started-movielens/03-Training-with-HugeCTR.ipynb
@@ -3,7 +3,7 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "5222845b",
"id": "a81ae6a1",
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "markdown",
"id": "188f944e",
"id": "b287c379",
"metadata": {},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
@@ -42,7 +42,7 @@
},
{
"cell_type": "markdown",
"id": "c01e2e13",
"id": "a56211a5",
"metadata": {},
"source": [
"### Why using HugeCTR?\n",
@@ -57,7 +57,7 @@
},
{
"cell_type": "markdown",
"id": "1cbbbbd2",
"id": "fef851ad",
"metadata": {},
"source": [
"### Other Features of HugeCTR\n",
@@ -72,15 +72,15 @@
},
{
"cell_type": "markdown",
"id": "d0c6bb06",
"id": "2548be9c",
"metadata": {},
"source": [
"### Getting Started"
]
},
{
"cell_type": "markdown",
"id": "d86e3d0a",
"id": "bff8df20",
"metadata": {},
"source": [
"In this example, we will train a neural network with HugeCTR. We will use preprocessed datasets generated via NVTabular in `02-ETL-with-NVTabular` notebook."
@@ -89,7 +89,7 @@
{
"cell_type": "code",
"execution_count": 2,
"id": "857c98a8",
"id": "4de873b8",
"metadata": {},
"outputs": [],
"source": [
@@ -100,7 +100,7 @@
},
{
"cell_type": "markdown",
"id": "c455eaec",
"id": "a43e28df",
"metadata": {},
"source": [
"We define our base directory, containing the data."
@@ -109,19 +109,22 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "b0cf97ca",
"id": "61470a29",
"metadata": {},
"outputs": [],
"source": [
"# path to preprocessed data\n",
"INPUT_DATA_DIR = os.environ.get(\n",
" \"INPUT_DATA_DIR\", os.path.expanduser(\"~/nvt-examples/movielens/data/\")\n",
")"
")\n",
"\n",
"# path to save the models\n",
"MODEL_BASE_DIR = os.environ.get(\"MODEL_BASE_DIR\", os.path.expanduser(\"~/nvt-examples/\"))"
]
},
{
"cell_type": "markdown",
"id": "8bbbba9a",
"id": "d430ba99",
"metadata": {},
"source": [
"Let's load our saved workflow from the `02-ETL-with-NVTabular` notebook."
@@ -130,7 +133,7 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "3d9162fa",
"id": "57de5919",
"metadata": {},
"outputs": [],
"source": [
@@ -140,7 +143,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "9e84a4b2",
"id": "acbcc269",
"metadata": {},
"outputs": [
{
@@ -163,15 +166,15 @@
},
{
"cell_type": "markdown",
"id": "598f0f9a",
"id": "79fdd980",
"metadata": {},
"source": [
"Note: We do not have numerical output columns"
]
},
{
"cell_type": "markdown",
"id": "10363084",
"id": "6684e11c",
"metadata": {},
"source": [
"Let's clear existing directory and create the output folders."
@@ -180,33 +183,34 @@
{
"cell_type": "code",
"execution_count": 6,
"id": "32eccdbf",
"id": "bca2b7c3",
"metadata": {},
"outputs": [],
"source": [
"!rm -r ~/nvt-examples/model/movielens_hugectr\n",
"!mkdir -p ~/nvt-examples/model/movielens_hugectr/1"
"MODEL_DIR = os.path.join(INPUT_DATA_DIR, \"model/movielens_hugectr/\")\n",
"!rm -r MODEL_DIR\n",
"!mkdir MODEL_DIR + \"1\""
]
},
{
"cell_type": "markdown",
"id": "f8b2cc0d",
"id": "4b5ea67a",
"metadata": {},
"source": [
"## Scaling Accelerated training with HugeCTR"
]
},
{
"cell_type": "markdown",
"id": "e41c8447",
"id": "13f42493",
"metadata": {},
"source": [
"HugeCTR is a deep learning framework dedicated to recommendation systems. It is written in CUDA C++. As HugeCTR optimizes the training in CUDA++, we need to define the training pipeline and model architecture and execute it via the commandline. We will use the Python API, which is similar to Keras models."
]
},
{
"cell_type": "markdown",
"id": "73327484",
"id": "a04e46ef",
"metadata": {},
"source": [
"HugeCTR has three main components:\n",
@@ -218,7 +222,7 @@
},
{
"cell_type": "markdown",
"id": "927a46bb",
"id": "aecac7da",
"metadata": {},
"source": [
"**Solver**\n",
@@ -241,7 +245,7 @@
},
{
"cell_type": "markdown",
"id": "96f9e478",
"id": "7e8a0061",
"metadata": {},
"source": [
"**Optimizer**\n",
@@ -259,7 +263,7 @@
},
{
"cell_type": "markdown",
"id": "2b8d2c21",
"id": "82a0aed9",
"metadata": {},
"source": [
"**DataReader**\n",
@@ -280,7 +284,7 @@
},
{
"cell_type": "markdown",
"id": "7f248fcb",
"id": "6133bae9",
"metadata": {},
"source": [
"**Model**\n",
@@ -301,7 +305,7 @@
},
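
Taken together, the components above assemble into a training flow like the minimal sketch below, reconstructed from the code cells later in this notebook. The solver arguments shown are illustrative assumptions, and the input, embedding, and dense layers are elided:

import hugectr

solver = hugectr.CreateSolver(
    vvgpu=[[0]],          # device list; [[0, 1]] would train on two GPUs
    batchsize=2048,       # assumed batch size
    repeat_dataset=True,  # iterate by max_iter rather than by epochs
)
optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)
reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=[INPUT_DATA_DIR + "train/_file_list.txt"],
    eval_source=INPUT_DATA_DIR + "valid/_file_list.txt",
    check_type=hugectr.Check_t.Non,
    slot_size_array=[162542, 56586, 21],
)

model = hugectr.Model(solver, reader, optimizer)
# model.add(hugectr.Input(...)) plus embedding and dense layers go here
model.compile()
model.summary()
model.fit(max_iter=2000, display=100, eval_interval=200, snapshot=1900)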
{
"cell_type": "markdown",
"id": "770358be",
"id": "a3d13762",
"metadata": {},
"source": [
"Input layer:\n",
@@ -352,7 +356,7 @@
},
{
"cell_type": "markdown",
"id": "b1e088d4",
"id": "1ffb96fe",
"metadata": {},
"source": [
"## Let's define our model\n",
@@ -362,7 +366,7 @@
},
{
"cell_type": "markdown",
"id": "7120eea8",
"id": "2c2b0b93",
"metadata": {},
"source": [
"We need the cardinalities of each categorical feature to assign as `slot_size_array` in the model below."
@@ -371,7 +375,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "4f802522",
"id": "a2752043",
"metadata": {},
"outputs": [
{
@@ -391,7 +395,7 @@
},
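
The collapsed cell above computes these cardinalities. As a purely illustrative sketch of one way to derive them from the saved workflow (this assumes NVTabular's get_embedding_sizes helper, whose exact return format varies across versions, and the MovieLens column names):

from nvtabular.ops import get_embedding_sizes

# maps column -> (cardinality, embedding_dim) in many NVTabular versions
embeddings = get_embedding_sizes(workflow)
slot_size_array = [embeddings[col][0] for col in ["userId", "movieId", "genres"]]
print(slot_size_array)  # expected to match the values used in the reader config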
{
"cell_type": "markdown",
"id": "a7075d6e",
"id": "390c37b0",
"metadata": {},
"source": [
"We use `graph_to_json` to convert the model to a JSON configuration, required for the inference."
@@ -400,7 +404,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "e92d17ef",
"id": "0ae8d97d",
"metadata": {},
"outputs": [
{
@@ -429,8 +433,8 @@
"optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)\n",
"reader = hugectr.DataReaderParams(\n",
" data_reader_type=hugectr.DataReaderType_t.Parquet,\n",
" source=[\"/root/nvt-examples/movielens/data/train/_file_list.txt\"],\n",
" eval_source=\"/root/nvt-examples/movielens/data/valid/_file_list.txt\",\n",
" source=[INPUT_DATA_DIR + \"train/_file_list.txt\"],\n",
" eval_source=INPUT_DATA_DIR + \"valid/_file_list.txt\",\n",
" check_type=hugectr.Check_t.Non,\n",
" slot_size_array=[162542, 56586, 21],\n",
")\n",
@@ -516,12 +520,12 @@
"model.compile()\n",
"model.summary()\n",
"model.fit(max_iter=2000, display=100, eval_interval=200, snapshot=1900)\n",
"model.graph_to_json(graph_config_file=\"/root/nvt-examples/model/movielens_hugectr/1/movielens.json\")"
"model.graph_to_json(graph_config_file=MODEL_DIR + \"1/movielens.json\")"
]
},
{
"cell_type": "markdown",
"id": "552745d9",
"id": "965e4090",
"metadata": {},
"source": [
"We train our model."
@@ -530,7 +534,7 @@
{
"cell_type": "code",
"execution_count": 9,
"id": "5e1bf00b",
"id": "8563943a",
"metadata": {},
"outputs": [
{
@@ -642,7 +646,7 @@
},
{
"cell_type": "markdown",
"id": "95ded867",
"id": "107cb1b0",
"metadata": {},
"source": [
"After training terminates, we can see that multiple `.model` files and folders are generated. We need to move them inside `1` folder under the `movielens_hugectr` folder. "
@@ -651,11 +655,11 @@
{
"cell_type": "code",
"execution_count": 10,
"id": "cd8dd0c9",
"id": "7ed71e52",
"metadata": {},
"outputs": [],
"source": [
"!mv *.model ~/nvt-examples/model/movielens_hugectr/1/"
"!mv *.model MODEL_DIR"
]
}
],