From 73c7726336a5f4958d526c6224fb15b9690ad84e Mon Sep 17 00:00:00 2001
From: krishung5
Date: Wed, 8 Nov 2023 01:47:09 -0800
Subject: [PATCH] Revert test changes

---
 qa/L0_backend_python/python_unittest.py      |   2 +-
 .../request_rescheduling/test.sh             |   7 -
 .../bls_request_rescheduling/model.py        |  54 +------
 .../request_rescheduling_cases/config.pbtxt  |  51 ------
 .../request_rescheduling_cases/model.py      | 147 ------------------
 5 files changed, 6 insertions(+), 255 deletions(-)
 delete mode 100644 qa/python_models/request_rescheduling_cases/config.pbtxt
 delete mode 100644 qa/python_models/request_rescheduling_cases/model.py

diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/python_unittest.py
index a00ee1cb994..c956412f9de 100755
--- a/qa/L0_backend_python/python_unittest.py
+++ b/qa/L0_backend_python/python_unittest.py
@@ -76,7 +76,7 @@ def test_python_unittest(self):
             self._run_unittest(model_name)
 
             # [FIXME] See DLIS-3684
-            # self._run_unittest(model_name)
+            self._run_unittest(model_name)
             with self._shm_leak_detector.Probe() as shm_probe:
                 self._run_unittest(model_name)
         else:
diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh
index b290c90bb1a..cecf2b2812b 100755
--- a/qa/L0_backend_python/request_rescheduling/test.sh
+++ b/qa/L0_backend_python/request_rescheduling/test.sh
@@ -39,9 +39,6 @@ RET=0
 
 rm -fr *.log ./models *.txt
 
-pip3 uninstall -y torch
-pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html
-
 mkdir -p models/bls_request_rescheduling/1/
 cp ../../python_models/bls_request_rescheduling/model.py models/bls_request_rescheduling/1/
 cp ../../python_models/bls_request_rescheduling/config.pbtxt models/bls_request_rescheduling
@@ -54,10 +51,6 @@ mkdir -p models/generative_sequence/1/
 cp ../../python_models/generative_sequence/model.py models/generative_sequence/1/
 cp ../../python_models/generative_sequence/config.pbtxt models/generative_sequence
 
-mkdir -p models/request_rescheduling_cases/1/
-cp ../../python_models/request_rescheduling_cases/model.py models/request_rescheduling_cases/1/
-cp ../../python_models/request_rescheduling_cases/config.pbtxt models/request_rescheduling_cases
-
 mkdir -p models/wrong_return_type/1/
 cp ../../python_models/wrong_return_type/model.py models/wrong_return_type/1/
 cp ../../python_models/wrong_return_type/config.pbtxt models/wrong_return_type
diff --git a/qa/python_models/bls_request_rescheduling/model.py b/qa/python_models/bls_request_rescheduling/model.py
index af630a4e838..5599618c71d 100644
--- a/qa/python_models/bls_request_rescheduling/model.py
+++ b/qa/python_models/bls_request_rescheduling/model.py
@@ -24,6 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import time
 import unittest
 
 import numpy as np
@@ -78,6 +79,10 @@ def test_decoupled_e2e(self):
         model_name = "generative_sequence"
         # Reload the model to reset the flag for multiple iterations
         pb_utils.unload_model(model_name)
+        # TODO: Make this more robust to wait until fully unloaded
+        print("Sleep 10 seconds to make sure model finishes unloading...", flush=True)
+        time.sleep(10)
+        print("Done sleeping.", flush=True)
         pb_utils.load_model(model_name)
 
         input_value = 3
@@ -102,55 +107,6 @@ def test_decoupled_e2e(self):
                 self.assertEqual(expected_output, output0.as_numpy()[0])
                 expected_output -= 1
 
-    def test_send_final_flag_before_rescheduling_request(self):
-        model_name = "request_rescheduling_cases"
-        # Reload the model to reset the flag for multiple iterations
-        pb_utils.unload_model(model_name)
-        pb_utils.load_model(model_name)
-
-        case_value = 0
-        input0 = pb_utils.Tensor("IN", np.array([case_value], dtype=np.int32))
-        infer_request = pb_utils.InferenceRequest(
-            model_name=model_name,
-            inputs=[input0],
-            requested_output_names=["OUT"],
-        )
-        infer_responses = infer_request.exec(decoupled=True)
-        for infer_response in infer_responses:
-            self.assertFalse(infer_response.has_error())
-
-            if len(infer_response.output_tensors()) > 0:
-                output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
-                self.assertIsNotNone(output0)
-
-                self.assertEqual(case_value, output0.as_numpy()[0])
-
-    def test_process_request_in_different_thread(self):
-        model_name = "request_rescheduling_cases"
-        # Reload the model to reset the flag for multiple iterations
-        pb_utils.unload_model(model_name)
-        pb_utils.load_model(model_name)
-
-        case_value = 1
-        input0 = pb_utils.Tensor("IN", np.array([case_value], dtype=np.int32))
-        infer_request = pb_utils.InferenceRequest(
-            model_name=model_name,
-            inputs=[input0],
-            requested_output_names=["OUT"],
-        )
-        infer_responses = infer_request.exec(decoupled=True)
-
-        expected_output = case_value
-        for infer_response in infer_responses:
-            self.assertFalse(infer_response.has_error())
-
-            if len(infer_response.output_tensors()) > 0:
-                output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT")
-                self.assertIsNotNone(output0)
-
-                self.assertEqual(expected_output, output0.as_numpy()[0])
-                expected_output -= 1
-
 
 class TritonPythonModel:
     def execute(self, requests):
diff --git a/qa/python_models/request_rescheduling_cases/config.pbtxt b/qa/python_models/request_rescheduling_cases/config.pbtxt
deleted file mode 100644
index 19b6db68f37..00000000000
--- a/qa/python_models/request_rescheduling_cases/config.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-name: "request_rescheduling_cases"
-backend: "python"
-max_batch_size: 0
-model_transaction_policy {
-  decoupled: True
-}
-input [
-  {
-    name: "IN"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
-  }
-]
-output [
-  {
-    name: "OUT"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
-  }
-]
-sequence_batching {
-  generative_sequence : true
-}
-
-instance_group [{ kind: KIND_CPU }]
diff --git a/qa/python_models/request_rescheduling_cases/model.py b/qa/python_models/request_rescheduling_cases/model.py
deleted file mode 100644
index c23d889fd7b..00000000000
--- a/qa/python_models/request_rescheduling_cases/model.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import json
-import threading
-
-import numpy as np
-import triton_python_backend_utils as pb_utils
-
-
-class TritonPythonModel:
-    def initialize(self, args):
-        self.model_config = model_config = json.loads(args["model_config"])
-
-        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
-            model_config
-        )
-        if not using_decoupled:
-            raise pb_utils.TritonModelException(
-                """the model `{}` can generate any number of responses per request,
-                enable decoupled transaction policy in model configuration to
-                serve this model""".format(
-                    args["model_name"]
-                )
-            )
-
-        # Get IN configuration
-        in_config = pb_utils.get_input_config_by_name(model_config, "IN")
-
-        # Validate the shape and data type of IN
-        in_shape = in_config["dims"]
-        if (len(in_shape) != 1) or (in_shape[0] != 1):
-            raise pb_utils.TritonModelException(
-                """the model `{}` requires the shape of 'IN' to be
-                [1], got {}""".format(
-                    args["model_name"], in_shape
-                )
-            )
-        if in_config["data_type"] != "TYPE_INT32":
-            raise pb_utils.TritonModelException(
-                """the model `{}` requires the data_type of 'IN' to be
-                'TYPE_INT32', got {}""".format(
-                    args["model_name"], in_config["data_type"]
-                )
-            )
-
-        # Get OUT configuration
-        out_config = pb_utils.get_output_config_by_name(model_config, "OUT")
-
-        # Validate the shape and data type of OUT
-        out_shape = out_config["dims"]
-        if (len(out_shape) != 1) or (out_shape[0] != 1):
-            raise pb_utils.TritonModelException(
-                """the model `{}` requires the shape of 'OUT' to be
-                [1], got {}""".format(
-                    args["model_name"], out_shape
-                )
-            )
-        if out_config["data_type"] != "TYPE_INT32":
-            raise pb_utils.TritonModelException(
-                """the model `{}` requires the data_type of 'OUT' to be
-                'TYPE_INT32', got {}""".format(
-                    args["model_name"], out_config["data_type"]
-                )
-            )
-
-        self.idx = 0
-        self.inflight_thread_count = 0
-        self.inflight_thread_count_lck = threading.Lock()
-
-    def execute(self, requests):
-        for request in requests:
-            case = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy()
-
-            if case[0] == 0:
-                self.send_final_flag_before_rescheduling_request(request)
-            elif case[0] == 1:
-                self.process_request_thread(request)
-            else:
-                raise pb_utils.TritonModelException("Unknown test case.")
-
-        return None
-
-    def send_final_flag_before_rescheduling_request(self, request):
-        response_sender = request.get_response_sender()
-        if self.idx == 0:
-            out_output = pb_utils.Tensor("OUT", np.array([0], np.int32))
-            response = pb_utils.InferenceResponse(output_tensors=[out_output])
-            response_sender.send(response)
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
-            request.set_release_flags(pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE)
-            self.idx = 1
-
-    def process_request_thread(self, request):
-        thread = threading.Thread(
-            target=self.response_thread,
-            args=(
-                request.get_response_sender(),
-                pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(),
-            ),
-        )
-
-        thread.daemon = True
-
-        with self.inflight_thread_count_lck:
-            self.inflight_thread_count += 1
-
-        if self.idx == 0:
-            request.set_release_flags(pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE)
-        thread.start()
-        self.idx = 1
-
-    def response_thread(self, response_sender, in_input):
-        output_value = in_input[0]
-        while output_value >= 0:
-            out_output = pb_utils.Tensor("OUT", np.array([output_value], np.int32))
-            response = pb_utils.InferenceResponse(output_tensors=[out_output])
-            response_sender.send(response)
-            output_value -= 1
-
-        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
-
-        with self.inflight_thread_count_lck:
-            self.inflight_thread_count -= 1