From 9e49cc8228866b2eeac8f82ef688bb0608dfdae7 Mon Sep 17 00:00:00 2001
From: irisdingbj <shaojun.ding@intel.com>
Date: Fri, 7 Jun 2024 22:35:17 +0000
Subject: [PATCH] fix wait error

Signed-off-by: irisdingbj <shaojun.ding@intel.com>
---
 .github/workflows/scripts/e2e/gmc_test.sh | 47 ++++++++++++++++-------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/scripts/e2e/gmc_test.sh b/.github/workflows/scripts/e2e/gmc_test.sh
index 228d81a3..c6940927 100755
--- a/.github/workflows/scripts/e2e/gmc_test.sh
+++ b/.github/workflows/scripts/e2e/gmc_test.sh
@@ -21,8 +21,8 @@ function install_gmc() {
     kubectl apply -f $(pwd)/config/manager/gmc-manager.yaml
 
     # Wait until the gmc controller pod is ready
-    GMC_CONTROLLER_POD=$(kubectl get pods --namespace=$SYSTEM_NAMESPACE | awk 'NR>1 {print $1; exit}')
-    wait_until_pod_ready "gmc-controller" $GMC_CONTROLLER_POD $SYSTEM_NAMESPACE
+    wait_until_pod_ready "gmc-controller" $SYSTEM_NAMESPACE "gmc-controller"
+    kubectl get pods -n $SYSTEM_NAMESPACE
 }
 
 function validate_gmc() {
@@ -43,36 +43,31 @@ function cleanup_gmc() {
 
 function validate_chatqna() {
 
-   kubectl get pods -n $SYSTEM_NAMESPACE
    # todo select gaudi or xeon
    kubectl create ns $APP_NAMESPACE
    sed -i "s|namespace: chatqa|namespace: $APP_NAMESPACE|g"  $(pwd)/config/samples/chatQnA_xeon.yaml
    kubectl apply -f $(pwd)/config/samples/chatQnA_xeon.yaml
 
-
-
-   output=$(kubectl get pods)
-   echo $output
-
    # Wait until the router service is ready
    echo "Waiting for the chatqa router service to be ready..."
-   ROUTER_POD=$(kubectl get pods --namespace=$APP_NAMESPACE -l app=router-service | awk 'NR>1 {print $1; exit}')
-   wait_until_pod_ready "chatqna router" $ROUTER_POD $APP_NAMESPACE
+   wait_until_pod_ready "chatqna router" $APP_NAMESPACE "router-service"
+   output=$(kubectl get pods -n "$APP_NAMESPACE")  # snapshot of the app namespace for the CI log
+   echo "$output"  # quoted so the pod table keeps one pod per line
 
   # Wait until the tgi pod is ready
   TGI_POD_NAME=$(kubectl get pods --namespace=$APP_NAMESPACE | grep ^tgi-service | awk '{print $1}')
   kubectl describe pod $TGI_POD_NAME -n $APP_NAMESPACE
-  wait_until_pod_ready "tgi service" $TGI_POD_NAME $APP_NAMESPACE
+  kubectl wait --for=condition=ready "pod/$TGI_POD_NAME" --namespace="$APP_NAMESPACE" --timeout=300s
 
 
    # deploy client pod for testing
    kubectl create deployment client-test -n $APP_NAMESPACE --image=python:3.8.13 -- sleep infinity
 
    # wait for client pod ready
-   CLIENT_POD=$(kubectl get pods --namespace=$APP_NAMESPACE -l app=client-test | awk 'NR>1 {print $1; exit}')
-   wait_until_pod_ready "client-test" $CLIENT_POD $APP_NAMESPACE
+   wait_until_pod_ready "client-test" $APP_NAMESPACE "client-test"
    # giving time to populating data
    sleep 120
+
    kubectl get pods -n $APP_NAMESPACE
    # send request to chatqnA
    export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
@@ -128,9 +123,33 @@ function init_gmc() {
 
 function wait_until_pod_ready() {
     echo "Waiting for the $1 to be ready..."
-    kubectl wait --for=condition=ready pod/$2 --namespace=$3 --timeout=300s
+    local max_retries=30 retry_count=0  # 30 polls x 10s sleep ~= the previous 300s timeout
+    # $1 = display name used in log messages, $2 = namespace, $3 = app label passed to is_pod_ready
+    while ! is_pod_ready "$2" "$3"; do
+        if [ "$retry_count" -ge "$max_retries" ]; then
+            echo "$1 is not ready after waiting for a significant amount of time"
+            exit 1
+        fi
+        echo "$1 is not ready yet. Retrying in 10 seconds..."
+        sleep 10
+        # Print the pod table directly (one pod per line) so the CI log shows what is pending
+        kubectl get pods -n "$2"
+        retry_count=$((retry_count + 1))
+    done
 }
 
+function is_pod_ready() {
+    # $1 = namespace, $2 = app label value; returns 0 only when kubectl prints exactly "True"
+    local pod_status
+    if [ "$2" == "gmc-controller" ]; then
+      # "gmc-controller" is queried without a label selector, i.e. against the pods of namespace $1
+      pod_status=$(kubectl get pods -n "$1" -o jsonpath='{.items[].status.conditions[?(@.type=="Ready")].status}')
+    else
+      pod_status=$(kubectl get pods -n "$1" -l "app=$2" -o jsonpath='{.items[].status.conditions[?(@.type=="Ready")].status}')
+    fi
+    # Empty output (e.g. no pod created yet) or any value other than "True" counts as not ready
+    [ "$pod_status" == "True" ]
+}
 
 if [ $# -eq 0 ]; then
     echo "Usage: $0 <function_name>"