Skip to content

Commit

Permalink
GMC: resource management (#259)
Browse files Browse the repository at this point in the history
This PR introduces resource management in the GMC controller in order to:
delete the resources when GMC pipeline is deleted
delete a resource if it is removed from a pipeline
record more status details of the resources
update resource status based on event
Signed-off-by: KfreeZ <[email protected]>
  • Loading branch information
KfreeZ authored Aug 17, 2024
1 parent 3d94844 commit 81060ab
Show file tree
Hide file tree
Showing 6 changed files with 827 additions and 140 deletions.
163 changes: 161 additions & 2 deletions .github/workflows/scripts/e2e/gmc_xeon_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ CHATQNA_SWITCH_NAMESPACE="${APP_NAMESPACE}-chatqna-switch"
CODEGEN_NAMESPACE="${APP_NAMESPACE}-codegen"
CODETRANS_NAMESPACE="${APP_NAMESPACE}-codetrans"
DOCSUM_NAMESPACE="${APP_NAMESPACE}-docsum"
DELETE_STEP_NAMESPACE="${APP_NAMESPACE}-delstep"
MODIFY_STEP_NAMESPACE="${APP_NAMESPACE}-modstep"
WEBHOOK_NAMESPACE="${APP_NAMESPACE}-webhook"

function validate_gmc() {
Expand All @@ -31,6 +33,10 @@ function validate_gmc() {
echo "validate chat-qna in switch mode"
validate_chatqna_in_switch

echo "validate change graph"
validate_modify_config
validate_remove_step

# echo "validate codegen"
# validate_codegen

Expand Down Expand Up @@ -68,7 +74,6 @@ function validate_webhook() {
exit 1
fi


# nodeName existence
yq '(.spec.nodes.root.steps[] | select ( .name == "Embedding")).nodeName = "node123"' config/samples/chatQnA_switch_xeon.yaml > /tmp/webhook-case3.yaml
sed -i "s|namespace: switch|namespace: $WEBHOOK_NAMESPACE|g" /tmp/webhook-case3.yaml
Expand Down Expand Up @@ -96,7 +101,7 @@ function validate_webhook() {
function cleanup_apps() {
echo "clean up microservice-connector"
# namespaces=("$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE" "$CODEGEN_NAMESPACE" "$CODETRANS_NAMESPACE" "$DOCSUM_NAMESPACE")
namespaces=("$AUDIOQA_NAMESPACE" "$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE" "$WEBHOOK_NAMESPACE")
namespaces=("$AUDIOQA_NAMESPACE" "$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE" "$WEBHOOK_NAMESPACE" "$MODIFY_STEP_NAMESPACE" "$DELETE_STEP_NAMESPACE")
for ns in "${namespaces[@]}"; do
if kubectl get namespace $ns > /dev/null 2>&1; then
echo "Deleting namespace: $ns"
Expand Down Expand Up @@ -128,6 +133,15 @@ function validate_audioqa() {
exit 1
fi

pods_count=$(kubectl get pods -n $AUDIOQA_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $AUDIOQA_NAMESPACE 'audioqa' $((pods_count-1)) 0 7
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# giving time to populating data
sleep 90

Expand All @@ -142,6 +156,11 @@ function validate_audioqa() {
exit 1
fi
echo "Audioqa response check succeed!"

kubectl delete gmc -n $AUDIOQA_NAMESPACE 'audioqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $AUDIOQA_NAMESPACE
}

function validate_chatqna() {
Expand All @@ -167,6 +186,16 @@ function validate_chatqna() {
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)

# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_NAMESPACE 'chatqa' $((pods_count-1)) 0 9
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# giving time to populating data
sleep 90

Expand Down Expand Up @@ -197,6 +226,12 @@ function validate_chatqna() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_NAMESPACE
kubectl delete gmc -n $CHATQNA_NAMESPACE 'chatqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_NAMESPACE
}

function validate_chatqna_with_dataprep() {
Expand All @@ -222,6 +257,15 @@ function validate_chatqna_with_dataprep() {
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_DATAPREP_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_DATAPREP_NAMESPACE 'chatqa' $((pods_count-1)) 0 10
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# giving time to populating data
sleep 90

Expand Down Expand Up @@ -275,6 +319,12 @@ function validate_chatqna_with_dataprep() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_DATAPREP_NAMESPACE
kubectl delete gmc -n $CHATQNA_DATAPREP_NAMESPACE 'chatqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_DATAPREP_NAMESPACE
}

function validate_chatqna_in_switch() {
Expand All @@ -300,6 +350,16 @@ function validate_chatqna_in_switch() {
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_SWITCH_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_SWITCH_NAMESPACE 'switch' $((pods_count-1)) 0 15
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi


# giving time to populating data
sleep 90

Expand Down Expand Up @@ -356,6 +416,105 @@ function validate_chatqna_in_switch() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_SWITCH_NAMESPACE
kubectl delete gmc -n $CHATQNA_SWITCH_NAMESPACE 'switch'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_SWITCH_NAMESPACE
}


function validate_modify_config() {
    # Scenario: deploy a copy of the codegen sample into
    # $MODIFY_STEP_NAMESPACE, verify the GMC status, change the MODEL_ID of
    # the "Tgi" step in that copy, re-apply it and verify the status again,
    # then delete the GMC resource and run the cleanup check.
    # Exits the script with status 1 when a readiness or status check fails.
    local sample_yaml=$(pwd)/config/samples/codegen_xeon.yaml
    local modified_yaml=$(pwd)/config/samples/codegen_xeon_mod.yaml

    kubectl create ns $MODIFY_STEP_NAMESPACE

    # Work on a copy of the sample, retargeted at the test namespace.
    cp $sample_yaml $modified_yaml
    sed -i "s|namespace: codegen|namespace: $MODIFY_STEP_NAMESPACE|g" $modified_yaml
    kubectl apply -f $modified_yaml

    # The router has to come up first.
    echo "Waiting for the router service to be ready..."
    wait_until_pod_ready "router" $MODIFY_STEP_NAMESPACE "router-service"
    output=$(kubectl get pods -n $MODIFY_STEP_NAMESPACE)
    echo $output

    # Then every pod in the namespace.
    wait_until_all_pod_ready $MODIFY_STEP_NAMESPACE 300s
    if [[ $? -ne 0 ]]; then
        echo "Error Some pods are not ready!"
        exit 1
    fi

    # Baseline: ready pods == pods in the namespace, 0 external, 3 in total.
    pods_count=$(
        kubectl get pods -n $MODIFY_STEP_NAMESPACE -o jsonpath='{.items[*].metadata.name}' \
            | wc -w
    )
    check_gmc_status $MODIFY_STEP_NAMESPACE 'codegen' $pods_count 0 3
    if [[ $? -ne 0 ]]; then
        echo "GMC status is not as expected"
        exit 1
    fi

    # Point the "Tgi" step at a different model and push the edited graph.
    yq -i '(.spec.nodes.root.steps[] | select ( .name == "Tgi")).internalService.config.MODEL_ID = "bigscience/bloom-560m"' $modified_yaml
    kubectl apply -f $modified_yaml

    # The same status is expected after the configuration change.
    pods_count=$(
        kubectl get pods -n $MODIFY_STEP_NAMESPACE -o jsonpath='{.items[*].metadata.name}' \
            | wc -w
    )
    check_gmc_status $MODIFY_STEP_NAMESPACE 'codegen' $pods_count 0 3
    if [[ $? -ne 0 ]]; then
        echo "GMC status is not as expected"
        exit 1
    fi

    # Put the namespace in the copied yaml back to its original value.
    sed -i "s|namespace: $MODIFY_STEP_NAMESPACE|namespace: codegen|g" $modified_yaml

    # Tear the pipeline down and check what is left in the namespace.
    kubectl delete gmc -n $MODIFY_STEP_NAMESPACE 'codegen'
    echo "sleep 10s for cleaning up"
    sleep 10
    check_resource_cleared $MODIFY_STEP_NAMESPACE
}

#######################################
# Scenario: deploy a copy of the codegen sample, then delete the "Llm" step
# from the graph and verify that its resources go away.
# Globals:   DELETE_STEP_NAMESPACE (read)
# Arguments: none
# Outputs:   progress messages and pod listings on stdout
# Returns:   exit status of the final check_resource_cleared call; exits the
#            script with status 1 when a readiness or GMC status check fails
#######################################
function validate_remove_step() {
    local manifest_dir sample_yaml trimmed_yaml output pods_count
    manifest_dir="$(pwd)/config/samples"
    sample_yaml="$manifest_dir/codegen_xeon.yaml"
    trimmed_yaml="$manifest_dir/codegen_xeon_del.yaml"

    kubectl create ns "$DELETE_STEP_NAMESPACE"

    # Work on a copy of the sample, retargeted at the test namespace.
    cp "$sample_yaml" "$trimmed_yaml"
    sed -i "s|namespace: codegen|namespace: $DELETE_STEP_NAMESPACE|g" "$trimmed_yaml"
    kubectl apply -f "$trimmed_yaml"

    # Wait until the router service is ready
    echo "Waiting for the router service to be ready..."
    wait_until_pod_ready "router" "$DELETE_STEP_NAMESPACE" "router-service"
    output=$(kubectl get pods -n "$DELETE_STEP_NAMESPACE")
    echo "$output"

    # Wait until all pods are ready
    if ! wait_until_all_pod_ready "$DELETE_STEP_NAMESPACE" 300s; then
        echo "Error Some pods are not ready!"
        exit 1
    fi

    # Baseline: ready pods == pods in the namespace, 0 external, 3 in total.
    # The arithmetic expansion strips any padding wc may emit.
    pods_count=$(kubectl get pods -n "$DELETE_STEP_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | wc -w)
    if ! check_gmc_status "$DELETE_STEP_NAMESPACE" 'codegen' "$((pods_count))" 0 3; then
        echo "GMC status is not as expected"
        exit 1
    fi

    # Remove the step named "Llm" from the copied codegen_xeon_del.yaml
    yq -i 'del(.spec.nodes.root.steps[] | select ( .name == "Llm"))' "$trimmed_yaml"
    kubectl apply -f "$trimmed_yaml"

    sleep 10
    # NOTE(review): the result of check_pod_terminated is not acted upon here,
    # as in the original; confirm whether a failure should abort the test.
    check_pod_terminated "$DELETE_STEP_NAMESPACE"

    # With one step gone, 2 ready / 0 external / 2 total pods are expected.
    if ! check_gmc_status "$DELETE_STEP_NAMESPACE" 'codegen' 2 0 2; then
        echo "GMC status is not as expected"
        exit 1
    fi

    # Revert the namespace in the copied yaml. This must use the namespace
    # written by the sed above; the original used $MODIFY_STEP_NAMESPACE here,
    # which never matched.
    sed -i "s|namespace: $DELETE_STEP_NAMESPACE|namespace: codegen|g" "$trimmed_yaml"

    kubectl delete gmc -n "$DELETE_STEP_NAMESPACE" 'codegen'
    echo "sleep 10s for cleaning up"
    sleep 10
    check_resource_cleared "$DELETE_STEP_NAMESPACE"
}

function validate_codegen() {
Expand Down
66 changes: 66 additions & 0 deletions .github/workflows/scripts/e2e/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,69 @@ function wait_until_all_pod_ready() {
fi
done
}

#######################################
# Verify the status reported by a GMC custom resource.
# Arguments:
#   $1 - namespace of the GMC resource
#   $2 - name of the GMC resource
#   $3 - expected number of ready pods
#   $4 - expected number of external pods
#   $5 - expected total number of pods (must equal $3 + $4)
#   $6 - optional: expected number of entries in .status.annotations;
#        when omitted the annotation count is not checked
# Outputs:   the observed status string (and annotation count, if checked)
# Returns:   0 when everything matches, 1 otherwise
#######################################
function check_gmc_status() {
    local namespace=$1
    local gmc_name=$2
    local expected_ready_pods=$3
    local expected_external_pods=$4
    local expected_total_pods=$5
    # NOTE(review): the original derived this as 3 * ready_pods - 2 (each pod
    # having a configmap, a deployment and a service; router and redis having
    # no configmap), but the comparison was placed after an if/else whose
    # branches both return, so it never ran. The formula is therefore
    # unverified; the record check is opt-in via $6 until it is confirmed.
    local expected_total_records=${6:-}
    local gmc_status annotation

    # The expectations themselves must be consistent.
    if [ $((expected_ready_pods + expected_external_pods)) -ne "$expected_total_pods" ]; then
        return 1
    fi

    gmc_status=$(kubectl get gmc -n "$namespace" -o jsonpath="{.items[?(@.metadata.name=='$gmc_name')].status.status}")
    echo "$gmc_status"
    # The status is compared as "<ready>/<external>/<total>".
    if [[ "$gmc_status" != "$expected_ready_pods/$expected_external_pods/$expected_total_pods" ]]; then
        return 1
    fi

    # No record expectation supplied: the status string alone decides.
    if [[ -z "$expected_total_records" ]]; then
        return 0
    fi

    annotation=$(kubectl get gmc -n "$namespace" -o json | jq ".items[] | select(.metadata.name==\"$gmc_name\") | .status.annotations | length")
    echo "$annotation"
    # Guard against empty or non-numeric jq output before the integer compare.
    if [[ ! "$annotation" =~ ^[0-9]+$ ]]; then
        return 1
    fi
    if [ "$annotation" -eq "$expected_total_records" ]; then
        return 0
    fi
    return 1
}

#######################################
# Check that a namespace has been cleaned up.
# Arguments: $1 - namespace to inspect
# Outputs:   the remaining pod listing, when `kubectl get all` is non-empty
# Returns:   0 when `kubectl get all` lists nothing, or when every pod that
#            is still listed is in the Terminating state; 1 otherwise
#######################################
function check_resource_cleared() {
    local namespace=$1
    local actual_count remaining pod_status

    actual_count=$(kubectl get all -n "$namespace" --no-headers | wc -l)
    if [ "$actual_count" -eq 0 ]; then
        return 0
    fi

    # Something is still listed: accept it only if every remaining pod is
    # already Terminating.
    # NOTE(review): only pods are inspected here, so leftover non-pod
    # resources do not cause a failure; confirm that is intended.
    remaining=$(kubectl get pods -n "$namespace" --no-headers)
    echo "$remaining"

    # Read the listing row by row; STATUS is the third column. The listing is
    # kept quoted so each pod stays on its own line (an unquoted echo would
    # join all rows and only the first pod's status would be seen).
    while read -r _ _ pod_status _; do
        # Skip blank rows (e.g. no pods left at all).
        [[ -n "$pod_status" ]] || continue
        if [[ "$pod_status" != "Terminating" ]]; then
            return 1
        fi
    done <<< "$remaining"
    return 0
}

#######################################
# Check whether at least one pod in a namespace is being terminated.
# Arguments: $1 - namespace to inspect
# Outputs:   the pod listing of the namespace
# Returns:   0 when any listed pod has STATUS "Terminating", 1 otherwise
#######################################
function check_pod_terminated() {
    local namespace=$1
    local remaining pod_status

    remaining=$(kubectl get pods -n "$namespace" --no-headers)
    echo "$remaining"

    # Read the listing row by row; STATUS is the third column. The listing is
    # kept quoted so each pod stays on its own line (an unquoted echo would
    # join all rows and only the first pod's status would be seen).
    while read -r _ _ pod_status _; do
        if [[ "$pod_status" == "Terminating" ]]; then
            return 0
        fi
    done <<< "$remaining"
    return 1
}
6 changes: 4 additions & 2 deletions microservices-connector/config/gmcrouter/gmc-router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: router-server
name: {{.DplymntName}}
namespace: {{.Namespace}}
spec:
replicas: 1
selector:
Expand Down Expand Up @@ -37,7 +38,8 @@ spec:
apiVersion: v1
kind: Service
metadata:
name: router-service
name: {{.SvcName}}
namespace: {{.Namespace}}
spec:
type: ClusterIP
selector:
Expand Down
Loading

0 comments on commit 81060ab

Please sign in to comment.