Skip to content

Commit

Permalink
update GMC e2e and Doc (#321)
Browse files Browse the repository at this point in the history
* 1. fix minor e2e issue 2. sync gaudi tests
Signed-off-by: KfreeZ <[email protected]>
  • Loading branch information
KfreeZ authored Aug 21, 2024
1 parent 076e81e commit 8a85364
Show file tree
Hide file tree
Showing 4 changed files with 428 additions and 17 deletions.
227 changes: 218 additions & 9 deletions .github/workflows/scripts/e2e/gmc_gaudi_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ CHATQNA_SWITCH_NAMESPACE="${APP_NAMESPACE}-chatqna-switch"
CODEGEN_NAMESPACE="${APP_NAMESPACE}-codegen"
CODETRANS_NAMESPACE="${APP_NAMESPACE}-codetrans"
DOCSUM_NAMESPACE="${APP_NAMESPACE}-docsum"
DELETE_STEP_NAMESPACE="${APP_NAMESPACE}-delstep"
MODIFY_STEP_NAMESPACE="${APP_NAMESPACE}-modstep"
WEBHOOK_NAMESPACE="${APP_NAMESPACE}-webhook"

function validate_gmc() {
echo "validate audio-qna"
Expand All @@ -30,6 +33,10 @@ function validate_gmc() {
echo "validate chat-qna in switch mode"
validate_chatqna_in_switch

echo "validate change graph"
validate_modify_config
validate_remove_step

# echo "validate codegen"
# validate_codegen

Expand All @@ -39,13 +46,62 @@ function validate_gmc() {
# echo "validate docsum"
# validate_docsum

echo "validate webhook"
validate_webhook

get_gmc_controller_logs
}

#######################################
# Generates one deliberately invalid GMC manifest from a sample, applies it,
# and verifies that the apply output carries the expected rejection text.
# Globals:   WEBHOOK_NAMESPACE (read)
# Arguments: $1 - yq expression that makes the sample invalid
#            $2 - sample manifest the case is derived from
#            $3 - namespace written in the sample; replaced by WEBHOOK_NAMESPACE
#            $4 - path of the manifest file to generate
#            $5 - text that must appear in the kubectl output
#            $6 - message printed when that text is missing
# Outputs:   on failure, the failure message followed by the kubectl output
# Returns:   0 if the expected text was found, 1 otherwise
#######################################
function _expect_webhook_rejection() {
  local yq_expr=$1
  local sample=$2
  local sample_ns=$3
  local case_file=$4
  local expected=$5
  local failure_msg=$6
  local output

  if ! yq "$yq_expr" "$sample" > "$case_file"; then
    echo "failed to generate $case_file from $sample"
    return 1
  fi
  sed -i "s|namespace: $sample_ns|namespace: $WEBHOOK_NAMESPACE|g" "$case_file"

  # The apply is expected to be rejected, so its exit status is deliberately
  # ignored; only the captured message decides pass/fail.
  output=$(kubectl apply -f "$case_file" 2>&1) || true
  if ! printf '%s\n' "$output" | grep -qF -- "$expected"; then
    echo "$failure_msg"
    printf '%s\n' "$output"
    return 1
  fi
  return 0
}

#######################################
# Checks that invalid GMC manifests are rejected with the expected messages:
# missing root node, invalid step name, unknown nodeName, and duplicated
# serviceName. Stops at the first case that fails.
# Globals:   WEBHOOK_NAMESPACE (read)
# Arguments: none
# Outputs:   diagnostics of the failing case on stdout
# Returns:   exits the script with status 1 when any case fails
#######################################
function validate_webhook() {
  local case_dir
  local status=0

  kubectl create ns "$WEBHOOK_NAMESPACE" || echo "namespace $WEBHOOK_NAMESPACE is created."

  # A private directory keeps concurrent runs on the same host from
  # overwriting or deleting each other's generated manifests.
  case_dir=$(mktemp -d) || {
    echo "failed to create a temporary directory for the webhook cases"
    exit 1
  }

  if ! {
    # validate root node existence
    _expect_webhook_rejection \
      ".spec.nodes.node123 = .spec.nodes.root | del(.spec.nodes.root)" \
      config/samples/chatQnA_xeon.yaml \
      chatqa \
      "$case_dir/webhook-case1.yaml" \
      "a root node is required" \
      "Root node existence validation error message is not found!" &&
    # StepName validation
    _expect_webhook_rejection \
      '(.spec.nodes.root.steps[] | select ( .name == "Llm")).name = "xyz"' \
      config/samples/chatQnA_gaudi.yaml \
      chatqa \
      "$case_dir/webhook-case2.yaml" \
      "invalid step name" \
      "Step name validation error message is not found!" &&
    # nodeName existence
    _expect_webhook_rejection \
      '(.spec.nodes.root.steps[] | select ( .name == "Embedding")).nodeName = "node123"' \
      config/samples/chatQnA_switch_xeon.yaml \
      switch \
      "$case_dir/webhook-case3.yaml" \
      "node name: node123 in step Embedding does not exist" \
      "nodeName existence validation error message is not found!" &&
    # serviceName uniqueness
    _expect_webhook_rejection \
      '(.spec.nodes.node1.steps[] | select ( .name == "Embedding")).internalService.serviceName = "tei-embedding-svc-bge15"' \
      config/samples/chatQnA_switch_xeon.yaml \
      switch \
      "$case_dir/webhook-case4.yaml" \
      "service name: tei-embedding-svc-bge15 in node node1 already exists" \
      "serviceName uniqueness validation error message is not found!"
  }; then
    status=1
  fi

  # clean up cases, on the failure path as well
  rm -rf -- "$case_dir"

  if [[ $status -ne 0 ]]; then
    exit 1
  fi
}

function cleanup_apps() {
echo "clean up microservice-connector"
# namespaces=("$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE" "$CODEGEN_NAMESPACE" "$CODETRANS_NAMESPACE" "$DOCSUM_NAMESPACE")
namespaces=("$AUDIOQA_NAMESPACE" "$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE")
namespaces=("$AUDIOQA_NAMESPACE" "$CHATQNA_NAMESPACE" "$CHATQNA_DATAPREP_NAMESPACE" "$CHATQNA_SWITCH_NAMESPACE" "$WEBHOOK_NAMESPACE" "$MODIFY_STEP_NAMESPACE" "$DELETE_STEP_NAMESPACE")
for ns in "${namespaces[@]}"; do
if kubectl get namespace $ns > /dev/null 2>&1; then
echo "Deleting namespace: $ns"
Expand Down Expand Up @@ -77,6 +133,15 @@ function validate_audioqa() {
exit 1
fi

pods_count=$(kubectl get pods -n $AUDIOQA_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $AUDIOQA_NAMESPACE 'audioqa' $((pods_count-1)) 0 7
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# give the data time to populate
sleep 90

Expand All @@ -85,12 +150,17 @@ function validate_audioqa() {
export CLIENT_POD=$(kubectl get pod -n $AUDIOQA_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
echo "$CLIENT_POD"
accessUrl=$(kubectl get gmc -n $AUDIOQA_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
byte_str=$(kubectl exec "$CLIENT_POD" -n $AUDIOQA_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
byte_str=$(kubectl exec "$CLIENT_POD" -n $AUDIOQA_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq -r .byte_str > /dev/null)
if [ -z "$byte_str" ]; then
echo "audioqa failed, please check the the!"
exit 1
fi
echo "Audioqa response check succeed!"

kubectl delete gmc -n $AUDIOQA_NAMESPACE 'audioqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $AUDIOQA_NAMESPACE
}

function validate_chatqna() {
Expand All @@ -104,16 +174,25 @@ function validate_chatqna() {
output=$(kubectl get pods -n $CHATQNA_NAMESPACE)
echo $output

# Wait until the tgi pod is ready
TGI_POD_NAME=$(kubectl get pods --namespace=$CHATQNA_NAMESPACE | grep ^tgi-gaudi-svc | awk '{print $1}')
kubectl describe pod $TGI_POD_NAME -n $CHATQNA_NAMESPACE
kubectl wait --for=condition=ready pod/$TGI_POD_NAME --namespace=$CHATQNA_NAMESPACE --timeout=300s

# deploy client pod for testing
kubectl create deployment client-test -n $CHATQNA_NAMESPACE --image=python:3.8.13 -- sleep infinity

# wait for client pod ready
wait_until_pod_ready "client-test" $CHATQNA_NAMESPACE "client-test"
# Wait until all pods are ready
wait_until_all_pod_ready $CHATQNA_NAMESPACE 300s
if [ $? -ne 0 ]; then
echo "Error Some pods are not ready!"
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_NAMESPACE 'chatqa' $((pods_count-1)) 0 9
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# give the data time to populate
sleep 90

Expand Down Expand Up @@ -144,6 +223,12 @@ function validate_chatqna() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_NAMESPACE
kubectl delete gmc -n $CHATQNA_NAMESPACE 'chatqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_NAMESPACE
}

function validate_chatqna_with_dataprep() {
Expand All @@ -169,6 +254,15 @@ function validate_chatqna_with_dataprep() {
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_DATAPREP_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_DATAPREP_NAMESPACE 'chatqa' $((pods_count-1)) 0 10
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# give the data time to populate
sleep 90

Expand Down Expand Up @@ -222,6 +316,12 @@ function validate_chatqna_with_dataprep() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_DATAPREP_NAMESPACE
kubectl delete gmc -n $CHATQNA_DATAPREP_NAMESPACE 'chatqa'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_DATAPREP_NAMESPACE
}

function validate_chatqna_in_switch() {
Expand All @@ -247,6 +347,15 @@ function validate_chatqna_in_switch() {
exit 1
fi

pods_count=$(kubectl get pods -n $CHATQNA_SWITCH_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_SWITCH_NAMESPACE 'switch' $((pods_count-1)) 0 15
if [ $? -ne 0 ]; then
echo "GMC status is not as expected"
exit 1
fi

# give the data time to populate
sleep 90

Expand Down Expand Up @@ -303,6 +412,106 @@ function validate_chatqna_in_switch() {
else
echo "Response check succeed!"
fi

kubectl delete deployment client-test -n $CHATQNA_SWITCH_NAMESPACE
kubectl delete gmc -n $CHATQNA_SWITCH_NAMESPACE 'switch'
echo "sleep 10s for cleaning up"
sleep 10
check_resource_cleared $CHATQNA_SWITCH_NAMESPACE
}

#######################################
# Deploys a copy of the codegen sample into MODIFY_STEP_NAMESPACE, changes
# the MODEL_ID of the "Tgi" step, re-applies it, and checks the GMC status
# before and after the change.
# Globals:   MODIFY_STEP_NAMESPACE (read)
# Arguments: none
# Outputs:   progress messages and the pod listing on stdout
# Returns:   exits the script with status 1 when pods are not ready or the
#            GMC status check fails
#######################################
function validate_modify_config() {
  local sample_dir base_yaml mod_yaml output pods_count

  # Build the paths once and keep them quoted: the checkout directory may
  # contain whitespace, which would split an unquoted $(pwd).
  sample_dir="$(pwd)/config/samples"
  base_yaml="${sample_dir}/codegen_xeon.yaml"
  mod_yaml="${sample_dir}/codegen_xeon_mod.yaml"

  kubectl create ns "$MODIFY_STEP_NAMESPACE"
  cp "$base_yaml" "$mod_yaml"
  sed -i "s|namespace: codegen|namespace: $MODIFY_STEP_NAMESPACE|g" "$mod_yaml"
  kubectl apply -f "$mod_yaml"

  # Wait until the router service is ready
  echo "Waiting for the router service to be ready..."
  wait_until_pod_ready "router" "$MODIFY_STEP_NAMESPACE" "router-service"
  output=$(kubectl get pods -n "$MODIFY_STEP_NAMESPACE")
  echo "$output"

  # Wait until all pods are ready
  wait_until_all_pod_ready "$MODIFY_STEP_NAMESPACE" 300s
  if [[ $? -ne 0 ]]; then
    echo "Error Some pods are not ready!"
    exit 1
  fi

  # Arguments after the GMC name follow the order used elsewhere in this
  # script: expected_ready_pods, expected_external_pods, expected_total_pods.
  pods_count=$(kubectl get pods -n "$MODIFY_STEP_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | wc -w)
  check_gmc_status "$MODIFY_STEP_NAMESPACE" 'codegen' "$((pods_count))" 0 3
  if [[ $? -ne 0 ]]; then
    echo "GMC status is not as expected"
    exit 1
  fi

  # change the model id of the step named "Tgi" in codegen_xeon_mod.yaml
  yq -i '(.spec.nodes.root.steps[] | select ( .name == "Tgi")).internalService.config.MODEL_ID = "bigscience/bloom-560m"' "$mod_yaml"
  kubectl apply -f "$mod_yaml"
  # This apply is expected to report an error. It is a known issue that does
  # not affect the tests:
  # https://github.com/opea-project/GenAIInfra/issues/314

  pods_count=$(kubectl get pods -n "$MODIFY_STEP_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | wc -w)
  check_gmc_status "$MODIFY_STEP_NAMESPACE" 'codegen' "$((pods_count))" 0 3
  if [[ $? -ne 0 ]]; then
    echo "GMC status is not as expected"
    exit 1
  fi

  # revert the namespace in the copied codegen yaml
  sed -i "s|namespace: $MODIFY_STEP_NAMESPACE|namespace: codegen|g" "$mod_yaml"
  kubectl delete gmc -n "$MODIFY_STEP_NAMESPACE" 'codegen'
  echo "sleep 10s for cleaning up"
  sleep 10
  check_resource_cleared "$MODIFY_STEP_NAMESPACE"
}

#######################################
# Deploys a copy of the codegen sample into DELETE_STEP_NAMESPACE, removes
# the "Llm" step from the graph, re-applies it, and checks the GMC status
# before and after the removal.
# Globals:   DELETE_STEP_NAMESPACE (read)
# Arguments: none
# Outputs:   progress messages and the pod listing on stdout
# Returns:   exits the script with status 1 when pods are not ready or the
#            GMC status check fails
#######################################
function validate_remove_step() {
  local sample_dir base_yaml del_yaml output pods_count

  # Build the paths once and keep them quoted: the checkout directory may
  # contain whitespace, which would split an unquoted $(pwd).
  sample_dir="$(pwd)/config/samples"
  base_yaml="${sample_dir}/codegen_xeon.yaml"
  del_yaml="${sample_dir}/codegen_xeon_del.yaml"

  kubectl create ns "$DELETE_STEP_NAMESPACE"
  cp "$base_yaml" "$del_yaml"
  sed -i "s|namespace: codegen|namespace: $DELETE_STEP_NAMESPACE|g" "$del_yaml"
  kubectl apply -f "$del_yaml"

  # Wait until the router service is ready
  echo "Waiting for the router service to be ready..."
  wait_until_pod_ready "router" "$DELETE_STEP_NAMESPACE" "router-service"
  output=$(kubectl get pods -n "$DELETE_STEP_NAMESPACE")
  echo "$output"

  # Wait until all pods are ready
  wait_until_all_pod_ready "$DELETE_STEP_NAMESPACE" 300s
  if [[ $? -ne 0 ]]; then
    echo "Error Some pods are not ready!"
    exit 1
  fi

  # Arguments after the GMC name follow the order used elsewhere in this
  # script: expected_ready_pods, expected_external_pods, expected_total_pods.
  pods_count=$(kubectl get pods -n "$DELETE_STEP_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | wc -w)
  check_gmc_status "$DELETE_STEP_NAMESPACE" 'codegen' "$((pods_count))" 0 3
  if [[ $? -ne 0 ]]; then
    echo "GMC status is not as expected"
    exit 1
  fi

  # remove the step named "Llm" from codegen_xeon_del.yaml
  yq -i 'del(.spec.nodes.root.steps[] | select ( .name == "Llm"))' "$del_yaml"
  kubectl apply -f "$del_yaml"

  sleep 10
  check_pod_terminated "$DELETE_STEP_NAMESPACE"

  # With the step removed, the expected ready and total counts drop to 2.
  check_gmc_status "$DELETE_STEP_NAMESPACE" 'codegen' 2 0 2
  if [[ $? -ne 0 ]]; then
    echo "GMC status is not as expected"
    exit 1
  fi

  # revert the namespace in the copied codegen yaml
  sed -i "s|namespace: $DELETE_STEP_NAMESPACE|namespace: codegen|g" "$del_yaml"
  kubectl delete gmc -n "$DELETE_STEP_NAMESPACE" 'codegen'
  echo "sleep 10s for cleaning up"
  sleep 10
  check_resource_cleared "$DELETE_STEP_NAMESPACE"
}

function validate_codegen() {
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/scripts/e2e/gmc_xeon_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ function validate_chatqna() {
fi

pods_count=$(kubectl get pods -n $CHATQNA_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)

# expected_ready_pods, expected_external_pods, expected_total_pods
# pods_count-1 is to exclude the client pod in this namespace
check_gmc_status $CHATQNA_NAMESPACE 'chatqa' $((pods_count-1)) 0 9
Expand Down Expand Up @@ -359,7 +358,6 @@ function validate_chatqna_in_switch() {
exit 1
fi


# give the data time to populate
sleep 90

Expand Down Expand Up @@ -454,6 +452,8 @@ function validate_modify_config() {
#change the model id of the step named "Tgi" in the codegen_xeon_mod.yaml
yq -i '(.spec.nodes.root.steps[] | select ( .name == "Tgi")).internalService.config.MODEL_ID = "HuggingFaceH4/mistral-7b-grok"' $(pwd)/config/samples/codegen_xeon_mod.yaml
kubectl apply -f $(pwd)/config/samples/codegen_xeon_mod.yaml
# This apply is expected to report an error. It is a known issue that does not affect the tests:
#https://github.com/opea-project/GenAIInfra/issues/314

pods_count=$(kubectl get pods -n $MODIFY_STEP_NAMESPACE -o jsonpath='{.items[*].metadata.name}' | wc -w)
check_gmc_status $MODIFY_STEP_NAMESPACE 'codegen' $pods_count 0 3
Expand Down Expand Up @@ -510,7 +510,7 @@ function validate_remove_step() {
fi

#revert the codegen yaml
sed -i "s|namespace: $MODIFY_STEP_NAMESPACE|namespace: codegen|g" $(pwd)/config/samples/codegen_xeon_del.yaml
sed -i "s|namespace: $DELETE_STEP_NAMESPACE|namespace: codegen|g" $(pwd)/config/samples/codegen_xeon_del.yaml
kubectl delete gmc -n $DELETE_STEP_NAMESPACE 'codegen'
echo "sleep 10s for cleaning up"
sleep 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ func (r *GMConnectorReconciler) collectResourceStatus(graph *mcv1alpha3.GMConnec
var latestGraph mcv1alpha3.GMConnector
err = r.Client.Get(ctx, types.NamespacedName{Namespace: graph.Namespace, Name: graph.Name}, &latestGraph)
if err != nil && apierr.IsNotFound(err) {
fmt.Printf("failed to get graph %s before update status : %s", graph.Name, err)
fmt.Printf("failed to get graph %s before update status : %s\n", graph.Name, err)
} else {
graph.SetResourceVersion(latestGraph.GetResourceVersion())
}
Expand Down
Loading

0 comments on commit 8a85364

Please sign in to comment.