diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py index ddae0726f..8026273de 100644 --- a/comps/finetuning/handlers.py +++ b/comps/finetuning/handlers.py @@ -3,6 +3,7 @@ import os import random +import re import time import urllib.parse import uuid @@ -61,7 +62,7 @@ def update_job_status(job_id: FineTuningJobID): status = "cancelled" if status == "stopped" else status logger.info(f"Status of job {job_id} is '{status}'") running_finetuning_jobs[job_id].status = status - if status == "finished" or status == "cancelled" or status == "failed": + if status == "succeeded" or status == "cancelled" or status == "failed": break time.sleep(CHECK_JOB_STATUS_INTERVAL) @@ -190,7 +191,21 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest): checkpoints = [] if os.path.exists(output_dir): # Iterate over the contents of the directory and add an entry for each - for _ in os.listdir(output_dir): # Loop over directory contents + files = os.listdir(output_dir) + for file in files: # Loop over directory contents + file_path = os.path.join(output_dir, file) + if os.path.isdir(file_path) and file.startswith("checkpoint"): + steps = re.findall("\d+", file)[0] + checkpointsResponse = FineTuningJobCheckpoint( + id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID + created_at=int(time.time()), # Use the current timestamp + fine_tuned_model_checkpoint=file_path, # Directory path itself + fine_tuning_job_id=fine_tuning_job_id, + object="fine_tuning.job.checkpoint", + step_number=steps, + ) + checkpoints.append(checkpointsResponse) + if job.status == "succeeded": checkpointsResponse = FineTuningJobCheckpoint( id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID created_at=int(time.time()), # Use the current timestamp @@ -199,7 +214,6 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest): object="fine_tuning.job.checkpoint", ) checkpoints.append(checkpointsResponse) - checkpoint_id_to_checkpoint_path[checkpointsResponse.id] = checkpointsResponse.fine_tuned_model_checkpoint return checkpoints diff --git a/tests/test_finetuning_embedding_hpu.sh b/tests/test_finetuning_embedding_hpu.sh index e080b0d13..a066140ae 100644 --- a/tests/test_finetuning_embedding_hpu.sh +++ b/tests/test_finetuning_embedding_hpu.sh @@ -28,6 +28,7 @@ function start_service() { sleep 1m } + function validate_microservice() { cd $LOG_PATH export no_proxy="localhost,127.0.0.1,"${ip_address} @@ -79,8 +80,10 @@ function validate_microservice() { HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":3}}' "$URL") HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id') SERVICE_NAME="finetuning-server - create finetuning job" + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log @@ -96,10 +99,80 @@ function validate_microservice() { echo "[ $SERVICE_NAME ] Content is as expected." fi - sleep 10m + # test /v1/fine_tuning/jobs/retrieve + URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve" + for((i=1;i<=10;i++)); + do + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL") + echo $HTTP_RESPONSE + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status') + if [[ "$STATUS" == "succeeded" ]]; then + echo "training: succeeded." + break + elif [[ "$STATUS" == "failed" ]]; then + echo "training: failed." + exit 1 + else + echo "training: '$STATUS'" + fi + sleep 1m + done + + # test /v1/finetune/list_checkpoints + URL="http://${ip_address}:$finetuning_service_port/v1/finetune/list_checkpoints" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL") + echo $HTTP_RESPONSE + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + fine_tuned_model_checkpoint=$(echo "$RESPONSE_BODY" | jq -r '.[0].fine_tuned_model_checkpoint') + echo $fine_tuned_model_checkpoint + + echo "start resume checkpoint............................................." + # resume checkpoint /v1/fine_tuning/jobs + URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch","resume_from_checkpoint":"'$fine_tuned_model_checkpoint'"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":5}}' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id') + SERVICE_NAME="finetuning-server - resume checkpoint" + + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # check training status /v1/fine_tuning/jobs/retrieve + URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve" + for((i=1;i<=10;i++)); + do + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL") + echo $HTTP_RESPONSE + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status') + if [[ "$STATUS" == "succeeded" ]]; then + echo "training: succeeded." + break + elif [[ "$STATUS" == "failed" ]]; then + echo "training: failed." + exit 1 + else + echo "training: '$STATUS'" + fi + sleep 1m + done + - # get logs - docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log } function stop_docker() {