Skip to content

Commit

Permalink
Add unit test for resuming fine-tuning from a checkpoint (opea-project#646)
Browse files Browse the repository at this point in the history
* Add unit test for resuming fine-tuning from a checkpoint.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add the final tuned model to the listed checkpoints once the job has succeeded.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 10, 2024
1 parent 94eb60f commit c718602
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 6 deletions.
20 changes: 17 additions & 3 deletions comps/finetuning/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import os
import random
import re
import time
import urllib.parse
import uuid
Expand Down Expand Up @@ -61,7 +62,7 @@ def update_job_status(job_id: FineTuningJobID):
status = "cancelled" if status == "stopped" else status
logger.info(f"Status of job {job_id} is '{status}'")
running_finetuning_jobs[job_id].status = status
if status == "finished" or status == "cancelled" or status == "failed":
if status == "succeeded" or status == "cancelled" or status == "failed":
break
time.sleep(CHECK_JOB_STATUS_INTERVAL)

Expand Down Expand Up @@ -190,7 +191,21 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
checkpoints = []
if os.path.exists(output_dir):
# Iterate over the contents of the directory and add an entry for each
for _ in os.listdir(output_dir): # Loop over directory contents
files = os.listdir(output_dir)
for file in files: # Loop over directory contents
file_path = os.path.join(output_dir, file)
if os.path.isdir(file_path) and file.startswith("checkpoint"):
steps = re.findall("\d+", file)[0]
checkpointsResponse = FineTuningJobCheckpoint(
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
created_at=int(time.time()), # Use the current timestamp
fine_tuned_model_checkpoint=file_path, # Directory path itself
fine_tuning_job_id=fine_tuning_job_id,
object="fine_tuning.job.checkpoint",
step_number=steps,
)
checkpoints.append(checkpointsResponse)
if job.status == "succeeded":
checkpointsResponse = FineTuningJobCheckpoint(
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
created_at=int(time.time()), # Use the current timestamp
Expand All @@ -199,7 +214,6 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
object="fine_tuning.job.checkpoint",
)
checkpoints.append(checkpointsResponse)
checkpoint_id_to_checkpoint_path[checkpointsResponse.id] = checkpointsResponse.fine_tuned_model_checkpoint

return checkpoints

Expand Down
79 changes: 76 additions & 3 deletions tests/test_finetuning_embedding_hpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ function start_service() {
sleep 1m
}


function validate_microservice() {
cd $LOG_PATH
export no_proxy="localhost,127.0.0.1,"${ip_address}
Expand Down Expand Up @@ -79,8 +80,10 @@ function validate_microservice() {
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":3}}' "$URL")
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id')
SERVICE_NAME="finetuning-server - create finetuning job"


if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
Expand All @@ -96,10 +99,80 @@ function validate_microservice() {
echo "[ $SERVICE_NAME ] Content is as expected."
fi

sleep 10m
# test /v1/fine_tuning/jobs/retrieve
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve"
for((i=1;i<=10;i++));
do
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
echo $HTTP_RESPONSE
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status')
if [[ "$STATUS" == "succeeded" ]]; then
echo "training: succeeded."
break
elif [[ "$STATUS" == "failed" ]]; then
echo "training: failed."
exit 1
else
echo "training: '$STATUS'"
fi
sleep 1m
done

# test /v1/finetune/list_checkpoints
URL="http://${ip_address}:$finetuning_service_port/v1/finetune/list_checkpoints"
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
echo $HTTP_RESPONSE
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
fine_tuned_model_checkpoint=$(echo "$RESPONSE_BODY" | jq -r '.[0].fine_tuned_model_checkpoint')
echo $fine_tuned_model_checkpoint

echo "start resume checkpoint............................................."
# resume checkpoint /v1/fine_tuning/jobs
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs"
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch","resume_from_checkpoint":"'$fine_tuned_model_checkpoint'"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":5}}' "$URL")
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id')
SERVICE_NAME="finetuning-server - resume checkpoint"


if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi

# check training status /v1/fine_tuning/jobs/retrieve
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve"
for((i=1;i<=10;i++));
do
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
echo $HTTP_RESPONSE
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status')
if [[ "$STATUS" == "succeeded" ]]; then
echo "training: succeeded."
break
elif [[ "$STATUS" == "failed" ]]; then
echo "training: failed."
exit 1
else
echo "training: '$STATUS'"
fi
sleep 1m
done


# get logs
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
}

function stop_docker() {
Expand Down

0 comments on commit c718602

Please sign in to comment.