Skip to content

Commit

Permalink
refine logging code. (#559)
Browse files Browse the repository at this point in the history
* add ut and refine logging code.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update microservice port.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 28, 2024
1 parent 3e87c3b commit 5b3053f
Show file tree
Hide file tree
Showing 10 changed files with 32 additions and 81 deletions.
16 changes: 8 additions & 8 deletions comps/finetuning/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu
Start the Docker container with the command below:

```bash
docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
```

## 2.2 Setup on Gaudi2
Expand All @@ -81,7 +81,7 @@ Start docker container with below command:

```bash
export HF_TOKEN=${your_huggingface_token}
docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
```

# 🚀3. Consume Finetuning Service
Expand All @@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in

```bash
# upload a training file
curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"

# create a finetuning job
curl http://${your_ip}:8005/v1/fine_tuning/jobs \
curl http://${your_ip}:8015/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
Expand All @@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \
}'

# list finetuning jobs
curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET
curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET

# retrieve one finetuning job
curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
"fine_tuning_job_id": ${fine_tuning_job_id}}'

# cancel one finetuning job

curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
"fine_tuning_job_id": ${fine_tuning_job_id}}'

# list checkpoints of a finetuning job
curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'

```
Empty file removed comps/finetuning/datasets/.gitkeep
Empty file.
12 changes: 6 additions & 6 deletions comps/finetuning/finetuning_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,28 @@
)


@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005)
@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015)
def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks):
return handle_create_finetuning_jobs(request, background_tasks)


@register_microservice(
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"]
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"]
)
def list_finetuning_jobs():
return handle_list_finetuning_jobs()


@register_microservice(
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015
)
def retrieve_finetuning_job(request: FineTuningJobIDRequest):
job = handle_retrieve_finetuning_job(request)
return job


@register_microservice(
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005
name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015
)
def cancel_finetuning_job(request: FineTuningJobIDRequest):
job = handle_cancel_finetuning_job(request)
Expand All @@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest):
name="opea_service@finetuning",
endpoint="/v1/finetune/upload_training_files",
host="0.0.0.0",
port=8005,
port=8015,
)
async def upload_training_files(
files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
Expand All @@ -69,7 +69,7 @@ async def upload_training_files(


@register_microservice(
name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005
name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015
)
def list_checkpoints(request: FineTuningJobIDRequest):
checkpoints = handle_list_finetuning_checkpoints(request)
Expand Down
11 changes: 7 additions & 4 deletions comps/finetuning/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pydantic_yaml import parse_yaml_raw_as, to_yaml_file
from ray.job_submission import JobSubmissionClient

from comps import CustomLogger
from comps.cores.proto.api_protocol import (
FineTuningJob,
FineTuningJobIDRequest,
Expand All @@ -20,6 +21,8 @@
)
from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig

logger = CustomLogger("finetuning_handlers")

MODEL_CONFIG_FILE_MAP = {
"meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml",
"mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml",
Expand Down Expand Up @@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID):
status = str(job_status).lower()
# Ray status "stopped" is OpenAI status "cancelled"
status = "cancelled" if status == "stopped" else status
print(f"Status of job {job_id} is '{status}'")
logger.info(f"Status of job {job_id} is '{status}'")
running_finetuning_jobs[job_id].status = status
if status == "finished" or status == "cancelled" or status == "failed":
break
Expand Down Expand Up @@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
)
finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id)
if os.getenv("DEVICE", ""):
print(f"specific device: {os.getenv('DEVICE')}")
logger.info(f"specific device: {os.getenv('DEVICE')}")
finetune_config.Training.device = os.getenv("DEVICE")

finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml"
Expand All @@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
# Path to the local directory that contains the script.py file
runtime_env={"working_dir": "./"},
)
print(f"Submitted Ray job: {ray_job_id} ...")
logger.info(f"Submitted Ray job: {ray_job_id} ...")

running_finetuning_jobs[job.id] = job
finetuning_job_to_ray_job[job.id] = ray_job_id
Expand Down Expand Up @@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content):
content = await content.read()
fout.write(content)
except Exception as e:
print(f"Write file failed. Exception: {e}")
logger.info(f"Write file failed. Exception: {e}")
raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")


Expand Down
Empty file removed comps/finetuning/jobs/.gitkeep
Empty file.
4 changes: 2 additions & 2 deletions comps/finetuning/lanuch.sh → comps/finetuning/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
# SPDX-License-Identifier: Apache-2.0

if [[ -n "$RAY_PORT" ]];then
export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
ray start --head --port $RAY_PORT
else
export RAY_ADDRESS=http://127.0.0.1:8265
ray start --head
export RAY_PORT=8265
fi

export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
python finetuning_service.py
1 change: 0 additions & 1 deletion comps/finetuning/llm_on_ray/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@
#
# Copyright 2023 The LLM-on-Ray Authors.

from .logging import logger
from .torch_config import TorchConfig
4 changes: 3 additions & 1 deletion comps/finetuning/llm_on_ray/common/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import importlib
import os

from .logging import logger
from comps import CustomLogger

logger = CustomLogger("llm_on_ray")


def import_all_modules(basedir, prefix=None):
Expand Down
56 changes: 0 additions & 56 deletions comps/finetuning/llm_on_ray/common/logging.py

This file was deleted.

9 changes: 6 additions & 3 deletions comps/finetuning/llm_on_ray/finetune/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer

from comps import CustomLogger
from comps.finetuning.llm_on_ray import common
from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor
from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig

logger = CustomLogger("llm_on_ray/finetune")


def adapt_transformers_to_device(config: Dict):
device = config["Training"]["device"]
Expand Down Expand Up @@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]):

training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)

common.logger.info("train start")
logger.info("train start")
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model()
common.logger.info("train finish")
logger.info("train finish")


def get_finetune_config():
Expand Down Expand Up @@ -401,7 +404,7 @@ def main(external_config=None):
else:
ray.init(runtime_env=runtime_env)

common.logger.info(f"ray available resources = {ray.available_resources()}")
logger.info(f"ray available resources = {ray.available_resources()}")
use_gpu = True if device == "gpu" else False
scaling_config = ScalingConfig(
num_workers=num_training_workers,
Expand Down

0 comments on commit 5b3053f

Please sign in to comment.