refine logging code. (#559)

* add unit tests (ut) and refine logging code.
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* update microservice port.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
opea-project · Aug 28, 2024 · 5b3053f · 5b3053f
1 parent 3e87c3b
commit 5b3053f
Show file tree

Hide file tree

Showing 10 changed files with 32 additions and 81 deletions.
diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md
@@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu
 Start docker container with below command:
 
 ```bash
-docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
+docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
 ```
 
 ## 2.2 Setup on Gaudi2
@@ -81,7 +81,7 @@ Start docker container with below command:
 
 ```bash
 export HF_TOKEN=${your_huggingface_token}
-docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
+docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
 ```
 
 # 🚀3. Consume Finetuning Service
@@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in
 
 ```bash
 # upload a training file
-curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
+curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
 
 # create a finetuning job
-curl http://${your_ip}:8005/v1/fine_tuning/jobs \
+curl http://${your_ip}:8015/v1/fine_tuning/jobs \
   -X POST \
   -H "Content-Type: application/json" \
   -d '{
@@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \
   }'
 
 # list finetuning jobs
-curl http://${your_ip}:8005/v1/fine_tuning/jobs   -X GET
+curl http://${your_ip}:8015/v1/fine_tuning/jobs   -X GET
 
 # retrieve one finetuning job
-curl http://localhost:8005/v1/fine_tuning/jobs/retrieve   -X POST   -H "Content-Type: application/json"   -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/retrieve   -X POST   -H "Content-Type: application/json"   -d '{
     "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # cancel one finetuning job
 
-curl http://localhost:8005/v1/fine_tuning/jobs/cancel   -X POST   -H "Content-Type: application/json"   -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/cancel   -X POST   -H "Content-Type: application/json"   -d '{
     "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # list checkpoints of a finetuning job
-curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
+curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 ```
diff --git a/comps/finetuning/datasets/.gitkeep b/comps/finetuning/datasets/.gitkeep
diff --git a/comps/finetuning/finetuning_service.py b/comps/finetuning/finetuning_service.py
@@ -20,28 +20,28 @@
 )
 
 
-@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005)
+@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015)
 def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks):
     return handle_create_finetuning_jobs(request, background_tasks)
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"]
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"]
 )
 def list_finetuning_jobs():
     return handle_list_finetuning_jobs()
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015
 )
 def retrieve_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_retrieve_finetuning_job(request)
     return job
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015
 )
 def cancel_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_cancel_finetuning_job(request)
@@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest):
     name="opea_service@finetuning",
     endpoint="/v1/finetune/upload_training_files",
     host="0.0.0.0",
-    port=8005,
+    port=8015,
 )
 async def upload_training_files(
     files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
@@ -69,7 +69,7 @@ async def upload_training_files(
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015
 )
 def list_checkpoints(request: FineTuningJobIDRequest):
     checkpoints = handle_list_finetuning_checkpoints(request)

diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py
@@ -12,6 +12,7 @@
 from pydantic_yaml import parse_yaml_raw_as, to_yaml_file
 from ray.job_submission import JobSubmissionClient
 
+from comps import CustomLogger
 from comps.cores.proto.api_protocol import (
     FineTuningJob,
     FineTuningJobIDRequest,
@@ -20,6 +21,8 @@
 )
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("finetuning_handlers")
+
 MODEL_CONFIG_FILE_MAP = {
     "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml",
     "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml",
@@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID):
         status = str(job_status).lower()
         # Ray status "stopped" is OpenAI status "cancelled"
         status = "cancelled" if status == "stopped" else status
-        print(f"Status of job {job_id} is '{status}'")
+        logger.info(f"Status of job {job_id} is '{status}'")
         running_finetuning_jobs[job_id].status = status
         if status == "finished" or status == "cancelled" or status == "failed":
             break
@@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
     )
     finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id)
     if os.getenv("DEVICE", ""):
-        print(f"specific device: {os.getenv('DEVICE')}")
+        logger.info(f"specific device: {os.getenv('DEVICE')}")
         finetune_config.Training.device = os.getenv("DEVICE")
 
     finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml"
@@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
         # Path to the local directory that contains the script.py file
         runtime_env={"working_dir": "./"},
     )
-    print(f"Submitted Ray job: {ray_job_id} ...")
+    logger.info(f"Submitted Ray job: {ray_job_id} ...")
 
     running_finetuning_jobs[job.id] = job
     finetuning_job_to_ray_job[job.id] = ray_job_id
@@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content):
                 content = await content.read()
                 fout.write(content)
     except Exception as e:
-        print(f"Write file failed. Exception: {e}")
+        logger.info(f"Write file failed. Exception: {e}")
         raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
 
 

diff --git a/comps/finetuning/jobs/.gitkeep b/comps/finetuning/jobs/.gitkeep
diff --git a/comps/finetuning/lanuch.sh → comps/finetuning/launch.sh b/comps/finetuning/lanuch.sh → comps/finetuning/launch.sh
@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 if [[ -n "$RAY_PORT" ]];then
-    export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
     ray start --head --port $RAY_PORT
 else
-    export RAY_ADDRESS=http://127.0.0.1:8265
     ray start --head
+    export RAY_PORT=8265
 fi
 
+export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
 python finetuning_service.py
diff --git a/comps/finetuning/llm_on_ray/common/__init__.py b/comps/finetuning/llm_on_ray/common/__init__.py
@@ -3,5 +3,4 @@
 #
 # Copyright 2023 The LLM-on-Ray Authors.
 
-from .logging import logger
 from .torch_config import TorchConfig
diff --git a/comps/finetuning/llm_on_ray/common/common.py b/comps/finetuning/llm_on_ray/common/common.py
@@ -7,7 +7,9 @@
 import importlib
 import os
 
-from .logging import logger
+from comps import CustomLogger
+
+logger = CustomLogger("llm_on_ray")
 
 
 def import_all_modules(basedir, prefix=None):

diff --git a/comps/finetuning/llm_on_ray/common/logging.py b/comps/finetuning/llm_on_ray/common/logging.py
diff --git a/comps/finetuning/llm_on_ray/finetune/finetune.py b/comps/finetuning/llm_on_ray/finetune/finetune.py
@@ -23,10 +23,13 @@
 from ray.air.config import ScalingConfig
 from ray.train.torch import TorchTrainer
 
+from comps import CustomLogger
 from comps.finetuning.llm_on_ray import common
 from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("llm_on_ray/finetune")
+
 
 def adapt_transformers_to_device(config: Dict):
     device = config["Training"]["device"]
@@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]):
 
     training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)
 
-    common.logger.info("train start")
+    logger.info("train start")
     trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
     trainer.save_model()
-    common.logger.info("train finish")
+    logger.info("train finish")
 
 
 def get_finetune_config():
@@ -401,7 +404,7 @@ def main(external_config=None):
         else:
             ray.init(runtime_env=runtime_env)
 
-    common.logger.info(f"ray available resources = {ray.available_resources()}")
+    logger.info(f"ray available resources = {ray.available_resources()}")
     use_gpu = True if device == "gpu" else False
     scaling_config = ScalingConfig(
         num_workers=num_training_workers,