diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md index 411395ec9..44ee3d10c 100644 --- a/comps/finetuning/README.md +++ b/comps/finetuning/README.md @@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu Start docker container with below command: ```bash -docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest +docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest ``` ## 2.2 Setup on Gaudi2 @@ -81,7 +81,7 @@ Start docker container with below command: ```bash export HF_TOKEN=${your_huggingface_token} -docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest +docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest ``` # 🚀3. Consume Finetuning Service @@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in ```bash # upload a training file -curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json" +curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json" # create a finetuning job -curl http://${your_ip}:8005/v1/fine_tuning/jobs \ +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \ }' # list finetuning jobs -curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET +curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET # retrieve one finetuning job -curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{ +curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{ "fine_tuning_job_id": ${fine_tuning_job_id}}' # cancel one finetuning job -curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{ +curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{ "fine_tuning_job_id": ${fine_tuning_job_id}}' # list checkpoints of a finetuning job -curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' +curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' ``` diff --git a/comps/finetuning/datasets/.gitkeep b/comps/finetuning/datasets/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/comps/finetuning/finetuning_service.py b/comps/finetuning/finetuning_service.py index fabb32bc4..031380a5d 100644 --- a/comps/finetuning/finetuning_service.py +++ b/comps/finetuning/finetuning_service.py @@ -20,20 +20,20 @@ ) -@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005) +@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015) def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks): return handle_create_finetuning_jobs(request, background_tasks) @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"] + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] ) def list_finetuning_jobs(): return handle_list_finetuning_jobs() @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015 ) def retrieve_finetuning_job(request: FineTuningJobIDRequest): job = handle_retrieve_finetuning_job(request) @@ -41,7 +41,7 @@ def retrieve_finetuning_job(request: FineTuningJobIDRequest): @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015 ) def cancel_finetuning_job(request: FineTuningJobIDRequest): job = handle_cancel_finetuning_job(request) @@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest): name="opea_service@finetuning", endpoint="/v1/finetune/upload_training_files", host="0.0.0.0", - port=8005, + port=8015, ) async def upload_training_files( files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), @@ -69,7 +69,7 @@ async def upload_training_files( @register_microservice( - name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015 ) def list_checkpoints(request: FineTuningJobIDRequest): checkpoints = handle_list_finetuning_checkpoints(request) diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py index 2bdab42a9..6aa7e5d3e 100644 --- a/comps/finetuning/handlers.py +++ b/comps/finetuning/handlers.py @@ -12,6 +12,7 @@ from pydantic_yaml import parse_yaml_raw_as, to_yaml_file from ray.job_submission import JobSubmissionClient +from comps import CustomLogger from comps.cores.proto.api_protocol import ( FineTuningJob, FineTuningJobIDRequest, @@ -20,6 +21,8 @@ ) from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig +logger = CustomLogger("finetuning_handlers") + MODEL_CONFIG_FILE_MAP = { "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml", "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml", @@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID): status = str(job_status).lower() # Ray status "stopped" is OpenAI status "cancelled" status = "cancelled" if status == "stopped" else status - print(f"Status of job {job_id} is '{status}'") + logger.info(f"Status of job {job_id} is '{status}'") running_finetuning_jobs[job_id].status = status if status == "finished" or status == "cancelled" or status == "failed": break @@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas ) finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id) if os.getenv("DEVICE", ""): - print(f"specific device: {os.getenv('DEVICE')}") + logger.info(f"specific device: {os.getenv('DEVICE')}") finetune_config.Training.device = os.getenv("DEVICE") finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml" @@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas # Path to the local directory that contains the script.py file runtime_env={"working_dir": "./"}, ) - print(f"Submitted Ray job: {ray_job_id} ...") + logger.info(f"Submitted Ray job: {ray_job_id} ...") running_finetuning_jobs[job.id] = job finetuning_job_to_ray_job[job.id] = ray_job_id @@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content): content = await content.read() fout.write(content) except Exception as e: - print(f"Write file failed. Exception: {e}") + logger.info(f"Write file failed. Exception: {e}") raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") diff --git a/comps/finetuning/jobs/.gitkeep b/comps/finetuning/jobs/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/comps/finetuning/lanuch.sh b/comps/finetuning/launch.sh similarity index 68% rename from comps/finetuning/lanuch.sh rename to comps/finetuning/launch.sh index a7e249b6f..bb5042ac6 100644 --- a/comps/finetuning/lanuch.sh +++ b/comps/finetuning/launch.sh @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 if [[ -n "$RAY_PORT" ]];then - export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT ray start --head --port $RAY_PORT else - export RAY_ADDRESS=http://127.0.0.1:8265 ray start --head + export RAY_PORT=8265 fi +export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT python finetuning_service.py diff --git a/comps/finetuning/llm_on_ray/common/__init__.py b/comps/finetuning/llm_on_ray/common/__init__.py index a4ad1e878..954b7baa4 100644 --- a/comps/finetuning/llm_on_ray/common/__init__.py +++ b/comps/finetuning/llm_on_ray/common/__init__.py @@ -3,5 +3,4 @@ # # Copyright 2023 The LLM-on-Ray Authors. -from .logging import logger from .torch_config import TorchConfig diff --git a/comps/finetuning/llm_on_ray/common/common.py b/comps/finetuning/llm_on_ray/common/common.py index 136d2526f..ac01ae12e 100644 --- a/comps/finetuning/llm_on_ray/common/common.py +++ b/comps/finetuning/llm_on_ray/common/common.py @@ -7,7 +7,9 @@ import importlib import os -from .logging import logger +from comps import CustomLogger + +logger = CustomLogger("llm_on_ray") def import_all_modules(basedir, prefix=None): diff --git a/comps/finetuning/llm_on_ray/common/logging.py b/comps/finetuning/llm_on_ray/common/logging.py deleted file mode 100644 index e2aec567a..000000000 --- a/comps/finetuning/llm_on_ray/common/logging.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -import functools -import logging -import logging.config -import traceback - -__all__ = ["logger", "get_logger"] - -use_accelerate_log = False -logger_name = "common" - -logging_config = { - "version": 1, - "loggers": { - "root": {"level": "INFO", "handlers": ["consoleHandler"]}, - "common": { - "level": "INFO", - "handlers": ["consoleHandler"], - "qualname": "common", - "propagate": 0, - }, - }, - "handlers": { - "consoleHandler": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "standardFormatter", - }, - }, - "formatters": { - "standardFormatter": { - "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", - "datefmt": "", - } - }, -} - -if logging_config is not None: - try: - logging.config.dictConfig(logging_config) - except Exception: - traceback.print_exc() - exit(1) - -if use_accelerate_log: - import accelerate - - get_logger = functools.partial(accelerate.logging.get_logger, name=logger_name) -else: - get_logger = functools.partial(logging.getLogger, name=logger_name) - -logger = get_logger() diff --git a/comps/finetuning/llm_on_ray/finetune/finetune.py b/comps/finetuning/llm_on_ray/finetune/finetune.py index f268800f2..03b8adfaa 100644 --- a/comps/finetuning/llm_on_ray/finetune/finetune.py +++ b/comps/finetuning/llm_on_ray/finetune/finetune.py @@ -23,10 +23,13 @@ from ray.air.config import ScalingConfig from ray.train.torch import TorchTrainer +from comps import CustomLogger from comps.finetuning.llm_on_ray import common from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig +logger = CustomLogger("llm_on_ray/finetune") + def adapt_transformers_to_device(config: Dict): device = config["Training"]["device"] @@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]): training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator) - common.logger.info("train start") + logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) trainer.save_model() - common.logger.info("train finish") + logger.info("train finish") def get_finetune_config(): @@ -401,7 +404,7 @@ def main(external_config=None): else: ray.init(runtime_env=runtime_env) - common.logger.info(f"ray available resources = {ray.available_resources()}") + logger.info(f"ray available resources = {ray.available_resources()}") use_gpu = True if device == "gpu" else False scaling_config = ScalingConfig( num_workers=num_training_workers,