From 54b7e096b264ccf2fd7e9a6729bf677966b1b933 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 18:49:35 -0400 Subject: [PATCH 01/10] Update gymnasium/pettingzoo/supersuit versions Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index c976b3f0889a9..1309fd8966239 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -5,7 +5,7 @@ # Atari # TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) gym==0.26.2 -gymnasium[atari,mujoco]==0.26.3 +gymnasium[atari,mujoco]==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. @@ -15,12 +15,9 @@ kaggle_environments==1.7.11 #mlagents==0.28.0 mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. -pettingzoo==1.22.1; python_version >= '3.7' -# When installing pettingzoo, chess is missing, even though its a dependancy -# TODO: remove if a future pettingzoo and/or ray version fixes this dependancy issue -chess==1.7.0 +pettingzoo==1.22.3; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.0; python_version >= '3.7' +supersuit==3.7.2; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. 
From 6b7bf1e8bb1b3d4ead9589262656cd127b84861b Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 18:56:58 -0400 Subject: [PATCH 02/10] pettingzoo_env: `return_info` from `reset()` also removes rendering mode from `render()` as that is specified in the environment initialization rather than in `render()` Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index ba2abc13ce734..651b5cce7f63b 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -136,7 +136,7 @@ def observation_space_contains(self, x: MultiAgentDict) -> bool: return all(self.observation_space.contains(val) for val in x.values()) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, return_info=True, options=options) + info = self.env.reset(seed=seed, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -175,7 +175,7 @@ def close(self): self.env.close() def render(self): - return self.env.render(self.render_mode) + return self.env.render() @property def get_sub_environments(self): @@ -221,7 +221,7 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) + obs, info = self.par_env.reset(seed=seed, options=options) return obs, info or {} def step(self, action_dict): From ea7afe8db005f81a64d03d12c244f76d7f2acb55 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 19:31:18 -0400 Subject: [PATCH 03/10] Switch to stable pettignzoo/shimmy releases Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 1309fd8966239..919d506f9e749 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -17,7 +17,7 @@ mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. pettingzoo==1.22.3; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.2; python_version >= '3.7' +supersuit==3.7.1; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. From 5c2256120502ceb631463ec34b92707fa3d9e42f Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 19:33:21 -0400 Subject: [PATCH 04/10] Update pettingzoo_env to work with pettingzoo 1.22.3 Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 29 ++++------------------------ 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index 651b5cce7f63b..3be5c03c2a93b 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -3,7 +3,6 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.gym import convert_old_gym_space_to_gymnasium_space -from ray.rllib.utils.typing import MultiAgentDict @PublicAPI @@ -113,30 +112,10 @@ def __init__(self, env): ) self.action_space = convert_old_gym_space_to_gymnasium_space(first_action_space) - self._agent_ids = self.env.agents - - def observation_space_sample(self, agent_ids: list = None) -> MultiAgentDict: - if agent_ids is None: - agent_ids = self._agent_ids - return {id: self.observation_space.sample() for id in agent_ids} - - def action_space_sample(self, agent_ids: list = None) -> MultiAgentDict: - if agent_ids is None: - agent_ids = self._agent_ids - return {id: self.action_space.sample() for id in agent_ids} - - def action_space_contains(self, x: MultiAgentDict) 
-> bool: - if not isinstance(x, dict): - return False - return all(self.action_space.contains(val) for val in x.values()) - - def observation_space_contains(self, x: MultiAgentDict) -> bool: - if not isinstance(x, dict): - return False - return all(self.observation_space.contains(val) for val in x.values()) + self._agent_ids = set(self.env.agents) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, options=options) + info = self.env.reset(seed=seed, return_info=True, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -221,7 +200,7 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, options=options) + obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) return obs, info or {} def step(self, action_dict): @@ -234,7 +213,7 @@ def close(self): self.par_env.close() def render(self): - return self.par_env.render(self.render_mode) + return self.par_env.render() @property def get_sub_environments(self): From cf1bae909a5bfb8701d9e4ebb532f62221545fa7 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 16:38:20 -0400 Subject: [PATCH 05/10] Update to newest PettingZoo, SuperSuit versions Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index d8a7b3d0380a8..6e1cac0f4b6b1 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -15,9 +15,9 @@ kaggle_environments==1.7.11 #mlagents==0.28.0 mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. 
-pettingzoo==1.22.3; python_version >= '3.7' +pettingzoo==1.23.0; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.1; python_version >= '3.7' +supersuit==3.8.0; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. From fe372972a761ab8c75006b306a1fbc0c4a6c94d5 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 16:59:22 -0400 Subject: [PATCH 06/10] Merge PettingZooEnv from master Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 29 ++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index 3be5c03c2a93b..3fd29732190eb 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -3,6 +3,7 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.gym import convert_old_gym_space_to_gymnasium_space +from ray.rllib.utils.typing import MultiAgentDict @PublicAPI @@ -112,10 +113,30 @@ def __init__(self, env): ) self.action_space = convert_old_gym_space_to_gymnasium_space(first_action_space) - self._agent_ids = set(self.env.agents) + self._agent_ids = self.env.agents + + def observation_space_sample(self, agent_ids: list = None) -> MultiAgentDict: + if agent_ids is None: + agent_ids = self._agent_ids + return {id: self.observation_space.sample() for id in agent_ids} + + def action_space_sample(self, agent_ids: list = None) -> MultiAgentDict: + if agent_ids is None: + agent_ids = self._agent_ids + return {id: self.action_space.sample() for id in agent_ids} + + def action_space_contains(self, x: MultiAgentDict) -> bool: + if not isinstance(x, dict): + return False + return all(self.action_space.contains(val) for val in x.values()) + + def observation_space_contains(self, x: MultiAgentDict) -> bool: + if not isinstance(x, dict): + return False + return 
all(self.observation_space.contains(val) for val in x.values()) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, return_info=True, options=options) + info = self.env.reset(seed=seed, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -200,8 +221,8 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) - return obs, info or {} + obs, info = self.par_env.reset(seed=seed, options=options) + return obs, info def step(self, action_dict): obss, rews, terminateds, truncateds, infos = self.par_env.step(action_dict) From 20487c73299915083946a8035363250715ab1dd7 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 17:36:36 -0400 Subject: [PATCH 07/10] Update requirements_rllib.txt Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 6e1cac0f4b6b1..305a57a7cc99a 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -5,7 +5,7 @@ # Atari # TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) gym==0.26.2 -gymnasium[atari,mujoco]==0.28.1 +gymnasium==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. 
From f6666893a0e2c9c0ddf6bb5c544a1b7b6431c058 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 17:37:04 -0400 Subject: [PATCH 08/10] Update requirements_rllib.txt Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 305a57a7cc99a..bb5621ea39f42 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -3,8 +3,6 @@ # Environment adapters. # --------------------- # Atari -# TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) -gym==0.26.2 gymnasium==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 From 583107d462ea710d9578a74d013a041627a2d341 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 18:12:43 -0400 Subject: [PATCH 09/10] Update all uses of gymnasium to 0.28.1 Signed-off-by: Elliot Tower --- python/requirements.txt | 2 +- python/requirements/ml/requirements_rllib.txt | 2 +- python/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index e6bfd5a7e4667..5e569b8f0b30a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -41,7 +41,7 @@ scikit-image scipy aiohttp>=3.7 fastapi -gymnasium==0.26.3 +gymnasium==0.28.1 opencensus fsspec dm_tree diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index bb5621ea39f42..78beaad6404c3 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -3,7 +3,7 @@ # Environment adapters. # --------------------- # Atari -gymnasium==0.28.1 +gymnasium[atari,mujoco]==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. 
diff --git a/python/setup.py b/python/setup.py index 11b725ed85a68..514f293434214 100644 --- a/python/setup.py +++ b/python/setup.py @@ -274,7 +274,7 @@ def get_packages(self): setup_spec.extras["rllib"] = setup_spec.extras["tune"] + [ "dm_tree", - "gymnasium==0.26.3", + "gymnasium==0.28.1", "lz4", "scikit-image", "pyyaml", From 39f6a8ce3ae149846fbb6bd76885f6498098a542 Mon Sep 17 00:00:00 2001 From: elliottower Date: Mon, 7 Aug 2023 11:32:00 -0400 Subject: [PATCH 10/10] Merge master --- .../batch_inference_object_detection.ipynb | 6 +- doc/source/data/examples/batch_training.ipynb | 2 +- .../huggingface_vit_batch_prediction.ipynb | 6 +- .../pytorch_resnet_batch_prediction.ipynb | 6 +- .../ray-air/examples/feast_example.ipynb | 86 +- .../huggingface_text_classification.ipynb | 2 +- .../opt_deepspeed_batch_inference.ipynb | 36 + .../ray-air/examples/sklearn_example.ipynb | 4 +- .../examples/tfx_tabular_train_to_serve.ipynb | 2 +- .../templates/01_batch_inference/start.ipynb | 6 +- .../lightning/lightning_mnist_example.ipynb | 232 +-- ...una_13b_lightning_deepspeed_finetune.ipynb | 474 +++--- .../pytorch/pytorch_resnet_finetune.ipynb | 2 +- doc/source/tune/examples/ax_example.ipynb | 4 +- .../tune/examples/bayesopt_example.ipynb | 4 +- doc/source/tune/examples/optuna_example.ipynb | 12 +- doc/source/tune/examples/tune-aim.ipynb | 808 +++++----- doc/source/tune/examples/tune-mlflow.ipynb | 2 +- .../tune-vanilla-pytorch-lightning.ipynb | 76 +- doc/source/tune/examples/tune-wandb.ipynb | 22 +- .../ray/air/examples/lightgbm_example.ipynb | 504 +++++- python/ray/air/examples/sklearn_example.ipynb | 357 ++++- .../ray/air/examples/upload_to_comet_ml.ipynb | 413 ++++- python/ray/air/examples/upload_to_wandb.ipynb | 370 ++++- python/ray/air/examples/xgboost_example.ipynb | 522 +++++- .../gptj_deepspeed_fine_tuning.ipynb | 1201 +++++++++++++- .../opt_deepspeed_batch_inference.ipynb | 37 +- ...una_13b_lightning_deepspeed_finetune.ipynb | 1426 ++++++++++++++++- 28 files changed, 
5736 insertions(+), 886 deletions(-) create mode 100644 doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb mode change 120000 => 100644 python/ray/air/examples/lightgbm_example.ipynb mode change 120000 => 100644 python/ray/air/examples/sklearn_example.ipynb mode change 120000 => 100644 python/ray/air/examples/upload_to_comet_ml.ipynb mode change 120000 => 100644 python/ray/air/examples/upload_to_wandb.ipynb mode change 120000 => 100644 python/ray/air/examples/xgboost_example.ipynb mode change 120000 => 100644 release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb mode change 120000 => 100644 release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb mode change 120000 => 100644 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb diff --git a/doc/source/data/examples/batch_inference_object_detection.ipynb b/doc/source/data/examples/batch_inference_object_detection.ipynb index 3f8da8947787b..a8026efe79566 100644 --- a/doc/source/data/examples/batch_inference_object_detection.ipynb +++ b/doc/source/data/examples/batch_inference_object_detection.ipynb @@ -452,10 +452,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2023-05-19 18:10:29] INFO ray._private.worker::Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "[2023-05-19 18:10:35] [Ray Data] WARNING ray.data.dataset::\u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "[2023-05-19 18:10:29] INFO ray._private.worker::Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32m127.0.0.1:8265 \u001B[39m\u001B[22m\n", + "[2023-05-19 18:10:35] [Ray Data] WARNING ray.data.dataset::\u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/data/examples/batch_training.ipynb b/doc/source/data/examples/batch_training.ipynb index a95542636b907..b751575e7d155 100644 --- a/doc/source/data/examples/batch_training.ipynb +++ b/doc/source/data/examples/batch_training.ipynb @@ -94,7 +94,7 @@ "text": [ "2022-12-08 17:04:06,689\tINFO worker.py:1223 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", "2022-12-08 17:04:06,691\tINFO worker.py:1333 -- Connecting to existing Ray cluster at address: 172.31.174.62:9031...\n", - "2022-12-08 17:04:06,700\tINFO worker.py:1509 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_gyl6mbksa8xt7b149ib6abld/services?redirect_to=dashboard \u001b[39m\u001b[22m\n" + "2022-12-08 17:04:06,700\tINFO worker.py:1509 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_gyl6mbksa8xt7b149ib6abld/services?redirect_to=dashboard \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb b/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb index 34d30c1801c03..fb58a027084dc 100644 --- a/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb +++ b/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb @@ -72,12 +72,12 @@ "output_type": "stream", "text": [ "[2023-05-24 11:25:47] INFO ray._private.worker::Connecting to existing Ray cluster at address: 10.0.33.149:6379...\n", - "[2023-05-24 11:25:47] INFO ray._private.worker::Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_6h5a4kl2xhfgtdy4w41he6iwyw/services?redirect_to=dashboard \u001b[39m\u001b[22m\n", + "[2023-05-24 11:25:47] INFO ray._private.worker::Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_6h5a4kl2xhfgtdy4w41he6iwyw/services?redirect_to=dashboard \u001B[39m\u001B[22m\n", "[2023-05-24 11:25:47] INFO ray._private.runtime_env.packaging::Pushing file package 'gcs://_ray_pkg_2429254893b10da6df2b65ceaf858894.zip' (8.71MiB) to Ray cluster...\n", "[2023-05-24 11:25:47] INFO ray._private.runtime_env.packaging::Successfully pushed file package 'gcs://_ray_pkg_2429254893b10da6df2b65ceaf858894.zip'.\n", - "[2023-05-24 11:25:50] [Ray Data] WARNING ray.data.dataset::\u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "[2023-05-24 11:25:50] [Ray Data] WARNING ray.data.dataset::\u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb b/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb index 1497fb2120612..57e6dae4c5568 100644 --- a/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb +++ b/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb @@ -78,12 +78,12 @@ "output_type": "stream", "text": [ "2023-06-27 23:23:57,184\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.5.141:6379...\n", - "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001b[39m\u001b[22m\n", + "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001B[39m\u001B[22m\n", "2023-06-27 23:23:57,243\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip' (4.49MiB) to Ray cluster...\n", "2023-06-27 23:23:57,257\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip'.\n", - "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/ray-air/examples/feast_example.ipynb b/doc/source/ray-air/examples/feast_example.ipynb index ca735ec4f9603..16d5d0bbf3ec5 100644 --- a/doc/source/ray-air/examples/feast_example.ipynb +++ b/doc/source/ray-air/examples/feast_example.ipynb @@ -150,61 +150,61 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mdatetime\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m timedelta\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mdatetime\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m timedelta\n", "\n", - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mfeast\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m (Entity, Field, FeatureView, FileSource, ValueType)\n", - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mfeast\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mtypes\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m Float32, Int64, String\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mfeast\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m (Entity, Field, FeatureView, FileSource, ValueType)\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mfeast\u001B[39;49;00m\u001B[04m\u001B[36m.\u001B[39;49;00m\u001B[04m\u001B[36mtypes\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m Float32, Int64, String\n", "\n", "\n", - "zipcode = Entity(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, value_type=Int64)\n", + "zipcode = Entity(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, value_type=Int64)\n", "\n", 
"zipcode_source = FileSource(\n", - " path=\u001b[33m\"\u001b[39;49;00m\u001b[33mfeature_repo/data/zipcode_table.parquet\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " timestamp_field=\u001b[33m\"\u001b[39;49;00m\u001b[33mevent_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " created_timestamp_column=\u001b[33m\"\u001b[39;49;00m\u001b[33mcreated_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " path=\u001B[33m\"\u001B[39;49;00m\u001B[33mfeature_repo/data/zipcode_table.parquet\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " timestamp_field=\u001B[33m\"\u001B[39;49;00m\u001B[33mevent_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " created_timestamp_column=\u001B[33m\"\u001B[39;49;00m\u001B[33mcreated_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "zipcode_features = FeatureView(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode_features\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " entities=[\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m],\n", - " ttl=timedelta(days=\u001b[34m3650\u001b[39;49;00m),\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode_features\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " entities=[\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m],\n", + " ttl=timedelta(days=\u001B[34m3650\u001B[39;49;00m),\n", " schema=[\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcity\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mstate\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mlocation_type\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mtax_returns_filed\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " 
Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mpopulation\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mtotal_wages\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcity\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mstate\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mlocation_type\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mtax_returns_filed\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mpopulation\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mtotal_wages\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", " ],\n", " source=zipcode_source,\n", ")\n", "\n", "dob_ssn = Entity(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mdob_ssn\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mdob_ssn\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", " value_type=ValueType.STRING,\n", - " description=\u001b[33m\"\u001b[39;49;00m\u001b[33mDate of birth and last four digits of social security number\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " description=\u001B[33m\"\u001B[39;49;00m\u001B[33mDate of birth and last four digits of social security number\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "credit_history_source = FileSource(\n", - " path=\u001b[33m\"\u001b[39;49;00m\u001b[33mfeature_repo/data/credit_history.parquet\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " timestamp_field=\u001b[33m\"\u001b[39;49;00m\u001b[33mevent_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " 
created_timestamp_column=\u001b[33m\"\u001b[39;49;00m\u001b[33mcreated_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " path=\u001B[33m\"\u001B[39;49;00m\u001B[33mfeature_repo/data/credit_history.parquet\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " timestamp_field=\u001B[33m\"\u001B[39;49;00m\u001B[33mevent_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " created_timestamp_column=\u001B[33m\"\u001B[39;49;00m\u001B[33mcreated_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "credit_history = FeatureView(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcredit_history\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " entities=[\u001b[33m\"\u001b[39;49;00m\u001b[33mdob_ssn\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m],\n", - " ttl=timedelta(days=\u001b[34m90\u001b[39;49;00m),\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcredit_history\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " entities=[\u001B[33m\"\u001B[39;49;00m\u001B[33mdob_ssn\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m],\n", + " ttl=timedelta(days=\u001B[34m90\u001B[39;49;00m),\n", " schema=[\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcredit_card_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmortgage_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mstudent_loan_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mvehicle_loan_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mhard_pulls\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_2y\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " 
Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_1y\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_6m\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mbankruptcies\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcredit_card_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmortgage_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mstudent_loan_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mvehicle_loan_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mhard_pulls\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_2y\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_1y\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_6m\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mbankruptcies\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", " ],\n", " source=credit_history_source,\n", ")\n" @@ -240,13 +240,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created entity \u001b[1m\u001b[32mzipcode\u001b[0m\n", - "Created entity \u001b[1m\u001b[32mdob_ssn\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mcredit_history\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mzipcode_features\u001b[0m\n", + "Created entity 
\u001B[1m\u001B[32mzipcode\u001B[0m\n", + "Created entity \u001B[1m\u001B[32mdob_ssn\u001B[0m\n", + "Created feature view \u001B[1m\u001B[32mcredit_history\u001B[0m\n", + "Created feature view \u001B[1m\u001B[32mzipcode_features\u001B[0m\n", "\n", - "Created sqlite table \u001b[1m\u001b[32mfeature_repo_credit_history\u001b[0m\n", - "Created sqlite table \u001b[1m\u001b[32mfeature_repo_zipcode_features\u001b[0m\n", + "Created sqlite table \u001B[1m\u001B[32mfeature_repo_credit_history\u001B[0m\n", + "Created sqlite table \u001B[1m\u001B[32mfeature_repo_zipcode_features\u001B[0m\n", "\n" ] } @@ -1049,7 +1049,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-09-12 19:25:14,018\tINFO worker.py:1508 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-09-12 19:25:14,018\tINFO worker.py:1508 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32m127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] } ], @@ -1196,9 +1196,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(XGBoostTrainer pid=348845)\u001b[0m /home/ray/.pyenv/versions/mambaforge/envs/ray/lib/python3.9/site-packages/xgboost_ray/main.py:431: UserWarning: `num_actors` in `ray_params` is smaller than 2 (1). XGBoost will NOT be distributed!\n", - "\u001b[2m\u001b[36m(XGBoostTrainer pid=348845)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=348922)\u001b[0m [19:25:23] task [xgboost.ray]:140319682474864 got new rank 0\n" + "\u001B[2m\u001B[36m(XGBoostTrainer pid=348845)\u001B[0m /home/ray/.pyenv/versions/mambaforge/envs/ray/lib/python3.9/site-packages/xgboost_ray/main.py:431: UserWarning: `num_actors` in `ray_params` is smaller than 2 (1). 
XGBoost will NOT be distributed!\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=348845)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=348922)\u001B[0m [19:25:23] task [xgboost.ray]:140319682474864 got new rank 0\n" ] }, { diff --git a/doc/source/ray-air/examples/huggingface_text_classification.ipynb b/doc/source/ray-air/examples/huggingface_text_classification.ipynb index 636f1e8429409..7269b76aa8004 100644 --- a/doc/source/ray-air/examples/huggingface_text_classification.ipynb +++ b/doc/source/ray-air/examples/huggingface_text_classification.ipynb @@ -83,7 +83,7 @@ "text": [ "2022-08-25 10:09:51,282\tINFO worker.py:1223 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", "2022-08-25 10:09:51,697\tINFO worker.py:1333 -- Connecting to existing Ray cluster at address: 172.31.80.117:9031...\n", - "2022-08-25 10:09:51,706\tINFO worker.py:1509 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2022-08-25 10:09:51,706\tINFO worker.py:1509 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard \u001B[39m\u001B[22m\n", "2022-08-25 10:09:51,709\tINFO packaging.py:342 -- Pushing file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip' (0.34MiB) to Ray cluster...\n", "2022-08-25 10:09:51,714\tINFO packaging.py:351 -- Successfully pushed file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip'.\n" ] diff --git a/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb new file mode 100644 index 0000000000000..06f3d5fc35fae --- /dev/null +++ b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/doc/source/ray-air/examples/sklearn_example.ipynb b/doc/source/ray-air/examples/sklearn_example.ipynb index a75ce1ac07f2a..29603d25a249e 100644 --- a/doc/source/ray-air/examples/sklearn_example.ipynb +++ b/doc/source/ray-air/examples/sklearn_example.ipynb @@ -158,7 +158,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray 
dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8269\u001b[39m\u001b[22m\n", + "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8269\u001B[39m\u001B[22m\n", "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n" ] @@ -186,7 +186,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(SklearnTrainer pid=1492629)\u001b[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" + "\u001B[2m\u001B[36m(SklearnTrainer pid=1492629)\u001B[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" ] }, { diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index f2c265ef0c91d..e17f78e214776 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -81,7 +81,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-11-08 22:33:29,918\tINFO worker.py:1528 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-11-08 22:33:29,918\tINFO worker.py:1528 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/templates/01_batch_inference/start.ipynb b/doc/source/templates/01_batch_inference/start.ipynb index 2929107794a1e..a53232d0ac12e 100644 --- a/doc/source/templates/01_batch_inference/start.ipynb +++ b/doc/source/templates/01_batch_inference/start.ipynb @@ -78,12 +78,12 @@ "output_type": "stream", "text": [ "2023-06-27 23:23:57,184\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.5.141:6379...\n", - "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001b[39m\u001b[22m\n", + "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001B[39m\u001B[22m\n", "2023-06-27 23:23:57,243\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip' (4.49MiB) to Ray cluster...\n", "2023-06-27 23:23:57,257\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip'.\n", - "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb index 34d724cd08fd9..2b92e9e2224b5 100644 --- a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb +++ b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb @@ -314,7 +314,7 @@ "output_type": "stream", "text": [ "2023-06-13 16:05:12,869\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.28.253:6379...\n", - "2023-06-13 16:05:12,877\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_15dlj65vax84ljl7ayeplubryd/services?redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2023-06-13 16:05:12,877\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_15dlj65vax84ljl7ayeplubryd/services?redirect_to=dashboard \u001B[39m\u001B[22m\n", "2023-06-13 16:05:13,036\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_488e346d50f332edaa288fdaa22b2bdc.zip' (52.65MiB) to Ray cluster...\n", "2023-06-13 16:05:13,221\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_488e346d50f332edaa288fdaa22b2bdc.zip'.\n", "2023-06-13 16:05:13,314\tINFO tune.py:226 -- Initializing Ray automatically. 
For cluster usage or custom Ray initialization, call `ray.init(...)` before `Trainer(...)`.\n" @@ -397,38 +397,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(pid=16995)\u001b[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "\u001b[2m\u001b[36m(pid=16995)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=16995)\u001b[0m 2023-06-13 16:05:24,007\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['17232 (10.0.28.253)', '6371 (10.0.1.80)', '7319 (10.0.58.90)', '6493 (10.0.26.229)']\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 2023-06-13 16:05:24,966\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001b[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m Global seed set to 888\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m GPU available: True, used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001b[0m Missing logger folder: logs/lightning_logs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001b[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m | Name | Type | Params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m -------------------------------------------------\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0 | linear_relu_stack | Sequential | 101 K \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 1 | accuracy | Accuracy | 0 \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m -------------------------------------------------\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 101 K Trainable params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0 Non-trainable params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 101 K Total params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0.407 Total estimated model params size (MB)\n" + "\u001B[2m\u001B[36m(pid=16995)\u001B[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from 
pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001B[2m\u001B[36m(pid=16995)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=16995)\u001B[0m 2023-06-13 16:05:24,007\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['17232 (10.0.28.253)', '6371 (10.0.1.80)', '7319 (10.0.58.90)', '6493 (10.0.26.229)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 2023-06-13 16:05:24,966\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001B[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m Global seed set to 888\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m GPU available: True, used: True\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m TPU available: False, using: 0 TPU cores\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001B[0m Missing logger folder: logs/lightning_logs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001B[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m | Name | Type | Params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m -------------------------------------------------\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0 | linear_relu_stack | Sequential | 101 K \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 1 | accuracy | Accuracy | 0 \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m -------------------------------------------------\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 101 K Trainable params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0 Non-trainable params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 101 K Total params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0.407 Total estimated model params size (MB)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Sanity Checking: 0it [00:00, ?it/s]\u001b[0m \n", + "Sanity Checking: 0it [00:00, ?it/s]\u001B[0m \n", "Sanity Checking 
DataLoader 0: 0%| | 0/2 [00:00 TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Starting distributed worker processes: ['134267 (10.0.55.20)', '74152 (10.0.63.141)', '75476 (10.0.51.205)', '75547 (10.0.42.158)', '74711 (10.0.45.211)', '75132 (10.0.20.140)', '74502 (10.0.60.86)', '75695 (10.0.53.69)', '74457 (10.0.47.2)', '74569 (10.0.33.23)', '74341 (10.0.29.61)', '74274 (10.0.36.152)', '74561 (10.0.35.16)', '74427 (10.0.16.236)', '74273 (10.0.54.55)', '74996 (10.0.9.249)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m Setting up process group for: env:// [rank=0, world_size=16]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -578,16 +578,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, 
ip=10.0.54.55)\u001b[0m \n", - "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n" + "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n", + "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n", + "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001b[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + "\u001B[2m\u001B[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001B[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" ] }, { @@ -596,22 +596,22 @@ "text": [ "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.86MB/s]\n", "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.57MB/s]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m GPU available: True (cuda), used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m GPU available: True (cuda), used: True\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m TPU available: False, using: 0 TPU cores\n", + 
"\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", "Downloading tokenizer.model: 0%| | 0.00/500k [00:00=0.11 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn->ax-platform==0.2.4) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn->ax-platform==0.2.4) (3.0.0)\n", "Requirement already satisfied: typing-extensions in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from torch>=1.9->botorch==0.6.2->ax-platform==0.2.4) (4.1.1)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], diff --git a/doc/source/tune/examples/bayesopt_example.ipynb b/doc/source/tune/examples/bayesopt_example.ipynb index a46ace8d22a38..88bc85a33341a 100644 --- a/doc/source/tune/examples/bayesopt_example.ipynb +++ b/doc/source/tune/examples/bayesopt_example.ipynb @@ -33,8 +33,8 @@ "Requirement already satisfied: scipy>=0.14.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from bayesian-optimization==1.2.0) (1.4.1)\n", "Requirement already satisfied: joblib>=0.11 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn>=0.18.0->bayesian-optimization==1.2.0) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn>=0.18.0->bayesian-optimization==1.2.0) (3.0.0)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - 
"\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], diff --git a/doc/source/tune/examples/optuna_example.ipynb b/doc/source/tune/examples/optuna_example.ipynb index 2b2d7f36fd535..b82ce5485c74e 100644 --- a/doc/source/tune/examples/optuna_example.ipynb +++ b/doc/source/tune/examples/optuna_example.ipynb @@ -57,8 +57,8 @@ "Requirement already satisfied: wcwidth>=0.1.7 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from cmd2>=1.0.0->cliff->optuna==2.9.1) (0.2.5)\n", "Requirement already satisfied: zipp>=0.5 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from importlib-metadata->sqlalchemy>=1.1.0->optuna==2.9.1) (3.7.0)\n", "Requirement already satisfied: MarkupSafe>=0.9.2 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from Mako->alembic->optuna==2.9.1) (2.0.1)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], @@ -330,7 +330,7 @@ "output_type": "stream", "text": [ "Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. 
To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n", - "\u001b[32m[I 2022-07-22 15:21:47,769]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:21:47,769]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] }, { @@ -1293,7 +1293,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:22:32,644]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001B[32m[I 2022-07-22 15:22:32,644]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n", "/Users/kai/coding/ray/python/ray/tune/search/optuna/optuna_search.py:389: ExperimentalWarning: enqueue_trial is experimental (supported from v1.2.0). The interface can change in the future.\n", " self._ot_study.enqueue_trial(point)\n", "/Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/optuna/study/study.py:857: ExperimentalWarning: create_trial is experimental (supported from v2.0.0). 
The interface can change in the future.\n", @@ -2263,7 +2263,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:23:15,784]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:23:15,784]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] } ], @@ -3230,7 +3230,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:26:50,680]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:26:50,680]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] }, { diff --git a/doc/source/tune/examples/tune-aim.ipynb b/doc/source/tune/examples/tune-aim.ipynb index ad180658faa83..5a648e9777069 100644 --- a/doc/source/tune/examples/tune-aim.ipynb +++ b/doc/source/tune/examples/tune-aim.ipynb @@ -1,407 +1,407 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "ecad719c", - "metadata": {}, - "source": [ - "(tune-aim-ref)=\n", - "\n", - "# Using Aim with Tune\n", - "\n", - "[Aim](https://aimstack.io) is an easy-to-use and supercharged open-source experiment tracker.\n", - "Aim logs your training runs, enables a well-designed UI to compare them, and provides an API to query them programmatically.\n", - "\n", - "```{image} /images/aim_logo_full.png\n", - ":align: center\n", - ":alt: Aim\n", - ":width: 100%\n", - ":target: https://aimstack.io\n", - "```\n", - "\n", - "Ray Tune currently offers built-in integration with Aim.\n", - "The {ref}`AimLoggerCallback ` automatically logs metrics that are reported to Tune by using the Aim API.\n", - "\n", - "\n", - "```{contents}\n", - ":backlinks: none\n", - ":local: true\n", - "```\n", - "\n", - "## Logging Tune Hyperparameter Configurations and Results to Aim\n", - "\n", - "The following example demonstrates how the `AimLoggerCallback` can be used in a Tune experiment.\n", - "Begin by installing and importing the necessary 
modules:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1290b5b5", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install aim\n", - "%pip install ray[tune]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "100bcf8a", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "import ray\n", - "from ray import air, tune\n", - "from ray.air import session\n", - "from ray.tune.logger.aim import AimLoggerCallback" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9346c0f6", - "metadata": {}, - "source": [ - "Next, define a simple `train_function`, which is a [`Trainable`](trainable-docs) that reports a loss to Tune.\n", - "The objective function itself is not important for this example, as our main focus is on the integration with Aim." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "e8b4fc4d", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "def train_function(config):\n", - " for _ in range(50):\n", - " loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n", - " session.report({\"loss\": loss})" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "831eed42", - "metadata": {}, - "source": [ - "Here is an example of how you can use the `AimLoggerCallback` with simple grid-search Tune experiment.\n", - "The logger will log each of the 9 grid-search trials as separate Aim runs." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "52988599", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-02-07 00:04:11,228\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "

Tune Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Current time:2023-02-07 00:04:19
Running for: 00:00:06.86
Memory: 32.8/64.0 GiB
\n", - "
\n", - "
\n", - "
\n", - "

System Info

\n", - " Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/26.93 GiB heap, 0.0/2.0 GiB objects\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "

Trial Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name status loc mean sd iter total time (s) loss
train_function_01a3b_00000TERMINATED127.0.0.1:10277 10.385428 50 4.480311.01928
train_function_01a3b_00001TERMINATED127.0.0.1:10296 20.819716 50 2.972723.01491
train_function_01a3b_00002TERMINATED127.0.0.1:10301 30.769197 50 2.395723.87155
train_function_01a3b_00003TERMINATED127.0.0.1:10307 40.29466 50 2.415684.1507
train_function_01a3b_00004TERMINATED127.0.0.1:10313 50.152208 50 1.683835.10225
train_function_01a3b_00005TERMINATED127.0.0.1:10321 60.879814 50 1.540156.20238
train_function_01a3b_00006TERMINATED127.0.0.1:10329 70.487499 50 1.447067.79551
train_function_01a3b_00007TERMINATED127.0.0.1:10333 80.639783 50 1.4261 7.94189
train_function_01a3b_00008TERMINATED127.0.0.1:10341 90.12285 50 1.077018.82304
\n", - "
\n", - "
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "

Trial Progress

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name date done episodes_total experiment_id experiment_tag hostname iterations_since_restore lossnode_ip pid time_since_restore time_this_iter_s time_total_s timestamp timesteps_since_restoretimesteps_total training_iterationtrial_id warmup_time
train_function_01a3b_000002023-02-07_00-04-18True c8447fdceea6436c9edd6f030a5b1d820_mean=1,sd=0.3854Justins-MacBook-Pro-16 501.01928127.0.0.110277 4.48031 0.013865 4.48031 1675757058 0 5001a3b_00000 0.00264072
train_function_01a3b_000012023-02-07_00-04-18True 7dd6d3ee24244a0885b354c2850647281_mean=2,sd=0.8197Justins-MacBook-Pro-16 503.01491127.0.0.110296 2.97272 0.0584073 2.97272 1675757058 0 5001a3b_00001 0.0316792
train_function_01a3b_000022023-02-07_00-04-18True e3da49ebad034c4b8fdaf0aa87927b1a2_mean=3,sd=0.7692Justins-MacBook-Pro-16 503.87155127.0.0.110301 2.39572 0.0695491 2.39572 1675757058 0 5001a3b_00002 0.0315411
train_function_01a3b_000032023-02-07_00-04-18True 95c60c4f67c4481ebccff25b0a49e75d3_mean=4,sd=0.2947Justins-MacBook-Pro-16 504.1507 127.0.0.110307 2.41568 0.0175381 2.41568 1675757058 0 5001a3b_00003 0.0310779
train_function_01a3b_000042023-02-07_00-04-18True a216253cb41e47caa229e65488deb0194_mean=5,sd=0.1522Justins-MacBook-Pro-16 505.10225127.0.0.110313 1.68383 0.064441 1.68383 1675757058 0 5001a3b_00004 0.00450182
train_function_01a3b_000052023-02-07_00-04-18True 23834104277f476cb99d9c696281fceb5_mean=6,sd=0.8798Justins-MacBook-Pro-16 506.20238127.0.0.110321 1.54015 0.00910306 1.54015 1675757058 0 5001a3b_00005 0.0480251
train_function_01a3b_000062023-02-07_00-04-18True 15f650121df747c3bd2720481d47b2656_mean=7,sd=0.4875Justins-MacBook-Pro-16 507.79551127.0.0.110329 1.44706 0.00600386 1.44706 1675757058 0 5001a3b_00006 0.00202489
train_function_01a3b_000072023-02-07_00-04-19True 78b1673cf2034ed99135b80a0cb31e0e7_mean=8,sd=0.6398Justins-MacBook-Pro-16 507.94189127.0.0.110333 1.4261 0.00225306 1.4261 1675757059 0 5001a3b_00007 0.00209713
train_function_01a3b_000082023-02-07_00-04-19True c7f5d86154cb46b6aa27bef523edcd6f8_mean=9,sd=0.1228Justins-MacBook-Pro-16 508.82304127.0.0.110341 1.07701 0.00291467 1.07701 1675757059 0 5001a3b_00008 0.00240111
\n", - "
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-02-07 00:04:19,366\tINFO tune.py:798 -- Total run time: 7.38 seconds (6.85 seconds for the tuning loop).\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tuner = tune.Tuner(\n", - " train_function,\n", - " run_config=air.RunConfig(\n", - " callbacks=[AimLoggerCallback()],\n", - " storage_path=\"/tmp/ray_results\",\n", - " name=\"aim_example\",\n", - " ),\n", - " param_space={\n", - " \"mean\": tune.grid_search([1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", - " \"sd\": tune.uniform(0.1, 0.9),\n", - " },\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"loss\",\n", - " mode=\"min\",\n", - " ),\n", - ")\n", - "tuner.fit()\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "941f25f2", - "metadata": {}, - "source": [ - "When the script executes, a grid-search is carried out and the results are saved to the Aim repo,\n", - "stored at the default location -- the experiment log directory (in this case, it's at `/tmp/ray_results/aim_example`).\n", - "\n", - "### More Configuration Options for Aim\n", - "\n", - "In the example above, we used the default configuration for the `AimLoggerCallback`.\n", - "There are a few options that can be configured as arguments to the callback. For example,\n", - "setting `AimLoggerCallback(repo=\"/path/to/repo\")` will log results to the Aim repo at that\n", - "filepath, which could be useful if you have a central location where the results of multiple\n", - "Tune experiments are stored. Relative paths to the working directory where Tune script is\n", - "launched can be used as well. By default, the repo will be set to the experiment log\n", - "directory. 
See [the API reference](tune-aim-logger) for more configurations.\n", - "\n", - "## Launching the Aim UI\n", - "\n", - "Now that we have logged our results to the Aim repository, we can view it in Aim's web UI.\n", - "To do this, we first find the directory where the Aim repository lives, then we use\n", - "the Aim CLI to launch the web interface." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "880f55aa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------------------------------------\n", - " Aim UI collects anonymous usage analytics. \n", - " Read how to opt-out here: \n", - " https://aimstack.readthedocs.io/en/latest/community/telemetry.html \n", - "--------------------------------------------------------------------------\n", - "\u001b[33mRunning Aim UI on repo ``\u001b[0m\n", - "Open http://127.0.0.1:43800\n", - "Press Ctrl+C to exit\n", - "^C\n" - ] - } - ], - "source": [ - "# Uncomment the following line to launch the Aim UI!\n", - "#!aim up --repo=/tmp/ray_results/aim_example" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "adbe661a", - "metadata": {}, - "source": [ - "After launching the Aim UI, we can open the web interface at `localhost:43800`." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7bb97157", - "metadata": {}, - "source": [ - "```{image} /images/aim_example_metrics_page.png\n", - ":align: center\n", - ":alt: Aim Metrics Explorer\n", - ":target: https://aimstack.readthedocs.io/en/latest/ui/pages/explorers.html#metrics-explorer\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2f6e9138", - "metadata": {}, - "source": [ - "The next sections contain more in-depth information on the API of the Tune-Aim integration.\n", - "\n", - "## Tune Aim Logger API\n", - "\n", - "(tune-aim-logger)=\n", - "\n", - "```{eval-rst}\n", - ".. 
autoclass:: ray.tune.logger.aim.AimLoggerCallback\n", - " :noindex:\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "0ebd1904", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ray_dev_py38", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "orphan": true, - "vscode": { - "interpreter": { - "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" - } - } + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "ecad719c", + "metadata": {}, + "source": [ + "(tune-aim-ref)=\n", + "\n", + "# Using Aim with Tune\n", + "\n", + "[Aim](https://aimstack.io) is an easy-to-use and supercharged open-source experiment tracker.\n", + "Aim logs your training runs, enables a well-designed UI to compare them, and provides an API to query them programmatically.\n", + "\n", + "```{image} /images/aim_logo_full.png\n", + ":align: center\n", + ":alt: Aim\n", + ":width: 100%\n", + ":target: https://aimstack.io\n", + "```\n", + "\n", + "Ray Tune currently offers built-in integration with Aim.\n", + "The {ref}`AimLoggerCallback ` automatically logs metrics that are reported to Tune by using the Aim API.\n", + "\n", + "\n", + "```{contents}\n", + ":backlinks: none\n", + ":local: true\n", + "```\n", + "\n", + "## Logging Tune Hyperparameter Configurations and Results to Aim\n", + "\n", + "The following example demonstrates how the `AimLoggerCallback` can be used in a Tune experiment.\n", + "Begin by installing and importing the necessary modules:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1290b5b5", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install aim\n", + "%pip install ray[tune]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "id": "100bcf8a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "import ray\n", + "from ray import air, tune\n", + "from ray.air import session\n", + "from ray.tune.logger.aim import AimLoggerCallback" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9346c0f6", + "metadata": {}, + "source": [ + "Next, define a simple `train_function`, which is a [`Trainable`](trainable-docs) that reports a loss to Tune.\n", + "The objective function itself is not important for this example, as our main focus is on the integration with Aim." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e8b4fc4d", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def train_function(config):\n", + " for _ in range(50):\n", + " loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n", + " session.report({\"loss\": loss})" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "831eed42", + "metadata": {}, + "source": [ + "Here is an example of how you can use the `AimLoggerCallback` with simple grid-search Tune experiment.\n", + "The logger will log each of the 9 grid-search trials as separate Aim runs." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "52988599", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-02-07 00:04:11,228\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-02-07 00:04:19
Running for: 00:00:06.86
Memory: 32.8/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/26.93 GiB heap, 0.0/2.0 GiB objects\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc mean sd iter total time (s) loss
train_function_01a3b_00000TERMINATED127.0.0.1:10277 10.385428 50 4.480311.01928
train_function_01a3b_00001TERMINATED127.0.0.1:10296 20.819716 50 2.972723.01491
train_function_01a3b_00002TERMINATED127.0.0.1:10301 30.769197 50 2.395723.87155
train_function_01a3b_00003TERMINATED127.0.0.1:10307 40.29466 50 2.415684.1507
train_function_01a3b_00004TERMINATED127.0.0.1:10313 50.152208 50 1.683835.10225
train_function_01a3b_00005TERMINATED127.0.0.1:10321 60.879814 50 1.540156.20238
train_function_01a3b_00006TERMINATED127.0.0.1:10329 70.487499 50 1.447067.79551
train_function_01a3b_00007TERMINATED127.0.0.1:10333 80.639783 50 1.4261 7.94189
train_function_01a3b_00008TERMINATED127.0.0.1:10341 90.12285 50 1.077018.82304
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "

Trial Progress

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name date done episodes_total experiment_id experiment_tag hostname iterations_since_restore lossnode_ip pid time_since_restore time_this_iter_s time_total_s timestamp timesteps_since_restoretimesteps_total training_iterationtrial_id warmup_time
train_function_01a3b_000002023-02-07_00-04-18True c8447fdceea6436c9edd6f030a5b1d820_mean=1,sd=0.3854Justins-MacBook-Pro-16 501.01928127.0.0.110277 4.48031 0.013865 4.48031 1675757058 0 5001a3b_00000 0.00264072
train_function_01a3b_000012023-02-07_00-04-18True 7dd6d3ee24244a0885b354c2850647281_mean=2,sd=0.8197Justins-MacBook-Pro-16 503.01491127.0.0.110296 2.97272 0.0584073 2.97272 1675757058 0 5001a3b_00001 0.0316792
train_function_01a3b_000022023-02-07_00-04-18True e3da49ebad034c4b8fdaf0aa87927b1a2_mean=3,sd=0.7692Justins-MacBook-Pro-16 503.87155127.0.0.110301 2.39572 0.0695491 2.39572 1675757058 0 5001a3b_00002 0.0315411
train_function_01a3b_000032023-02-07_00-04-18True 95c60c4f67c4481ebccff25b0a49e75d3_mean=4,sd=0.2947Justins-MacBook-Pro-16 504.1507 127.0.0.110307 2.41568 0.0175381 2.41568 1675757058 0 5001a3b_00003 0.0310779
train_function_01a3b_000042023-02-07_00-04-18True a216253cb41e47caa229e65488deb0194_mean=5,sd=0.1522Justins-MacBook-Pro-16 505.10225127.0.0.110313 1.68383 0.064441 1.68383 1675757058 0 5001a3b_00004 0.00450182
train_function_01a3b_000052023-02-07_00-04-18True 23834104277f476cb99d9c696281fceb5_mean=6,sd=0.8798Justins-MacBook-Pro-16 506.20238127.0.0.110321 1.54015 0.00910306 1.54015 1675757058 0 5001a3b_00005 0.0480251
train_function_01a3b_000062023-02-07_00-04-18True 15f650121df747c3bd2720481d47b2656_mean=7,sd=0.4875Justins-MacBook-Pro-16 507.79551127.0.0.110329 1.44706 0.00600386 1.44706 1675757058 0 5001a3b_00006 0.00202489
train_function_01a3b_000072023-02-07_00-04-19True 78b1673cf2034ed99135b80a0cb31e0e7_mean=8,sd=0.6398Justins-MacBook-Pro-16 507.94189127.0.0.110333 1.4261 0.00225306 1.4261 1675757059 0 5001a3b_00007 0.00209713
train_function_01a3b_000082023-02-07_00-04-19True c7f5d86154cb46b6aa27bef523edcd6f8_mean=9,sd=0.1228Justins-MacBook-Pro-16 508.82304127.0.0.110341 1.07701 0.00291467 1.07701 1675757059 0 5001a3b_00008 0.00240111
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-02-07 00:04:19,366\tINFO tune.py:798 -- Total run time: 7.38 seconds (6.85 seconds for the tuning loop).\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuner = tune.Tuner(\n", + " train_function,\n", + " run_config=air.RunConfig(\n", + " callbacks=[AimLoggerCallback()],\n", + " storage_path=\"/tmp/ray_results\",\n", + " name=\"aim_example\",\n", + " ),\n", + " param_space={\n", + " \"mean\": tune.grid_search([1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", + " \"sd\": tune.uniform(0.1, 0.9),\n", + " },\n", + " tune_config=tune.TuneConfig(\n", + " metric=\"loss\",\n", + " mode=\"min\",\n", + " ),\n", + ")\n", + "tuner.fit()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "941f25f2", + "metadata": {}, + "source": [ + "When the script executes, a grid-search is carried out and the results are saved to the Aim repo,\n", + "stored at the default location -- the experiment log directory (in this case, it's at `/tmp/ray_results/aim_example`).\n", + "\n", + "### More Configuration Options for Aim\n", + "\n", + "In the example above, we used the default configuration for the `AimLoggerCallback`.\n", + "There are a few options that can be configured as arguments to the callback. For example,\n", + "setting `AimLoggerCallback(repo=\"/path/to/repo\")` will log results to the Aim repo at that\n", + "filepath, which could be useful if you have a central location where the results of multiple\n", + "Tune experiments are stored. Relative paths to the working directory where Tune script is\n", + "launched can be used as well. By default, the repo will be set to the experiment log\n", + "directory. 
See [the API reference](tune-aim-logger) for more configurations.\n", + "\n", + "## Launching the Aim UI\n", + "\n", + "Now that we have logged our results to the Aim repository, we can view it in Aim's web UI.\n", + "To do this, we first find the directory where the Aim repository lives, then we use\n", + "the Aim CLI to launch the web interface." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "880f55aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------\n", + " Aim UI collects anonymous usage analytics. \n", + " Read how to opt-out here: \n", + " https://aimstack.readthedocs.io/en/latest/community/telemetry.html \n", + "--------------------------------------------------------------------------\n", + "\u001B[33mRunning Aim UI on repo ``\u001B[0m\n", + "Open http://127.0.0.1:43800\n", + "Press Ctrl+C to exit\n", + "^C\n" + ] + } + ], + "source": [ + "# Uncomment the following line to launch the Aim UI!\n", + "#!aim up --repo=/tmp/ray_results/aim_example" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "adbe661a", + "metadata": {}, + "source": [ + "After launching the Aim UI, we can open the web interface at `localhost:43800`." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7bb97157", + "metadata": {}, + "source": [ + "```{image} /images/aim_example_metrics_page.png\n", + ":align: center\n", + ":alt: Aim Metrics Explorer\n", + ":target: https://aimstack.readthedocs.io/en/latest/ui/pages/explorers.html#metrics-explorer\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f6e9138", + "metadata": {}, + "source": [ + "The next sections contain more in-depth information on the API of the Tune-Aim integration.\n", + "\n", + "## Tune Aim Logger API\n", + "\n", + "(tune-aim-logger)=\n", + "\n", + "```{eval-rst}\n", + ".. 
autoclass:: ray.tune.logger.aim.AimLoggerCallback\n", + " :noindex:\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "0ebd1904", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ray_dev_py38", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/doc/source/tune/examples/tune-mlflow.ipynb b/doc/source/tune/examples/tune-mlflow.ipynb index 2e32a3f2e8491..5e8524e6c523c 100644 --- a/doc/source/tune/examples/tune-mlflow.ipynb +++ b/doc/source/tune/examples/tune-mlflow.ipynb @@ -253,7 +253,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-12-22 10:37:53,580\tINFO worker.py:1542 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-12-22 10:37:53,580\tINFO worker.py:1542 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb b/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb index d34e2860ab649..b078ca2f975fb 100644 --- a/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb +++ b/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb @@ -798,39 +798,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m GPU available: False, used: False\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:336: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. 
Please use the `on_exception` callback hook instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \"The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7.\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:348: LightningDeprecationWarning: The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \"The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:351: LightningDeprecationWarning: The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m rank_zero_deprecation(\"The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\")\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on_train_batch_start` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_end` hook was deprecated in v1.6 and will be removed in v1.8. 
Please use `Callback.on_train_batch_end` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_start` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_end` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_end` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m | Name | Type | Params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m -----------------------------------\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0 | layer_1 | Linear | 100 K \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 1 | layer_2 | Linear | 16.5 K\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 2 | layer_3 | Linear | 1.3 K \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m -----------------------------------\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 118 K Trainable params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0 Non-trainable params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 118 K Total 
params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0.473 Total estimated model params size (MB)\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m category=PossibleUserWarning,\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m category=PossibleUserWarning,\n" + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m GPU available: False, used: False\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m TPU available: False, using: 0 TPU cores\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:336: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. 
Please use the `on_exception` callback hook instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \"The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7.\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:348: LightningDeprecationWarning: The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \"The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:351: LightningDeprecationWarning: The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m rank_zero_deprecation(\"The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\")\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on_train_batch_start` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_end` hook was deprecated in v1.6 and will be removed in v1.8. 
Please use `Callback.on_train_batch_end` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_start` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_end` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_end` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m | Name | Type | Params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m -----------------------------------\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0 | layer_1 | Linear | 100 K \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 1 | layer_2 | Linear | 16.5 K\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 2 | layer_3 | Linear | 1.3 K \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m -----------------------------------\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 118 K Trainable params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0 Non-trainable params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 118 K Total 
params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0.473 Total estimated model params size (MB)\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m category=PossibleUserWarning,\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m category=PossibleUserWarning,\n" ] }, { @@ -1178,11 +1178,11 @@ "evalue": "__init__() got an unexpected keyword argument 'tune_mnist_pbt'", "output_type": "error", "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/1146224506.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtune_mnist_asha\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus_per_trial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mtune_mnist_pbt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus_per_trial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/328169407.py\u001b[0m in \u001b[0;36mtune_mnist_pbt\u001b[0;34m(num_samples, num_epochs, gpus_per_trial, data_dir)\u001b[0m\n\u001b[1;32m 38\u001b[0m run_config=air.RunConfig(\n\u001b[1;32m 39\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"tune_mnist_asha\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0mtune_mnist_pbt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreporter\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m ),\n\u001b[1;32m 42\u001b[0m \u001b[0mparam_space\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'tune_mnist_pbt'" + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mTypeError\u001B[0m Traceback (most recent call last)", + 
"\u001B[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/1146224506.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 3\u001B[0m \u001B[0mtune_mnist_asha\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mnum_samples\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnum_epochs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m6\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgpus_per_trial\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdata_dir\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mdata_dir\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 4\u001B[0;31m \u001B[0mtune_mnist_pbt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mnum_samples\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnum_epochs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m6\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgpus_per_trial\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdata_dir\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mdata_dir\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", + "\u001B[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/328169407.py\u001B[0m in \u001B[0;36mtune_mnist_pbt\u001B[0;34m(num_samples, num_epochs, gpus_per_trial, data_dir)\u001B[0m\n\u001B[1;32m 38\u001B[0m run_config=air.RunConfig(\n\u001B[1;32m 39\u001B[0m \u001B[0mname\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m\"tune_mnist_asha\"\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 40\u001B[0;31m \u001B[0mtune_mnist_pbt\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mreporter\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 41\u001B[0m ),\n\u001B[1;32m 42\u001B[0m 
\u001B[0mparam_space\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mconfig\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;31mTypeError\u001B[0m: __init__() got an unexpected keyword argument 'tune_mnist_pbt'" ] } ], diff --git a/doc/source/tune/examples/tune-wandb.ipynb b/doc/source/tune/examples/tune-wandb.ipynb index 7ff95bedb5661..b0faa529eedeb 100644 --- a/doc/source/tune/examples/tune-wandb.ipynb +++ b/doc/source/tune/examples/tune-wandb.ipynb @@ -300,7 +300,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-11-02 16:02:45,355\tINFO worker.py:1534 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266 \u001b[39m\u001b[22m\n", + "2022-11-02 16:02:45,355\tINFO worker.py:1534 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8266 \u001B[39m\u001B[22m\n", "2022-11-02 16:02:46,513\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, @@ -508,7 +508,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_function_wandb pid=14647)\u001b[0m 2022-11-02 16:03:17,149\tINFO wandb.py:282 -- Already logged into W&B.\n" + "\u001B[2m\u001B[36m(train_function_wandb pid=14647)\u001B[0m 2022-11-02 16:03:17,149\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, { @@ -554,10 +554,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_function_wandb pid=14660)\u001b[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14661)\u001b[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14663)\u001b[0m 2022-11-02 16:03:20,628\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14662)\u001b[0m 2022-11-02 16:03:20,723\tINFO wandb.py:282 -- Already logged into W&B.\n", + 
"\u001B[2m\u001B[36m(train_function_wandb pid=14660)\u001B[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14661)\u001B[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14663)\u001B[0m 2022-11-02 16:03:20,628\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14662)\u001B[0m 2022-11-02 16:03:20,723\tINFO wandb.py:282 -- Already logged into W&B.\n", "2022-11-02 16:03:22,565\tINFO tune.py:788 -- Total run time: 8.60 seconds (8.48 seconds for the tuning loop).\n" ] }, @@ -642,7 +642,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(WandbTrainable pid=14718)\u001b[0m 2022-11-02 16:03:25,742\tINFO wandb.py:282 -- Already logged into W&B.\n" + "\u001B[2m\u001B[36m(WandbTrainable pid=14718)\u001B[0m 2022-11-02 16:03:25,742\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, { @@ -688,10 +688,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(WandbTrainable pid=14739)\u001b[0m 2022-11-02 16:03:30,360\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14740)\u001b[0m 2022-11-02 16:03:30,393\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14737)\u001b[0m 2022-11-02 16:03:30,454\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14738)\u001b[0m 2022-11-02 16:03:30,510\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14739)\u001B[0m 2022-11-02 16:03:30,360\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14740)\u001B[0m 2022-11-02 16:03:30,393\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14737)\u001B[0m 2022-11-02 16:03:30,454\tINFO wandb.py:282 -- 
Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14738)\u001B[0m 2022-11-02 16:03:30,510\tINFO wandb.py:282 -- Already logged into W&B.\n", "2022-11-02 16:03:31,985\tINFO tune.py:788 -- Total run time: 9.40 seconds (9.27 seconds for the tuning loop).\n" ] }, diff --git a/python/ray/air/examples/lightgbm_example.ipynb b/python/ray/air/examples/lightgbm_example.ipynb deleted file mode 120000 index 6501466a10a1d..0000000000000 --- a/python/ray/air/examples/lightgbm_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/lightgbm_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/lightgbm_example.ipynb b/python/ray/air/examples/lightgbm_example.ipynb new file mode 100644 index 0000000000000..3280a55a77440 --- /dev/null +++ b/python/ray/air/examples/lightgbm_example.ipynb @@ -0,0 +1,503 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "0d385409", + "metadata": {}, + "source": [ + "(air-lightgbm-example-ref)=\n", + "\n", + "# Training a model with distributed LightGBM\n", + "In this example we will train a model in Ray AIR using distributed LightGBM." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "07d92cee", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "86131abe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" + ] + } + ], + "source": [ + "!pip install -qU \"ray[tune]\" lightgbm_ray" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "135fc884", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "102ef1ac", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2023-07-07 14:34:14,951\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", + "2023-07-07 14:34:15,892\tINFO util.py:159 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "from typing import Tuple\n", + "\n", + "import ray\n", + "from ray.train.lightgbm import LightGBMPredictor\n", + "from ray.data.preprocessors.chain import Chain\n", + "from ray.data.preprocessors.encoder import Categorizer\n", + "from ray.train.lightgbm import LightGBMTrainer\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.data import Dataset\n", + "from ray.data.preprocessors import StandardScaler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7d102bd", + "metadata": {}, + "source": [ + "Next we define a function to load our train, validation, and test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f1f35cd7", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns(cols=[\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8f7afbce", + "metadata": {}, + "source": [ + "The following function will create a LightGBM trainer, train it, and return the result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fefcbc8a", + "metadata": {}, + "outputs": [], + "source": [ + "def train_lightgbm(num_workers: int, use_gpu: bool = False) -> Result:\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns, and categorify the categorical_column,\n", + " # allowing LightGBM to use its built-in categorical feature support\n", + " preprocessor = Chain(\n", + " Categorizer([\"categorical_column\"]), \n", + " StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n", + " )\n", + "\n", + " # LightGBM specific params\n", + " params = {\n", + " \"objective\": \"binary\",\n", + " \"metric\": [\"binary_logloss\", \"binary_error\"],\n", + " }\n", + "\n", + " trainer = LightGBMTrainer(\n", + " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", + " label_column=\"target\",\n", + " params=params,\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " num_boost_round=100,\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "04d278ae", + "metadata": {}, + "source": [ + "Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3f1d0c19", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from ray.train import Checkpoint\n", + "from ray.data import ActorPoolStrategy\n", + "\n", + "\n", + "class Predict:\n", + "\n", + " def __init__(self, checkpoint: Checkpoint):\n", + " self.predictor = LightGBMPredictor.from_checkpoint(checkpoint)\n", + "\n", + " def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n", + " return self.predictor.predict(batch)\n", + "\n", + "\n", + "def predict_lightgbm(result: Result):\n", + " _, _, test_dataset = prepare_data()\n", + "\n", + " scores = test_dataset.map_batches(\n", + " Predict, \n", + " fn_constructor_args=[result.checkpoint], \n", + " compute=ActorPoolStrategy(), \n", + " batch_format=\"pandas\"\n", + " )\n", + " \n", + " predicted_labels = scores.map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2bb0e5df", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8244ff3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-07-07 14:34:34
Running for: 00:00:06.06
Memory: 12.2/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/10 CPUs, 0/0 GPUs\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-binary_logloss train-binary_error valid-binary_logloss
LightGBMTrainer_0c5ae_00000TERMINATED127.0.0.1:10027 101 4.5829 0.000202293 0 0.130232
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(get_pd_value_counts)]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)] -> AllToAllOperator[Aggregate]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + " \n", + "\u001B[A\n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "(pid=10027) Running: 
0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:00 TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)->MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=10027) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 7%|▋ | 1/14 [00:00<00:01, 7.59it/s]\n", + "\u001B[A \n", + "\n", + "\u001B[A\u001B[A \n", + "\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=10027) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 7%|▋ | 1/14 [00:00<00:01, 6.59it/s]\n", + "\u001B[A \n", + "\n", + "\u001B[A\u001B[A \n", + "\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\n", + "\u001B[A\n", + "\n", + " \n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)->MapBatches(StandardScaler._transform_pandas)]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Trying to bind port 51134...\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Binding port 51134 succeeded\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Listening...\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m [LightGBM] [Warning] Connecting to rank 1 failed, waiting for 200 milliseconds\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Connected to rank 0\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Local rank: 1, total number of machines: 2\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Warning] num_threads is set=2, n_jobs=-1 will be ignored. Current value: num_threads=2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m /Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/lightgbm/basic.py:1780: UserWarning: Overriding the parameters from Reference Dataset.\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m _log_warning('Overriding the parameters from Reference Dataset.')\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m /Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/lightgbm/basic.py:1513: UserWarning: categorical_column in param dict is overridden.\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m _log_warning(f'{cat_alias} in param dict is overridden.')\n", + "2023-07-07 14:34:34,087\tINFO tune.py:1148 -- Total run time: 7.18 seconds (6.05 seconds for the tuning loop).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'train-binary_logloss': 0.00020229312743896637, 'train-binary_error': 0.0, 'valid-binary_logloss': 
0.13023245107091222, 'valid-binary_error': 0.023529411764705882, 'time_this_iter_s': 0.021785974502563477, 'should_checkpoint': True, 'done': True, 'training_iteration': 101, 'trial_id': '0c5ae_00000', 'date': '2023-07-07_14-34-34', 'timestamp': 1688765674, 'time_total_s': 4.582904100418091, 'pid': 10027, 'hostname': 'Balajis-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 4.582904100418091, 'iterations_since_restore': 101, 'experiment_tag': '0'}\n" + ] + } + ], + "source": [ + "result = train_lightgbm(num_workers=2, use_gpu=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d7155d9b", + "metadata": {}, + "source": [ + "And perform inference on the obtained model:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "871c9be6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-07-07 14:34:36,769\tINFO read_api.py:374 -- To satisfy the requested parallelism of 20, each read task output will be split into 20 smaller blocks.\n", + "2023-07-07 14:34:38,655\tWARNING plan.py:567 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. 
A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "2023-07-07 14:34:38,668\tINFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2023-07-07 14:34:38,674\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches()->MapBatches(Predict)] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-07-07 14:34:38,674\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-07-07 14:34:38,676\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-07-07 14:34:38,701\tINFO actor_pool_map_operator.py:117 -- MapBatches()->MapBatches(Predict): Waiting for 1 pool actors to start...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREDICTED LABELS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "\r" + ] + } + ], + "source": [ + "predict_lightgbm(result)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/sklearn_example.ipynb b/python/ray/air/examples/sklearn_example.ipynb deleted file mode 120000 index d3fb3c5b1908c..0000000000000 --- a/python/ray/air/examples/sklearn_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/sklearn_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/sklearn_example.ipynb b/python/ray/air/examples/sklearn_example.ipynb new file mode 100644 index 0000000000000..29603d25a249e --- /dev/null +++ b/python/ray/air/examples/sklearn_example.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "c3192ac4", + "metadata": {}, + "source": [ + "# Training a model with Sklearn\n", + "In this example we will train a model in Ray AIR using a Sklearn classifier." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5a4823bf", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88f4bb39", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" sklearn" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c049c692", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c02eb5cd", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "\n", + "import ray\n", + "from ray.data import Dataset\n", + "from ray.train.sklearn import SklearnPredictor\n", + "from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.train.sklearn import SklearnTrainer\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "try:\n", + " from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier\n", + "except ImportError:\n", + " cuMLRandomForestClassifier = None" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52e017f1", + "metadata": {}, + "source": [ + "Next we define a function to load our train, validation, and test datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3631ed1e", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns([\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8d6c6d17", + "metadata": {}, + "source": [ + "The following function will create a Sklearn trainer, train it, and return the result." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fd39e42", + "metadata": {}, + "outputs": [], + "source": [ + "def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:\n", + " if use_gpu and not cuMLRandomForestClassifier:\n", + " raise RuntimeError(\"cuML must be installed for GPU enabled sklearn estimators.\")\n", + "\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns\n", + " columns_to_scale = [\"mean radius\", \"mean texture\"]\n", + " preprocessor = Chain(\n", + " OrdinalEncoder([\"categorical_column\"]), StandardScaler(columns=columns_to_scale)\n", + " )\n", + "\n", + " if use_gpu:\n", + " trainer_resources = {\"CPU\": 1, \"GPU\": 1}\n", + " estimator = cuMLRandomForestClassifier()\n", + " else:\n", + " trainer_resources = {\"CPU\": num_cpus}\n", + " estimator = RandomForestClassifier()\n", + "\n", + " trainer = SklearnTrainer(\n", + " estimator=estimator,\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " cv=5,\n", + " scaling_config=ScalingConfig(trainer_resources=trainer_resources),\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, 
+ { + "attachments": {}, + "cell_type": "markdown", + "id": "7d073994", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "43f9170a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8269\u001B[39m\u001B[22m\n", + "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "== Status ==
Current time: 2022-06-22 17:27:59 (running for 00:00:18.31)
Memory usage on this node: 10.7/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/12.9 GiB heap, 0.0/6.45 GiB objects
Result logdir: /home/ubuntu/ray_results/SklearnTrainer_2022-06-22_17-27-40
Number of trials: 1/1 (1 TERMINATED)
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) fit_time
SklearnTrainer_9dec8_00000TERMINATED172.31.43.110:1492629 1 15.6842 2.31571


" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(SklearnTrainer pid=1492629)\u001B[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for SklearnTrainer_9dec8_00000:\n", + " cv:\n", + " fit_time:\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", + " score_time:\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", + " test_score:\n", + " - 0.9625\n", + " - 0.9125\n", + " - 0.9875\n", + " - 1.0\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", + " done: false\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", + " iterations_since_restore: 1\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", + " should_checkpoint: true\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 9dec8_00000\n", + " valid:\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", + " \n", + "Result for SklearnTrainer_9dec8_00000:\n", + " cv:\n", + " 
fit_time:\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", + " score_time:\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", + " test_score:\n", + " - 0.9625\n", + " - 0.9125\n", + " - 0.9875\n", + " - 1.0\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", + " done: true\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", + " experiment_tag: '0'\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", + " iterations_since_restore: 1\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", + " should_checkpoint: true\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 9dec8_00000\n", + " valid:\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-22 17:27:59,333\tINFO tune.py:734 -- Total run time: 19.09 seconds (18.31 seconds for the tuning loop).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'valid': {'score_time': 0.03549623489379883, 'test_score': 0.9532163742690059}, 'cv': {'fit_time': array([2.22100377, 2.21548939, 2.20756745, 2.22235131, 2.31238961]), 'score_time': array([0.02246499, 0.02308655, 0.02564836, 0.02913785, 0.02122164]), 'test_score': array([0.9625 , 0.9125 , 0.9875 
, 1. , 0.93670886]), 'fit_time_mean': 2.235760307312012, 'fit_time_std': 0.03866614559685742, 'score_time_mean': 0.02431187629699707, 'score_time_std': 0.0028120522003997595, 'test_score_mean': 0.9598417721518986, 'test_score_std': 0.032128186960552516}, 'fit_time': 2.3157050609588623, 'time_this_iter_s': 15.684244871139526, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '9dec8_00000', 'experiment_id': 'f8215019c10e4a81ba2187c38e875365', 'date': '2022-06-22_17-27-59', 'timestamp': 1655918879, 'time_total_s': 15.684244871139526, 'pid': 1492629, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 15.684244871139526, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.0057866573333740234, 'experiment_tag': '0'}\n" + ] + } + ], + "source": [ + "result = train_sklearn(num_cpus=2, use_gpu=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e11cf27b", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {ref}`End-to-end: Offline Batch Inference `" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/upload_to_comet_ml.ipynb b/python/ray/air/examples/upload_to_comet_ml.ipynb deleted file mode 120000 index 
9c6606ee1475a..0000000000000 --- a/python/ray/air/examples/upload_to_comet_ml.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/upload_to_comet_ml.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/upload_to_comet_ml.ipynb b/python/ray/air/examples/upload_to_comet_ml.ipynb new file mode 100644 index 0000000000000..cad1483beba32 --- /dev/null +++ b/python/ray/air/examples/upload_to_comet_ml.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "98d7c620", + "metadata": {}, + "source": [ + "# Logging results and uploading models to Comet ML\n", + "In this example, we train a simple XGBoost model and log the training\n", + "results to Comet ML. We also save the resulting model checkpoints\n", + "as artifacts." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c6e66577", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6d6297ef", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray comet_ml" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c2e21446", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dffff484", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "from ray.train import Result, RunConfig, ScalingConfig\n", + "from ray.train.xgboost import XGBoostTrainer\n", + "from ray.air.integrations.comet import CometLoggerCallback" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "29fcd93b", + "metadata": {}, + "source": [ + "We define a simple function that returns our training dataset as a Dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cf830706", + "metadata": {}, + "outputs": [], + "source": [ + "def get_train_dataset() -> 
ray.data.Dataset:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " return dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0f48f948", + "metadata": {}, + "source": [ + "Now we define a simple training function. All the magic happens within the `CometLoggerCallback`:\n", + "\n", + "```python\n", + "CometLoggerCallback(\n", + " project_name=comet_project,\n", + " save_checkpoints=True,\n", + ")\n", + "```\n", + "\n", + "It will automatically log all results to Comet ML and upload the checkpoints as artifacts. It assumes you're logged in into Comet via an API key or your `~./.comet.config`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "230f23a3", + "metadata": {}, + "outputs": [], + "source": [ + "def train_model(train_dataset: ray.data.Dataset, comet_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " params={\"tree_method\": \"auto\"},\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset},\n", + " num_boost_round=10,\n", + " run_config=RunConfig(\n", + " callbacks=[\n", + " # This is the part needed to enable logging to Comet ML.\n", + " # It assumes Comet ML can find a valid API (e.g. 
by setting\n", + " # the ``COMET_API_KEY`` environment variable).\n", + " CometLoggerCallback(\n", + " project_name=comet_project,\n", + " save_checkpoints=True,\n", + " )\n", + " ]\n", + " ),\n", + " )\n", + " result = trainer.fit()\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "711b1d7d", + "metadata": {}, + "source": [ + "Let's kick off a run:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9bfd9a8d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-19 15:19:17,237\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265\u001B[39m\u001B[22m\n" + ] + }, + { + "data": { + "text/html": [ + "== Status ==
Current time: 2022-05-19 15:19:35 (running for 00:00:14.95)
Memory usage on this node: 10.2/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/5.12 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-19-19
Number of trials: 1/1 (1 TERMINATED)
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-rmse
XGBoostTrainer_ac544_00000TERMINATED127.0.0.1:19852 10 9.7203 0.030717


" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COMET WARNING: As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:21,584\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", + "COMET INFO: Experiment is live on comet.ml https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n", + "\n", + "COMET WARNING: Failed to add tag(s) None to the experiment\n", + "\n", + "COMET WARNING: Empty mapping given to log_params({}); ignoring\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:24,628\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:25,961\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,830\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,918\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 
--object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,923\tINFO context.py:70 -- Exec'ing worker with command: exec 
/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:29,272\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19876)\u001B[0m [15:19:29] task [xgboost.ray]:4505889744 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19875)\u001B[0m [15:19:29] task [xgboost.ray]:6941849424 got new rank 0\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 1.0.0 created\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for XGBoostTrainer_ac544_00000:\n", + " date: 2022-05-19_15-19-30\n", + " done: false\n", + " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 19852\n", + " should_checkpoint: true\n", + " time_since_restore: 6.529659032821655\n", + " time_this_iter_s: 6.529659032821655\n", + " time_total_s: 6.529659032821655\n", + " timestamp: 1652969970\n", + " timesteps_since_restore: 0\n", + " train-rmse: 0.357284\n", + " training_iteration: 1\n", + " trial_id: ac544_00000\n", + " warmup_time: 0.003961086273193359\n", + " \n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "COMET INFO: Scheduling the upload of 3 assets for a size of 2.48 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 2.0.0 created (previous was: 1.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 3.86 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 3.0.0 created (previous was: 2.0.0)\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has been fully uploaded successfully\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 5.31 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 4.0.0 created (previous was: 3.0.0)\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has been fully uploaded successfully\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 6.76 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has started 
uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 5.0.0 created (previous was: 4.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 8.21 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 6.0.0 created (previous was: 5.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 9.87 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 7.0.0 created (previous was: 6.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 11.46 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET 
WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 8.0.0 created (previous was: 7.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 12.84 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 9.0.0 created (previous was: 8.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 14.36 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 10.0.0 created (previous was: 9.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 16.37 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has started uploading asynchronously\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:33,890\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=569 in 7.96 seconds (4.61 pure XGBoost 
training time).\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 11.0.0 created (previous was: 10.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 16.39 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has started uploading asynchronously\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Comet.ml Experiment Summary\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Data:\n", + "COMET INFO: display_summary_level : 1\n", + "COMET INFO: url : https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n", + "COMET INFO: Metrics [count] (min, max):\n", + "COMET INFO: iterations_since_restore [10] : (1, 10)\n", + "COMET INFO: time_since_restore [10] : (6.529659032821655, 9.720295906066895)\n", + "COMET INFO: time_this_iter_s [10] : (0.3124058246612549, 6.529659032821655)\n", + "COMET INFO: time_total_s [10] : (6.529659032821655, 9.720295906066895)\n", + "COMET INFO: timestamp [10] : (1652969970, 1652969973)\n", + "COMET INFO: timesteps_since_restore : 0\n", + "COMET INFO: train-rmse [10] : (0.030717, 0.357284)\n", + "COMET INFO: training_iteration [10] : (1, 10)\n", + "COMET INFO: warmup_time : 0.003961086273193359\n", + "COMET INFO: Others:\n", + "COMET INFO: Created from : Ray\n", + "COMET INFO: Name : XGBoostTrainer_ac544_00000\n", + "COMET INFO: experiment_id : d3007bd6a2734b328fd90385485c5a8d\n", + "COMET INFO: trial_id : ac544_00000\n", + "COMET INFO: System Information:\n", + "COMET INFO: date : 2022-05-19_15-19-33\n", + "COMET INFO: hostname : Kais-MacBook-Pro.local\n", + "COMET INFO: node_ip : 127.0.0.1\n", + "COMET INFO: pid : 19852\n", + "COMET INFO: Uploads:\n", + "COMET INFO: artifact assets : 33 (107.92 KB)\n", + "COMET INFO: artifacts : 11\n", + "COMET INFO: environment details 
: 1\n", + "COMET INFO: filename : 1\n", + "COMET INFO: installed packages : 1\n", + "COMET INFO: notebook : 1\n", + "COMET INFO: source_code : 1\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)\n", + "COMET INFO: The Python SDK has 3600 seconds to finish before aborting...\n", + "COMET INFO: Waiting for completion of the file uploads (may take several seconds)\n", + "COMET INFO: The Python SDK has 10800 seconds to finish before aborting...\n", + "COMET INFO: Still uploading 6 file(s), remaining 21.05 KB/116.69 KB\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has been fully uploaded successfully\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has been fully uploaded successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for XGBoostTrainer_ac544_00000:\n", + " date: 2022-05-19_15-19-33\n", + " done: true\n", + " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n", + " experiment_tag: '0'\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 10\n", + " node_ip: 127.0.0.1\n", + " pid: 19852\n", + " should_checkpoint: true\n", + " time_since_restore: 9.720295906066895\n", + " time_this_iter_s: 0.39761900901794434\n", + " time_total_s: 9.720295906066895\n", + " timestamp: 1652969973\n", + " timesteps_since_restore: 0\n", + " train-rmse: 0.030717\n", + " training_iteration: 10\n", + " trial_id: ac544_00000\n", + " warmup_time: 0.003961086273193359\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-19 15:19:35,621\tINFO tune.py:753 -- Total run time: 15.75 seconds (14.94 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "comet_project = \"ray_air_example\"\n", + "\n", + "train_dataset = 
get_train_dataset()\n", + "result = train_model(train_dataset=train_dataset, comet_project=comet_project)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "be28bdd3", + "metadata": {}, + "source": [ + "Check out your [Comet ML](https://www.comet.ml/) project to see the results!" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "orphan": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/upload_to_wandb.ipynb b/python/ray/air/examples/upload_to_wandb.ipynb deleted file mode 120000 index e241f6fcbd391..0000000000000 --- a/python/ray/air/examples/upload_to_wandb.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/upload_to_wandb.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/upload_to_wandb.ipynb b/python/ray/air/examples/upload_to_wandb.ipynb new file mode 100644 index 0000000000000..8c59392194904 --- /dev/null +++ b/python/ray/air/examples/upload_to_wandb.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "f37e8a9f", + "metadata": {}, + "source": [ + "# Logging results and uploading models to Weights & Biases\n", + "In this example, we train a simple XGBoost model and log the training\n", + "results to Weights & Biases. We also save the resulting model checkpoints\n", + "as artifacts.\n", + "\n", + "There are two ways to achieve this:\n", + "\n", + "1. Automatically using the `ray.air.integrations.wandb.WandbLoggerCallback`\n", + "2. 
Manually using the `wandb` API\n", + "\n", + "This tutorial will walk you through both options." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "27d04c97", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4e697e5d", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray wandb" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3096e7c9", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9c286701", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "from ray.train import Result, RunConfig, ScalingConfig\n", + "from ray.air.integrations.wandb import WandbLoggerCallback\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2efa1564", + "metadata": {}, + "source": [ + "We define a simple function that returns our training dataset as a Dataset:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a63ebd10", + "metadata": {}, + "outputs": [], + "source": [ + "def get_train_dataset() -> ray.data.Dataset:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " return dataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5fc1ca73", + "metadata": {}, + "source": [ + "And that's the common parts. We now dive into the two options to interact with Weights and Biases." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d07cf41f", + "metadata": {}, + "source": [ + "## Using the WandbLoggerCallback\n", + "\n", + "The WandbLoggerCallback does all the logging and reporting for you. It is especially useful when you use an out-of-the-box trainer like `XGBoostTrainer`. 
In these trainers, you don't define your own training loop, so using the AIR W&B callback is the best way to log your results to Weights and Biases.\n", + "\n", + "First we define a simple training function.\n", + "\n", + "All the magic happens within the `WandbLoggerCallback`:\n", + "\n", + "```python\n", + "WandbLoggerCallback(\n", + " project=wandb_project,\n", + " save_checkpoints=True,\n", + ")\n", + "```\n", + "\n", + "It will automatically log all results to Weights & Biases and upload the checkpoints as artifacts. It assumes you're logged in into Wandb via an API key or `wandb login`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "52edfde0", + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.xgboost import XGBoostTrainer\n", + "\n", + "\n", + "def train_model_xgboost(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " params={\"tree_method\": \"auto\"},\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset},\n", + " num_boost_round=10,\n", + " run_config=RunConfig(\n", + " callbacks=[\n", + " # This is the part needed to enable logging to Weights & Biases.\n", + " # It assumes you've logged in before, e.g. with `wandb login`.\n", + " WandbLoggerCallback(\n", + " project=wandb_project,\n", + " save_checkpoints=True,\n", + " )\n", + " ]\n", + " ),\n", + " )\n", + " result = trainer.fit()\n", + " return result\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1959ce19", + "metadata": {}, + "source": [ + "Let's kick off a run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f80d6c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-10-28 16:28:19,325\tINFO worker.py:1524 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n", + "2022-10-28 16:28:22,993\tWARNING read_api.py:297 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "2022-10-28 16:28:26,033\tINFO wandb.py:267 -- Already logged into W&B.\n" + ] + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wandb_project = \"ray_air_example_xgboost\"\n", + "\n", + "train_dataset = get_train_dataset()\n", + "result = train_model_xgboost(train_dataset=train_dataset, wandb_project=wandb_project)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "78701c42", + "metadata": {}, + "source": [ + "Check out your [WandB](https://wandb.ai/) project to see the results!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a215b6d4", + "metadata": {}, + "source": [ + "## Using the `wandb` API\n", + "\n", + "When you define your own training loop, you sometimes want to manually interact with the Weights and Biases API. Ray AIR provides a `setup_wandb()` function that takes care of the initialization.\n", + "\n", + "The main benefit here is that authentication to Weights and Biases is automatically set up for you, and sensible default names for your runs are set. Of course, you can override these.\n", + "\n", + "Additionally in distributed training you often only want to report the results of the rank 0 worker. This can also be done automatically using our setup.\n", + "\n", + "Let's define a distributed training loop. 
The important part here are:\n", + "\n", + " wandb = setup_wandb(config)\n", + " \n", + "and later\n", + "\n", + " wandb.log({\"loss\": loss.item()})\n", + " \n", + "The call to `setup_wandb()` will setup your session, for instance calling `wandb.init()` with sensible defaults. Because we are in a distributed training setting, this will only happen for the rank 0 - all other workers get a mock object back, and any subsequent calls to `wandb.XXX` will be a no-op for these.\n", + "\n", + "You can then use the `wandb` as usual:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "154e233d", + "metadata": {}, + "outputs": [], + "source": [ + "from ray import train\n", + "from ray.air.integrations.wandb import setup_wandb\n", + "from ray.data.preprocessors import Concatenator\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "import torch.optim as optim\n", + "import torch.nn as nn\n", + "\n", + "def train_loop(config):\n", + " wandb = setup_wandb(config, project=config.get(\"wandb_project\"))\n", + " \n", + " dataset = train.get_dataset_shard(\"train\")\n", + "\n", + " model = nn.Linear(30, 2)\n", + "\n", + " optimizer = optim.SGD(\n", + " model.parameters(),\n", + " lr=config.get(\"lr\", 0.01),\n", + " )\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " \n", + " for batch in dataset.iter_torch_batches(batch_size=32):\n", + " X = batch[\"data\"]\n", + " y = batch[\"target\"]\n", + " \n", + " # Compute prediction error\n", + " pred = model(X)\n", + " loss = loss_fn(pred, y)\n", + "\n", + " # Backpropagation\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " train.report({\"loss\": loss.item()})\n", + " wandb.log({\"loss\": loss.item()})\n", + " " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9aa12feb", + "metadata": {}, + "source": [ + "Let's define a function to kick off the training - again, we can configure Weights and Biases settings in the config. 
But you could also just pass it to the setup function, e.g. like this:\n", + "\n", + " setup_wandb(config, project=\"my_project\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ae7c8c", + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.torch import TorchTrainer\n", + "\n", + "\n", + "def train_model_torch(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = TorchTrainer(\n", + " train_loop_per_worker=train_loop,\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " train_loop_config={\"lr\": 0.01, \"wandb_project\": wandb_project},\n", + " datasets={\"train\": train_dataset},\n", + " preprocessor=Concatenator(\"data\", dtype=np.float32, exclude=[\"target\"]),\n", + " )\n", + " result = trainer.fit()\n", + " return result\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "12049bcf", + "metadata": {}, + "source": [ + "Let's kick off this run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3825b35b", + "metadata": {}, + "outputs": [], + "source": [ + "wandb_project = \"ray_air_example_torch\"\n", + "\n", + "train_dataset = get_train_dataset()\n", + "result = train_model_torch(train_dataset=train_dataset, wandb_project=wandb_project)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "75fddee7", + "metadata": {}, + "source": [ + "Check out your [WandB](https://wandb.ai/) project to see the results!" 
+ ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.9" + }, + "orphan": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/xgboost_example.ipynb b/python/ray/air/examples/xgboost_example.ipynb deleted file mode 120000 index 5780af02bdf5c..0000000000000 --- a/python/ray/air/examples/xgboost_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/xgboost_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/xgboost_example.ipynb b/python/ray/air/examples/xgboost_example.ipynb new file mode 100644 index 0000000000000..0ebcea9470a8a --- /dev/null +++ b/python/ray/air/examples/xgboost_example.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5fb89b3d", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "(air-xgboost-example-ref)=\n", + "\n", + "# Training a model with distributed XGBoost\n", + "In this example we will train a model in Ray AIR using distributed XGBoost." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "53d57c1f", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "41f20cc1", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" + ] + } + ], + "source": [ + "!pip install -qU \"ray[tune]\" xgboost_ray" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d2fe8d4a", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7232303d", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "import ray\n", + "from ray.train.xgboost import XGBoostPredictor\n", + "from ray.train.xgboost import XGBoostTrainer\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.data import Dataset\n", + "from ray.data.preprocessors import StandardScaler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1c75b5ca", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Next we define a function to load our train, validation, and test datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "37c4f38f", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns([\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9b2850dd", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The following function will create a XGBoost trainer, train it, and return the result." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dae8998d", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def train_xgboost(num_workers: int, use_gpu: bool = False) -> Result:\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns\n", + " columns_to_scale = [\"mean radius\", \"mean texture\"]\n", + " preprocessor = StandardScaler(columns=columns_to_scale)\n", + "\n", + " # XGBoost specific params\n", + " params = {\n", + " \"tree_method\": \"approx\",\n", + " \"objective\": \"binary:logistic\",\n", + " \"eval_metric\": [\"logloss\", \"error\"],\n", + " }\n", + "\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", + " label_column=\"target\",\n", + " params=params,\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " num_boost_round=100,\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ce05af87", + "metadata": {}, + "source": [ + "Once we have the result, we can do batch inference on the 
obtained model. Let's define a utility function for this." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5b8076d3", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from ray.train import Checkpoint\n", + "from ray.data import ActorPoolStrategy\n", + "\n", + "\n", + "class Predict:\n", + "\n", + " def __init__(self, checkpoint: Checkpoint):\n", + " self.predictor = XGBoostPredictor.from_checkpoint(checkpoint)\n", + "\n", + " def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n", + " return self.predictor.predict(batch)\n", + "\n", + "\n", + "def predict_xgboost(result: Result):\n", + " _, _, test_dataset = prepare_data()\n", + "\n", + " scores = test_dataset.map_batches(\n", + " Predict, \n", + " fn_constructor_args=[result.checkpoint], \n", + " compute=ActorPoolStrategy(), \n", + " batch_format=\"pandas\"\n", + " )\n", + " \n", + " predicted_labels = scores.map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7e172f66", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0f96d62b", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-07-06 18:33:25
Running for: 00:00:06.19
Memory: 14.9/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 2.0/10 CPUs, 0/0 GPUs\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-logloss train-error valid-logloss
XGBoostTrainer_40fed_00000TERMINATED127.0.0.1:40725 101 4.90132 0.00587595 0 0.06215
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate]\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\n", + "\u001B[A\n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:00 TaskPoolMapOperator[MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:01 TaskPoolMapOperator[MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:01 ActorPoolMapOperator[MapBatches()->MapBatches(Predict)] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-07-06 18:33:28,112\tINFO streaming_executor.py:93 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-07-06 18:33:28,114\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-07-06 18:33:28,150\tINFO actor_pool_map_operator.py:117 -- MapBatches()->MapBatches(Predict): Waiting for 1 pool actors to start...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREDICTED LABELS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "predict_xgboost(result)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "orphan": true, + "vscode": { + 
"interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb deleted file mode 120000 index a65044dfacf95..0000000000000 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb \ No newline at end of file diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb new file mode 100644 index 0000000000000..4133eb084c43f --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb @@ -0,0 +1,1200 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(gptj_deepspeed_finetune)=\n", + "\n", + "# GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed\n", + "\n", + "In this example, we will showcase how to use the Ray AIR for **GPT-J fine-tuning**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", + "\n", + "We will use Ray AIR (with the 🤗 Transformers integration) and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", + "\n", + "This example focuses more on the performance and distributed computing aspects of Ray AIR. 
If you are looking for a more beginner-friendly introduction to Ray AIR 🤗 Transformers integration, see {doc}`this example `.\n", + "\n", + "It is highly recommended to read [Ray Train Key Concepts](train-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", + "\n", + "```{note}\n", + "To run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The required amount of memory depends on the model. This notebook is tested with 16 g4dn.4xlarge instances (including the head node). If you wish to use a CPU head node, turn on [cloud checkpointing](tune-cloud-checkpointing) to avoid OOM errors that may happen due to the default behavior of syncing the checkpoint files to the head node.\n", + "```\n", + "\n", + "In this notebook, we will:\n", + "1. [Set up Ray](#setup)\n", + "2. [Load the dataset](#load)\n", + "3. [Preprocess the dataset with Ray AIR](#preprocess)\n", + "4. [Run the training with Ray AIR](#train)\n", + "5. [Generate text from prompt with Ray AIR](#predict)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment and run the following line in order to install all the necessary dependencies (this notebook is being tested with `transformers==4.26.0`):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#! pip install \"datasets\" \"evaluate\" \"accelerate==0.18.0\" \"transformers>=4.26.0\" \"torch>=1.12.0\" \"deepspeed==0.8.3\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Ray \n", + "\n", + "First, let's set some global variables. We will use 16 workers, each being assigned 1 GPU and 8 CPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"EleutherAI/gpt-j-6B\"\n", + "use_gpu = True\n", + "num_workers = 16\n", + "cpus_per_worker = 8" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use `ray.init()` to initialize a local cluster. By default, this cluster will be comprised of only the machine you are running this notebook on. You can also run this notebook on an Anyscale cluster.\n", + "\n", + "We define a {ref}`runtime environment ` to ensure that the Ray workers have access to all the necessary packages. You can omit the `runtime_env` argument if you have all of the packages already installed on each node in your cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.8.16
Ray version: 3.0.0.dev0
Dashboard:http://console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard
\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', python_version='3.8.16', ray_version='3.0.0.dev0', ray_commit='4ddbbb3c4b19c2d27bbf54f8c5ffc100dceafbcf', address_info={'node_ip_address': '10.0.30.196', 'raylet_ip_address': '10.0.30.196', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/raylet', 'webui_url': 'console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', 'session_dir': '/tmp/ray/session_2023-03-06_15-55-37_997701_162', 'metrics_export_port': 8085, 'gcs_address': '10.0.30.196:6379', 'address': '10.0.30.196:6379', 'dashboard_agent_listen_port': 52365, 'node_id': '77de483c435bf4987fd6f1e91d47602554e876fd41230d8d50c05333'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " # Latest combination of accelerate==0.19.0 and transformers==4.29.0\n", + " # seems to have issues with DeepSpeed process group initialization,\n", + " # and will result in a batch_size validation problem.\n", + " # TODO(jungong) : get rid of the pins once the issue is fixed.\n", + " \"accelerate==0.16.0\",\n", + " \"transformers==4.26.0\",\n", + " \"torch>=1.12.0\",\n", + " \"deepspeed==0.9.2\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "# THIS SHOULD BE HIDDEN IN DOCS AND ONLY RAN IN CI\n", + "# Download the model from our S3 mirror as it's faster\n", + "\n", + "import ray\n", + "import subprocess\n", + "import ray.util.scheduling_strategies\n", + "\n", + 
"\n", + "def force_on_node(node_id: str, remote_func_or_actor_class):\n", + " scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(\n", + " node_id=node_id, soft=False\n", + " )\n", + " options = {\"scheduling_strategy\": scheduling_strategy}\n", + " return remote_func_or_actor_class.options(**options)\n", + "\n", + "\n", + "def run_on_every_node(remote_func_or_actor_class, **remote_kwargs):\n", + " refs = []\n", + " for node in ray.nodes():\n", + " if node[\"Alive\"] and node[\"Resources\"].get(\"GPU\", None):\n", + " refs.append(\n", + " force_on_node(node[\"NodeID\"], remote_func_or_actor_class).remote(\n", + " **remote_kwargs\n", + " )\n", + " )\n", + " return ray.get(refs)\n", + "\n", + "\n", + "@ray.remote(num_gpus=1)\n", + "def download_model():\n", + " from transformers.utils.hub import TRANSFORMERS_CACHE\n", + "\n", + " path = os.path.expanduser(\n", + " os.path.join(TRANSFORMERS_CACHE, \"models--EleutherAI--gpt-j-6B\")\n", + " )\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"snapshots\", \"main\")])\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"refs\")])\n", + " if os.path.exists(os.path.join(path, \"refs\", \"main\")):\n", + " return\n", + " subprocess.run(\n", + " [\n", + " \"aws\",\n", + " \"s3\",\n", + " \"sync\",\n", + " \"--no-sign-request\",\n", + " \"s3://large-dl-models-mirror/models--EleutherAI--gpt-j-6B/main/\",\n", + " os.path.join(path, \"snapshots\", \"main\"),\n", + " ]\n", + " )\n", + " with open(os.path.join(path, \"snapshots\", \"main\", \"hash\"), \"r\") as f:\n", + " f_hash = f.read().strip()\n", + " with open(os.path.join(path, \"refs\", \"main\"), \"w\") as f:\n", + " f.write(f_hash)\n", + " os.rename(\n", + " os.path.join(path, \"snapshots\", \"main\"), os.path.join(path, \"snapshots\", f_hash)\n", + " )\n", + "\n", + "\n", + "_ = run_on_every_node(download_model)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Loading the dataset \n", + "\n", + "We will be fine-tuning the model on the [`tiny_shakespeare` dataset](https://huggingface.co/datasets/tiny_shakespeare), comprised of 40,000 lines of Shakespeare from a variety of Shakespeare's plays. The aim will be to make the GPT-J model better at generating text in the style of Shakespeare." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading tiny_shakespeare dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "65894225f3b84e5caa117c4d08d9f99d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 pd.DataFrame:\n", + " text = list(batch[\"text\"])\n", + " flat_text = \"\".join(text)\n", + " split_text = [\n", + " x.strip()\n", + " for x in flat_text.split(\"\\n\")\n", + " if x.strip() and not x.strip()[-1] == \":\"\n", + " ]\n", + " return pd.DataFrame(split_text, columns=[\"text\"])\n", + "\n", + "\n", + "def tokenize(batch: pd.DataFrame) -> dict:\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"text\"]),\n", + " truncation=True,\n", + " max_length=block_size,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "\n", + "splitter = BatchMapper(split_text, batch_format=\"pandas\")\n", + "tokenizer = BatchMapper(tokenize, batch_format=\"pandas\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fine-tuning 
the model with Ray AIR \n", + "\n", + "We can now configure Ray AIR's {class}`~ray.train.huggingface.TransformersTrainer` to perform distributed fine-tuning of the model. In order to do that, we specify a `trainer_init_per_worker` function, which creates a 🤗 Transformers `Trainer` that will be distributed by Ray using Distributed Data Parallelism (using PyTorch Distributed backend internally). This means that each worker will have its own copy of the model, but operate on different data, At the end of each step, all the workers will sync gradients.\n", + "\n", + "Because GPT-J is a relatively large model, it may not be possible to fit it on smaller GPU types (<=16 GB GRAM). To deal with that issue, we can use [DeepSpeed](https://github.com/microsoft/DeepSpeed), a library to optimize the training process and allow us to (among other things) offload and partition optimizer and parameter states, reducing GRAM usage. Furthermore, DeepSpeed ZeRO Stage 3 allows us to load large models without running out of memory.\n", + "\n", + "🤗 Transformers and Ray AIR's integration ({class}`~ray.train.huggingface.TransformersTrainer`) allow you to easily configure and use DDP and DeepSpeed. All you need to do is specify the DeepSpeed configuration in the [`TrainingArguments`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments) object.\n", + "\n", + "```{tip}\n", + "There are many DeepSpeed settings that allow you to trade-off speed for memory usage. The settings used below are tailored to the cluster setup used (16 g4dn.4xlarge nodes) and per device batch size of 16. Some things to keep in mind:\n", + "- If your GPUs support bfloat16, use that instead of float16 mixed precision to get better performance and prevent overflows. 
Replace `fp16=True` with `bf16=True` in `TrainingArguments`.\n", + "- If you are running out of GRAM: try reducing batch size (defined in the cell below the next one), set `\"overlap_comm\": False` in DeepSpeed config.\n", + "- If you are running out of RAM, add more nodes to your cluster, use nodes with more RAM, set `\"pin_memory\": False` in the DeepSpeed config, reduce the batch size, and remove `\"offload_param\"` from the DeepSpeed config.\n", + "\n", + "For more information on DeepSpeed configuration, refer to [Hugging Face documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed) and [DeepSpeed documentation](https://www.deepspeed.ai/docs/config-json/).\n", + "\n", + "Additionally, if you prefer a lower-level API, the logic below can be expressed as an [Accelerate training loop](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py) distributed by a Ray AIR {class}`~ray.train.torch.torch_trainer.TorchTrainer`.\n", + "```\n", + "\n", + "#### Training speed\n", + "\n", + "As we are using data parallelism, each worker operates on its own shard of the data. The batch size set in `TrainingArguments` is the **per device batch size** (per worker batch size). By changing the number of workers, we can change the **effective batch size** and thus the time needed for training to complete. The effective batch size is then calculated as `per device batch size * number of workers * number of gradient accumulation steps`. As we add more workers, the effective batch size rises and thus we need less time to complete a full epoch. While the speedup is not exactly linear due to extra communication overheads, in many cases it can be close to linear.\n", + "\n", + "The preprocessed dataset has 1348 examples. We have set per device batch size to 16.\n", + "\n", + "* With 16 g4dn.4xlarge nodes, the effective batch size was 256, which equals to 85 steps per epoch. 
One epoch took **~2440 seconds** (including initialization time).\n", + "\n", + "* With 32 g4dn.4xlarge nodes, the effective batch size was 512, which equals to 43 steps per epoch. One epoch took **~1280 seconds** (including initialization time)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluate\n", + "from transformers import Trainer, TrainingArguments\n", + "from transformers import (\n", + " GPTJForCausalLM,\n", + " AutoTokenizer,\n", + " default_data_collator,\n", + ")\n", + "from transformers.utils.logging import disable_progress_bar, enable_progress_bar\n", + "import torch\n", + "\n", + "from ray import train\n", + "\n", + "\n", + "def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):\n", + " # Use the actual number of CPUs assigned by Ray\n", + " os.environ[\"OMP_NUM_THREADS\"] = str(\n", + " train.get_context().get_trial_resources().bundles[-1].get(\"CPU\", 1)\n", + " )\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " batch_size = config.get(\"batch_size\", 4)\n", + " epochs = config.get(\"epochs\", 2)\n", + " warmup_steps = config.get(\"warmup_steps\", 0)\n", + " learning_rate = config.get(\"learning_rate\", 0.00002)\n", + " weight_decay = config.get(\"weight_decay\", 0.01)\n", + "\n", + " deepspeed = {\n", + " \"fp16\": {\n", + " \"enabled\": \"auto\",\n", + " \"initial_scale_power\": 8,\n", + " },\n", + " \"bf16\": {\"enabled\": \"auto\"},\n", + " \"optimizer\": {\n", + " \"type\": \"AdamW\",\n", + " \"params\": {\n", + " \"lr\": \"auto\",\n", + " \"betas\": \"auto\",\n", + " \"eps\": \"auto\",\n", + " },\n", + " },\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"overlap_comm\": 
True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": \"auto\",\n", + " \"stage3_prefetch_bucket_size\": \"auto\",\n", + " \"stage3_param_persistence_threshold\": \"auto\",\n", + " \"gather_16bit_weights_on_model_save\": True,\n", + " \"round_robin_gradients\": True,\n", + " },\n", + " \"gradient_accumulation_steps\": \"auto\",\n", + " \"gradient_clipping\": \"auto\",\n", + " \"steps_per_print\": 10,\n", + " \"train_batch_size\": \"auto\",\n", + " \"train_micro_batch_size_per_gpu\": \"auto\",\n", + " \"wall_clock_breakdown\": False,\n", + " }\n", + "\n", + " print(\"Preparing training arguments\")\n", + " training_args = TrainingArguments(\n", + " \"output\",\n", + " per_device_train_batch_size=batch_size,\n", + " logging_steps=1,\n", + " save_strategy=\"no\",\n", + " per_device_eval_batch_size=batch_size,\n", + " learning_rate=learning_rate,\n", + " weight_decay=weight_decay,\n", + " warmup_steps=warmup_steps,\n", + " label_names=[\"input_ids\", \"attention_mask\"],\n", + " num_train_epochs=epochs,\n", + " push_to_hub=False,\n", + " disable_tqdm=True, # declutter the output a little\n", + " fp16=True,\n", + " gradient_checkpointing=True,\n", + " deepspeed=deepspeed,\n", + " )\n", + " disable_progress_bar()\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + " print(\"Loading model\")\n", + "\n", + " model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)\n", + " model.resize_token_embeddings(len(tokenizer))\n", + "\n", + " print(\"Model loaded\")\n", + "\n", + " enable_progress_bar()\n", + "\n", + " metric = evaluate.load(\"accuracy\")\n", + "\n", + " def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " 
train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " data_collator=default_data_collator,\n", + " )\n", + " return trainer" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With our `trainer_init_per_worker` complete, we can now instantiate the {class}`~ray.train.huggingface.TransformersTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", + "\n", + "We pass the preprocessors we have defined earlier as an argument, wrapped in a {class}`~ray.data.preprocessors.chain.Chain`. The preprocessor will be included with the returned {class}`~ray.train.Checkpoint`, meaning it will also be applied during inference.\n", + "\n", + "```{note}\n", + "Since this example runs with multiple nodes, we need to persist checkpoints\n", + "and other outputs to some external storage for access after training has completed.\n", + "**You should set up cloud storage or NFS, then replace `storage_path` with your own cloud bucket URI or NFS path.**\n", + "\n", + "See the [storage guide](tune-storage-options) for more details.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage_path=\"s3://your-bucket-here\" # TODO: Set up cloud storage\n", + "# storage_path=\"/mnt/path/to/nfs\" # TODO: Alternatively, set up NFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "storage_path = \"/mnt/cluster_storage\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import TransformersTrainer\n", + "from ray.train import RunConfig, ScalingConfig\n", + "from ray.data.preprocessors 
import Chain\n", + "\n", + "\n", + "trainer = TransformersTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " trainer_init_config={\n", + " \"batch_size\": 16, # per device\n", + " \"epochs\": 1,\n", + " },\n", + " scaling_config=ScalingConfig(\n", + " num_workers=num_workers,\n", + " use_gpu=use_gpu,\n", + " resources_per_worker={\"GPU\": 1, \"CPU\": cpus_per_worker},\n", + " ),\n", + " datasets={\"train\": ray_datasets[\"train\"], \"evaluation\": ray_datasets[\"validation\"]},\n", + " preprocessor=Chain(splitter, tokenizer),\n", + " run_config=RunConfig(storage_path=storage_path),\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we call the {meth}`~ray.train.huggingface.TransformersTrainer.fit` method to start training with Ray AIR. We will save the {class}`~ray.train.Result` object to a variable so we can access metrics and checkpoints." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-03-06 17:18:41
Running for: 00:43:11.46
Memory: 31.9/62.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/256 CPUs, 0/16 GPUs, 0.0/675.29 GiB heap, 0.0/291.99 GiB objects (0.0/16.0 accelerator_type:T4)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) loss learning_rate epoch
TransformersTrainer_f623d_00000TERMINATED10.0.30.196:30861 85 2579.30.0715 4.70588e-07 1
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) 2023-03-06 16:36:00,447\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) warnings.warn(\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) warnings.warn(\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) warnings.warn(\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) warnings.warn(\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.57.85) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.29.205) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.51.113) 2023-03-06 16:36:00,454\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Preparing training arguments\n", + "(RayTrainWorker pid=31281) Loading model\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:37:21,252] [INFO] [partition_parameters.py:415:__exit__] finished initializing model with 6.05B parameters\n", + "(RayTrainWorker pid=31281) Model loaded\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Using cuda_amp half precision backend\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,431] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed info: version=0.8.1, git-hash=unknown, git-branch=unknown\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,450] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) ***** Running training *****\n", + "(RayTrainWorker pid=31281) Num examples = 1348\n", + "(RayTrainWorker pid=31281) Num Epochs = 
1\n", + "(RayTrainWorker pid=31281) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=31281) Total train batch size (w. parallel, distributed & accumulation) = 256\n", + "(RayTrainWorker pid=31281) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=31281) Total optimization steps = 85\n", + "(RayTrainWorker pid=31281) Number of trainable parameters = 0\n", + "(RayTrainWorker pid=31281) /home/ray/anaconda3/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:2387: UserWarning: torch.distributed._all_gather_base is a private function and will be deprecated. Please use torch.distributed.all_gather_into_tensor instead.\n", + "(RayTrainWorker pid=31281) warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed LR Scheduler = \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05], mom=[[0.9, 0.999]]\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [config.py:1009:print] DeepSpeedEngine configuration:\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] activation_checkpointing_config {\n", + "(RayTrainWorker pid=31281) \"partition_activations\": false, \n", + "(RayTrainWorker pid=31281) \"contiguous_memory_optimization\": false, \n", + "(RayTrainWorker pid=31281) \"cpu_checkpointing\": false, \n", + "(RayTrainWorker pid=31281) \"number_checkpoints\": null, \n", + "(RayTrainWorker pid=31281) \"synchronize_checkpoint_boundary\": false, \n", + "(RayTrainWorker pid=31281) \"profile\": 
false\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] autotuning_config ............ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"start_step\": null, \n", + "(RayTrainWorker pid=31281) \"end_step\": null, \n", + "(RayTrainWorker pid=31281) \"metric_path\": null, \n", + "(RayTrainWorker pid=31281) \"arg_mappings\": null, \n", + "(RayTrainWorker pid=31281) \"metric\": \"throughput\", \n", + "(RayTrainWorker pid=31281) \"model_info\": null, \n", + "(RayTrainWorker pid=31281) \"results_dir\": \"autotuning_results\", \n", + "(RayTrainWorker pid=31281) \"exps_dir\": \"autotuning_exps\", \n", + "(RayTrainWorker pid=31281) \"overwrite\": true, \n", + "(RayTrainWorker pid=31281) \"fast\": true, \n", + "(RayTrainWorker pid=31281) \"start_profile_step\": 3, \n", + "(RayTrainWorker pid=31281) \"end_profile_step\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_type\": \"gridsearch\", \n", + "(RayTrainWorker pid=31281) \"tuner_early_stopping\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_num_trials\": 50, \n", + "(RayTrainWorker pid=31281) \"model_info_path\": null, \n", + "(RayTrainWorker pid=31281) \"mp_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_batch_size\": null, \n", + "(RayTrainWorker pid=31281) \"min_train_batch_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n", + "(RayTrainWorker pid=31281) 
\"min_train_micro_batch_size_per_gpu\": 1, \n", + "(RayTrainWorker pid=31281) \"num_tuning_micro_batch_sizes\": 3\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] bfloat16_enabled ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_parallel_write_pipeline False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_enabled True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_fail False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] comms_config ................. \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] communication_data_type ...... None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_enabled_legacy .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_params_legacy ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_enabled ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dataloader_drop_last ......... 
False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] disable_allgather ............ False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dump_state ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dynamic_loss_scale_args ...... {'init_scale': 256, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_gas_boundary_resolution 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_name ........ bert.encoder.layer\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_num ......... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_max_iter .......... 100\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_stability ......... 1e-06\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_tol ............... 0.01\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_verbose ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] elasticity_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] flops_profiler_config ........ 
{\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"profile_step\": 1, \n", + "(RayTrainWorker pid=31281) \"module_depth\": -1, \n", + "(RayTrainWorker pid=31281) \"top_modules\": 1, \n", + "(RayTrainWorker pid=31281) \"detailed\": true, \n", + "(RayTrainWorker pid=31281) \"output_file\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_auto_cast ............... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_master_weights_and_gradients False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] global_rank .................. 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] grad_accum_dtype ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] gradient_accumulation_steps .. 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_clipping ............ 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_predivide_factor .... 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] initial_dynamic_scale ........ 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] load_universal_checkpoint .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] loss_scale ................... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] memory_breakdown ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] nebula_config ................ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"persistent_storage_path\": null, \n", + "(RayTrainWorker pid=31281) \"persistent_time_interval\": 100, \n", + "(RayTrainWorker pid=31281) \"num_of_version_in_retention\": 2, \n", + "(RayTrainWorker pid=31281) \"enable_nebula_load\": true, \n", + "(RayTrainWorker pid=31281) \"load_path\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_legacy_fusion ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_name ............... adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_params ............. {'lr': 2e-05, 'betas': [0.9, 0.999], 'eps': 1e-08}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] prescale_gradients ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_name ............... 
None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_params ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_attention ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_gradients_enabled ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] steps_per_print .............. 10\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_batch_size ............. 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_micro_batch_size_per_gpu 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] use_node_local_storage ....... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] wall_clock_breakdown ......... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] world_size ................... 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_allow_untested_optimizer False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_config .................. 
stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_optimization_stage ...... 
3\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,029] [INFO] [config.py:998:print_user_config] json = {\n", + "(RayTrainWorker pid=31281) \"fp16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": true, \n", + "(RayTrainWorker pid=31281) \"initial_scale_power\": 8\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"bf16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": false\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"optimizer\": {\n", + "(RayTrainWorker pid=31281) \"type\": \"AdamW\", \n", + "(RayTrainWorker pid=31281) \"params\": {\n", + "(RayTrainWorker pid=31281) \"lr\": 2e-05, \n", + "(RayTrainWorker pid=31281) \"betas\": [0.9, 0.999], \n", + "(RayTrainWorker pid=31281) \"eps\": 1e-08\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"zero_optimization\": {\n", + "(RayTrainWorker pid=31281) \"stage\": 3, \n", + "(RayTrainWorker pid=31281) \"offload_optimizer\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"offload_param\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"overlap_comm\": true, \n", + "(RayTrainWorker pid=31281) \"contiguous_gradients\": true, \n", + "(RayTrainWorker pid=31281) \"reduce_bucket_size\": 1.677722e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_prefetch_bucket_size\": 1.509949e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_param_persistence_threshold\": 4.096000e+04, \n", + "(RayTrainWorker pid=31281) \"gather_16bit_weights_on_model_save\": true, \n", + "(RayTrainWorker pid=31281) \"round_robin_gradients\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"gradient_accumulation_steps\": 1, \n", + 
"(RayTrainWorker pid=31281) \"gradient_clipping\": 1.0, \n", + "(RayTrainWorker pid=31281) \"steps_per_print\": 10, \n", + "(RayTrainWorker pid=31281) \"train_batch_size\": 256, \n", + "(RayTrainWorker pid=31281) \"train_micro_batch_size_per_gpu\": 16, \n", + "(RayTrainWorker pid=31281) \"wall_clock_breakdown\": false\n", + "(RayTrainWorker pid=31281) }\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Model weights saved in output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) tokenizer config file saved in output/checkpoint-85/tokenizer_config.json\n", + "(RayTrainWorker pid=31281) Special tokens file saved in output/checkpoint-85/special_tokens_map.json\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [engine.py:3516:save_16bit_model] Saving model weights to output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/pytorch_model.bin...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved output/checkpoint-85/pytorch_model.bin.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,087] [INFO] [logging.py:75:log_dist] [Rank 0] [Torch] Checkpoint global_step85 is begin to save!\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [logging.py:75:log_dist] [Rank 0] Saving model checkpoint: output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved 
output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,984] [INFO] [engine.py:3407:_save_zero_checkpoint] zero checkpoint saved output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:38,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85 is ready now!\n", + "(RayTrainWorker pid=31281) {'train_runtime': 2413.1243, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.035, 'train_loss': 0.32492108064539293, 'epoch': 1.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-06 17:18:41,018\tINFO tune.py:825 -- Total run time: 2591.59 seconds (2591.46 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "results = trainer.fit()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the returned {class}`~ray.train.Result` object to access metrics and the Ray AIR {class}`~ray.train.Checkpoint` associated with the last iteration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TransformersCheckpoint(local_path=/home/ray/ray_results/TransformersTrainer_2023-03-06_16-35-29/TransformersTrainer_f623d_00000_0_2023-03-06_16-35-30/checkpoint_000000)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checkpoint = results.checkpoint\n", + "checkpoint" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate text from prompt\n", + "\n", + "We can use the {class}`~ray.train.huggingface.huggingface_predictor.TransformersPredictor` to generate predictions from our fine-tuned model.\n", + "\n", + "```{tip}\n", + "For large scale batch inference, see {ref}`End-to-end: Offline Batch Inference `.\n", + "```\n", + "\n", + "Because the {class}`~ray.train.huggingface.huggingface_predictor.TransformersPredictor` uses a 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) under the hood, we disable the tokenizer AIR Preprocessor we have used for training and let the `pipeline` to tokenize the data itself." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint.set_preprocessor(None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also set `device_map=\"auto\"` so that the model is automatically placed on the right device and set the `task` to `\"text-generation\"`. The `predict` method passes the arguments to a 🤗 Transformers `pipeline` call." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import TransformersPredictor\n", + "import pandas as pd\n", + "\n", + "prompts = pd.DataFrame([\"Romeo and Juliet\", \"Romeo\", \"Juliet\"], columns=[\"text\"])\n", + "\n", + "# Predict on the head node.\n", + "predictor = TransformersPredictor.from_checkpoint(\n", + " checkpoint=checkpoint,\n", + " task=\"text-generation\",\n", + " torch_dtype=torch.float16 if use_gpu else None,\n", + " device_map=\"auto\",\n", + " use_gpu=use_gpu,\n", + ")\n", + "prediction = predictor.predict(\n", + " prompts,\n", + " do_sample=True,\n", + " temperature=0.9,\n", + " min_length=32,\n", + " max_length=128,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
generated_text
0Romeo and Juliet, they are married: and it is ...
1Romeo, thou art Romeo and a Montague; for only...
2Juliet's name; but I do not sound an ear to na...
\n", + "
" + ], + "text/plain": [ + " generated_text\n", + "0 Romeo and Juliet, they are married: and it is ...\n", + "1 Romeo, thou art Romeo and a Montague; for only...\n", + "2 Juliet's name; but I do not sound an ear to na..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "orphan": true, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb deleted file mode 120000 index 1c219bcfcb468..0000000000000 --- a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb \ No newline at end of file diff --git a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb new file mode 100644 index 0000000000000..06f3d5fc35fae --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], 
+ "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb deleted file mode 120000 index ccd34dcfc22fa..0000000000000 --- a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb \ No newline at end of file diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb new file mode 100644 index 0000000000000..74afc02ff0b08 --- /dev/null +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -0,0 +1,1425 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(vicuna_lightning_deepspeed_finetuning)=\n", + "\n", + "# Fine-tune `vicuna-13b` with Ray LightningTrainer and DeepSpeed\n", + "\n", + "In this example, we will demonstrate how to perform full fine-tuning for a [`vicuna-13b-v1.3`](https://huggingface.co/lmsys/vicuna-13b-v1.3) model using LightningTrainer with the DeepSpeed ZeRO-3 strategy.\n", + "\n", + "- [DeepSpeed]() is an open-source deep learning optimization library for PyTorch. 
It's designed to reduce computing power and memory usage, and to train large distributed models by leveraging state-of-the-art innovations like ZeRO, 3D-Parallelism, DeepSpeed-MoE, and ZeRO-Infinity. \n", + "- PyTorch Lightning offers a [DeepSpeed integration](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), which provides a simple interface to configure the knobs for DeepSpeed and automatically trigger your training process with the DeepSpeed Engine.\n", + "- {class}`Ray LightningTrainer ` allows you to easily scale your PyTorch Lightning job across multiple nodes in a Ray cluster, without worrying about the underlying cluster management, autoscaling, and distributed process group settings.\n", + "\n", + "Our demo aims to illustrate how these three tools can be combined effectively to finetune the Vicuna-13B model, leveraging the strengths of each to create an efficient and high-performance deep learning solution.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{note}\n", + "This is an advanced example of Large Language Model fine-tuning with Ray Train. If you're a beginner or new to the concepts of Ray Train and LightningTrainer, it would be beneficial to first explore the introductory documentation below to build a foundational understanding. 
\n", + "- [Ray Train Key Concepts](train-key-concepts) \n", + "- [Ray Data Key Concepts](data_key_concepts)\n", + "- {ref}`[Basic] Image Classification with LightningTrainer `\n", + "- {ref}`[Intermediate] Using LightningTrainer with Ray Data `\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cluster Setting\n", + "\n", + "\n", + "### Compute instances\n", + "In this example, we set up a Ray cluster on AWS with the following settings:\n", + "\n", + "| | num | instance type | GPU per node | GPU Memory | CPU Memory |\n", + "|-|-|-|-|-|-|\n", + "|Head node|1|g5.16xlarge|1 x A10G | 24 GB | 256 GB|\n", + "|Worker node|15|g5.4xlarge|1 x A10G | 24 GB | 64 GB|\n", + "\n", + "```{note}\n", + "In this example, we used 16 A10G GPUs for model training and tuned the DeepSpeed configurations for this setup. If you have a different cluster setup or GPUs with lower memory capacities, you may need to modify the DeepSpeed configurations and batch size to fit the model into the GPUs.\n", + "```\n", + "\n", + "```{tip}\n", + "We selected a GPU instance with additional CPU memory for the head node to demonstrate single-node offline inference. If you are training only, you can still opt for the g5.4xlarge instance for the head node.\n", + "```\n", + "\n", + "\n", + "### Cloud Storage\n", + "\n", + "Additionally, since the checkpoint size for this 13B parameter model can be large (~140GB), we choose to store the checkpoints in AWS S3. Thanks to the newly introduced distributed checkpointing feature in Ray 2.5, each worker can upload its own shards individually to the S3 bucket, greatly reducing the latency and network traffic of checkpoint syncing.\n", + "\n", + "### Local Storage\n", + "To demonstrate offline inference, we need to download and consolidate the model checkpoint onto the head node. This action requires around 200GB disk storage. 
Therefore, we mounted the NVMe SSD provided by g5 instances at `/dev/nvme1n1` to `/mnt/local_storage`, and we will save the checkpoints in this folder.\n", + "\n", + "For more details, please refer to [Amazon EBS and NVMe on Linux instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Ray Environment\n", + "\n", + "We define a runtime environment to ensure that the Ray workers have access to all necessary packages. If you have already included these dependencies in your Docker image or installed them on each node, you can ignore the `runtime_env` argument.\n", + "\n", + "```{note}\n", + "Note that the codebases of `transformers`, `accelerate`, and `deepspeed` are all rapidly changing, so we have pinned the package versions here to ensure testing stability. You can try other version combinations and feel free to report any issues you encounter.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "NUM_WORKERS = 16\n", + "BATCH_SIZE_PER_WORKER = 8\n", + "MODEL_NAME = \"lmsys/vicuna-13b-v1.3\"\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets==2.13.1\",\n", + " \"torch>=1.13.0\",\n", + " \"deepspeed==0.9.4\",\n", + " \"accelerate==0.20.3\",\n", + " \"transformers==4.30.2\",\n", + " \"pytorch_lightning==2.0.3\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and preprocess datasets\n", + "\n", + "We were impressed by LLM's ability of zero-shot text-generation, while some LLMs may not perform well in code generation due to the lack of code in the training corpus. The CMU [CoNaLa](https://conala-corpus.github.io/) (The Code/Natural Language Challenge) was designed to test systems for generating program snippets from natural language. 
Each data record contains an intent sentence and a one-line code snippet. The goal is to fine-tune the Vicuna model on this dataset, enabling the model to generate correct and runnable code snippets, thereby achieving natural language intent. Here are some examples:\n", + "\n", + "| intent | code snippet |\n", + "| - | - |\n", + "| \"convert a list of integers into a single integer\" | `r = int(''.join(map(str, x)))`|\n", + "| \"normalize a pandas dataframe `df` by row\" | `df.div(df.sum(axis=1), axis=0)` | \n", + "| \"Convert string '03:55' into datetime.time object\" | `datetime.datetime.strptime('03:55', '%H:%M').time()` |\n", + "\n", + "The CoNaLa team has released a dataset crawled from Stack Overflow, automatically filtered, then curated by annotators, split into 2379 training and 500 test examples. In addition, they also included an automatically-mined dataset with 600k examples. In this demo, we take all the curated data and the top 5000 mined data for fine-tuning." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we preprocess the CoNaLa dataset with Ray Data. You can also use HuggingFace Datasets and pass it directly to `LightningConfigBuilder.fit_params()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "535afe3e183b4cdfa61c39cbae788608", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00\",\n", + " axis=1,\n", + " )\n", + " return batch[[\"input_sentence\"]]\n", + "\n", + "\n", + "# Tokenize input sentences to tensors\n", + "def tokenize(batch):\n", + " tokenizer = AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, padding_side=\"left\", use_fast=False\n", + " )\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"input_sentence\"]),\n", + " truncation=True,\n", + " max_length=128,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "# Preprocess train dataset\n", + "processed_ds = ray_ds.map_batches(fill_prompt, batch_format=\"pandas\").map_batches(tokenize, batch_format=\"pandas\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define your model\n", + "\n", + "Here we load the pre-trained model weights from HuggingFace Model Hub, and wrap them into `pl.LightningModule`. We adopted the efficient model initialization techniques introduced in [Lightning-transformers](https://github.com/Lightning-Universe/lightning-transformers) to avoid unnecessary full weights loading." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-30 17:39:35,109] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + } + ], + "source": [ + "import torch\n", + "import transformers\n", + "import pytorch_lightning as pl\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM\n", + "from deepspeed.ops.adam import DeepSpeedCPUAdam\n", + "\n", + "\n", + "class ZeRO3Config:\n", + " def __init__(self, pl_module):\n", + " self.config = pl_module.trainer.strategy.config\n", + "\n", + " def __call__(self, *args, **kwargs):\n", + " return self\n", + "\n", + " def is_zero3(self) -> bool:\n", + " return True\n", + "\n", + "\n", + "def enable_transformers_pretrained_deepspeed_sharding(\n", + " pl_module: \"pl.LightningModule\",\n", + ") -> None:\n", + " transformers.deepspeed._hf_deepspeed_config_weak_ref = ZeRO3Config(pl_module)\n", + "\n", + "\n", + "class Vicuna13BModel(pl.LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " def setup(self, stage) -> None:\n", + " # Defer model initialization to inject deepspeed configs to HF.\n", + " # During initialization, HF transformers can immediately partition \n", + " # the model across all gpus avoid the overhead in time and memory \n", + " # copying it on CPU or each GPU first.\n", + " enable_transformers_pretrained_deepspeed_sharding(self)\n", + " self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n", + " if self.global_rank == 0:\n", + " print(\"DeepSpeed Configs: \", self.trainer.strategy.config)\n", + " print(\"Model Archetecture: \", self.model)\n", + "\n", + " def forward(self, batch):\n", + " outputs = self.model(\n", + " batch[\"input_ids\"],\n", + " labels=batch[\"labels\"],\n", + " 
attention_mask=batch[\"attention_mask\"],\n", + " )\n", + " return outputs.loss\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self.forward(batch)\n", + " self.log(\"train_loss\", loss, prog_bar=True, on_step=True, sync_dist=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return DeepSpeedCPUAdam(self.parameters(), lr=2e-5, weight_decay=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Configurations\n", + "\n", + "Before training, let's calculate the memory usage of finetuning a `vicuna-13b` model. Assume we are using FP16 mixed-precision training, and the optimizer is Adam with FP32 states.\n", + "\n", + "- Model parameters: 13(billion parameters) * 2(FP16) ≈ 26GB\n", + "- Optimizer states: 13(billion parameters) * 2(momentums per param) * 4 (FP32) ≈ 52GB\n", + "\n", + "As we can see, the model parameters themselves require 26GB, which cannot fit in a single A10G GPU, let alone the activations and optimizers states. Here, we use ZeRO stage-3 to partition the model, gradients, and optimizer states across 16 nodes. Additionally, we employ optimizer CPU offloading to reduce GRAM usage and increase throughput with larger batch sizes. We also disabled parameter offloading and activation checkpointing to improve the training speed.\n", + "\n", + "Regarding other knobs such as `reduce_bucket_size`, `stage3_prefetch_bucket_size` and `stage3_param_persistence_threshold`, we kept them as the [default values in HuggingFace](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero3-config). Feel free to further adjust them to speed up the training process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.lightning import LightningTrainer, LightningConfigBuilder\n", + "from transformers import AutoConfig\n", + "\n", + "config = AutoConfig.from_pretrained(MODEL_NAME)\n", + "HIDDEN_SIZE = config.hidden_size\n", + "\n", + "deepspeed_configs = {\n", + " \"zero_allow_untested_optimizer\": True,\n", + " \"bf16\": {\"enabled\": True},\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\"device\": \"cpu\", \"pin_memory\": True},\n", + " \"overlap_comm\": True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_prefetch_bucket_size\": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_param_persistence_threshold\": 10 * HIDDEN_SIZE,\n", + " },\n", + "}\n", + "\n", + "lightning_config = (\n", + " LightningConfigBuilder()\n", + " .module(cls=Vicuna13BModel)\n", + " .trainer(\n", + " max_epochs=1,\n", + " accelerator=\"gpu\",\n", + " precision=\"bf16-mixed\",\n", + " accumulate_grad_batches=2,\n", + " )\n", + " .strategy(name=\"deepspeed\", config=deepspeed_configs)\n", + " .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import TQDMProgressBar\n", + "\n", + "# Create a customized progress bar for LightningTrainer\n", + "class VicunaProgressBar(TQDMProgressBar):\n", + " def __init__(self, num_iters_per_epoch, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.num_iters_per_epoch = num_iters_per_epoch\n", + "\n", + " def on_train_epoch_start(self, trainer, *_):\n", + " super().on_train_epoch_start(trainer, *_)\n", + " self.train_progress_bar.reset(self.num_iters_per_epoch)\n", + "\n", + "\n", + "total_batches = 
processed_ds.count()\n", + "num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)\n", + "progress_bar = VicunaProgressBar(num_iters_per_epoch)\n", + "\n", + "\n", + "lightning_config.trainer(\n", + " callbacks=[progress_bar],\n", + " # Take a subset to accelerate release tests\n", + " limit_train_batches=20,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, combine all the configurations with {class}`LightningConfigBuilder ` and instantiate a LightningTrainer. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.air.config import CheckpointConfig, RunConfig, ScalingConfig\n", + "\n", + "trainer = LightningTrainer(\n", + " lightning_config=lightning_config.build(),\n", + " run_config=RunConfig(\n", + " name=\"vicuna-13b-finetune\",\n", + " storage_path=\"s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/air-release-tests\",\n", + " checkpoint_config=CheckpointConfig(\n", + " num_to_keep=1,\n", + " # Enable distributed checkpointing\n", + " _checkpoint_keep_all_ranks=True,\n", + " _checkpoint_upload_from_workers=True,\n", + " ),\n", + " ),\n", + " scaling_config=ScalingConfig(\n", + " num_workers=NUM_WORKERS,\n", + " use_gpu=True,\n", + " resources_per_worker={\"CPU\": 15, \"GPU\": 1},\n", + " ),\n", + " datasets={\"train\": processed_ds},\n", + " datasets_iter_config={\"batch_size\": BATCH_SIZE_PER_WORKER},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{tip}\n", + "\n", + "Here, we highly recommend saving checkpoints with cloud storage and enabling distributed checkpointing by setting `_checkpoint_keep_all_ranks` and `_checkpoint_upload_from_workers` to True when training huge models. 
Otherwise, all checkpoint shards will be synced to the head node, which may introduce enormous syncing overhead and even cause out-of-memory.\n", + "\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Fine-tuning\n", + "\n", + "Once everything is configured in LightningTrainer, training becomes easy. Simply call `trainer.fit()`, and your workload will be scaled to the Ray cluster, initiating ZeRO-3 parallel training." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-06-30 18:21:59
Running for: 00:42:22.75
Memory: 10.7/249.1 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 241.0/304 CPUs, 16.0/16 GPUs (0.0/16.0 accelerator_type:A10G)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train_loss epoch step
LightningTrainer_c1544_00000TERMINATED10.0.55.20:134103 1 2473.94 0.523438 0 29
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(pid=134103)\u001B[0m [2023-06-30 17:39:41,637] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Starting distributed worker processes: ['134267 (10.0.55.20)', '74152 (10.0.63.141)', '75476 (10.0.51.205)', '75547 (10.0.42.158)', '74711 (10.0.45.211)', '75132 (10.0.20.140)', '74502 (10.0.60.86)', '75695 (10.0.53.69)', '74457 (10.0.47.2)', '74569 (10.0.33.23)', '74341 (10.0.29.61)', '74274 (10.0.36.152)', '74561 (10.0.35.16)', '74427 (10.0.16.236)', '74273 (10.0.54.55)', '74996 (10.0.9.249)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m Setting up process group for: env:// [rank=0, world_size=16]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Executing DAG InputDataBuffer[Input] -> 
TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da7f200767b448d7b409fcdd07daecce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=134103) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00.*<' in xml string `line`\",\n", + " },\n", + " {\n", + " \"intent\": \"send a signal `signal.SIGUSR1` to the current process\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's begin by examining the generated outputs without fine-tuning. In this case study, we utilize [Aviary Explorer](https://aviary.anyscale.com), an open-source multi-LLM serving platform supported by Ray and Anyscale. You can easily select from a variety of open-source LLMs and compare their generation quality, cost, latency, and many other metrics.\n", + "\n", + "We constructed a prompt in a zero-shot learning manner and feed it into 3 OSS LLMs.\n", + "\n", + "![](https://user-images.githubusercontent.com/26745457/250704232-65a20f1b-6752-4d6c-bba1-8296a373162f.png)\n", + "\n", + "\n", + "- `vicuna-13b-v1.3` begins to speak Chinese.\n", + "- `mpt-7b-chat` generates a reasonable code snippet, but with multiple lines.\n", + "- `falcon-7b-sft` generates a one line snippet, but it doesn't seem to work.\n", + "\n", + "As we can see, none of them generate a satisfactory code snippet. 
\n", + "\n", + "Now let's check the performance of our fine-tuned `vicuna-13b-v1.3` model:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/pipelines/base.py:1081: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intent: replace white spaces in colunm 'col' of dataframe `df` with '_'\n", + "One-line code snippet: `df['col'] = df['col'].str.replace(' ', '_')`\n", + "\n", + "Intent: search for occurrences of regex pattern '>.*<' in xml string `line`\n", + "One-line code snippet: `re.findall('>.*<', line)``\n", + "\n", + "Intent: send a signal `signal.SIGUSR1` to the current process\n", + "One-line code snippet: `os.kill(os.getpid(), signal.SIGUSR1)``\n" + ] + } + ], + "source": [ + "for case in testcases:\n", + " prompt = PROMPT_TEMPLATE.format(intent=case[\"intent\"], snippet=\"\")\n", + " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", + " print(output[0][\"generated_text\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the Generated Code Snippets\n", + "\n", + "The generated code snippets look pretty reasonable. The results covered Pandas operations, regular expressions, and Linux commands. Let's test them one by one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before\n", + " col\n", + "0 abc def ghi\n", + "1 12 3 456\n", + "2 \n", + "After\n", + " col\n", + "0 abc_def_ghi\n", + "1 _12_3_456\n", + "2 _____\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame.from_dict({\"col\": [\"abc def ghi\", \" 12 3 456\", \" \"]})\n", + "print(\"Before\\n\", df)\n", + "\n", + "df[\"col\"] = df[\"col\"].str.replace(\" \", \"_\")\n", + "print(\"After\\n\", df)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['>The Great Gatsby<',\n", + " '>F. Scott Fitzgerald<',\n", + " '>1925<',\n", + " '>Sapiens: A Brief History of Humankind<',\n", + " '>Yuval Noah Harari<',\n", + " '>2011<']" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "\n", + "line = \"\"\"\n", + "\n", + " \n", + " The Great Gatsby\n", + " F. 
Scott Fitzgerald\n", + " 1925\n", + " \n", + " \n", + " Sapiens: A Brief History of Humankind\n", + " Yuval Noah Harari\n", + " 2011\n", + " \n", + "\n", + "\"\"\"\n", + "re.findall(\">.*<\", line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's hand it over to LLM and let it wrap up the demo:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, signal\n", + "\n", + "os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References:\n", + "\n", + "- [CoNaLa: The Code/Natural Language Challenge](https://conala-corpus.github.io/)\n", + "- [HuggingFace: DeepSpeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration)\n", + "- [HuggingFace: Handling big models for inference](https://huggingface.co/docs/accelerate/main/usage_guides/big_modeling)\n", + "- [Lightning Transformers: DeepSpeed Training with Big Transformer Models](https://lightning-transformers.readthedocs.io/en/latest/)\n", + "- [Aviary: Open Source Multi-LLM Serving](https://www.anyscale.com/blog/announcing-aviary-open-source-multi-llm-serving-solution)\n", + "- Rajbhandari, S., Rasley, J., et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054)\n", + "- Zheng, L., Chiang, W-L., Sheng, Y., et al. (2023). Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. 
[arXiv:2306.05685](https://arxiv.org/abs/2306.05685)\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}