From 54b7e096b264ccf2fd7e9a6729bf677966b1b933 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 18:49:35 -0400 Subject: [PATCH 01/10] Update gymnasium/pettingzoo/supersuit versions Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index c976b3f0889a9..1309fd8966239 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -5,7 +5,7 @@ # Atari # TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) gym==0.26.2 -gymnasium[atari,mujoco]==0.26.3 +gymnasium[atari,mujoco]==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. @@ -15,12 +15,9 @@ kaggle_environments==1.7.11 #mlagents==0.28.0 mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. -pettingzoo==1.22.1; python_version >= '3.7' -# When installing pettingzoo, chess is missing, even though its a dependancy -# TODO: remove if a future pettingzoo and/or ray version fixes this dependancy issue -chess==1.7.0 +pettingzoo==1.22.3; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.0; python_version >= '3.7' +supersuit==3.7.2; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. 
From 6b7bf1e8bb1b3d4ead9589262656cd127b84861b Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 18:56:58 -0400 Subject: [PATCH 02/10] pettingzoo_env: `return_info` from `reset()` also removes rendering mode from `render()` as that is specified in the environment initialization rather than in `render()` Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index ba2abc13ce734..651b5cce7f63b 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -136,7 +136,7 @@ def observation_space_contains(self, x: MultiAgentDict) -> bool: return all(self.observation_space.contains(val) for val in x.values()) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, return_info=True, options=options) + info = self.env.reset(seed=seed, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -175,7 +175,7 @@ def close(self): self.env.close() def render(self): - return self.env.render(self.render_mode) + return self.env.render() @property def get_sub_environments(self): @@ -221,7 +221,7 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) + obs, info = self.par_env.reset(seed=seed, options=options) return obs, info or {} def step(self, action_dict): From ea7afe8db005f81a64d03d12c244f76d7f2acb55 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 19:31:18 -0400 Subject: [PATCH 03/10] Switch to stable pettignzoo/shimmy releases Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 1309fd8966239..919d506f9e749 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -17,7 +17,7 @@ mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. pettingzoo==1.22.3; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.2; python_version >= '3.7' +supersuit==3.7.1; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. From 5c2256120502ceb631463ec34b92707fa3d9e42f Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Sat, 22 Apr 2023 19:33:21 -0400 Subject: [PATCH 04/10] Update pettingzoo_env to work with pettingzoo 1.22.3 Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 29 ++++------------------------ 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index 651b5cce7f63b..3be5c03c2a93b 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -3,7 +3,6 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.gym import convert_old_gym_space_to_gymnasium_space -from ray.rllib.utils.typing import MultiAgentDict @PublicAPI @@ -113,30 +112,10 @@ def __init__(self, env): ) self.action_space = convert_old_gym_space_to_gymnasium_space(first_action_space) - self._agent_ids = self.env.agents - - def observation_space_sample(self, agent_ids: list = None) -> MultiAgentDict: - if agent_ids is None: - agent_ids = self._agent_ids - return {id: self.observation_space.sample() for id in agent_ids} - - def action_space_sample(self, agent_ids: list = None) -> MultiAgentDict: - if agent_ids is None: - agent_ids = self._agent_ids - return {id: self.action_space.sample() for id in agent_ids} - - def action_space_contains(self, x: MultiAgentDict) 
-> bool: - if not isinstance(x, dict): - return False - return all(self.action_space.contains(val) for val in x.values()) - - def observation_space_contains(self, x: MultiAgentDict) -> bool: - if not isinstance(x, dict): - return False - return all(self.observation_space.contains(val) for val in x.values()) + self._agent_ids = set(self.env.agents) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, options=options) + info = self.env.reset(seed=seed, return_info=True, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -221,7 +200,7 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, options=options) + obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) return obs, info or {} def step(self, action_dict): @@ -234,7 +213,7 @@ def close(self): self.par_env.close() def render(self): - return self.par_env.render(self.render_mode) + return self.par_env.render() @property def get_sub_environments(self): From cf1bae909a5bfb8701d9e4ebb532f62221545fa7 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 16:38:20 -0400 Subject: [PATCH 05/10] Update to newest PettingZoo, SuperSuit versions Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index d8a7b3d0380a8..6e1cac0f4b6b1 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -15,9 +15,9 @@ kaggle_environments==1.7.11 #mlagents==0.28.0 mlagents_envs==0.28.0 # For tests on PettingZoo's multi-agent envs. 
-pettingzoo==1.22.3; python_version >= '3.7' +pettingzoo==1.23.0; python_version >= '3.7' pymunk==6.2.1 -supersuit==3.7.1; python_version >= '3.7' +supersuit==3.8.0; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. From fe372972a761ab8c75006b306a1fbc0c4a6c94d5 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 16:59:22 -0400 Subject: [PATCH 06/10] Merge PettingZooEnv from master Signed-off-by: Elliot Tower --- rllib/env/wrappers/pettingzoo_env.py | 29 ++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index 3be5c03c2a93b..3fd29732190eb 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -3,6 +3,7 @@ from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.gym import convert_old_gym_space_to_gymnasium_space +from ray.rllib.utils.typing import MultiAgentDict @PublicAPI @@ -112,10 +113,30 @@ def __init__(self, env): ) self.action_space = convert_old_gym_space_to_gymnasium_space(first_action_space) - self._agent_ids = set(self.env.agents) + self._agent_ids = self.env.agents + + def observation_space_sample(self, agent_ids: list = None) -> MultiAgentDict: + if agent_ids is None: + agent_ids = self._agent_ids + return {id: self.observation_space.sample() for id in agent_ids} + + def action_space_sample(self, agent_ids: list = None) -> MultiAgentDict: + if agent_ids is None: + agent_ids = self._agent_ids + return {id: self.action_space.sample() for id in agent_ids} + + def action_space_contains(self, x: MultiAgentDict) -> bool: + if not isinstance(x, dict): + return False + return all(self.action_space.contains(val) for val in x.values()) + + def observation_space_contains(self, x: MultiAgentDict) -> bool: + if not isinstance(x, dict): + return False + return 
all(self.observation_space.contains(val) for val in x.values()) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - info = self.env.reset(seed=seed, return_info=True, options=options) + info = self.env.reset(seed=seed, options=options) return ( {self.env.agent_selection: self.env.observe(self.env.agent_selection)}, info or {}, @@ -200,8 +221,8 @@ def __init__(self, env): ) def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): - obs, info = self.par_env.reset(seed=seed, return_info=True, options=options) - return obs, info or {} + obs, info = self.par_env.reset(seed=seed, options=options) + return obs, info def step(self, action_dict): obss, rews, terminateds, truncateds, infos = self.par_env.step(action_dict) From 20487c73299915083946a8035363250715ab1dd7 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 17:36:36 -0400 Subject: [PATCH 07/10] Update requirements_rllib.txt Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 6e1cac0f4b6b1..305a57a7cc99a 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -5,7 +5,7 @@ # Atari # TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) gym==0.26.2 -gymnasium[atari,mujoco]==0.28.1 +gymnasium==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. 
From f6666893a0e2c9c0ddf6bb5c544a1b7b6431c058 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 17:37:04 -0400 Subject: [PATCH 08/10] Update requirements_rllib.txt Signed-off-by: Elliot Tower --- python/requirements/ml/requirements_rllib.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 305a57a7cc99a..bb5621ea39f42 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -3,8 +3,6 @@ # Environment adapters. # --------------------- # Atari -# TODO(sven): Still needed for Atari (need to be wrapped by gymnasium as it does NOT support Atari yet) -gym==0.26.2 gymnasium==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 From 583107d462ea710d9578a74d013a041627a2d341 Mon Sep 17 00:00:00 2001 From: Elliot Tower Date: Thu, 18 May 2023 18:12:43 -0400 Subject: [PATCH 09/10] Update all uses of gymnasium to 0.28.1 Signed-off-by: Elliot Tower --- python/requirements.txt | 2 +- python/requirements/ml/requirements_rllib.txt | 2 +- python/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index e6bfd5a7e4667..5e569b8f0b30a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -41,7 +41,7 @@ scikit-image scipy aiohttp>=3.7 fastapi -gymnasium==0.26.3 +gymnasium==0.28.1 opencensus fsspec dm_tree diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index bb5621ea39f42..78beaad6404c3 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -3,7 +3,7 @@ # Environment adapters. # --------------------- # Atari -gymnasium==0.28.1 +gymnasium[atari,mujoco]==0.28.1 # For testing MuJoCo envs with gymnasium. mujoco-py<2.2,>=2.1 # Kaggle envs. 
diff --git a/python/setup.py b/python/setup.py index 11b725ed85a68..514f293434214 100644 --- a/python/setup.py +++ b/python/setup.py @@ -274,7 +274,7 @@ def get_packages(self): setup_spec.extras["rllib"] = setup_spec.extras["tune"] + [ "dm_tree", - "gymnasium==0.26.3", + "gymnasium==0.28.1", "lz4", "scikit-image", "pyyaml", From 39f6a8ce3ae149846fbb6bd76885f6498098a542 Mon Sep 17 00:00:00 2001 From: elliottower Date: Mon, 7 Aug 2023 11:32:00 -0400 Subject: [PATCH 10/10] Merge master --- .../batch_inference_object_detection.ipynb | 6 +- doc/source/data/examples/batch_training.ipynb | 2 +- .../huggingface_vit_batch_prediction.ipynb | 6 +- .../pytorch_resnet_batch_prediction.ipynb | 6 +- .../ray-air/examples/feast_example.ipynb | 86 +- .../huggingface_text_classification.ipynb | 2 +- .../opt_deepspeed_batch_inference.ipynb | 36 + .../ray-air/examples/sklearn_example.ipynb | 4 +- .../examples/tfx_tabular_train_to_serve.ipynb | 2 +- .../templates/01_batch_inference/start.ipynb | 6 +- .../lightning/lightning_mnist_example.ipynb | 232 +-- ...una_13b_lightning_deepspeed_finetune.ipynb | 474 +++--- .../pytorch/pytorch_resnet_finetune.ipynb | 2 +- doc/source/tune/examples/ax_example.ipynb | 4 +- .../tune/examples/bayesopt_example.ipynb | 4 +- doc/source/tune/examples/optuna_example.ipynb | 12 +- doc/source/tune/examples/tune-aim.ipynb | 808 +++++----- doc/source/tune/examples/tune-mlflow.ipynb | 2 +- .../tune-vanilla-pytorch-lightning.ipynb | 76 +- doc/source/tune/examples/tune-wandb.ipynb | 22 +- .../ray/air/examples/lightgbm_example.ipynb | 504 +++++- python/ray/air/examples/sklearn_example.ipynb | 357 ++++- .../ray/air/examples/upload_to_comet_ml.ipynb | 413 ++++- python/ray/air/examples/upload_to_wandb.ipynb | 370 ++++- python/ray/air/examples/xgboost_example.ipynb | 522 +++++- .../gptj_deepspeed_fine_tuning.ipynb | 1201 +++++++++++++- .../opt_deepspeed_batch_inference.ipynb | 37 +- ...una_13b_lightning_deepspeed_finetune.ipynb | 1426 ++++++++++++++++- 28 files changed, 
5736 insertions(+), 886 deletions(-) create mode 100644 doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb mode change 120000 => 100644 python/ray/air/examples/lightgbm_example.ipynb mode change 120000 => 100644 python/ray/air/examples/sklearn_example.ipynb mode change 120000 => 100644 python/ray/air/examples/upload_to_comet_ml.ipynb mode change 120000 => 100644 python/ray/air/examples/upload_to_wandb.ipynb mode change 120000 => 100644 python/ray/air/examples/xgboost_example.ipynb mode change 120000 => 100644 release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb mode change 120000 => 100644 release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb mode change 120000 => 100644 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb diff --git a/doc/source/data/examples/batch_inference_object_detection.ipynb b/doc/source/data/examples/batch_inference_object_detection.ipynb index 3f8da8947787b..a8026efe79566 100644 --- a/doc/source/data/examples/batch_inference_object_detection.ipynb +++ b/doc/source/data/examples/batch_inference_object_detection.ipynb @@ -452,10 +452,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2023-05-19 18:10:29] INFO ray._private.worker::Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "[2023-05-19 18:10:35] [Ray Data] WARNING ray.data.dataset::\u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "[2023-05-19 18:10:29] INFO ray._private.worker::Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32m127.0.0.1:8265 \u001B[39m\u001B[22m\n", + "[2023-05-19 18:10:35] [Ray Data] WARNING ray.data.dataset::\u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/data/examples/batch_training.ipynb b/doc/source/data/examples/batch_training.ipynb index a95542636b907..b751575e7d155 100644 --- a/doc/source/data/examples/batch_training.ipynb +++ b/doc/source/data/examples/batch_training.ipynb @@ -94,7 +94,7 @@ "text": [ "2022-12-08 17:04:06,689\tINFO worker.py:1223 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", "2022-12-08 17:04:06,691\tINFO worker.py:1333 -- Connecting to existing Ray cluster at address: 172.31.174.62:9031...\n", - "2022-12-08 17:04:06,700\tINFO worker.py:1509 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_gyl6mbksa8xt7b149ib6abld/services?redirect_to=dashboard \u001b[39m\u001b[22m\n" + "2022-12-08 17:04:06,700\tINFO worker.py:1509 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_gyl6mbksa8xt7b149ib6abld/services?redirect_to=dashboard \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb b/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb index 34d30c1801c03..fb58a027084dc 100644 --- a/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb +++ b/doc/source/data/examples/huggingface_vit_batch_prediction.ipynb @@ -72,12 +72,12 @@ "output_type": "stream", "text": [ "[2023-05-24 11:25:47] INFO ray._private.worker::Connecting to existing Ray cluster at address: 10.0.33.149:6379...\n", - "[2023-05-24 11:25:47] INFO ray._private.worker::Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_6h5a4kl2xhfgtdy4w41he6iwyw/services?redirect_to=dashboard \u001b[39m\u001b[22m\n", + "[2023-05-24 11:25:47] INFO ray._private.worker::Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_6h5a4kl2xhfgtdy4w41he6iwyw/services?redirect_to=dashboard \u001B[39m\u001B[22m\n", "[2023-05-24 11:25:47] INFO ray._private.runtime_env.packaging::Pushing file package 'gcs://_ray_pkg_2429254893b10da6df2b65ceaf858894.zip' (8.71MiB) to Ray cluster...\n", "[2023-05-24 11:25:47] INFO ray._private.runtime_env.packaging::Successfully pushed file package 'gcs://_ray_pkg_2429254893b10da6df2b65ceaf858894.zip'.\n", - "[2023-05-24 11:25:50] [Ray Data] WARNING ray.data.dataset::\u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "[2023-05-24 11:25:50] [Ray Data] WARNING ray.data.dataset::\u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb b/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb index 1497fb2120612..57e6dae4c5568 100644 --- a/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb +++ b/doc/source/data/examples/pytorch_resnet_batch_prediction.ipynb @@ -78,12 +78,12 @@ "output_type": "stream", "text": [ "2023-06-27 23:23:57,184\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.5.141:6379...\n", - "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001b[39m\u001b[22m\n", + "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001B[39m\u001B[22m\n", "2023-06-27 23:23:57,243\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip' (4.49MiB) to Ray cluster...\n", "2023-06-27 23:23:57,257\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip'.\n", - "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/ray-air/examples/feast_example.ipynb b/doc/source/ray-air/examples/feast_example.ipynb index ca735ec4f9603..16d5d0bbf3ec5 100644 --- a/doc/source/ray-air/examples/feast_example.ipynb +++ b/doc/source/ray-air/examples/feast_example.ipynb @@ -150,61 +150,61 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mdatetime\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m timedelta\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mdatetime\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m timedelta\n", "\n", - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mfeast\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m (Entity, Field, FeatureView, FileSource, ValueType)\n", - "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mfeast\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mtypes\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m Float32, Int64, String\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mfeast\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m (Entity, Field, FeatureView, FileSource, ValueType)\n", + "\u001B[34mfrom\u001B[39;49;00m \u001B[04m\u001B[36mfeast\u001B[39;49;00m\u001B[04m\u001B[36m.\u001B[39;49;00m\u001B[04m\u001B[36mtypes\u001B[39;49;00m \u001B[34mimport\u001B[39;49;00m Float32, Int64, String\n", "\n", "\n", - "zipcode = Entity(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, value_type=Int64)\n", + "zipcode = Entity(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, value_type=Int64)\n", "\n", 
"zipcode_source = FileSource(\n", - " path=\u001b[33m\"\u001b[39;49;00m\u001b[33mfeature_repo/data/zipcode_table.parquet\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " timestamp_field=\u001b[33m\"\u001b[39;49;00m\u001b[33mevent_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " created_timestamp_column=\u001b[33m\"\u001b[39;49;00m\u001b[33mcreated_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " path=\u001B[33m\"\u001B[39;49;00m\u001B[33mfeature_repo/data/zipcode_table.parquet\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " timestamp_field=\u001B[33m\"\u001B[39;49;00m\u001B[33mevent_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " created_timestamp_column=\u001B[33m\"\u001B[39;49;00m\u001B[33mcreated_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "zipcode_features = FeatureView(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode_features\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " entities=[\u001b[33m\"\u001b[39;49;00m\u001b[33mzipcode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m],\n", - " ttl=timedelta(days=\u001b[34m3650\u001b[39;49;00m),\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode_features\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " entities=[\u001B[33m\"\u001B[39;49;00m\u001B[33mzipcode\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m],\n", + " ttl=timedelta(days=\u001B[34m3650\u001B[39;49;00m),\n", " schema=[\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcity\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mstate\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mlocation_type\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=String),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mtax_returns_filed\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " 
Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mpopulation\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mtotal_wages\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcity\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mstate\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mlocation_type\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=String),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mtax_returns_filed\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mpopulation\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mtotal_wages\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", " ],\n", " source=zipcode_source,\n", ")\n", "\n", "dob_ssn = Entity(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mdob_ssn\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mdob_ssn\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", " value_type=ValueType.STRING,\n", - " description=\u001b[33m\"\u001b[39;49;00m\u001b[33mDate of birth and last four digits of social security number\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " description=\u001B[33m\"\u001B[39;49;00m\u001B[33mDate of birth and last four digits of social security number\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "credit_history_source = FileSource(\n", - " path=\u001b[33m\"\u001b[39;49;00m\u001b[33mfeature_repo/data/credit_history.parquet\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " timestamp_field=\u001b[33m\"\u001b[39;49;00m\u001b[33mevent_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " 
created_timestamp_column=\u001b[33m\"\u001b[39;49;00m\u001b[33mcreated_timestamp\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", + " path=\u001B[33m\"\u001B[39;49;00m\u001B[33mfeature_repo/data/credit_history.parquet\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " timestamp_field=\u001B[33m\"\u001B[39;49;00m\u001B[33mevent_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " created_timestamp_column=\u001B[33m\"\u001B[39;49;00m\u001B[33mcreated_timestamp\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", ")\n", "\n", "credit_history = FeatureView(\n", - " name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcredit_history\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m,\n", - " entities=[\u001b[33m\"\u001b[39;49;00m\u001b[33mdob_ssn\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m],\n", - " ttl=timedelta(days=\u001b[34m90\u001b[39;49;00m),\n", + " name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcredit_history\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m,\n", + " entities=[\u001B[33m\"\u001B[39;49;00m\u001B[33mdob_ssn\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m],\n", + " ttl=timedelta(days=\u001B[34m90\u001B[39;49;00m),\n", " schema=[\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mcredit_card_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmortgage_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mstudent_loan_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mvehicle_loan_due\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mhard_pulls\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_2y\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " 
Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_1y\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mmissed_payments_6m\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", - " Field(name=\u001b[33m\"\u001b[39;49;00m\u001b[33mbankruptcies\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcredit_card_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmortgage_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mstudent_loan_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mvehicle_loan_due\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mhard_pulls\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_2y\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_1y\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mmissed_payments_6m\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", + " Field(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mbankruptcies\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, dtype=Int64),\n", " ],\n", " source=credit_history_source,\n", ")\n" @@ -240,13 +240,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created entity \u001b[1m\u001b[32mzipcode\u001b[0m\n", - "Created entity \u001b[1m\u001b[32mdob_ssn\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mcredit_history\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mzipcode_features\u001b[0m\n", + "Created entity 
\u001B[1m\u001B[32mzipcode\u001B[0m\n", + "Created entity \u001B[1m\u001B[32mdob_ssn\u001B[0m\n", + "Created feature view \u001B[1m\u001B[32mcredit_history\u001B[0m\n", + "Created feature view \u001B[1m\u001B[32mzipcode_features\u001B[0m\n", "\n", - "Created sqlite table \u001b[1m\u001b[32mfeature_repo_credit_history\u001b[0m\n", - "Created sqlite table \u001b[1m\u001b[32mfeature_repo_zipcode_features\u001b[0m\n", + "Created sqlite table \u001B[1m\u001B[32mfeature_repo_credit_history\u001B[0m\n", + "Created sqlite table \u001B[1m\u001B[32mfeature_repo_zipcode_features\u001B[0m\n", "\n" ] } @@ -1049,7 +1049,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-09-12 19:25:14,018\tINFO worker.py:1508 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-09-12 19:25:14,018\tINFO worker.py:1508 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32m127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] } ], @@ -1196,9 +1196,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(XGBoostTrainer pid=348845)\u001b[0m /home/ray/.pyenv/versions/mambaforge/envs/ray/lib/python3.9/site-packages/xgboost_ray/main.py:431: UserWarning: `num_actors` in `ray_params` is smaller than 2 (1). XGBoost will NOT be distributed!\n", - "\u001b[2m\u001b[36m(XGBoostTrainer pid=348845)\u001b[0m warnings.warn(\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=348922)\u001b[0m [19:25:23] task [xgboost.ray]:140319682474864 got new rank 0\n" + "\u001B[2m\u001B[36m(XGBoostTrainer pid=348845)\u001B[0m /home/ray/.pyenv/versions/mambaforge/envs/ray/lib/python3.9/site-packages/xgboost_ray/main.py:431: UserWarning: `num_actors` in `ray_params` is smaller than 2 (1). 
XGBoost will NOT be distributed!\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=348845)\u001B[0m warnings.warn(\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=348922)\u001B[0m [19:25:23] task [xgboost.ray]:140319682474864 got new rank 0\n" ] }, { diff --git a/doc/source/ray-air/examples/huggingface_text_classification.ipynb b/doc/source/ray-air/examples/huggingface_text_classification.ipynb index 636f1e8429409..7269b76aa8004 100644 --- a/doc/source/ray-air/examples/huggingface_text_classification.ipynb +++ b/doc/source/ray-air/examples/huggingface_text_classification.ipynb @@ -83,7 +83,7 @@ "text": [ "2022-08-25 10:09:51,282\tINFO worker.py:1223 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", "2022-08-25 10:09:51,697\tINFO worker.py:1333 -- Connecting to existing Ray cluster at address: 172.31.80.117:9031...\n", - "2022-08-25 10:09:51,706\tINFO worker.py:1509 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2022-08-25 10:09:51,706\tINFO worker.py:1509 -- Connected to Ray cluster. 
View the dashboard at \u001B[1m\u001B[32mhttps://session-i8ddtfaxhwypbvnyb9uzg7xs.i.anyscaleuserdata-staging.com/auth/?token=agh0_CkcwRQIhAJXwvxwq31GryaWthvXGCXZebsijbuqi7qL2pCa5uROOAiBGjzsyXAJFHLlaEI9zSlNI8ewtghKg5UV3t8NmlxuMcRJmEiCtvjcKE0VPiU7iQx51P9oPQjfpo5g1RJXccVSS5005cBgCIgNuL2E6DAj9xazjBhDwj4veAUIMCP3ClJgGEPCPi94B-gEeChxzZXNfaThERFRmQVhId1lwYlZueWI5dVpnN3hT&redirect_to=dashboard \u001B[39m\u001B[22m\n", "2022-08-25 10:09:51,709\tINFO packaging.py:342 -- Pushing file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip' (0.34MiB) to Ray cluster...\n", "2022-08-25 10:09:51,714\tINFO packaging.py:351 -- Successfully pushed file package 'gcs://_ray_pkg_3332f64b0a461fddc20be71129115d0a.zip'.\n" ] diff --git a/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb new file mode 100644 index 0000000000000..06f3d5fc35fae --- /dev/null +++ b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/doc/source/ray-air/examples/sklearn_example.ipynb b/doc/source/ray-air/examples/sklearn_example.ipynb index a75ce1ac07f2a..29603d25a249e 100644 --- a/doc/source/ray-air/examples/sklearn_example.ipynb +++ b/doc/source/ray-air/examples/sklearn_example.ipynb @@ -158,7 +158,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray 
dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8269\u001b[39m\u001b[22m\n", + "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8269\u001B[39m\u001B[22m\n", "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n" ] @@ -186,7 +186,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(SklearnTrainer pid=1492629)\u001b[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" + "\u001B[2m\u001B[36m(SklearnTrainer pid=1492629)\u001B[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" ] }, { diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index f2c265ef0c91d..e17f78e214776 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -81,7 +81,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-11-08 22:33:29,918\tINFO worker.py:1528 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-11-08 22:33:29,918\tINFO worker.py:1528 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/templates/01_batch_inference/start.ipynb b/doc/source/templates/01_batch_inference/start.ipynb index 2929107794a1e..a53232d0ac12e 100644 --- a/doc/source/templates/01_batch_inference/start.ipynb +++ b/doc/source/templates/01_batch_inference/start.ipynb @@ -78,12 +78,12 @@ "output_type": "stream", "text": [ "2023-06-27 23:23:57,184\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.5.141:6379...\n", - "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001b[39m\u001b[22m\n", + "2023-06-27 23:23:57,228\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://session-kncgqf3p7w2j7qcsnz2safl4tj.i.anyscaleuserdata-staging.com \u001B[39m\u001B[22m\n", "2023-06-27 23:23:57,243\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip' (4.49MiB) to Ray cluster...\n", "2023-06-27 23:23:57,257\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_32ef287a3a39e82021e70d2413880a69.zip'.\n", - "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "2023-06-27 23:23:59,629\tWARNING dataset.py:253 -- \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", "\n", - "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n" ] }, { diff --git a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb index 34d724cd08fd9..2b92e9e2224b5 100644 --- a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb +++ b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb @@ -314,7 +314,7 @@ "output_type": "stream", "text": [ "2023-06-13 16:05:12,869\tINFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.28.253:6379...\n", - "2023-06-13 16:05:12,877\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_15dlj65vax84ljl7ayeplubryd/services?redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2023-06-13 16:05:12,877\tINFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at \u001B[1m\u001B[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_15dlj65vax84ljl7ayeplubryd/services?redirect_to=dashboard \u001B[39m\u001B[22m\n", "2023-06-13 16:05:13,036\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_488e346d50f332edaa288fdaa22b2bdc.zip' (52.65MiB) to Ray cluster...\n", "2023-06-13 16:05:13,221\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_488e346d50f332edaa288fdaa22b2bdc.zip'.\n", "2023-06-13 16:05:13,314\tINFO tune.py:226 -- Initializing Ray automatically. 
For cluster usage or custom Ray initialization, call `ray.init(...)` before `Trainer(...)`.\n" @@ -397,38 +397,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(pid=16995)\u001b[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "\u001b[2m\u001b[36m(pid=16995)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=16995)\u001b[0m 2023-06-13 16:05:24,007\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['17232 (10.0.28.253)', '6371 (10.0.1.80)', '7319 (10.0.58.90)', '6493 (10.0.26.229)']\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 2023-06-13 16:05:24,966\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001b[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m Global seed set to 888\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m GPU available: True, used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001b[0m Missing logger folder: logs/lightning_logs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001b[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m | Name | Type | Params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m -------------------------------------------------\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0 | linear_relu_stack | Sequential | 101 K \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 1 | accuracy | Accuracy | 0 \n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m -------------------------------------------------\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 101 K Trainable params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0 Non-trainable params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 101 K Total params\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=17232)\u001b[0m 0.407 Total estimated model params size (MB)\n" + "\u001B[2m\u001B[36m(pid=16995)\u001B[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from 
pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "\u001B[2m\u001B[36m(pid=16995)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=16995)\u001B[0m 2023-06-13 16:05:24,007\tINFO backend_executor.py:137 -- Starting distributed worker processes: ['17232 (10.0.28.253)', '6371 (10.0.1.80)', '7319 (10.0.58.90)', '6493 (10.0.26.229)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 2023-06-13 16:05:24,966\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001B[0m /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=7319, ip=10.0.58.90)\u001B[0m from pandas import MultiIndex, Int64Index\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m Global seed set to 888\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m GPU available: True, used: True\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m TPU available: False, using: 0 TPU cores\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001B[0m Missing logger folder: logs/lightning_logs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=6371, ip=10.0.1.80)\u001B[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m | Name | Type | Params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m -------------------------------------------------\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0 | linear_relu_stack | Sequential | 101 K \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 1 | accuracy | Accuracy | 0 \n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m -------------------------------------------------\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 101 K Trainable params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0 Non-trainable params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 101 K Total params\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=17232)\u001B[0m 0.407 Total estimated model params size (MB)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Sanity Checking: 0it [00:00, ?it/s]\u001b[0m \n", + "Sanity Checking: 0it [00:00, ?it/s]\u001B[0m \n", "Sanity Checking 
DataLoader 0: 0%| | 0/2 [00:00 TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. 
To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Starting distributed worker processes: ['134267 (10.0.55.20)', '74152 (10.0.63.141)', '75476 (10.0.51.205)', '75547 (10.0.42.158)', '74711 (10.0.45.211)', '75132 (10.0.20.140)', '74502 (10.0.60.86)', '75695 (10.0.53.69)', '74457 (10.0.47.2)', '74569 (10.0.33.23)', '74341 (10.0.29.61)', '74274 (10.0.36.152)', '74561 (10.0.35.16)', '74427 (10.0.16.236)', '74273 (10.0.54.55)', '74996 (10.0.9.249)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m Setting up process group for: env:// [rank=0, world_size=16]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -578,16 +578,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n", - "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, 
ip=10.0.54.55)\u001b[0m \n", - "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001b[0m \n" + "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 8.86MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n", + "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 18.2MB/s]ansform_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n", + "Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.33MB/s]m_pandas) pid=74329, ip=10.0.54.55)\u001B[0m \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001b[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + "\u001B[2m\u001B[36m(RayTrainWorker pid=74152, ip=10.0.63.141)\u001B[0m [2023-06-30 17:39:54,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" ] }, { @@ -596,22 +596,22 @@ "text": [ "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.86MB/s]\n", "Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 7.57MB/s]\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m GPU available: True (cuda), used: True\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m GPU available: True (cuda), used: True\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m TPU available: False, using: 0 TPU cores\n", + 
"\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.\n", "Downloading tokenizer.model: 0%| | 0.00/500k [00:00=0.11 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn->ax-platform==0.2.4) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn->ax-platform==0.2.4) (3.0.0)\n", "Requirement already satisfied: typing-extensions in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from torch>=1.9->botorch==0.6.2->ax-platform==0.2.4) (4.1.1)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], diff --git a/doc/source/tune/examples/bayesopt_example.ipynb b/doc/source/tune/examples/bayesopt_example.ipynb index a46ace8d22a38..88bc85a33341a 100644 --- a/doc/source/tune/examples/bayesopt_example.ipynb +++ b/doc/source/tune/examples/bayesopt_example.ipynb @@ -33,8 +33,8 @@ "Requirement already satisfied: scipy>=0.14.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from bayesian-optimization==1.2.0) (1.4.1)\n", "Requirement already satisfied: joblib>=0.11 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn>=0.18.0->bayesian-optimization==1.2.0) (1.1.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from scikit-learn>=0.18.0->bayesian-optimization==1.2.0) (3.0.0)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - 
"\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], diff --git a/doc/source/tune/examples/optuna_example.ipynb b/doc/source/tune/examples/optuna_example.ipynb index 2b2d7f36fd535..b82ce5485c74e 100644 --- a/doc/source/tune/examples/optuna_example.ipynb +++ b/doc/source/tune/examples/optuna_example.ipynb @@ -57,8 +57,8 @@ "Requirement already satisfied: wcwidth>=0.1.7 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from cmd2>=1.0.0->cliff->optuna==2.9.1) (0.2.5)\n", "Requirement already satisfied: zipp>=0.5 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from importlib-metadata->sqlalchemy>=1.1.0->optuna==2.9.1) (3.7.0)\n", "Requirement already satisfied: MarkupSafe>=0.9.2 in /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages (from Mako->alembic->optuna==2.9.1) (2.0.1)\n", - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\n", + "\u001B[0m" ] } ], @@ -330,7 +330,7 @@ "output_type": "stream", "text": [ "Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. 
To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n", - "\u001b[32m[I 2022-07-22 15:21:47,769]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:21:47,769]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] }, { @@ -1293,7 +1293,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:22:32,644]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001B[32m[I 2022-07-22 15:22:32,644]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n", "/Users/kai/coding/ray/python/ray/tune/search/optuna/optuna_search.py:389: ExperimentalWarning: enqueue_trial is experimental (supported from v1.2.0). The interface can change in the future.\n", " self._ot_study.enqueue_trial(point)\n", "/Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/optuna/study/study.py:857: ExperimentalWarning: create_trial is experimental (supported from v2.0.0). 
The interface can change in the future.\n", @@ -2263,7 +2263,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:23:15,784]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:23:15,784]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] } ], @@ -3230,7 +3230,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2022-07-22 15:26:50,680]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "\u001B[32m[I 2022-07-22 15:26:50,680]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n" ] }, { diff --git a/doc/source/tune/examples/tune-aim.ipynb b/doc/source/tune/examples/tune-aim.ipynb index ad180658faa83..5a648e9777069 100644 --- a/doc/source/tune/examples/tune-aim.ipynb +++ b/doc/source/tune/examples/tune-aim.ipynb @@ -1,407 +1,407 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "ecad719c", - "metadata": {}, - "source": [ - "(tune-aim-ref)=\n", - "\n", - "# Using Aim with Tune\n", - "\n", - "[Aim](https://aimstack.io) is an easy-to-use and supercharged open-source experiment tracker.\n", - "Aim logs your training runs, enables a well-designed UI to compare them, and provides an API to query them programmatically.\n", - "\n", - "```{image} /images/aim_logo_full.png\n", - ":align: center\n", - ":alt: Aim\n", - ":width: 100%\n", - ":target: https://aimstack.io\n", - "```\n", - "\n", - "Ray Tune currently offers built-in integration with Aim.\n", - "The {ref}`AimLoggerCallback ` automatically logs metrics that are reported to Tune by using the Aim API.\n", - "\n", - "\n", - "```{contents}\n", - ":backlinks: none\n", - ":local: true\n", - "```\n", - "\n", - "## Logging Tune Hyperparameter Configurations and Results to Aim\n", - "\n", - "The following example demonstrates how the `AimLoggerCallback` can be used in a Tune experiment.\n", - "Begin by installing and importing the necessary 
modules:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1290b5b5", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install aim\n", - "%pip install ray[tune]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "100bcf8a", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "import ray\n", - "from ray import air, tune\n", - "from ray.air import session\n", - "from ray.tune.logger.aim import AimLoggerCallback" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9346c0f6", - "metadata": {}, - "source": [ - "Next, define a simple `train_function`, which is a [`Trainable`](trainable-docs) that reports a loss to Tune.\n", - "The objective function itself is not important for this example, as our main focus is on the integration with Aim." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "e8b4fc4d", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "def train_function(config):\n", - " for _ in range(50):\n", - " loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n", - " session.report({\"loss\": loss})" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "831eed42", - "metadata": {}, - "source": [ - "Here is an example of how you can use the `AimLoggerCallback` with simple grid-search Tune experiment.\n", - "The logger will log each of the 9 grid-search trials as separate Aim runs." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "52988599", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-02-07 00:04:11,228\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "

Tune Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Current time:2023-02-07 00:04:19
Running for: 00:00:06.86
Memory: 32.8/64.0 GiB
\n", - "
\n", - "
\n", - "
\n", - "

System Info

\n", - " Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/26.93 GiB heap, 0.0/2.0 GiB objects\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "

Trial Status

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name status loc mean sd iter total time (s) loss
train_function_01a3b_00000TERMINATED127.0.0.1:10277 10.385428 50 4.480311.01928
train_function_01a3b_00001TERMINATED127.0.0.1:10296 20.819716 50 2.972723.01491
train_function_01a3b_00002TERMINATED127.0.0.1:10301 30.769197 50 2.395723.87155
train_function_01a3b_00003TERMINATED127.0.0.1:10307 40.29466 50 2.415684.1507
train_function_01a3b_00004TERMINATED127.0.0.1:10313 50.152208 50 1.683835.10225
train_function_01a3b_00005TERMINATED127.0.0.1:10321 60.879814 50 1.540156.20238
train_function_01a3b_00006TERMINATED127.0.0.1:10329 70.487499 50 1.447067.79551
train_function_01a3b_00007TERMINATED127.0.0.1:10333 80.639783 50 1.4261 7.94189
train_function_01a3b_00008TERMINATED127.0.0.1:10341 90.12285 50 1.077018.82304
\n", - "
\n", - "
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "

Trial Progress

\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Trial name date done episodes_total experiment_id experiment_tag hostname iterations_since_restore lossnode_ip pid time_since_restore time_this_iter_s time_total_s timestamp timesteps_since_restoretimesteps_total training_iterationtrial_id warmup_time
train_function_01a3b_000002023-02-07_00-04-18True c8447fdceea6436c9edd6f030a5b1d820_mean=1,sd=0.3854Justins-MacBook-Pro-16 501.01928127.0.0.110277 4.48031 0.013865 4.48031 1675757058 0 5001a3b_00000 0.00264072
train_function_01a3b_000012023-02-07_00-04-18True 7dd6d3ee24244a0885b354c2850647281_mean=2,sd=0.8197Justins-MacBook-Pro-16 503.01491127.0.0.110296 2.97272 0.0584073 2.97272 1675757058 0 5001a3b_00001 0.0316792
train_function_01a3b_000022023-02-07_00-04-18True e3da49ebad034c4b8fdaf0aa87927b1a2_mean=3,sd=0.7692Justins-MacBook-Pro-16 503.87155127.0.0.110301 2.39572 0.0695491 2.39572 1675757058 0 5001a3b_00002 0.0315411
train_function_01a3b_000032023-02-07_00-04-18True 95c60c4f67c4481ebccff25b0a49e75d3_mean=4,sd=0.2947Justins-MacBook-Pro-16 504.1507 127.0.0.110307 2.41568 0.0175381 2.41568 1675757058 0 5001a3b_00003 0.0310779
train_function_01a3b_000042023-02-07_00-04-18True a216253cb41e47caa229e65488deb0194_mean=5,sd=0.1522Justins-MacBook-Pro-16 505.10225127.0.0.110313 1.68383 0.064441 1.68383 1675757058 0 5001a3b_00004 0.00450182
train_function_01a3b_000052023-02-07_00-04-18True 23834104277f476cb99d9c696281fceb5_mean=6,sd=0.8798Justins-MacBook-Pro-16 506.20238127.0.0.110321 1.54015 0.00910306 1.54015 1675757058 0 5001a3b_00005 0.0480251
train_function_01a3b_000062023-02-07_00-04-18True 15f650121df747c3bd2720481d47b2656_mean=7,sd=0.4875Justins-MacBook-Pro-16 507.79551127.0.0.110329 1.44706 0.00600386 1.44706 1675757058 0 5001a3b_00006 0.00202489
train_function_01a3b_000072023-02-07_00-04-19True 78b1673cf2034ed99135b80a0cb31e0e7_mean=8,sd=0.6398Justins-MacBook-Pro-16 507.94189127.0.0.110333 1.4261 0.00225306 1.4261 1675757059 0 5001a3b_00007 0.00209713
train_function_01a3b_000082023-02-07_00-04-19True c7f5d86154cb46b6aa27bef523edcd6f8_mean=9,sd=0.1228Justins-MacBook-Pro-16 508.82304127.0.0.110341 1.07701 0.00291467 1.07701 1675757059 0 5001a3b_00008 0.00240111
\n", - "
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-02-07 00:04:19,366\tINFO tune.py:798 -- Total run time: 7.38 seconds (6.85 seconds for the tuning loop).\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tuner = tune.Tuner(\n", - " train_function,\n", - " run_config=air.RunConfig(\n", - " callbacks=[AimLoggerCallback()],\n", - " storage_path=\"/tmp/ray_results\",\n", - " name=\"aim_example\",\n", - " ),\n", - " param_space={\n", - " \"mean\": tune.grid_search([1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", - " \"sd\": tune.uniform(0.1, 0.9),\n", - " },\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"loss\",\n", - " mode=\"min\",\n", - " ),\n", - ")\n", - "tuner.fit()\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "941f25f2", - "metadata": {}, - "source": [ - "When the script executes, a grid-search is carried out and the results are saved to the Aim repo,\n", - "stored at the default location -- the experiment log directory (in this case, it's at `/tmp/ray_results/aim_example`).\n", - "\n", - "### More Configuration Options for Aim\n", - "\n", - "In the example above, we used the default configuration for the `AimLoggerCallback`.\n", - "There are a few options that can be configured as arguments to the callback. For example,\n", - "setting `AimLoggerCallback(repo=\"/path/to/repo\")` will log results to the Aim repo at that\n", - "filepath, which could be useful if you have a central location where the results of multiple\n", - "Tune experiments are stored. Relative paths to the working directory where Tune script is\n", - "launched can be used as well. By default, the repo will be set to the experiment log\n", - "directory. 
See [the API reference](tune-aim-logger) for more configurations.\n", - "\n", - "## Launching the Aim UI\n", - "\n", - "Now that we have logged our results to the Aim repository, we can view it in Aim's web UI.\n", - "To do this, we first find the directory where the Aim repository lives, then we use\n", - "the Aim CLI to launch the web interface." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "880f55aa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------------------------------------\n", - " Aim UI collects anonymous usage analytics. \n", - " Read how to opt-out here: \n", - " https://aimstack.readthedocs.io/en/latest/community/telemetry.html \n", - "--------------------------------------------------------------------------\n", - "\u001b[33mRunning Aim UI on repo ``\u001b[0m\n", - "Open http://127.0.0.1:43800\n", - "Press Ctrl+C to exit\n", - "^C\n" - ] - } - ], - "source": [ - "# Uncomment the following line to launch the Aim UI!\n", - "#!aim up --repo=/tmp/ray_results/aim_example" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "adbe661a", - "metadata": {}, - "source": [ - "After launching the Aim UI, we can open the web interface at `localhost:43800`." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7bb97157", - "metadata": {}, - "source": [ - "```{image} /images/aim_example_metrics_page.png\n", - ":align: center\n", - ":alt: Aim Metrics Explorer\n", - ":target: https://aimstack.readthedocs.io/en/latest/ui/pages/explorers.html#metrics-explorer\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2f6e9138", - "metadata": {}, - "source": [ - "The next sections contain more in-depth information on the API of the Tune-Aim integration.\n", - "\n", - "## Tune Aim Logger API\n", - "\n", - "(tune-aim-logger)=\n", - "\n", - "```{eval-rst}\n", - ".. 
autoclass:: ray.tune.logger.aim.AimLoggerCallback\n", - " :noindex:\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "0ebd1904", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ray_dev_py38", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "orphan": true, - "vscode": { - "interpreter": { - "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" - } - } + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "ecad719c", + "metadata": {}, + "source": [ + "(tune-aim-ref)=\n", + "\n", + "# Using Aim with Tune\n", + "\n", + "[Aim](https://aimstack.io) is an easy-to-use and supercharged open-source experiment tracker.\n", + "Aim logs your training runs, enables a well-designed UI to compare them, and provides an API to query them programmatically.\n", + "\n", + "```{image} /images/aim_logo_full.png\n", + ":align: center\n", + ":alt: Aim\n", + ":width: 100%\n", + ":target: https://aimstack.io\n", + "```\n", + "\n", + "Ray Tune currently offers built-in integration with Aim.\n", + "The {ref}`AimLoggerCallback ` automatically logs metrics that are reported to Tune by using the Aim API.\n", + "\n", + "\n", + "```{contents}\n", + ":backlinks: none\n", + ":local: true\n", + "```\n", + "\n", + "## Logging Tune Hyperparameter Configurations and Results to Aim\n", + "\n", + "The following example demonstrates how the `AimLoggerCallback` can be used in a Tune experiment.\n", + "Begin by installing and importing the necessary modules:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1290b5b5", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install aim\n", + "%pip install ray[tune]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "id": "100bcf8a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "import ray\n", + "from ray import air, tune\n", + "from ray.air import session\n", + "from ray.tune.logger.aim import AimLoggerCallback" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9346c0f6", + "metadata": {}, + "source": [ + "Next, define a simple `train_function`, which is a [`Trainable`](trainable-docs) that reports a loss to Tune.\n", + "The objective function itself is not important for this example, as our main focus is on the integration with Aim." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e8b4fc4d", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def train_function(config):\n", + " for _ in range(50):\n", + " loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n", + " session.report({\"loss\": loss})" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "831eed42", + "metadata": {}, + "source": [ + "Here is an example of how you can use the `AimLoggerCallback` with simple grid-search Tune experiment.\n", + "The logger will log each of the 9 grid-search trials as separate Aim runs." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "52988599", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-02-07 00:04:11,228\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-02-07 00:04:19
Running for: 00:00:06.86
Memory: 32.8/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/26.93 GiB heap, 0.0/2.0 GiB objects\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc mean sd iter total time (s) loss
train_function_01a3b_00000TERMINATED127.0.0.1:10277 10.385428 50 4.480311.01928
train_function_01a3b_00001TERMINATED127.0.0.1:10296 20.819716 50 2.972723.01491
train_function_01a3b_00002TERMINATED127.0.0.1:10301 30.769197 50 2.395723.87155
train_function_01a3b_00003TERMINATED127.0.0.1:10307 40.29466 50 2.415684.1507
train_function_01a3b_00004TERMINATED127.0.0.1:10313 50.152208 50 1.683835.10225
train_function_01a3b_00005TERMINATED127.0.0.1:10321 60.879814 50 1.540156.20238
train_function_01a3b_00006TERMINATED127.0.0.1:10329 70.487499 50 1.447067.79551
train_function_01a3b_00007TERMINATED127.0.0.1:10333 80.639783 50 1.4261 7.94189
train_function_01a3b_00008TERMINATED127.0.0.1:10341 90.12285 50 1.077018.82304
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "

Trial Progress

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name date done episodes_total experiment_id experiment_tag hostname iterations_since_restore lossnode_ip pid time_since_restore time_this_iter_s time_total_s timestamp timesteps_since_restoretimesteps_total training_iterationtrial_id warmup_time
train_function_01a3b_000002023-02-07_00-04-18True c8447fdceea6436c9edd6f030a5b1d820_mean=1,sd=0.3854Justins-MacBook-Pro-16 501.01928127.0.0.110277 4.48031 0.013865 4.48031 1675757058 0 5001a3b_00000 0.00264072
train_function_01a3b_000012023-02-07_00-04-18True 7dd6d3ee24244a0885b354c2850647281_mean=2,sd=0.8197Justins-MacBook-Pro-16 503.01491127.0.0.110296 2.97272 0.0584073 2.97272 1675757058 0 5001a3b_00001 0.0316792
train_function_01a3b_000022023-02-07_00-04-18True e3da49ebad034c4b8fdaf0aa87927b1a2_mean=3,sd=0.7692Justins-MacBook-Pro-16 503.87155127.0.0.110301 2.39572 0.0695491 2.39572 1675757058 0 5001a3b_00002 0.0315411
train_function_01a3b_000032023-02-07_00-04-18True 95c60c4f67c4481ebccff25b0a49e75d3_mean=4,sd=0.2947Justins-MacBook-Pro-16 504.1507 127.0.0.110307 2.41568 0.0175381 2.41568 1675757058 0 5001a3b_00003 0.0310779
train_function_01a3b_000042023-02-07_00-04-18True a216253cb41e47caa229e65488deb0194_mean=5,sd=0.1522Justins-MacBook-Pro-16 505.10225127.0.0.110313 1.68383 0.064441 1.68383 1675757058 0 5001a3b_00004 0.00450182
train_function_01a3b_000052023-02-07_00-04-18True 23834104277f476cb99d9c696281fceb5_mean=6,sd=0.8798Justins-MacBook-Pro-16 506.20238127.0.0.110321 1.54015 0.00910306 1.54015 1675757058 0 5001a3b_00005 0.0480251
train_function_01a3b_000062023-02-07_00-04-18True 15f650121df747c3bd2720481d47b2656_mean=7,sd=0.4875Justins-MacBook-Pro-16 507.79551127.0.0.110329 1.44706 0.00600386 1.44706 1675757058 0 5001a3b_00006 0.00202489
train_function_01a3b_000072023-02-07_00-04-19True 78b1673cf2034ed99135b80a0cb31e0e7_mean=8,sd=0.6398Justins-MacBook-Pro-16 507.94189127.0.0.110333 1.4261 0.00225306 1.4261 1675757059 0 5001a3b_00007 0.00209713
train_function_01a3b_000082023-02-07_00-04-19True c7f5d86154cb46b6aa27bef523edcd6f8_mean=9,sd=0.1228Justins-MacBook-Pro-16 508.82304127.0.0.110341 1.07701 0.00291467 1.07701 1675757059 0 5001a3b_00008 0.00240111
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-02-07 00:04:19,366\tINFO tune.py:798 -- Total run time: 7.38 seconds (6.85 seconds for the tuning loop).\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuner = tune.Tuner(\n", + " train_function,\n", + " run_config=air.RunConfig(\n", + " callbacks=[AimLoggerCallback()],\n", + " storage_path=\"/tmp/ray_results\",\n", + " name=\"aim_example\",\n", + " ),\n", + " param_space={\n", + " \"mean\": tune.grid_search([1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", + " \"sd\": tune.uniform(0.1, 0.9),\n", + " },\n", + " tune_config=tune.TuneConfig(\n", + " metric=\"loss\",\n", + " mode=\"min\",\n", + " ),\n", + ")\n", + "tuner.fit()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "941f25f2", + "metadata": {}, + "source": [ + "When the script executes, a grid-search is carried out and the results are saved to the Aim repo,\n", + "stored at the default location -- the experiment log directory (in this case, it's at `/tmp/ray_results/aim_example`).\n", + "\n", + "### More Configuration Options for Aim\n", + "\n", + "In the example above, we used the default configuration for the `AimLoggerCallback`.\n", + "There are a few options that can be configured as arguments to the callback. For example,\n", + "setting `AimLoggerCallback(repo=\"/path/to/repo\")` will log results to the Aim repo at that\n", + "filepath, which could be useful if you have a central location where the results of multiple\n", + "Tune experiments are stored. Relative paths to the working directory where Tune script is\n", + "launched can be used as well. By default, the repo will be set to the experiment log\n", + "directory. 
See [the API reference](tune-aim-logger) for more configurations.\n", + "\n", + "## Launching the Aim UI\n", + "\n", + "Now that we have logged our results to the Aim repository, we can view it in Aim's web UI.\n", + "To do this, we first find the directory where the Aim repository lives, then we use\n", + "the Aim CLI to launch the web interface." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "880f55aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------\n", + " Aim UI collects anonymous usage analytics. \n", + " Read how to opt-out here: \n", + " https://aimstack.readthedocs.io/en/latest/community/telemetry.html \n", + "--------------------------------------------------------------------------\n", + "\u001B[33mRunning Aim UI on repo ``\u001B[0m\n", + "Open http://127.0.0.1:43800\n", + "Press Ctrl+C to exit\n", + "^C\n" + ] + } + ], + "source": [ + "# Uncomment the following line to launch the Aim UI!\n", + "#!aim up --repo=/tmp/ray_results/aim_example" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "adbe661a", + "metadata": {}, + "source": [ + "After launching the Aim UI, we can open the web interface at `localhost:43800`." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7bb97157", + "metadata": {}, + "source": [ + "```{image} /images/aim_example_metrics_page.png\n", + ":align: center\n", + ":alt: Aim Metrics Explorer\n", + ":target: https://aimstack.readthedocs.io/en/latest/ui/pages/explorers.html#metrics-explorer\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f6e9138", + "metadata": {}, + "source": [ + "The next sections contain more in-depth information on the API of the Tune-Aim integration.\n", + "\n", + "## Tune Aim Logger API\n", + "\n", + "(tune-aim-logger)=\n", + "\n", + "```{eval-rst}\n", + ".. 
autoclass:: ray.tune.logger.aim.AimLoggerCallback\n", + " :noindex:\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "0ebd1904", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ray_dev_py38", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/doc/source/tune/examples/tune-mlflow.ipynb b/doc/source/tune/examples/tune-mlflow.ipynb index 2e32a3f2e8491..5e8524e6c523c 100644 --- a/doc/source/tune/examples/tune-mlflow.ipynb +++ b/doc/source/tune/examples/tune-mlflow.ipynb @@ -253,7 +253,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-12-22 10:37:53,580\tINFO worker.py:1542 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2022-12-22 10:37:53,580\tINFO worker.py:1542 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n" ] }, { diff --git a/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb b/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb index d34e2860ab649..b078ca2f975fb 100644 --- a/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb +++ b/doc/source/tune/examples/tune-vanilla-pytorch-lightning.ipynb @@ -798,39 +798,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m GPU available: False, used: False\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m TPU available: False, using: 0 TPU cores\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m IPU available: False, using: 0 IPUs\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m HPU available: False, using: 0 HPUs\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:336: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. 
Please use the `on_exception` callback hook instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \"The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7.\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:348: LightningDeprecationWarning: The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \"The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:351: LightningDeprecationWarning: The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m rank_zero_deprecation(\"The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\")\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on_train_batch_start` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_end` hook was deprecated in v1.6 and will be removed in v1.8. 
Please use `Callback.on_train_batch_end` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_start` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_end` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_end` instead.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m | Name | Type | Params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m -----------------------------------\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0 | layer_1 | Linear | 100 K \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 1 | layer_2 | Linear | 16.5 K\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 2 | layer_3 | Linear | 1.3 K \n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m -----------------------------------\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 118 K Trainable params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0 Non-trainable params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 118 K Total 
params\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m 0.473 Total estimated model params size (MB)\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m category=PossibleUserWarning,\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", - "\u001b[2m\u001b[36m(train_mnist_tune pid=52355)\u001b[0m category=PossibleUserWarning,\n" + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m GPU available: False, used: False\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m TPU available: False, using: 0 TPU cores\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m IPU available: False, using: 0 IPUs\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m HPU available: False, using: 0 HPUs\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:336: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. 
Please use the `on_exception` callback hook instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \"The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7.\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:348: LightningDeprecationWarning: The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \"The `on_init_start` callback hook was deprecated in v1.6 and will be removed in v1.8.\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:351: LightningDeprecationWarning: The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m rank_zero_deprecation(\"The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.\")\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on_train_batch_start` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:377: LightningDeprecationWarning: The `Callback.on_batch_end` hook was deprecated in v1.6 and will be removed in v1.8. 
Please use `Callback.on_train_batch_end` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_start` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_start` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:386: LightningDeprecationWarning: The `Callback.on_epoch_end` hook was deprecated in v1.6 and will be removed in v1.8. Please use `Callback.on__epoch_end` instead.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m f\"The `Callback.{hook}` hook was deprecated in v1.6 and\"\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m | Name | Type | Params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m -----------------------------------\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0 | layer_1 | Linear | 100 K \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 1 | layer_2 | Linear | 16.5 K\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 2 | layer_3 | Linear | 1.3 K \n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m -----------------------------------\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 118 K Trainable params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0 Non-trainable params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 118 K Total 
params\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m 0.473 Total estimated model params size (MB)\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m category=PossibleUserWarning,\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m /Users/kai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:245: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", + "\u001B[2m\u001B[36m(train_mnist_tune pid=52355)\u001B[0m category=PossibleUserWarning,\n" ] }, { @@ -1178,11 +1178,11 @@ "evalue": "__init__() got an unexpected keyword argument 'tune_mnist_pbt'", "output_type": "error", "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/1146224506.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtune_mnist_asha\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus_per_trial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mtune_mnist_pbt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus_per_trial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/328169407.py\u001b[0m in \u001b[0;36mtune_mnist_pbt\u001b[0;34m(num_samples, num_epochs, gpus_per_trial, data_dir)\u001b[0m\n\u001b[1;32m 38\u001b[0m run_config=air.RunConfig(\n\u001b[1;32m 39\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"tune_mnist_asha\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0mtune_mnist_pbt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreporter\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m ),\n\u001b[1;32m 42\u001b[0m \u001b[0mparam_space\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'tune_mnist_pbt'" + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mTypeError\u001B[0m Traceback (most recent call last)", + 
"\u001B[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/1146224506.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 3\u001B[0m \u001B[0mtune_mnist_asha\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mnum_samples\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnum_epochs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m6\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgpus_per_trial\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdata_dir\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mdata_dir\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 4\u001B[0;31m \u001B[0mtune_mnist_pbt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mnum_samples\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnum_epochs\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m6\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgpus_per_trial\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdata_dir\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mdata_dir\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", + "\u001B[0;32m/var/folders/b2/0_91bd757rz02lrmr920v0gw0000gn/T/ipykernel_52122/328169407.py\u001B[0m in \u001B[0;36mtune_mnist_pbt\u001B[0;34m(num_samples, num_epochs, gpus_per_trial, data_dir)\u001B[0m\n\u001B[1;32m 38\u001B[0m run_config=air.RunConfig(\n\u001B[1;32m 39\u001B[0m \u001B[0mname\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m\"tune_mnist_asha\"\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 40\u001B[0;31m \u001B[0mtune_mnist_pbt\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mreporter\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 41\u001B[0m ),\n\u001B[1;32m 42\u001B[0m 
\u001B[0mparam_space\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mconfig\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;31mTypeError\u001B[0m: __init__() got an unexpected keyword argument 'tune_mnist_pbt'" ] } ], diff --git a/doc/source/tune/examples/tune-wandb.ipynb b/doc/source/tune/examples/tune-wandb.ipynb index 7ff95bedb5661..b0faa529eedeb 100644 --- a/doc/source/tune/examples/tune-wandb.ipynb +++ b/doc/source/tune/examples/tune-wandb.ipynb @@ -300,7 +300,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-11-02 16:02:45,355\tINFO worker.py:1534 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266 \u001b[39m\u001b[22m\n", + "2022-11-02 16:02:45,355\tINFO worker.py:1534 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8266 \u001B[39m\u001B[22m\n", "2022-11-02 16:02:46,513\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, @@ -508,7 +508,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_function_wandb pid=14647)\u001b[0m 2022-11-02 16:03:17,149\tINFO wandb.py:282 -- Already logged into W&B.\n" + "\u001B[2m\u001B[36m(train_function_wandb pid=14647)\u001B[0m 2022-11-02 16:03:17,149\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, { @@ -554,10 +554,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_function_wandb pid=14660)\u001b[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14661)\u001b[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14663)\u001b[0m 2022-11-02 16:03:20,628\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(train_function_wandb pid=14662)\u001b[0m 2022-11-02 16:03:20,723\tINFO wandb.py:282 -- Already logged into W&B.\n", + 
"\u001B[2m\u001B[36m(train_function_wandb pid=14660)\u001B[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14661)\u001B[0m 2022-11-02 16:03:20,600\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14663)\u001B[0m 2022-11-02 16:03:20,628\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(train_function_wandb pid=14662)\u001B[0m 2022-11-02 16:03:20,723\tINFO wandb.py:282 -- Already logged into W&B.\n", "2022-11-02 16:03:22,565\tINFO tune.py:788 -- Total run time: 8.60 seconds (8.48 seconds for the tuning loop).\n" ] }, @@ -642,7 +642,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(WandbTrainable pid=14718)\u001b[0m 2022-11-02 16:03:25,742\tINFO wandb.py:282 -- Already logged into W&B.\n" + "\u001B[2m\u001B[36m(WandbTrainable pid=14718)\u001B[0m 2022-11-02 16:03:25,742\tINFO wandb.py:282 -- Already logged into W&B.\n" ] }, { @@ -688,10 +688,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(WandbTrainable pid=14739)\u001b[0m 2022-11-02 16:03:30,360\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14740)\u001b[0m 2022-11-02 16:03:30,393\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14737)\u001b[0m 2022-11-02 16:03:30,454\tINFO wandb.py:282 -- Already logged into W&B.\n", - "\u001b[2m\u001b[36m(WandbTrainable pid=14738)\u001b[0m 2022-11-02 16:03:30,510\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14739)\u001B[0m 2022-11-02 16:03:30,360\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14740)\u001B[0m 2022-11-02 16:03:30,393\tINFO wandb.py:282 -- Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14737)\u001B[0m 2022-11-02 16:03:30,454\tINFO wandb.py:282 -- 
Already logged into W&B.\n", + "\u001B[2m\u001B[36m(WandbTrainable pid=14738)\u001B[0m 2022-11-02 16:03:30,510\tINFO wandb.py:282 -- Already logged into W&B.\n", "2022-11-02 16:03:31,985\tINFO tune.py:788 -- Total run time: 9.40 seconds (9.27 seconds for the tuning loop).\n" ] }, diff --git a/python/ray/air/examples/lightgbm_example.ipynb b/python/ray/air/examples/lightgbm_example.ipynb deleted file mode 120000 index 6501466a10a1d..0000000000000 --- a/python/ray/air/examples/lightgbm_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/lightgbm_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/lightgbm_example.ipynb b/python/ray/air/examples/lightgbm_example.ipynb new file mode 100644 index 0000000000000..3280a55a77440 --- /dev/null +++ b/python/ray/air/examples/lightgbm_example.ipynb @@ -0,0 +1,503 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "0d385409", + "metadata": {}, + "source": [ + "(air-lightgbm-example-ref)=\n", + "\n", + "# Training a model with distributed LightGBM\n", + "In this example we will train a model in Ray AIR using distributed LightGBM." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "07d92cee", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "86131abe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" + ] + } + ], + "source": [ + "!pip install -qU \"ray[tune]\" lightgbm_ray" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "135fc884", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "102ef1ac", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2023-07-07 14:34:14,951\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", + "2023-07-07 14:34:15,892\tINFO util.py:159 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "from typing import Tuple\n", + "\n", + "import ray\n", + "from ray.train.lightgbm import LightGBMPredictor\n", + "from ray.data.preprocessors.chain import Chain\n", + "from ray.data.preprocessors.encoder import Categorizer\n", + "from ray.train.lightgbm import LightGBMTrainer\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.data import Dataset\n", + "from ray.data.preprocessors import StandardScaler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7d102bd", + "metadata": {}, + "source": [ + "Next we define a function to load our train, validation, and test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f1f35cd7", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns(cols=[\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8f7afbce", + "metadata": {}, + "source": [ + "The following function will create a LightGBM trainer, train it, and return the result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fefcbc8a", + "metadata": {}, + "outputs": [], + "source": [ + "def train_lightgbm(num_workers: int, use_gpu: bool = False) -> Result:\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns, and categorify the categorical_column,\n", + " # allowing LightGBM to use its built-in categorical feature support\n", + " preprocessor = Chain(\n", + " Categorizer([\"categorical_column\"]), \n", + " StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n", + " )\n", + "\n", + " # LightGBM specific params\n", + " params = {\n", + " \"objective\": \"binary\",\n", + " \"metric\": [\"binary_logloss\", \"binary_error\"],\n", + " }\n", + "\n", + " trainer = LightGBMTrainer(\n", + " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", + " label_column=\"target\",\n", + " params=params,\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " num_boost_round=100,\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "04d278ae", + "metadata": {}, + "source": [ + "Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3f1d0c19", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from ray.train import Checkpoint\n", + "from ray.data import ActorPoolStrategy\n", + "\n", + "\n", + "class Predict:\n", + "\n", + " def __init__(self, checkpoint: Checkpoint):\n", + " self.predictor = LightGBMPredictor.from_checkpoint(checkpoint)\n", + "\n", + " def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n", + " return self.predictor.predict(batch)\n", + "\n", + "\n", + "def predict_lightgbm(result: Result):\n", + " _, _, test_dataset = prepare_data()\n", + "\n", + " scores = test_dataset.map_batches(\n", + " Predict, \n", + " fn_constructor_args=[result.checkpoint], \n", + " compute=ActorPoolStrategy(), \n", + " batch_format=\"pandas\"\n", + " )\n", + " \n", + " predicted_labels = scores.map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2bb0e5df", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8244ff3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-07-07 14:34:34
Running for: 00:00:06.06
Memory: 12.2/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/10 CPUs, 0/0 GPUs\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-binary_logloss train-binary_error valid-binary_logloss
LightGBMTrainer_0c5ae_00000TERMINATED127.0.0.1:10027 101 4.5829 0.000202293 0 0.130232
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(get_pd_value_counts)]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)] -> AllToAllOperator[Aggregate]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + " \n", + "\u001B[A\n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "(pid=10027) Running: 
0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:00 TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)->MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=10027) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 7%|▋ | 1/14 [00:00<00:01, 7.59it/s]\n", + "\u001B[A \n", + "\n", + "\u001B[A\u001B[A \n", + "\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=10027) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 7%|▋ | 1/14 [00:00<00:01, 6.59it/s]\n", + "\u001B[A \n", + "\n", + "\u001B[A\u001B[A \n", + "\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\n", + "\u001B[A\n", + "\n", + " \n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(Categorizer._transform_pandas)->MapBatches(StandardScaler._transform_pandas)]\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightGBMTrainer pid=10027)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Trying to bind port 51134...\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Binding port 51134 succeeded\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Listening...\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m [LightGBM] [Warning] Connecting to rank 1 failed, waiting for 200 milliseconds\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Connected to rank 0\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Info] Local rank: 1, total number of machines: 2\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10063)\u001B[0m [LightGBM] [Warning] num_threads is set=2, n_jobs=-1 will be ignored. Current value: num_threads=2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m /Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/lightgbm/basic.py:1780: UserWarning: Overriding the parameters from Reference Dataset.\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m _log_warning('Overriding the parameters from Reference Dataset.')\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m /Users/balaji/Documents/GitHub/ray/.venv/lib/python3.11/site-packages/lightgbm/basic.py:1513: UserWarning: categorical_column in param dict is overridden.\n", + "\u001B[2m\u001B[36m(_RemoteRayLightGBMActor pid=10062)\u001B[0m _log_warning(f'{cat_alias} in param dict is overridden.')\n", + "2023-07-07 14:34:34,087\tINFO tune.py:1148 -- Total run time: 7.18 seconds (6.05 seconds for the tuning loop).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'train-binary_logloss': 0.00020229312743896637, 'train-binary_error': 0.0, 'valid-binary_logloss': 
0.13023245107091222, 'valid-binary_error': 0.023529411764705882, 'time_this_iter_s': 0.021785974502563477, 'should_checkpoint': True, 'done': True, 'training_iteration': 101, 'trial_id': '0c5ae_00000', 'date': '2023-07-07_14-34-34', 'timestamp': 1688765674, 'time_total_s': 4.582904100418091, 'pid': 10027, 'hostname': 'Balajis-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 4.582904100418091, 'iterations_since_restore': 101, 'experiment_tag': '0'}\n" + ] + } + ], + "source": [ + "result = train_lightgbm(num_workers=2, use_gpu=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d7155d9b", + "metadata": {}, + "source": [ + "And perform inference on the obtained model:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "871c9be6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-07-07 14:34:36,769\tINFO read_api.py:374 -- To satisfy the requested parallelism of 20, each read task output will be split into 20 smaller blocks.\n", + "2023-07-07 14:34:38,655\tWARNING plan.py:567 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. 
A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "2023-07-07 14:34:38,668\tINFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2023-07-07 14:34:38,674\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches()->MapBatches(Predict)] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-07-07 14:34:38,674\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-07-07 14:34:38,676\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-07-07 14:34:38,701\tINFO actor_pool_map_operator.py:117 -- MapBatches()->MapBatches(Predict): Waiting for 1 pool actors to start...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREDICTED LABELS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "\r" + ] + } + ], + "source": [ + "predict_lightgbm(result)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/sklearn_example.ipynb b/python/ray/air/examples/sklearn_example.ipynb deleted file mode 120000 index d3fb3c5b1908c..0000000000000 --- a/python/ray/air/examples/sklearn_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/sklearn_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/sklearn_example.ipynb b/python/ray/air/examples/sklearn_example.ipynb new file mode 100644 index 0000000000000..29603d25a249e --- /dev/null +++ b/python/ray/air/examples/sklearn_example.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "c3192ac4", + "metadata": {}, + "source": [ + "# Training a model with Sklearn\n", + "In this example we will train a model in Ray AIR using a Sklearn classifier." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5a4823bf", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88f4bb39", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" sklearn" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c049c692", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c02eb5cd", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "\n", + "import ray\n", + "from ray.data import Dataset\n", + "from ray.train.sklearn import SklearnPredictor\n", + "from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.train.sklearn import SklearnTrainer\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "try:\n", + " from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier\n", + "except ImportError:\n", + " cuMLRandomForestClassifier = None" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52e017f1", + "metadata": {}, + "source": [ + "Next we define a function to load our train, validation, and test datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3631ed1e", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns([\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8d6c6d17", + "metadata": {}, + "source": [ + "The following function will create a Sklearn trainer, train it, and return the result." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fd39e42", + "metadata": {}, + "outputs": [], + "source": [ + "def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:\n", + " if use_gpu and not cuMLRandomForestClassifier:\n", + " raise RuntimeError(\"cuML must be installed for GPU enabled sklearn estimators.\")\n", + "\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns\n", + " columns_to_scale = [\"mean radius\", \"mean texture\"]\n", + " preprocessor = Chain(\n", + " OrdinalEncoder([\"categorical_column\"]), StandardScaler(columns=columns_to_scale)\n", + " )\n", + "\n", + " if use_gpu:\n", + " trainer_resources = {\"CPU\": 1, \"GPU\": 1}\n", + " estimator = cuMLRandomForestClassifier()\n", + " else:\n", + " trainer_resources = {\"CPU\": num_cpus}\n", + " estimator = RandomForestClassifier()\n", + "\n", + " trainer = SklearnTrainer(\n", + " estimator=estimator,\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " cv=5,\n", + " scaling_config=ScalingConfig(trainer_resources=trainer_resources),\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, 
+ { + "attachments": {}, + "cell_type": "markdown", + "id": "7d073994", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "43f9170a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8269\u001B[39m\u001B[22m\n", + "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "== Status ==
Current time: 2022-06-22 17:27:59 (running for 00:00:18.31)
Memory usage on this node: 10.7/31.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/12.9 GiB heap, 0.0/6.45 GiB objects
Result logdir: /home/ubuntu/ray_results/SklearnTrainer_2022-06-22_17-27-40
Number of trials: 1/1 (1 TERMINATED)
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) fit_time
SklearnTrainer_9dec8_00000TERMINATED172.31.43.110:1492629 1 15.6842 2.31571


" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(SklearnTrainer pid=1492629)\u001B[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for SklearnTrainer_9dec8_00000:\n", + " cv:\n", + " fit_time:\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", + " score_time:\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", + " test_score:\n", + " - 0.9625\n", + " - 0.9125\n", + " - 0.9875\n", + " - 1.0\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", + " done: false\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", + " iterations_since_restore: 1\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", + " should_checkpoint: true\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 9dec8_00000\n", + " valid:\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", + " \n", + "Result for SklearnTrainer_9dec8_00000:\n", + " cv:\n", + " 
fit_time:\n", + " - 2.221003770828247\n", + " - 2.215489387512207\n", + " - 2.2075674533843994\n", + " - 2.222351312637329\n", + " - 2.312389612197876\n", + " fit_time_mean: 2.235760307312012\n", + " fit_time_std: 0.03866614559685742\n", + " score_time:\n", + " - 0.022464990615844727\n", + " - 0.0230865478515625\n", + " - 0.02564835548400879\n", + " - 0.029137849807739258\n", + " - 0.021221637725830078\n", + " score_time_mean: 0.02431187629699707\n", + " score_time_std: 0.0028120522003997595\n", + " test_score:\n", + " - 0.9625\n", + " - 0.9125\n", + " - 0.9875\n", + " - 1.0\n", + " - 0.9367088607594937\n", + " test_score_mean: 0.9598417721518986\n", + " test_score_std: 0.032128186960552516\n", + " date: 2022-06-22_17-27-59\n", + " done: true\n", + " experiment_id: f8215019c10e4a81ba2187c38e875365\n", + " experiment_tag: '0'\n", + " fit_time: 2.3157050609588623\n", + " hostname: ip-172-31-43-110\n", + " iterations_since_restore: 1\n", + " node_ip: 172.31.43.110\n", + " pid: 1492629\n", + " should_checkpoint: true\n", + " time_since_restore: 15.684244871139526\n", + " time_this_iter_s: 15.684244871139526\n", + " time_total_s: 15.684244871139526\n", + " timestamp: 1655918879\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: 9dec8_00000\n", + " valid:\n", + " score_time: 0.03549623489379883\n", + " test_score: 0.9532163742690059\n", + " warmup_time: 0.0057866573333740234\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-22 17:27:59,333\tINFO tune.py:734 -- Total run time: 19.09 seconds (18.31 seconds for the tuning loop).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'valid': {'score_time': 0.03549623489379883, 'test_score': 0.9532163742690059}, 'cv': {'fit_time': array([2.22100377, 2.21548939, 2.20756745, 2.22235131, 2.31238961]), 'score_time': array([0.02246499, 0.02308655, 0.02564836, 0.02913785, 0.02122164]), 'test_score': array([0.9625 , 0.9125 , 0.9875 
, 1. , 0.93670886]), 'fit_time_mean': 2.235760307312012, 'fit_time_std': 0.03866614559685742, 'score_time_mean': 0.02431187629699707, 'score_time_std': 0.0028120522003997595, 'test_score_mean': 0.9598417721518986, 'test_score_std': 0.032128186960552516}, 'fit_time': 2.3157050609588623, 'time_this_iter_s': 15.684244871139526, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '9dec8_00000', 'experiment_id': 'f8215019c10e4a81ba2187c38e875365', 'date': '2022-06-22_17-27-59', 'timestamp': 1655918879, 'time_total_s': 15.684244871139526, 'pid': 1492629, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 15.684244871139526, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.0057866573333740234, 'experiment_tag': '0'}\n" + ] + } + ], + "source": [ + "result = train_sklearn(num_cpus=2, use_gpu=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e11cf27b", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {ref}`End-to-end: Offline Batch Inference `" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/upload_to_comet_ml.ipynb b/python/ray/air/examples/upload_to_comet_ml.ipynb deleted file mode 120000 index 
9c6606ee1475a..0000000000000 --- a/python/ray/air/examples/upload_to_comet_ml.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/upload_to_comet_ml.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/upload_to_comet_ml.ipynb b/python/ray/air/examples/upload_to_comet_ml.ipynb new file mode 100644 index 0000000000000..cad1483beba32 --- /dev/null +++ b/python/ray/air/examples/upload_to_comet_ml.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "98d7c620", + "metadata": {}, + "source": [ + "# Logging results and uploading models to Comet ML\n", + "In this example, we train a simple XGBoost model and log the training\n", + "results to Comet ML. We also save the resulting model checkpoints\n", + "as artifacts." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c6e66577", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6d6297ef", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray comet_ml" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c2e21446", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dffff484", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "from ray.train import Result, RunConfig, ScalingConfig\n", + "from ray.train.xgboost import XGBoostTrainer\n", + "from ray.air.integrations.comet import CometLoggerCallback" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "29fcd93b", + "metadata": {}, + "source": [ + "We define a simple function that returns our training dataset as a Dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cf830706", + "metadata": {}, + "outputs": [], + "source": [ + "def get_train_dataset() -> 
ray.data.Dataset:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " return dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0f48f948", + "metadata": {}, + "source": [ + "Now we define a simple training function. All the magic happens within the `CometLoggerCallback`:\n", + "\n", + "```python\n", + "CometLoggerCallback(\n", + " project_name=comet_project,\n", + " save_checkpoints=True,\n", + ")\n", + "```\n", + "\n", + "It will automatically log all results to Comet ML and upload the checkpoints as artifacts. It assumes you're logged in into Comet via an API key or your `~./.comet.config`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "230f23a3", + "metadata": {}, + "outputs": [], + "source": [ + "def train_model(train_dataset: ray.data.Dataset, comet_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " params={\"tree_method\": \"auto\"},\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset},\n", + " num_boost_round=10,\n", + " run_config=RunConfig(\n", + " callbacks=[\n", + " # This is the part needed to enable logging to Comet ML.\n", + " # It assumes Comet ML can find a valid API (e.g. 
by setting\n", + " # the ``COMET_API_KEY`` environment variable).\n", + " CometLoggerCallback(\n", + " project_name=comet_project,\n", + " save_checkpoints=True,\n", + " )\n", + " ]\n", + " ),\n", + " )\n", + " result = trainer.fit()\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "711b1d7d", + "metadata": {}, + "source": [ + "Let's kick off a run:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9bfd9a8d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-19 15:19:17,237\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265\u001B[39m\u001B[22m\n" + ] + }, + { + "data": { + "text/html": [ + "== Status ==
Current time: 2022-05-19 15:19:35 (running for 00:00:14.95)
Memory usage on this node: 10.2/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/5.12 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/kai/ray_results/XGBoostTrainer_2022-05-19_15-19-19
Number of trials: 1/1 (1 TERMINATED)
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-rmse
XGBoostTrainer_ac544_00000TERMINATED127.0.0.1:19852 10 9.7203 0.030717


" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COMET WARNING: As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:21,584\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", + "COMET INFO: Experiment is live on comet.ml https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n", + "\n", + "COMET WARNING: Failed to add tag(s) None to the experiment\n", + "\n", + "COMET WARNING: Empty mapping given to log_params({}); ignoring\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:24,628\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:25,961\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,830\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,918\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 
--object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,923\tINFO context.py:70 -- Exec'ing worker with command: exec 
/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:29,272\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19876)\u001B[0m [15:19:29] task [xgboost.ray]:4505889744 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19875)\u001B[0m [15:19:29] task [xgboost.ray]:6941849424 got new rank 0\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 1.0.0 created\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for XGBoostTrainer_ac544_00000:\n", + " date: 2022-05-19_15-19-30\n", + " done: false\n", + " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 19852\n", + " should_checkpoint: true\n", + " time_since_restore: 6.529659032821655\n", + " time_this_iter_s: 6.529659032821655\n", + " time_total_s: 6.529659032821655\n", + " timestamp: 1652969970\n", + " timesteps_since_restore: 0\n", + " train-rmse: 0.357284\n", + " training_iteration: 1\n", + " trial_id: ac544_00000\n", + " warmup_time: 0.003961086273193359\n", + " \n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "COMET INFO: Scheduling the upload of 3 assets for a size of 2.48 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 2.0.0 created (previous was: 1.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 3.86 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 3.0.0 created (previous was: 2.0.0)\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:1.0.0' has been fully uploaded successfully\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 5.31 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 4.0.0 created (previous was: 3.0.0)\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:2.0.0' has been fully uploaded successfully\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 6.76 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has started 
uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 5.0.0 created (previous was: 4.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 8.21 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:3.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:4.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 6.0.0 created (previous was: 5.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 9.87 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:5.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 7.0.0 created (previous was: 6.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 11.46 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET 
WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:6.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 8.0.0 created (previous was: 7.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 12.84 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:7.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 9.0.0 created (previous was: 8.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 14.36 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has started uploading asynchronously\n", + "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", + "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:8.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 10.0.0 created (previous was: 9.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 16.37 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has started uploading asynchronously\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:33,890\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=569 in 7.96 seconds (4.61 pure XGBoost 
training time).\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has been fully uploaded successfully\n", + "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 11.0.0 created (previous was: 10.0.0)\n", + "COMET INFO: Scheduling the upload of 3 assets for a size of 16.39 KB, this can take some time\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has started uploading asynchronously\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Comet.ml Experiment Summary\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Data:\n", + "COMET INFO: display_summary_level : 1\n", + "COMET INFO: url : https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n", + "COMET INFO: Metrics [count] (min, max):\n", + "COMET INFO: iterations_since_restore [10] : (1, 10)\n", + "COMET INFO: time_since_restore [10] : (6.529659032821655, 9.720295906066895)\n", + "COMET INFO: time_this_iter_s [10] : (0.3124058246612549, 6.529659032821655)\n", + "COMET INFO: time_total_s [10] : (6.529659032821655, 9.720295906066895)\n", + "COMET INFO: timestamp [10] : (1652969970, 1652969973)\n", + "COMET INFO: timesteps_since_restore : 0\n", + "COMET INFO: train-rmse [10] : (0.030717, 0.357284)\n", + "COMET INFO: training_iteration [10] : (1, 10)\n", + "COMET INFO: warmup_time : 0.003961086273193359\n", + "COMET INFO: Others:\n", + "COMET INFO: Created from : Ray\n", + "COMET INFO: Name : XGBoostTrainer_ac544_00000\n", + "COMET INFO: experiment_id : d3007bd6a2734b328fd90385485c5a8d\n", + "COMET INFO: trial_id : ac544_00000\n", + "COMET INFO: System Information:\n", + "COMET INFO: date : 2022-05-19_15-19-33\n", + "COMET INFO: hostname : Kais-MacBook-Pro.local\n", + "COMET INFO: node_ip : 127.0.0.1\n", + "COMET INFO: pid : 19852\n", + "COMET INFO: Uploads:\n", + "COMET INFO: artifact assets : 33 (107.92 KB)\n", + "COMET INFO: artifacts : 11\n", + "COMET INFO: environment details 
: 1\n", + "COMET INFO: filename : 1\n", + "COMET INFO: installed packages : 1\n", + "COMET INFO: notebook : 1\n", + "COMET INFO: source_code : 1\n", + "COMET INFO: ---------------------------\n", + "COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)\n", + "COMET INFO: The Python SDK has 3600 seconds to finish before aborting...\n", + "COMET INFO: Waiting for completion of the file uploads (may take several seconds)\n", + "COMET INFO: The Python SDK has 10800 seconds to finish before aborting...\n", + "COMET INFO: Still uploading 6 file(s), remaining 21.05 KB/116.69 KB\n", + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has been fully uploaded successfully\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:11.0.0' has been fully uploaded successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for XGBoostTrainer_ac544_00000:\n", + " date: 2022-05-19_15-19-33\n", + " done: true\n", + " experiment_id: d3007bd6a2734b328fd90385485c5a8d\n", + " experiment_tag: '0'\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 10\n", + " node_ip: 127.0.0.1\n", + " pid: 19852\n", + " should_checkpoint: true\n", + " time_since_restore: 9.720295906066895\n", + " time_this_iter_s: 0.39761900901794434\n", + " time_total_s: 9.720295906066895\n", + " timestamp: 1652969973\n", + " timesteps_since_restore: 0\n", + " train-rmse: 0.030717\n", + " training_iteration: 10\n", + " trial_id: ac544_00000\n", + " warmup_time: 0.003961086273193359\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-19 15:19:35,621\tINFO tune.py:753 -- Total run time: 15.75 seconds (14.94 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "comet_project = \"ray_air_example\"\n", + "\n", + "train_dataset = 
get_train_dataset()\n", + "result = train_model(train_dataset=train_dataset, comet_project=comet_project)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "be28bdd3", + "metadata": {}, + "source": [ + "Check out your [Comet ML](https://www.comet.ml/) project to see the results!" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "orphan": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/upload_to_wandb.ipynb b/python/ray/air/examples/upload_to_wandb.ipynb deleted file mode 120000 index e241f6fcbd391..0000000000000 --- a/python/ray/air/examples/upload_to_wandb.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/upload_to_wandb.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/upload_to_wandb.ipynb b/python/ray/air/examples/upload_to_wandb.ipynb new file mode 100644 index 0000000000000..8c59392194904 --- /dev/null +++ b/python/ray/air/examples/upload_to_wandb.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "f37e8a9f", + "metadata": {}, + "source": [ + "# Logging results and uploading models to Weights & Biases\n", + "In this example, we train a simple XGBoost model and log the training\n", + "results to Weights & Biases. We also save the resulting model checkpoints\n", + "as artifacts.\n", + "\n", + "There are two ways to achieve this:\n", + "\n", + "1. Automatically using the `ray.air.integrations.wandb.WandbLoggerCallback`\n", + "2. 
Manually using the `wandb` API\n", + "\n", + "This tutorial will walk you through both options." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "27d04c97", + "metadata": {}, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4e697e5d", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray wandb" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3096e7c9", + "metadata": {}, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9c286701", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "from ray.train import Result, RunConfig, ScalingConfig\n", + "from ray.air.integrations.wandb import WandbLoggerCallback\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2efa1564", + "metadata": {}, + "source": [ + "We define a simple function that returns our training dataset as a Dataset:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a63ebd10", + "metadata": {}, + "outputs": [], + "source": [ + "def get_train_dataset() -> ray.data.Dataset:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " return dataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5fc1ca73", + "metadata": {}, + "source": [ + "And that's the common parts. We now dive into the two options to interact with Weights and Biases." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d07cf41f", + "metadata": {}, + "source": [ + "## Using the WandbLoggerCallback\n", + "\n", + "The WandbLoggerCallback does all the logging and reporting for you. It is especially useful when you use an out-of-the-box trainer like `XGBoostTrainer`. 
In these trainers, you don't define your own training loop, so using the AIR W&B callback is the best way to log your results to Weights and Biases.\n", + "\n", + "First we define a simple training function.\n", + "\n", + "All the magic happens within the `WandbLoggerCallback`:\n", + "\n", + "```python\n", + "WandbLoggerCallback(\n", + " project=wandb_project,\n", + " save_checkpoints=True,\n", + ")\n", + "```\n", + "\n", + "It will automatically log all results to Weights & Biases and upload the checkpoints as artifacts. It assumes you're logged in into Wandb via an API key or `wandb login`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "52edfde0", + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.xgboost import XGBoostTrainer\n", + "\n", + "\n", + "def train_model_xgboost(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " params={\"tree_method\": \"auto\"},\n", + " label_column=\"target\",\n", + " datasets={\"train\": train_dataset},\n", + " num_boost_round=10,\n", + " run_config=RunConfig(\n", + " callbacks=[\n", + " # This is the part needed to enable logging to Weights & Biases.\n", + " # It assumes you've logged in before, e.g. with `wandb login`.\n", + " WandbLoggerCallback(\n", + " project=wandb_project,\n", + " save_checkpoints=True,\n", + " )\n", + " ]\n", + " ),\n", + " )\n", + " result = trainer.fit()\n", + " return result\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1959ce19", + "metadata": {}, + "source": [ + "Let's kick off a run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f80d6c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-10-28 16:28:19,325\tINFO worker.py:1524 -- Started a local Ray instance. 
View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n", + "2022-10-28 16:28:22,993\tWARNING read_api.py:297 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", + "2022-10-28 16:28:26,033\tINFO wandb.py:267 -- Already logged into W&B.\n" + ] + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wandb_project = \"ray_air_example_xgboost\"\n", + "\n", + "train_dataset = get_train_dataset()\n", + "result = train_model_xgboost(train_dataset=train_dataset, wandb_project=wandb_project)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "78701c42", + "metadata": {}, + "source": [ + "Check out your [WandB](https://wandb.ai/) project to see the results!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a215b6d4", + "metadata": {}, + "source": [ + "## Using the `wandb` API\n", + "\n", + "When you define your own training loop, you sometimes want to manually interact with the Weights and Biases API. Ray AIR provides a `setup_wandb()` function that takes care of the initialization.\n", + "\n", + "The main benefit here is that authentication to Weights and Biases is automatically set up for you, and sensible default names for your runs are set. Of course, you can override these.\n", + "\n", + "Additionally in distributed training you often only want to report the results of the rank 0 worker. This can also be done automatically using our setup.\n", + "\n", + "Let's define a distributed training loop. 
The important part here are:\n", + "\n", + " wandb = setup_wandb(config)\n", + " \n", + "and later\n", + "\n", + " wandb.log({\"loss\": loss.item()})\n", + " \n", + "The call to `setup_wandb()` will setup your session, for instance calling `wandb.init()` with sensible defaults. Because we are in a distributed training setting, this will only happen for the rank 0 - all other workers get a mock object back, and any subsequent calls to `wandb.XXX` will be a no-op for these.\n", + "\n", + "You can then use the `wandb` as usual:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "154e233d", + "metadata": {}, + "outputs": [], + "source": [ + "from ray import train\n", + "from ray.air.integrations.wandb import setup_wandb\n", + "from ray.data.preprocessors import Concatenator\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "import torch.optim as optim\n", + "import torch.nn as nn\n", + "\n", + "def train_loop(config):\n", + " wandb = setup_wandb(config, project=config.get(\"wandb_project\"))\n", + " \n", + " dataset = train.get_dataset_shard(\"train\")\n", + "\n", + " model = nn.Linear(30, 2)\n", + "\n", + " optimizer = optim.SGD(\n", + " model.parameters(),\n", + " lr=config.get(\"lr\", 0.01),\n", + " )\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " \n", + " for batch in dataset.iter_torch_batches(batch_size=32):\n", + " X = batch[\"data\"]\n", + " y = batch[\"target\"]\n", + " \n", + " # Compute prediction error\n", + " pred = model(X)\n", + " loss = loss_fn(pred, y)\n", + "\n", + " # Backpropagation\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " train.report({\"loss\": loss.item()})\n", + " wandb.log({\"loss\": loss.item()})\n", + " " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9aa12feb", + "metadata": {}, + "source": [ + "Let's define a function to kick off the training - again, we can configure Weights and Biases settings in the config. 
But you could also just pass it to the setup function, e.g. like this:\n", + "\n", + " setup_wandb(config, project=\"my_project\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ae7c8c", + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.torch import TorchTrainer\n", + "\n", + "\n", + "def train_model_torch(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", + " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", + " trainer = TorchTrainer(\n", + " train_loop_per_worker=train_loop,\n", + " scaling_config=ScalingConfig(num_workers=2),\n", + " train_loop_config={\"lr\": 0.01, \"wandb_project\": wandb_project},\n", + " datasets={\"train\": train_dataset},\n", + " preprocessor=Concatenator(\"data\", dtype=np.float32, exclude=[\"target\"]),\n", + " )\n", + " result = trainer.fit()\n", + " return result\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "12049bcf", + "metadata": {}, + "source": [ + "Let's kick off this run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3825b35b", + "metadata": {}, + "outputs": [], + "source": [ + "wandb_project = \"ray_air_example_torch\"\n", + "\n", + "train_dataset = get_train_dataset()\n", + "result = train_model_torch(train_dataset=train_dataset, wandb_project=wandb_project)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "75fddee7", + "metadata": {}, + "source": [ + "Check out your [WandB](https://wandb.ai/) project to see the results!" 
+ ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.9" + }, + "orphan": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/ray/air/examples/xgboost_example.ipynb b/python/ray/air/examples/xgboost_example.ipynb deleted file mode 120000 index 5780af02bdf5c..0000000000000 --- a/python/ray/air/examples/xgboost_example.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/source/ray-air/examples/xgboost_example.ipynb \ No newline at end of file diff --git a/python/ray/air/examples/xgboost_example.ipynb b/python/ray/air/examples/xgboost_example.ipynb new file mode 100644 index 0000000000000..0ebcea9470a8a --- /dev/null +++ b/python/ray/air/examples/xgboost_example.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5fb89b3d", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "(air-xgboost-example-ref)=\n", + "\n", + "# Training a model with distributed XGBoost\n", + "In this example we will train a model in Ray AIR using distributed XGBoost." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "53d57c1f", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Let's start with installing our dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "41f20cc1", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" + ] + } + ], + "source": [ + "!pip install -qU \"ray[tune]\" xgboost_ray" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d2fe8d4a", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Then we need some imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7232303d", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "import ray\n", + "from ray.train.xgboost import XGBoostPredictor\n", + "from ray.train.xgboost import XGBoostTrainer\n", + "from ray.train import Result, ScalingConfig\n", + "from ray.data import Dataset\n", + "from ray.data.preprocessors import StandardScaler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1c75b5ca", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Next we define a function to load our train, validation, and test datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "37c4f38f", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", + " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", + " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", + " test_dataset = valid_dataset.drop_columns([\"target\"])\n", + " return train_dataset, valid_dataset, test_dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9b2850dd", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The following function will create a XGBoost trainer, train it, and return the result." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dae8998d", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def train_xgboost(num_workers: int, use_gpu: bool = False) -> Result:\n", + " train_dataset, valid_dataset, _ = prepare_data()\n", + "\n", + " # Scale some random columns\n", + " columns_to_scale = [\"mean radius\", \"mean texture\"]\n", + " preprocessor = StandardScaler(columns=columns_to_scale)\n", + "\n", + " # XGBoost specific params\n", + " params = {\n", + " \"tree_method\": \"approx\",\n", + " \"objective\": \"binary:logistic\",\n", + " \"eval_metric\": [\"logloss\", \"error\"],\n", + " }\n", + "\n", + " trainer = XGBoostTrainer(\n", + " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", + " label_column=\"target\",\n", + " params=params,\n", + " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", + " preprocessor=preprocessor,\n", + " num_boost_round=100,\n", + " )\n", + " result = trainer.fit()\n", + " print(result.metrics)\n", + "\n", + " return result" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ce05af87", + "metadata": {}, + "source": [ + "Once we have the result, we can do batch inference on the 
obtained model. Let's define a utility function for this." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5b8076d3", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from ray.train import Checkpoint\n", + "from ray.data import ActorPoolStrategy\n", + "\n", + "\n", + "class Predict:\n", + "\n", + " def __init__(self, checkpoint: Checkpoint):\n", + " self.predictor = XGBoostPredictor.from_checkpoint(checkpoint)\n", + "\n", + " def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n", + " return self.predictor.predict(batch)\n", + "\n", + "\n", + "def predict_xgboost(result: Result):\n", + " _, _, test_dataset = prepare_data()\n", + "\n", + " scores = test_dataset.map_batches(\n", + " Predict, \n", + " fn_constructor_args=[result.checkpoint], \n", + " compute=ActorPoolStrategy(), \n", + " batch_format=\"pandas\"\n", + " )\n", + " \n", + " predicted_labels = scores.map_batches(lambda df: (df > 0.5).astype(int), batch_format=\"pandas\")\n", + " print(f\"PREDICTED LABELS\")\n", + " predicted_labels.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7e172f66", + "metadata": {}, + "source": [ + "Now we can run the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0f96d62b", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-07-06 18:33:25
Running for: 00:00:06.19
Memory: 14.9/64.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 2.0/10 CPUs, 0/0 GPUs\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train-logloss train-error valid-logloss
XGBoostTrainer_40fed_00000TERMINATED127.0.0.1:40725 101 4.90132 0.00587595 0 0.06215
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate]\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(XGBoostTrainer pid=40725)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "\n", + "\u001B[A\n", + "\u001B[A\n", + "\n", + "\u001B[A\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:00 TaskPoolMapOperator[MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:01 TaskPoolMapOperator[MapBatches(StandardScaler._transform_pandas)]\n", + "\n", + "\u001B[A\n", + "\n", + "(pid=40725) Running: 0.0/10.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/14 [00:01 ActorPoolMapOperator[MapBatches()->MapBatches(Predict)] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-07-06 18:33:28,112\tINFO streaming_executor.py:93 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-07-06 18:33:28,114\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-07-06 18:33:28,150\tINFO actor_pool_map_operator.py:117 -- MapBatches()->MapBatches(Predict): Waiting for 1 pool actors to start...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREDICTED LABELS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n", + "{'predictions': 0}\n", + "{'predictions': 1}\n", + "{'predictions': 1}\n", + "{'predictions': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "predict_xgboost(result)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "orphan": true, + "vscode": { + 
"interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb deleted file mode 120000 index a65044dfacf95..0000000000000 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb \ No newline at end of file diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb new file mode 100644 index 0000000000000..4133eb084c43f --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb @@ -0,0 +1,1200 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(gptj_deepspeed_finetune)=\n", + "\n", + "# GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed\n", + "\n", + "In this example, we will showcase how to use the Ray AIR for **GPT-J fine-tuning**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", + "\n", + "We will use Ray AIR (with the 🤗 Transformers integration) and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", + "\n", + "This example focuses more on the performance and distributed computing aspects of Ray AIR. 
If you are looking for a more beginner-friendly introduction to Ray AIR 🤗 Transformers integration, see {doc}`this example `.\n", + "\n", + "It is highly recommended to read [Ray Train Key Concepts](train-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", + "\n", + "```{note}\n", + "To run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The required amount of memory depends on the model. This notebook is tested with 16 g4dn.4xlarge instances (including the head node). If you wish to use a CPU head node, turn on [cloud checkpointing](tune-cloud-checkpointing) to avoid OOM errors that may happen due to the default behavior of syncing the checkpoint files to the head node.\n", + "```\n", + "\n", + "In this notebook, we will:\n", + "1. [Set up Ray](#setup)\n", + "2. [Load the dataset](#load)\n", + "3. [Preprocess the dataset with Ray AIR](#preprocess)\n", + "4. [Run the training with Ray AIR](#train)\n", + "5. [Generate text from prompt with Ray AIR](#predict)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment and run the following line in order to install all the necessary dependencies (this notebook is being tested with `transformers==4.26.0`):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#! pip install \"datasets\" \"evaluate\" \"accelerate==0.18.0\" \"transformers>=4.26.0\" \"torch>=1.12.0\" \"deepspeed==0.8.3\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Ray \n", + "\n", + "First, let's set some global variables. We will use 16 workers, each being assigned 1 GPU and 8 CPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"EleutherAI/gpt-j-6B\"\n", + "use_gpu = True\n", + "num_workers = 16\n", + "cpus_per_worker = 8" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use `ray.init()` to initialize a local cluster. By default, this cluster will be comprised of only the machine you are running this notebook on. You can also run this notebook on an Anyscale cluster.\n", + "\n", + "We define a {ref}`runtime environment ` to ensure that the Ray workers have access to all the necessary packages. You can omit the `runtime_env` argument if you have all of the packages already installed on each node in your cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.8.16
Ray version: 3.0.0.dev0
Dashboard:http://console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard
\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', python_version='3.8.16', ray_version='3.0.0.dev0', ray_commit='4ddbbb3c4b19c2d27bbf54f8c5ffc100dceafbcf', address_info={'node_ip_address': '10.0.30.196', 'raylet_ip_address': '10.0.30.196', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/raylet', 'webui_url': 'console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', 'session_dir': '/tmp/ray/session_2023-03-06_15-55-37_997701_162', 'metrics_export_port': 8085, 'gcs_address': '10.0.30.196:6379', 'address': '10.0.30.196:6379', 'dashboard_agent_listen_port': 52365, 'node_id': '77de483c435bf4987fd6f1e91d47602554e876fd41230d8d50c05333'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " # Latest combination of accelerate==0.19.0 and transformers==4.29.0\n", + " # seems to have issues with DeepSpeed process group initialization,\n", + " # and will result in a batch_size validation problem.\n", + " # TODO(jungong) : get rid of the pins once the issue is fixed.\n", + " \"accelerate==0.16.0\",\n", + " \"transformers==4.26.0\",\n", + " \"torch>=1.12.0\",\n", + " \"deepspeed==0.9.2\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "# THIS SHOULD BE HIDDEN IN DOCS AND ONLY RAN IN CI\n", + "# Download the model from our S3 mirror as it's faster\n", + "\n", + "import ray\n", + "import subprocess\n", + "import ray.util.scheduling_strategies\n", + "\n", + 
"\n", + "def force_on_node(node_id: str, remote_func_or_actor_class):\n", + " scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(\n", + " node_id=node_id, soft=False\n", + " )\n", + " options = {\"scheduling_strategy\": scheduling_strategy}\n", + " return remote_func_or_actor_class.options(**options)\n", + "\n", + "\n", + "def run_on_every_node(remote_func_or_actor_class, **remote_kwargs):\n", + " refs = []\n", + " for node in ray.nodes():\n", + " if node[\"Alive\"] and node[\"Resources\"].get(\"GPU\", None):\n", + " refs.append(\n", + " force_on_node(node[\"NodeID\"], remote_func_or_actor_class).remote(\n", + " **remote_kwargs\n", + " )\n", + " )\n", + " return ray.get(refs)\n", + "\n", + "\n", + "@ray.remote(num_gpus=1)\n", + "def download_model():\n", + " from transformers.utils.hub import TRANSFORMERS_CACHE\n", + "\n", + " path = os.path.expanduser(\n", + " os.path.join(TRANSFORMERS_CACHE, \"models--EleutherAI--gpt-j-6B\")\n", + " )\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"snapshots\", \"main\")])\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"refs\")])\n", + " if os.path.exists(os.path.join(path, \"refs\", \"main\")):\n", + " return\n", + " subprocess.run(\n", + " [\n", + " \"aws\",\n", + " \"s3\",\n", + " \"sync\",\n", + " \"--no-sign-request\",\n", + " \"s3://large-dl-models-mirror/models--EleutherAI--gpt-j-6B/main/\",\n", + " os.path.join(path, \"snapshots\", \"main\"),\n", + " ]\n", + " )\n", + " with open(os.path.join(path, \"snapshots\", \"main\", \"hash\"), \"r\") as f:\n", + " f_hash = f.read().strip()\n", + " with open(os.path.join(path, \"refs\", \"main\"), \"w\") as f:\n", + " f.write(f_hash)\n", + " os.rename(\n", + " os.path.join(path, \"snapshots\", \"main\"), os.path.join(path, \"snapshots\", f_hash)\n", + " )\n", + "\n", + "\n", + "_ = run_on_every_node(download_model)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Loading the dataset \n", + "\n", + "We will be fine-tuning the model on the [`tiny_shakespeare` dataset](https://huggingface.co/datasets/tiny_shakespeare), comprised of 40,000 lines of Shakespeare from a variety of Shakespeare's plays. The aim will be to make the GPT-J model better at generating text in the style of Shakespeare." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading tiny_shakespeare dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "65894225f3b84e5caa117c4d08d9f99d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 pd.DataFrame:\n", + " text = list(batch[\"text\"])\n", + " flat_text = \"\".join(text)\n", + " split_text = [\n", + " x.strip()\n", + " for x in flat_text.split(\"\\n\")\n", + " if x.strip() and not x.strip()[-1] == \":\"\n", + " ]\n", + " return pd.DataFrame(split_text, columns=[\"text\"])\n", + "\n", + "\n", + "def tokenize(batch: pd.DataFrame) -> dict:\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"text\"]),\n", + " truncation=True,\n", + " max_length=block_size,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "\n", + "splitter = BatchMapper(split_text, batch_format=\"pandas\")\n", + "tokenizer = BatchMapper(tokenize, batch_format=\"pandas\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fine-tuning 
the model with Ray AIR \n", + "\n", + "We can now configure Ray AIR's {class}`~ray.train.huggingface.TransformersTrainer` to perform distributed fine-tuning of the model. In order to do that, we specify a `trainer_init_per_worker` function, which creates a 🤗 Transformers `Trainer` that will be distributed by Ray using Distributed Data Parallelism (using PyTorch Distributed backend internally). This means that each worker will have its own copy of the model, but operate on different data, At the end of each step, all the workers will sync gradients.\n", + "\n", + "Because GPT-J is a relatively large model, it may not be possible to fit it on smaller GPU types (<=16 GB GRAM). To deal with that issue, we can use [DeepSpeed](https://github.com/microsoft/DeepSpeed), a library to optimize the training process and allow us to (among other things) offload and partition optimizer and parameter states, reducing GRAM usage. Furthermore, DeepSpeed ZeRO Stage 3 allows us to load large models without running out of memory.\n", + "\n", + "🤗 Transformers and Ray AIR's integration ({class}`~ray.train.huggingface.TransformersTrainer`) allow you to easily configure and use DDP and DeepSpeed. All you need to do is specify the DeepSpeed configuration in the [`TrainingArguments`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments) object.\n", + "\n", + "```{tip}\n", + "There are many DeepSpeed settings that allow you to trade-off speed for memory usage. The settings used below are tailored to the cluster setup used (16 g4dn.4xlarge nodes) and per device batch size of 16. Some things to keep in mind:\n", + "- If your GPUs support bfloat16, use that instead of float16 mixed precision to get better performance and prevent overflows. 
Replace `fp16=True` with `bf16=True` in `TrainingArguments`.\n", + "- If you are running out of GRAM: try reducing batch size (defined in the cell below the next one), set `\"overlap_comm\": False` in DeepSpeed config.\n", + "- If you are running out of RAM, add more nodes to your cluster, use nodes with more RAM, set `\"pin_memory\": False` in the DeepSpeed config, reduce the batch size, and remove `\"offload_param\"` from the DeepSpeed config.\n", + "\n", + "For more information on DeepSpeed configuration, refer to [Hugging Face documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed) and [DeepSpeed documentation](https://www.deepspeed.ai/docs/config-json/).\n", + "\n", + "Additionally, if you prefer a lower-level API, the logic below can be expressed as an [Accelerate training loop](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py) distributed by a Ray AIR {class}`~ray.train.torch.torch_trainer.TorchTrainer`.\n", + "```\n", + "\n", + "#### Training speed\n", + "\n", + "As we are using data parallelism, each worker operates on its own shard of the data. The batch size set in `TrainingArguments` is the **per device batch size** (per worker batch size). By changing the number of workers, we can change the **effective batch size** and thus the time needed for training to complete. The effective batch size is then calculated as `per device batch size * number of workers * number of gradient accumulation steps`. As we add more workers, the effective batch size rises and thus we need less time to complete a full epoch. While the speedup is not exactly linear due to extra communication overheads, in many cases it can be close to linear.\n", + "\n", + "The preprocessed dataset has 1348 examples. We have set per device batch size to 16.\n", + "\n", + "* With 16 g4dn.4xlarge nodes, the effective batch size was 256, which equals to 85 steps per epoch. 
One epoch took **~2440 seconds** (including initialization time).\n", + "\n", + "* With 32 g4dn.4xlarge nodes, the effective batch size was 512, which equals to 43 steps per epoch. One epoch took **~1280 seconds** (including initialization time)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluate\n", + "from transformers import Trainer, TrainingArguments\n", + "from transformers import (\n", + " GPTJForCausalLM,\n", + " AutoTokenizer,\n", + " default_data_collator,\n", + ")\n", + "from transformers.utils.logging import disable_progress_bar, enable_progress_bar\n", + "import torch\n", + "\n", + "from ray import train\n", + "\n", + "\n", + "def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):\n", + " # Use the actual number of CPUs assigned by Ray\n", + " os.environ[\"OMP_NUM_THREADS\"] = str(\n", + " train.get_context().get_trial_resources().bundles[-1].get(\"CPU\", 1)\n", + " )\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " batch_size = config.get(\"batch_size\", 4)\n", + " epochs = config.get(\"epochs\", 2)\n", + " warmup_steps = config.get(\"warmup_steps\", 0)\n", + " learning_rate = config.get(\"learning_rate\", 0.00002)\n", + " weight_decay = config.get(\"weight_decay\", 0.01)\n", + "\n", + " deepspeed = {\n", + " \"fp16\": {\n", + " \"enabled\": \"auto\",\n", + " \"initial_scale_power\": 8,\n", + " },\n", + " \"bf16\": {\"enabled\": \"auto\"},\n", + " \"optimizer\": {\n", + " \"type\": \"AdamW\",\n", + " \"params\": {\n", + " \"lr\": \"auto\",\n", + " \"betas\": \"auto\",\n", + " \"eps\": \"auto\",\n", + " },\n", + " },\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"overlap_comm\": 
True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": \"auto\",\n", + " \"stage3_prefetch_bucket_size\": \"auto\",\n", + " \"stage3_param_persistence_threshold\": \"auto\",\n", + " \"gather_16bit_weights_on_model_save\": True,\n", + " \"round_robin_gradients\": True,\n", + " },\n", + " \"gradient_accumulation_steps\": \"auto\",\n", + " \"gradient_clipping\": \"auto\",\n", + " \"steps_per_print\": 10,\n", + " \"train_batch_size\": \"auto\",\n", + " \"train_micro_batch_size_per_gpu\": \"auto\",\n", + " \"wall_clock_breakdown\": False,\n", + " }\n", + "\n", + " print(\"Preparing training arguments\")\n", + " training_args = TrainingArguments(\n", + " \"output\",\n", + " per_device_train_batch_size=batch_size,\n", + " logging_steps=1,\n", + " save_strategy=\"no\",\n", + " per_device_eval_batch_size=batch_size,\n", + " learning_rate=learning_rate,\n", + " weight_decay=weight_decay,\n", + " warmup_steps=warmup_steps,\n", + " label_names=[\"input_ids\", \"attention_mask\"],\n", + " num_train_epochs=epochs,\n", + " push_to_hub=False,\n", + " disable_tqdm=True, # declutter the output a little\n", + " fp16=True,\n", + " gradient_checkpointing=True,\n", + " deepspeed=deepspeed,\n", + " )\n", + " disable_progress_bar()\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + " print(\"Loading model\")\n", + "\n", + " model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)\n", + " model.resize_token_embeddings(len(tokenizer))\n", + "\n", + " print(\"Model loaded\")\n", + "\n", + " enable_progress_bar()\n", + "\n", + " metric = evaluate.load(\"accuracy\")\n", + "\n", + " def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " 
train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " data_collator=default_data_collator,\n", + " )\n", + " return trainer" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With our `trainer_init_per_worker` complete, we can now instantiate the {class}`~ray.train.huggingface.TransformersTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", + "\n", + "We pass the preprocessors we have defined earlier as an argument, wrapped in a {class}`~ray.data.preprocessors.chain.Chain`. The preprocessor will be included with the returned {class}`~ray.train.Checkpoint`, meaning it will also be applied during inference.\n", + "\n", + "```{note}\n", + "Since this example runs with multiple nodes, we need to persist checkpoints\n", + "and other outputs to some external storage for access after training has completed.\n", + "**You should set up cloud storage or NFS, then replace `storage_path` with your own cloud bucket URI or NFS path.**\n", + "\n", + "See the [storage guide](tune-storage-options) for more details.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage_path=\"s3://your-bucket-here\" # TODO: Set up cloud storage\n", + "# storage_path=\"/mnt/path/to/nfs\" # TODO: Alternatively, set up NFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "storage_path = \"/mnt/cluster_storage\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import TransformersTrainer\n", + "from ray.train import RunConfig, ScalingConfig\n", + "from ray.data.preprocessors 
import Chain\n", + "\n", + "\n", + "trainer = TransformersTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " trainer_init_config={\n", + " \"batch_size\": 16, # per device\n", + " \"epochs\": 1,\n", + " },\n", + " scaling_config=ScalingConfig(\n", + " num_workers=num_workers,\n", + " use_gpu=use_gpu,\n", + " resources_per_worker={\"GPU\": 1, \"CPU\": cpus_per_worker},\n", + " ),\n", + " datasets={\"train\": ray_datasets[\"train\"], \"evaluation\": ray_datasets[\"validation\"]},\n", + " preprocessor=Chain(splitter, tokenizer),\n", + " run_config=RunConfig(storage_path=storage_path),\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we call the {meth}`~ray.train.huggingface.TransformersTrainer.fit` method to start training with Ray AIR. We will save the {class}`~ray.train.Result` object to a variable so we can access metrics and checkpoints." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-03-06 17:18:41
Running for: 00:43:11.46
Memory: 31.9/62.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/256 CPUs, 0/16 GPUs, 0.0/675.29 GiB heap, 0.0/291.99 GiB objects (0.0/16.0 accelerator_type:T4)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) loss learning_rate epoch
TransformersTrainer_f623d_00000TERMINATED10.0.30.196:30861 85 2579.30.0715 4.70588e-07 1
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) 2023-03-06 16:36:00,447\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) warnings.warn(\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) warnings.warn(\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) warnings.warn(\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DataIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DataIterator docs.\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) warnings.warn(\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.57.85) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.29.205) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.51.113) 2023-03-06 16:36:00,454\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Preparing training arguments\n", + "(RayTrainWorker pid=31281) Loading model\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:37:21,252] [INFO] [partition_parameters.py:415:__exit__] finished initializing model with 6.05B parameters\n", + "(RayTrainWorker pid=31281) Model loaded\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Using cuda_amp half precision backend\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,431] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed info: version=0.8.1, git-hash=unknown, git-branch=unknown\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,450] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) ***** Running training *****\n", + "(RayTrainWorker pid=31281) Num examples = 1348\n", + "(RayTrainWorker pid=31281) Num Epochs = 
1\n", + "(RayTrainWorker pid=31281) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=31281) Total train batch size (w. parallel, distributed & accumulation) = 256\n", + "(RayTrainWorker pid=31281) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=31281) Total optimization steps = 85\n", + "(RayTrainWorker pid=31281) Number of trainable parameters = 0\n", + "(RayTrainWorker pid=31281) /home/ray/anaconda3/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:2387: UserWarning: torch.distributed._all_gather_base is a private function and will be deprecated. Please use torch.distributed.all_gather_into_tensor instead.\n", + "(RayTrainWorker pid=31281) warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed LR Scheduler = \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05], mom=[[0.9, 0.999]]\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [config.py:1009:print] DeepSpeedEngine configuration:\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] activation_checkpointing_config {\n", + "(RayTrainWorker pid=31281) \"partition_activations\": false, \n", + "(RayTrainWorker pid=31281) \"contiguous_memory_optimization\": false, \n", + "(RayTrainWorker pid=31281) \"cpu_checkpointing\": false, \n", + "(RayTrainWorker pid=31281) \"number_checkpoints\": null, \n", + "(RayTrainWorker pid=31281) \"synchronize_checkpoint_boundary\": false, \n", + "(RayTrainWorker pid=31281) \"profile\": 
false\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] autotuning_config ............ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"start_step\": null, \n", + "(RayTrainWorker pid=31281) \"end_step\": null, \n", + "(RayTrainWorker pid=31281) \"metric_path\": null, \n", + "(RayTrainWorker pid=31281) \"arg_mappings\": null, \n", + "(RayTrainWorker pid=31281) \"metric\": \"throughput\", \n", + "(RayTrainWorker pid=31281) \"model_info\": null, \n", + "(RayTrainWorker pid=31281) \"results_dir\": \"autotuning_results\", \n", + "(RayTrainWorker pid=31281) \"exps_dir\": \"autotuning_exps\", \n", + "(RayTrainWorker pid=31281) \"overwrite\": true, \n", + "(RayTrainWorker pid=31281) \"fast\": true, \n", + "(RayTrainWorker pid=31281) \"start_profile_step\": 3, \n", + "(RayTrainWorker pid=31281) \"end_profile_step\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_type\": \"gridsearch\", \n", + "(RayTrainWorker pid=31281) \"tuner_early_stopping\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_num_trials\": 50, \n", + "(RayTrainWorker pid=31281) \"model_info_path\": null, \n", + "(RayTrainWorker pid=31281) \"mp_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_batch_size\": null, \n", + "(RayTrainWorker pid=31281) \"min_train_batch_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n", + "(RayTrainWorker pid=31281) 
\"min_train_micro_batch_size_per_gpu\": 1, \n", + "(RayTrainWorker pid=31281) \"num_tuning_micro_batch_sizes\": 3\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] bfloat16_enabled ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_parallel_write_pipeline False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_enabled True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_fail False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] comms_config ................. \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] communication_data_type ...... None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_enabled_legacy .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_params_legacy ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_enabled ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dataloader_drop_last ......... 
False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] disable_allgather ............ False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dump_state ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dynamic_loss_scale_args ...... {'init_scale': 256, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_gas_boundary_resolution 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_name ........ bert.encoder.layer\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_num ......... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_max_iter .......... 100\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_stability ......... 1e-06\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_tol ............... 0.01\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_verbose ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] elasticity_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] flops_profiler_config ........ 
{\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"profile_step\": 1, \n", + "(RayTrainWorker pid=31281) \"module_depth\": -1, \n", + "(RayTrainWorker pid=31281) \"top_modules\": 1, \n", + "(RayTrainWorker pid=31281) \"detailed\": true, \n", + "(RayTrainWorker pid=31281) \"output_file\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_auto_cast ............... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_master_weights_and_gradients False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] global_rank .................. 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] grad_accum_dtype ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] gradient_accumulation_steps .. 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_clipping ............ 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_predivide_factor .... 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] initial_dynamic_scale ........ 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] load_universal_checkpoint .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] loss_scale ................... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] memory_breakdown ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] nebula_config ................ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"persistent_storage_path\": null, \n", + "(RayTrainWorker pid=31281) \"persistent_time_interval\": 100, \n", + "(RayTrainWorker pid=31281) \"num_of_version_in_retention\": 2, \n", + "(RayTrainWorker pid=31281) \"enable_nebula_load\": true, \n", + "(RayTrainWorker pid=31281) \"load_path\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_legacy_fusion ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_name ............... adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_params ............. {'lr': 2e-05, 'betas': [0.9, 0.999], 'eps': 1e-08}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] prescale_gradients ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_name ............... 
None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_params ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_attention ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_gradients_enabled ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] steps_per_print .............. 10\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_batch_size ............. 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_micro_batch_size_per_gpu 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] use_node_local_storage ....... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] wall_clock_breakdown ......... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] world_size ................... 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_allow_untested_optimizer False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_config .................. 
stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_optimization_stage ...... 
3\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,029] [INFO] [config.py:998:print_user_config] json = {\n", + "(RayTrainWorker pid=31281) \"fp16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": true, \n", + "(RayTrainWorker pid=31281) \"initial_scale_power\": 8\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"bf16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": false\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"optimizer\": {\n", + "(RayTrainWorker pid=31281) \"type\": \"AdamW\", \n", + "(RayTrainWorker pid=31281) \"params\": {\n", + "(RayTrainWorker pid=31281) \"lr\": 2e-05, \n", + "(RayTrainWorker pid=31281) \"betas\": [0.9, 0.999], \n", + "(RayTrainWorker pid=31281) \"eps\": 1e-08\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"zero_optimization\": {\n", + "(RayTrainWorker pid=31281) \"stage\": 3, \n", + "(RayTrainWorker pid=31281) \"offload_optimizer\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"offload_param\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"overlap_comm\": true, \n", + "(RayTrainWorker pid=31281) \"contiguous_gradients\": true, \n", + "(RayTrainWorker pid=31281) \"reduce_bucket_size\": 1.677722e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_prefetch_bucket_size\": 1.509949e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_param_persistence_threshold\": 4.096000e+04, \n", + "(RayTrainWorker pid=31281) \"gather_16bit_weights_on_model_save\": true, \n", + "(RayTrainWorker pid=31281) \"round_robin_gradients\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"gradient_accumulation_steps\": 1, \n", + 
"(RayTrainWorker pid=31281) \"gradient_clipping\": 1.0, \n", + "(RayTrainWorker pid=31281) \"steps_per_print\": 10, \n", + "(RayTrainWorker pid=31281) \"train_batch_size\": 256, \n", + "(RayTrainWorker pid=31281) \"train_micro_batch_size_per_gpu\": 16, \n", + "(RayTrainWorker pid=31281) \"wall_clock_breakdown\": false\n", + "(RayTrainWorker pid=31281) }\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Model weights saved in output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) tokenizer config file saved in output/checkpoint-85/tokenizer_config.json\n", + "(RayTrainWorker pid=31281) Special tokens file saved in output/checkpoint-85/special_tokens_map.json\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [engine.py:3516:save_16bit_model] Saving model weights to output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/pytorch_model.bin...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved output/checkpoint-85/pytorch_model.bin.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,087] [INFO] [logging.py:75:log_dist] [Rank 0] [Torch] Checkpoint global_step85 is begin to save!\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [logging.py:75:log_dist] [Rank 0] Saving model checkpoint: output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved 
output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,984] [INFO] [engine.py:3407:_save_zero_checkpoint] zero checkpoint saved output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:38,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85 is ready now!\n", + "(RayTrainWorker pid=31281) {'train_runtime': 2413.1243, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.035, 'train_loss': 0.32492108064539293, 'epoch': 1.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-06 17:18:41,018\tINFO tune.py:825 -- Total run time: 2591.59 seconds (2591.46 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "results = trainer.fit()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the returned {class}`~ray.train.Result` object to access metrics and the Ray AIR {class}`~ray.train.Checkpoint` associated with the last iteration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TransformersCheckpoint(local_path=/home/ray/ray_results/TransformersTrainer_2023-03-06_16-35-29/TransformersTrainer_f623d_00000_0_2023-03-06_16-35-30/checkpoint_000000)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checkpoint = results.checkpoint\n", + "checkpoint" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate text from prompt\n", + "\n", + "We can use the {class}`~ray.train.huggingface.huggingface_predictor.TransformersPredictor` to generate predictions from our fine-tuned model.\n", + "\n", + "```{tip}\n", + "For large scale batch inference, see {ref}`End-to-end: Offline Batch Inference `.\n", + "```\n", + "\n", + "Because the {class}`~ray.train.huggingface.huggingface_predictor.TransformersPredictor` uses a 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) under the hood, we disable the tokenizer AIR Preprocessor we have used for training and let the `pipeline` to tokenize the data itself." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint.set_preprocessor(None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also set `device_map=\"auto\"` so that the model is automatically placed on the right device and set the `task` to `\"text-generation\"`. The `predict` method passes the arguments to a 🤗 Transformers `pipeline` call." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import TransformersPredictor\n", + "import pandas as pd\n", + "\n", + "prompts = pd.DataFrame([\"Romeo and Juliet\", \"Romeo\", \"Juliet\"], columns=[\"text\"])\n", + "\n", + "# Predict on the head node.\n", + "predictor = TransformersPredictor.from_checkpoint(\n", + " checkpoint=checkpoint,\n", + " task=\"text-generation\",\n", + " torch_dtype=torch.float16 if use_gpu else None,\n", + " device_map=\"auto\",\n", + " use_gpu=use_gpu,\n", + ")\n", + "prediction = predictor.predict(\n", + " prompts,\n", + " do_sample=True,\n", + " temperature=0.9,\n", + " min_length=32,\n", + " max_length=128,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
generated_text
0Romeo and Juliet, they are married: and it is ...
1Romeo, thou art Romeo and a Montague; for only...
2Juliet's name; but I do not sound an ear to na...
\n", + "
" + ], + "text/plain": [ + " generated_text\n", + "0 Romeo and Juliet, they are married: and it is ...\n", + "1 Romeo, thou art Romeo and a Montague; for only...\n", + "2 Juliet's name; but I do not sound an ear to na..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "orphan": true, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb deleted file mode 120000 index 1c219bcfcb468..0000000000000 --- a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb \ No newline at end of file diff --git a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb new file mode 100644 index 0000000000000..06f3d5fc35fae --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], 
+ "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb deleted file mode 120000 index ccd34dcfc22fa..0000000000000 --- a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb \ No newline at end of file diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb new file mode 100644 index 0000000000000..74afc02ff0b08 --- /dev/null +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -0,0 +1,1425 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(vicuna_lightning_deepspeed_finetuning)=\n", + "\n", + "# Fine-tune `vicuna-13b` with Ray LightningTrainer and DeepSpeed\n", + "\n", + "In this example, we will demonstrate how to perform full fine-tuning for a [`vicuna-13b-v1.3`](https://huggingface.co/lmsys/vicuna-13b-v1.3) model using LightningTrainer with the DeepSpeed ZeRO-3 strategy.\n", + "\n", + "- [DeepSpeed]() is an open-source deep learning optimization library for PyTorch. 
It's designed to reduce computing power and memory usage, and to train large distributed models by leveraging state-of-the-art innovations like ZeRO, 3D-Parallelism, DeepSpeed-MoE, and ZeRO-Infinity. \n", + "- PyTorch Lightning offers a [DeepSpeed integration](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), which provides a simple interface to configure the knobs for DeepSpeed and automatically trigger your training process with the DeepSpeed Engine.\n", + "- {class}`Ray LightningTrainer ` allows you to easily scale your PyTorch Lightning job across multiple nodes in a Ray cluster, without worrying about the underlying cluster management, autoscaling, and distributed process group settings.\n", + "\n", + "Our demo aims to illustrate how these three tools can be combined effectively to finetune the Vicuna-13B model, leveraging the strengths of each to create an efficient and high-performance deep learning solution.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{note}\n", + "This is an advanced example of Large Language Model fine-tuning with Ray Train. If you're a beginner or new to the concepts of Ray Train and LightningTrainer, it would be beneficial to first explore the introductory documentation below to build a foundational understanding. 
\n", + "- [Ray Train Key Concepts](train-key-concepts) \n", + "- [Ray Data Key Concepts](data_key_concepts)\n", + "- {ref}`[Basic] Image Classification with LightningTrainer `\n", + "- {ref}`[Intermediate] Using LightningTrainer with Ray Data `\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cluster Setting\n", + "\n", + "\n", + "### Compute instances\n", + "In this example, we set up a Ray cluster on AWS with the following settings:\n", + "\n", + "| | num | instance type | GPU per node | GPU Memory | CPU Memory |\n", + "|-|-|-|-|-|-|\n", + "|Head node|1|g5.16xlarge|1 x A10G | 24 GB | 256 GB|\n", + "|Worker node|15|g5.4xlarge|1 x A10G | 24 GB | 64 GB|\n", + "\n", + "```{note}\n", + "In this example, we used 16 A10G GPUs for model training and tuned the DeepSpeed configurations for this setup. If you have a different cluster setup or GPUs with lower memory capacities, you may need to modify the DeepSpeed configurations and batch size to fit the model into the GPUs.\n", + "```\n", + "\n", + "```{tip}\n", + "We selected a GPU instance with additional CPU memory for the head node to demonstrate single-node offline inference. If you are training only, you can still opt for the g5.4xlarge instance for the head node.\n", + "```\n", + "\n", + "\n", + "### Cloud Storage\n", + "\n", + "Additionally, since the checkpoint size for this 13B parameter model can be large (~140GB), we choose to store the checkpoints in AWS S3. Thanks to the newly introduced distributed checkpointing feature in Ray 2.5, each worker can upload its own shards individually to the S3 bucket, greatly reducing the latency and network traffic of checkpoint syncing.\n", + "\n", + "### Local Storage\n", + "To demonstrate offline inference, we need to download and consolidate the model checkpoint onto the head node. This action requires around 200GB disk storage. 
Therefore, we mounted the NVMe SSD provided by g5 instances at `/dev/nvme1n1` to `/mnt/local_storage`, and we will save the checkpoints in this folder.\n", + "\n", + "For more details, please refer to [Amazon EBS and NVMe on Linux instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Ray Environment\n", + "\n", + "We define a runtime environment to ensure that the Ray workers have access to all necessary packages. If you have already included these dependencies in your Docker image or installed them on each node, you can ignore the `runtime_env` argument.\n", + "\n", + "```{note}\n", + "Note that the codebases of `transformers`, `accelerate`, and `deepspeed` are all rapidly changing, so we have pinned the package versions here to ensure testing stability. You can try other version combinations and feel free to report any issues you encounter.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "NUM_WORKERS = 16\n", + "BATCH_SIZE_PER_WORKER = 8\n", + "MODEL_NAME = \"lmsys/vicuna-13b-v1.3\"\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets==2.13.1\",\n", + " \"torch>=1.13.0\",\n", + " \"deepspeed==0.9.4\",\n", + " \"accelerate==0.20.3\",\n", + " \"transformers==4.30.2\",\n", + " \"pytorch_lightning==2.0.3\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and preprocess datasets\n", + "\n", + "We were impressed by LLM's ability of zero-shot text-generation, while some LLMs may not perform well in code generation due to the lack of code in the training corpus. The CMU [CoNaLa](https://conala-corpus.github.io/) (The Code/Natural Language Challenge) was designed to test systems for generating program snippets from natural language. 
Each data record contains an intent sentence and a one-line code snippet. The goal is to fine-tune the Vicuna model on this dataset, enabling the model to generate correct and runnable code snippets, thereby achieving natural language intent. Here are some examples:\n", + "\n", + "| intent | code snippet |\n", + "| - | - |\n", + "| \"convert a list of integers into a single integer\" | `r = int(''.join(map(str, x)))`|\n", + "| \"normalize a pandas dataframe `df` by row\" | `df.div(df.sum(axis=1), axis=0)` | \n", + "| \"Convert string '03:55' into datetime.time object\" | `datetime.datetime.strptime('03:55', '%H:%M').time()` |\n", + "\n", + "The CoNaLa team has released a dataset crawled from Stack Overflow, automatically filtered, then curated by annotators, split into 2379 training and 500 test examples. In addition, they also included an automatically-mined dataset with 600k examples. In this demo, we take all the curated data and the top 5000 mined data for fine-tuning." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we preprocess the CoNaLa dataset with Ray Data. You can also use HuggingFace Datasets and pass it directly to `LightningConfigBuilder.fit_params()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "535afe3e183b4cdfa61c39cbae788608", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00\",\n", + " axis=1,\n", + " )\n", + " return batch[[\"input_sentence\"]]\n", + "\n", + "\n", + "# Tokenize input sentences to tensors\n", + "def tokenize(batch):\n", + " tokenizer = AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, padding_side=\"left\", use_fast=False\n", + " )\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"input_sentence\"]),\n", + " truncation=True,\n", + " max_length=128,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "# Preprocess train dataset\n", + "processed_ds = ray_ds.map_batches(fill_prompt, batch_format=\"pandas\").map_batches(tokenize, batch_format=\"pandas\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define your model\n", + "\n", + "Here we load the pre-trained model weights from HuggingFace Model Hub, and wrap them into `pl.LightningModule`. We adopted the efficient model initialization techniques introduced in [Lightning-transformers](https://github.com/Lightning-Universe/lightning-transformers) to avoid unnecessary full weights loading." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-30 17:39:35,109] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + } + ], + "source": [ + "import torch\n", + "import transformers\n", + "import pytorch_lightning as pl\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM\n", + "from deepspeed.ops.adam import DeepSpeedCPUAdam\n", + "\n", + "\n", + "class ZeRO3Config:\n", + " def __init__(self, pl_module):\n", + " self.config = pl_module.trainer.strategy.config\n", + "\n", + " def __call__(self, *args, **kwargs):\n", + " return self\n", + "\n", + " def is_zero3(self) -> bool:\n", + " return True\n", + "\n", + "\n", + "def enable_transformers_pretrained_deepspeed_sharding(\n", + " pl_module: \"pl.LightningModule\",\n", + ") -> None:\n", + " transformers.deepspeed._hf_deepspeed_config_weak_ref = ZeRO3Config(pl_module)\n", + "\n", + "\n", + "class Vicuna13BModel(pl.LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " def setup(self, stage) -> None:\n", + " # Defer model initialization to inject deepspeed configs to HF.\n", + " # During initialization, HF transformers can immediately partition \n", + " # the model across all gpus avoid the overhead in time and memory \n", + " # copying it on CPU or each GPU first.\n", + " enable_transformers_pretrained_deepspeed_sharding(self)\n", + " self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n", + " if self.global_rank == 0:\n", + " print(\"DeepSpeed Configs: \", self.trainer.strategy.config)\n", + " print(\"Model Archetecture: \", self.model)\n", + "\n", + " def forward(self, batch):\n", + " outputs = self.model(\n", + " batch[\"input_ids\"],\n", + " labels=batch[\"labels\"],\n", + " 
attention_mask=batch[\"attention_mask\"],\n", + " )\n", + " return outputs.loss\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self.forward(batch)\n", + " self.log(\"train_loss\", loss, prog_bar=True, on_step=True, sync_dist=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return DeepSpeedCPUAdam(self.parameters(), lr=2e-5, weight_decay=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Configurations\n", + "\n", + "Before training, let's calculate the memory usage of finetuning a `vicuna-13b` model. Assume we are using FP16 mixed-precision training, and the optimizer is Adam with FP32 states.\n", + "\n", + "- Model parameters: 13(billion parameters) * 2(FP16) ≈ 26GB\n", + "- Optimizer states: 13(billion parameters) * 2(momentums per param) * 4 (FP32) ≈ 52GB\n", + "\n", + "As we can see, the model parameters themselves require 26GB, which cannot fit in a single A10G GPU, let alone the activations and optimizers states. Here, we use ZeRO stage-3 to partition the model, gradients, and optimizer states across 16 nodes. Additionally, we employ optimizer CPU offloading to reduce GRAM usage and increase throughput with larger batch sizes. We also disabled parameter offloading and activation checkpointing to improve the training speed.\n", + "\n", + "Regarding other knobs such as `reduce_bucket_size`, `stage3_prefetch_bucket_size` and `stage3_param_persistence_threshold`, we kept them as the [default values in HuggingFace](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero3-config). Feel free to further adjust them to speed up the training process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.lightning import LightningTrainer, LightningConfigBuilder\n", + "from transformers import AutoConfig\n", + "\n", + "config = AutoConfig.from_pretrained(MODEL_NAME)\n", + "HIDDEN_SIZE = config.hidden_size\n", + "\n", + "deepspeed_configs = {\n", + " \"zero_allow_untested_optimizer\": True,\n", + " \"bf16\": {\"enabled\": True},\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\"device\": \"cpu\", \"pin_memory\": True},\n", + " \"overlap_comm\": True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_prefetch_bucket_size\": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_param_persistence_threshold\": 10 * HIDDEN_SIZE,\n", + " },\n", + "}\n", + "\n", + "lightning_config = (\n", + " LightningConfigBuilder()\n", + " .module(cls=Vicuna13BModel)\n", + " .trainer(\n", + " max_epochs=1,\n", + " accelerator=\"gpu\",\n", + " precision=\"bf16-mixed\",\n", + " accumulate_grad_batches=2,\n", + " )\n", + " .strategy(name=\"deepspeed\", config=deepspeed_configs)\n", + " .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import TQDMProgressBar\n", + "\n", + "# Create a customized progress bar for LightningTrainer\n", + "class VicunaProgressBar(TQDMProgressBar):\n", + " def __init__(self, num_iters_per_epoch, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.num_iters_per_epoch = num_iters_per_epoch\n", + "\n", + " def on_train_epoch_start(self, trainer, *_):\n", + " super().on_train_epoch_start(trainer, *_)\n", + " self.train_progress_bar.reset(self.num_iters_per_epoch)\n", + "\n", + "\n", + "total_batches = 
processed_ds.count()\n", + "num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)\n", + "progress_bar = VicunaProgressBar(num_iters_per_epoch)\n", + "\n", + "\n", + "lightning_config.trainer(\n", + " callbacks=[progress_bar],\n", + " # Take a subset to accelerate release tests\n", + " limit_train_batches=20,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, combine all the configurations with {class}`LightningConfigBuilder ` and instantiate a LightningTrainer. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.air.config import CheckpointConfig, RunConfig, ScalingConfig\n", + "\n", + "trainer = LightningTrainer(\n", + " lightning_config=lightning_config.build(),\n", + " run_config=RunConfig(\n", + " name=\"vicuna-13b-finetune\",\n", + " storage_path=\"s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/air-release-tests\",\n", + " checkpoint_config=CheckpointConfig(\n", + " num_to_keep=1,\n", + " # Enable distributed checkpointing\n", + " _checkpoint_keep_all_ranks=True,\n", + " _checkpoint_upload_from_workers=True,\n", + " ),\n", + " ),\n", + " scaling_config=ScalingConfig(\n", + " num_workers=NUM_WORKERS,\n", + " use_gpu=True,\n", + " resources_per_worker={\"CPU\": 15, \"GPU\": 1},\n", + " ),\n", + " datasets={\"train\": processed_ds},\n", + " datasets_iter_config={\"batch_size\": BATCH_SIZE_PER_WORKER},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{tip}\n", + "\n", + "Here, we highly recommend saving checkpoints with cloud storage and enabling distributed checkpointing by setting `_checkpoint_keep_all_ranks` and `_checkpoint_upload_from_workers` to True when training huge models. 
Otherwise, all checkpoint shards will be synced to the head node, which may introduce enormous syncing overhead and even cause out-of-memory.\n", + "\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Fine-tuning\n", + "\n", + "Once everything is configured in LightningTrainer, training becomes easy. Simply call `trainer.fit()`, and your workload will be scaled to the Ray cluster, initiating ZeRO-3 parallel training." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-06-30 18:21:59
Running for: 00:42:22.75
Memory: 10.7/249.1 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 241.0/304 CPUs, 16.0/16 GPUs (0.0/16.0 accelerator_type:A10G)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) train_loss epoch step
LightningTrainer_c1544_00000TERMINATED10.0.55.20:134103 1 2473.94 0.523438 0 29
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(pid=134103)\u001B[0m [2023-06-30 17:39:41,637] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \u001B[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m \n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001B[0m\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Starting distributed worker processes: ['134267 (10.0.55.20)', '74152 (10.0.63.141)', '75476 (10.0.51.205)', '75547 (10.0.42.158)', '74711 (10.0.45.211)', '75132 (10.0.20.140)', '74502 (10.0.60.86)', '75695 (10.0.53.69)', '74457 (10.0.47.2)', '74569 (10.0.33.23)', '74341 (10.0.29.61)', '74274 (10.0.36.152)', '74561 (10.0.35.16)', '74427 (10.0.16.236)', '74273 (10.0.54.55)', '74996 (10.0.9.249)']\n", + "\u001B[2m\u001B[36m(RayTrainWorker pid=134267)\u001B[0m Setting up process group for: env:// [rank=0, world_size=16]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Executing DAG InputDataBuffer[Input] -> 
TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001B[2m\u001B[36m(LightningTrainer pid=134103)\u001B[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da7f200767b448d7b409fcdd07daecce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=134103) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00.*<' in xml string `line`\",\n", + " },\n", + " {\n", + " \"intent\": \"send a signal `signal.SIGUSR1` to the current process\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's begin by examining the generated outputs without fine-tuning. In this case study, we utilize [Aviary Explorer](https://aviary.anyscale.com), an open-source multi-LLM serving platform supported by Ray and Anyscale. You can easily select from a variety of open-source LLMs and compare their generation quality, cost, latency, and many other metrics.\n", + "\n", + "We constructed a prompt in a zero-shot learning manner and feed it into 3 OSS LLMs.\n", + "\n", + "![](https://user-images.githubusercontent.com/26745457/250704232-65a20f1b-6752-4d6c-bba1-8296a373162f.png)\n", + "\n", + "\n", + "- `vicuna-13b-v1.3` begins to speak Chinese.\n", + "- `mpt-7b-chat` generates a reasonable code snippet, but with multiple lines.\n", + "- `falcon-7b-sft` generates a one line snippet, but it doesn't seem to work.\n", + "\n", + "As we can see, none of them generate a satisfactory code snippet. 
\n", + "\n", + "Now let's check the performance of our fine-tuned `vicuna-13b-v1.3` model:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/pipelines/base.py:1081: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intent: replace white spaces in colunm 'col' of dataframe `df` with '_'\n", + "One-line code snippet: `df['col'] = df['col'].str.replace(' ', '_')`\n", + "\n", + "Intent: search for occurrences of regex pattern '>.*<' in xml string `line`\n", + "One-line code snippet: `re.findall('>.*<', line)``\n", + "\n", + "Intent: send a signal `signal.SIGUSR1` to the current process\n", + "One-line code snippet: `os.kill(os.getpid(), signal.SIGUSR1)``\n" + ] + } + ], + "source": [ + "for case in testcases:\n", + " prompt = PROMPT_TEMPLATE.format(intent=case[\"intent\"], snippet=\"\")\n", + " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", + " print(output[0][\"generated_text\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the Generated Code Snippets\n", + "\n", + "The generated code snippets look pretty reasonable. The results covered Pandas operations, regular expressions, and Linux commands. Let's test them one by one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before\n", + " col\n", + "0 abc def ghi\n", + "1 12 3 456\n", + "2 \n", + "After\n", + " col\n", + "0 abc_def_ghi\n", + "1 _12_3_456\n", + "2 _____\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame.from_dict({\"col\": [\"abc def ghi\", \" 12 3 456\", \" \"]})\n", + "print(\"Before\\n\", df)\n", + "\n", + "df[\"col\"] = df[\"col\"].str.replace(\" \", \"_\")\n", + "print(\"After\\n\", df)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['>The Great Gatsby<',\n", + " '>F. Scott Fitzgerald<',\n", + " '>1925<',\n", + " '>Sapiens: A Brief History of Humankind<',\n", + " '>Yuval Noah Harari<',\n", + " '>2011<']" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "\n", + "line = \"\"\"\n", + "\n", + " \n", + " The Great Gatsby\n", + " F. 
Scott Fitzgerald\n", + " 1925\n", + " \n", + " \n", + " Sapiens: A Brief History of Humankind\n", + " Yuval Noah Harari\n", + " 2011\n", + " \n", + "\n", + "\"\"\"\n", + "re.findall(\">.*<\", line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's hand it over to LLM and let it wrap up the demo:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, signal\n", + "\n", + "os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References:\n", + "\n", + "- [CoNaLa: The Code/Natural Language Challenge](https://conala-corpus.github.io/)\n", + "- [HuggingFace: DeepSpeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration)\n", + "- [HuggingFace: Handling big models for inference](https://huggingface.co/docs/accelerate/main/usage_guides/big_modeling)\n", + "- [Lightning Transformers: DeepSpeed Training with Big Transformer Models](https://lightning-transformers.readthedocs.io/en/latest/)\n", + "- [Aviary: Open Source Multi-LLM Serving](https://www.anyscale.com/blog/announcing-aviary-open-source-multi-llm-serving-solution)\n", + "- Rajbhandari, S., Rasley, J., et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054)\n", + "- Zheng, L., Chiang, W-L., Sheng, Y., et al. (2023). Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. 
[arXiv:2306.05685](https://arxiv.org/abs/2306.05685)\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}