[RLlib] RLTrainer is all you need. (#31490)
moved rl_optimizer logic into rl_trainer

Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
kouroshHakha authored Jan 6, 2023
1 parent f54897a commit 4e234b7
Showing 11 changed files with 273 additions and 269 deletions.
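Editor's note on the commit message, in miniature: the per-module optimizer bookkeeping that previously lived in a separate RLOptimizer object now lives in the trainer itself. The sketch below illustrates that shape only; the class names and the toy update rule are hypothetical and are not RLlib's actual API.

class ToyModule:
    """Stands in for an RLModule: just holds named trainable parameters."""

    def __init__(self):
        self.weights = {"policy": 0.0}


class ToyTrainer:
    """The trainer owns optimizer construction and the update loop itself,
    rather than delegating both to a separately constructed optimizer object."""

    def __init__(self, module, lr=0.1):
        self.module = module
        # Optimizer state is built inside the trainer (one entry per sub-module).
        self.lr = {"policy": lr}

    def compute_gradients(self, batch):
        # Toy "gradient": pull each weight toward the batch mean.
        target = sum(batch) / len(batch)
        return {key: w - target for key, w in self.module.weights.items()}

    def apply_gradients(self, grads):
        for key, grad in grads.items():
            self.module.weights[key] -= self.lr[key] * grad

    def update(self, batch):
        grads = self.compute_gradients(batch)
        self.apply_gradients(grads)
        return {"total_loss": sum(g * g for g in grads.values())}


trainer = ToyTrainer(ToyModule())
print(trainer.update([1.0, 2.0, 3.0]))  # {'total_loss': 4.0}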
47 changes: 8 additions & 39 deletions rllib/core/optim/tests/test_rl_optimizer_tf.py
@@ -3,21 +3,16 @@
import gymnasium as gym
from typing import Any, Mapping, Union
import unittest
+import pytest

import ray
-from ray.rllib.algorithms import AlgorithmConfig
-from ray.rllib.offline import IOContext
-from ray.rllib.offline.dataset_reader import (
-DatasetReader,
-get_dataset_and_shards,
-)
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule
from ray.rllib.core.testing.tf.bc_optimizer import BCTFOptimizer
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.nested_dict import NestedDict
from ray.rllib.utils.numpy import convert_to_numpy
-from ray.rllib.utils.test_utils import check
+from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader
from ray.rllib.utils.typing import TensorType

tf1, tf, tfv = try_import_tf()
@@ -132,14 +127,14 @@ def compute_gradients(
Returns:
A dictionary of extra information and statistics.
"""
-grads = tape.gradient(loss["total_loss"], self._module.trainable_variables())
+grads = tape.gradient(loss["total_loss"], self._module.trainable_variables)
return grads

def apply_gradients(self, gradients: Mapping[str, Any]) -> None:
"""Perform an update on self._module"""
for key, optimizer in self._rl_optimizer.get_optimizers().items():
optimizer.apply_gradients(
-zip(gradients[key], self._module.trainable_variables()[key])
+zip(gradients[key], self._module.trainable_variables[key])
)
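The dropped parentheses in the two changed lines above track the removal of the custom trainable_variables() hook from TfRLModule (last file in this diff): Keras models and tf.Module objects already expose their trainable variables as a property. A standalone TF2 illustration of the same tape/apply pattern, separate from this test's BCTFTrainer:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

x = tf.random.normal((8, 4))
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x)))

# trainable_variables is a property, not a method, on Keras/tf.Module objects.
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))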

def set_state(self, state: Mapping[str, Any]) -> None:
@@ -164,27 +159,14 @@ def setUpClass(cls) -> None:
def tearDownClass(cls) -> None:
ray.shutdown()

+@pytest.mark.skip
def test_rl_optimizer_in_behavioral_cloning_tf(self):
tf.random.set_seed(1)
env = gym.make("CartPole-v1")
trainer = BCTFTrainer(env)

# path = "s3://air-example-data/rllib/cartpole/large.json"
path = "tests/data/cartpole/large.json"
input_config = {"format": "json", "paths": path}
dataset, _ = get_dataset_and_shards(
AlgorithmConfig().offline_data(input_="dataset", input_config=input_config)
)
batch_size = 500
ioctx = IOContext(
config=(
AlgorithmConfig()
.training(train_batch_size=batch_size)
.offline_data(actions_in_input_normalized=True)
),
worker_index=0,
)
reader = DatasetReader(dataset, ioctx)
reader = get_cartpole_dataset_reader(batch_size=batch_size)
num_epochs = 100
total_timesteps_of_training = 1000000
inter_steps = total_timesteps_of_training // (num_epochs * batch_size)
@@ -198,28 +180,15 @@ def test_rl_optimizer_in_behavioral_cloning_tf(self):
# 0.57 the return of the policy gets to around 100.
self.assertLess(results["total_loss"], 0.57)

+@pytest.mark.skip
def test_rl_optimizer_set_state_get_state_tf(self):
env = gym.make("CartPole-v1")

trainer1 = BCTFTrainer(env)
trainer2 = BCTFTrainer(env)

# path = "s3://air-example-data/rllib/cartpole/large.json"
path = "tests/data/cartpole/large.json"
input_config = {"format": "json", "paths": path}
dataset, _ = get_dataset_and_shards(
AlgorithmConfig().offline_data(input_="dataset", input_config=input_config)
)
batch_size = 500
ioctx = IOContext(
config=(
AlgorithmConfig()
.training(train_batch_size=batch_size)
.offline_data(actions_in_input_normalized=True)
),
worker_index=0,
)
reader = DatasetReader(dataset, ioctx)
reader = get_cartpole_dataset_reader(batch_size=batch_size)
batch = reader.next()

trainer1.update(batch)
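The block this test file (and the torch test below) used to build by hand, an AlgorithmConfig, an IOContext, and a DatasetReader over the CartPole JSON data, is now wrapped by the get_cartpole_dataset_reader test utility. A sketch of what such a helper can look like, assembled from the deleted lines above; the actual implementation in ray.rllib.utils.test_utils may differ in signature and defaults:

from ray.rllib.algorithms import AlgorithmConfig
from ray.rllib.offline import IOContext
from ray.rllib.offline.dataset_reader import DatasetReader, get_dataset_and_shards


def get_cartpole_dataset_reader(batch_size: int = 1) -> DatasetReader:
    """Return a DatasetReader over the offline CartPole data (sketch)."""
    path = "tests/data/cartpole/large.json"
    input_config = {"format": "json", "paths": path}
    dataset, _ = get_dataset_and_shards(
        AlgorithmConfig().offline_data(input_="dataset", input_config=input_config)
    )
    ioctx = IOContext(
        config=(
            AlgorithmConfig()
            .training(train_batch_size=batch_size)
            .offline_data(actions_in_input_normalized=True)
        ),
        worker_index=0,
    )
    return DatasetReader(dataset, ioctx)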
26 changes: 4 additions & 22 deletions rllib/core/optim/tests/test_rl_optimizer_torch.py
@@ -8,18 +8,12 @@

import ray

-from ray.rllib.algorithms import AlgorithmConfig
-from ray.rllib.offline import IOContext
-from ray.rllib.offline.dataset_reader import (
-DatasetReader,
-get_dataset_and_shards,
-)
from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule
from ray.rllib.core.testing.torch.bc_optimizer import BCTorchOptimizer
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.nested_dict import NestedDict
from ray.rllib.utils.numpy import convert_to_numpy
-from ray.rllib.utils.test_utils import check
+from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader
from ray.rllib.utils.torch_utils import convert_to_torch_tensor
from ray.rllib.utils.typing import TensorType

@@ -181,6 +175,7 @@ def setUpClass(cls) -> None:
def tearDownClass(cls) -> None:
ray.shutdown()

+@pytest.mark.skip
def test_rl_optimizer_in_behavioral_cloning_torch(self):
torch.manual_seed(1)
env = gym.make("CartPole-v1")
@@ -193,22 +188,8 @@ def test_rl_optimizer_in_behavioral_cloning_torch(self):
trainer = BCTorchTrainer(env)
trainer.set_state({"module_state": module_for_inference.get_state()})

# path = "s3://air-example-data/rllib/cartpole/large.json"
path = "tests/data/cartpole/large.json"
input_config = {"format": "json", "paths": path}
dataset, _ = get_dataset_and_shards(
AlgorithmConfig().offline_data(input_="dataset", input_config=input_config)
)
batch_size = 500
ioctx = IOContext(
config=(
AlgorithmConfig()
.training(train_batch_size=batch_size)
.offline_data(actions_in_input_normalized=True)
),
worker_index=0,
)
reader = DatasetReader(dataset, ioctx)
reader = get_cartpole_dataset_reader(batch_size=batch_size)
num_epochs = 100
total_timesteps_of_training = 1000000
inter_steps = total_timesteps_of_training // (num_epochs * batch_size)
@@ -222,6 +203,7 @@ def test_rl_optimizer_set_state_get_state_torch(self):
# 0.57 the return of the policy gets to around 100.
self.assertLess(results["total_loss"], 0.57)

+@pytest.mark.skip
def test_rl_optimizer_set_state_get_state_torch(self):
env = gym.make("CartPole-v1")
module = DiscreteBCTorchModule.from_model_config(
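For orientation, the plain torch analogue of the gradient flow these trainers wrap, a standalone illustration rather than the BCTorchTrainer's actual code:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(8, 4)
loss = model(x).pow(2).mean()

optimizer.zero_grad()
loss.backward()   # populate .grad on each parameter
optimizer.step()  # apply the update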
4 changes: 2 additions & 2 deletions rllib/core/rl_module/tf/tests/test_tf_rl_module.py
@@ -48,10 +48,10 @@ def test_forward_train(self):
self.assertIn("action_dist", output)
self.assertIsInstance(output["action_dist"], tfp.distributions.Categorical)

-grads = tape.gradient(loss, module.trainable_variables())
+grads = tape.gradient(loss, module.trainable_variables)

# check that all neural net parameters have gradients
-for grad in grads["policy"]:
+for grad in grads:
self.assertIsNotNone(grad)

def test_forward(self):
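The second change in this test, iterating over grads directly instead of grads["policy"], follows from the first: tape.gradient mirrors the structure of its sources argument, so passing the flat trainable_variables list gives back a flat list of gradients rather than a dict keyed by sub-module. A minimal, RLlib-independent demonstration:

import tensorflow as tf

v1 = tf.Variable(1.0)
v2 = tf.Variable(2.0)

with tf.GradientTape() as tape:
    loss = v1 * v2

# A list of sources in, a list of gradients out, in the same order.
grads = tape.gradient(loss, [v1, v2])
print([g.numpy() for g in grads])  # [2.0, 1.0]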
15 changes: 0 additions & 15 deletions rllib/core/rl_module/tf/tf_rl_module.py
@@ -1,10 +1,8 @@
-import abc
from typing import Any, Mapping

from ray.rllib.core.rl_module import RLModule
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf
-from ray.rllib.utils.nested_dict import NestedDict


_, tf, _ = try_import_tf()
@@ -61,16 +59,3 @@ def is_distributed(self) -> bool:
"""Returns True if the module is distributed."""
# TODO (Avnish): Implement this.
return False

-@abc.abstractmethod
-def trainable_variables(self) -> NestedDict[tf.Tensor]:
-"""Returns the trainable variables of the module.
-Example:
-`return {"module": module.trainable_variables}`
-Note:
-See tensorflow.org/guide/autodiff#gradients_with_respect_to_a_model
-for more details.
-"""
(Diffs for the remaining changed files are not shown.)
