diff --git a/.github/Aircraft.gif b/.github/Aircraft.gif
new file mode 100644
index 000000000..863b39499
Binary files /dev/null and b/.github/Aircraft.gif differ
diff --git a/.github/Breakout.gif b/.github/Breakout.gif
new file mode 100644
index 000000000..da5beb4f6
Binary files /dev/null and b/.github/Breakout.gif differ
diff --git a/.github/Half-Cheetah.gif b/.github/Half-Cheetah.gif
new file mode 100644
index 000000000..d8d76f155
Binary files /dev/null and b/.github/Half-Cheetah.gif differ
diff --git a/.github/NeurlIPS2018.gif b/.github/NeurlIPS2018.gif
new file mode 100644
index 000000000..0fbc94dac
Binary files /dev/null and b/.github/NeurlIPS2018.gif differ
diff --git a/README.md b/README.md
index 3d7b02797..fbdab3322 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ agent = AtariAgent(algorithm)
# Install:
### Dependencies
- Python 2.7 or 3.5+.
-- PaddlePaddle >=1.2.1 (We try to make our repository always compatible with newest version PaddlePaddle)
+- PaddlePaddle >=1.2.1 (We try to keep our repository compatible with the latest version of PaddlePaddle)
```
@@ -80,3 +80,7 @@ pip install --upgrade git+https://github.com/PaddlePaddle/PARL.git
- [DDPG](examples/DDPG/)
- [PPO](examples/PPO/)
- [Winning Solution for NIPS2018: AI for Prosthetics Challenge](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)
+
+
+
+
diff --git a/docs/ct.png b/docs/ct.png
deleted file mode 100644
index ec9261750..000000000
Binary files a/docs/ct.png and /dev/null differ
diff --git a/docs/design_doc.md b/docs/design_doc.md
deleted file mode 100644
index 9513ed143..000000000
--- a/docs/design_doc.md
+++ /dev/null
@@ -1,301 +0,0 @@
-# PaddlePaddle Reinforcement Learning Framework (PARL)
-This is the design doc for [PARL](https://github.com/PaddlePaddle/PARL):a general-purpose RL platform based on PaddlePaddle Fluid.
-
-## Problem description
-
-> A robot is an intelligent entity that situates in an environment. At every time step, it receives multimodal sensory inputs and generates (possibly multimodal) action outputs according to a certain set of rewards or goals, given the current (partial) environment observation.
-
-Almost any RL problem can be seen in the above perspective. We can always convert a problem description to a one similar to the above (namely, where a robot takes actions in an environment at every time step and receives rewards).
-
-For example, in the scenario of learning to put only one advertisement on each webpage, you can think of our decision-making algorithm as a robot, the user's online profile, browsing history, and some other contextual information as the environment observation, which advertisement to put on the webpage as the robot's action, and whether the user clicks on the advertisement as the reward. The time horizon in this case would then be a sequence of webpages.
-
-So it would be helpful to discuss any RL problem in this unified description.
-
-#### Action step vs. time step
-We need to differentiate between an *action step* and a *time step*.
-
-- A time step is a time interval defined by the environment simulator, which is the minimal temporal unit for simulation. Each time step yields a new environment observation.
-- An action step is always a multiple of a time step. If an actor has an action repetition of $K$, then an action step spans over $K$ time steps.
-
-When $K>1$, between two adjacent action steps there are several environment observations. Usually the agent only gets the last environment observation while discarding the first $K-1$ ones. Below is a diagram for $K=5$.
-
-
@@ -60,7 +60,7 @@ For final submission, we test our model in 500 CPUs, running 10 episodes per CPU
python simulator_server.py --port [PORT] --ensemble_num 1
# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type RunFastest
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type RunFastest
```
#### 2. Target: run at 3.0 m/s
@@ -71,7 +71,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [RunFastest model]
# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 3.0 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 3.0 \
--act_penalty_lowerbound 1.5
```
@@ -83,7 +83,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [FixedTargetSpeed 3.0m/s model]
# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 2.0 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 2.0 \
--act_penalty_lowerbound 0.75
```
@@ -99,7 +99,7 @@ python simulator_server.py --port [PORT] --ensemble_num 1 --warm_start_batchs 10
--restore_model_path [FixedTargetSpeed 2.0m/s model]
# client (Suggest: 200+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type FixedTargetSpeed --target_v 1.25 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type FixedTargetSpeed --target_v 1.25 \
--act_penalty_lowerbound 0.6
```
@@ -109,10 +109,10 @@ As mentioned before, the selection of model that used to fine-tune influence lat
```bash
# server
python simulator_server.py --port [PORT] --ensemble_num 12 --warm_start_batchs 1000 \
- --restore_model_path [FixedTargetSpeed 1.25m/s] --restore_from_one_head
+ --restore_model_path [FixedTargetSpeed 1.25m/s model] --restore_from_one_head
# client (Suggest: 100+ clients)
-python simulator_client.py --port [PORT] --ip [IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \
+python simulator_client.py --port [PORT] --ip [SERVER_IP] --reward_type Round2 --act_penalty_lowerbound 0.75 \
--act_penalty_coeff 7.0 --vel_penalty_coeff 20.0 --discrete_data --stage 3
```
diff --git a/examples/PPO/README.md b/examples/PPO/README.md
index 6190f71ff..c78a21691 100644
--- a/examples/PPO/README.md
+++ b/examples/PPO/README.md
@@ -16,8 +16,9 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
## How to use
### Dependencies:
-+ python2.7 or python3.5+
++ python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
++ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
+ tqdm
+ mujoco-py>=1.50.1.0
diff --git a/examples/PPO/mujoco_agent.py b/examples/PPO/mujoco_agent.py
index a1221c642..80126fe3b 100644
--- a/examples/PPO/mujoco_agent.py
+++ b/examples/PPO/mujoco_agent.py
@@ -15,7 +15,6 @@
import numpy as np
import parl.layers as layers
from paddle import fluid
-from sklearn.utils import shuffle
from parl.framework.agent_base import Agent
from parl.utils import logger
@@ -183,12 +182,16 @@ def value_learn(self, obs, value):
all_loss = []
for _ in range(self.value_learn_times):
- obs_train, value_train = shuffle(obs_train, value_train)
+ random_ids = np.arange(obs_train.shape[0])
+ np.random.shuffle(random_ids)
+ shuffle_obs_train = obs_train[random_ids]
+ shuffle_value_train = value_train[random_ids]
start = 0
while start < data_size:
end = start + self.value_batch_size
- value_loss = self._batch_value_learn(obs_train[start:end, :],
- value_train[start:end])
+ value_loss = self._batch_value_learn(
+ shuffle_obs_train[start:end, :],
+ shuffle_value_train[start:end])
all_loss.append(value_loss)
start += self.value_batch_size
return np.mean(all_loss)
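The hunk above drops the sklearn.utils.shuffle dependency in favor of a NumPy index permutation, so the observation and value arrays are shuffled with one shared row order and stay paired. A minimal standalone sketch of that technique (the helper name shuffle_in_unison is illustrative, not part of the patch):

```python
import numpy as np

def shuffle_in_unison(obs, values):
    """Shuffle two arrays with one shared permutation so rows stay paired."""
    assert obs.shape[0] == values.shape[0]
    ids = np.random.permutation(obs.shape[0])  # random ordering of the row indices
    return obs[ids], values[ids]

# Row i of obs still corresponds to entry i of values after shuffling.
obs = np.arange(12, dtype="float32").reshape(4, 3)
values = np.array([0.1, 0.2, 0.3, 0.4], dtype="float32")
shuffled_obs, shuffled_values = shuffle_in_unison(obs, values)
```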
diff --git a/examples/QuickStart/README.md b/examples/QuickStart/README.md
index 39cdd8c74..c86c6b5d8 100644
--- a/examples/QuickStart/README.md
+++ b/examples/QuickStart/README.md
@@ -6,6 +6,7 @@ Based on PARL, train a agent to play CartPole game with policy gradient algorith
+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
++ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
### Start Training:
diff --git a/parl/algorithm_zoo/__init__.py b/parl/algorithm_zoo/__init__.py
deleted file mode 100644
index eca2dce11..000000000
--- a/parl/algorithm_zoo/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/parl/algorithm_zoo/simple_algorithms.py b/parl/algorithm_zoo/simple_algorithms.py
deleted file mode 100644
index 953311e94..000000000
--- a/parl/algorithm_zoo/simple_algorithms.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from parl.framework.algorithm import Algorithm
-from paddle.fluid.initializer import ConstantInitializer
-import parl.layers as layers
-import parl.framework.policy_distribution as pd
-from parl.layers import common_functions as comf
-import paddle.fluid as fluid
-from copy import deepcopy
-
-
-class SimpleAC(Algorithm):
- """
- A simple Actor-Critic that has a feedforward policy network and
- a single discrete action.
-
- learn() requires keywords: "action", "reward", "v_value"
- """
-
- def __init__(self,
- model,
- hyperparas=dict(lr=1e-4),
- gpu_id=-1,
- discount_factor=0.99):
-
- super(SimpleAC, self).__init__(model, hyperparas, gpu_id)
- self.discount_factor = discount_factor
-
- def learn(self, inputs, next_inputs, states, next_states, next_episode_end,
- actions, rewards):
-
- action = actions["action"]
- reward = rewards["reward"]
-
- values = self.model.value(inputs, states)
- next_values = self.model.value(next_inputs, next_states)
- value = values["v_value"]
- next_value = next_values["v_value"] * next_episode_end[
- "next_episode_end"]
- next_value.stop_gradient = True
- assert value.shape[1] == next_value.shape[1]
-
- critic_value = reward + self.discount_factor * next_value
- td_error = critic_value - value
- value_cost = layers.square(td_error)
-
- dist, _ = self.model.policy(inputs, states)
- dist = dist["action"]
- assert isinstance(dist, pd.CategoricalDistribution)
-
- pg_cost = 0 - dist.loglikelihood(action)
- avg_cost = layers.mean(x=value_cost + pg_cost * td_error)
- optimizer = fluid.optimizer.DecayedAdagradOptimizer(
- learning_rate=self.hp["lr"])
- optimizer.minimize(avg_cost)
- return dict(cost=avg_cost)
-
- def predict(self, inputs, states):
- return self._rl_predict(self.model, inputs, states)
-
-
-class SimpleQ(Algorithm):
- """
- A simple Q-learning that has a feedforward policy network and a single discrete action.
-
- learn() requires keywords: "action", "reward", "q_value"
- """
-
- def __init__(self,
- model,
- hyperparas=dict(lr=1e-4),
- gpu_id=-1,
- discount_factor=0.99,
- exploration_end_batches=0,
- exploration_end_rate=0.1,
- update_ref_interval=100):
-
- super(SimpleQ, self).__init__(model, hyperparas, gpu_id)
- self.discount_factor = discount_factor
- self.gpu_id = gpu_id
- assert update_ref_interval > 0
- self.update_ref_interval = update_ref_interval
- self.total_batches = 0
- ## create a reference model
- self.ref_model = deepcopy(model)
- ## setup exploration
- self.explore = (exploration_end_batches > 0)
- if self.explore:
- self.exploration_counter = layers.create_persistable_variable(
- dtype="float32",
- shape=[1],
- is_bias=True,
- default_initializer=ConstantInitializer(0.))
- ### in the second half of training time, the rate is fixed to a number
- self.total_exploration_batches = exploration_end_batches
- self.exploration_rate_delta \
- = (1 - exploration_end_rate) / self.total_exploration_batches
-
- def before_every_batch(self):
- if self.total_batches % self.update_ref_interval == 0:
- self.model.sync_paras_to(self.ref_model, self.gpu_id)
- self.total_batches += 1
-
- def predict(self, inputs, states):
- """
- Override the base predict() function to put the exploration rate in inputs
- """
- rate = 0
- if self.explore:
- counter = self.exploration_counter()
- ## first compute the current exploration rate
- rate = 1 - counter * self.exploration_rate_delta
-
- distributions, states = self.model.policy(inputs, states)
- for dist in distributions.values():
- assert dist.__class__.__name__ == "CategoricalDistribution"
- dist.add_uniform_exploration(rate)
-
- actions = {}
- for key, dist in distributions.iteritems():
- actions[key] = dist()
- return actions, states
-
- def learn(self, inputs, next_inputs, states, next_states, next_episode_end,
- actions, rewards):
-
- action = actions["action"]
- reward = rewards["reward"]
-
- values = self.model.value(inputs, states)
- next_values = self.ref_model.value(next_inputs, next_states)
- q_value = values["q_value"]
- next_q_value = next_values["q_value"] * next_episode_end[
- "next_episode_end"]
- next_q_value.stop_gradient = True
- next_value = layers.reduce_max(next_q_value, dim=-1)
- assert q_value.shape[1] == next_q_value.shape[1]
- num_actions = q_value.shape[1]
-
- value = comf.idx_select(input=q_value, idx=action)
- critic_value = reward + self.discount_factor * next_value
- td_error = critic_value - value
-
- avg_cost = layers.mean(x=layers.square(td_error))
- optimizer = fluid.optimizer.DecayedAdagradOptimizer(
- learning_rate=self.hp["lr"])
- optimizer.minimize(avg_cost)
-
- self._increment_exploration_counter()
- return dict(cost=avg_cost)
-
- def _increment_exploration_counter(self):
- if self.explore:
- counter = self.exploration_counter()
- exploration_counter_ = counter + 1
- switch = layers.cast(
- x=(exploration_counter_ > self.total_exploration_batches),
- dtype="float32")
- ## if the counter already hits the limit, we do not change the counter
- layers.assign(
- switch * counter + (1 - switch) * exploration_counter_,
- counter)
diff --git a/parl/common/__init__.py b/parl/common/__init__.py
deleted file mode 100644
index eca2dce11..000000000
--- a/parl/common/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/parl/common/error_handling.py b/parl/common/error_handling.py
deleted file mode 100644
index e1dbf47ee..000000000
--- a/parl/common/error_handling.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class LastExpError(Exception):
- """
- Raised when the last element or an element with non-zero game status is
- sampled.
-
- Attributes:
- message(string): error message
- """
-
- def __init__(self, idx, status):
- self.message = 'The element at {}'.format(idx)
- if status:
- self.message += ' has game status: {}'.format(status)
- else:
- self.message += ' is the last experience of a game.'
-
-
-def check_last_exp_error(is_last_exp, idx, game_status):
- if is_last_exp:
- raise LastExpError(idx, game_status)
-
-
-def check_type_error(type1, type2):
- if type1.__name__ != type2.__name__:
- raise TypeError('{} expected, but {} given.'.format(
- type1.__name__, type2.__name__))
-
-
-def check_eq(v1, v2):
- if v1 != v2:
- raise ValueError('{} == {} does not hold'.format(v1, v2))
-
-
-def check_neq(v1, v2):
- if v1 == v2:
- raise ValueError('{} != {} does not hold'.format(v1, v2))
-
-
-def check_gt(v1, v2):
- if v1 <= v2:
- raise ValueError('{} > {} does not hold'.format(v1, v2))
-
-
-def check_geq(v1, v2):
- if v1 < v2:
- raise ValueError('{} >= {} does not hold'.format(v1, v2))
-
-
-def check_lt(v1, v2):
- if v1 >= v2:
- raise ValueError('{} < {} does not hold'.format(v1, v2))
-
-
-def check_leq(v1, v2):
- if v1 > v2:
- raise ValueError('{} <= {} does not hold'.format(v1, v2))
diff --git a/parl/common/replay_buffer.py b/parl/common/replay_buffer.py
deleted file mode 100644
index 5acb1c9b0..000000000
--- a/parl/common/replay_buffer.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import random
-from parl.common.error_handling import *
-
-
-class Experience(object):
- def __init__(self, sensor_inputs, states, actions, game_status):
- check_type_error(list, type(sensor_inputs))
- self.sensor_inputs = sensor_inputs # (observation, reward)
- self.states = states # other states
- self.actions = actions # actions taken
- self.game_status = game_status # game status, e.g., max_steps or
- # episode end reached
- self.next_exp = None # copy of the next Experience
-
- def set_next_exp(self, next_exp):
- self.next_exp = copy.deepcopy(next_exp)
-
- #TODO: write copy function
-
-
-class Sample(object):
- """
- A Sample represents one or a sequence of Experiences
- """
-
- def __init__(self, i, n):
- self.i = i # starting index of the first experience in the sample
- self.n = n # length of the sequence
-
- def __repr__(self):
- return str(self.__class__) + ": " + str(self.__dict__)
-
-
-class ReplayBuffer(object):
- def __init__(self, capacity, exp_type=Experience):
- """
- Create Replay buffer.
-
- Args:
- exp_type(object): Experience class used in the buffer.
- capacity(int): Max number of experience to store in the buffer. When
- the buffer overflows the old memories are dropped.
- """
- check_gt(capacity, 1)
- self.buffer = [] # a circular queue to store experiences
- self.capacity = capacity # capacity of the buffer
- self.last = -1 # the index of the last element in the buffer
- self.exp_type = exp_type # Experience class used in the buffer
-
- def __len__(self):
- return len(self.buffer)
-
- def buffer_end(self, i):
- return i == self.last
-
- def next_idx(self, i):
- if self.buffer_end(i):
- return -1
- else:
- return (i + 1) % self.capacity
-
- def add(self, exp):
- """
- Store one experience into the buffer.
-
- Args:
- exp(self.exp_type): the experience to store in the buffer.
- """
- check_type_error(self.exp_type, type(exp))
- # the next_exp field should be None at this point
- check_eq(exp.next_exp, None)
-
- if len(self.buffer) < self.capacity:
- self.buffer.append(None)
- self.last = (self.last + 1) % self.capacity
- self.buffer[self.last] = copy.deepcopy(exp)
-
- def sample(self, num_samples):
- """
- Generate a batch of Samples. Each Sample represents a sequence of
- Experiences (length>=1). And a sequence must not cross the boundary
- between two games.
-
- Args:
- num_samples(int): Number of samples to generate.
-
- Returns: A generator of Samples
- """
- if len(self.buffer) <= 1:
- yield []
-
- for _ in xrange(num_samples):
- while True:
- idx = random.randint(0, len(self.buffer) - 1)
- if not self.buffer_end(
- idx) and not self.buffer[idx].game_status:
- break
- yield Sample(idx, 1)
-
- def get_experiences(self, sample):
- """
- Get Experiences from a Sample
-
- Args:
- sample(Sample): a Sample representing a sequence of Experiences
-
- Return(list): a list of Experiences
- """
- exps = []
- p = sample.i
- for _ in xrange(sample.n):
- check_last_exp_error(
- self.buffer_end(p) or self.buffer[p].game_status, p,
- self.buffer[p].game_status)
- # make a copy of the buffer element as e may be modified somewhere
- e = copy.deepcopy(self.buffer[p])
- p = self.next_idx(p)
- e.set_next_exp(self.buffer[p])
- exps.append(e)
-
- return exps
diff --git a/parl/common/tests/test_replay_buffer.py b/parl/common/tests/test_replay_buffer.py
deleted file mode 100644
index 1750c368d..000000000
--- a/parl/common/tests/test_replay_buffer.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import unittest
-from parl.common.error_handling import LastExpError
-from parl.common.replay_buffer import Experience, Sample, ReplayBuffer
-
-
-class ExperienceForTest(Experience):
- def __init__(self, obs, reward, actions, new_field, status):
- super(ExperienceForTest, self).__init__([obs, reward], [], actions,
- status)
- self.new_field = new_field
-
-
-class TestReplayBuffer(unittest.TestCase):
- def test_single_instance_replay_buffer(self):
- capacity = 30
- episode_len = 4
- buf = ReplayBuffer(capacity, ExperienceForTest)
- total = 0
- expect_total = 0
- for i in xrange(10 * capacity):
- e = ExperienceForTest(
- obs=np.zeros(10),
- reward=i * 0.5,
- actions=i,
- new_field=np.ones(20),
- status=(i + 1) % episode_len == 0)
- buf.add(e)
- # check the circular queue in the buffer
- self.assertTrue(len(buf) == min(i + 1, capacity))
- if (len(buf) < 2): # need at least two elements
- continue
- # should raise error when trying to pick up the last element
- with self.assertRaises(LastExpError):
- t = Sample(i % capacity, 1)
- buf.get_experiences(t)
- expect_total += len(buf)
- # neither last element nor episode end should be picked up
- for s in buf.sample(len(buf)):
- try:
- exps = buf.get_experiences(s)
- total += 1
- except LastExpError as err:
- self.fail('test_single_instance_replay_buffer raised '
- 'LastExpError: ' + err.message)
- # check the total number of elements added into the buffer
- self.assertTrue(total == expect_total)
- # detect incompatible Experience type
- with self.assertRaises(TypeError):
- e = Experience([np.zeros(10), i * 0.5], [], i, 0)
- buf.add(e)
-
- def test_deep_copy(self):
- capacity = 5
- buf = ReplayBuffer(capacity, Experience)
- e0 = Experience(
- sensor_inputs=[np.zeros(10), 0],
- states=[],
- actions=0,
- game_status=0)
- e1 = Experience([np.ones(10) * 2, 1], [], 0, 1)
- buf.add(e0)
- e0.sensor_inputs[0] += 1
- buf.add(e0)
- buf.add(e1)
- s = Sample(0, 2)
- exps = buf.get_experiences(s)
- self.assertEqual(np.sum(exps[0].sensor_inputs[0] == 0), 10)
- self.assertEqual(np.sum(exps[1].sensor_inputs[0] == 1), 10)
- self.assertEqual(np.sum(exps[1].next_exp.sensor_inputs[0] == 2), 10)
- exps[0].next_exp.sensor_inputs[0] += 3
- self.assertEqual(np.sum(exps[1].sensor_inputs[0] == 1), 10)
- exps[1].sensor_inputs[0] += 4
- exps = buf.get_experiences(s)
- self.assertEqual(np.sum(exps[0].next_exp.sensor_inputs[0] == 1), 10)
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/parl/framework/__init__.py b/parl/framework/__init__.py
index 8ca1bb94f..04bf0e140 100644
--- a/parl/framework/__init__.py
+++ b/parl/framework/__init__.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
from parl.framework.model_base import *
from parl.framework.algorithm_base import *
from parl.framework.agent_base import *
diff --git a/parl/framework/model_base.py b/parl/framework/model_base.py
index 3adee9a7f..cba0df8bf 100644
--- a/parl/framework/model_base.py
+++ b/parl/framework/model_base.py
@@ -124,23 +124,39 @@ class Model(Network):
In conclusion, Model is responsible for forward and
Algorithm is responsible for backward.
- Model can also be used to construct target model, which has the same structure as initial model.
+    Model can also be used to construct a target model via deepcopy, which has the same structure as the initial model.
+    Note that only the model definition is copied here. To copy the parameters from the current model
+    to the target model, you must explicitly call the sync_params_to function after the program is initialized.
+
Here is an example:
```python
- class Actor(Model):
- __init__(self, obs_dim, act_dim):
- self.obs_dim = obs_dim
- self.act_dim = act_dim
- self.fc1 = layers.fc(size=128, act='relu')
- self.fc2 = layers.fc(size=64, act='relu')
- actor = Actor(obs_dim=12, act_dim=2)
- target_actor = copy.deepcopy(actor)
+ import parl.layers as layers
+    from parl.framework.model_base import Model
+
+ class MLPModel(Model):
+ def __init__(self):
+ self.fc = layers.fc(size=64)
+
+ def policy(self, obs):
+ out = self.fc(obs)
+ return out
+
+ model = MLPModel()
+    target_model = deepcopy(model) # automatically creates new, unique parameter names for target_model.fc
+
+ # build program
+ x = layers.data(name='x', shape=[100], dtype="float32")
+ y1 = model.policy(x)
+ y2 = target_model.policy(x)
+
+ ...
+    # Need to run the startup program before calling sync_params_to
+ fluid_executor.run(fluid.default_startup_program())
+ ...
+
+ # synchronize parameters
+ model.sync_params_to(target_model, gpu_id=gpu_id)
```
-
- Note that it's the model structure that is copied from initial actor,
- parameters in initial model havn't been copied to target model.
- To copy parameters, you must explicitly use sync_params_to function after the program is initialized.
-
"""
__metaclass__ = ABCMeta
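The revised docstring above sketches the deepcopy pattern for building a target network. For reference, a filled-in version of that sketch with the executor steps made explicit; the model definition, variable names, and gpu_id=-1 (CPU) are illustrative assumptions following the conventions visible elsewhere in this patch, not part of the change itself:

```python
from copy import deepcopy

import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.model_base import Model


class MLPModel(Model):
    def __init__(self):
        self.fc = layers.fc(size=64)

    def policy(self, obs):
        return self.fc(obs)


model = MLPModel()
target_model = deepcopy(model)  # same structure, new unique parameter names

# Build both forward passes in the default program.
obs = layers.data(name='obs', shape=[100], dtype="float32")
policy_out = model.policy(obs)
target_out = target_model.policy(obs)

# Run the startup program so the parameters actually exist ...
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# ... then copy the current parameter values into the target model.
model.sync_params_to(target_model, gpu_id=-1)
```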
diff --git a/parl/framework/policy_distribution.py b/parl/framework/policy_distribution.py
deleted file mode 100644
index 524aa7a46..000000000
--- a/parl/framework/policy_distribution.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import parl.layers as layers
-from abc import ABCMeta, abstractmethod
-from paddle.fluid.framework import Variable
-from parl.layers import common_functions as comf
-from paddle.fluid.framework import convert_np_dtype_to_dtype_
-
-
-class PolicyDistribution(object):
- __metaclass__ = ABCMeta
-
- def __init__(self, dist):
- """
- self.dist represents the quantities that characterize the distribution.
- For example, for a Normal distribution, this can be a tuple of (mean, std).
- The actual form of self.dist is defined by the user.
- """
- self.dist = dist
-
- @abstractmethod
- def __call__(self):
- """
- Implement __call__ to sample an instance.
- """
- pass
-
- @property
- @abstractmethod
- def dim(self):
- """
- For discrete policies, this function returns the number of actions.
- For continuous policies, this function returns the action vector length.
- For sequential policies (e.g., sentences), this function returns the number
- of choices at each step.
- """
- pass
-
- def add_uniform_exploration(self, rate):
- """
- Given a uniform exploration rate, this function modifies the distribution.
- The rate could be a floating number of a Variable.
- """
- return NotImplementedError()
-
- def loglikelihood(self, action):
- """
- Given an action, this function returns the log likelihood of this action under
- the current distribution.
- """
- raise NotImplementedError()
-
-
-class CategoricalDistribution(PolicyDistribution):
- def __init__(self, dist):
- super(CategoricalDistribution, self).__init__(dist)
- assert isinstance(dist, Variable)
-
- def __call__(self):
- return layers.sampling_id(self.dist)
-
- @property
- def dim(self):
- assert len(self.dist.shape) == 2
- return self.dist.shape[1]
-
- def add_uniform_exploration(self, rate):
- if not (isinstance(rate, float) and rate == 0):
- self.dist = self.dist * (1 - rate) + \
- 1 / float(self.dim) * rate
-
- def loglikelihood(self, action):
- assert isinstance(action, Variable)
- assert action.dtype == convert_np_dtype_to_dtype_("int") \
- or action.dtype == convert_np_dtype_to_dtype_("int64")
- return 0 - layers.cross_entropy(input=self.dist, label=action)
-
-
-class Deterministic(PolicyDistribution):
- def __init__(self, dist):
- super(Deterministic, self).__init__(dist)
- ## For deterministic action, we only support continuous ones
- assert isinstance(dist, Variable)
- assert dist.dtype == convert_np_dtype_to_dtype_("float32") \
- or dist.dtype == convert_np_dtype_to_dtype_("float64")
-
- @property
- def dim(self):
- assert len(self.dist.shape) == 2
- return self.dist.shape[1]
-
- def __call__(self):
- return self.dist
-
-
-def q_categorical_distribution(q_value):
- """
- Generate a PolicyDistribution object given a Q value.
- We construct a one-hot distribution according to the Q value.
- """
- assert len(q_value.shape) == 2, "[batch_size, num_actions]"
- max_id = comf.argmax_layer(q_value)
- prob = layers.cast(
- x=layers.one_hot(input=max_id, depth=q_value.shape[-1]),
- dtype="float32")
- return CategoricalDistribution(prob)
diff --git a/parl/framework/tests/test_algorithm.py b/parl/framework/tests/test_algorithm.py
deleted file mode 100644
index 298a37906..000000000
--- a/parl/framework/tests/test_algorithm.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import parl.layers as layers
-from parl.framework.algorithm_base import Algorithm
-from parl.framework.base import Model
-from parl.layers import common_functions as comf
-from parl.model_zoo.simple_models import SimpleModelDeterministic
-import numpy as np
-from copy import deepcopy
-import unittest
-
-
-class TestAlgorithm(Algorithm):
- def __init__(self, model):
- super(TestAlgorithm, self).__init__(
- model, hyperparas=dict(), gpu_id=-1)
-
- def predict(self, inputs, states):
- return self._rl_predict(self.model, inputs, states)
-
-
-class TestAlgorithmParas(unittest.TestCase):
- def test_sync_paras_in_one_program(self):
- """
- Test case for copying parameters
- """
-
- alg1 = TestAlgorithm(
- model=SimpleModelDeterministic(
- dims=10, mlp_layer_confs=[dict(size=10)]))
- alg2 = deepcopy(alg1)
-
- batch_size = 10
- sensor = np.random.uniform(
- 0, 1, [batch_size, alg1.model.dims]).astype("float32")
-
- program = fluid.Program()
- startup_program = fluid.Program()
- with fluid.program_guard(program, startup_program):
- x = layers.data(name='x', shape=[alg1.model.dims], dtype="float32")
- try:
- # too eary to sync before the layers are created
- alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
- self.assertTrue(False) # you shouldn't be here
- except:
- pass
- ## first let the program generates the actual variables by using the
- ## layer functions (before this step the layers haven't been instantiated yet!)
- ## the call of predict() function already covers all the layers
- y0, _ = alg1.predict(inputs=dict(sensor=x), states=dict())
- y1, _ = alg2.predict(inputs=dict(sensor=x), states=dict())
-
- ######################
- exe = fluid.Executor(fluid.CPUPlace())
- exe.run(startup_program)
-
- outputs = exe.run(
- program,
- feed={'x': sensor},
- ## y and y1 are two dictionaries
- fetch_list=y0.values() + y1.values())
-
- self.assertNotEqual(
- np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
-
- ## do the copying
- alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
-
- outputs = exe.run(
- program,
- feed={'x': sensor},
- ## y and y1 are two dictionaries
- fetch_list=y0.values() + y1.values())
-
- self.assertEqual(
- np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
-
- def test_sync_paras_between_programs(self):
- """
- Test case for copying parameters between two different programs
- """
- alg1 = TestAlgorithm(
- model=SimpleModelDeterministic(
- dims=10, mlp_layer_confs=[dict(size=10)]))
- alg2 = deepcopy(alg1)
-
- batch_size = 10
- sensor = np.random.uniform(
- 0, 1, [batch_size, alg1.model.dims]).astype("float32")
-
- startup_program = fluid.Program()
- program1 = fluid.Program()
- program2 = fluid.Program()
-
- with fluid.program_guard(program1, startup_program):
- x1 = layers.data(
- name='x', shape=[alg1.model.dims], dtype="float32")
- y1, _ = alg1.predict(inputs=dict(sensor=x1), states=dict())
-
- with fluid.program_guard(program2, startup_program):
- x2 = layers.data(
- name='x', shape=[alg1.model.dims], dtype="float32")
- y2, _ = alg2.predict(inputs=dict(sensor=x2), states=dict())
-
- exe = fluid.Executor(fluid.CPUPlace())
- exe.run(startup_program)
-
- alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
-
- outputs1 = exe.run(
- program1, feed={'x': sensor}, fetch_list=y1.values())
- outputs2 = exe.run(
- program2, feed={'x': sensor}, fetch_list=y2.values())
- self.assertEqual(
- np.sum(outputs1[0].flatten()), np.sum(outputs2[0].flatten()))
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/parl/framework/tests/test_computation_task.py b/parl/framework/tests/test_computation_task.py
deleted file mode 100644
index cf4555218..000000000
--- a/parl/framework/tests/test_computation_task.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import parl.layers as layers
-from parl.framework.base import Model
-from parl.framework.computation_task import ComputationTask
-import parl.framework.policy_distribution as pd
-from parl.layers import common_functions as comf
-from parl.algorithm_zoo.simple_algorithms import SimpleAC, SimpleQ
-from parl.model_zoo.simple_models import SimpleModelDeterministic, SimpleModelAC, SimpleModelQ
-from test_algorithm import TestAlgorithm
-import numpy as np
-from copy import deepcopy
-import unittest
-import math
-
-
-class TestModelCNN(Model):
- def __init__(self, width, height, num_actions):
- super(TestModelCNN, self).__init__()
- self.conv = layers.conv2d(
- num_filters=1, filter_size=3, bias_attr=False)
- self.mlp = comf.MLP([
- dict(size=32, act="relu", bias_attr=False),
- dict(size=16, act="relu", bias_attr=False),
- dict(size=num_actions, act="softmax", bias_attr=False)
- ])
- self.height = height
- self.width = width
-
- def get_input_specs(self):
- ## image format CHW
- return [("image", dict(shape=[1, self.height, self.width]))]
-
- def get_action_specs(self):
- return [("action", dict(shape=[1], dtype="int64"))]
-
- def policy(self, inputs, states):
- conv = self.conv(input=inputs.values()[0])
- dist = pd.CategoricalDistribution(self.mlp(conv))
- return dict(action=dist), states
-
- def value(self, inputs, states):
- v_value = layers.fill_constant(
- shape=[inputs.values()[0].shape[0], 1], dtype="float32", value=0)
- return dict(v_value=v_value)
-
-
-class TestComputationTask(unittest.TestCase):
- def test_predict(self):
- """
- Test case for AC-learning and Q-learning predictions
- """
- num_actions = 4
-
- def test(input, ct, max):
- action_counter = [0] * num_actions
- total = 2000
- for i in range(total):
- actions, states = ct.predict(inputs=input)
- assert not states, "states should be empty"
- ## actions["action"] is a batch of actions
- for a in actions["action"]:
- action_counter[a] += 1
-
- if max:
- ### if max, the first action will always be chosen
- for i in range(num_actions):
- prob = action_counter[i] / float(sum(action_counter))
- self.assertAlmostEqual(
- prob, 1.0 if i == 0 else 0.0, places=1)
- else:
- ### the actions should be uniform
- for i in range(num_actions):
- prob = action_counter[i] / float(sum(action_counter))
- self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)
-
- dims = 100
-
- ac = SimpleAC(
- model=SimpleModelAC(
- dims=dims,
- num_actions=num_actions,
- mlp_layer_confs=[
- dict(size=32, act="relu", bias_attr=False),
- dict(size=16, act="relu", bias_attr=False),
- dict(size=num_actions, act="softmax", bias_attr=False)
- ]))
-
- ac_cnn = SimpleAC(
- model=TestModelCNN(width=84, height=84, num_actions=num_actions))
-
- q = SimpleQ(
- model=SimpleModelQ(
- dims=dims,
- num_actions=num_actions,
- mlp_layer_confs=[
- dict(size=32, act="relu", bias_attr=False),
- dict(size=16, act="relu", bias_attr=False),
- dict(size=num_actions, bias_attr=False)
- ]))
-
- batch_size = 10
- height, width = 84, 84
- sensor = np.zeros([batch_size, dims]).astype("float32")
- image = np.zeros([batch_size, 1, height, width]).astype("float32")
-
- ct0 = ComputationTask(algorithm=ac)
- ct1 = ComputationTask(algorithm=q)
- ct2 = ComputationTask(algorithm=ac_cnn)
-
- test(dict(sensor=sensor), ct0, max=False)
- test(dict(sensor=sensor), ct1, max=True)
- test(dict(image=image), ct2, max=False)
-
- def test_ct_para_sharing(self):
- """
- Test case for two CTs sharing parameters
- """
- alg = TestAlgorithm(
- model=SimpleModelDeterministic(
- dims=10, mlp_layer_confs=[dict(size=10)]))
- ct0 = ComputationTask(algorithm=alg)
- ct1 = ComputationTask(algorithm=alg)
-
- batch_size = 10
- sensor = np.random.uniform(
- 0, 1, [batch_size, alg.model.dims]).astype("float32")
-
- outputs0, _ = ct0.predict(inputs=dict(sensor=sensor))
- outputs1, _ = ct1.predict(inputs=dict(sensor=sensor))
- self.assertEqual(
- np.sum(outputs0["continuous_action"].flatten()),
- np.sum(outputs1["continuous_action"].flatten()))
-
- def test_ct_para_sync(self):
- """
- Test case for two CTs copying parameters
- """
-
- alg = TestAlgorithm(
- model=SimpleModelDeterministic(
- dims=10, mlp_layer_confs=[dict(size=10)]))
-
- ct0 = ComputationTask(algorithm=alg)
- ct1 = ComputationTask(algorithm=deepcopy(alg))
-
- batch_size = 10
- sensor = np.random.uniform(
- 0, 1, [batch_size, ct0.alg.model.dims]).astype("float32")
-
- outputs0, _ = ct0.predict(inputs=dict(sensor=sensor))
- outputs1, _ = ct1.predict(inputs=dict(sensor=sensor))
- self.assertNotEqual(
- np.sum(outputs0["continuous_action"].flatten()),
- np.sum(outputs1["continuous_action"].flatten()))
-
- ct0.alg.model.sync_paras_to(ct1.alg.model, ct1.alg.gpu_id)
-
- outputs0, _ = ct0.predict(inputs=dict(sensor=sensor))
- outputs1, _ = ct1.predict(inputs=dict(sensor=sensor))
- self.assertEqual(
- np.sum(outputs0["continuous_action"].flatten()),
- np.sum(outputs1["continuous_action"].flatten()))
-
- def test_ct_learning(self):
- """
- Test training
- """
- num_actions = 2
- dims = 100
- batch_size = 8
- sensor = np.ones([batch_size, dims
- ]).astype("float32") / dims # normalize
- next_sensor = np.zeros([batch_size, dims]).astype("float32")
-
- for on_policy in [True, False]:
- if on_policy:
- alg = SimpleAC(
- model=SimpleModelAC(
- dims=dims,
- num_actions=num_actions,
- mlp_layer_confs=[
- dict(size=64, act="relu", bias_attr=False),
- dict(size=32, act="relu", bias_attr=False),
- dict(size=num_actions, act="softmax")
- ]),
- hyperparas=dict(lr=1e-1))
- ct = ComputationTask(algorithm=alg)
- else:
- alg = SimpleQ(
- model=SimpleModelQ(
- dims=dims,
- num_actions=num_actions,
- mlp_layer_confs=[
- dict(size=64, act="relu", bias_attr=False),
- dict(size=32, act="relu", bias_attr=False),
- dict(size=num_actions)
- ]),
- update_ref_interval=100,
- hyperparas=dict(lr=1e-1))
- ct = ComputationTask(algorithm=alg)
-
- for i in range(1000):
- if on_policy:
- outputs, _ = ct.predict(inputs=dict(sensor=sensor))
- actions = outputs["action"]
- actions = np.expand_dims(actions, 1)
- else:
- ## randomly assemble a batch
- actions = np.random.choice([0, 1],
- size=(batch_size, 1),
- p=[0.5, 0.5]).astype("int")
- rewards = (1 - actions).astype("float32")
- cost = ct.learn(
- inputs=dict(sensor=sensor),
- next_inputs=dict(next_sensor=next_sensor),
- next_episode_end=dict(
- next_episode_end=np.ones((batch_size,
- 1)).astype("float32")),
- actions=dict(action=actions),
- rewards=dict(reward=rewards))
-
- print("final cost: %f" % cost["cost"])
-
- ### the policy should bias towards the first action
- outputs, _ = ct.predict(inputs=dict(sensor=sensor))
- for a in outputs["action"]:
- self.assertEqual(a, 0)
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/parl/layers/common_functions.py b/parl/layers/common_functions.py
deleted file mode 100644
index d2d18ed79..000000000
--- a/parl/layers/common_functions.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import parl.layers as layers
-from paddle.fluid.framework import Variable
-
-
-class Feedforward(layers.Network):
- """
- A feedforward network can contain a sequence of components,
- where each component can be either a LayerFunc or a Feedforward.
- The purpose of this class is to create a collection of LayerFuncs that can
- be easily copied from one Network to another.
- Examples of feedforward networks can be MLP and CNN.
- """
-
- def __init__(self, components):
- for i in range(len(components)):
- setattr(self, "ff%06d" % i, components[i])
-
- def __call__(self, input):
- attrs = {
- attr: getattr(self, attr)
- for attr in dir(self) if "ff" in attr
- }
- for k in sorted(attrs.keys()):
- input = attrs[k](input)
- return input
-
-
-class MLP(Feedforward):
- def __init__(self, multi_fc_layers):
- super(MLP, self).__init__([layers.fc(**c) for c in multi_fc_layers])
-
-
-class CNN(Feedforward):
- """
- Image CNN
- """
-
- def __init__(self, multi_conv_layers):
- super(CNN,
- self).__init__([layers.conv2d(**c) for c in multi_conv_layers])
-
-
-def argmax_layer(input):
- """
- Get the id of the max val of an input vector
- """
- _, index = layers.topk(input, 1)
- return index
-
-
-def inner_prod(x, y):
- """
- Get the inner product of two vectors
- """
- return layers.reduce_sum(layers.elementwise_mul(x, y), dim=-1)
-
-
-def sum_to_one_norm_layer(input):
- eps = 1e-9 # avoid dividing 0
- sum = layers.reduce_sum(input + eps, dim=-1)
- return layers.elementwise_div(x=input, y=sum, axis=0)
-
-
-def idx_select(input, idx):
- """
- Given an input vector (Variable) and an idx (int or Variable),
- select the entry of the vector according to the idx.
- """
- assert isinstance(input, Variable)
- assert len(input.shape) == 2
- batch_size, num_entries = input.shape
-
- if isinstance(idx, int):
- ## if idx is a constant int, then we create a variable
- idx = layers.fill_constant(
- shape=[batch_size, 1], dtype="int64", value=idx)
- else:
- assert isinstance(idx, Variable)
-
- assert input.shape
- select = layers.cast(
- x=layers.one_hot(input=idx, depth=num_entries), dtype="float32")
- return inner_prod(select, input)
diff --git a/parl/layers/layer_wrappers.py b/parl/layers/layer_wrappers.py
index 93b956378..60bea5bab 100644
--- a/parl/layers/layer_wrappers.py
+++ b/parl/layers/layer_wrappers.py
@@ -12,7 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
-Wrappers for fluid.layers so that the layers can share parameters conveniently.
+Wrappers for fluid.layers that make it easy to share parameters between layers.
+
+Here is an example:
+ ```python
+ import parl.layers as layers
+
+ class MLPModel(Model):
+ def __init__(self):
+            self.fc = layers.fc(size=64) # automatically creates the parameter names "fc_0.w" and "fc_0.b"
+
+ def policy1(self, obs):
+            return self.fc(obs) # actually creates the parameters named "fc_0.w" and "fc_0.b"
+
+ def policy2(self, obs):
+            return self.fc(obs) # reuses the same parameters "fc_0.w" and "fc_0.b"
+ ```
"""
import inspect
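The comments in the docstring above describe the mechanism: a wrapper reserves its parameter names when it is constructed, and every later call builds layers that point at those same parameters. A framework-free toy sketch of that idea (purely illustrative, not the actual wrapper implementation):

```python
import itertools

_fc_counter = itertools.count()


class ToyFCWrapper(object):
    """Toy stand-in for layers.fc: names fixed at construction, parameters created on first call."""

    def __init__(self, size):
        idx = next(_fc_counter)
        self.size = size
        self.param_names = ["fc_%d.w" % idx, "fc_%d.b" % idx]
        self._params = None  # created lazily on the first call

    def __call__(self, x):
        if self._params is None:
            # First call: create the parameters under the reserved names.
            self._params = dict((name, "param(%s)" % name) for name in self.param_names)
        # Later calls reuse exactly the same parameter objects.
        return "fc(%s, params=%s)" % (x, sorted(self._params))


fc = ToyFCWrapper(size=64)
out1 = fc("obs_from_policy1")  # creates fc_0.w / fc_0.b
out2 = fc("obs_from_policy2")  # reuses fc_0.w / fc_0.b
```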
diff --git a/parl/model_zoo/__init__.py b/parl/model_zoo/__init__.py
deleted file mode 100644
index eca2dce11..000000000
--- a/parl/model_zoo/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/parl/model_zoo/simple_models.py b/parl/model_zoo/simple_models.py
deleted file mode 100644
index b103ee2ed..000000000
--- a/parl/model_zoo/simple_models.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import parl.layers as layers
-from parl.framework.base import Model
-import parl.framework.policy_distribution as pd
-from parl.layers import common_functions as comf
-
-
-class SimpleModelDeterministic(Model):
- def __init__(self, dims, mlp_layer_confs):
- super(SimpleModelDeterministic, self).__init__()
- self.dims = dims
- self.mlp = comf.MLP(mlp_layer_confs)
-
- def get_input_specs(self):
- return [("sensor", dict(shape=[self.dims]))]
-
- def get_action_specs(self):
- return [("continuous_action", dict(shape=[self.dims]))]
-
- def policy(self, inputs, states):
- hidden = self.mlp(inputs.values()[0])
- return dict(continuous_action=pd.Deterministic(hidden)), states
-
-
-class SimpleModelAC(Model):
- def __init__(self, dims, num_actions, mlp_layer_confs):
- super(SimpleModelAC, self).__init__()
- self.dims = dims
- assert mlp_layer_confs[-1]["act"] == "softmax"
- self.mlp = comf.MLP(mlp_layer_confs[:-1])
- self.policy_mlp = comf.MLP(mlp_layer_confs[-1:])
- self.value_layer = layers.fc(size=1)
-
- def get_input_specs(self):
- return [("sensor", dict(shape=[self.dims]))]
-
- def get_action_specs(self):
- return [("action", dict(shape=[1], dtype="int64"))]
-
- def _perceive(self, inputs, states):
- return self.mlp(inputs.values()[0])
-
- def policy(self, inputs, states):
- dist = pd.CategoricalDistribution(
- self.policy_mlp(self._perceive(inputs, states)))
- return dict(action=dist), states
-
- def value(self, inputs, states):
- return dict(v_value=self.value_layer(self._perceive(inputs, states)))
-
-
-class SimpleModelQ(Model):
- def __init__(self, dims, num_actions, mlp_layer_confs):
- super(SimpleModelQ, self).__init__()
- self.dims = dims
- self.num_actions = num_actions
- assert "act" not in mlp_layer_confs[-1], "should be linear act"
- self.mlp = comf.MLP(mlp_layer_confs)
-
- def get_input_specs(self):
- return [("sensor", dict(shape=[self.dims]))]
-
- def get_action_specs(self):
- return [("action", dict(shape=[1], dtype="int64"))]
-
- def policy(self, inputs, states):
- values = self.value(inputs, states)
- q_value = values["q_value"]
- return dict(action=pd.q_categorical_distribution(q_value)), states
-
- def value(self, inputs, states):
- return dict(q_value=self.mlp(inputs.values()[0]))
diff --git a/setup.py b/setup.py
index 307d1af50..d6b2559e0 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ def _find_packages(prefix=''):
setup(
name='parl',
- version=0.1,
+    version='1.0',
packages=_find_packages(),
package_data={'': ['*.so']},
install_requires=[