RL_plus_script.py

"""
Reinforcement Learning agent that trains on MineRLTreechop environment. It is then evaluated on MineRLObtainDiamond by
running it for a certain number of ticks and then switching to the scripted part that crafts a wooden_pickaxe and digs
down to get some cobblestone.
With default parameters it trains in about 8 hours on a machine with a GeForce RTX 2080 Ti GPU.
It uses less than 8GB RAM and achieves an average reward of 8.3.
"""
import time

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from torch.utils.tensorboard import SummaryWriter
import minerl  # it's important to import minerl after SB3, otherwise model.save doesn't work...

# If you want to try out wandb integration, scroll to the bottom and uncomment the line calling `track_exp`.
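# (If wandb isn't installed, the import below fails quietly and `wandb` stays None; it is only needed
# if you enable track_exp() in main().)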
try:
    wandb = None
    import wandb
except ImportError:
    pass

# Parameters:
config = {
    "TRAIN_TIMESTEPS": 2000000,  # number of steps to train the agent for. At 70 FPS 2m steps take about 8 hours.
    "TRAIN_ENV": 'MineRLTreechop-v0',  # training environment for the RL agent. Could use MineRLObtainDiamondDense-v0 here.
    "TRAIN_MODEL_NAME": 'potato',  # name to use when saving the trained agent.
    "TEST_MODEL_NAME": 'potato',  # name to use when loading the trained agent.
    "TEST_EPISODES": 10,  # number of episodes to test the agent for.
    "MAX_TEST_EPISODE_LEN": 18000,  # 18k is the default for MineRLObtainDiamond.
    "TREECHOP_STEPS": 2000,  # number of steps to run RL lumberjack for in evaluations.
    "RECORD_TRAINING_VIDEOS": False,  # if True, records videos of all episodes done during training.
    "RECORD_TEST_VIDEOS": False,  # if True, records videos of all episodes done during evaluation.
}

experiment_name = f"ppo_{int(time.time())}"
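
# DummyVecEnv expects a list of zero-argument callables that each construct an environment,
# which is why make_env returns a thunk rather than the environment itself.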
def make_env(idx):
    def thunk():
        env = gym.make(config["TRAIN_ENV"])
        if idx == 0 and config["RECORD_TRAINING_VIDEOS"]:
            env = gym.wrappers.Monitor(env, f"train_videos/{experiment_name}")
        env = PovOnlyObservation(env)
        env = ActionShaping(env, always_attack=True)
        env = gym.wrappers.RecordEpisodeStatistics(env)  # record stats such as returns
        return env
    return thunk

def track_exp(project_name=None):
    wandb.init(
        anonymous="allow",
        project=project_name,
        config=config,
        sync_tensorboard=True,
        name=experiment_name,
        monitor_gym=True,
        save_code=True,
    )

class PovOnlyObservation(gym.ObservationWrapper):
    """
    Turns the observation space into POV only, ignoring the inventory. This is needed for stable_baselines3 RL agents,
    as they don't yet support dict observations. The support should be coming soon (as of April 2021).
    See following PR for details:
    https://github.com/DLR-RM/stable-baselines3/pull/243
    """
    def __init__(self, env):
        super().__init__(env)
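        # For these MineRL environments, the 'pov' observation is a 64x64x3 uint8 image Box,
        # which is the input format SB3's CnnPolicy expects.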
        self.observation_space = self.env.observation_space['pov']

    def observation(self, observation):
        return observation['pov']

class ActionShaping(gym.ActionWrapper):
    """
    The default MineRL action space is the following dict:

    Dict(attack:Discrete(2),
         back:Discrete(2),
         camera:Box(low=-180.0, high=180.0, shape=(2,)),
         craft:Enum(crafting_table,none,planks,stick,torch),
         equip:Enum(air,iron_axe,iron_pickaxe,none,stone_axe,stone_pickaxe,wooden_axe,wooden_pickaxe),
         forward:Discrete(2),
         jump:Discrete(2),
         left:Discrete(2),
         nearbyCraft:Enum(furnace,iron_axe,iron_pickaxe,none,stone_axe,stone_pickaxe,wooden_axe,wooden_pickaxe),
         nearbySmelt:Enum(coal,iron_ingot,none),
         place:Enum(cobblestone,crafting_table,dirt,furnace,none,stone,torch),
         right:Discrete(2),
         sneak:Discrete(2),
         sprint:Discrete(2))

    It can be viewed as:
      - buttons, like attack, back, forward and sprint, that are either pressed or not.
      - mouse, i.e. the continuous camera action in degrees. The two values are pitch (up/down), where up is
        negative and down is positive, and yaw (left/right), where left is negative and right is positive.
      - craft/equip/place actions for the items specified above.
    So an example action could be sprint + forward + jump + attack + turn camera, all in one action.

    This wrapper makes the action space much smaller by selecting a few common actions and making the camera actions
    discrete. You can change these actions by changing self._actions below. That should just work with the RL agent,
    but would require some further tinkering for a behavioral cloning (BC) agent.
    """
    def __init__(self, env, camera_angle=10, always_attack=False):
        super().__init__(env)
        self.camera_angle = camera_angle
        self.always_attack = always_attack
        self._actions = [
            [('attack', 1)],
            [('forward', 1)],
            # [('back', 1)],
            # [('left', 1)],
            # [('right', 1)],
            # [('jump', 1)],
            # [('forward', 1), ('attack', 1)],
            # [('craft', 'planks')],
            [('forward', 1), ('jump', 1)],
            [('camera', [-self.camera_angle, 0])],
            [('camera', [self.camera_angle, 0])],
            [('camera', [0, self.camera_angle])],
            [('camera', [0, -self.camera_angle])],
        ]
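
        # With the defaults above this yields 7 discrete actions: 0 -> attack, 1 -> forward,
        # 2 -> forward + jump, 3/4 -> pitch the camera by -/+ camera_angle degrees,
        # 5/6 -> yaw it by +/- camera_angle degrees. With always_attack=True, attack=1 is
        # added to every one of these actions.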
        self.actions = []
        for actions in self._actions:
            act = self.env.action_space.noop()
            for a, v in actions:
                act[a] = v
            if self.always_attack:
                act['attack'] = 1
            self.actions.append(act)

        self.action_space = gym.spaces.Discrete(len(self.actions))

    def action(self, action):
        return self.actions[action]

def train():
    env = DummyVecEnv([make_env(i) for i in range(1)])

    # For all the PPO hyperparameters you could tune, see:
    # https://github.com/DLR-RM/stable-baselines3/blob/6f822b9ed7d6e8f57e5a58059923a5b24e8db283/stable_baselines3/ppo/ppo.py#L16
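    # As a sketch (these are just SB3's defaults written out, not values tuned for MineRL), the
    # hyperparameters can also be passed explicitly, e.g.:
    #   model = PPO('CnnPolicy', env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10,
    #               gamma=0.99, verbose=1, tensorboard_log=f"runs/{experiment_name}")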
    model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=f"runs/{experiment_name}")
    model.learn(total_timesteps=config["TRAIN_TIMESTEPS"])  # 2m steps is about 8h at 70 FPS
    model.save(config["TRAIN_MODEL_NAME"])

    # MineRL might throw an exception when closing on Windows, but it can be ignored (the environment does close).
    try:
        env.close()
    except Exception:
        pass

def str_to_act(env, actions):
    """
    Simplifies specifying actions for the scripted part of the agent.

    Some examples for a string with a single action:
        'craft:planks'
        'camera:[10,0]'
        'attack'
        'jump'
        ''
    There should be no spaces in single actions, as we use spaces to separate actions with multiple "buttons" pressed:
        'attack sprint forward'
        'forward camera:[0,10]'

    :param env: base MineRL environment.
    :param actions: string of actions.
    :return: dict action, compatible with the base MineRL environment.
    """
    act = env.action_space.noop()
    for action in actions.split():
        if ":" in action:
            k, v = action.split(':')
            if k == 'camera':
                act[k] = eval(v)
            else:
                act[k] = v
        else:
            act[action] = 1
    return act

def get_action_sequence():
    """
    Specify the action sequence for the scripted part of the agent.
    """
    # make planks, sticks, crafting table and wooden pickaxe:
    action_sequence = []
    action_sequence += [''] * 100
    action_sequence += ['craft:planks'] * 4
    action_sequence += ['craft:stick'] * 2
    action_sequence += ['craft:crafting_table']
    action_sequence += ['camera:[10,0]'] * 18
    action_sequence += ['attack'] * 20
    action_sequence += [''] * 10
    action_sequence += ['jump']
    action_sequence += [''] * 5
    action_sequence += ['place:crafting_table']
    action_sequence += [''] * 10

    # bug: looking straight down at a crafting table doesn't let you craft. So we look up a bit before crafting.
    action_sequence += ['camera:[-1,0]']
    action_sequence += ['nearbyCraft:wooden_pickaxe']
    action_sequence += ['camera:[1,0]']
    action_sequence += [''] * 10
    action_sequence += ['equip:wooden_pickaxe']
    action_sequence += [''] * 10

    # dig down:
    action_sequence += ['attack'] * 600
    action_sequence += [''] * 10
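
    # In total the scripted sequence is roughly 800 actions long, well within the
    # MAX_TEST_EPISODE_LEN - TREECHOP_STEPS budget it is truncated to in test().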
    return action_sequence

def test():
    action_sequence = get_action_sequence()
    writer = SummaryWriter(f"runs/{experiment_name}")
    env = gym.make('MineRLObtainDiamond-v0').env

    time_limit = min(config["MAX_TEST_EPISODE_LEN"], config["TREECHOP_STEPS"] + len(action_sequence))
    env = gym.wrappers.TimeLimit(env, time_limit)

    # optional interactive mode, where you can connect to your agent and play together (see link for details):
    # https://minerl.io/docs/tutorials/minerl_tools.html#interactive-mode-minerl-interactor
    # env.make_interactive(port=6666, realtime=True)

    if config["RECORD_TEST_VIDEOS"]:
        env = gym.wrappers.Monitor(env, f"test_videos/{experiment_name}")
    env = PovOnlyObservation(env)
    env = ActionShaping(env, always_attack=True)
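    # Keep a handle to the environment underneath the ActionShaping wrapper: the scripted part sends
    # full MineRL dict actions, which str_to_act builds from env1.action_space.noop().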
    env1 = env.env

    model = PPO.load(config["TEST_MODEL_NAME"], verbose=1)
    model.set_env(env)

    for episode in range(config["TEST_EPISODES"]):
        obs = env.reset()
        done = False
        total_reward = 0
        steps = 0

        # RL part to get some logs:
        for i in range(config["TREECHOP_STEPS"]):
            action = model.predict(obs.copy())
            obs, reward, done, _ = env.step(action[0])
            total_reward += reward
            steps += 1
            if done:
                break

        # scripted part to use the logs:
        if not done:
            for i, action in enumerate(action_sequence[:config["MAX_TEST_EPISODE_LEN"] - config["TREECHOP_STEPS"]]):
                obs, reward, done, _ = env1.step(str_to_act(env1, action))
                total_reward += reward
                steps += 1
                if done:
                    break

        print(f'Episode #{episode + 1} return: {total_reward}\t\t episode length: {steps}')
        writer.add_scalar("return", total_reward, global_step=episode)

    env.close()

def main():
    # uncomment the following to upload the logs and videos to Weights and Biases
    # track_exp(project_name="minerl")
    # train()
    test()


if __name__ == '__main__':
    main()