basic_example.py
import numpy as np
from rlgym.envs import Match
from rlgym.utils.gamestates import PlayerData, GameState
from rlgym.utils.obs_builders import AdvancedObs
from rlgym.utils.reward_functions import DefaultReward
from rlgym.utils.reward_functions.common_rewards import VelocityPlayerToBallReward
from rlgym.utils.state_setters import DefaultState
from rlgym.utils.terminal_conditions.common_conditions import TimeoutCondition, GoalScoredCondition
from rlgym_tools.sb3_utils import SB3MultiDiscreteWrapper, SB3MultipleInstanceEnv
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecNormalize, VecCheckNan, VecMonitor
from multi_model_tools import multi_learn, MultiModelReward

# DECLARE THE MODEL MAP HERE SO REWARD CAN ACCESS
model_map = [0, 0, 1, 2, 3, 3, 2, 0]  # map of players to model indexes, should be of length = n_envs * players_per_env

# learning mask is the same size as the models list. True for the model to learn.
learning_mask = [True, False, True, True]

# some simple rewards for example purposes. reward_funcs should be in the same order as the list of models.
reward_funcs = [VelocityPlayerToBallReward(), DefaultReward(), VelocityPlayerToBallReward(), DefaultReward()]
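
# To make the mapping above concrete for this example's setup (player ordering assumed to run
# through the first instance before the second): 2 instances of 2v2 self-play give 8 players, and
# player i is controlled by models[model_map[i]]. So model 0 drives three players, models 2 and 3
# drive two each, and model 1 drives one; learning_mask[1] = False keeps model 1 playing but never
# updates its weights (a frozen opponent).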

if __name__ == '__main__':  # Required for multiprocessing
    frame_skip = 8          # Number of ticks to repeat an action
    half_life_seconds = 5   # Easier to conceptualize, after this many seconds the reward discount is 0.5

    fps = 120 / frame_skip
    gamma = np.exp(np.log(0.5) / (fps * half_life_seconds))  # Quick mafs
    print(f"fps={fps}, gamma={gamma}")

    def get_match():  # Need to use a function so that each instance can call it and produce their own objects
        return Match(
            team_size=2,  # 2v2 for this example because why not
            tick_skip=frame_skip,
            # use the MultiModelReward to handle the distribution of rewards to each model.
            reward_function=MultiModelReward(model_map, reward_funcs),
            self_play=True,
            terminal_conditions=[TimeoutCondition(round(fps * 15)), GoalScoredCondition()],  # Some basic terminals
            obs_builder=AdvancedObs(),  # Not that advanced, good default
            state_setter=DefaultState()
        )

    rl_path = None  # Path to Epic installation (None so it uses login tricks)

    env = SB3MultipleInstanceEnv(rl_path, get_match, 2)   # Start 2 instances
    env = SB3MultiDiscreteWrapper(env)                    # Convert action space to multidiscrete
    env = VecCheckNan(env)                                # Optional
    env = VecMonitor(env)                                 # Recommended, logs mean reward and ep_len to Tensorboard
    env = VecNormalize(env, norm_obs=False, gamma=gamma)  # Highly recommended, normalizes rewards
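    # With 2 instances of 2v2 self-play, SB3 sees each player as one "env", so env.num_envs should
    # be 8 here; that is the number the checkpoint interval below is divided by.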

    # Hyperparameters presumably better than default; inspired by original PPO paper
    models = []
    for _ in range(4):  # one PPO instance per entry in learning_mask / reward_funcs
        model = PPO(
            'MlpPolicy',
            env,
            n_epochs=32,                 # PPO calls for multiple epochs, SB3 does early stopping to maintain target kl
            learning_rate=1e-5,          # Around this is fairly common for PPO
            ent_coef=0.01,               # From PPO Atari
            vf_coef=1.,                  # From PPO Atari
            gamma=gamma,                 # Gamma as calculated using half-life
            verbose=3,                   # Print out all the info as we're going
            batch_size=4096,             # Batch size as high as possible within reason
            n_steps=4096,                # Number of steps to perform before optimizing network
            tensorboard_log="out/logs",  # `tensorboard --logdir out/logs` in terminal to see graphs
            device="auto"                # Uses GPU if available
        )
        models.append(model)

    # Save model every so often
    # Divide by num_envs (number of agents) because callback only increments every time all agents have taken a step
    # This saves to the specified folder with the specified name prefix
    # callbacks is a list the same length as the list of models, in the same order.
    callbacks = [CheckpointCallback(round(1_000_000 / env.num_envs), save_path="policy", name_prefix=f"multi_{n}")
                 for n in range(4)]

    multi_learn(
        models=models,                # the list of models that will be used
        total_timesteps=10_000_000,   # total timesteps to train for
        env=env,
        callbacks=callbacks,          # list of callbacks, one for each model in the list of models
        num_players=8,                # total players: team_size * 2 (self-play) * num_instances
        model_map=model_map,          # mapping of models to players
        learning_mask=learning_mask
    )
    exit(0)

    # Now, if one wants to load a trained model from a checkpoint, use this function
    # This will contain all the attributes of the original model
    # Any attribute can be overwritten by using the custom_objects parameter,
    # which includes n_envs (number of agents), which has to be overwritten to use a different amount
    model = PPO.load("policy/multi_0_1000002_steps.zip", env, custom_objects=dict(n_envs=1))
    # Use reset_num_timesteps=False to keep going with same logger/checkpoints
    # model.learn(100_000_000, callback=callbacks[0], reset_num_timesteps=False)
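
    # A rough sketch of resuming multi-model training from saved checkpoints; the filenames are
    # hypothetical and depend on the steps at which CheckpointCallback actually saved:
    # models = [PPO.load(f"policy/multi_{n}_1000002_steps.zip", env) for n in range(4)]
    # multi_learn(models=models, total_timesteps=10_000_000, env=env, callbacks=callbacks,
    #             num_players=8, model_map=model_map, learning_mask=learning_mask)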