"""
Implementation of DDPG - Deep Deterministic Policy Gradient Algorithm and hyperparameter details can be found here:
http://arxiv.org/pdf/1509.02971v2.pdf
The algorithm is tested on the Pendulum-v0 and MountainCarContinuous-v0 OpenAI gym task
"""
import datetime
import time

import numpy as np
import tensorflow as tf

from environment.env import Env
from environment.carla.client import make_carla_client
from environment.carla.tcp import TCPConnectionError
from src.agent.ddpg_agent import DDPGAgent
from src.network.ddpg_network import CriticNetwork, ActorNetwork
from src.replaybuffer import ReplayBuffer
from src.explorationnoise import OrnsteinUhlenbeckProcess, GreedyPolicy
flags = tf.app.flags
# ================================
# UTILITY PARAMETERS
# ================================
# Environment name
flags.DEFINE_string('env_name', 'carla_soft', 'environment name.')
flags.DEFINE_boolean('env_render', True, 'whether to render the environment (display).')
flags.DEFINE_integer('port', 2000, 'CARLA simulator listening port.')
DATETIME = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
RANDOM_SEED = 1234
FPS = 10
# ================================
# TRAINING PARAMETERS
# ================================
flags.DEFINE_integer('mini_batch', 256, 'mini-batch size for training.')
# Render interval
RENDER_INTERVAL = 100
# Learning rates for the actor and critic
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
# Maximum number of episodes
MAX_EPISODES = 200000
# Maximum number of steps per episode
MAX_STEPS_EPISODE = 5000
# Warmup steps
WARMUP_STEPS = 3000
# Exploration duration
EXPLORATION_EPISODES = 20000
# Discount factor
GAMMA = 0.99
# Soft target update parameter
TAU = 0.001
# Size of replay buffer
BUFFER_SIZE = 1000000
# Ornstein-Uhlenbeck exploration noise parameters
OU_THETA = 10
OU_MU = 0.
OU_SIGMA = 0.4
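# (For reference, the OU process produces temporally correlated noise via
#  dx_t = OU_THETA * (OU_MU - x_t) * dt + OU_SIGMA * dW_t, where W_t is a
#  Wiener process; see the OrnsteinUhlenbeckProcess construction in main().)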
# Exploration noise parameters for the epsilon-greedy policy (discrete actions)
MIN_EPSILON = 0.1
MAX_EPSILON = 1
# ================================
# EVALUATION PARAMETERS
# ================================
# Evaluation period
EVAL_PERIODS = 100
# Episodes per evaluation
EVAL_EPISODES = 10
# Model checkpoint period
MODEL_STORE_PERIODS = 30
FLAGS = flags.FLAGS
# Whether to print training details to the screen
DETAIL = True
# Directory for storing environment monitor results
MONITOR_DIR = './results/{}/{}/pic_ddpg'.format(FLAGS.env_name, DATETIME)
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/{}/{}/tf_ddpg'.format(FLAGS.env_name, DATETIME)
# Directory for storing model
MODEL_DIR = './results/{}/{}/tf_net'.format(FLAGS.env_name, DATETIME)
# ================================
# MAIN
# ================================
def main(_):
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        env = Env(MONITOR_DIR, RANDOM_SEED, FPS, sess)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = env.observation_space.shape
        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure the action bound is symmetric
            assert np.all(env.action_space.high == -env.action_space.low)
            action_type = 'Continuous'
        except (AttributeError, IndexError):
            # Discrete action spaces have no high/low bounds
            action_dim = env.action_space.n
            action_bound = None
            action_type = 'Discrete'
        print('Action type: {}'.format(action_type))
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, action_type)
        critic = CriticNetwork(sess, state_dim, action_dim, action_bound,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), action_type)

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        if action_type == 'Continuous':
            noise = OrnsteinUhlenbeckProcess(OU_THETA, mu=OU_MU, sigma=OU_SIGMA,
                                             n_steps_annealing=EXPLORATION_EPISODES,
                                             size=action_dim)
        else:
            noise = GreedyPolicy(action_dim, EXPLORATION_EPISODES, MIN_EPSILON, MAX_EPSILON)

        agent = DDPGAgent(sess, action_type, actor, critic, GAMMA, env, replay_buffer, noise=noise,
                          exploration_episodes=EXPLORATION_EPISODES, max_episodes=MAX_EPISODES,
                          max_steps_episode=MAX_STEPS_EPISODE, warmup_steps=WARMUP_STEPS,
                          mini_batch=FLAGS.mini_batch, eval_episodes=EVAL_EPISODES,
                          eval_periods=EVAL_PERIODS, env_render=FLAGS.env_render,
                          summary_dir=SUMMARY_DIR, model_dir=MODEL_DIR,
                          model_store_periods=MODEL_STORE_PERIODS, detail=DETAIL,
                          render_interval=RENDER_INTERVAL)
        # Keep reconnecting and training if the CARLA TCP connection drops
        while True:
            try:
                with make_carla_client('localhost', FLAGS.port) as client:
                    env.connected(client)
                    agent.train()
            except TCPConnectionError as error:
                print(error)
                time.sleep(5.0)
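
# Example invocation (a sketch; the flag names and defaults are defined at the
# top of this file, and a CARLA server must already be listening on the port):
#   python ddpg_main.py --env_name=carla_soft --port=2000 --mini_batch=128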
if __name__ == '__main__':
    tf.app.run()