diff --git a/ddqn.py b/ddqn.py
index 97be504..f9adc26 100644
--- a/ddqn.py
+++ b/ddqn.py
@@ -11,7 +11,7 @@
 from keras.models import Sequential
 from keras.layers import Convolution2D, Flatten, Dense
 
-ENV_NAME = 'Breakout-v0'  # Environment name
+ENV_NAME = 'Pong-v0'  # Environment name
 FRAME_WIDTH = 84  # Resized frame width
 FRAME_HEIGHT = 84  # Resized frame height
 NUM_EPISODES = 12000  # Number of episodes the agent plays
@@ -30,7 +30,7 @@
 MIN_GRAD = 0.01  # Constant added to the squared gradient in the denominator of the RMSProp update
 SAVE_INTERVAL = 300000  # The frequency with which the network is saved
 NO_OP_STEPS = 30  # Maximum number of "do nothing" actions to be performed by the agent at the start of an episode
-LOAD_NETWORK = False
+LOAD_NETWORK = True
 TRAIN = True
 SAVE_NETWORK_PATH = 'saved_networks/' + ENV_NAME
 SAVE_SUMMARY_PATH = 'summary/' + ENV_NAME
@@ -71,12 +71,12 @@ def __init__(self, num_actions):
         self.sess = tf.InteractiveSession()
         self.saver = tf.train.Saver(q_network_weights)
         self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
-        self.summary_writer = tf.train.SummaryWriter(SAVE_SUMMARY_PATH, self.sess.graph)
+        self.summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, self.sess.graph)
 
         if not os.path.exists(SAVE_NETWORK_PATH):
             os.makedirs(SAVE_NETWORK_PATH)
 
-        self.sess.run(tf.initialize_all_variables())
+        self.sess.run(tf.global_variables_initializer())
 
         # Load network
         if LOAD_NETWORK:
@@ -105,7 +105,7 @@ def build_training_op(self, q_network_weights):
 
         # Convert action to one hot vector
         a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
-        q_value = tf.reduce_sum(tf.mul(self.q_values, a_one_hot), reduction_indices=1)
+        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), reduction_indices=1)
 
         # Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
         error = tf.abs(y - q_value)
@@ -120,7 +120,7 @@ def build_training_op(self, q_network_weights):
 
     def get_initial_state(self, observation, last_observation):
         processed_observation = np.maximum(observation, last_observation)
-        processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
+        processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT), mode='reflect') * 255)
         state = [processed_observation for _ in range(STATE_LENGTH)]
         return np.stack(state, axis=0)
@@ -234,17 +234,17 @@ def train_network(self):
 
     def setup_summary(self):
         episode_total_reward = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Total Reward/Episode', episode_total_reward)
+        tf.summary.scalar(ENV_NAME + '/Total Reward/Episode', episode_total_reward)
         episode_avg_max_q = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Average Max Q/Episode', episode_avg_max_q)
+        tf.summary.scalar(ENV_NAME + '/Average Max Q/Episode', episode_avg_max_q)
         episode_duration = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Duration/Episode', episode_duration)
+        tf.summary.scalar(ENV_NAME + '/Duration/Episode', episode_duration)
         episode_avg_loss = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Average Loss/Episode', episode_avg_loss)
+        tf.summary.scalar(ENV_NAME + '/Average Loss/Episode', episode_avg_loss)
         summary_vars = [episode_total_reward, episode_avg_max_q, episode_duration, episode_avg_loss]
         summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
         update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
-        summary_op = tf.merge_all_summaries()
+        summary_op = tf.summary.merge_all()
         return summary_placeholders, update_ops, summary_op
 
     def load_network(self):
@@ -268,7 +268,7 @@ def get_action_at_test(self, state):
 
 def preprocess(observation, last_observation):
     processed_observation = np.maximum(observation, last_observation)
-    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
+    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT), mode='reflect') * 255)
     return np.reshape(processed_observation, (1, FRAME_WIDTH, FRAME_HEIGHT))
diff --git a/dqn.py b/dqn.py
index 88b007f..d625071 100644
--- a/dqn.py
+++ b/dqn.py
@@ -11,7 +11,7 @@
 from keras.models import Sequential
 from keras.layers import Convolution2D, Flatten, Dense
 
-ENV_NAME = 'Breakout-v0'  # Environment name
+ENV_NAME = 'Pong-v0'  # Environment name
 FRAME_WIDTH = 84  # Resized frame width
 FRAME_HEIGHT = 84  # Resized frame height
 NUM_EPISODES = 12000  # Number of episodes the agent plays
@@ -30,7 +30,7 @@
 MIN_GRAD = 0.01  # Constant added to the squared gradient in the denominator of the RMSProp update
 SAVE_INTERVAL = 300000  # The frequency with which the network is saved
 NO_OP_STEPS = 30  # Maximum number of "do nothing" actions to be performed by the agent at the start of an episode
-LOAD_NETWORK = False
+LOAD_NETWORK = True
 TRAIN = True
 SAVE_NETWORK_PATH = 'saved_networks/' + ENV_NAME
 SAVE_SUMMARY_PATH = 'summary/' + ENV_NAME
@@ -71,12 +71,12 @@ def __init__(self, num_actions):
         self.sess = tf.InteractiveSession()
         self.saver = tf.train.Saver(q_network_weights)
         self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
-        self.summary_writer = tf.train.SummaryWriter(SAVE_SUMMARY_PATH, self.sess.graph)
+        self.summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, self.sess.graph)
 
         if not os.path.exists(SAVE_NETWORK_PATH):
             os.makedirs(SAVE_NETWORK_PATH)
 
-        self.sess.run(tf.initialize_all_variables())
+        self.sess.run(tf.global_variables_initializer())
 
         # Load network
         if LOAD_NETWORK:
@@ -105,7 +105,7 @@ def build_training_op(self, q_network_weights):
 
         # Convert action to one hot vector
         a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
-        q_value = tf.reduce_sum(tf.mul(self.q_values, a_one_hot), reduction_indices=1)
+        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), reduction_indices=1)
 
         # Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
         error = tf.abs(y - q_value)
@@ -120,7 +120,7 @@ def build_training_op(self, q_network_weights):
 
     def get_initial_state(self, observation, last_observation):
         processed_observation = np.maximum(observation, last_observation)
-        processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
+        processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT), mode='reflect') * 255)
         state = [processed_observation for _ in range(STATE_LENGTH)]
         return np.stack(state, axis=0)
@@ -232,17 +232,17 @@ def train_network(self):
 
     def setup_summary(self):
         episode_total_reward = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Total Reward/Episode', episode_total_reward)
+        tf.summary.scalar(ENV_NAME + '/Total Reward/Episode', episode_total_reward)
         episode_avg_max_q = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Average Max Q/Episode', episode_avg_max_q)
+        tf.summary.scalar(ENV_NAME + '/Average Max Q/Episode', episode_avg_max_q)
         episode_duration = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Duration/Episode', episode_duration)
+        tf.summary.scalar(ENV_NAME + '/Duration/Episode', episode_duration)
         episode_avg_loss = tf.Variable(0.)
-        tf.scalar_summary(ENV_NAME + '/Average Loss/Episode', episode_avg_loss)
+        tf.summary.scalar(ENV_NAME + '/Average Loss/Episode', episode_avg_loss)
         summary_vars = [episode_total_reward, episode_avg_max_q, episode_duration, episode_avg_loss]
         summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
         update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
-        summary_op = tf.merge_all_summaries()
+        summary_op = tf.summary.merge_all()
         return summary_placeholders, update_ops, summary_op
 
     def load_network(self):
@@ -266,7 +266,7 @@ def get_action_at_test(self, state):
 
 def preprocess(observation, last_observation):
     processed_observation = np.maximum(observation, last_observation)
-    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
+    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT), mode='reflect') * 255)
     return np.reshape(processed_observation, (1, FRAME_WIDTH, FRAME_HEIGHT))
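
For orientation only, not part of the patch: the renames above follow the TensorFlow 1.0 API migration, where the pre-1.0 names tf.scalar_summary, tf.merge_all_summaries, tf.train.SummaryWriter, tf.initialize_all_variables, and tf.mul became tf.summary.scalar, tf.summary.merge_all, tf.summary.FileWriter, tf.global_variables_initializer, and tf.multiply. A minimal, self-contained sketch of the same summary-logging pattern under the new names follows; the tag name, log directory, and reward value are illustrative and do not come from the diff.

# Sketch, assuming TensorFlow 1.x: log one scalar per episode via an assignable
# variable, mirroring the setup_summary() pattern in the patched files.
import tensorflow as tf

episode_total_reward = tf.Variable(0.)
tf.summary.scalar('Total Reward/Episode', episode_total_reward)    # was tf.scalar_summary

reward_placeholder = tf.placeholder(tf.float32)
update_op = episode_total_reward.assign(reward_placeholder)
summary_op = tf.summary.merge_all()                                 # was tf.merge_all_summaries

with tf.Session() as sess:
    writer = tf.summary.FileWriter('summary/example', sess.graph)   # was tf.train.SummaryWriter
    sess.run(tf.global_variables_initializer())                     # was tf.initialize_all_variables
    # Write one data point for a hypothetical episode reward of 21.0.
    sess.run(update_op, feed_dict={reward_placeholder: 21.0})
    writer.add_summary(sess.run(summary_op), global_step=1)
    writer.close()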