# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import division

from builtins import zip

import tensorflow as tf
import numpy as np

import nn
import util
from learner import CoreModel


class ValueRL(CoreModel):
  """
  Learn a state-action value function and its corresponding policy.
  """

  @property
  def saveid(self):
    return "valuerl"

  def create_params(self, env_config, learner_config):
    self.obs_dim = np.prod(env_config["obs_dims"])
    self.action_dim = env_config["action_dim"]
    self.reward_scale = env_config["reward_scale"]
    self.discount = env_config["discount"]

    self.hidden_dim = learner_config["hidden_dim"]
    self.bayesian_config = learner_config["bayesian"]
    self.value_expansion = learner_config["value_expansion"]
    self.explore_chance = learner_config["ddpg_explore_chance"]

    with tf.variable_scope(self.name):
      self.policy = nn.FeedForwardNet('policy', self.obs_dim, [self.action_dim], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=False)
      if self.bayesian_config:
        self.Q = nn.EnsembleFeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
        self.old_Q = nn.EnsembleFeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
      else:
        self.Q = nn.FeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)
        self.old_Q = nn.FeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)

    self.policy_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "policy" in v.name]
    self.Q_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "Q" in v.name]
    self.agent_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)

    self.copy_to_old_ops = [tf.assign(p_old, p) for p_old, p in zip(self.old_Q.params_list, self.Q.params_list)]
    self.assign_epoch_op = [tf.assign(self.epoch_n, self.epoch_n_placeholder), tf.assign(self.update_n, self.update_n_placeholder), tf.assign(self.frame_n, self.frame_n_placeholder), tf.assign(self.hours, self.hours_placeholder)]
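
  # A rough sketch of the config dictionaries create_params expects. The keys are taken from the
  # accesses above; the concrete values are illustrative only (assumed, not taken from the repo):
  #   env_config     = {"obs_dims": (17,), "action_dim": 6, "reward_scale": 1.0, "discount": 0.99}
  #   learner_config = {"hidden_dim": 128,
  #                     "bayesian": {"ensemble_size": 8, "train_sample_count": 4, "eval_sample_count": 4},
  #                     "value_expansion": {"rollout_len": 3, "mean_k_return": False, "lambda_return": False,
  #                                         "steve_reweight": True, "tdk_trick": False, "covariances": False},
  #                     "ddpg_explore_chance": 0.1}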

  def update_epoch(self, sess, epoch, updates, frames, hours):
    sess.run(self.assign_epoch_op, feed_dict={self.epoch_n_placeholder: int(epoch), self.update_n_placeholder: int(updates), self.frame_n_placeholder: int(frames), self.hours_placeholder: float(hours)})

  def copy_to_old(self, sess):
    sess.run(self.copy_to_old_ops)
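
  # Note: old_Q serves as the target network for the Q_target / Q_targets computations below;
  # the surrounding training loop (not shown in this file) is presumably responsible for calling
  # copy_to_old() periodically so that old_Q tracks Q.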

  def build_evalution_graph(self, obs, get_full_info=False, mode="regular", n_samples=1):
    assert mode in {"regular", "explore", "exploit"}

    policy_actions_pretanh = self.policy(obs)
    if mode == "regular" or mode == "exploit":
      policy_actions = tf.tanh(policy_actions_pretanh)
    elif mode == "explore":
      _, _, exploring_policy_actions, _ = util.tanh_sample_info(policy_actions_pretanh, tf.zeros_like(policy_actions_pretanh), n_samples=n_samples)
      policy_actions = tf.where(tf.random_uniform(tf.shape(exploring_policy_actions)) < self.explore_chance, x=exploring_policy_actions, y=tf.tanh(policy_actions_pretanh))
    else: raise Exception('this should never happen')

    if get_full_info: return policy_actions_pretanh, policy_actions
    else: return policy_actions
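
  # Example call (a minimal sketch; `model` and `obs_ph` are hypothetical names):
  #   obs_ph = tf.placeholder(tf.float32, [None, obs_dim])
  #   actions = model.build_evalution_graph(obs_ph, mode="explore")   # occasional sampled exploration actions
  #   actions = model.build_evalution_graph(obs_ph, mode="regular")   # deterministic tanh(policy(obs))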

  def build_training_graph(self, obs, next_obs, empirical_actions, rewards, dones, data_size, worldmodel=None):
    average_model_use = tf.constant(0.)
    empirical_Q_info = tf.concat([obs, empirical_actions], 1)

    if worldmodel is None:
      policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
      policy_Q_info = tf.concat([obs, policy_actions], 1)
      state_value_estimate = self.Q(policy_Q_info, reduce_mode="mean")

      next_policy_actions = self.build_evalution_graph(next_obs)
      policy_next_Q_info = tf.concat([next_obs, next_policy_actions], 1)
      next_Q_estimate = self.old_Q(policy_next_Q_info, reduce_mode="mean")

      Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
      Q_target = rewards * self.reward_scale + self.discount * next_Q_estimate * (1. - dones)

      policy_losses = -state_value_estimate
      Q_losses = .5 * tf.square(Q_guess - tf.stop_gradient(Q_target))

    else:
      targets, confidence, Q_guesses, reach_probs = self.build_Q_expansion_graph(next_obs, rewards, dones, worldmodel, rollout_len=self.value_expansion["rollout_len"], model_ensembling=worldmodel.bayesian_config is not False)

      # targets is a 3D matrix: [batch_i, start_timestep, end_timestep]. here, we reduce out the last dimension, turning
      # it into a [batch_i, start_timestep] matrix. in other words, we are taking a bunch of candidate targets and reducing
      # them into a single target. the four options here correspond to the four ways to do that reduction.
      if self.value_expansion["mean_k_return"]:
        target_counts = self.value_expansion["rollout_len"]+1 - tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1, self.value_expansion["rollout_len"]+1])
        k_returns = tf.reduce_sum(targets, 2) / tf.cast(target_counts, tf.float32)
      elif self.value_expansion["lambda_return"]:
        cont_coeffs = self.value_expansion["lambda_return"] ** tf.cast(tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1,1,self.value_expansion["rollout_len"]+1]), tf.float32)
        stop_coeffs = tf.concat([(1 - self.value_expansion["lambda_return"]) * tf.ones_like(targets)[:,:,:-1], tf.ones_like(targets)[:,:,-1:]], 2)
        k_returns = tf.reduce_sum(targets * stop_coeffs * cont_coeffs, 2)
      elif self.value_expansion["steve_reweight"]:
        k_returns = tf.reduce_sum(targets * confidence, 2)
        average_model_use = 1. - tf.reduce_mean(confidence[:,0,0])
      else:
        # MVE objective: just take the last one
        k_returns = targets[:,:,-1]

      # now we have [batch_i, start_timestep]. if we are using the TDK trick, then we want to use all of the targets,
      # so we construct a corresponding [batch_i, start_timestep] matrix of guesses. otherwise, we just take the targets
      # for the first timestep.
      Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
      if self.value_expansion["tdk_trick"]:
        Q_guess = tf.concat([tf.expand_dims(Q_guess, 1), Q_guesses], 1)
        reach_probs = tf.concat([tf.expand_dims(tf.ones_like(reach_probs[:,0]), 1), reach_probs[:,:-1]], 1)
        Q_target = k_returns
      else:
        # non-TDK trick means we just take the first one
        Q_target = k_returns[:,0]

      policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
      policy_Q_info = tf.concat([obs, policy_actions], 1)
      state_value_estimate = self.Q(policy_Q_info, stop_params_gradient=True, reduce_mode="mean")

      policy_losses = -state_value_estimate
      Q_losses = .5 * tf.square(Q_guess - tf.stop_gradient(Q_target))
      # we downscale the various TDK-trick losses by the likelihood of actually reaching
      # the state from which the guess was made
      if self.value_expansion["tdk_trick"]: Q_losses *= reach_probs

    policy_loss = tf.reduce_mean(policy_losses)
    Q_loss = tf.reduce_mean(Q_losses)
    # a small regularization to make sure the tanh does not saturate
    policy_reg_loss = tf.reduce_mean(tf.square(policy_action_pretanh)) * .001

    # anything in inspect gets logged
    inspect = (policy_loss, Q_loss, policy_reg_loss, average_model_use)
    return (policy_loss + policy_reg_loss, Q_loss), inspect
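
  # Usage sketch (hypothetical driver code; the real optimization loop lives elsewhere in the repo,
  # and the learning rate here is illustrative only):
  #   (policy_total_loss, Q_loss), inspect = model.build_training_graph(obs, next_obs, acts, rews, dones, data_size, worldmodel)
  #   policy_train_op = tf.train.AdamOptimizer(3e-4).minimize(policy_total_loss, var_list=model.policy_params)
  #   Q_train_op = tf.train.AdamOptimizer(3e-4).minimize(Q_loss, var_list=model.Q_params)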

  def build_Q_expansion_graph(self, obs, first_rewards, first_done, worldmodel, rollout_len=1, model_ensembling=False):
    ### this sets up the machinery for having multiple parallel rollouts, each of which has a single consistent transition model
    ensemble_idxs, transition_sample_n, reward_sample_n = worldmodel.get_ensemble_idx_info()
    q_sample_n = self.bayesian_config["eval_sample_count"] if self.bayesian_config is not False else 1
    first_rewards = tf.tile(tf.expand_dims(tf.expand_dims(first_rewards, 1), 1), [1, transition_sample_n, reward_sample_n])
    first_rewards.set_shape([None, transition_sample_n, reward_sample_n])
    if model_ensembling:
      obs = tf.tile(tf.expand_dims(obs, 1), [1, transition_sample_n, 1])
      obs.set_shape([None, transition_sample_n, self.obs_dim])
      first_done = tf.tile(tf.expand_dims(first_done, 1), [1, transition_sample_n])
      first_done.set_shape([None, transition_sample_n])

    ### below, we use a while loop to actually do the iterative model rollout
    extra_info = worldmodel.init_extra_info(obs)

    action_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    obs_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    done_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    extra_info_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)

    def rollout_loop_body(r_i, loop_vars):
      (obs, done, extra_info, action_ta, obs_ta, dones_ta, extra_info_ta) = loop_vars
      action_pretanh, action = self.build_evalution_graph(tf.stop_gradient(obs), get_full_info=True)
      if model_ensembling:
        next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info, ensemble_idxs=ensemble_idxs)
      else:
        next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info)
        next_obs = tf.reduce_mean(next_obs, -2)
        next_dones = tf.reduce_mean(next_dones, -1)
      action_ta = action_ta.write(r_i, action)
      obs_ta = obs_ta.write(r_i, obs)
      dones_ta = dones_ta.write(r_i, done)
      extra_info_ta = extra_info_ta.write(r_i, extra_info)
      return r_i+1, (next_obs, next_dones, next_extra_info, action_ta, obs_ta, dones_ta, extra_info_ta)

    _, (final_obs, final_done, final_extra_info, action_ta, obs_ta, done_ta, extra_info_ta) = tf.while_loop(
        lambda r_i, _: r_i < rollout_len,
        rollout_loop_body,
        [0, (obs, first_done, extra_info, action_ta, obs_ta, done_ta, extra_info_ta)]
    )

    final_action_pretanh, final_action = self.build_evalution_graph(tf.stop_gradient(final_obs), get_full_info=True)
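
    # Shape note (my reading of the reshapes below, assuming obs enters with a leading batch dimension):
    # each TensorArray holds rollout_len entries; after stack/reshape/transpose the rollout tensors become
    # obss [batch, rollout_len, transition_sample_n, obs_dim], actions [..., action_dim], and
    # dones [batch, rollout_len, transition_sample_n], with the "final_*" tensors appended to give
    # rollout_len + 1 frames in total.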

    ### compile the TensorArrays into useful tensors
    obss = obs_ta.stack()
    obss = tf.reshape(obss, tf.stack([rollout_len, -1, transition_sample_n, self.obs_dim]))
    obss = tf.transpose(obss, [1, 0, 2, 3])
    final_obs = tf.reshape(final_obs, tf.stack([-1, 1, transition_sample_n, self.obs_dim]))
    all_obss = tf.concat([obss, final_obs], 1)
    next_obss = all_obss[:, 1:]

    dones = done_ta.stack()
    dones = tf.reshape(dones, tf.stack([rollout_len, -1, transition_sample_n]))
    dones = tf.transpose(dones, [1, 0, 2])
    final_done = tf.reshape(final_done, tf.stack([-1, 1, transition_sample_n]))
    all_dones = tf.concat([dones, final_done], 1)

    actions = action_ta.stack()
    actions = tf.reshape(actions, tf.stack([rollout_len, -1, transition_sample_n, self.action_dim]))
    actions = tf.transpose(actions, [1, 0, 2, 3])
    final_action = tf.reshape(final_action, tf.stack([-1, 1, transition_sample_n, self.action_dim]))
    all_actions = tf.concat([actions, final_action], 1)

    continue_probs = tf.cumprod(1. - all_dones, axis=1)
    rewards = worldmodel.get_rewards(obss, actions, next_obss)
    # rawrew keeps the raw (undiscounted) rewards around; it is only referenced by the commented-out debug prints below
    rawrew = rewards = tf.concat([tf.expand_dims(first_rewards, 1), rewards], 1)

    ### TDK trick means we have to guess at every timestep
    if self.value_expansion["tdk_trick"]:
      guess_info = tf.concat([obss, actions], -1)
      Q_guesses = self.Q(guess_info, reduce_mode="random")
      # make it so there's only one guess per rollout timestep, which is the mean of the guesses
      # under all the various model rollouts
      Q_guesses = tf.reduce_mean(Q_guesses, -1)
      reached_this_point_to_guess_prob = tf.reduce_mean(continue_probs, -1)
    else:
      Q_guesses = None
      reached_this_point_to_guess_prob = None

    ### use the Q function at every timestep to get value estimates
    target_info = tf.concat([all_obss, all_actions], -1)
    Q_targets = self.old_Q(target_info, reduce_mode="none")
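
    # Shape note (inferred): with reduce_mode="none", Q_targets keeps the per-sample Q estimates,
    # roughly [batch, rollout_frames, transition_sample_n, q_sample_n]; that extra sample dimension
    # is flattened together with the transition and reward samples further below.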

    rollout_frames = rollout_len + 1  # if we take N steps, we have N+1 frames

    ### create a "decay-exponent matrix" of size [1, ROLLOUT_FRAMES, ROLLOUT_FRAMES, 1, 1]. the first ROLLOUT_FRAMES
    ### corresponds to the index of the source frame, the second to the target frame.
    ts_count_mat = tf.cast(tf.reshape(tf.range(rollout_frames), [1, rollout_frames]) - tf.reshape(tf.range(rollout_frames), [rollout_frames, 1]), tf.float32)
    reward_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** ts_count_mat
    value_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** (1. + ts_count_mat)
    reward_coeff_matrix = tf.reshape(reward_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
    value_coeff_matrix = tf.reshape(value_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])

    ### similarly, create a "done" matrix
    shifted_continue_probs = tf.concat([tf.expand_dims(tf.ones_like(continue_probs[:, 0]), 1), continue_probs[:, :-1]], 1)
    reward_continue_matrix = tf.expand_dims(shifted_continue_probs, 1) / tf.expand_dims(shifted_continue_probs + 1e-8, 2)
    value_continue_matrix = tf.expand_dims(continue_probs, 1) / tf.expand_dims(shifted_continue_probs + 1e-8, 2)
    reward_continue_matrix = tf.expand_dims(reward_continue_matrix, -1)
    value_continue_matrix = tf.expand_dims(value_continue_matrix, -1)

    ### apply the discounting factors to the rewards and values
    rewards = tf.expand_dims(rewards, 1) * reward_coeff_matrix * reward_continue_matrix
    rewards = tf.cumsum(rewards, axis=2)
    values = tf.expand_dims(Q_targets, 1) * value_coeff_matrix * value_continue_matrix

    ### compute the targets using the Bellman equation
    sampled_targets = tf.expand_dims(rewards, -2) * self.reward_scale + tf.expand_dims(values, -1)

    ### flatten out the various sources of variance (transition, reward, and Q-function ensembles) to get a set of estimates for each candidate target
    sampled_targets = tf.reshape(sampled_targets, tf.stack([-1, rollout_frames, rollout_frames, transition_sample_n * reward_sample_n * q_sample_n]))
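
    # In math (my reading of the graph above): for a rollout starting at frame i and truncated at frame j >= i,
    # each sampled candidate target is approximately
    #   T_{i,j} = reward_scale * sum_{t=i..j} gamma^{t-i} * r_t  +  gamma^{j-i+1} * Q_old(s_{j+1}, a_{j+1}),
    # with the continue-probability matrices down-weighting contributions past a predicted episode end.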

    ### compute the mean and variance for each candidate target
    target_means, target_variances = tf.nn.moments(sampled_targets, 3)

    ### compute the confidence, either using the full covariance matrix, or approximating all the estimators as independent
    if self.value_expansion["covariances"]:
      targetdiffs = sampled_targets - tf.expand_dims(target_means, 3)
      target_covariances = tf.einsum("abij,abjk->abik", targetdiffs, tf.transpose(targetdiffs, [0, 1, 3, 2]))
      target_confidence = tf.squeeze(tf.matrix_solve(target_covariances + tf.expand_dims(tf.expand_dims(tf.matrix_band_part(tf.ones(tf.shape(target_covariances)[-2:]), 0, 0) * 1e-3, 0), 0), tf.ones(tf.concat([tf.shape(target_covariances)[:-1], tf.constant([1])], 0))), -1)
    else:
      target_confidence = 1. / (target_variances + 1e-8)

    ### normalize so weights sum to 1
    target_confidence *= tf.matrix_band_part(tf.ones([1, rollout_frames, rollout_frames]), 0, -1)
    target_confidence = target_confidence / tf.reduce_sum(target_confidence, axis=2, keepdims=True)
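
    # This is the inverse-variance weighting consumed by the "steve_reweight" option in build_training_graph:
    # in the default diagonal case each candidate target gets weight w_{i,j} proportional to
    # 1 / (Var[T_{i,j}] + 1e-8), renormalized over j so the weights for each start frame i sum to 1.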

    ### below here is a bunch of debugging Print statements that I use as a sanity check:
    # target_confidence = tf.Print(target_confidence, [], message="raw rewards")
    # target_confidence = tf.Print(target_confidence, [rawrew[0,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="\n", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [(1. - all_dones)[0,:,0]], message="contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [continue_probs[0,:,0]], message="cum_contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [shifted_continue_probs[0,:,0]], message="shifted contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="reward_coeff")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="reward_continue")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="value_coeff")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="value_continue")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="rewards")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [rewards[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target Qs")
    # target_confidence = tf.Print(target_confidence, [Q_targets[0,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="values")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [values[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_means")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_means[0,i,:]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_variance")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_variances[0,i,:]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_confidence")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_confidence[0,i,:]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [target_confidence, action_lls, tf.shape(Q_targets)], message="\n\n", summarize=10)

    return target_means, target_confidence, Q_guesses, reached_this_point_to_guess_prob